9 files changed, 12060 insertions, 0 deletions
diff --git a/storage/innobase/buf/buf0block_hint.cc b/storage/innobase/buf/buf0block_hint.cc
new file mode 100644
index 00000000..6d99d0b6
--- /dev/null
+++ b/storage/innobase/buf/buf0block_hint.cc
@@ -0,0 +1,59 @@
+/*****************************************************************************
+
+Copyright (c) 2020, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License, version 2.0, as published by the
+Free Software Foundation.
+
+This program is also distributed with certain software (including but not
+limited to OpenSSL) that is licensed under separate terms, as designated in a
+particular file or component or in included license documentation. The authors
+of MySQL hereby grant you an additional permission to link the program and
+your derivative works with the separately licensed software that they have
+included with MySQL.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0,
+for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA
+
+*****************************************************************************/
+
+#include "buf0block_hint.h"
+namespace buf {
+
+void Block_hint::buffer_fix_block_if_still_valid()
+{
+  /* Nothing to do unless a block was remembered.
+
+  The remembered pointer may be stale. To test whether m_block still
+  belongs to buf_pool we must keep its memory from being freed during
+  the test and until the block has been buffer-fixed; holding any one
+  of the many latches that buf_pool_t::resize() takes is sufficient.
+
+  As in buf_page_optimistic_get(), m_block->page.id() has to be
+  compared only after the hash_lock is held: the descriptor may have
+  been freed, or reused for a different page whose buf_pool.page_hash
+  slice is covered by some other hash_lock that we do not hold.
+
+  With the right hash bucket latched, m_block->page.state() finally
+  tells us that the block is not in the middle of being freed. */
+  if (!m_block)
+    return;
+
+  page_hash_latch *latch= buf_pool.page_hash.lock<false>(m_page_id.fold());
+  if (buf_pool.is_uncompressed(m_block) && m_page_id == m_block->page.id()
+      && m_block->page.state() == BUF_BLOCK_FILE_PAGE)
+    buf_block_buf_fix_inc(m_block, __FILE__, __LINE__);
+  else
+    clear();
+
+  latch->read_unlock();
+}
+}  // namespace buf
diff --git a/storage/innobase/buf/buf0buddy.cc b/storage/innobase/buf/buf0buddy.cc
new file mode 100644
index 00000000..f822adc3
--- /dev/null
+++ b/storage/innobase/buf/buf0buddy.cc
@@ -0,0 +1,764 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0buddy.cc
+Binary buddy allocator for compressed pages
+
+Created December 2006 by Marko Makela
+*******************************************************/
+
+#include "buf0buddy.h"
+#include "buf0buf.h"
+#include "buf0lru.h"
+#include "buf0flu.h"
+#include "page0zip.h"
+#include "srv0start.h"
+
+/** When freeing a buf we attempt to coalesce by looking at its buddy
+and deciding whether it is free or not. To ascertain if the buddy is
+free we look for BUF_BUDDY_STAMP_FREE at BUF_BUDDY_STAMP_OFFSET
+within the buddy. The question is how we can be sure that it is
+safe to look at BUF_BUDDY_STAMP_OFFSET.
+The answer lies in following invariants:
+* All blocks allocated by buddy allocator are used for compressed
+page frame.
+* A compressed table always has space_id < SRV_SPACE_ID_UPPER_BOUND
+* BUF_BUDDY_STAMP_OFFSET always points to the space_id field in
+a frame.
+  -- The above is true because we look at these fields when the
+     corresponding buddy block is free which implies that:
+     * The block we are looking at must have an address aligned at
+       the same size that its free buddy has. For example, if we have
+       a free block of 8K then its buddy's address must be aligned at
+       8K as well.
+     * It is possible that the block we are looking at may have been
+       further divided into smaller sized blocks but its starting
+       address must still remain the start of a page frame i.e.: it
+       cannot be middle of a block. For example, if we have a free
+       block of size 8K then its buddy may be divided into blocks
+       of, say, 1K, 1K, 2K, 4K but the buddy's address will still be
+       the starting address of first 1K compressed page.
+     * What is important to note is that for any given block, the
+       buddy's address cannot be in the middle of a larger block i.e.:
+       in above example, our 8K block cannot have a buddy whose address
+       is aligned on 8K but it is part of a larger 16K block.
+*/
+
+/** Byte offset within buf_buddy_free_t at which the free / non-free stamp
+is written; it coincides with the space_id field of a page frame. */
+#define BUF_BUDDY_STAMP_OFFSET	FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID
+
+/** Stamp written at BUF_BUDDY_STAMP_OFFSET of every buffer that is
+currently on a zip_free list. */
+#define BUF_BUDDY_STAMP_FREE	 SRV_SPACE_ID_UPPER_BOUND
+
+/** Stamp value for non-free (allocated) buffers. It will be overwritten
+by the consumer of the block. */
+#define BUF_BUDDY_STAMP_NONFREE	0XFFFFFFFFUL
+
+/** Return type of buf_buddy_is_free() */
+enum buf_buddy_state_t {
+	BUF_BUDDY_STATE_FREE,	/*!< The buddy is completely free */
+	BUF_BUDDY_STATE_USED,	/*!< The buddy is currently in use */
+	BUF_BUDDY_STATE_PARTIALLY_USED/*!< Some sub-blocks in the buddy
+				are in use */
+};
+
+/** Invalidate the memory area (BUF_BUDDY_LOW << i bytes) that we won't
+access while the page is free. */
+UNIV_INLINE
+void
+buf_buddy_mem_invalid(
+/*==================*/
+	buf_buddy_free_t*	buf,	/*!< in: block to invalidate */
+	ulint			i)	/*!< in: index of zip_free[] */
+{
+  ut_ad(i <= BUF_BUDDY_SIZES);
+
+  MEM_CHECK_ADDRESSABLE(buf, BUF_BUDDY_LOW << i);
+  MEM_UNDEFINED(buf, BUF_BUDDY_LOW << i);
+}
+
+/** Check whether a block carries the "free" stamp.
+@param[in]	buf	block to check
+@return whether the buddy is free */
+UNIV_INLINE MY_ATTRIBUTE((warn_unused_result))
+bool
+buf_buddy_stamp_is_free(const buf_buddy_free_t* buf)
+{
+	compile_time_assert(BUF_BUDDY_STAMP_FREE < BUF_BUDDY_STAMP_NONFREE);
+
+	const uint32_t	stamp = mach_read_from_4(
+		buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET);
+	return(stamp == BUF_BUDDY_STAMP_FREE);
+}
+
+/** Stamp a block as free and record its size class in the stamp.
+@param[in,out]	buf	block to stamp
+@param[in]	i	index of buf_pool.zip_free[] (the size class) */
+UNIV_INLINE
+void
+buf_buddy_stamp_free(buf_buddy_free_t* buf, ulint i)
+{
+	/* Inside ut_d(): fill the whole block with the size index. */
+	ut_d(memset(&buf->stamp.bytes, int(i), BUF_BUDDY_LOW << i));
+	buf_buddy_mem_invalid(buf, i);
+
+	byte* const	stamp_at = buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET;
+	mach_write_to_4(stamp_at, BUF_BUDDY_STAMP_FREE);
+	buf->stamp.size = i;
+}
+
+/** Stamp a block as allocated: write BUF_BUDDY_STAMP_NONFREE into it.
+@param[in,out]	buf	block to stamp
+@param[in]	i	index of buf_pool.zip_free[] giving the block size */
+static inline void
+buf_buddy_stamp_nonfree(buf_buddy_free_t* buf, ulint i)
+{
+	compile_time_assert(BUF_BUDDY_STAMP_NONFREE == 0xffffffffU);
+	buf_buddy_mem_invalid(buf, i);
+	memset(buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET, 0xff, 4);
+}
+
+/** Compute the address of the buddy of a compressed page frame.
+A block and its buddy differ only in the address bit that equals the
+block size, so the buddy lies directly below or directly above the block.
+@param[in]	page	compressed page
+@param[in]	size	page size in bytes
+@return the buddy relative of page */
+UNIV_INLINE
+void*
+buf_buddy_get(byte* page, ulint size)
+{
+	ut_ad(ut_is_2pow(size));
+	ut_ad(size >= BUF_BUDDY_LOW);
+	ut_ad(BUF_BUDDY_LOW <= UNIV_ZIP_SIZE_MIN);
+	ut_ad(size < BUF_BUDDY_HIGH);
+	ut_ad(BUF_BUDDY_HIGH == srv_page_size);
+	ut_ad(!ut_align_offset(page, size));
+
+	/* If the size bit is set in the address, page is the upper half
+	of the pair and the buddy precedes it; else the buddy follows. */
+	const bool	is_upper = (reinterpret_cast<ulint>(page) & size) != 0;
+
+	return(is_upper ? page - size : page + size);
+}
+
+#ifdef UNIV_DEBUG
+/** Functor that checks one element of a given zip_free list. */
+struct	CheckZipFree {
+	/** index of the buf_pool.zip_free[] list being checked */
+	const ulint m_i;
+	CheckZipFree(ulint slot) : m_i(slot) {}
+
+	/** Assert that elem is stamped free and not larger than m_i. */
+	void operator()(const buf_buddy_free_t* elem) const {
+		ut_ad(buf_buddy_stamp_is_free(elem));
+		ut_ad(elem->stamp.size <= m_i);
+	}
+};
+
+/** Validate a buddy free list, checking each element with CheckZipFree.
+@param[in]	i		index of buf_pool.zip_free[] to validate */
+static void buf_buddy_list_validate(ulint i)
+{
+	ut_list_validate(buf_pool.zip_free[i], CheckZipFree(i));
+}
+
+/** Debug check that a block really is on a free list, by walking
+buf_pool.zip_free[i] until the block or the end of the list is found.
+@param[in]	buf		block to check
+@param[in]	i		index of buf_pool.zip_free[]
+@return true if free */
+static
+bool
+buf_buddy_check_free(const buf_buddy_free_t* buf, ulint i)
+{
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+
+	const ulint	size	= BUF_BUDDY_LOW << i;
+	ut_ad(!ut_align_offset(buf, size));
+	ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+
+	const buf_buddy_free_t*	cur = UT_LIST_GET_FIRST(buf_pool.zip_free[i]);
+
+	while (cur != NULL && cur != buf) {
+		cur = UT_LIST_GET_NEXT(list, cur);
+	}
+
+	return(cur == buf);
+}
+#endif /* UNIV_DEBUG */
+
+/** Determine from the stamp of a block whether it is in the zip_free[].
+@param[in]	buf	block to check
+@param[in]	i	index of buf_pool.zip_free[]
+@retval BUF_BUDDY_STATE_FREE if fully free
+@retval BUF_BUDDY_STATE_USED if currently in use
+@retval BUF_BUDDY_STATE_PARTIALLY_USED if partially in use. */
+static MY_ATTRIBUTE((warn_unused_result))
+buf_buddy_state_t
+buf_buddy_is_free(buf_buddy_free_t* buf, ulint i)
+{
+#ifdef UNIV_DEBUG
+	const ulint	size	= BUF_BUDDY_LOW << i;
+	ut_ad(!ut_align_offset(buf, size));
+	ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+#endif /* UNIV_DEBUG */
+
+	/* All memory handed out by buf_buddy_alloc() is assumed to be
+	used for compressed page frames. We therefore look inside the
+	block and expect its space_id field to contain one of:
+	* BUF_BUDDY_STAMP_FREE if the block is in a zip_free list, or
+	* BUF_BUDDY_STAMP_NONFREE if the block has been allocated but
+	not initialized yet, or
+	* a valid space_id of a compressed tablespace.
+
+	The check below may read from free memory. Nothing is wrong
+	with that: the memory is "owned" by the buddy allocator (and
+	it has been allocated from the buffer pool). */
+	const bool	stamped_free = buf_buddy_stamp_is_free(buf);
+
+	if (!stamped_free) {
+		return(BUF_BUDDY_STATE_USED);
+	}
+
+	/* A free stamp alone is not enough: a fragment of the block
+	may still be in use. To guard against that, the size of a free
+	block is written, in terms of zip_free index, at the start of
+	the stamped block. That value can be relied on only because
+	the stamp says that buf is free. */
+	ut_ad(buf->stamp.size <= i);
+
+	if (buf->stamp.size != i) {
+		return(BUF_BUDDY_STATE_PARTIALLY_USED);
+	}
+
+	return(BUF_BUDDY_STATE_FREE);
+}
+
+/** Stamp a block free and add it to the head of buf_pool.zip_free[i].
+@param[in,out]	buf		block to be freed
+@param[in]	i		index of buf_pool.zip_free[] */
+UNIV_INLINE
+void
+buf_buddy_add_to_free(buf_buddy_free_t* buf, ulint i)
+{
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+	ut_ad(buf_pool.zip_free[i].start != buf); /* not already at head */
+
+	buf_buddy_stamp_free(buf, i);
+	UT_LIST_ADD_FIRST(buf_pool.zip_free[i], buf);
+	ut_d(buf_buddy_list_validate(i));
+}
+
+/** Remove a block from buf_pool.zip_free[i] and stamp it as nonfree.
+@param[in,out]	buf		block to be removed from the free list
+@param[in]	i		index of buf_pool.zip_free[] */
+UNIV_INLINE
+void
+buf_buddy_remove_from_free(buf_buddy_free_t* buf, ulint i)
+{
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+	ut_ad(buf_buddy_check_free(buf, i));
+
+	UT_LIST_REMOVE(buf_pool.zip_free[i], buf);
+	buf_buddy_stamp_nonfree(buf, i);
+}
+
+/** Try to allocate a block from buf_pool.zip_free[], splitting if needed.
+@param[in]	i		index of buf_pool.zip_free[]
+@return allocated block, or NULL if buf_pool.zip_free[] was empty */
+static buf_buddy_free_t* buf_buddy_alloc_zip(ulint i)
+{
+	buf_buddy_free_t*	buf;
+
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+	ut_a(i < BUF_BUDDY_SIZES);
+	ut_a(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+
+	ut_d(buf_buddy_list_validate(i));
+
+	buf = UT_LIST_GET_FIRST(buf_pool.zip_free[i]);
+
+	if (buf_pool.curr_size < buf_pool.old_size
+	    && UT_LIST_GET_LEN(buf_pool.withdraw)
+	    < buf_pool.withdraw_target) {
+		/* Withdraw target not reached yet: skip withdrawable blocks */
+		while (buf != NULL
+		       && buf_pool.will_be_withdrawn(
+			       reinterpret_cast<byte*>(buf))) {
+			/* This block is to be withdrawn; do not allocate it */
+			buf = UT_LIST_GET_NEXT(list, buf);
+		}
+	}
+
+	if (buf) {
+		buf_buddy_remove_from_free(buf, i);
+	} else if (i + 1 < BUF_BUDDY_SIZES) {
+		/* Attempt to split a free block of the next larger size. */
+		buf = buf_buddy_alloc_zip(i + 1);
+
+		if (buf) {
+			buf_buddy_free_t* buddy =
+				reinterpret_cast<buf_buddy_free_t*>(
+					reinterpret_cast<byte*>(buf)
+					+ (BUF_BUDDY_LOW << i));
+			ut_ad(!buf_pool.contains_zip(buddy));
+			buf_buddy_add_to_free(buddy, i);
+		}
+	}
+
+	if (buf) {
+		/* Trash the page other than the BUF_BUDDY_STAMP_NONFREE. */
+		MEM_UNDEFINED(buf, BUF_BUDDY_STAMP_OFFSET);
+		MEM_UNDEFINED(BUF_BUDDY_STAMP_OFFSET + 4 + buf->stamp.bytes,
+			      (BUF_BUDDY_LOW << i)
+			      - (BUF_BUDDY_STAMP_OFFSET + 4));
+		ut_ad(mach_read_from_4(buf->stamp.bytes
+				       + BUF_BUDDY_STAMP_OFFSET)
+		      == BUF_BUDDY_STAMP_NONFREE);
+	}
+
+	return(buf);
+}
+
+/** Deallocate a buffer frame of srv_page_size that is in buf_pool.zip_hash.
+@param[in]	buf		buffer frame to deallocate */
+static
+void
+buf_buddy_block_free(void* buf)
+{
+	const ulint	fold	= BUF_POOL_ZIP_FOLD_PTR(buf);
+	buf_page_t*	bpage;
+	buf_block_t*	block;
+
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+	ut_a(!ut_align_offset(buf, srv_page_size));
+	/* Look up the block descriptor whose frame is buf. */
+	HASH_SEARCH(hash, &buf_pool.zip_hash, fold, buf_page_t*, bpage,
+		    ut_ad(bpage->state() == BUF_BLOCK_MEMORY
+			  && bpage->in_zip_hash),
+		    ((buf_block_t*) bpage)->frame == buf);
+	ut_a(bpage);	/* the frame must have been registered */
+	ut_a(bpage->state() == BUF_BLOCK_MEMORY);
+	ut_ad(bpage->in_zip_hash);
+	ut_d(bpage->in_zip_hash = false);
+	HASH_DELETE(buf_page_t, hash, &buf_pool.zip_hash, fold, bpage);
+
+	ut_d(memset(buf, 0, srv_page_size));
+	MEM_UNDEFINED(buf, srv_page_size);
+
+	block = (buf_block_t*) bpage;
+	buf_LRU_block_free_non_file_page(block);
+
+	ut_ad(buf_pool.buddy_n_frames > 0);
+	ut_d(buf_pool.buddy_n_frames--);
+}
+
+/**********************************************************************//**
+Register a buffer block with the buddy allocator in buf_pool.zip_hash. */
+static
+void
+buf_buddy_block_register(
+/*=====================*/
+	buf_block_t*	block)	/*!< in: buffer block to register */
+{
+	const ulint	fold = BUF_POOL_ZIP_FOLD(block);
+	ut_ad(block->page.state() == BUF_BLOCK_MEMORY);
+
+	ut_a(block->frame);
+	ut_a(!ut_align_offset(block->frame, srv_page_size));
+
+	ut_ad(!block->page.in_zip_hash);
+	ut_d(block->page.in_zip_hash = true);
+	HASH_INSERT(buf_page_t, hash, &buf_pool.zip_hash, fold, &block->page);
+
+	ut_d(buf_pool.buddy_n_frames++);
+}
+
+/** Carve a block of size class i out of a bigger free object, giving
+the unused upper halves back to the free lists.
+@param[in]	buf		a block that is free to use
+@param[in]	i		index of buf_pool.zip_free[]
+@param[in]	j		size of buf as an index of buf_pool.zip_free[]
+@return allocated block */
+static
+void*
+buf_buddy_alloc_from(void* buf, ulint i, ulint j)
+{
+	ulint	offs	= BUF_BUDDY_LOW << j;
+	ut_ad(j <= BUF_BUDDY_SIZES);
+	ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+	ut_ad(j >= i);
+	ut_ad(!ut_align_offset(buf, offs));
+
+	byte* const	base = reinterpret_cast<byte*>(buf);
+
+	/* Halve the block until it has the requested size; each upper
+	half that is split off goes to the free list of its own size. */
+	while (j > i) {
+		--j;
+		offs >>= 1;
+		buf_buddy_add_to_free(
+			reinterpret_cast<buf_buddy_free_t*>(base + offs), j);
+	}
+
+	buf_buddy_stamp_nonfree(reinterpret_cast<buf_buddy_free_t*>(buf), i);
+	return(buf);
+}
+
+/** Allocate a ROW_FORMAT=COMPRESSED block.
+The sources are tried in this order: the buddy free lists, the
+buf_pool.free list, and finally replacing a page in the buffer pool.
+@param i      index of buf_pool.zip_free[] or BUF_BUDDY_SIZES
+@param lru    assigned to true if buf_pool.mutex was temporarily released
+@return allocated block, never NULL */
+byte *buf_buddy_alloc_low(ulint i, bool *lru)
+{
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+	ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+
+	/* Try to allocate from the buddy system. This step is
+	skipped for i == BUF_BUDDY_SIZES, which has no free list
+	of its own. */
+	void*	buf = NULL;
+
+	if (i < BUF_BUDDY_SIZES) {
+		buf = buf_buddy_alloc_zip(i);
+	}
+
+	if (buf == NULL) {
+		/* Try allocating from the buf_pool.free list. */
+		buf_block_t*	block = buf_LRU_get_free_only();
+
+		if (block == NULL) {
+			/* Try replacing an uncompressed page in the
+			buffer pool. */
+			block = buf_LRU_get_free_block(true);
+
+			if (lru) {
+				*lru = true;
+			}
+		}
+
+		/* Register the whole frame with the buddy allocator and
+		carve a block of the requested size out of it. */
+		buf_buddy_block_register(block);
+		buf = buf_buddy_alloc_from(block->frame, i, BUF_BUDDY_SIZES);
+	}
+
+	buf_pool.buddy_stat[i].used++;
+	return static_cast<byte*>(buf);
+}
+
+/** Try to relocate a block. The caller must hold buf_pool.mutex; the
+page_hash latch of the page is acquired and released by this function.
+@param[in]	src		block to relocate
+@param[in]	dst		free block to relocate to
+@param[in]	i		index of buf_pool.zip_free[]
+@param[in]	force		true to also search buf_pool.LRU for the block
+@return true if relocated */
+static bool buf_buddy_relocate(void* src, void* dst, ulint i, bool force)
+{
+	buf_page_t*	bpage;
+	const ulint	size = BUF_BUDDY_LOW << i;
+
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+	ut_ad(!ut_align_offset(src, size));
+	ut_ad(!ut_align_offset(dst, size));
+	ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+	MEM_CHECK_ADDRESSABLE(dst, size);
+
+	uint32_t space = mach_read_from_4(static_cast<const byte*>(src)
+					  + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+	uint32_t offset = mach_read_from_4(static_cast<const byte*>(src)
+					   + FIL_PAGE_OFFSET);
+
+	/* Suppress Valgrind or MSAN warnings. */
+	MEM_MAKE_DEFINED(&space, sizeof space);
+	MEM_MAKE_DEFINED(&offset, sizeof offset);
+
+	ut_ad(space != BUF_BUDDY_STAMP_FREE);
+
+	const page_id_t	page_id(space, offset);
+	const ulint fold= page_id.fold();
+
+	bpage = buf_pool.page_hash_get_low(page_id, fold);
+
+	if (!bpage || bpage->zip.data != src) {
+		/* The block has probably been freshly
+		allocated by buf_LRU_get_free_block() but not
+		added to buf_pool.page_hash yet.  Obviously,
+		it cannot be relocated. */
+
+		if (!force || space != 0 || offset != 0) {
+			return(false);
+		}
+
+		/* It might just be an uninitialized page.
+		We should also search the LRU list. */
+
+		bpage = UT_LIST_GET_FIRST(buf_pool.LRU);
+		while (bpage != NULL) {
+			if (bpage->zip.data == src) {
+				ut_ad(bpage->id() == page_id);
+				break;
+			}
+			bpage = UT_LIST_GET_NEXT(LRU, bpage);
+		}
+
+		if (bpage == NULL) {
+			return(false);
+		}
+	}
+
+	if (page_zip_get_size(&bpage->zip) != size) {
+		/* The block is of different size.  We would
+		have to relocate all blocks covered by src.
+		For the sake of simplicity, give up. */
+		ut_ad(page_zip_get_size(&bpage->zip) < size);
+		return(false);
+	}
+
+	/* The block must have been allocated, but it may
+	contain uninitialized data. */
+	MEM_CHECK_ADDRESSABLE(src, size);
+
+	if (!bpage->can_relocate()) {
+		return false;
+	}
+
+	page_hash_latch *hash_lock = buf_pool.page_hash.lock_get(fold);
+	hash_lock->write_lock();
+	/* Check again now that the page_hash latch is held. */
+	if (bpage->can_relocate()) {
+		/* Relocate the compressed page. */
+		const ulonglong ns = my_interval_timer();
+
+		ut_a(bpage->zip.data == src);
+
+		memcpy(dst, src, size);
+		bpage->zip.data = reinterpret_cast<page_zip_t*>(dst);
+
+		hash_lock->write_unlock();
+
+		buf_buddy_mem_invalid(
+			reinterpret_cast<buf_buddy_free_t*>(src), i);
+
+		buf_buddy_stat_t*	buddy_stat = &buf_pool.buddy_stat[i];
+		buddy_stat->relocated++;
+		buddy_stat->relocated_usec+= (my_interval_timer() - ns) / 1000;
+		return(true);
+	}
+
+	hash_lock->write_unlock();
+
+	return(false);
+}
+
+/** Deallocate a block, recombining it with its free buddy when possible.
+@param[in]	buf	block to be freed, must not be pointed to
+			by the buffer pool
+@param[in]	i	index of buf_pool.zip_free[], or BUF_BUDDY_SIZES */
+void buf_buddy_free_low(void* buf, ulint i)
+{
+	buf_buddy_free_t*	buddy;
+
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+	ut_ad(i <= BUF_BUDDY_SIZES);
+	ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+	ut_ad(buf_pool.buddy_stat[i].used > 0);
+
+	buf_pool.buddy_stat[i].used--;
+recombine:
+	MEM_UNDEFINED(buf, BUF_BUDDY_LOW << i);
+
+	if (i == BUF_BUDDY_SIZES) {
+		buf_buddy_block_free(buf);	/* a whole frame is free */
+		return;
+	}
+
+	ut_ad(i < BUF_BUDDY_SIZES);
+	ut_ad(buf == ut_align_down(buf, BUF_BUDDY_LOW << i));
+	ut_ad(!buf_pool.contains_zip(buf));
+
+	/* Do not recombine blocks if there are few free blocks.
+	We may waste up to 15360*max_len bytes to free blocks
+	(1024 + 2048 + 4096 + 8192 = 15360) */
+	if (UT_LIST_GET_LEN(buf_pool.zip_free[i]) < 16
+	    && buf_pool.curr_size >= buf_pool.old_size) {
+		goto func_exit;
+	}
+
+	/* Try to combine adjacent blocks. */
+	buddy = reinterpret_cast<buf_buddy_free_t*>(
+		buf_buddy_get(reinterpret_cast<byte*>(buf),
+			      BUF_BUDDY_LOW << i));
+
+	switch (buf_buddy_is_free(buddy, i)) {
+	case BUF_BUDDY_STATE_FREE:
+		/* The buddy is free: recombine */
+		buf_buddy_remove_from_free(buddy, i);
+buddy_is_free:
+		ut_ad(!buf_pool.contains_zip(buddy));
+		i++;	/* continue with the combined, twice as large block */
+		buf = ut_align_down(buf, BUF_BUDDY_LOW << i);
+
+		goto recombine;
+
+	case BUF_BUDDY_STATE_USED:
+		ut_d(buf_buddy_list_validate(i));
+
+		/* The buddy is not free. Is there a free block of
+		this size? */
+		if (buf_buddy_free_t* zip_buf =
+			UT_LIST_GET_FIRST(buf_pool.zip_free[i])) {
+
+			/* Remove the block from the free list, because
+			a successful buf_buddy_relocate() will overwrite
+			zip_free->list. */
+			buf_buddy_remove_from_free(zip_buf, i);
+
+			/* Try to relocate the buddy of buf to the free
+			block. */
+			if (buf_buddy_relocate(buddy, zip_buf, i, false)) {
+				goto buddy_is_free;
+			}
+
+			buf_buddy_add_to_free(zip_buf, i);
+		}
+
+		break;
+	case BUF_BUDDY_STATE_PARTIALLY_USED:
+		/* Some sub-blocks in the buddy are still in use.
+		Relocation will fail. No need to try. */
+		break;
+	}
+
+func_exit:
+	/* Free the block to the buddy list. */
+	buf_buddy_add_to_free(reinterpret_cast<buf_buddy_free_t*>(buf), i);
+}
+
+/** Try to reallocate a block: obtain a new block of the same size from
+the buddy free lists or buf_pool.free and try to move buf into it.
+@param[in]	buf	buf_pool block to be reallocated
+@param[in]	size	block size, up to srv_page_size
+@return	whether a new block could be obtained; true does not imply
+that the relocation itself succeeded */
+bool
+buf_buddy_realloc(void* buf, ulint size)
+{
+	const ulint	i = buf_buddy_get_slot(size);
+
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+	ut_ad(i <= BUF_BUDDY_SIZES);
+	ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+
+	/* Try to allocate from the buddy system. */
+	void*	dst = NULL;
+
+	if (i < BUF_BUDDY_SIZES) {
+		dst = buf_buddy_alloc_zip(i);
+	}
+
+	if (dst == NULL) {
+		/* Try allocating from the buf_pool.free list. */
+		buf_block_t*	block = buf_LRU_get_free_only();
+
+		if (block == NULL) {
+			return(false); /* free_list was not enough */
+		}
+
+		buf_buddy_block_register(block);
+		dst = buf_buddy_alloc_from(block->frame, i, BUF_BUDDY_SIZES);
+	}
+
+	buf_pool.buddy_stat[i].used++;
+
+	/* Try to relocate the contents of buf to the new block.
+	Whichever of the two blocks ends up unused is handed back to
+	the allocator: buf if the relocation succeeded, the new block
+	if it failed. */
+	void* const	unused = buf_buddy_relocate(buf, dst, i, true)
+		? buf : dst;
+
+	buf_buddy_free_low(unused, i);
+
+	return(true); /* free_list was enough */
+}
+
+/** Combine all pairs of free buddies (needs curr_size < old_size). */
+void buf_buddy_condense_free()
+{
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+	ut_ad(buf_pool.curr_size < buf_pool.old_size);
+
+	for (ulint i = 0; i < UT_ARR_SIZE(buf_pool.zip_free); ++i) {
+		buf_buddy_free_t* buf =
+			UT_LIST_GET_FIRST(buf_pool.zip_free[i]);
+
+		/* seek to the first block that will be withdrawn */
+		while (buf != NULL
+		       && !buf_pool.will_be_withdrawn(
+			       reinterpret_cast<byte*>(buf))) {
+			buf = UT_LIST_GET_NEXT(list, buf);
+		}
+
+		while (buf != NULL) {
+			buf_buddy_free_t* next =
+				UT_LIST_GET_NEXT(list, buf);
+
+			buf_buddy_free_t* buddy =
+				reinterpret_cast<buf_buddy_free_t*>(
+					buf_buddy_get(
+						reinterpret_cast<byte*>(buf),
+						BUF_BUDDY_LOW << i));
+
+			/* seek to the next withdraw target, skipping buddy */
+			while (true) {
+				while (next != NULL
+				       && !buf_pool.will_be_withdrawn(
+						reinterpret_cast<byte*>(next))) {
+					 next = UT_LIST_GET_NEXT(list, next);
+				}
+
+				if (buddy != next) {
+					break;
+				}
+
+				next = UT_LIST_GET_NEXT(list, next);
+			}
+
+			if (buf_buddy_is_free(buddy, i)
+			    == BUF_BUDDY_STATE_FREE) {
+				/* Both buf and buddy are free. Combine them;
+				used++ offsets buf_buddy_free_low()'s used--. */
+				buf_buddy_remove_from_free(buf, i);
+				buf_pool.buddy_stat[i].used++;
+
+				buf_buddy_free_low(buf, i);
+			}
+
+			buf = next;
+		}
+	}
+}
diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc
new file mode 100644
index 00000000..b658bdfc
--- /dev/null
+++ b/storage/innobase/buf/buf0buf.cc
@@ -0,0 +1,4728 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+Copyright (c) 2013, 2021, MariaDB Corporation.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0buf.cc
+The database buffer buf_pool
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "assume_aligned.h"
+#include "mtr0types.h"
+#include "mach0data.h"
+#include "buf0buf.h"
+#include "buf0checksum.h"
+#include "ut0crc32.h"
+#include <string.h>
+
+#ifndef UNIV_INNOCHECKSUM
+#include "my_cpu.h"
+#include "mem0mem.h"
+#include "btr0btr.h"
+#include "fil0fil.h"
+#include "fil0crypt.h"
+#include "buf0buddy.h"
+#include "buf0dblwr.h"
+#include "lock0lock.h"
+#include "sync0rw.h"
+#include "btr0sea.h"
+#include "ibuf0ibuf.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "log0log.h"
+#include "dict0stats_bg.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "dict0dict.h"
+#include "log0recv.h"
+#include "srv0mon.h"
+#include "log0crypt.h"
+#include "fil0pagecompress.h"
+#endif /* !UNIV_INNOCHECKSUM */
+#include "page0zip.h"
+#include "sync0sync.h"
+#include "buf0dump.h"
+#include <map>
+#include <sstream>
+
+using st_::span;
+
+#ifdef HAVE_LIBNUMA
+#include <numa.h>
+#include <numaif.h>
+/** Set MPOL_INTERLEAVE in the ctor, restore MPOL_DEFAULT in the dtor. */
+struct set_numa_interleave_t
+{
+	set_numa_interleave_t()
+	{
+		if (srv_numa_interleave) {
+			struct bitmask *numa_mems_allowed = numa_get_mems_allowed();
+			ib::info() << "Setting NUMA memory policy to"
+				" MPOL_INTERLEAVE";
+			if (set_mempolicy(MPOL_INTERLEAVE,
+					  numa_mems_allowed->maskp,
+					  numa_mems_allowed->size) != 0) {
+				/* Save errno before logging can overwrite it */
+				const int err = errno;
+				ib::warn() << "Failed to set NUMA memory"
+					" policy to MPOL_INTERLEAVE: "
+					<< strerror(err);
+			}
+			numa_bitmask_free(numa_mems_allowed);
+		}
+	}
+	~set_numa_interleave_t()
+	{
+		if (srv_numa_interleave) {
+			ib::info() << "Setting NUMA memory policy to"
+				" MPOL_DEFAULT";
+			if (set_mempolicy(MPOL_DEFAULT, NULL, 0) != 0) {
+				const int err = errno;
+				ib::warn() << "Failed to set NUMA memory"
+					" policy to MPOL_DEFAULT: "
+					<< strerror(err);
+			}
+		}
+	}
+};
+
+#define NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE set_numa_interleave_t scoped_numa
+#else
+#define NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE
+#endif /* HAVE_LIBNUMA */
+
+/*
+		IMPLEMENTATION OF THE BUFFER POOL
+		=================================
+
+		Buffer frames and blocks
+		------------------------
+Following the terminology of Gray and Reuter, we call the memory
+blocks where file pages are loaded buffer frames. For each buffer
+frame there is a control block, or shortly, a block, in the buffer
+control array. The control info which does not need to be stored
+in the file along with the file page, resides in the control block.
+
+		Buffer pool struct
+		------------------
+The buffer buf_pool contains a single mutex which protects all the
+control data structures of the buf_pool. The content of a buffer frame is
+protected by a separate read-write lock in its control block, though.
+These locks can be locked and unlocked without owning the buf_pool.mutex.
+The OS events in the buf_pool struct can be waited for without owning the
+buf_pool.mutex.
+
+The buf_pool.mutex is a hot-spot in main memory, causing a lot of
+memory bus traffic on multiprocessor systems when processors
+alternately access the mutex. On our Pentium, the mutex is accessed
+maybe every 10 microseconds. We gave up the solution to have mutexes
+for each control block, for instance, because it seemed to be
+complicated.
+
+A solution to reduce mutex contention of the buf_pool.mutex is to
+create a separate mutex for the page hash table. On Pentium,
+accessing the hash table takes 2 microseconds, about half
+of the total buf_pool.mutex hold time.
+
+		Control blocks
+		--------------
+
+The control block contains, for instance, the bufferfix count
+which is incremented when a thread wants a file page to be fixed
+in a buffer frame. The bufferfix operation does not lock the
+contents of the frame, however. For this purpose, the control
+block contains a read-write lock.
+
+The buffer frames have to be aligned so that the start memory
+address of a frame is divisible by the universal page size, which
+is a power of two.
+
+The control blocks containing file pages are put to a hash table
+according to the file address of the page.
+We could speed up the access to an individual page by using
+"pointer swizzling": we could replace the page references on
+non-leaf index pages by direct pointers to the page, if it exists
+in the buf_pool. We could make a separate hash table where we could
+chain all the page references in non-leaf pages residing in the buf_pool,
+using the page reference as the hash key,
+and at the time of reading of a page update the pointers accordingly.
+Drawbacks of this solution are added complexity and,
+possibly, extra space required on non-leaf pages for memory pointers.
+A simpler solution is just to speed up the hash table mechanism
+in the database, using tables whose size is a power of 2.
+
+		Lists of blocks
+		---------------
+
+There are several lists of control blocks.
+
+The free list (buf_pool.free) contains blocks which are currently not
+used.
+
+The common LRU list contains all the blocks holding a file page
+except those for which the bufferfix count is non-zero.
+The pages are in the LRU list roughly in the order of the last
+access to the page, so that the oldest pages are at the end of the
+list. We also keep a pointer to near the end of the LRU list,
+which we can use when we want to artificially age a page in the
+buf_pool. This is used if we know that some page is not needed
+again for some time: we insert the block right after the pointer,
+causing it to be replaced sooner than would normally be the case.
+Currently this aging mechanism is used for pages that are brought in
+by read-ahead, and it can also be used when there is a full scan of a
+table which cannot fit in memory. By putting such pages near the
+end of the LRU list, we make sure that most of the buf_pool stays
+in main memory, undisturbed.
+
+The unzip_LRU list contains a subset of the common LRU list.  The
+blocks on the unzip_LRU list hold a compressed file page and the
+corresponding uncompressed page frame.  A block is in unzip_LRU if and
+only if the predicate block->page.belongs_to_unzip_LRU()
+holds.  The blocks in unzip_LRU will be in same order as they are in
+the common LRU list.  That is, each manipulation of the common LRU
+list will result in the same manipulation of the unzip_LRU list.
+
+The chain of modified blocks (buf_pool.flush_list) contains the blocks
+holding persistent file pages that have been modified in the memory
+but not written to disk yet. The block with the oldest modification
+which has not yet been written to disk is at the end of the chain.
+The access to this list is protected by buf_pool.flush_list_mutex.
+
+The control blocks for uncompressed pages are accessible via
+buf_block_t objects that are reachable via buf_pool.chunks[].
+The control blocks (buf_page_t) of those ROW_FORMAT=COMPRESSED pages
+that are not in buf_pool.flush_list and for which no uncompressed
+page has been allocated in buf_pool are only accessible via
+buf_pool.LRU.
+
+The chains of free memory blocks (buf_pool.zip_free[]) are used by
+the buddy allocator (buf0buddy.cc) to keep track of currently unused
+memory blocks of size sizeof(buf_page_t)..srv_page_size / 2.  These
+blocks are inside the srv_page_size-sized memory blocks of type
+BUF_BLOCK_MEMORY that the buddy allocator requests from the buffer
+pool.  The buddy allocator is solely used for allocating control
+blocks for compressed pages (buf_page_t) and compressed page frames.
+
+		Loading a file page
+		-------------------
+
+First, a victim block for replacement has to be found in the
+buf_pool. It is taken from the free list or searched for from the
+end of the LRU-list. An exclusive lock is reserved for the frame,
+the io_fix field is set in the block fixing the block in buf_pool,
+and the io-operation for loading the page is queued. The io-handler thread
+releases the X-lock on the frame and resets the io_fix field
+when the io operation completes.
+
+A thread may request the above operation using the function
+buf_page_get(). It may then continue to request a lock on the frame.
+The lock is granted when the io-handler releases the x-lock.
+
+		Read-ahead
+		----------
+
+The read-ahead mechanism is intended to be intelligent and
+isolated from the semantically higher levels of the database
+index management. From the higher level we only need the
+information if a file page has a natural successor or
+predecessor page. On the leaf level of a B-tree index,
+these are the next and previous pages in the natural
+order of the pages.
+
+Let us first explain the read-ahead mechanism when the leaves
+of a B-tree are scanned in an ascending or descending order.
+When a page that was read in is first referenced in the buf_pool,
+the buffer manager checks if it is at the border of a so-called
+linear read-ahead area. The tablespace is divided into these
+areas of size 64 blocks, for example. So if the page is at the
+border of such an area, the read-ahead mechanism checks if
+all the other blocks in the area have been accessed in an
+ascending or descending order. If this is the case, the system
+looks at the natural successor or predecessor of the page,
+checks if that is at the border of another area, and in this case
+issues read-requests for all the pages in that area. Maybe
+we could relax the condition that all the pages in the area
+have to be accessed: if data is deleted from a table, there may
+appear holes of unused pages in the area.
+
+A different read-ahead mechanism is used when there appears
+to be a random access pattern to a file.
+If a new page is referenced in the buf_pool, and several pages
+of its random access area (for instance, 32 consecutive pages
+in a tablespace) have recently been referenced, we may predict
+that the whole area may be needed in the near future, and issue
+the read requests for the whole area.
+*/
+
+#ifndef UNIV_INNOCHECKSUM
+/** Wait until read_trylock() succeeds: spin first, then yield. */
+void page_hash_latch::read_lock_wait()
+{
+  /* Bounded busy-wait phase: delay, then retry. */
+  auto rounds_left= srv_n_spin_wait_rounds;
+  while (rounds_left--)
+  {
+    ut_delay(srv_spin_wait_delay);
+    if (read_trylock())
+      return;
+  }
+  /* Spinning did not help: yield before every further attempt. */
+  for (os_thread_yield(); !read_trylock(); os_thread_yield()) {}
+}
+
+/** Wait until write_lock_poll() succeeds: spin first, then yield. */
+void page_hash_latch::write_lock_wait()
+{
+  write_lock_wait_start();
+
+  /* Bounded busy-wait phase: poll first, delay only after a failure. */
+  auto rounds_left= srv_n_spin_wait_rounds;
+  while (rounds_left--)
+  {
+    if (write_lock_poll())
+      return;
+    ut_delay(srv_spin_wait_delay);
+  }
+
+  /* Spinning did not help: yield before every further poll. */
+  for (os_thread_yield(); !write_lock_poll(); os_thread_yield()) {}
+}
+
+/** Value in microseconds */
+constexpr int WAIT_FOR_READ= 100;
+constexpr int WAIT_FOR_WRITE= 100; /* presumably microseconds too; confirm */
+/** Number of attempts made to read in a page in the buffer pool */
+constexpr ulint	BUF_PAGE_READ_MAX_RETRIES= 100;
+/** The maximum portion of the buffer pool that can be used for the
+read-ahead buffer.  (Divide buf_pool size by this amount) */
+constexpr uint32_t BUF_READ_AHEAD_PORTION= 32;
+
+/** A 64KiB buffer of NUL bytes, for use in assertions and checks,
+and dummy default values of instantly dropped columns.
+Initially, BLOB field references are set to NUL bytes, in
+dtuple_convert_big_rec(). */
+const byte *field_ref_zero;
+
+/** The InnoDB buffer pool */
+buf_pool_t buf_pool;
+buf_pool_t::chunk_t::map *buf_pool_t::chunk_t::map_reg; /* static member definition */
+buf_pool_t::chunk_t::map *buf_pool_t::chunk_t::map_ref; /* static member definition */
+
+#ifdef UNIV_DEBUG
+/** Disable resizing buffer pool to make assertion code not expensive. */
+my_bool			buf_disable_resize_buffer_pool_debug = TRUE;
+
+/** Counter used in the debug version to insert validation
+operations into the execution */
+static ulint buf_dbg_counter;
+#endif /* UNIV_DEBUG */
+
+/** Select the read or the written monitor counter depending on io_type:
+counter##_READ for BUF_IO_READ, counter##_WRITTEN for anything else */
+#define MONITOR_RW_COUNTER(io_type, counter)		\
+	((io_type == BUF_IO_READ)			\
+	 ? (counter##_READ)				\
+	 : (counter##_WRITTEN))
+
+
+/** Decrypt a page of the temporary tablespace in place.
+@param[in,out]	tmp_frame	scratch buffer used to build the plaintext
+@param[in,out]	src_frame	page to decrypt; overwritten on success
+@return false if log_tmp_block_decrypt() failed, otherwise true */
+static bool buf_tmp_page_decrypt(byte* tmp_frame, byte* src_frame)
+{
+	if (buf_is_zeroes(span<const byte>(src_frame, srv_page_size))) {
+		/* An all-zero page is left untouched. */
+		return true;
+	}
+
+	/* Bytes before this offset are copied verbatim, not decrypted. */
+	const uint clear_len = FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION;
+	memcpy(tmp_frame, src_frame, clear_len);
+
+	/* The payload between the header and the trailing checksum */
+	const uint cipher_len = uint(srv_page_size)
+		- (clear_len + FIL_PAGE_FCRC32_CHECKSUM);
+	const ulint page_no = mach_read_from_4(src_frame + FIL_PAGE_OFFSET);
+
+	if (!log_tmp_block_decrypt(src_frame + clear_len, cipher_len,
+				   tmp_frame + clear_len,
+				   (page_no * srv_page_size))) {
+		return false;
+	}
+
+	/* Carry over the trailing checksum bytes unchanged. */
+	static_assert(FIL_PAGE_FCRC32_CHECKSUM == 4, "alignment");
+	memcpy_aligned<4>(tmp_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM,
+			  src_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM,
+			  FIL_PAGE_FCRC32_CHECKSUM);
+
+	/* Copy the assembled page back into the caller's frame. */
+	memcpy_aligned<OS_FILE_LOG_BLOCK_SIZE>(src_frame, tmp_frame,
+					       srv_page_size);
+	srv_stats.pages_decrypted.inc();
+	srv_stats.n_temp_blocks_decrypted.inc();
+
+	return true;
+}
+
+/** Decrypt and, if needed, decompress a page that was read from a data file.
+@param[in,out]	bpage	Page control block
+@param[in]	node	data file
+@return whether the operation was successful */
+static bool buf_page_decrypt_after_read(buf_page_t *bpage,
+                                        const fil_node_t &node)
+{
+	ut_ad(node.space->referenced());
+	ut_ad(node.space->id == bpage->id().space());
+	const auto flags = node.space->flags;
+
+	byte* dst_frame = bpage->zip.data ? bpage->zip.data :
+		((buf_block_t*) bpage)->frame;
+	bool page_compressed = node.space->is_compressed()
+		&& buf_page_is_compressed(dst_frame, flags);
+	const page_id_t id(bpage->id());
+
+	if (id.page_no() == 0) {
+		/* File header pages are not encrypted/compressed */
+		return (true);
+	}
+
+	if (node.space->purpose == FIL_TYPE_TEMPORARY
+	    && innodb_encrypt_temporary_tables) {
+		buf_tmp_buffer_t* slot = buf_pool.io_buf_reserve();
+		ut_a(slot);
+		slot->allocate();
+
+		if (!buf_tmp_page_decrypt(slot->crypt_buf, dst_frame)) {
+			slot->release();
+			ib::error() << "Encrypted page " << id
+				    << " in file " << node.name;
+			return false;
+		}
+
+		slot->release();
+		return true;
+	}
+
+	/* Page is encrypted if encryption information is found from
+	tablespace and page contains used key_version. This is true
+	also for pages first compressed and then encrypted. */
+
+	buf_tmp_buffer_t* slot;
+	uint key_version = buf_page_get_key_version(dst_frame, flags);
+
+	if (page_compressed && !key_version) {
+		/* the page we read is unencrypted */
+		/* Find free slot from temporary memory array */
+decompress: /* also entered by goto from the final else-if below */
+		if (fil_space_t::full_crc32(flags)
+		    && buf_page_is_corrupted(true, dst_frame, flags)) {
+			return false;
+		}
+
+		slot = buf_pool.io_buf_reserve();
+		ut_a(slot);
+		slot->allocate();
+
+decompress_with_slot: /* entered after decryption, with its slot still held */
+		ut_d(fil_page_type_validate(node.space, dst_frame));
+
+		ulint write_size = fil_page_decompress(
+			slot->crypt_buf, dst_frame, flags);
+		slot->release();
+		ut_ad(!write_size
+		      || fil_page_type_validate(node.space, dst_frame));
+		ut_ad(node.space->referenced());
+		return write_size != 0;
+	}
+
+	if (key_version && node.space->crypt_data) {
+		/* Verify encryption checksum before we even try to
+		decrypt. */
+		if (!buf_page_verify_crypt_checksum(dst_frame, flags)) {
+decrypt_failed: /* also the target when fil_space_decrypt() fails */
+			ib::error() << "Encrypted page " << id
+				    << " in file " << node.name
+				    << " looks corrupted; key_version="
+				    << key_version;
+			return false;
+		}
+
+		slot = buf_pool.io_buf_reserve();
+		ut_a(slot);
+		slot->allocate();
+		ut_d(fil_page_type_validate(node.space, dst_frame));
+
+		/* decrypt using crypt_buf to dst_frame */
+		if (!fil_space_decrypt(node.space, slot->crypt_buf, dst_frame)) {
+			slot->release();
+			goto decrypt_failed;
+		}
+
+		ut_d(fil_page_type_validate(node.space, dst_frame));
+
+		if ((fil_space_t::full_crc32(flags) && page_compressed)
+		    || fil_page_get_type(dst_frame)
+		    == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) {
+			goto decompress_with_slot;
+		}
+
+		slot->release();
+	} else if (fil_page_get_type(dst_frame)
+		   == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) {
+		goto decompress;
+	}
+
+	ut_ad(node.space->referenced());
+	return true;
+}
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** Test whether a page carries a valid crc32 checksum.
+@param[in]	read_buf		database page
+@param[in]	checksum_field1		checksum stored in the "new" field
+@param[in]	checksum_field2		checksum stored in the "old" field
+@return whether both fields equal buf_calc_page_crc32(read_buf) */
+bool
+buf_page_is_checksum_valid_crc32(
+	const byte*			read_buf,
+	ulint				checksum_field1,
+	ulint				checksum_field2)
+{
+	const uint32_t	expected = buf_calc_page_crc32(read_buf);
+
+#ifdef UNIV_INNOCHECKSUM
+	if (log_file
+	    && srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32) {
+		fprintf(log_file, "page::" UINT32PF ";"
+			" crc32 calculated = " UINT32PF ";"
+			" recorded checksum field1 = " ULINTPF " recorded"
+			" checksum field2 =" ULINTPF "\n", cur_page_num,
+			expected, checksum_field1, checksum_field2);
+	}
+#endif /* UNIV_INNOCHECKSUM */
+
+	/* The two stored fields must agree with each other and with the
+	checksum computed over the page. */
+	const bool fields_agree = checksum_field1 == checksum_field2;
+
+	return fields_agree && checksum_field1 == expected;
+}
+
+/** Test whether a page carries valid checksums of the "innodb" algorithm.
+@param[in]	read_buf	database page
+@param[in]	checksum_field1	checksum stored in the "new" field
+@param[in]	checksum_field2	checksum stored in the "old" field
+@return whether both fields are acceptable for the innodb format */
+bool
+buf_page_is_checksum_valid_innodb(
+	const byte*			read_buf,
+	ulint				checksum_field1,
+	ulint				checksum_field2)
+{
+	/* The old checksum field (checksum_field2) written with
+	algo=innodb may legitimately hold either of two values:
+
+	1. The 4 bytes found at FIL_PAGE_LSN: very old versions of
+	InnoDB only stored the 8 byte lsn to the start and the end
+	of the page.
+
+	2. The old formula checksum (buf_calc_page_old_checksum()),
+	as stored by newer InnoDB versions. */
+
+	const ulint	calc_old = buf_calc_page_old_checksum(read_buf);
+	const ulint	calc_new = buf_calc_page_new_checksum(read_buf);
+
+#ifdef UNIV_INNOCHECKSUM
+	if (log_file
+	    && srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_INNODB) {
+		fprintf(log_file, "page::" UINT32PF ";"
+			" old style: calculated ="
+			" " ULINTPF "; recorded = " ULINTPF "\n",
+			cur_page_num, calc_old,
+			checksum_field2);
+		fprintf(log_file, "page::" UINT32PF ";"
+			" new style: calculated ="
+			" " ULINTPF "; crc32 = " UINT32PF "; recorded = " ULINTPF "\n",
+			cur_page_num, calc_new,
+			buf_calc_page_crc32(read_buf), checksum_field1);
+	}
+
+	if (log_file
+	    && srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB) {
+		fprintf(log_file, "page::" UINT32PF ";"
+			" old style: calculated ="
+			" " ULINTPF "; recorded checksum = " ULINTPF "\n",
+			cur_page_num, calc_old,
+			checksum_field2);
+		fprintf(log_file, "page::" UINT32PF ";"
+			" new style: calculated ="
+			" " ULINTPF "; recorded checksum  = " ULINTPF "\n",
+			cur_page_num, calc_new,
+			checksum_field1);
+	}
+#endif /* UNIV_INNOCHECKSUM */
+
+	const ulint	lsn_word = mach_read_from_4(read_buf + FIL_PAGE_LSN);
+
+	/* Old field: accept either the LSN word or the old formula. */
+	if (!(checksum_field2 == lsn_word || checksum_field2 == calc_old)) {
+		DBUG_LOG("checksum",
+			 "Page checksum crc32 not valid"
+			 << " field1 " << checksum_field1
+			 << " field2 " << checksum_field2
+			 << " crc32 " << calc_old
+			 << " lsn " << lsn_word);
+		return false;
+	}
+
+	/* The old field is fine; now check the new field.
+	InnoDB versions < 4.0.14 and < 4.1.1 stored the space id
+	(always equal to 0), to FIL_PAGE_SPACE_OR_CHKSUM, so a zero
+	is accepted as well. */
+	const bool new_field_ok = checksum_field1 == 0
+		|| checksum_field1 == calc_new;
+	if (!new_field_ok) {
+		DBUG_LOG("checksum",
+			 "Page checksum crc32 not valid"
+			 << " field1 " << checksum_field1
+			 << " field2 " << checksum_field2
+			 << " crc32 " << calc_new
+			 << " lsn " << lsn_word);
+		return false;
+	}
+
+	return true;
+}
+
+/** Test whether a page is marked as having no checksum.
+@param[in]	read_buf	database page
+@param[in]	checksum_field1	checksum stored in the "new" field
+@param[in]	checksum_field2	checksum stored in the "old" field
+@return whether both fields contain BUF_NO_CHECKSUM_MAGIC */
+bool
+buf_page_is_checksum_valid_none(
+	const byte*			read_buf,
+	ulint				checksum_field1,
+	ulint				checksum_field2)
+{
+	const bool	fields_equal = checksum_field1 == checksum_field2;
+	const bool	is_magic = checksum_field1 == BUF_NO_CHECKSUM_MAGIC;
+#ifndef DBUG_OFF
+	if (!fields_equal && !is_magic) {
+		DBUG_LOG("checksum",
+			 "Page checksum crc32 not valid"
+			 << " field1 " << checksum_field1
+			 << " field2 " << checksum_field2
+			 << " crc32 " << BUF_NO_CHECKSUM_MAGIC
+			 << " lsn " << mach_read_from_4(read_buf
+							+ FIL_PAGE_LSN));
+	}
+#endif /* !DBUG_OFF */
+
+#ifdef UNIV_INNOCHECKSUM
+	if (log_file
+	    && srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_NONE) {
+		fprintf(log_file,
+			"page::" UINT32PF "; none checksum: calculated"
+			" = %lu; recorded checksum_field1 = " ULINTPF
+			" recorded checksum_field2 = " ULINTPF "\n",
+			cur_page_num, BUF_NO_CHECKSUM_MAGIC,
+			checksum_field1, checksum_field2);
+	}
+#endif /* UNIV_INNOCHECKSUM */
+
+	return fields_equal && is_magic;
+}
+
+/** Checks whether the lsn present in the page is lesser than the
+peek current lsn.
+@param[in]	check_lsn	lsn to check
+@param[in]	read_buf	page. */
+static void buf_page_check_lsn(bool check_lsn, const byte* read_buf)
+{
+#ifndef UNIV_INNOCHECKSUM
+	if (check_lsn && recv_lsn_checks_on) {
+		const lsn_t current_lsn = log_sys.get_lsn();
+		const lsn_t	page_lsn
+			= mach_read_from_8(read_buf + FIL_PAGE_LSN);
+
+		/* Since we are going to reset the page LSN during the import
+		phase it makes no sense to spam the log with error messages. */
+		if (current_lsn < page_lsn) {
+
+			const uint32_t space_id = mach_read_from_4(
+				read_buf + FIL_PAGE_SPACE_ID);
+			const uint32_t page_no = mach_read_from_4(
+				read_buf + FIL_PAGE_OFFSET);
+
+			ib::error() << "Page " << page_id_t(space_id, page_no)
+				<< " log sequence number " << page_lsn
+				<< " is in the future! Current system"
+				<< " log sequence number "
+				<< current_lsn << ".";
+
+			ib::error() << "Your database may be corrupt or"
+				" you may have copied the InnoDB"
+				" tablespace but not the InnoDB"
+				" log files. "
+				<< FORCE_RECOVERY_MSG;
+
+		}
+	}
+#endif /* !UNIV_INNOCHECKSUM */
+}
+
+
+/** Determine whether every byte of a buffer is NUL.
+@param[in]	buf	bytes to inspect, at most UNIV_PAGE_SIZE_MAX of them
+@return true if buf compares equal to the start of field_ref_zero */
+bool buf_is_zeroes(span<const byte> buf)
+{
+  ut_ad(UNIV_PAGE_SIZE_MAX >= buf.size());
+  return !memcmp(field_ref_zero, buf.data(), buf.size());
+}
+
+/** Check if a page is corrupt.
+@param[in]	check_lsn	whether the LSN should be checked
+@param[in]	read_buf	database page
+@param[in]	fsp_flags	tablespace flags; they select full_crc32
+				or ROW_FORMAT=COMPRESSED checking
+@return whether the page is corrupted */
+bool
+buf_page_is_corrupted(
+	bool			check_lsn,
+	const byte*		read_buf,
+	ulint			fsp_flags)
+{
+#ifndef UNIV_INNOCHECKSUM
+	DBUG_EXECUTE_IF("buf_page_import_corrupt_failure", return(true); );
+#endif /* !UNIV_INNOCHECKSUM */
+	if (fil_space_t::full_crc32(fsp_flags)) {
+		bool compressed = false, corrupted = false;
+		const uint size = buf_page_full_crc32_size(
+			read_buf, &compressed, &corrupted);
+		if (corrupted) {
+			return true;
+		}
+		const byte* end = read_buf + (size - FIL_PAGE_FCRC32_CHECKSUM);
+		uint crc32 = mach_read_from_4(end); /* stored checksum */
+
+		if (!crc32 && size == srv_page_size
+		    && buf_is_zeroes(span<const byte>(read_buf, size))) {
+			return false;
+		}
+
+		DBUG_EXECUTE_IF(
+			"page_intermittent_checksum_mismatch", {
+			static int page_counter;
+			if (page_counter++ == 2) {
+				crc32++;
+			}
+		});
+
+		if (crc32 != ut_crc32(read_buf,
+				      size - FIL_PAGE_FCRC32_CHECKSUM)) {
+			return true;
+		}
+		static_assert(FIL_PAGE_FCRC32_KEY_VERSION == 0, "alignment");
+		static_assert(FIL_PAGE_LSN % 4 == 0, "alignment");
+		static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment");
+		if (!compressed
+		    && !mach_read_from_4(FIL_PAGE_FCRC32_KEY_VERSION
+					 + read_buf)
+		    && memcmp_aligned<4>(read_buf + (FIL_PAGE_LSN + 4),
+					 end - (FIL_PAGE_FCRC32_END_LSN
+						- FIL_PAGE_FCRC32_CHECKSUM),
+					 4)) {
+			return true;
+		}
+
+		buf_page_check_lsn(check_lsn, read_buf);
+		return false;
+	}
+
+	size_t		checksum_field1 = 0;
+	size_t		checksum_field2 = 0;
+	uint32_t	crc32 = 0;
+	bool		crc32_inited = false;
+	bool		crc32_chksum = false;
+	const ulint zip_size = fil_space_t::zip_size(fsp_flags);
+	const uint16_t page_type = fil_page_get_type(read_buf);
+
+	/* We can trust page type if page compression is set on tablespace
+	flags because page compression flag means file must have been
+	created with 10.1 (later than 5.5 code base). In 10.1 page
+	compressed tables do not contain post compression checksum and
+	FIL_PAGE_END_LSN_OLD_CHKSUM field stored. Note that space can
+	be null if we are in fil_check_first_page() and first page
+	is not compressed or encrypted. Page checksum is verified
+	after decompression (i.e. normally pages are already
+	decompressed at this stage). */
+	if ((page_type == FIL_PAGE_PAGE_COMPRESSED ||
+	     page_type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED)
+#ifndef UNIV_INNOCHECKSUM
+	    && FSP_FLAGS_HAS_PAGE_COMPRESSION(fsp_flags)
+#endif
+	) {
+		return(false);
+	}
+
+	static_assert(FIL_PAGE_LSN % 4 == 0, "alignment");
+	static_assert(FIL_PAGE_END_LSN_OLD_CHKSUM % 4 == 0, "alignment");
+
+	if (!zip_size
+	    && memcmp_aligned<4>(read_buf + FIL_PAGE_LSN + 4,
+				 read_buf + srv_page_size
+				 - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) {
+		/* Stored log sequence numbers at the start and the end
+		of page do not match */
+
+		return(true);
+	}
+
+	buf_page_check_lsn(check_lsn, read_buf);
+
+	/* Check whether the checksum fields have correct values */
+
+	const srv_checksum_algorithm_t curr_algo =
+		static_cast<srv_checksum_algorithm_t>(srv_checksum_algorithm);
+
+	if (curr_algo == SRV_CHECKSUM_ALGORITHM_NONE) {
+		return(false);
+	}
+
+	if (zip_size) {
+		return !page_zip_verify_checksum(read_buf, zip_size);
+	}
+
+	checksum_field1 = mach_read_from_4(
+		read_buf + FIL_PAGE_SPACE_OR_CHKSUM);
+
+	checksum_field2 = mach_read_from_4(
+		read_buf + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM);
+
+	static_assert(FIL_PAGE_LSN % 8 == 0, "alignment");
+
+	/* A page filled with NUL bytes is considered not corrupted.
+	Before MariaDB Server 10.1.25 (MDEV-12113) or 10.2.2 (or MySQL 5.7),
+	the FIL_PAGE_FILE_FLUSH_LSN field may have been written nonzero
+	for the first page of each file of the system tablespace.
+	We want to ignore it for the system tablespace, but because
+	we do not know the expected tablespace here, we ignore the
+	field for all data files, except for
+	innodb_checksum_algorithm=full_crc32 which we handled above. */
+	if (!checksum_field1 && !checksum_field2) {
+		/* Checksum fields can have valid value as zero.
+		If the page is not empty then do the checksum
+		calculation for the page. */
+		bool all_zeroes = true;
+		for (size_t i = 0; i < srv_page_size; i++) {
+#ifndef UNIV_INNOCHECKSUM
+			if (i == FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION) {
+				i += 8; /* skip the 8 bytes of this field */
+			}
+#endif
+			if (read_buf[i]) {
+				all_zeroes = false;
+				break;
+			}
+		}
+
+		if (all_zeroes) {
+			return false;
+		}
+	}
+
+	switch (curr_algo) {
+	case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
+	case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+		return !buf_page_is_checksum_valid_crc32(
+			read_buf, checksum_field1, checksum_field2);
+	case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+		return !buf_page_is_checksum_valid_innodb(
+			read_buf, checksum_field1, checksum_field2);
+	case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+		return !buf_page_is_checksum_valid_none(
+			read_buf, checksum_field1, checksum_field2);
+	case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
+	case SRV_CHECKSUM_ALGORITHM_CRC32:
+	case SRV_CHECKSUM_ALGORITHM_INNODB:
+		if (buf_page_is_checksum_valid_none(read_buf,
+			checksum_field1, checksum_field2)) {
+#ifdef UNIV_INNOCHECKSUM
+			if (log_file) {
+				fprintf(log_file, "page::" UINT32PF ";"
+					" old style: calculated = %u;"
+					" recorded = " ULINTPF ";\n",
+					cur_page_num,
+					buf_calc_page_old_checksum(read_buf),
+					checksum_field2);
+				fprintf(log_file, "page::" UINT32PF ";"
+					" new style: calculated = " UINT32PF ";"
+					" crc32 = " UINT32PF "; recorded = " ULINTPF ";\n",
+					cur_page_num,
+					buf_calc_page_new_checksum(read_buf),
+					buf_calc_page_crc32(read_buf),
+					checksum_field1);
+			}
+#endif /* UNIV_INNOCHECKSUM */
+			return false;
+		}
+
+		crc32_chksum = curr_algo == SRV_CHECKSUM_ALGORITHM_CRC32
+			|| curr_algo == SRV_CHECKSUM_ALGORITHM_FULL_CRC32;
+
+		/* Very old versions of InnoDB only stored 8 byte lsn to the
+		start and the end of the page. */
+
+		/* Since innodb_checksum_algorithm is not strict_* allow
+		any of the algos to match for the old field */
+
+		if (checksum_field2
+		    != mach_read_from_4(read_buf + FIL_PAGE_LSN)
+		    && checksum_field2 != BUF_NO_CHECKSUM_MAGIC) {
+
+			if (crc32_chksum) {
+				crc32 = buf_calc_page_crc32(read_buf);
+				crc32_inited = true;
+
+				DBUG_EXECUTE_IF(
+					"page_intermittent_checksum_mismatch", {
+					static int page_counter;
+					if (page_counter++ == 2) {
+						crc32++;
+					}
+				});
+
+				if (checksum_field2 != crc32
+				    && checksum_field2
+				       != buf_calc_page_old_checksum(read_buf)) {
+					return true;
+				}
+			} else {
+				ut_ad(curr_algo
+				      == SRV_CHECKSUM_ALGORITHM_INNODB);
+
+				if (checksum_field2
+				    != buf_calc_page_old_checksum(read_buf)) {
+					crc32 = buf_calc_page_crc32(read_buf);
+					crc32_inited = true;
+
+					if (checksum_field2 != crc32) {
+						return true;
+					}
+				}
+			}
+		}
+
+		if (checksum_field1 == 0
+		    || checksum_field1 == BUF_NO_CHECKSUM_MAGIC) {
+		} else if (crc32_chksum) { /* 0 and the magic are not verified */
+
+			if (!crc32_inited) {
+				crc32 = buf_calc_page_crc32(read_buf);
+				crc32_inited = true;
+			}
+
+			if (checksum_field1 != crc32
+			    && checksum_field1
+			    != buf_calc_page_new_checksum(read_buf)) {
+				return true;
+			}
+		} else {
+			ut_ad(curr_algo == SRV_CHECKSUM_ALGORITHM_INNODB);
+
+			if (checksum_field1
+			    != buf_calc_page_new_checksum(read_buf)) {
+
+				if (!crc32_inited) {
+					crc32 = buf_calc_page_crc32(read_buf);
+					crc32_inited = true;
+				}
+
+				if (checksum_field1 != crc32) {
+					return true;
+				}
+			}
+		}
+
+		if (crc32_inited /* exactly one field equal to crc32 is a mismatch */
+		    && ((checksum_field1 == crc32
+			 && checksum_field2 != crc32)
+			|| (checksum_field1 != crc32
+			    && checksum_field2 == crc32))) {
+			return true;
+		}
+
+		break;
+	case SRV_CHECKSUM_ALGORITHM_NONE:
+		/* should have returned false earlier */
+		break;
+	}
+
+	return false;
+}
+
+#ifndef UNIV_INNOCHECKSUM
+
+#if defined(DBUG_OFF) && defined(HAVE_MADVISE) &&  defined(MADV_DODUMP)
+/** Mark the large InnoDB buffers as dumpable to core files.
+
+A convenience function that is not called anywhere directly. It is
+kept available so that gdb (or any other debugger) can invoke it when
+all of the memory should be included in a core file.
+
+@return the sum of the madvise() return values; 0 when every call
+succeeded */
+int
+buf_madvise_do_dump()
+{
+	int	status = 0;
+
+	/* The redo log buffers; see the allocation in log_t::create() */
+	if (log_sys.buf) {
+		status += madvise(log_sys.buf, srv_log_buffer_size,
+				  MADV_DODUMP);
+		status += madvise(log_sys.flush_buf, srv_log_buffer_size,
+				  MADV_DODUMP);
+	}
+
+	/* The recovery buffer; see recv_sys_t::create() */
+	if (recv_sys.buf) {
+		status += madvise(recv_sys.buf, recv_sys.len, MADV_DODUMP);
+	}
+
+	/* Every buffer pool chunk, visited while holding buf_pool.mutex */
+	mysql_mutex_lock(&buf_pool.mutex);
+
+	const ulint	n_chunks = buf_pool.n_chunks;
+
+	for (ulint i = 0; i < n_chunks; i++) {
+		auto&	chunk = buf_pool.chunks[i];
+
+		status += madvise(chunk.mem, chunk.mem_size(), MADV_DODUMP);
+	}
+
+	mysql_mutex_unlock(&buf_pool.mutex);
+
+	return status;
+}
+#endif
+
+/** Dump a page to stderr.
+
+When UNIV_DEBUG is not defined, the raw page contents are printed first.
+After that the stored and the calculated checksums, the LSN fields, the
+page number and the space id are reported, followed by a guess of what
+kind of page this is based on its page type field.
+@param[in]	read_buf	database page
+@param[in]	zip_size	compressed page size, or 0 */
+void buf_page_print(const byte* read_buf, ulint zip_size)
+{
+	dict_index_t*	index;
+
+#ifndef UNIV_DEBUG
+	/* The raw dump is only compiled into non-debug builds. */
+	const ulint size = zip_size ? zip_size : srv_page_size;
+	ib::info() << "Page dump in ascii and hex ("
+		<< size << " bytes):";
+
+	ut_print_buf(stderr, read_buf, size);
+	fputs("\nInnoDB: End of page dump\n", stderr);
+#endif
+
+	if (zip_size) {
+		/* Print compressed page. */
+		ib::info() << "Compressed page type ("
+			<< fil_page_get_type(read_buf)
+			<< "); stored checksum in field1 "
+			<< mach_read_from_4(
+				read_buf + FIL_PAGE_SPACE_OR_CHKSUM)
+			<< "; calculated checksums for field1: "
+			<< buf_checksum_algorithm_name(
+				SRV_CHECKSUM_ALGORITHM_CRC32)
+			<< " "
+			<< page_zip_calc_checksum(
+				read_buf, zip_size,
+				SRV_CHECKSUM_ALGORITHM_CRC32)
+			<< ", "
+			<< buf_checksum_algorithm_name(
+				SRV_CHECKSUM_ALGORITHM_INNODB)
+			<< " "
+			<< page_zip_calc_checksum(
+				read_buf, zip_size,
+				SRV_CHECKSUM_ALGORITHM_INNODB)
+			<< ", "
+			<< buf_checksum_algorithm_name(
+				SRV_CHECKSUM_ALGORITHM_NONE)
+			<< " "
+			<< page_zip_calc_checksum(
+				read_buf, zip_size,
+				SRV_CHECKSUM_ALGORITHM_NONE)
+			<< "; page LSN "
+			<< mach_read_from_8(read_buf + FIL_PAGE_LSN)
+			<< "; page number (if stored to page"
+			<< " already) "
+			<< mach_read_from_4(read_buf + FIL_PAGE_OFFSET)
+			<< "; space id (if stored to page already) "
+			<< mach_read_from_4(
+				read_buf + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+	} else {
+		/* Print uncompressed page. The crc32 value is computed once
+		and reported for both checksum fields. */
+		const uint32_t	crc32 = buf_calc_page_crc32(read_buf);
+		ulint page_type = fil_page_get_type(read_buf);
+
+		/* The field1 and field2 sections use the same
+		"crc32 X, innodb Y, none Z" layout; the page type is
+		reported at the very end of the message so that it does not
+		split the list of field1 checksums. */
+		ib::info() << "Uncompressed page, stored checksum in field1 "
+			<< mach_read_from_4(
+				read_buf + FIL_PAGE_SPACE_OR_CHKSUM)
+			<< ", calculated checksums for field1: "
+			<< buf_checksum_algorithm_name(
+				SRV_CHECKSUM_ALGORITHM_CRC32) << " "
+			<< crc32
+			<< ", "
+			<< buf_checksum_algorithm_name(
+				SRV_CHECKSUM_ALGORITHM_INNODB) << " "
+			<< buf_calc_page_new_checksum(read_buf)
+			<< ", "
+			<< buf_checksum_algorithm_name(
+				SRV_CHECKSUM_ALGORITHM_NONE) << " "
+			<< BUF_NO_CHECKSUM_MAGIC
+			<< ", stored checksum in field2 "
+			<< mach_read_from_4(read_buf + srv_page_size
+					    - FIL_PAGE_END_LSN_OLD_CHKSUM)
+			<< ", calculated checksums for field2: "
+			<< buf_checksum_algorithm_name(
+				SRV_CHECKSUM_ALGORITHM_CRC32) << " "
+			<< crc32
+			<< ", "
+			<< buf_checksum_algorithm_name(
+				SRV_CHECKSUM_ALGORITHM_INNODB) << " "
+			<< buf_calc_page_old_checksum(read_buf)
+			<< ", "
+			<< buf_checksum_algorithm_name(
+				SRV_CHECKSUM_ALGORITHM_NONE) << " "
+			<< BUF_NO_CHECKSUM_MAGIC
+			<< ",  page LSN "
+			<< mach_read_from_4(read_buf + FIL_PAGE_LSN)
+			<< " "
+			<< mach_read_from_4(read_buf + FIL_PAGE_LSN + 4)
+			<< ", low 4 bytes of LSN at page end "
+			<< mach_read_from_4(read_buf + srv_page_size
+					    - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)
+			<< ", page number (if stored to page already) "
+			<< mach_read_from_4(read_buf + FIL_PAGE_OFFSET)
+			<< ", space id (if created with >= MySQL-4.1.1"
+			   " and stored already) "
+			<< mach_read_from_4(
+				read_buf + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID)
+			<< ", page type " << page_type << " == "
+			<< fil_get_page_type_name(page_type) << ".";
+	}
+
+	/* Finally, describe what the page may be, judging only by the
+	page type field. Page types not listed below print nothing. */
+	switch (fil_page_get_type(read_buf)) {
+		index_id_t	index_id;
+	case FIL_PAGE_INDEX:
+	case FIL_PAGE_TYPE_INSTANT:
+	case FIL_PAGE_RTREE:
+		index_id = btr_page_get_index_id(read_buf);
+		ib::info() << "Page may be an index page where"
+			" index id is " << index_id;
+
+		/* Name the index and its table if the id can be resolved */
+		index = dict_index_find_on_id_low(index_id);
+		if (index) {
+			ib::info()
+				<< "Index " << index_id
+				<< " is " << index->name
+				<< " in table " << index->table->name;
+		}
+		break;
+	case FIL_PAGE_UNDO_LOG:
+		fputs("InnoDB: Page may be an undo log page\n", stderr);
+		break;
+	case FIL_PAGE_INODE:
+		fputs("InnoDB: Page may be an 'inode' page\n", stderr);
+		break;
+	case FIL_PAGE_IBUF_FREE_LIST:
+		fputs("InnoDB: Page may be an insert buffer free list page\n",
+		      stderr);
+		break;
+	case FIL_PAGE_TYPE_ALLOCATED:
+		fputs("InnoDB: Page may be a freshly allocated page\n",
+		      stderr);
+		break;
+	case FIL_PAGE_IBUF_BITMAP:
+		fputs("InnoDB: Page may be an insert buffer bitmap page\n",
+		      stderr);
+		break;
+	case FIL_PAGE_TYPE_SYS:
+		fputs("InnoDB: Page may be a system page\n",
+		      stderr);
+		break;
+	case FIL_PAGE_TYPE_TRX_SYS:
+		fputs("InnoDB: Page may be a transaction system page\n",
+		      stderr);
+		break;
+	case FIL_PAGE_TYPE_FSP_HDR:
+		fputs("InnoDB: Page may be a file space header page\n",
+		      stderr);
+		break;
+	case FIL_PAGE_TYPE_XDES:
+		fputs("InnoDB: Page may be an extent descriptor page\n",
+		      stderr);
+		break;
+	case FIL_PAGE_TYPE_BLOB:
+		fputs("InnoDB: Page may be a BLOB page\n",
+		      stderr);
+		break;
+	case FIL_PAGE_TYPE_ZBLOB:
+	case FIL_PAGE_TYPE_ZBLOB2:
+		fputs("InnoDB: Page may be a compressed BLOB page\n",
+		      stderr);
+		break;
+	}
+}
+
+/** Set up a buffer block descriptor for a given page frame.
+@param[in,out]	block	buffer page descriptor to initialize
+@param[in]	frame	buffer page frame assigned to the descriptor */
+static
+void
+buf_block_init(buf_block_t* block, byte* frame)
+{
+	/* Callers are database startup and buf_pool.resize(); in both
+	cases no adaptive hash index may exist for the block. */
+	assert_block_ahi_empty_on_init(block);
+
+	/* Plain descriptor fields */
+	block->frame = frame;
+	block->modify_clock = 0;
+	block->page.init(BUF_BLOCK_NOT_USED, page_id_t(~0ULL));
+#ifdef BTR_CUR_HASH_ADAPT
+	block->index = NULL;
+#endif /* BTR_CUR_HASH_ADAPT */
+	page_zip_des_init(&block->page.zip);
+
+	/* Debug-only bookkeeping; the debug latch is heap-allocated here */
+	ut_d(block->in_unzip_LRU_list = false);
+	ut_d(block->in_withdraw_list = false);
+	ut_d(block->debug_latch = static_cast<rw_lock_t*>(
+		     ut_malloc_nokey(sizeof(rw_lock_t))));
+
+	/* Create the block latch first, then the debug latch */
+	rw_lock_create(PFS_NOT_INSTRUMENTED, &block->lock, SYNC_LEVEL_VARYING);
+	ut_d(rw_lock_create(PFS_NOT_INSTRUMENTED, block->debug_latch,
+			    SYNC_LEVEL_VARYING));
+
+	block->lock.is_block_lock = 1;
+
+	ut_ad(rw_lock_validate(&block->lock));
+}
+
+/** Allocate a chunk of buffer frames.
+
+The allocation holds the block descriptors at its start, followed by the
+page-aligned frames. Every block is initialized and appended to
+buf_pool.free, and the chunk is registered with reg().
+@param bytes    requested size
+@return whether the allocation succeeded */
+inline bool buf_pool_t::chunk_t::create(size_t bytes)
+{
+  DBUG_EXECUTE_IF("ib_buf_chunk_init_fails", return false;);
+  /* The request should already be a multiple of the page size;
+  round it down in any case. */
+  bytes= ut_2pow_round<size_t>(bytes, srv_page_size);
+
+  mem= buf_pool.allocator.allocate_large_dontdump(bytes, &mem_pfx);
+
+  if (UNIV_UNLIKELY(!mem))
+    return false;
+
+  MEM_UNDEFINED(mem, mem_size());
+
+#ifdef HAVE_LIBNUMA
+  if (srv_numa_interleave)
+  {
+    struct bitmask *numa_mems_allowed= numa_get_mems_allowed();
+    if (mbind(mem, mem_size(), MPOL_INTERLEAVE,
+              numa_mems_allowed->maskp, numa_mems_allowed->size,
+              MPOL_MF_MOVE))
+    {
+      ib::warn() << "Failed to set NUMA memory policy of"
+              " buffer pool page frames to MPOL_INTERLEAVE"
+              " (error: " << strerror(errno) << ").";
+    }
+    numa_bitmask_free(numa_mems_allowed);
+  }
+#endif /* HAVE_LIBNUMA */
+
+  /* The block descriptors live at the start of the allocation. */
+  blocks= reinterpret_cast<buf_block_t*>(mem);
+
+  /* Find the first page-aligned address at or after mem. If
+  opt_large_page_size is smaller than srv_page_size (unlikely on any
+  hardware, given that srv_page_size is at most 64k), one block fewer
+  than requested may be allocated; if it is bigger, more blocks than
+  requested may be allocated. */
+  static_assert(sizeof(byte*) == sizeof(ulint), "pointer size");
+
+  const ulint frame_mask= ~ulint{srv_page_size - 1};
+  byte *frame= reinterpret_cast<byte*>
+    ((reinterpret_cast<ulint>(mem) + srv_page_size - 1) & frame_mask);
+  size= (mem_pfx.m_size >> srv_page_size_shift) - (frame != mem);
+
+  /* Give up frames from the front until the descriptor array
+  no longer overlaps the first frame. */
+  ulint n_blocks= size;
+
+  for (; frame < reinterpret_cast<const byte*>(blocks + n_blocks); n_blocks--)
+    frame+= srv_page_size;
+
+  size= n_blocks;
+
+  /* Initialize each descriptor with its frame and put it on
+  the free list. */
+  for (buf_block_t *block= blocks, *const end= blocks + size;
+       block != end; block++)
+  {
+    buf_block_init(block, frame);
+    MEM_UNDEFINED(block->frame, srv_page_size);
+    UT_LIST_ADD_LAST(buf_pool.free, &block->page);
+
+    ut_d(block->page.in_free_list = TRUE);
+    frame+= srv_page_size;
+  }
+
+  reg();
+
+  return true;
+}
+
+#ifdef UNIV_DEBUG
+/** Check that all file pages in the buffer chunk are in a replaceable state.
+@return address of a non-free block
+@retval nullptr if all freed */
+inline const buf_block_t *buf_pool_t::chunk_t::not_freed() const
+{
+  buf_block_t *const end= blocks + size;
+
+  for (buf_block_t *block= blocks; block != end; block++)
+  {
+    const auto state= block->page.state();
+
+    if (state == BUF_BLOCK_ZIP_PAGE)
+    {
+      /* ROW_FORMAT=COMPRESSED block descriptors must never be
+      found in the uncompressed buffer pool. */
+      ut_error;
+    }
+
+    /* Only blocks that hold file pages need to be examined. */
+    if (state != BUF_BLOCK_FILE_PAGE)
+      continue;
+
+    const lsn_t lsn= block->page.oldest_modification();
+
+    if (srv_read_only_mode)
+    {
+      /* In read-only mode the page cleaner is disabled and nothing
+      can be dirtied, so every page has to be clean. */
+      ut_ad(lsn == 0 || lsn == recv_sys.recovered_lsn ||
+            srv_force_recovery == SRV_FORCE_NO_LOG_REDO);
+      ut_ad(!block->page.buf_fix_count());
+      ut_ad(block->page.io_fix() == BUF_IO_NONE);
+      continue;
+    }
+
+    if (fsp_is_system_temporary(block->page.id().space()))
+    {
+      ut_ad(lsn == 0 || lsn == 2);
+      continue;
+    }
+
+    if (lsn > 1 || !block->page.can_relocate())
+      return block;
+  }
+
+  return nullptr;
+}
+#endif /* UNIV_DEBUG */
+
+/** Release the latches that belong to a buffer pool block descriptor.
+In debug builds this also frees the separately allocated debug latch.
+@param[in,out]	block	buffer pool block descriptor */
+static void buf_block_free_mutexes(buf_block_t* block)
+{
+	rw_lock_free(&block->lock);
+	ut_d(rw_lock_free(block->debug_latch); ut_free(block->debug_latch));
+}
+
+/** Create the hash table.
+The cell array is zero-filled and aligned to CPU_LEVEL1_DCACHE_LINESIZE.
+@param n  the lower bound of n_cells */
+void buf_pool_t::page_hash_table::create(ulint n)
+{
+  n_cells= ut_find_prime(n);
+  const size_t n_bytes= sizeof *array * pad(n_cells);
+  array= static_cast<hash_cell_t*>
+    (memset(aligned_malloc(n_bytes, CPU_LEVEL1_DCACHE_LINESIZE), 0, n_bytes));
+}
+
+/** Create the buffer pool.
+
+Allocates field_ref_zero, the chunk array and every chunk, then
+initializes the lists, mutexes, condition variables, hash tables and
+size-related variables. If a chunk cannot be created, everything
+allocated so far is released again.
+@return whether the creation failed (true on failure, false on success) */
+bool buf_pool_t::create()
+{
+  ut_ad(this == &buf_pool);
+  ut_ad(srv_buf_pool_size % srv_buf_pool_chunk_unit == 0);
+  ut_ad(!is_initialised());
+  ut_ad(srv_buf_pool_size > 0);
+  ut_ad(!resizing);
+  ut_ad(!chunks_old);
+  ut_ad(!field_ref_zero);
+
+  NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE;
+
+  /* field_ref_zero is a zero-filled buffer of UNIV_PAGE_SIZE_MAX bytes,
+  aligned to 4096 bytes. Fail right away if it cannot be allocated. */
+  if (auto b= aligned_malloc(UNIV_PAGE_SIZE_MAX, 4096))
+    field_ref_zero= static_cast<const byte*>
+      (memset_aligned<4096>(b, 0, UNIV_PAGE_SIZE_MAX));
+  else
+    return true;
+
+  chunk_t::map_reg= UT_NEW_NOKEY(chunk_t::map());
+
+  /* Construct the allocator in place */
+  new(&allocator) ut_allocator<unsigned char>(mem_key_buf_buf_pool);
+
+  n_chunks= srv_buf_pool_size / srv_buf_pool_chunk_unit;
+  const size_t chunk_size= srv_buf_pool_chunk_unit;
+
+  chunks= static_cast<chunk_t*>(ut_zalloc_nokey(n_chunks * sizeof *chunks));
+  UT_LIST_INIT(free, &buf_page_t::list);
+  curr_size= 0;
+  auto chunk= chunks;
+
+  /* Create the chunks one by one; curr_size accumulates the number of
+  blocks. The loop body runs at least once. */
+  do
+  {
+    if (!chunk->create(chunk_size))
+    {
+      /* Roll back: release the chunks that were created before the
+      failing one (newest first), then the remaining allocations
+      made above. */
+      while (--chunk >= chunks)
+      {
+        buf_block_t* block= chunk->blocks;
+
+        for (auto i= chunk->size; i--; block++)
+          buf_block_free_mutexes(block);
+
+        allocator.deallocate_large_dodump(chunk->mem, &chunk->mem_pfx);
+      }
+      ut_free(chunks);
+      chunks= nullptr;
+      UT_DELETE(chunk_t::map_reg);
+      chunk_t::map_reg= nullptr;
+      aligned_free(const_cast<byte*>(field_ref_zero));
+      field_ref_zero= nullptr;
+      ut_ad(!is_initialised());
+      return true;
+    }
+
+    curr_size+= chunk->size;
+  }
+  while (++chunk < chunks + n_chunks);
+
+  ut_ad(is_initialised());
+  mysql_mutex_init(buf_pool_mutex_key, &mutex, MY_MUTEX_INIT_FAST);
+
+  UT_LIST_INIT(LRU, &buf_page_t::LRU);
+  UT_LIST_INIT(withdraw, &buf_page_t::list);
+  withdraw_target= 0;
+  UT_LIST_INIT(flush_list, &buf_page_t::list);
+  UT_LIST_INIT(unzip_LRU, &buf_block_t::unzip_LRU);
+
+  for (size_t i= 0; i < UT_ARR_SIZE(zip_free); ++i)
+    UT_LIST_INIT(zip_free[i], &buf_buddy_free_t::list);
+  /* Derive read_ahead_area from the pool size in blocks; it is
+  capped at READ_AHEAD_PAGES. */
+  ulint s= curr_size;
+  old_size= s;
+  s/= BUF_READ_AHEAD_PORTION;
+  read_ahead_area= s >= READ_AHEAD_PAGES
+    ? READ_AHEAD_PAGES
+    : my_round_up_to_next_power(static_cast<uint32_t>(s));
+  curr_pool_size= srv_buf_pool_size;
+
+  n_chunks_new= n_chunks;
+
+  /* Both hash tables are created with twice the number of blocks
+  as the lower bound for the number of cells. */
+  page_hash.create(2 * curr_size);
+  zip_hash.create(2 * curr_size);
+  last_printout_time= time(NULL);
+
+  mysql_mutex_init(flush_list_mutex_key, &flush_list_mutex,
+                   MY_MUTEX_INIT_FAST);
+
+  pthread_cond_init(&done_flush_LRU, nullptr);
+  pthread_cond_init(&done_flush_list, nullptr);
+  pthread_cond_init(&do_flush_list, nullptr);
+  pthread_cond_init(&done_free, nullptr);
+
+  try_LRU_scan= true;
+
+  /* Debug builds: record which mutex is associated with each
+  of these list iterators */
+  ut_d(flush_hp.m_mutex= &flush_list_mutex;);
+  ut_d(lru_hp.m_mutex= &mutex);
+  ut_d(lru_scan_itr.m_mutex= &mutex);
+
+  io_buf.create((srv_n_read_io_threads + srv_n_write_io_threads) *
+                OS_AIO_N_PENDING_IOS_PER_THREAD);
+
+  /* FIXME: remove some of these variables */
+  srv_buf_pool_curr_size= curr_pool_size;
+  srv_buf_pool_old_size= srv_buf_pool_size;
+  srv_buf_pool_base_size= srv_buf_pool_size;
+
+  last_activity_count= srv_get_activity_count();
+
+  /* Publish the chunk map that was filled during chunk creation */
+  chunk_t::map_ref= chunk_t::map_reg;
+  buf_LRU_old_ratio_update(100 * 3 / 8, false);
+  btr_search_sys_create();
+  ut_ad(is_initialised());
+  return false;
+}
+
+/** Clean up after successful create().
+Does nothing if the buffer pool is not initialised. */
+void buf_pool_t::close()
+{
+  ut_ad(this == &buf_pool);
+  if (!is_initialised())
+    return;
+
+  mysql_mutex_destroy(&mutex);
+  mysql_mutex_destroy(&flush_list_mutex);
+
+  /* Walk the LRU list from its tail, freeing every page descriptor
+  that is not embedded in a block (state other than
+  BUF_BLOCK_FILE_PAGE). */
+  buf_page_t *bpage= UT_LIST_GET_LAST(LRU);
+
+  while (bpage)
+  {
+    buf_page_t *const prev_bpage= UT_LIST_GET_PREV(LRU, bpage);
+    ut_ad(bpage->in_file());
+    ut_ad(bpage->in_LRU_list);
+    /* A normal shutdown leaves the buffer pool clean. Changes may
+    only be discarded on aborted startup (with recovery) or with
+    innodb_fast_shutdown=2. */
+    ut_d(const lsn_t oldest= bpage->oldest_modification();)
+    ut_ad(fsp_is_system_temporary(bpage->id().space())
+          ? (oldest == 0 || oldest == 2)
+          : oldest <= 1 || srv_is_being_started || srv_fast_shutdown == 2);
+
+    if (bpage->state() != BUF_BLOCK_FILE_PAGE)
+      buf_page_free_descriptor(bpage);
+
+    bpage= prev_bpage;
+  }
+
+  /* Release the chunks, last one first. */
+  for (ulint n= n_chunks; n--; )
+  {
+    chunk_t &chunk= chunks[n];
+    buf_block_t *block= chunk.blocks;
+
+    for (auto i= chunk.size; i--; block++)
+      buf_block_free_mutexes(block);
+
+    allocator.deallocate_large_dodump(chunk.mem, &chunk.mem_pfx);
+  }
+
+  pthread_cond_destroy(&done_flush_LRU);
+  pthread_cond_destroy(&done_flush_list);
+  pthread_cond_destroy(&do_flush_list);
+  pthread_cond_destroy(&done_free);
+
+  ut_free(chunks);
+  chunks= nullptr;
+  page_hash.free();
+  zip_hash.free();
+
+  io_buf.close();
+  UT_DELETE(chunk_t::map_reg);
+  chunk_t::map_reg= chunk_t::map_ref= nullptr;
+  aligned_free(const_cast<byte*>(field_ref_zero));
+  field_ref_zero= nullptr;
+}
+
+/** Try to reallocate a control block.
+
+The caller must hold buf_pool.mutex. A block is taken from the free
+list; if the page in the given block can be relocated, its frame and
+descriptor are moved to the new block and the old block is freed.
+Otherwise the new block is returned to the free list unchanged.
+@param block  control block to reallocate
+@return whether a free block was available; note that true is also
+returned when the page could not be relocated */
+inline bool buf_pool_t::realloc(buf_block_t *block)
+{
+	buf_block_t*	new_block;
+
+	mysql_mutex_assert_owner(&mutex);
+	ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);
+
+	new_block = buf_LRU_get_free_only();
+
+	if (new_block == NULL) {
+		return(false); /* free list was not enough */
+	}
+
+	/* Hold the page hash latch of this page id exclusively
+	for the duration of the relocation */
+	const page_id_t id(block->page.id());
+	page_hash_latch* hash_lock = hash_lock_get(id);
+	hash_lock->write_lock();
+
+	if (block->page.can_relocate()) {
+		/* Copy the page frame, then copy-construct the page
+		descriptor while holding flush_list_mutex */
+		memcpy_aligned<OS_FILE_LOG_BLOCK_SIZE>(
+			new_block->frame, block->frame, srv_page_size);
+		mysql_mutex_lock(&buf_pool.flush_list_mutex);
+		new (&new_block->page) buf_page_t(block->page);
+
+		/* relocate LRU list */
+		if (buf_page_t*	prev_b = buf_pool.LRU_remove(&block->page)) {
+			UT_LIST_INSERT_AFTER(LRU, prev_b, &new_block->page);
+		} else {
+			UT_LIST_ADD_FIRST(LRU, &new_block->page);
+		}
+
+		if (LRU_old == &block->page) {
+			LRU_old = &new_block->page;
+		}
+
+		ut_ad(new_block->page.in_LRU_list);
+
+		/* relocate unzip_LRU list */
+		if (block->page.zip.data != NULL) {
+			ut_ad(block->in_unzip_LRU_list);
+			ut_d(new_block->in_unzip_LRU_list = true);
+
+			buf_block_t*	prev_block = UT_LIST_GET_PREV(unzip_LRU, block);
+			UT_LIST_REMOVE(unzip_LRU, block);
+
+			/* Detach the compressed page from the old block */
+			ut_d(block->in_unzip_LRU_list = false);
+			block->page.zip.data = NULL;
+			page_zip_set_size(&block->page.zip, 0);
+
+			if (prev_block != NULL) {
+				UT_LIST_INSERT_AFTER(unzip_LRU, prev_block, new_block);
+			} else {
+				UT_LIST_ADD_FIRST(unzip_LRU, new_block);
+			}
+		} else {
+			ut_ad(!block->in_unzip_LRU_list);
+			ut_d(new_block->in_unzip_LRU_list = false);
+		}
+
+		/* relocate page_hash */
+		ut_ad(block->page.in_page_hash);
+		ut_ad(new_block->page.in_page_hash);
+		const ulint fold = id.fold();
+		ut_ad(&block->page == page_hash_get_low(id, fold));
+		ut_d(block->page.in_page_hash = false);
+		HASH_REPLACE(buf_page_t, hash, &page_hash, fold,
+			     &block->page, &new_block->page);
+
+		/* Overwrite the page number and space id stored in the
+		old frame with 0xff bytes, then mark the old frame as
+		undefined for memory checkers */
+		buf_block_modify_clock_inc(block);
+		static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
+		memset_aligned<4>(block->frame + FIL_PAGE_OFFSET, 0xff, 4);
+		static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2,
+			      "not perfect alignment");
+		memset_aligned<2>(block->frame
+				  + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4);
+		MEM_UNDEFINED(block->frame, srv_page_size);
+		block->page.set_state(BUF_BLOCK_REMOVE_HASH);
+		/* The flush list relocation is skipped for pages of the
+		temporary tablespace */
+		if (!fsp_is_system_temporary(id.space())) {
+			buf_flush_relocate_on_flush_list(&block->page,
+							 &new_block->page);
+		}
+		mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+		block->page.set_corrupt_id();
+
+		/* set other flags of buf_block_t */
+
+#ifdef BTR_CUR_HASH_ADAPT
+		/* This code should only be executed by resize(),
+		while the adaptive hash index is disabled. */
+		assert_block_ahi_empty(block);
+		assert_block_ahi_empty_on_init(new_block);
+		ut_ad(!block->index);
+		new_block->index	= NULL;
+		new_block->n_hash_helps	= 0;
+		new_block->n_fields	= 1;
+		new_block->left_side	= TRUE;
+#endif /* BTR_CUR_HASH_ADAPT */
+		/* Debug builds only: change the state of the old block
+		before it is freed below */
+		ut_d(block->page.set_state(BUF_BLOCK_MEMORY));
+		/* free block */
+		new_block = block;
+	}
+
+	hash_lock->write_unlock();
+	/* Frees the old block after a relocation, or else gives the
+	unused new block back */
+	buf_LRU_block_free_non_file_page(new_block);
+	return(true); /* free_list was enough */
+}
+
+/** Format a message into the global buffer that feeds MySQL's
+innodb_buffer_pool_resize_status, and also write it to the log at
+info level. The format string and the arguments that follow it are
+interpreted like those of printf(3); output that does not fit into the
+buffer is truncated by vsnprintf().
+@param[in]	fmt	format
+@param[in]	...	extra parameters according to fmt */
+static
+void
+buf_resize_status(
+	const char*	fmt,
+	...)
+{
+	auto&	status = export_vars.innodb_buffer_pool_resize_status;
+	va_list	args;
+
+	va_start(args, fmt);
+	vsnprintf(status, sizeof status, fmt, args);
+	va_end(args);
+
+	ib::info() << status;
+}
+
+/** Withdraw blocks from the buffer pool until meeting withdraw_target.
+
+Each round moves withdrawable blocks from the free list to the withdraw
+list, flushes LRU pages if that was not enough, and relocates pages and
+compressed buddies that reside in the area to be withdrawn. After 10
+rounds without reaching the target, the function gives up for now.
+@return whether retry is needed */
+inline bool buf_pool_t::withdraw_blocks()
+{
+	buf_block_t*	block;
+	ulint		loop_count = 0;
+
+	ib::info() << "start to withdraw the last "
+		<< withdraw_target << " blocks";
+
+	/* Minimize zip_free[i] lists */
+	mysql_mutex_lock(&mutex);
+	buf_buddy_condense_free();
+	mysql_mutex_unlock(&mutex);
+
+	while (UT_LIST_GET_LEN(withdraw) < withdraw_target) {
+
+		/* try to withdraw from free_list */
+		ulint	count1 = 0;
+
+		mysql_mutex_lock(&mutex);
+		block = reinterpret_cast<buf_block_t*>(
+			UT_LIST_GET_FIRST(free));
+		while (block != NULL
+		       && UT_LIST_GET_LEN(withdraw) < withdraw_target) {
+			ut_ad(block->page.in_free_list);
+			ut_ad(!block->page.oldest_modification());
+			ut_ad(!block->page.in_LRU_list);
+			ut_a(!block->page.in_file());
+
+			/* Fetch the successor first, because the block
+			may be removed from the free list below */
+			buf_block_t*	next_block;
+			next_block = reinterpret_cast<buf_block_t*>(
+				UT_LIST_GET_NEXT(
+					list, &block->page));
+
+			if (will_be_withdrawn(block->page)) {
+				/* This should be withdrawn */
+				UT_LIST_REMOVE(free, &block->page);
+				UT_LIST_ADD_LAST(withdraw, &block->page);
+				ut_d(block->in_withdraw_list = true);
+				count1++;
+			}
+
+			block = next_block;
+		}
+		mysql_mutex_unlock(&mutex);
+
+		/* reserve free_list length */
+		if (UT_LIST_GET_LEN(withdraw) < withdraw_target) {
+			/* Flush at least srv_LRU_scan_depth pages, or as
+			many as are still missing from the target */
+			ulint n_flushed = buf_flush_LRU(
+				std::max<ulint>(withdraw_target
+						- UT_LIST_GET_LEN(withdraw),
+						srv_LRU_scan_depth));
+			buf_flush_wait_batch_end_acquiring_mutex(true);
+
+			if (n_flushed) {
+				MONITOR_INC_VALUE_CUMULATIVE(
+					MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
+					MONITOR_LRU_BATCH_FLUSH_COUNT,
+					MONITOR_LRU_BATCH_FLUSH_PAGES,
+					n_flushed);
+			}
+		}
+
+		/* relocate blocks/buddies in withdrawn area */
+		ulint	count2 = 0;
+
+		mysql_mutex_lock(&mutex);
+		buf_page_t*	bpage;
+		bpage = UT_LIST_GET_FIRST(LRU);
+		while (bpage != NULL) {
+			/* Fetch the successor first; realloc() below may
+			replace bpage in the LRU list */
+			buf_page_t* next_bpage = UT_LIST_GET_NEXT(LRU, bpage);
+			/* First the compressed page, if it resides in
+			the area to be withdrawn */
+			if (bpage->zip.data != NULL
+			    && will_be_withdrawn(bpage->zip.data)
+			    && bpage->can_relocate()) {
+				buf_pool_mutex_exit_forbid();
+				if (!buf_buddy_realloc(
+					    bpage->zip.data,
+					    page_zip_get_size(&bpage->zip))) {
+					/* failed to allocate block */
+					buf_pool_mutex_exit_allow();
+					break;
+				}
+				buf_pool_mutex_exit_allow();
+				count2++;
+			}
+
+			/* Then the block itself, if it resides in
+			the area to be withdrawn */
+			if (bpage->state() == BUF_BLOCK_FILE_PAGE
+			    && will_be_withdrawn(*bpage)) {
+				if (bpage->can_relocate()) {
+					buf_pool_mutex_exit_forbid();
+					if (!realloc(
+						reinterpret_cast<buf_block_t*>(
+							bpage))) {
+						/* failed to allocate block */
+						buf_pool_mutex_exit_allow();
+						break;
+					}
+					buf_pool_mutex_exit_allow();
+					count2++;
+				}
+				/* NOTE: if the page is in use,
+				not relocated yet */
+			}
+
+			bpage = next_bpage;
+		}
+		mysql_mutex_unlock(&mutex);
+
+		/* Report the progress of this round */
+		buf_resize_status(
+			"withdrawing blocks. (" ULINTPF "/" ULINTPF ")",
+			UT_LIST_GET_LEN(withdraw),
+			withdraw_target);
+
+		ib::info() << "withdrew "
+			<< count1 << " blocks from free list."
+			<< " Tried to relocate " << count2 << " pages ("
+			<< UT_LIST_GET_LEN(withdraw) << "/"
+			<< withdraw_target << ")";
+
+		if (++loop_count >= 10) {
+			/* give up for now.
+			retried after user threads paused. */
+
+			ib::info() << "will retry to withdraw later";
+
+			/* need retry later */
+			return(true);
+		}
+	}
+
+	/* confirm withdrawn enough */
+	for (const chunk_t* chunk = chunks + n_chunks_new,
+	     * const echunk = chunks + n_chunks; chunk != echunk; chunk++) {
+		block = chunk->blocks;
+		for (ulint j = chunk->size; j--; block++) {
+			ut_a(block->page.state() == BUF_BLOCK_NOT_USED);
+			ut_ad(block->in_withdraw_list);
+		}
+	}
+
+	ib::info() << "withdrawn target: " << UT_LIST_GET_LEN(withdraw)
+		   << " blocks";
+
+	return(false);
+}
+
+
+
+/** Acquire an exclusive lock on every page hash latch.
+The array elements at index pad(n_cells) & ~ELEMENTS_PER_LATCH and at
+every ELEMENTS_PER_LATCH + 1 elements below it, down to element 0, are
+accessed as page_hash_latch objects, in descending order. */
+inline void buf_pool_t::page_hash_table::write_lock_all()
+{
+  auto n= pad(n_cells) & ~ELEMENTS_PER_LATCH;
+
+  for (;;)
+  {
+    reinterpret_cast<page_hash_latch&>(array[n]).write_lock();
+    if (n == 0)
+      return;
+    n-= ELEMENTS_PER_LATCH + 1;
+  }
+}
+
+
+/** Release the exclusive lock on every page hash latch.
+The array elements at index pad(n_cells) & ~ELEMENTS_PER_LATCH and at
+every ELEMENTS_PER_LATCH + 1 elements below it, down to element 0, are
+accessed as page_hash_latch objects, in descending order. */
+inline void buf_pool_t::page_hash_table::write_unlock_all()
+{
+  auto n= pad(n_cells) & ~ELEMENTS_PER_LATCH;
+
+  for (;;)
+  {
+    reinterpret_cast<page_hash_latch&>(array[n]).write_unlock();
+    if (n == 0)
+      return;
+    n-= ELEMENTS_PER_LATCH + 1;
+  }
+}
+
+
+namespace
+{
+
+/** Callback for iterating over transactions: prints every transaction
+that has been started, is associated with a MySQL thread and has a
+start_time earlier than withdraw_started. A warning header is printed
+once, before the first such transaction. */
+struct find_interesting_trx
+{
+  void operator()(const trx_t &trx)
+  {
+    /* Skip transactions that cannot be of interest. */
+    if (trx.state == TRX_STATE_NOT_STARTED ||
+        trx.mysql_thd == nullptr ||
+        withdraw_started <= trx.start_time)
+      return;
+
+    if (!found)
+    {
+      found= true;
+      ib::warn() << "The following trx might hold the blocks in buffer pool to "
+                    "be withdrawn. Buffer pool resizing can complete only "
+                    "after all the transactions below release the blocks.";
+    }
+
+    lock_trx_print_wait_and_mvcc_state(stderr, &trx, current_time);
+  }
+
+  /** set to true once the warning header has been printed */
+  bool &found;
+  /** only transactions with an earlier start_time are reported */
+  time_t withdraw_started;
+  /** time stamp passed on to lock_trx_print_wait_and_mvcc_state() */
+  time_t current_time;
+};
+
+} // namespace
+
+/** Resize from srv_buf_pool_old_size to srv_buf_pool_size. */
+inline void buf_pool_t::resize()
+{
+  ut_ad(this == &buf_pool);
+
+	bool		warning = false;
+
+	NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE;
+
+	ut_ad(!resize_in_progress());
+	ut_ad(srv_buf_pool_chunk_unit > 0);
+
+	ulint new_instance_size = srv_buf_pool_size >> srv_page_size_shift;
+
+	buf_resize_status("Resizing buffer pool from " ULINTPF " to "
+			  ULINTPF " (unit=" ULINTPF ").",
+			  srv_buf_pool_old_size, srv_buf_pool_size,
+			  srv_buf_pool_chunk_unit);
+
+	mysql_mutex_lock(&mutex);
+	ut_ad(curr_size == old_size);
+	ut_ad(n_chunks_new == n_chunks);
+	ut_ad(UT_LIST_GET_LEN(withdraw) == 0);
+
+	n_chunks_new = (new_instance_size << srv_page_size_shift)
+		/ srv_buf_pool_chunk_unit;
+	curr_size = n_chunks_new * chunks->size;
+	mysql_mutex_unlock(&mutex);
+
+#ifdef BTR_CUR_HASH_ADAPT
+	/* disable AHI if needed */
+	const bool btr_search_disabled = btr_search_enabled;
+
+	buf_resize_status("Disabling adaptive hash index.");
+
+	btr_search_s_lock_all();
+	if (btr_search_disabled) {
+		btr_search_s_unlock_all();
+	} else {
+		btr_search_s_unlock_all();
+	}
+
+	btr_search_disable();
+
+	if (btr_search_disabled) {
+		ib::info() << "disabled adaptive hash index.";
+	}
+#endif /* BTR_CUR_HASH_ADAPT */
+
+	if (curr_size < old_size) {
+		/* set withdraw target */
+		size_t w = 0;
+
+		for (const chunk_t* chunk = chunks + n_chunks_new,
+		     * const echunk = chunks + n_chunks;
+		     chunk != echunk; chunk++)
+			w += chunk->size;
+
+		ut_ad(withdraw_target == 0);
+		withdraw_target = w;
+	}
+
+	buf_resize_status("Withdrawing blocks to be shrunken.");
+
+	time_t		withdraw_started = time(NULL);
+	double		message_interval = 60;
+	ulint		retry_interval = 1;
+
+withdraw_retry:
+	/* wait for the number of blocks fit to the new size (if needed)*/
+	bool	should_retry_withdraw = curr_size < old_size
+		&& withdraw_blocks();
+
+	if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
+		/* abort to resize for shutdown. */
+		return;
+	}
+
+	/* abort buffer pool load */
+	buf_load_abort();
+
+	const time_t current_time = time(NULL);
+
+	if (should_retry_withdraw
+	    && difftime(current_time, withdraw_started) >= message_interval) {
+
+		if (message_interval > 900) {
+			message_interval = 1800;
+		} else {
+			message_interval *= 2;
+		}
+
+		lock_mutex_enter();
+		bool	found = false;
+		trx_sys.trx_list.for_each(find_interesting_trx{
+			found, withdraw_started, current_time});
+		lock_mutex_exit();
+
+		withdraw_started = current_time;
+	}
+
+	if (should_retry_withdraw) {
+		ib::info() << "Will retry to withdraw " << retry_interval
+			<< " seconds later.";
+		os_thread_sleep(retry_interval * 1000000);
+
+		if (retry_interval > 5) {
+			retry_interval = 10;
+		} else {
+			retry_interval *= 2;
+		}
+
+		goto withdraw_retry;
+	}
+
+	buf_resize_status("Latching whole of buffer pool.");
+
+#ifndef DBUG_OFF
+	{
+		bool	should_wait = true;
+
+		while (should_wait) {
+			should_wait = false;
+			DBUG_EXECUTE_IF(
+				"ib_buf_pool_resize_wait_before_resize",
+				should_wait = true; os_thread_sleep(10000););
+		}
+	}
+#endif /* !DBUG_OFF */
+
+	if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
+		return;
+	}
+
+	/* Indicate critical path */
+	resizing.store(true, std::memory_order_relaxed);
+
+  mysql_mutex_lock(&mutex);
+  page_hash.write_lock_all();
+
+	chunk_t::map_reg = UT_NEW_NOKEY(chunk_t::map());
+
+	/* add/delete chunks */
+
+	buf_resize_status("buffer pool resizing with chunks "
+			  ULINTPF " to " ULINTPF ".",
+			  n_chunks, n_chunks_new);
+
+	if (n_chunks_new < n_chunks) {
+		/* delete chunks */
+		chunk_t* chunk = chunks + n_chunks_new;
+		const chunk_t* const echunk = chunks + n_chunks;
+
+		ulint	sum_freed = 0;
+
+		while (chunk < echunk) {
+			/* buf_LRU_block_free_non_file_page() invokes
+			MEM_NOACCESS() on any buf_pool.free blocks.
+			We must cancel the effect of that. In
+			MemorySanitizer, MEM_NOACCESS() is no-op, so
+			we must not do anything special for it here. */
+#ifdef HAVE_valgrind
+# if !__has_feature(memory_sanitizer)
+			MEM_MAKE_DEFINED(chunk->mem, chunk->mem_size());
+# endif
+#else
+			MEM_MAKE_ADDRESSABLE(chunk->mem, chunk->size);
+#endif
+
+			buf_block_t*	block = chunk->blocks;
+
+			for (ulint j = chunk->size; j--; block++) {
+				buf_block_free_mutexes(block);
+			}
+
+			allocator.deallocate_large_dodump(
+				chunk->mem, &chunk->mem_pfx);
+			sum_freed += chunk->size;
+			++chunk;
+		}
+
+		/* discard withdraw list */
+		UT_LIST_INIT(withdraw, &buf_page_t::list);
+		withdraw_target = 0;
+
+		ib::info() << n_chunks - n_chunks_new
+			   << " chunks (" << sum_freed
+			   << " blocks) were freed.";
+
+		n_chunks = n_chunks_new;
+	}
+
+	{
+		/* reallocate chunks */
+		const size_t	new_chunks_size
+			= n_chunks_new * sizeof(chunk_t);
+
+		chunk_t*	new_chunks = static_cast<chunk_t*>(
+			ut_zalloc_nokey_nofatal(new_chunks_size));
+
+		DBUG_EXECUTE_IF("buf_pool_resize_chunk_null",
+				ut_free(new_chunks); new_chunks= nullptr; );
+
+		if (!new_chunks) {
+			ib::error() << "failed to allocate"
+				" the chunk array.";
+			n_chunks_new = n_chunks;
+			warning = true;
+			chunks_old = NULL;
+			goto calc_buf_pool_size;
+		}
+
+		ulint	n_chunks_copy = ut_min(n_chunks_new,
+					       n_chunks);
+
+		memcpy(new_chunks, chunks,
+		       n_chunks_copy * sizeof *new_chunks);
+
+		for (ulint j = 0; j < n_chunks_copy; j++) {
+			new_chunks[j].reg();
+		}
+
+		chunks_old = chunks;
+		chunks = new_chunks;
+	}
+
+	if (n_chunks_new > n_chunks) {
+		/* add chunks */
+		ulint	sum_added = 0;
+		ulint	n = n_chunks;
+		const size_t unit = srv_buf_pool_chunk_unit;
+
+		for (chunk_t* chunk = chunks + n_chunks,
+		     * const echunk = chunks + n_chunks_new;
+		     chunk != echunk; chunk++) {
+			if (!chunk->create(unit)) {
+				ib::error() << "failed to allocate"
+					" memory for buffer pool chunk";
+
+				warning = true;
+				n_chunks_new = n_chunks;
+				break;
+			}
+
+			sum_added += chunk->size;
+			++n;
+		}
+
+		ib::info() << n_chunks_new - n_chunks
+			   << " chunks (" << sum_added
+			   << " blocks) were added.";
+
+		n_chunks = n;
+	}
+calc_buf_pool_size:
+	/* recalc curr_size */
+	ulint	new_size = 0;
+
+	{
+		chunk_t* chunk = chunks;
+		const chunk_t* const echunk = chunk + n_chunks;
+		do {
+			new_size += chunk->size;
+		} while (++chunk != echunk);
+	}
+
+	curr_size = new_size;
+	n_chunks_new = n_chunks;
+
+	if (chunks_old) {
+		ut_free(chunks_old);
+		chunks_old = NULL;
+	}
+
+	chunk_t::map* chunk_map_old = chunk_t::map_ref;
+	chunk_t::map_ref = chunk_t::map_reg;
+
+	/* set size */
+	ut_ad(UT_LIST_GET_LEN(withdraw) == 0);
+  ulint s= curr_size;
+  old_size= s;
+  s/= BUF_READ_AHEAD_PORTION;
+  read_ahead_area= s >= READ_AHEAD_PAGES
+    ? READ_AHEAD_PAGES
+    : my_round_up_to_next_power(static_cast<uint32_t>(s));
+  curr_pool_size= n_chunks * srv_buf_pool_chunk_unit;
+  srv_buf_pool_curr_size= curr_pool_size;/* FIXME: remove*/
+  innodb_set_buf_pool_size(buf_pool_size_align(srv_buf_pool_curr_size));
+
+	const bool	new_size_too_diff
+		= srv_buf_pool_base_size > srv_buf_pool_size * 2
+			|| srv_buf_pool_base_size * 2 < srv_buf_pool_size;
+
+  mysql_mutex_unlock(&mutex);
+  page_hash.write_unlock_all();
+
+	UT_DELETE(chunk_map_old);
+
+	resizing.store(false, std::memory_order_relaxed);
+
+	/* Normalize other components, if the new size is too different */
+	if (!warning && new_size_too_diff) {
+		srv_buf_pool_base_size = srv_buf_pool_size;
+
+		buf_resize_status("Resizing also other hash tables.");
+
+		srv_lock_table_size = 5
+			* (srv_buf_pool_size >> srv_page_size_shift);
+		lock_sys.resize(srv_lock_table_size);
+		dict_sys.resize();
+
+		ib::info() << "Resized hash tables at lock_sys,"
+#ifdef BTR_CUR_HASH_ADAPT
+			" adaptive hash index,"
+#endif /* BTR_CUR_HASH_ADAPT */
+			" dictionary.";
+	}
+
+	/* normalize ibuf.max_size */
+	ibuf_max_size_update(srv_change_buffer_max_size);
+
+	if (srv_buf_pool_old_size != srv_buf_pool_size) {
+
+		ib::info() << "Completed to resize buffer pool from "
+			<< srv_buf_pool_old_size
+			<< " to " << srv_buf_pool_size << ".";
+		srv_buf_pool_old_size = srv_buf_pool_size;
+	}
+
+#ifdef BTR_CUR_HASH_ADAPT
+	/* enable AHI if needed */
+	if (btr_search_disabled) {
+		btr_search_enable(true);
+		ib::info() << "Re-enabled adaptive hash index.";
+	}
+#endif /* BTR_CUR_HASH_ADAPT */
+
+	char	now[32];
+
+	ut_sprintf_timestamp(now);
+	if (!warning) {
+		buf_resize_status("Completed resizing buffer pool at %s.",
+			now);
+	} else {
+		buf_resize_status("Resizing buffer pool failed,"
+			" finished resizing at %s.", now);
+	}
+
+	ut_d(validate());
+
+	return;
+}
+
+/** Thread pool task invoked by innodb_buffer_pool_size changes.
+Resizes the buffer pool when srv_buf_pool_size differs from
+srv_buf_pool_old_size; otherwise only reports that the size is unchanged. */
+static void buf_resize_callback(void *)
+{
+  DBUG_ENTER("buf_resize_callback");
+  ut_ad(srv_shutdown_state < SRV_SHUTDOWN_CLEANUP);
+
+  /* Sample both size variables while holding buf_pool.mutex. */
+  mysql_mutex_lock(&buf_pool.mutex);
+  const auto requested= srv_buf_pool_size;
+  const bool unchanged= srv_buf_pool_old_size == requested;
+  mysql_mutex_unlock(&buf_pool.mutex);
+
+  if (unchanged)
+  {
+    std::ostringstream msg;
+    msg << "Size did not change: old size = new size = " << requested;
+    buf_resize_status(msg.str().c_str());
+  }
+  else
+    buf_pool.resize();
+
+  DBUG_VOID_RETURN;
+}
+
+/* Ensure that the resize task does not run in parallel with itself,
+by setting max_concurrency to 1 for its task group. */
+static tpool::task_group single_threaded_group(1);
+/* The task that executes buf_resize_callback() within
+single_threaded_group; no argument is passed to the callback. */
+static tpool::waitable_task buf_resize_task(buf_resize_callback,
+	nullptr, &single_threaded_group);
+
+/** Submit buf_resize_task to srv_thread_pool. */
+void buf_resize_start()
+{
+	srv_thread_pool->submit_task(&buf_resize_task);
+}
+
+/** Wait on buf_resize_task.
+NOTE(review): presumably this blocks until any submitted resize task has
+finished executing; confirm against tpool::waitable_task::wait(). */
+void buf_resize_shutdown()
+{
+	buf_resize_task.wait();
+}
+
+
+/** Relocate a ROW_FORMAT=COMPRESSED block in the LRU list and
+buf_pool.page_hash.
+The caller must relocate bpage->list.
+The caller must hold buf_pool.mutex and the exclusive page_hash latch
+that covers bpage->id(), and bpage must be neither I/O-fixed nor
+buffer-fixed (all of this is asserted below).
+@param bpage   BUF_BLOCK_ZIP_PAGE block
+@param dpage   destination control block */
+static void buf_relocate(buf_page_t *bpage, buf_page_t *dpage)
+{
+  const ulint fold= bpage->id().fold();
+  ut_ad(bpage->state() == BUF_BLOCK_ZIP_PAGE);
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+  ut_ad(buf_pool.hash_lock_get(bpage->id())->is_write_locked());
+  ut_a(bpage->io_fix() == BUF_IO_NONE);
+  ut_a(!bpage->buf_fix_count());
+  ut_ad(bpage == buf_pool.page_hash_get_low(bpage->id(), fold));
+  ut_ad(!buf_pool.watch_is_sentinel(*bpage));
+  ut_ad(bpage->state() == BUF_BLOCK_ZIP_PAGE);
+
+  /* Copy-construct the descriptor in place at the destination. */
+  new (dpage) buf_page_t(*bpage);
+
+  /* Important that we adjust the hazard pointer before
+  removing bpage from LRU list. */
+  /* NOTE(review): LRU_remove() presumably returns the LRU predecessor of
+  bpage (or nullptr if bpage was first), so that dpage takes over the
+  position of bpage; confirm against buf_pool_t::LRU_remove(). */
+  if (buf_page_t *b= buf_pool.LRU_remove(bpage))
+    UT_LIST_INSERT_AFTER(buf_pool.LRU, b, dpage);
+  else
+    UT_LIST_ADD_FIRST(buf_pool.LRU, dpage);
+
+  /* Note: the UNIV_LRU_DEBUG section below straddles the closing brace
+  of this if statement; without UNIV_LRU_DEBUG there is no else branch. */
+  if (UNIV_UNLIKELY(buf_pool.LRU_old == bpage))
+  {
+    buf_pool.LRU_old= dpage;
+#ifdef UNIV_LRU_DEBUG
+    /* buf_pool.LRU_old must be the first item in the LRU list
+    whose "old" flag is set. */
+    ut_a(buf_pool.LRU_old->old);
+    ut_a(!UT_LIST_GET_PREV(LRU, buf_pool.LRU_old) ||
+         !UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)->old);
+    ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old) ||
+         UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)->old);
+  }
+  else
+  {
+    /* Check that the "old" flag is consistent in
+    the block and its neighbours. */
+    dpage->set_old(dpage->is_old());
+#endif /* UNIV_LRU_DEBUG */
+  }
+
+  ut_d(CheckInLRUList::validate());
+
+  /* relocate buf_pool.page_hash */
+  ut_ad(bpage->in_page_hash);
+  ut_ad(dpage->in_page_hash);
+  ut_d(bpage->in_page_hash= false);
+  /* Replace bpage by dpage in the hash chain selected by fold. */
+  HASH_REPLACE(buf_page_t, hash, &buf_pool.page_hash, fold, bpage, dpage);
+}
+
+/** Register a watch for a page identifier. The caller must hold an
+exclusive page hash latch. The *hash_lock may be released,
+relocated, and reacquired; on every return path it is held in
+exclusive mode.
+@param id         page identifier
+@param hash_lock  exclusively held page_hash latch
+@return a buffer pool block corresponding to id
+@retval nullptr   if the block was not present, and a watch was installed
+                  (or an already installed watch was buffer-fixed once more) */
+inline buf_page_t *buf_pool_t::watch_set(const page_id_t id,
+                                         page_hash_latch **hash_lock)
+{
+  const ulint fold= id.fold();
+  ut_ad(*hash_lock == page_hash.lock_get(fold));
+  ut_ad((*hash_lock)->is_write_locked());
+
+retry:
+  if (buf_page_t *bpage= page_hash_get_low(id, fold))
+  {
+    if (!watch_is_sentinel(*bpage))
+      /* The page was loaded meanwhile. */
+      return bpage;
+    /* Add to an existing watch. */
+    bpage->fix();
+    return nullptr;
+  }
+
+  (*hash_lock)->write_unlock();
+  /* Allocate a watch[] and then try to insert it into the page_hash. */
+  mysql_mutex_lock(&mutex);
+
+  /* The maximum number of purge tasks should never exceed
+  the UT_ARR_SIZE(watch) - 1, and there is no way for a purge task to hold a
+  watch when setting another watch. */
+  /* Scan watch[] from its last element down to watch[0]. The
+  post-decremented pointer must be compared with '>' (not '>='), so that
+  the loop body never executes with w pointing before watch[0]. */
+  for (buf_page_t *w= &watch[UT_ARR_SIZE(watch)]; w-- > watch; )
+  {
+    ut_ad(w->access_time == 0);
+    ut_ad(!w->oldest_modification());
+    ut_ad(!w->zip.data);
+    ut_ad(!w->in_zip_hash);
+    if (w->state() == BUF_BLOCK_ZIP_PAGE)
+      /* This watch may be in use for some other page. */
+      continue;
+    ut_ad(w->state() == BUF_BLOCK_NOT_USED);
+    ut_ad(!w->buf_fix_count());
+    /* w is pointing to watch[], which is protected by mutex.
+    Normally, buf_page_t::id for objects that are reachable by
+    page_hash_get_low(id, fold) are protected by hash_lock. */
+    w->set_state(BUF_BLOCK_ZIP_PAGE);
+    w->id_= id;
+
+    *hash_lock= page_hash.lock_get(fold);
+    (*hash_lock)->write_lock();
+    mysql_mutex_unlock(&mutex);
+
+    /* The page_hash latch was released above; check whether an entry
+    for id appeared in the meantime. */
+    buf_page_t *bpage= page_hash_get_low(id, fold);
+    if (UNIV_LIKELY_NULL(bpage))
+    {
+      /* Give the reserved watch back (under mutex) and start over
+      with the entry that is now present. */
+      (*hash_lock)->write_unlock();
+      mysql_mutex_lock(&mutex);
+      w->set_state(BUF_BLOCK_NOT_USED);
+      *hash_lock= page_hash.lock_get(fold);
+      (*hash_lock)->write_lock();
+      mysql_mutex_unlock(&mutex);
+      goto retry;
+    }
+
+    ut_ad(!w->buf_fix_count_);
+    w->buf_fix_count_= 1;
+    ut_ad(!w->in_page_hash);
+    ut_d(w->in_page_hash= true); /* Not holding buf_pool.mutex here! */
+    HASH_INSERT(buf_page_t, hash, &page_hash, fold, w);
+    return nullptr;
+  }
+
+  /* No unused watch[] element was found; this is treated as fatal. */
+  ut_error;
+  mysql_mutex_unlock(&mutex);
+  return nullptr;
+}
+
+/** Mark the page status as FREED for the given tablespace id and
+page number. Nothing is marked unless the page is present in the buffer
+pool in the BUF_BLOCK_FILE_PAGE state. On success the block is
+buffer-fixed, registered in the mini-transaction as MTR_MEMO_PAGE_X_FIX
+and X-latched.
+@param[in,out]	space	tablespace
+@param[in]	page	page number
+@param[in,out]	mtr	mini-transaction
+@param[in]	file	file name
+@param[in]	line	line where called */
+void buf_page_free(fil_space_t *space, uint32_t page, mtr_t *mtr,
+                   const char *file, unsigned line)
+{
+  ut_ad(mtr);
+  ut_ad(mtr->is_active());
+
+  if (srv_immediate_scrub_data_uncompressed
+#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
+      || space->is_compressed()
+#endif
+      )
+    mtr->add_freed_offset(space, page);
+
+  buf_pool.stat.n_page_gets++;
+  const page_id_t id(space->id, page);
+  const ulint id_fold= id.fold();
+  page_hash_latch *latch= buf_pool.page_hash.lock<false>(id_fold);
+  buf_block_t *block= reinterpret_cast<buf_block_t*>
+    (buf_pool.page_hash_get_low(id, id_fold));
+
+  if (!block || block->page.state() != BUF_BLOCK_FILE_PAGE)
+  {
+    /* Either the page is not in the buffer pool, or it is not an
+    uncompressed file page.
+    FIXME: convert, but avoid buf_zip_decompress() */
+    latch->read_unlock();
+    return;
+  }
+
+  /* Buffer-fix the block before the page_hash latch is released. */
+  buf_block_buf_fix_inc(block, file, line);
+  ut_ad(block->page.buf_fix_count());
+  latch->read_unlock();
+
+  mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX);
+  rw_lock_x_lock_inline(&block->lock, 0, file, line);
+  buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+  block->page.status= buf_page_t::FREED;
+}
+
+/** Get read access to a compressed page (usually of type
+FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2).
+The page must be released with buf_page_release_zip().
+NOTE: the page is not protected by any latch.  Mutual exclusion has to
+be implemented at a higher level.  In other words, all possible
+accesses to a given page through this function must be protected by
+the same set of mutexes or latches.
+@param[in]	page_id		page id
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size
+@return pointer to the block
+@retval nullptr if reading the page failed or the block has no
+compressed page frame */
+buf_page_t* buf_page_get_zip(const page_id_t page_id, ulint zip_size)
+{
+  ut_ad(zip_size);
+  ut_ad(ut_is_2pow(zip_size));
+  buf_pool.stat.n_page_gets++;
+
+  /* Set once we have tried to evict the uncompressed frame of the page,
+  so that the eviction is attempted at most once. */
+  bool discard_attempted= false;
+  const ulint fold= page_id.fold();
+  buf_page_t *bpage;
+  page_hash_latch *hash_lock;
+
+  for (;;)
+  {
+lookup:
+    bpage= buf_pool.page_hash_get_locked<false>(page_id, fold, &hash_lock);
+    if (bpage)
+      break;
+
+    dberr_t err= buf_read_page(page_id, zip_size);
+
+    if (UNIV_UNLIKELY(err != DB_SUCCESS))
+    {
+      ib::error() << "Reading compressed page " << page_id
+                  << " failed with error: " << err;
+      /* The lookup found no block. NOTE(review): hash_lock is presumed
+      not to be held after a failed lookup (each iteration acquires it
+      afresh and buf_read_page() runs in between), so we must not
+      go through err_exit, which would release a latch we do not hold. */
+      return nullptr;
+    }
+
+#ifdef UNIV_DEBUG
+    if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+  }
+
+  ut_ad(hash_lock->is_read_locked());
+
+  if (!bpage->zip.data)
+  {
+    /* There is no compressed page. */
+err_exit:
+    /* Only reached while hash_lock is held in shared mode. */
+    hash_lock->read_unlock();
+    return nullptr;
+  }
+
+  ut_ad(!buf_pool.watch_is_sentinel(*bpage));
+
+  switch (bpage->state()) {
+  case BUF_BLOCK_ZIP_PAGE:
+    bpage->fix();
+    goto got_block;
+  case BUF_BLOCK_FILE_PAGE:
+    /* Discard the uncompressed page frame if possible. */
+    if (!discard_attempted)
+    {
+      discard_attempted= true;
+      hash_lock->read_unlock();
+      mysql_mutex_lock(&buf_pool.mutex);
+      /* Look the page up again, because the page_hash latch was released. */
+      if (buf_page_t *bpage= buf_pool.page_hash_get_low(page_id, fold))
+        buf_LRU_free_page(bpage, false);
+      mysql_mutex_unlock(&buf_pool.mutex);
+      goto lookup;
+    }
+
+    buf_block_buf_fix_inc(reinterpret_cast<buf_block_t*>(bpage),
+                          __FILE__, __LINE__);
+    goto got_block;
+  default:
+    break;
+  }
+
+  ut_error;
+  goto err_exit;
+
+got_block:
+  /* Sample the I/O fix state while still holding the page_hash latch. */
+  bool must_read= bpage->io_fix() == BUF_IO_READ;
+  hash_lock->read_unlock();
+
+  DBUG_ASSERT(bpage->status != buf_page_t::FREED);
+
+  bpage->set_accessed();
+  buf_page_make_young_if_needed(bpage);
+
+#ifdef UNIV_DEBUG
+  if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+  ut_ad(bpage->buf_fix_count());
+  ut_ad(bpage->in_file());
+
+  if (must_read)
+    /* Let us wait until the read operation completes */
+    while (bpage->io_fix() == BUF_IO_READ)
+      os_thread_sleep(WAIT_FOR_READ);
+
+  return bpage;
+}
+
+/** Initialize some fields of a control block.
+With BTR_CUR_HASH_ADAPT, this resets the adaptive hash index related
+fields of the block; without it, the function does nothing.
+@param[in,out]	block	block to init */
+UNIV_INLINE
+void
+buf_block_init_low(
+	buf_block_t*	block)
+{
+#ifdef BTR_CUR_HASH_ADAPT
+	/* No adaptive hash index entries may point to a previously
+	unused (and now freshly allocated) block. */
+	assert_block_ahi_empty_on_init(block);
+	block->index		= NULL;
+
+	/* Initial values of the adaptive hash index fields. */
+	block->n_hash_helps	= 0;
+	block->n_fields		= 1;
+	block->n_bytes		= 0;
+	block->left_side	= TRUE;
+#endif /* BTR_CUR_HASH_ADAPT */
+}
+
+/** Decompress a block: unpack block->page.zip into block->frame.
+Index pages (FIL_PAGE_INDEX, FIL_PAGE_RTREE) go through
+page_zip_decompress(); the other recognized page types are copied
+verbatim. On failure, the tablespace (if it can be found) is flagged as
+encrypted or corrupted.
+@param[in,out]	block	block whose compressed page is unpacked
+@param[in]	check	TRUE=verify the page checksum
+@return TRUE if successful */
+ibool
+buf_zip_decompress(
+	buf_block_t*	block,
+	ibool		check)
+{
+	const byte* const	zip_frame = block->page.zip.data;
+	const ulint		zip_len = page_zip_get_size(&block->page.zip);
+	/* The tablespace will not be found if this function is called
+	during IMPORT. */
+	fil_space_t* const	space = fil_space_t::get(
+		block->page.id().space());
+	const unsigned		key_version = mach_read_from_4(
+		zip_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
+	fil_space_crypt_t* const crypt_data = space
+		? space->crypt_data : NULL;
+	const bool		encrypted = crypt_data
+		&& crypt_data->type != CRYPT_SCHEME_UNENCRYPTED
+		&& (!crypt_data->is_default_encryption()
+		    || srv_encrypt_tables);
+
+	ut_ad(block->zip_size());
+	ut_a(block->page.id().space() != 0);
+
+	/* Becomes true once block->frame holds the unpacked page. */
+	bool	unpacked = false;
+
+	if (UNIV_UNLIKELY(check
+			  && !page_zip_verify_checksum(zip_frame, zip_len))) {
+		ib::error() << "Compressed page checksum mismatch for "
+			<< (space ? space->chain.start->name : "")
+			<< block->page.id() << ": stored: "
+			<< mach_read_from_4(zip_frame
+					    + FIL_PAGE_SPACE_OR_CHKSUM)
+			<< ", crc32: "
+			<< page_zip_calc_checksum(
+				zip_frame, zip_len,
+				SRV_CHECKSUM_ALGORITHM_CRC32)
+			<< " innodb: "
+			<< page_zip_calc_checksum(
+				zip_frame, zip_len,
+				SRV_CHECKSUM_ALGORITHM_INNODB)
+			<< ", none: "
+			<< page_zip_calc_checksum(
+				zip_frame, zip_len,
+				SRV_CHECKSUM_ALGORITHM_NONE)
+			<< " (algorithm: " << srv_checksum_algorithm << ")";
+	} else {
+		switch (fil_page_get_type(zip_frame)) {
+		case FIL_PAGE_INDEX:
+		case FIL_PAGE_RTREE:
+			if (page_zip_decompress(&block->page.zip,
+						block->frame, TRUE)) {
+				unpacked = true;
+			} else {
+				ib::error() << "Unable to decompress "
+					<< (space
+					    ? space->chain.start->name : "")
+					<< block->page.id();
+			}
+			break;
+		case FIL_PAGE_TYPE_ALLOCATED:
+		case FIL_PAGE_INODE:
+		case FIL_PAGE_IBUF_BITMAP:
+		case FIL_PAGE_TYPE_FSP_HDR:
+		case FIL_PAGE_TYPE_XDES:
+		case FIL_PAGE_TYPE_ZBLOB:
+		case FIL_PAGE_TYPE_ZBLOB2:
+			/* Copy to uncompressed storage. */
+			memcpy(block->frame, zip_frame, block->zip_size());
+			unpacked = true;
+			break;
+		default:
+			ib::error() << "Unknown compressed page type "
+				<< fil_page_get_type(zip_frame)
+				<< " in "
+				<< (space ? space->chain.start->name : "")
+				<< block->page.id();
+		}
+	}
+
+	if (unpacked) {
+		if (space) {
+			space->release();
+		}
+
+		return(TRUE);
+	}
+
+	/* Failure: report a possible encryption cause and flag the
+	tablespace before dropping our reference to it. */
+	if (encrypted) {
+		ib::info() << "Row compressed page could be encrypted"
+			" with key_version " << key_version;
+	}
+
+	if (space) {
+		if (encrypted) {
+			dict_set_encrypted_by_space(space);
+		} else {
+			dict_set_corrupted_by_space(space);
+		}
+
+		space->release();
+	}
+
+	return(FALSE);
+}
+
+/** Wait for the block to be read in.
+Returns once the block is no longer I/O-fixed for reading.
+@param[in]	block	The block to check */
+static
+void
+buf_wait_for_read(
+	buf_block_t*	block)
+{
+	/* Note:
+
+	The IO_READ state is set under the protection of the hash_lock,
+	and another thread can only access the block (and check its IO
+	state) after the block has been added to the page hash table.
+	That makes it safe to use block->lock for waiting: each round
+	acquires and immediately releases a shared latch on the block,
+	and then the IO state is checked again. */
+
+	for (;;) {
+		if (block->page.io_fix() != BUF_IO_READ) {
+			return;
+		}
+
+		rw_lock_s_lock(&block->lock);
+		rw_lock_s_unlock(&block->lock);
+	}
+}
+
+#ifdef BTR_CUR_HASH_ADAPT
+/** If a stale adaptive hash index exists on the block, drop it.
+Multiple executions of btr_search_drop_page_hash_index() on the
+same block must be prevented by exclusive page latch.
+@param block     block that the caller has fixed according to fix_type
+@param fix_type  how the caller has fixed or latched the block */
+ATTRIBUTE_COLD
+static void buf_defer_drop_ahi(buf_block_t *block, mtr_memo_type_t fix_type)
+{
+  if (fix_type == MTR_MEMO_BUF_FIX)
+    /* We do not drop the adaptive hash index, because safely doing
+    so would require acquiring block->lock, and that is not safe
+    to acquire in some RW_NO_LATCH access paths. Those code paths
+    should have no business accessing the adaptive hash index anyway. */
+    return;
+
+  const bool s_latched= fix_type == MTR_MEMO_PAGE_S_FIX;
+
+  if (!s_latched && fix_type != MTR_MEMO_PAGE_SX_FIX)
+  {
+    /* The only remaining expected type is MTR_MEMO_PAGE_X_FIX:
+    drop directly, without any re-latching. */
+    ut_ad(fix_type == MTR_MEMO_PAGE_X_FIX);
+    btr_search_drop_page_hash_index(block);
+    return;
+  }
+
+  /* Temporarily release our S- or SX-latch and take an X-latch instead. */
+  if (s_latched)
+  {
+    rw_lock_s_unlock(&block->lock);
+  }
+  else
+  {
+    rw_lock_sx_unlock(&block->lock);
+  }
+
+  rw_lock_x_lock(&block->lock);
+  /* Re-read block->index under the X-latch before deciding. */
+  dict_index_t *stale= block->index;
+  if (stale && stale->freed())
+    btr_search_drop_page_hash_index(block);
+  rw_lock_x_unlock(&block->lock);
+
+  /* Restore the latch mode that the caller had. */
+  if (s_latched)
+  {
+    rw_lock_s_lock(&block->lock);
+  }
+  else
+  {
+    rw_lock_sx_lock(&block->lock);
+  }
+}
+#endif /* BTR_CUR_HASH_ADAPT */
+
+/** Lock the page with the given latch type and register the block
+in the mini-transaction memo.
+@param[in,out]	block		block to be locked
+@param[in]	rw_latch	RW_S_LATCH, RW_SX_LATCH, RW_X_LATCH,
+				or RW_NO_LATCH
+@param[in]	mtr		mini-transaction
+@param[in]	file		file name
+@param[in]	line		line where called
+@return pointer to locked block */
+static buf_block_t* buf_page_mtr_lock(buf_block_t *block,
+                                      ulint rw_latch,
+                                      mtr_t* mtr,
+                                      const char *file,
+                                      unsigned line)
+{
+  /* RW_NO_LATCH acquires nothing; the block is only registered
+  as buffer-fixed. */
+  mtr_memo_type_t memo_type= MTR_MEMO_BUF_FIX;
+
+  if (rw_latch != RW_NO_LATCH)
+  {
+    switch (rw_latch)
+    {
+    case RW_S_LATCH:
+      rw_lock_s_lock_inline(&block->lock, 0, file, line);
+      memo_type= MTR_MEMO_PAGE_S_FIX;
+      break;
+    case RW_SX_LATCH:
+      rw_lock_sx_lock_inline(&block->lock, 0, file, line);
+      memo_type= MTR_MEMO_PAGE_SX_FIX;
+      break;
+    default:
+      ut_ad(rw_latch == RW_X_LATCH);
+      rw_lock_x_lock_inline(&block->lock, 0, file, line);
+      memo_type= MTR_MEMO_PAGE_X_FIX;
+      break;
+    }
+
+#ifdef BTR_CUR_HASH_ADAPT
+    /* Now that the page latch is held, drop the adaptive hash index
+    of the block if its index has been freed. */
+    dict_index_t *ahi_index= block->index;
+    if (ahi_index && ahi_index->freed())
+      buf_defer_drop_ahi(block, memo_type);
+#endif /* BTR_CUR_HASH_ADAPT */
+  }
+
+  mtr_memo_push(mtr, block, memo_type);
+  return block;
+}
+
+/** Low level function used to get access to a database page.
+@param[in]	page_id			page id
+@param[in]	zip_size		ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	rw_latch		RW_S_LATCH, RW_SX_LATCH, RW_X_LATCH, RW_NO_LATCH
+@param[in]	guess			guessed block or NULL
+@param[in]	mode			BUF_GET, BUF_GET_IF_IN_POOL,
+BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or BUF_GET_IF_IN_POOL_OR_WATCH
+@param[in]	file			file name
+@param[in]	line			line where called
+@param[in]	mtr			mini-transaction
+@param[out]	err			DB_SUCCESS or error code
+@param[in]	allow_ibuf_merge	Allow change buffer merge to happen
+while reading the page from file; if set, buffered changes are merged
+to the page while it is being read in.
+@return pointer to the block or NULL */
+buf_block_t*
+buf_page_get_low(
+	const page_id_t		page_id,
+	ulint			zip_size,
+	ulint			rw_latch,
+	buf_block_t*		guess,
+	ulint			mode,
+	const char*		file,
+	unsigned		line,
+	mtr_t*			mtr,
+	dberr_t*		err,
+	bool			allow_ibuf_merge)
+{
+	buf_block_t*	block;
+	unsigned	access_time;
+	ulint		retries = 0;
+	const ulint	fold = page_id.fold();
+
+	ut_ad((mtr == NULL) == (mode == BUF_EVICT_IF_IN_POOL));
+	ut_ad(!mtr || mtr->is_active());
+	ut_ad((rw_latch == RW_S_LATCH)
+	      || (rw_latch == RW_X_LATCH)
+	      || (rw_latch == RW_SX_LATCH)
+	      || (rw_latch == RW_NO_LATCH));
+	ut_ad(!allow_ibuf_merge
+	      || mode == BUF_GET
+	      || mode == BUF_GET_POSSIBLY_FREED
+	      || mode == BUF_GET_IF_IN_POOL
+	      || mode == BUF_GET_IF_IN_POOL_OR_WATCH);
+
+	if (err) {
+		*err = DB_SUCCESS;
+	}
+
+#ifdef UNIV_DEBUG
+	switch (mode) {
+	case BUF_EVICT_IF_IN_POOL:
+		/* After DISCARD TABLESPACE, the tablespace would not exist,
+		but in IMPORT TABLESPACE, PageConverter::operator() must
+		replace any old pages, which were not evicted during DISCARD.
+		Skip the assertion on space_page_size. */
+		break;
+	case BUF_PEEK_IF_IN_POOL:
+	case BUF_GET_IF_IN_POOL:
+		/* The caller may pass a dummy page size,
+		because it does not really matter. */
+		break;
+	default:
+		ut_error;
+	case BUF_GET_POSSIBLY_FREED:
+		break;
+	case BUF_GET_NO_LATCH:
+		ut_ad(rw_latch == RW_NO_LATCH);
+		/* fall through */
+	case BUF_GET:
+	case BUF_GET_IF_IN_POOL_OR_WATCH:
+		fil_space_t* s = fil_space_get(page_id.space());
+		ut_ad(s);
+		ut_ad(s->zip_size() == zip_size);
+	}
+#endif /* UNIV_DEBUG */
+
+	ut_ad(!mtr || !ibuf_inside(mtr)
+	      || ibuf_page_low(page_id, zip_size, FALSE, file, line, NULL));
+
+	buf_pool.stat.n_page_gets++;
+loop:
+	buf_block_t* fix_block;
+	block = guess;
+
+	page_hash_latch* hash_lock = buf_pool.page_hash.lock<false>(fold);
+
+	if (block) {
+
+		/* If the guess is a compressed page descriptor that
+		has been allocated by buf_page_alloc_descriptor(),
+		it may have been freed by buf_relocate(). */
+
+		if (!buf_pool.is_uncompressed(block)
+		    || page_id != block->page.id()
+		    || block->page.state() != BUF_BLOCK_FILE_PAGE) {
+			/* Our guess was bogus or things have changed
+			since. */
+			guess = nullptr;
+			goto lookup;
+		} else {
+			ut_ad(!block->page.in_zip_hash);
+		}
+	} else {
+lookup:
+		block = reinterpret_cast<buf_block_t*>(
+			buf_pool.page_hash_get_low(page_id, fold));
+	}
+
+	if (!block || buf_pool.watch_is_sentinel(block->page)) {
+		hash_lock->read_unlock();
+		block = nullptr;
+	}
+
+	if (UNIV_UNLIKELY(!block)) {
+		/* Page not in buf_pool: needs to be read from file */
+		if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
+			hash_lock = buf_pool.page_hash.lock<true>(fold);
+
+			if (buf_page_t *bpage= buf_pool.watch_set(
+				    page_id, &hash_lock)) {
+				/* We can release hash_lock after we
+				increment the fix count to make
+				sure that no state change takes place. */
+				bpage->fix();
+				hash_lock->write_unlock();
+				block = reinterpret_cast<buf_block_t*>(bpage);
+				fix_block = block;
+				goto got_block;
+			}
+
+			hash_lock->write_unlock();
+		}
+
+		switch (mode) {
+		case BUF_GET_IF_IN_POOL:
+		case BUF_GET_IF_IN_POOL_OR_WATCH:
+		case BUF_PEEK_IF_IN_POOL:
+		case BUF_EVICT_IF_IN_POOL:
+			return(NULL);
+		}
+
+		/* The call path is buf_read_page() ->
+		buf_read_page_low() (fil_space_t::io()) ->
+		buf_page_read_complete() ->
+		buf_decrypt_after_read(). Here fil_space_t* is used
+		and we decrypt -> buf_page_check_corrupt() where page
+		checksums are compared. Decryption, decompression as
+		well as error handling takes place at a lower level.
+		Here we only need to know whether the page really is
+		corrupted, or if an encrypted page with a valid
+		checksum cannot be decrypted. */
+
+		dberr_t local_err = buf_read_page(page_id, zip_size);
+
+		if (local_err == DB_SUCCESS) {
+			buf_read_ahead_random(page_id, zip_size,
+					      ibuf_inside(mtr));
+
+			retries = 0;
+		} else if (mode == BUF_GET_POSSIBLY_FREED) {
+			if (err) {
+				*err = local_err;
+			}
+			return NULL;
+		} else if (retries < BUF_PAGE_READ_MAX_RETRIES) {
+			++retries;
+
+			DBUG_EXECUTE_IF(
+				"innodb_page_corruption_retries",
+				retries = BUF_PAGE_READ_MAX_RETRIES;
+			);
+		} else {
+			if (err) {
+				*err = local_err;
+			}
+
+			/* Pages whose encryption key is unavailable or used
+			key, encryption algorithm or encryption method is
+			incorrect are marked as encrypted in
+			buf_page_check_corrupt(). Unencrypted page could be
+			corrupted in a way where the key_id field is
+			nonzero. There is no checksum on field
+			FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION. */
+			if (local_err == DB_DECRYPTION_FAILED) {
+				return (NULL);
+			}
+
+			if (local_err == DB_PAGE_CORRUPTED
+			    && srv_force_recovery) {
+				return NULL;
+			}
+
+			/* Try to set table as corrupted instead of
+			asserting. */
+			if (page_id.space() == TRX_SYS_SPACE) {
+			} else if (page_id.space() == SRV_TMP_SPACE_ID) {
+			} else if (fil_space_t* space= fil_space_t::get(
+					   page_id.space())) {
+				bool set = dict_set_corrupted_by_space(space);
+				space->release();
+				if (set) {
+					return NULL;
+				}
+			}
+
+			ib::fatal() << "Unable to read page " << page_id
+				<< " into the buffer pool after "
+				<< BUF_PAGE_READ_MAX_RETRIES
+				<< ". The most probable cause"
+				" of this error may be that the"
+				" table has been corrupted."
+				" See https://mariadb.com/kb/en/library/innodb-recovery-modes/";
+		}
+
+#ifdef UNIV_DEBUG
+		if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+		goto loop;
+	} else {
+		fix_block = block;
+	}
+
+	fix_block->fix();
+	hash_lock->read_unlock();
+
+got_block:
+	switch (mode) {
+	default:
+		ut_ad(block->zip_size() == zip_size);
+		break;
+	case BUF_GET_IF_IN_POOL:
+	case BUF_PEEK_IF_IN_POOL:
+	case BUF_EVICT_IF_IN_POOL:
+		if (fix_block->page.io_fix() == BUF_IO_READ) {
+			/* The page is being read to buffer pool,
+			but we cannot wait around for the read to
+			complete. */
+			fix_block->unfix();
+			return(NULL);
+		}
+	}
+
+	switch (UNIV_EXPECT(fix_block->page.state(), BUF_BLOCK_FILE_PAGE)) {
+	case BUF_BLOCK_FILE_PAGE:
+		if (fsp_is_system_temporary(page_id.space())
+		    && block->page.io_fix() != BUF_IO_NONE) {
+			/* This suggests that the page is being flushed.
+			Avoid returning reference to this page.
+			Instead wait for the flush action to complete. */
+			fix_block->unfix();
+			os_thread_sleep(WAIT_FOR_WRITE);
+			goto loop;
+		}
+
+		if (UNIV_UNLIKELY(mode == BUF_EVICT_IF_IN_POOL)) {
+evict_from_pool:
+			ut_ad(!fix_block->page.oldest_modification());
+			mysql_mutex_lock(&buf_pool.mutex);
+			fix_block->unfix();
+
+			if (!buf_LRU_free_page(&fix_block->page, true)) {
+				ut_ad(0);
+			}
+
+			mysql_mutex_unlock(&buf_pool.mutex);
+			return(NULL);
+		}
+
+		break;
+	default:
+		ut_error;
+		break;
+
+	case BUF_BLOCK_ZIP_PAGE:
+		if (UNIV_UNLIKELY(mode == BUF_EVICT_IF_IN_POOL)) {
+			goto evict_from_pool;
+		}
+
+		if (mode == BUF_PEEK_IF_IN_POOL) {
+			/* This mode is only used for dropping an
+			adaptive hash index.  There cannot be an
+			adaptive hash index for a compressed-only
+			page, so do not bother decompressing the page. */
+			fix_block->unfix();
+
+			return(NULL);
+		}
+
+		buf_page_t* bpage = &block->page;
+
+		/* Note: We have already buffer fixed this block. */
+		if (bpage->buf_fix_count() > 1
+		    || bpage->io_fix() != BUF_IO_NONE) {
+
+			/* This condition often occurs when the buffer
+			is not buffer-fixed, but I/O-fixed by
+			buf_page_init_for_read(). */
+			fix_block->unfix();
+
+			/* The block is buffer-fixed or I/O-fixed.
+			Try again later. */
+			os_thread_sleep(WAIT_FOR_READ);
+
+			goto loop;
+		}
+
+		/* Buffer-fix the block so that it cannot be evicted
+		or relocated while we are attempting to allocate an
+		uncompressed page. */
+
+		block = buf_LRU_get_free_block(false);
+		buf_block_init_low(block);
+
+		mysql_mutex_lock(&buf_pool.mutex);
+		hash_lock = buf_pool.page_hash.lock_get(fold);
+
+		hash_lock->write_lock();
+
+		/* Buffer-fixing prevents the page_hash from changing. */
+		ut_ad(bpage == buf_pool.page_hash_get_low(page_id, fold));
+
+		fix_block->unfix(); /* hash_lock protects us after this */
+
+		if (bpage->buf_fix_count() || bpage->io_fix() != BUF_IO_NONE) {
+			/* The block was buffer-fixed or I/O-fixed while
+			buf_pool.mutex was not held by this thread.
+			Free the block that was allocated and retry.
+			This should be extremely unlikely, for example,
+			if buf_page_get_zip() was invoked. */
+
+			hash_lock->write_unlock();
+			buf_LRU_block_free_non_file_page(block);
+			mysql_mutex_unlock(&buf_pool.mutex);
+
+			/* Try again */
+			goto loop;
+		}
+
+		fix_block = block;
+
+		/* Move the compressed page from bpage to block,
+		and uncompress it. */
+
+		/* Note: this is the uncompressed block and it is not
+		accessible by other threads yet because it is not in
+		any list or hash table */
+		mysql_mutex_lock(&buf_pool.flush_list_mutex);
+		buf_relocate(bpage, &block->page);
+
+		/* Set after buf_relocate(). */
+		block->page.set_buf_fix_count(1);
+
+		buf_flush_relocate_on_flush_list(bpage, &block->page);
+		mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+		/* Buffer-fix, I/O-fix, and X-latch the block
+		for the duration of the decompression.
+		Also add the block to the unzip_LRU list. */
+		block->page.set_state(BUF_BLOCK_FILE_PAGE);
+
+		/* Insert at the front of unzip_LRU list */
+		buf_unzip_LRU_add_block(block, FALSE);
+
+		block->page.set_io_fix(BUF_IO_READ);
+		rw_lock_x_lock_inline(&block->lock, 0, file, line);
+
+		MEM_UNDEFINED(bpage, sizeof *bpage);
+
+		mysql_mutex_unlock(&buf_pool.mutex);
+		hash_lock->write_unlock();
+		buf_pool.n_pend_unzip++;
+
+		access_time = block->page.is_accessed();
+
+		if (!access_time && !recv_no_ibuf_operations
+		    && ibuf_page_exists(block->page.id(), zip_size)) {
+			block->page.ibuf_exist = true;
+		}
+
+		buf_page_free_descriptor(bpage);
+
+		/* Decompress the page while not holding
+		buf_pool.mutex. */
+
+		if (!buf_zip_decompress(block, false)) {
+			rw_lock_x_unlock(&fix_block->lock);
+			fix_block->page.io_unfix();
+			fix_block->unfix();
+			--buf_pool.n_pend_unzip;
+
+			if (err) {
+				*err = DB_PAGE_CORRUPTED;
+			}
+			return NULL;
+		}
+
+		rw_lock_x_unlock(&block->lock);
+		fix_block->page.io_unfix();
+		--buf_pool.n_pend_unzip;
+		break;
+	}
+
+	ut_ad(block == fix_block);
+	ut_ad(fix_block->page.buf_fix_count());
+
+	ut_ad(fix_block->page.state() == BUF_BLOCK_FILE_PAGE);
+
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+re_evict:
+	if (mode != BUF_GET_IF_IN_POOL
+	    && mode != BUF_GET_IF_IN_POOL_OR_WATCH) {
+	} else if (!ibuf_debug) {
+	} else if (fil_space_t* space = fil_space_t::get(page_id.space())) {
+		/* Try to evict the block from the buffer pool, to use the
+		insert buffer (change buffer) as much as possible. */
+
+		mysql_mutex_lock(&buf_pool.mutex);
+
+		fix_block->unfix();
+
+		/* Blocks cannot be relocated or enter or exit the
+		buf_pool while we are holding the buf_pool.mutex. */
+		const bool evicted = buf_LRU_free_page(&fix_block->page, true);
+		space->release();
+
+		if (evicted) {
+			hash_lock = buf_pool.page_hash.lock_get(fold);
+			hash_lock->write_lock();
+			mysql_mutex_unlock(&buf_pool.mutex);
+			/* We may set the watch, as it would have
+			been set if the page were not in the
+			buffer pool in the first place. */
+			block= reinterpret_cast<buf_block_t*>(
+				mode == BUF_GET_IF_IN_POOL_OR_WATCH
+				? buf_pool.watch_set(page_id, &hash_lock)
+				: buf_pool.page_hash_get_low(page_id, fold));
+			hash_lock->write_unlock();
+
+			if (block != NULL) {
+				/* Either the page has been read in or
+				a watch was set on that in the window
+				where we released the buf_pool.mutex
+				and before we acquire the hash_lock
+				above. Try again. */
+				guess = block;
+
+				goto loop;
+			}
+
+			return(NULL);
+		}
+
+		fix_block->fix();
+		mysql_mutex_unlock(&buf_pool.mutex);
+		buf_flush_list();
+		buf_flush_wait_batch_end_acquiring_mutex(false);
+		while (buf_flush_list_space(space));
+		os_aio_wait_until_no_pending_writes();
+
+		if (fix_block->page.buf_fix_count() == 1
+		    && !fix_block->page.oldest_modification()) {
+			goto re_evict;
+		}
+
+		/* Failed to evict the page; change it directly */
+	}
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+
+	ut_ad(fix_block->page.buf_fix_count());
+
+#ifdef UNIV_DEBUG
+	/* We have already buffer fixed the page, and we are committed to
+	returning this page to the caller. Register for debugging.
+	Avoid debug latching if page/block belongs to system temporary
+	tablespace (Not much needed for table with single threaded access.). */
+	if (!fsp_is_system_temporary(page_id.space())) {
+		ibool   ret;
+		ret = rw_lock_s_lock_nowait(
+			fix_block->debug_latch, file, line);
+		ut_a(ret);
+	}
+#endif /* UNIV_DEBUG */
+
+	/* While tablespace is reinited the indexes are already freed but the
+	blocks related to it still resides in buffer pool. Trying to remove
+	such blocks from buffer pool would invoke removal of AHI entries
+	associated with these blocks. Logic to remove AHI entry will try to
+	load the block but block is already in free state. Handle the said case
+	with mode = BUF_PEEK_IF_IN_POOL that is invoked from
+	"btr_search_drop_page_hash_when_freed". */
+	ut_ad(mode == BUF_GET_POSSIBLY_FREED
+	      || mode == BUF_PEEK_IF_IN_POOL
+	      || fix_block->page.status != buf_page_t::FREED);
+
+	const bool not_first_access = fix_block->page.set_accessed();
+
+	if (mode != BUF_PEEK_IF_IN_POOL) {
+		buf_page_make_young_if_needed(&fix_block->page);
+	}
+
+#ifdef UNIV_DEBUG
+	if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+	ut_ad(fix_block->page.state() == BUF_BLOCK_FILE_PAGE);
+
+	/* We have to wait here because the IO_READ state was set
+	under the protection of the hash_lock and not block->lock. */
+	buf_wait_for_read(fix_block);
+
+	if (fix_block->page.id() != page_id) {
+		fix_block->unfix();
+
+#ifdef UNIV_DEBUG
+		if (!fsp_is_system_temporary(page_id.space())) {
+			rw_lock_s_unlock(fix_block->debug_latch);
+		}
+#endif /* UNIV_DEBUG */
+
+		if (err) {
+			*err = DB_PAGE_CORRUPTED;
+		}
+
+		return NULL;
+	}
+
+	if (fix_block->page.status != buf_page_t::FREED
+	    && allow_ibuf_merge
+	    && fil_page_get_type(fix_block->frame) == FIL_PAGE_INDEX
+	    && page_is_leaf(fix_block->frame)) {
+		rw_lock_x_lock_inline(&fix_block->lock, 0, file, line);
+
+		if (fix_block->page.ibuf_exist) {
+			fix_block->page.ibuf_exist = false;
+			ibuf_merge_or_delete_for_page(fix_block, page_id,
+						      zip_size);
+		}
+
+		if (rw_latch == RW_X_LATCH) {
+			mtr->memo_push(fix_block, MTR_MEMO_PAGE_X_FIX);
+		} else {
+			rw_lock_x_unlock(&fix_block->lock);
+			goto get_latch;
+		}
+	} else {
+get_latch:
+		fix_block = buf_page_mtr_lock(fix_block, rw_latch, mtr,
+					      file, line);
+	}
+
+	if (!not_first_access && mode != BUF_PEEK_IF_IN_POOL) {
+		/* In the case of a first access, try to apply linear
+		read-ahead */
+
+		buf_read_ahead_linear(page_id, zip_size, ibuf_inside(mtr));
+	}
+
+	return(fix_block);
+}
+
+/** Get access to a database page. Buffered redo log may be applied.
+@param[in]	page_id			page id
+@param[in]	zip_size		ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	rw_latch		RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
+@param[in]	guess			guessed block or NULL
+@param[in]	mode			BUF_GET, BUF_GET_IF_IN_POOL,
+BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, BUF_GET_IF_IN_POOL_OR_WATCH,
+or BUF_GET_POSSIBLY_FREED
+@param[in]	file			file name
+@param[in]	line			line where called
+@param[in]	mtr			mini-transaction
+@param[out]	err			DB_SUCCESS or error code; may be NULL
+@param[in]	allow_ibuf_merge	allow change buffer merge on the page
+@return pointer to the block or NULL */
+buf_block_t*
+buf_page_get_gen(
+	const page_id_t		page_id,
+	ulint			zip_size,
+	ulint			rw_latch,
+	buf_block_t*		guess,
+	ulint			mode,
+	const char*		file,
+	unsigned		line,
+	mtr_t*			mtr,
+	dberr_t*		err,
+	bool			allow_ibuf_merge)
+{
+  if (buf_block_t *block= recv_sys.recover(page_id))
+  { /* recv_sys.recover() supplied the block; serve the request from it */
+    block->fix();
+    ut_ad(rw_lock_s_lock_nowait(block->debug_latch, file, line));
+    if (err)
+      *err= DB_SUCCESS;
+    const bool must_merge= allow_ibuf_merge &&
+      ibuf_page_exists(page_id, block->zip_size());
+    if (block->page.status == buf_page_t::FREED)
+      ut_ad(mode == BUF_GET_POSSIBLY_FREED || mode == BUF_PEEK_IF_IN_POOL);
+    else if (must_merge && fil_page_get_type(block->frame) == FIL_PAGE_INDEX &&
+	     page_is_leaf(block->frame))
+    { /* merge buffered changes while holding the exclusive block latch */
+      rw_lock_x_lock_inline(&block->lock, 0, file, line);
+      block->page.ibuf_exist= false;
+      ibuf_merge_or_delete_for_page(block, page_id, block->zip_size());
+
+      if (rw_latch == RW_X_LATCH)
+      { /* the caller wants the X-latch that is already held: keep it */
+        mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX);
+	return block;
+      }
+      rw_lock_x_unlock(&block->lock);
+    }
+    block= buf_page_mtr_lock(block, rw_latch, mtr, file, line);
+    return block;
+  }
+
+  return buf_page_get_low(page_id, zip_size, rw_latch,
+                          guess, mode, file, line, mtr, err, allow_ibuf_merge);
+}
+
+/********************************************************************//**
+Try to get access to a guessed block without waiting: buffer-fix it, latch
+it with a no-wait request, and verify that its modify clock is unchanged.
+@return TRUE if success */
+ibool
+buf_page_optimistic_get(
+/*====================*/
+	ulint		rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
+	buf_block_t*	block,	/*!< in: guessed buffer block */
+	ib_uint64_t	modify_clock,/*!< in: expected modify clock */
+	const char*	file,	/*!< in: file name */
+	unsigned	line,	/*!< in: line where called */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+{
+	ibool		success;
+
+	ut_ad(block);
+	ut_ad(mtr);
+	ut_ad(mtr->is_active());
+	ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH);
+
+	if (UNIV_UNLIKELY(block->page.state() != BUF_BLOCK_FILE_PAGE
+			  || block->page.io_fix() != BUF_IO_NONE)) {
+		return FALSE; /* checked without holding any latch */
+	}
+
+	const page_id_t id(block->page.id());
+
+	page_hash_latch *hash_lock = buf_pool.hash_lock_get(id);
+	hash_lock->read_lock();
+
+	if (UNIV_UNLIKELY(id != block->page.id()
+			  || block->page.state() != BUF_BLOCK_FILE_PAGE
+			  || block->page.io_fix() != BUF_IO_NONE)) {
+		hash_lock->read_unlock(); /* rechecked under hash_lock */
+		return(FALSE);
+	}
+
+	buf_block_buf_fix_inc(block, file, line);
+	hash_lock->read_unlock(); /* the buffer-fix was taken under it */
+
+	block->page.set_accessed();
+
+	buf_page_make_young_if_needed(&block->page);
+
+	ut_ad(!ibuf_inside(mtr) || ibuf_page(id, block->zip_size(), NULL));
+
+	mtr_memo_type_t	fix_type;
+
+	if (rw_latch == RW_S_LATCH) {
+		fix_type = MTR_MEMO_PAGE_S_FIX;
+		success = rw_lock_s_lock_nowait(&block->lock, file, line);
+	} else {
+		fix_type = MTR_MEMO_PAGE_X_FIX;
+		success = rw_lock_x_lock_func_nowait_inline(
+			&block->lock, file, line);
+	}
+
+	ut_ad(id == block->page.id());
+
+	if (!success) { /* the latch was not granted: undo the buffer-fix */
+		buf_block_buf_fix_dec(block);
+		return(FALSE);
+	}
+
+	if (modify_clock != block->modify_clock) { /* not the expected value */
+
+		buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+		if (rw_latch == RW_S_LATCH) {
+			rw_lock_s_unlock(&block->lock);
+		} else {
+			rw_lock_x_unlock(&block->lock);
+		}
+
+		buf_block_buf_fix_dec(block);
+		return(FALSE);
+	}
+
+	mtr_memo_push(mtr, block, fix_type); /* mtr now tracks the latch */
+
+#ifdef UNIV_DEBUG
+	if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+	ut_ad(block->page.buf_fix_count());
+	ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);
+
+	buf_pool.stat.n_page_gets++;
+
+	return(TRUE);
+}
+
+/** Try to get a page that is already in the buffer pool, without waiting
+for the page latch. A page that is not in the buffer pool is not loaded.
+Suitable for using when holding the lock_sys_t::mutex.
+@param[in]	page_id	page id
+@param[in]	file	file name
+@param[in]	line	line where called
+@param[in]	mtr	mini-transaction; the latched block is registered in it
+@return the buffer-fixed and latched block, or NULL if not available */
+buf_block_t*
+buf_page_try_get_func(
+	const page_id_t		page_id,
+	const char*		file,
+	unsigned		line,
+	mtr_t*			mtr)
+{
+  ut_ad(mtr);
+  ut_ad(mtr->is_active());
+
+  page_hash_latch *hash_lock;
+  buf_page_t *bpage= buf_pool.page_hash_get_locked<false>(page_id,
+                                                          page_id.fold(),
+                                                          &hash_lock);
+  if (!bpage)
+    return nullptr;
+  if (bpage->state() != BUF_BLOCK_FILE_PAGE)
+  { /* only BUF_BLOCK_FILE_PAGE blocks are returned */
+    hash_lock->read_unlock();
+    return nullptr;
+  }
+
+  buf_block_t *block= reinterpret_cast<buf_block_t*>(bpage);
+  buf_block_buf_fix_inc(block, file, line);
+  hash_lock->read_unlock(); /* the buffer-fix was taken under hash_lock */
+
+  mtr_memo_type_t fix_type= MTR_MEMO_PAGE_S_FIX;
+  if (!rw_lock_s_lock_nowait(&block->lock, file, line))
+  {
+    /* Let us try to get an X-latch. If the current thread
+    is holding an X-latch on the page, we cannot get an S-latch. */
+    fix_type= MTR_MEMO_PAGE_X_FIX;
+    if (!rw_lock_x_lock_func_nowait_inline(&block->lock, file, line))
+    { /* neither latch is available without waiting: undo the fix */
+      buf_block_buf_fix_dec(block);
+      return nullptr;
+    }
+  }
+
+  mtr_memo_push(mtr, block, fix_type); /* S or X, whichever was granted */
+
+#ifdef UNIV_DEBUG
+  if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+  ut_ad(bpage->buf_fix_count());
+  ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE);
+  ut_ad(bpage->id() == page_id);
+  buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+  buf_pool.stat.n_page_gets++;
+  return block;
+}
+
+/** Prepare this block descriptor for holding the given page.
+@param page_id  identifier of the page that the block will hold
+@param zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param fix      buf_fix_count() that the block starts out with */
+void
+buf_block_t::initialise(const page_id_t page_id, ulint zip_size, uint32_t fix)
+{
+  ut_ad(this->page.state() != BUF_BLOCK_FILE_PAGE);
+  buf_block_init_low(this);
+  this->page.init(page_id, fix);
+  page_zip_set_size(&this->page.zip, zip_size);
+}
+
+/** Initialize a page in the buffer pool. The page is usually not read
+from a file even if it cannot be found in buf_pool. This is one of the
+functions that perform the block state transition NOT_USED => FILE_PAGE
+(the other is buf_page_get_gen).
+@param[in,out]	space		tablespace
+@param[in]	offset		page number within the tablespace
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out]	mtr		mini-transaction that will hold the X-latch
+@param[in,out]	free_block	pre-allocated buffer block, initialised here
+@return pointer to the block, page buffer-fixed and X-latched */
+buf_block_t*
+buf_page_create(fil_space_t *space, uint32_t offset,
+                ulint zip_size, mtr_t *mtr, buf_block_t *free_block)
+{
+  page_id_t page_id(space->id, offset);
+  ut_ad(mtr->is_active());
+  ut_ad(page_id.space() != 0 || !zip_size);
+
+  space->free_page(offset, false);
+  free_block->initialise(page_id, zip_size, 1); /* buf_fix_count() == 1 */
+
+  const ulint fold= page_id.fold();
+  mysql_mutex_lock(&buf_pool.mutex);
+
+loop:
+  buf_block_t *block= reinterpret_cast<buf_block_t*>
+    (buf_pool.page_hash_get_low(page_id, fold));
+
+  if (block && block->page.in_file() &&
+      !buf_pool.watch_is_sentinel(block->page))
+  { /* a descriptor of the page was found in buf_pool.page_hash */
+#ifdef BTR_CUR_HASH_ADAPT
+    const dict_index_t *drop_hash_entry= nullptr;
+#endif
+    switch (UNIV_EXPECT(block->page.state(), BUF_BLOCK_FILE_PAGE)) {
+    default:
+      ut_ad(0);
+      break;
+    case BUF_BLOCK_FILE_PAGE:
+      if (!mtr->have_x_latch(*block))
+      {
+        buf_block_buf_fix_inc(block, __FILE__, __LINE__);
+        while (!rw_lock_x_lock_nowait(&block->lock))
+        {
+          /* Wait for buf_page_write_complete() to release block->lock.
+          We must not hold buf_pool.mutex while waiting. */
+          timespec abstime;
+          set_timespec_nsec(abstime, 1000000);
+          my_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.mutex.m_mutex,
+                            &abstime);
+        }
+        mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX);
+      }
+      else
+      { /* this mini-transaction already holds the X-latch */
+        ut_ad(!block->page.ibuf_exist);
+#ifdef BTR_CUR_HASH_ADAPT
+        ut_ad(!block->index);
+#endif
+      }
+#ifdef BTR_CUR_HASH_ADAPT
+      drop_hash_entry= block->index;
+#endif
+      break;
+    case BUF_BLOCK_ZIP_PAGE: /* relocate the descriptor to free_block */
+      page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
+      hash_lock->write_lock();
+      if (block->page.io_fix() != BUF_IO_NONE)
+      {
+        hash_lock->write_unlock();
+        /* Wait for buf_page_write_complete() to release the I/O fix. */
+        timespec abstime;
+        set_timespec_nsec(abstime, 1000000);
+        my_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.mutex.m_mutex,
+                          &abstime);
+        goto loop; /* repeat the page_hash lookup */
+      }
+
+      rw_lock_x_lock(&free_block->lock);
+      mysql_mutex_lock(&buf_pool.flush_list_mutex);
+      buf_relocate(&block->page, &free_block->page);
+      buf_flush_relocate_on_flush_list(&block->page, &free_block->page);
+      mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+      free_block->page.set_state(BUF_BLOCK_FILE_PAGE);
+      buf_unzip_LRU_add_block(free_block, FALSE);
+      hash_lock->write_unlock();
+      buf_page_free_descriptor(&block->page);
+      block= free_block; /* free_block now stands for the page */
+      buf_block_buf_fix_inc(block, __FILE__, __LINE__);
+      mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX);
+      break;
+    }
+
+    mysql_mutex_unlock(&buf_pool.mutex);
+
+#ifdef BTR_CUR_HASH_ADAPT
+    if (drop_hash_entry)
+      btr_search_drop_page_hash_index(block);
+#endif /* BTR_CUR_HASH_ADAPT */
+
+    if (block->page.ibuf_exist)
+    {
+      if (!recv_recovery_is_on())
+        ibuf_merge_or_delete_for_page(nullptr, page_id, zip_size);
+      block->page.ibuf_exist= false;
+    }
+
+    return block;
+  }
+
+  /* If we get here, the page was not in buf_pool: init it there */
+
+  DBUG_PRINT("ib_buf", ("create page %u:%u",
+                        page_id.space(), page_id.page_no()));
+
+  block= free_block;
+
+  /* Duplicate buf_block_buf_fix_inc_func() */
+  ut_ad(block->page.buf_fix_count() == 1);
+  ut_ad(fsp_is_system_temporary(page_id.space()) ||
+        rw_lock_s_lock_nowait(block->debug_latch, __FILE__, __LINE__));
+
+  /* The block must be put to the LRU list */
+  buf_LRU_add_block(&block->page, false);
+  page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
+  hash_lock->write_lock();
+  block->page.set_state(BUF_BLOCK_FILE_PAGE);
+  ut_d(block->page.in_page_hash= true);
+  HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, &block->page);
+
+  rw_lock_x_lock(&block->lock);
+  if (UNIV_UNLIKELY(zip_size))
+  {
+    /* Prevent race conditions during buf_buddy_alloc(), which may
+    release and reacquire buf_pool.mutex, by IO-fixing and X-latching
+    the block. */
+    block->page.set_io_fix(BUF_IO_READ);
+    hash_lock->write_unlock();
+
+    /* buf_pool.mutex may be released and reacquired by
+    buf_buddy_alloc(). We must defer this operation until
+    after the block descriptor has been added to
+    buf_pool.LRU and buf_pool.page_hash. */
+    block->page.zip.data= buf_buddy_alloc(zip_size);
+
+    /* To maintain the invariant block->in_unzip_LRU_list ==
+    block->page.belongs_to_unzip_LRU() we have to add this
+    block to unzip_LRU after block->page.zip.data is set. */
+    ut_ad(block->page.belongs_to_unzip_LRU());
+    buf_unzip_LRU_add_block(block, FALSE);
+
+    block->page.set_io_fix(BUF_IO_NONE);
+  }
+  else
+    hash_lock->write_unlock();
+
+  mysql_mutex_unlock(&buf_pool.mutex);
+
+  mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX);
+  block->page.set_accessed();
+  buf_pool.stat.n_pages_created++;
+
+  /* Delete possible entries for the page from the insert buffer:
+  such can exist if the page belonged to an index which was dropped */
+  if (page_id < page_id_t{SRV_SPACE_ID_UPPER_BOUND, 0} &&
+      !recv_recovery_is_on())
+    ibuf_merge_or_delete_for_page(nullptr, page_id, zip_size);
+
+  static_assert(FIL_PAGE_PREV + 4 == FIL_PAGE_NEXT, "adjacent");
+  memset_aligned<8>(block->frame + FIL_PAGE_PREV, 0xff, 8); /* PREV, NEXT */
+  mach_write_to_2(block->frame + FIL_PAGE_TYPE, FIL_PAGE_TYPE_ALLOCATED);
+
+  /* FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION is only used on the
+  following pages:
+  (1) The first page of the InnoDB system tablespace (page 0:0)
+  (2) FIL_RTREE_SPLIT_SEQ_NUM on R-tree pages
+  (3) key_version on encrypted pages (not page 0:0) */
+
+  memset(block->frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8);
+  memset_aligned<8>(block->frame + FIL_PAGE_LSN, 0, 8);
+
+#ifdef UNIV_DEBUG
+  if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+  return block;
+}
+
+/** Count a completed page read or write in the MONITOR_MODULE_BUF_PAGE
+counter that corresponds to the type of the page.
+@param bpage   buffer page whose read or write was completed
+@param io_type BUF_IO_READ or BUF_IO_WRITE */
+ATTRIBUTE_COLD __attribute__((nonnull))
+void buf_page_monitor(const buf_page_t *bpage, buf_io_fix io_type)
+{
+  ut_ad(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE);
+
+  /* Use bpage->zip.data when it is set, else the frame of the block. */
+  const byte *const page= bpage->zip.data
+    ? bpage->zip.data
+    : ((buf_block_t*) bpage)->frame;
+  monitor_id_t counter_id;
+
+  switch (fil_page_get_type(page)) {
+  case FIL_PAGE_TYPE_INSTANT:
+  case FIL_PAGE_INDEX:
+  case FIL_PAGE_RTREE:
+  {
+    const bool is_leaf= btr_page_get_level(page) == 0;
+    /* Index pages of the insert buffer have counters of their own. */
+    const bool is_ibuf= fil_page_get_type(page) == FIL_PAGE_INDEX &&
+      btr_page_get_index_id(page) ==
+      (index_id_t)(DICT_IBUF_ID_MIN + IBUF_SPACE_ID);
+
+    if (is_ibuf)
+    {
+      if (is_leaf)
+        counter_id= MONITOR_RW_COUNTER(io_type, MONITOR_INDEX_IBUF_LEAF_PAGE);
+      else
+        counter_id= MONITOR_RW_COUNTER(io_type,
+                                       MONITOR_INDEX_IBUF_NON_LEAF_PAGE);
+    }
+    else
+    {
+      if (is_leaf)
+        counter_id= MONITOR_RW_COUNTER(io_type, MONITOR_INDEX_LEAF_PAGE);
+      else
+        counter_id= MONITOR_RW_COUNTER(io_type, MONITOR_INDEX_NON_LEAF_PAGE);
+    }
+    break;
+  }
+
+  case FIL_PAGE_UNDO_LOG:
+    counter_id= MONITOR_RW_COUNTER(io_type, MONITOR_UNDO_LOG_PAGE);
+    break;
+
+  case FIL_PAGE_INODE:
+    counter_id= MONITOR_RW_COUNTER(io_type, MONITOR_INODE_PAGE);
+    break;
+
+  case FIL_PAGE_IBUF_FREE_LIST:
+    counter_id= MONITOR_RW_COUNTER(io_type,
+                                   MONITOR_IBUF_FREELIST_PAGE);
+    break;
+
+  case FIL_PAGE_IBUF_BITMAP:
+    counter_id= MONITOR_RW_COUNTER(io_type,
+                                   MONITOR_IBUF_BITMAP_PAGE);
+    break;
+
+  case FIL_PAGE_TYPE_SYS:
+    counter_id= MONITOR_RW_COUNTER(io_type, MONITOR_SYSTEM_PAGE);
+    break;
+
+  case FIL_PAGE_TYPE_TRX_SYS:
+    counter_id= MONITOR_RW_COUNTER(io_type, MONITOR_TRX_SYSTEM_PAGE);
+    break;
+
+  case FIL_PAGE_TYPE_FSP_HDR:
+    counter_id= MONITOR_RW_COUNTER(io_type, MONITOR_FSP_HDR_PAGE);
+    break;
+
+  case FIL_PAGE_TYPE_XDES:
+    counter_id= MONITOR_RW_COUNTER(io_type, MONITOR_XDES_PAGE);
+    break;
+
+  case FIL_PAGE_TYPE_BLOB:
+    counter_id= MONITOR_RW_COUNTER(io_type, MONITOR_BLOB_PAGE);
+    break;
+
+  case FIL_PAGE_TYPE_ZBLOB:
+    counter_id= MONITOR_RW_COUNTER(io_type, MONITOR_ZBLOB_PAGE);
+    break;
+
+  case FIL_PAGE_TYPE_ZBLOB2:
+    counter_id= MONITOR_RW_COUNTER(io_type, MONITOR_ZBLOB2_PAGE);
+    break;
+
+  /* Every page type that is not listed above shares one counter. */
+  default:
+    counter_id= MONITOR_RW_COUNTER(io_type, MONITOR_OTHER_PAGE);
+    break;
+  }
+
+  MONITOR_INC_NOCHECK(counter_id);
+}
+
+/** Flag the table of a tablespace as corrupted, or as encrypted.
+@param[in]	bpage	corrupted page (not used by this function)
+@param[in]	space	tablespace of the corrupted page */
+ATTRIBUTE_COLD
+static void buf_mark_space_corrupt(buf_page_t* bpage, const fil_space_t& space)
+{
+  /* A tablespace with an actual encryption scheme is only flagged as
+  encrypted; such tables are marked unusable later, e.g. in ::open().
+  Otherwise, find the table by the tablespace and mark it corrupted. */
+  const bool encrypted= space.crypt_data &&
+    space.crypt_data->type != CRYPT_SCHEME_UNENCRYPTED;
+  if (encrypted)
+    dict_set_encrypted_by_space(&space);
+  else
+    dict_set_corrupted_by_space(&space);
+}
+
+/** Release and evict a corrupted page.
+@param bpage    page that was being read; it must be fixed with BUF_IO_READ */
+ATTRIBUTE_COLD void buf_pool_t::corrupted_evict(buf_page_t *bpage)
+{
+  const page_id_t id(bpage->id());
+  page_hash_latch *hash_lock= hash_lock_get(id);
+
+  mysql_mutex_lock(&mutex);
+  hash_lock->write_lock();
+
+  ut_ad(bpage->io_fix() == BUF_IO_READ);
+  ut_ad(!bpage->oldest_modification());
+  bpage->set_corrupt_id(); /* the id copied above is passed on below */
+
+  if (bpage->state() == BUF_BLOCK_FILE_PAGE)
+    rw_lock_x_unlock_gen(&reinterpret_cast<buf_block_t*>(bpage)->lock,
+                         BUF_IO_READ);
+
+  bpage->io_unfix();
+
+  /* remove from LRU and page_hash; hash_lock is presumably released there */
+  buf_LRU_free_one_page(bpage, id, hash_lock);
+  mysql_mutex_unlock(&mutex);
+
+  ut_d(auto n=) n_pend_reads--; /* n is the count before the decrement */
+  ut_ad(n > 0);
+}
+
+/** Evict a corrupted page from the buffer pool and, unless
+srv_force_recovery is set, flag the table through buf_mark_space_corrupt().
+@param[in]	bpage	Corrupted page
+@param[in]	node	data file */
+ATTRIBUTE_COLD
+static void buf_corrupt_page_release(buf_page_t *bpage, const fil_node_t &node)
+{
+  ut_ad(node.space->id == bpage->id().space());
+  buf_pool.corrupted_evict(bpage);
+  if (srv_force_recovery)
+    return;
+  buf_mark_space_corrupt(bpage, *node.space);
+}
+
+/** Check a full_crc32 page for a wrong space id or mismatching LSN fields.
+@param[in]	space_id	tablespace id that the page should carry
+@param[in]	d		page
+@param[in]	is_compressed	compressed page
+@return true if page is corrupted or false if it isn't */
+static bool buf_page_full_crc32_is_corrupted(ulint space_id, const byte* d,
+                                             bool is_compressed)
+{
+  static_assert(FIL_PAGE_LSN % 4 == 0, "alignment");
+  static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment");
+
+  if (mach_read_from_4(d + FIL_PAGE_SPACE_ID) != space_id)
+    return true;
+  if (is_compressed)
+    return false;
+  const byte *const lsn_tail= d + srv_page_size - FIL_PAGE_FCRC32_END_LSN;
+  return memcmp_aligned<4>(d + FIL_PAGE_LSN + 4, lsn_tail, 4) != 0;
+}
+
+/** Check whether a page that was read in is corrupted or cannot be
+decrypted. The page may be compressed, encrypted or both, and we cannot
+be 100% sure whether it is corrupted or whether decrypting or
+decompressing it just failed.
+@param[in,out]	bpage		page
+@param[in]	node		data file
+@return	error code
+@retval	DB_SUCCESS		if page has been read and is not corrupted
+@retval	DB_PAGE_CORRUPTED	if page based on checksum check is corrupted
+@retval	DB_DECRYPTION_FAILED	if page post encryption checksum matches but
+after decryption normal page checksum does not match. */
+static dberr_t buf_page_check_corrupt(buf_page_t *bpage,
+                                      const fil_node_t &node)
+{
+  const auto space= node.space;
+  ut_ad(space->referenced());
+
+  byte *frame= bpage->zip.data
+    ? bpage->zip.data
+    : ((buf_block_t*) bpage)->frame;
+  const uint key_version= buf_page_get_key_version(frame, space->flags);
+
+  /* By now, the page has been decrypted if its post-encryption checksum
+  matched and the encryption plugin knew the key. If that checksum did
+  not match, the page was not decrypted, and it can be encrypted and
+  corrupted, or just corrupted, or a good page. A page that was
+  decrypted can still be corrupted if the key that was used is wrong. */
+  const bool seems_encrypted= !space->full_crc32() && key_version &&
+    space->crypt_data &&
+    space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED;
+  ut_ad(space->full_crc32() || space->purpose != FIL_TYPE_TEMPORARY);
+
+  /* If the traditional checksums match, we assume that the page is
+  no longer encrypted. */
+  bool corrupted;
+
+  if (space->full_crc32() &&
+      !buf_is_zeroes(span<const byte>(frame, space->physical_size())) &&
+      (key_version || space->is_compressed() ||
+       space->purpose == FIL_TYPE_TEMPORARY))
+    corrupted= buf_page_full_crc32_is_corrupted(bpage->id().space(), frame,
+                                                space->is_compressed());
+  else
+    corrupted= buf_page_is_corrupted(true, frame, space->flags);
+
+  if (!corrupted)
+    return DB_SUCCESS;
+
+  /* A corrupted page that seems to be encrypted is reported as a
+  decryption failure instead, unless it is the first page (page 0)
+  of the tablespace. */
+  if (!seems_encrypted || bpage->id().page_no() == 0)
+    return DB_PAGE_CORRUPTED;
+
+  ib::error()
+    << "The page " << bpage->id()
+    << " in file '" << node.name
+    << "' cannot be decrypted.";
+
+  ib::info()
+    << "However key management plugin or used key_version "
+    << key_version
+    << " is not found or"
+    " used encryption algorithm or method does not match.";
+
+  if (bpage->id().space() != TRX_SYS_SPACE)
+  {
+    ib::info()
+      << "Marking tablespace as missing."
+      " You may drop this table or"
+      " install correct key management plugin"
+      " and key file.";
+  }
+
+  return DB_DECRYPTION_FAILED;
+}
+
+/** Complete a read request of a file page to buf_pool.
+@param bpage    recently read page, still fixed with BUF_IO_READ
+@param node     data file
+@return error code
+@retval DB_SUCCESS              if the page was read and found to be OK
+@retval DB_PAGE_CORRUPTED       if the checksum or decompression fails
+@retval DB_DECRYPTION_FAILED    if the page cannot be decrypted */
+dberr_t buf_page_read_complete(buf_page_t *bpage, const fil_node_t &node)
+{
+  const page_id_t id(bpage->id());
+  ut_ad(bpage->in_file());
+  ut_ad(!buf_dblwr.is_inside(id));
+  ut_ad(id.space() == node.space->id);
+  ut_ad(bpage->zip_size() == node.space->zip_size());
+
+  /* We do not need to protect io_fix with a mutex in order to read it,
+  because this function and buf_page_write_complete() are the only ones
+  that change the value from BUF_IO_READ or BUF_IO_WRITE to some other
+  value, and our code ensures that this is the only thread that handles
+  the i/o for this block. */
+
+  ut_ad(bpage->io_fix() == BUF_IO_READ);
+  ut_ad(!!bpage->zip.ssize == !!bpage->zip.data);
+  ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE || bpage->zip.data);
+
+  const byte *frame= bpage->zip.data
+    ? bpage->zip.data
+    : reinterpret_cast<buf_block_t*>(bpage)->frame;
+  ut_ad(frame);
+
+  dberr_t err;
+  if (!buf_page_decrypt_after_read(bpage, node))
+  {
+    err= DB_DECRYPTION_FAILED;
+    goto database_corrupted;
+  }
+
+  if (bpage->zip.data && bpage->state() == BUF_BLOCK_FILE_PAGE)
+  { /* a BUF_BLOCK_FILE_PAGE that has zip.data: decompress it */
+    buf_pool.n_pend_unzip++;
+    auto ok= buf_zip_decompress(reinterpret_cast<buf_block_t*>(bpage), FALSE);
+    buf_pool.n_pend_unzip--;
+
+    if (!ok)
+    {
+      ib::info() << "Page " << id << " zip_decompress failure.";
+      err= DB_PAGE_CORRUPTED;
+      goto database_corrupted;
+    }
+  }
+
+  { /* compare the identifier stored in the page with the requested one */
+    const page_id_t read_id(mach_read_from_4(frame + FIL_PAGE_SPACE_ID),
+                            mach_read_from_4(frame + FIL_PAGE_OFFSET));
+
+    if (read_id == id);
+    else if (read_id == page_id_t(0, 0))
+      /* This is likely an uninitialized page. */;
+    else if (!node.space->full_crc32() &&
+             page_id_t(0, read_id.page_no()) == id)
+      /* FIL_PAGE_SPACE_ID was written as garbage in the system tablespace
+      before MySQL 4.1.1, which introduced innodb_file_per_table. */;
+    else if (node.space->full_crc32() &&
+             *reinterpret_cast<const uint32_t*>
+             (&frame[FIL_PAGE_FCRC32_KEY_VERSION]) &&
+             node.space->crypt_data &&
+             node.space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED)
+    {
+      ib::error() << "Cannot decrypt " << id;
+      err= DB_DECRYPTION_FAILED;
+      goto release_page;
+    }
+    else /* only report the mismatch; the corruption check follows */
+      ib::error() << "Space id and page no stored in the page, read in are "
+                  << read_id << ", should be " << id;
+  }
+
+  err= buf_page_check_corrupt(bpage, node);
+  if (UNIV_UNLIKELY(err != DB_SUCCESS))
+  {
+database_corrupted:
+    /* Not a real corruption if it was triggered by error injection */
+    DBUG_EXECUTE_IF("buf_page_import_corrupt_failure",
+                    if (!is_predefined_tablespace(id.space()))
+                    {
+                      buf_corrupt_page_release(bpage, node);
+                      ib::info() << "Simulated IMPORT corruption";
+                      return err;
+                    }
+                    err= DB_SUCCESS;
+                    goto page_not_corrupt;);
+
+    if (bpage->zip.data && bpage->state() == BUF_BLOCK_FILE_PAGE)
+      memset(reinterpret_cast<buf_block_t*>(bpage)->frame, 0, srv_page_size);
+
+    if (err == DB_PAGE_CORRUPTED)
+    {
+      ib::error() << "Database page corruption on disk"
+                     " or a failed read of file '"
+                  << node.name << "' page " << id
+                  << ". You may have to recover from a backup.";
+
+      buf_page_print(frame, bpage->zip_size());
+
+      ib::info() << " You can use CHECK TABLE to scan"
+                    " your table for corruption. "
+                 << FORCE_RECOVERY_MSG;
+    }
+
+    if (!srv_force_recovery)
+    {
+      /* If the corruption is in the system tablespace, we will
+      intentionally crash the server. */
+      if (id.space() == TRX_SYS_SPACE)
+        ib::fatal() << "Aborting because of a corrupt database page.";
+      buf_corrupt_page_release(bpage, node);
+      return err;
+    }
+  }
+
+  DBUG_EXECUTE_IF("buf_page_import_corrupt_failure",
+                  page_not_corrupt: bpage= bpage; );
+
+  if (err == DB_PAGE_CORRUPTED || err == DB_DECRYPTION_FAILED)
+  { /* reached with srv_force_recovery set, or via goto release_page */
+release_page:
+    buf_corrupt_page_release(bpage, node);
+    if (recv_recovery_is_on())
+      recv_sys.free_corrupted_page(id);
+    return err;
+  }
+
+  if (recv_recovery_is_on())
+    recv_recover_page(node.space, bpage);
+
+  if (bpage->state() == BUF_BLOCK_FILE_PAGE && !recv_no_ibuf_operations &&
+      (!id.space() || !is_predefined_tablespace(id.space())) &&
+      fil_page_get_type(frame) == FIL_PAGE_INDEX &&
+      page_is_leaf(frame))
+    bpage->ibuf_exist= true;
+
+  if (UNIV_UNLIKELY(MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE)))
+    buf_page_monitor(bpage, BUF_IO_READ);
+  DBUG_PRINT("ib_buf", ("read page %u:%u",
+                        id.space(), id.page_no()));
+
+  /* Because this thread which does the unlocking might not be the same that
+  did the locking, we use a pass value != 0 in unlock, which simply
+  removes the newest lock debug record, without checking the thread id. */
+  if (bpage->state() == BUF_BLOCK_FILE_PAGE)
+    rw_lock_x_unlock_gen(&((buf_block_t*) bpage)->lock, BUF_IO_READ);
+  bpage->io_unfix();
+
+  ut_d(auto n=) buf_pool.n_pend_reads--; /* n: count before the decrement */
+  ut_ad(n > 0);
+  buf_pool.stat.n_pages_read++;
+
+  return DB_SUCCESS;
+}
+
+#ifdef UNIV_DEBUG
+/** Assert that all blocks are in a replaceable state. Any block that
+chunk_t::not_freed() reports is a fatal error. Acquires buf_pool_t::mutex
+for the duration of the check. */
+void buf_pool_t::assert_all_freed()
+{
+  mysql_mutex_lock(&mutex);
+  const chunk_t *const end= chunks + n_chunks;
+  for (const chunk_t *c= chunks; c != end; ++c)
+    if (const buf_block_t *bad= c->not_freed())
+      ib::fatal() << "Page " << bad->page.id() << " still fixed or dirty";
+  mysql_mutex_unlock(&mutex);
+}
+#endif /* UNIV_DEBUG */
+
+/** Record the current time and copy buf_pool.stat to buf_pool.old_stat. */
+void buf_refresh_io_stats()
+{
+  buf_pool.last_printout_time= time(nullptr);
+  buf_pool.old_stat= buf_pool.stat;
+}
+
+/** Invalidate all pages in the buffer pool.
+All pages must be in a replaceable state (not modified or latched). */
+void buf_pool_invalidate()
+{
+	mysql_mutex_lock(&buf_pool.mutex);
+
+	buf_flush_wait_batch_end(true);
+	buf_flush_wait_batch_end(false);
+
+	/* The waits above are needed because a write batch that was posted
+	earlier may still be incomplete. For buffer pool invalidation to
+	proceed we must ensure there is NO write activity happening. */
+
+	ut_d(mysql_mutex_unlock(&buf_pool.mutex));
+	ut_d(buf_pool.assert_all_freed()); /* acquires buf_pool.mutex itself */
+	ut_d(mysql_mutex_lock(&buf_pool.mutex));
+
+	while (buf_LRU_scan_and_free_block()); /* until the call returns false */
+
+	ut_ad(UT_LIST_GET_LEN(buf_pool.LRU) == 0);
+	ut_ad(UT_LIST_GET_LEN(buf_pool.unzip_LRU) == 0);
+
+	buf_pool.freed_page_clock = 0;
+	buf_pool.LRU_old = NULL;
+	buf_pool.LRU_old_len = 0;
+
+	memset(&buf_pool.stat, 0x00, sizeof(buf_pool.stat));
+	buf_refresh_io_stats(); /* also copies the zeroed stat to old_stat */
+	mysql_mutex_unlock(&buf_pool.mutex);
+}
+
+#ifdef UNIV_DEBUG
+/** Validate the buffer pool: cross-check block states and list lengths. */
+void buf_pool_t::validate()
+{
+	ulint		n_lru		= 0;
+	ulint		n_flushing	= 0;
+	ulint		n_free		= 0;
+	ulint		n_zip		= 0;
+
+	mysql_mutex_lock(&mutex);
+
+	chunk_t* chunk = chunks;
+
+	/* Check the uncompressed blocks. */
+	/* Tally free blocks (n_free) and file pages (n_lru) over all chunks. */
+	for (auto i = n_chunks; i--; chunk++) {
+
+		ulint		j;
+		buf_block_t*	block = chunk->blocks;
+
+		for (j = chunk->size; j--; block++) {
+			switch (block->page.state()) {
+			case BUF_BLOCK_ZIP_PAGE:
+				/* This kind of block descriptors should
+				be allocated by malloc() only. */
+				ut_error;
+				break;
+
+			case BUF_BLOCK_NOT_USED:
+				n_free++;
+				break;
+
+			case BUF_BLOCK_MEMORY:
+			case BUF_BLOCK_REMOVE_HASH:
+				/* do nothing */
+				break;
+
+			case BUF_BLOCK_FILE_PAGE:
+				const page_id_t id = block->page.id();
+				ut_ad(page_hash_get_low(id, id.fold())
+				      == &block->page);
+				n_lru++;
+				break;
+
+			}
+		}
+	}
+
+	/* Check dirty blocks. */
+	/* BUF_BLOCK_ZIP_PAGE entries here are added to both n_lru and n_zip. */
+	mysql_mutex_lock(&flush_list_mutex);
+	for (buf_page_t* b = UT_LIST_GET_FIRST(flush_list); b;
+	     b = UT_LIST_GET_NEXT(list, b)) {
+		ut_ad(b->oldest_modification());
+		ut_ad(!fsp_is_system_temporary(b->id().space()));
+		n_flushing++;
+
+		switch (b->state()) {
+		case BUF_BLOCK_ZIP_PAGE:
+			n_lru++;
+			n_zip++;
+			break;
+		case BUF_BLOCK_FILE_PAGE:
+			/* uncompressed page */
+			break;
+		case BUF_BLOCK_NOT_USED:
+		case BUF_BLOCK_MEMORY:
+		case BUF_BLOCK_REMOVE_HASH:
+			ut_error;
+			break;
+		}
+		const page_id_t id = b->id();
+		ut_ad(page_hash_get_low(id, id.fold()) == b);
+	}
+
+	ut_ad(UT_LIST_GET_LEN(flush_list) == n_flushing);
+
+	mysql_mutex_unlock(&flush_list_mutex);
+	/* The two size checks below are skipped while curr_size != old_size. */
+	if (curr_size == old_size
+	    && n_lru + n_free > curr_size + n_zip) {
+
+		ib::fatal() << "n_LRU " << n_lru << ", n_free " << n_free
+			<< ", pool " << curr_size
+			<< " zip " << n_zip << ". Aborting...";
+	}
+
+	ut_ad(UT_LIST_GET_LEN(LRU) >= n_lru);
+
+	if (curr_size == old_size
+	    && UT_LIST_GET_LEN(free) != n_free) {
+
+		ib::fatal() << "Free list len "
+			<< UT_LIST_GET_LEN(free)
+			<< ", free blocks " << n_free << ". Aborting...";
+	}
+
+	mysql_mutex_unlock(&mutex);
+	/* The remaining validators are invoked after releasing the mutex. */
+	ut_d(buf_LRU_validate());
+	ut_d(buf_flush_validate());
+}
+#endif /* UNIV_DEBUG */
+
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG
+/** Write information of the buf_pool to the error log. */
+void buf_pool_t::print()
+{
+	index_id_t*	index_ids;
+	ulint*		counts;
+	ulint		size;
+	ulint		i;
+	ulint		j;
+	index_id_t	id;
+	ulint		n_found;
+	chunk_t*	chunk;
+	dict_index_t*	index;
+
+	size = curr_size;
+	/* Two scratch arrays with curr_size entries each. */
+	index_ids = static_cast<index_id_t*>(
+		ut_malloc_nokey(size * sizeof *index_ids));
+
+	counts = static_cast<ulint*>(ut_malloc_nokey(sizeof(ulint) * size));
+
+	mysql_mutex_lock(&mutex);
+	mysql_mutex_lock(&flush_list_mutex);
+	/* One summary line, emitted while both mutexes are held. */
+	ib::info()
+		<< "[buffer pool: size=" << curr_size
+		<< ", database pages=" << UT_LIST_GET_LEN(LRU)
+		<< ", free pages=" << UT_LIST_GET_LEN(free)
+		<< ", modified database pages="
+		<< UT_LIST_GET_LEN(flush_list)
+		<< ", n pending decompressions=" << n_pend_unzip
+		<< ", n pending reads=" << n_pend_reads
+		<< ", n pending flush LRU=" << n_flush_LRU_
+		<< " list=" << n_flush_list_
+		<< ", pages made young=" << stat.n_pages_made_young
+		<< ", not young=" << stat.n_pages_not_made_young
+		<< ", pages read=" << stat.n_pages_read
+		<< ", created=" << stat.n_pages_created
+		<< ", written=" << stat.n_pages_written << "]";
+
+	mysql_mutex_unlock(&flush_list_mutex);
+
+	/* Count the number of blocks belonging to each index in the buffer */
+
+	n_found = 0;
+
+	chunk = chunks;
+
+	for (i = n_chunks; i--; chunk++) {
+		buf_block_t*	block		= chunk->blocks;
+		ulint		n_blocks	= chunk->size;
+
+		for (; n_blocks--; block++) {
+			const buf_frame_t* frame = block->frame;
+			/* Only frames that pass the index page check count. */
+			if (fil_page_index_page_check(frame)) {
+
+				id = btr_page_get_index_id(frame);
+
+				/* Look for the id in the index_ids array */
+				j = 0;
+
+				while (j < n_found) {
+
+					if (index_ids[j] == id) {
+						counts[j]++;
+
+						break;
+					}
+					j++;
+				}
+				/* Not seen before: append a new entry. */
+				if (j == n_found) {
+					n_found++;
+					index_ids[j] = id;
+					counts[j] = 1;
+				}
+			}
+		}
+	}
+
+	mysql_mutex_unlock(&mutex);
+	/* The per-index report is produced after releasing the mutex. */
+	for (i = 0; i < n_found; i++) {
+		index = dict_index_get_if_in_cache(index_ids[i]);
+
+		if (!index) {
+			ib::info() << "Block count for index "
+				<< index_ids[i] << " in buffer is about "
+				<< counts[i];
+		} else {
+			ib::info() << "Block count for index " << index_ids[i]
+				<< " in buffer is about " << counts[i]
+				<< ", index " << index->name
+				<< " of table " << index->table->name;
+		}
+	}
+
+	ut_free(index_ids);
+	ut_free(counts);
+	/* validate() acquires the mutex on its own. */
+	validate();
+}
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */
+
+#ifdef UNIV_DEBUG
+/** @return the number of buf_pool.LRU pages that are buffer-fixed or io-fixed */
+ulint buf_get_latched_pages_number()
+{
+  ulint n_latched= 0;
+  mysql_mutex_lock(&buf_pool.mutex);
+  buf_page_t *bpage= UT_LIST_GET_FIRST(buf_pool.LRU);
+  while (bpage)
+  {
+    if (bpage->in_file() &&
+        (bpage->buf_fix_count() || bpage->io_fix() != BUF_IO_NONE))
+      n_latched++;
+    bpage= UT_LIST_GET_NEXT(LRU, bpage);
+  }
+  mysql_mutex_unlock(&buf_pool.mutex);
+  return n_latched;
+}
+#endif /* UNIV_DEBUG */
+
+/** Collect buffer pool metadata and restart the averaging interval.
+@param[out]	pool_info	buffer pool metadata */
+void buf_stats_get_pool_info(buf_pool_info_t *pool_info)
+{
+	time_t			current_time;
+	double			time_elapsed;
+
+	mysql_mutex_lock(&buf_pool.mutex);
+
+	pool_info->pool_size = buf_pool.curr_size;
+
+	pool_info->lru_len = UT_LIST_GET_LEN(buf_pool.LRU);
+
+	pool_info->old_lru_len = buf_pool.LRU_old_len;
+
+	pool_info->free_list_len = UT_LIST_GET_LEN(buf_pool.free);
+
+	mysql_mutex_lock(&buf_pool.flush_list_mutex);
+	pool_info->flush_list_len = UT_LIST_GET_LEN(buf_pool.flush_list);
+	/* NOTE(review): same source as unzip_lru_len below; intended? */
+	pool_info->n_pend_unzip = UT_LIST_GET_LEN(buf_pool.unzip_LRU);
+	mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+	pool_info->n_pend_reads = buf_pool.n_pend_reads;
+
+	pool_info->n_pending_flush_lru = buf_pool.n_flush_LRU_;
+
+	pool_info->n_pending_flush_list = buf_pool.n_flush_list_;
+	/* The added 0.001 keeps time_elapsed nonzero for equal timestamps. */
+	current_time = time(NULL);
+	time_elapsed = 0.001 + difftime(current_time,
+					buf_pool.last_printout_time);
+
+	pool_info->n_pages_made_young = buf_pool.stat.n_pages_made_young;
+
+	pool_info->n_pages_not_made_young =
+		buf_pool.stat.n_pages_not_made_young;
+
+	pool_info->n_pages_read = buf_pool.stat.n_pages_read;
+
+	pool_info->n_pages_created = buf_pool.stat.n_pages_created;
+
+	pool_info->n_pages_written = buf_pool.stat.n_pages_written;
+
+	pool_info->n_page_gets = buf_pool.stat.n_page_gets;
+
+	pool_info->n_ra_pages_read_rnd = buf_pool.stat.n_ra_pages_read_rnd;
+	pool_info->n_ra_pages_read = buf_pool.stat.n_ra_pages_read;
+
+	pool_info->n_ra_pages_evicted = buf_pool.stat.n_ra_pages_evicted;
+	/* Each rate is (current - old_stat snapshot) / time_elapsed. */
+	pool_info->page_made_young_rate =
+	static_cast<double>(buf_pool.stat.n_pages_made_young
+			    - buf_pool.old_stat.n_pages_made_young)
+	/ time_elapsed;
+
+	pool_info->page_not_made_young_rate =
+	static_cast<double>(buf_pool.stat.n_pages_not_made_young
+			    - buf_pool.old_stat.n_pages_not_made_young)
+	/ time_elapsed;
+
+	pool_info->pages_read_rate =
+	static_cast<double>(buf_pool.stat.n_pages_read
+			    - buf_pool.old_stat.n_pages_read)
+	/ time_elapsed;
+
+	pool_info->pages_created_rate =
+	static_cast<double>(buf_pool.stat.n_pages_created
+			    - buf_pool.old_stat.n_pages_created)
+	/ time_elapsed;
+
+	pool_info->pages_written_rate =
+	static_cast<double>(buf_pool.stat.n_pages_written
+			    - buf_pool.old_stat.n_pages_written)
+	/ time_elapsed;
+
+	pool_info->n_page_get_delta = buf_pool.stat.n_page_gets
+				      - buf_pool.old_stat.n_page_gets;
+	/* The three deltas below are assigned only if n_page_get_delta != 0. */
+	if (pool_info->n_page_get_delta) {
+		pool_info->page_read_delta = buf_pool.stat.n_pages_read
+					     - buf_pool.old_stat.n_pages_read;
+
+		pool_info->young_making_delta =
+			buf_pool.stat.n_pages_made_young
+			- buf_pool.old_stat.n_pages_made_young;
+
+		pool_info->not_young_making_delta =
+			buf_pool.stat.n_pages_not_made_young
+			- buf_pool.old_stat.n_pages_not_made_young;
+	}
+	pool_info->pages_readahead_rnd_rate =
+	static_cast<double>(buf_pool.stat.n_ra_pages_read_rnd
+			    - buf_pool.old_stat.n_ra_pages_read_rnd)
+	/ time_elapsed;
+
+
+	pool_info->pages_readahead_rate =
+	static_cast<double>(buf_pool.stat.n_ra_pages_read
+			    - buf_pool.old_stat.n_ra_pages_read)
+	/ time_elapsed;
+
+	pool_info->pages_evicted_rate =
+	static_cast<double>(buf_pool.stat.n_ra_pages_evicted
+			    - buf_pool.old_stat.n_ra_pages_evicted)
+	/ time_elapsed;
+
+	pool_info->unzip_lru_len = UT_LIST_GET_LEN(buf_pool.unzip_LRU);
+
+	pool_info->io_sum = buf_LRU_stat_sum.io;
+
+	pool_info->io_cur = buf_LRU_stat_cur.io;
+
+	pool_info->unzip_sum = buf_LRU_stat_sum.unzip;
+
+	pool_info->unzip_cur = buf_LRU_stat_cur.unzip;
+	/* Snapshot stat into old_stat so the next call measures from here. */
+	buf_refresh_io_stats();
+	mysql_mutex_unlock(&buf_pool.mutex);
+}
+
+/*********************************************************************//**
+Prints the previously collected buffer pool statistics to a stream. */
+static
+void
+buf_print_io_instance(
+/*==================*/
+	buf_pool_info_t*pool_info,	/*!< in: buffer pool info */
+	FILE*		file)		/*!< in/out: buffer where to print */
+{
+	ut_ad(pool_info);
+	/* Sizes, list lengths, dirty percentage and pending I/O. */
+	fprintf(file,
+		"Buffer pool size   " ULINTPF "\n"
+		"Free buffers       " ULINTPF "\n"
+		"Database pages     " ULINTPF "\n"
+		"Old database pages " ULINTPF "\n"
+		"Modified db pages  " ULINTPF "\n"
+		"Percent of dirty pages(LRU & free pages): %.3f\n"
+		"Max dirty pages percent: %.3f\n"
+		"Pending reads " ULINTPF "\n"
+		"Pending writes: LRU " ULINTPF ", flush list " ULINTPF "\n",
+		pool_info->pool_size,
+		pool_info->free_list_len,
+		pool_info->lru_len,
+		pool_info->old_lru_len,
+		pool_info->flush_list_len,
+		static_cast<double>(pool_info->flush_list_len)
+		/ (static_cast<double>(pool_info->lru_len
+				       + pool_info->free_list_len) + 1.0)
+		* 100.0,
+		srv_max_buf_pool_modified_pct,
+		pool_info->n_pend_reads,
+		pool_info->n_pending_flush_lru,
+		pool_info->n_pending_flush_list);
+	/* Cumulative page counters followed by their per-second rates. */
+	fprintf(file,
+		"Pages made young " ULINTPF ", not young " ULINTPF "\n"
+		"%.2f youngs/s, %.2f non-youngs/s\n"
+		"Pages read " ULINTPF ", created " ULINTPF
+		", written " ULINTPF "\n"
+		"%.2f reads/s, %.2f creates/s, %.2f writes/s\n",
+		pool_info->n_pages_made_young,
+		pool_info->n_pages_not_made_young,
+		pool_info->page_made_young_rate,
+		pool_info->page_not_made_young_rate,
+		pool_info->n_pages_read,
+		pool_info->n_pages_created,
+		pool_info->n_pages_written,
+		pool_info->pages_read_rate,
+		pool_info->pages_created_rate,
+		pool_info->pages_written_rate);
+	/* hit_rate holds page reads per page get; 1 - hit_rate is printed. */
+	if (pool_info->n_page_get_delta) {
+		double hit_rate = static_cast<double>(
+			pool_info->page_read_delta)
+			/ static_cast<double>(pool_info->n_page_get_delta);
+		/* Cap the ratio so that 1 - hit_rate cannot go negative. */
+		if (hit_rate > 1) {
+			hit_rate = 1;
+		}
+
+		fprintf(file,
+			"Buffer pool hit rate " ULINTPF " / 1000,"
+			" young-making rate " ULINTPF " / 1000 not "
+			ULINTPF " / 1000\n",
+			ulint(1000 * (1 - hit_rate)),
+			ulint(1000
+			      * double(pool_info->young_making_delta)
+			      / double(pool_info->n_page_get_delta)),
+			ulint(1000 * double(pool_info->not_young_making_delta)
+			      / double(pool_info->n_page_get_delta)));
+	} else {
+		fputs("No buffer pool page gets since the last printout\n",
+		      file);
+	}
+
+	/* Statistics about read ahead algorithm */
+	fprintf(file, "Pages read ahead %.2f/s,"
+		" evicted without access %.2f/s,"
+		" Random read ahead %.2f/s\n",
+
+		pool_info->pages_readahead_rate,
+		pool_info->pages_evicted_rate,
+		pool_info->pages_readahead_rnd_rate);
+
+	/* Print some values to help us with visualizing what is
+	happening with LRU eviction. */
+	fprintf(file,
+		"LRU len: " ULINTPF ", unzip_LRU len: " ULINTPF "\n"
+		"I/O sum[" ULINTPF "]:cur[" ULINTPF "], "
+		"unzip sum[" ULINTPF "]:cur[" ULINTPF "]\n",
+		pool_info->lru_len, pool_info->unzip_lru_len,
+		pool_info->io_sum, pool_info->io_cur,
+		pool_info->unzip_sum, pool_info->unzip_cur);
+}
+
+/*********************************************************************//**
+Gathers the current buffer pool statistics and prints them to a stream. */
+void
+buf_print_io(
+/*=========*/
+	FILE*	file)	/*!< in/out: buffer where to print */
+{
+	buf_pool_info_t	info;
+
+	buf_stats_get_pool_info(&info);
+	buf_print_io_instance(&info, file);
+}
+
+/** Verify that the post-encryption checksum matches the calculated checksum.
+This function should be called only if tablespace contains crypt data metadata.
+@param[in]	page		page frame
+@param[in]	fsp_flags	tablespace flags
+@return true if the check selected by fsp_flags passes, false otherwise */
+bool buf_page_verify_crypt_checksum(const byte* page, ulint fsp_flags)
+{
+	/* With full_crc32 flags the result is the negation of
+	buf_page_is_corrupted(); otherwise the crypt checksum is verified. */
+	return fil_space_t::full_crc32(fsp_flags)
+		? !buf_page_is_corrupted(true, page, fsp_flags)
+		: fil_space_verify_crypt_checksum(
+			page, fil_space_t::zip_size(fsp_flags));
+}
+
+/** Print the given page_id_t object in the form
+"[page id: space=S, page number=N]".
+@param[in,out]	out	the output stream
+@param[in]	page_id	the page_id_t object to be printed
+@return the output stream */
+std::ostream& operator<<(std::ostream &out, const page_id_t page_id)
+{
+  return out << "[page id: space=" << page_id.space()
+             << ", page number=" << page_id.page_no() << "]";
+}
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/buf/buf0checksum.cc b/storage/innobase/buf/buf0checksum.cc
new file mode 100644
index 00000000..e98dc184
--- /dev/null
+++ b/storage/innobase/buf/buf0checksum.cc
@@ -0,0 +1,129 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0checksum.cc
+Buffer pool checksum functions, also linked from /extra/innochecksum.cc
+
+Created Aug 11, 2011 Vasil Dimov
+*******************************************************/
+
+#include "buf0checksum.h"
+#include "fil0fil.h"
+#include "ut0crc32.h"
+#include "ut0rnd.h"
+
+#ifndef UNIV_INNOCHECKSUM
+#include "srv0srv.h"
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** The current value of the innodb_checksum_algorithm setting */
+ulong	srv_checksum_algorithm;
+
+/** Calculate the CRC32 checksum of a page. The value is stored to the page
+when it is written to a file and also checked for a match when reading from
+the file. Note that we must be careful to calculate the same value on all
+architectures.
+@param[in]	page			buffer page (srv_page_size bytes)
+@return	CRC-32C */
+uint32_t buf_calc_page_crc32(const byte* page)
+{
+	/* Two byte ranges are checksummed separately and XORed together.
+	A checksum over the whole page, combined with the CRC-32 function
+	itself, would have been preferable; the existing scheme is kept
+	for compatibility with old data files. */
+	const auto header_crc = ut_crc32(
+		page + FIL_PAGE_OFFSET,
+		FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION - FIL_PAGE_OFFSET);
+	const auto body_crc = ut_crc32(
+		page + FIL_PAGE_DATA,
+		srv_page_size - (FIL_PAGE_DATA + FIL_PAGE_END_LSN_OLD_CHKSUM));
+	return header_crc ^ body_crc;
+}
+
+/** Calculate a checksum which is stored to the page when it is written
+to a file. Note that we must be careful to calculate the same value on
+32-bit and 64-bit architectures.
+@param[in]	page	file page (srv_page_size bytes)
+@return checksum */
+uint32_t
+buf_calc_page_new_checksum(const byte* page)
+{
+	/* The fields FIL_PAGE_FILE_FLUSH_LSN and, in versions <= 4.1.x,
+	FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID are written outside the buffer
+	pool to the first pages of data files, so they are left out of
+	the calculation. Also left out are FIL_PAGE_SPACE_OR_CHKSUM,
+	where this checksum is stored, and the last 8 bytes of the page,
+	which hold the old formula checksum. */
+
+	/* Fold of the header bytes that precede the skipped fields. */
+	const ulint header_fold = ut_fold_binary(
+		page + FIL_PAGE_OFFSET,
+		FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION - FIL_PAGE_OFFSET);
+
+	/* Fold of the page body up to the trailer. */
+	const ulint body_fold = ut_fold_binary(
+		page + FIL_PAGE_DATA,
+		srv_page_size - FIL_PAGE_DATA - FIL_PAGE_END_LSN_OLD_CHKSUM);
+	return static_cast<uint32_t>(header_fold + body_fold);
+}
+
+/** In MySQL before 4.0.14 or 4.1.1 there was an InnoDB bug that
+the checksum only looked at the first few bytes of the page.
+This calculates that old checksum.
+NOTE: the new formula checksum must already be stored in
+FIL_PAGE_SPACE_OR_CHKSUM, because that field is part of the input here!
+@param[in]	page	file page (srv_page_size bytes)
+@return checksum */
+uint32_t
+buf_calc_page_old_checksum(const byte* page)
+{
+	const auto fold = ut_fold_binary(
+		page, FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
+	return static_cast<uint32_t>(fold);
+}
+
+/** Map a checksum algorithm code to its printable name.
+@param[in]	algo	algorithm
+@return algorithm name */
+const char*
+buf_checksum_algorithm_name(srv_checksum_algorithm_t algo)
+{
+	switch (algo) {
+	case SRV_CHECKSUM_ALGORITHM_NONE:
+		return "none";
+	case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+		return "strict_none";
+	case SRV_CHECKSUM_ALGORITHM_INNODB:
+		return "innodb";
+	case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+		return "strict_innodb";
+	case SRV_CHECKSUM_ALGORITHM_CRC32:
+		return "crc32";
+	case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+		return "strict_crc32";
+	case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
+		return "full_crc32";
+	case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
+		return "strict_full_crc32";
+	}
+
+	ut_error;
+	return nullptr;
+}
diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc
new file mode 100644
index 00000000..52e947b7
--- /dev/null
+++ b/storage/innobase/buf/buf0dblwr.cc
@@ -0,0 +1,764 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0dblwr.cc
+Doublewrite buffer module
+
+Created 2011/12/19
+*******************************************************/
+
+#include "buf0dblwr.h"
+#include "buf0buf.h"
+#include "buf0checksum.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#include "sync0sync.h"
+#include "page0zip.h"
+#include "trx0sys.h"
+#include "fil0crypt.h"
+#include "fil0pagecompress.h"
+
+using st_::span;
+
+/** The doublewrite buffer (global instance of buf_dblwr_t) */
+buf_dblwr_t buf_dblwr;
+
+/** @return the TRX_SYS page, fetched with RW_X_LATCH in mtr */
+inline buf_block_t *buf_dblwr_trx_sys_get(mtr_t *mtr)
+{
+  const page_id_t trx_sys_id(TRX_SYS_SPACE, TRX_SYS_PAGE_NO);
+  buf_block_t *trx_sys= buf_page_get(trx_sys_id, 0, RW_X_LATCH, mtr);
+  buf_block_dbg_add_level(trx_sys, SYNC_NO_ORDER_CHECK);
+  return trx_sys;
+}
+
+/** Initialize the doublewrite buffer data structure.
+@param header   doublewrite page header in the TRX_SYS page */
+inline void buf_dblwr_t::init(const byte *header)
+{
+  ut_ad(!active_slot->first_free);
+  ut_ad(!active_slot->reserved);
+  ut_ad(!batch_running);
+
+  mysql_mutex_init(buf_dblwr_mutex_key, &mutex, nullptr);
+  pthread_cond_init(&cond, nullptr);
+  block1= page_id_t(0, mach_read_from_4(header + TRX_SYS_DOUBLEWRITE_BLOCK1));
+  block2= page_id_t(0, mach_read_from_4(header + TRX_SYS_DOUBLEWRITE_BLOCK2));
+  const uint32_t n_elements= 2 * block_size();
+  for (int n= 0; n < 2; n++)
+  {
+    auto &s= slots[n];
+    s.write_buf= static_cast<byte*>
+      (aligned_malloc(n_elements << srv_page_size_shift, srv_page_size));
+    s.buf_block_arr= static_cast<element*>
+      (ut_zalloc_nokey(n_elements * sizeof(element)));
+  }
+  active_slot= &slots[0];
+}
+
+/** Create or restore the doublewrite buffer in the TRX_SYS page.
+@return whether the operation succeeded */
+bool buf_dblwr_t::create()
+{
+  if (is_initialised())
+    return true;
+
+  mtr_t mtr;
+  const ulint size= block_size();
+  /* The creation path at the end of this function jumps back here. */
+start_again:
+  mtr.start();
+
+  buf_block_t *trx_sys_block= buf_dblwr_trx_sys_get(&mtr);
+
+  if (mach_read_from_4(TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC +
+                       trx_sys_block->frame) == TRX_SYS_DOUBLEWRITE_MAGIC_N)
+  {
+    /* The doublewrite buffer has already been created: just read in
+    some numbers */
+    init(TRX_SYS_DOUBLEWRITE + trx_sys_block->frame);
+    mtr.commit();
+    return true;
+  }
+  /* The first system tablespace file must hold at least 3 * size pages. */
+  if (UT_LIST_GET_FIRST(fil_system.sys_space->chain)->size < 3 * size)
+  {
+too_small:
+    ib::error() << "Cannot create doublewrite buffer: "
+                   "the first file in innodb_data_file_path must be at least "
+                << (3 * (size >> (20U - srv_page_size_shift))) << "M.";
+    mtr.commit();
+    return false;
+  }
+  else
+  {
+    buf_block_t *b= fseg_create(fil_system.sys_space,
+                                TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG,
+                                &mtr, false, trx_sys_block);
+    if (!b)
+      goto too_small;
+    ib::info() << "Doublewrite buffer not found: creating new";
+
+    /* FIXME: After this point, the doublewrite buffer creation
+    is not atomic. The doublewrite buffer should not exist in
+    the InnoDB system tablespace file in the first place.
+    It could be located in separate optional file(s) in a
+    user-specified location. */
+
+    /* fseg_create acquires a second latch on the page,
+    therefore we must declare it: */
+    buf_block_dbg_add_level(b, SYNC_NO_ORDER_CHECK);
+  }
+  /* Allocate 2 * size + FSP_EXTENT_SIZE / 2 pages from the new segment. */
+  byte *fseg_header= TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG +
+    trx_sys_block->frame;
+  for (uint32_t prev_page_no= 0, i= 0, extent_size= FSP_EXTENT_SIZE;
+       i < 2 * size + extent_size / 2; i++)
+  {
+    buf_block_t *new_block= fseg_alloc_free_page(fseg_header, prev_page_no + 1,
+                                                 FSP_UP, &mtr);
+    if (!new_block)
+    {
+      ib::error() << "Cannot create doublewrite buffer: "
+                     " you must increase your tablespace size."
+                     " Cannot continue operation.";
+      /* This may essentially corrupt the doublewrite
+      buffer. However, usually the doublewrite buffer
+      is created at database initialization, and it
+      should not matter (just remove all newly created
+      InnoDB files and restart). */
+      mtr.commit();
+      return false;
+    }
+
+    /* We read the allocated pages to the buffer pool; when they are
+    written to disk in a flush, the space id and page number fields
+    are also written to the pages. When we at database startup read
+    pages from the doublewrite buffer, we know that if the space id
+    and page number in them are the same as the page position in the
+    tablespace, then the page has not been written to in
+    doublewrite. */
+
+    ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1);
+    const page_id_t id= new_block->page.id();
+    /* We only do this in the debug build, to ensure that the check in
+    buf_flush_init_for_writing() will see a valid page type. The
+    flushes of new_block are actually unnecessary here.  */
+    ut_d(mtr.write<2>(*new_block, FIL_PAGE_TYPE + new_block->frame,
+                      FIL_PAGE_TYPE_SYS));
+    /* Record where each of the two blocks starts, with a repeat copy. */
+    if (i == size / 2)
+    {
+      ut_a(id.page_no() == size);
+      mtr.write<4>(*trx_sys_block,
+                   TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_BLOCK1 +
+                   trx_sys_block->frame, id.page_no());
+      mtr.write<4>(*trx_sys_block,
+                   TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_REPEAT +
+                   TRX_SYS_DOUBLEWRITE_BLOCK1 + trx_sys_block->frame,
+                   id.page_no());
+    }
+    else if (i == size / 2 + size)
+    {
+      ut_a(id.page_no() == 2 * size);
+      mtr.write<4>(*trx_sys_block,
+                   TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_BLOCK2 +
+                   trx_sys_block->frame, id.page_no());
+      mtr.write<4>(*trx_sys_block,
+                   TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_REPEAT +
+                   TRX_SYS_DOUBLEWRITE_BLOCK2 + trx_sys_block->frame,
+                   id.page_no());
+    }
+    else if (i > size / 2)
+      ut_a(id.page_no() == prev_page_no + 1);
+
+    if (((i + 1) & 15) == 0) {
+      /* rw_locks can only be recursively x-locked 2048 times. (on 32
+      bit platforms, (lint) 0 - (X_LOCK_DECR * 2049) is no longer a
+      negative number, and thus lock_word becomes like a shared lock).
+      For 4k page size this loop will lock the fseg header too many
+      times. Since this code is not done while any other threads are
+      active, restart the MTR occasionally. */
+      mtr.commit();
+      mtr.start();
+      trx_sys_block= buf_dblwr_trx_sys_get(&mtr);
+      fseg_header= TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG +
+        trx_sys_block->frame;
+    }
+
+    prev_page_no= id.page_no();
+  }
+  /* Write the magic number twice and the space-id-stored marker once. */
+  mtr.write<4>(*trx_sys_block,
+               TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC +
+               trx_sys_block->frame, TRX_SYS_DOUBLEWRITE_MAGIC_N);
+  mtr.write<4>(*trx_sys_block,
+               TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC +
+               TRX_SYS_DOUBLEWRITE_REPEAT + trx_sys_block->frame,
+               TRX_SYS_DOUBLEWRITE_MAGIC_N);
+
+  mtr.write<4>(*trx_sys_block,
+               TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED +
+               trx_sys_block->frame, TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N);
+  mtr.commit();
+
+  /* Flush the modified pages to disk and make a checkpoint */
+  log_make_checkpoint();
+
+  /* Remove doublewrite pages from LRU */
+  buf_pool_invalidate();
+
+  ib::info() << "Doublewrite buffer created";
+  goto start_again;
+}
+
+/** Initialize the doublewrite buffer memory structure on recovery.
+If we are upgrading from a version before MySQL 4.1, then this
+function performs the necessary update operations to support
+innodb_file_per_table. If we are in a crash recovery, this function
+loads the pages from double write buffer into memory.
+@param file File handle
+@param path Path name of file
+@return DB_SUCCESS or error code */
+dberr_t buf_dblwr_t::init_or_load_pages(pfs_os_file_t file, const char *path)
+{
+  ut_ad(this == &buf_dblwr);
+  const uint32_t size= block_size();
+
+  /* We do the file i/o past the buffer pool */
+  byte *read_buf= static_cast<byte*>(aligned_malloc(srv_page_size,
+                                                    srv_page_size));
+  /* Read the TRX_SYS header to check if we are using the doublewrite buffer */
+  dberr_t err= os_file_read(IORequestRead, file, read_buf,
+                            TRX_SYS_PAGE_NO << srv_page_size_shift,
+                            srv_page_size);
+
+  if (err != DB_SUCCESS)
+  {
+    ib::error() << "Failed to read the system tablespace header page";
+func_exit:
+    aligned_free(read_buf);
+    return err;
+  }
+
+  /* TRX_SYS_PAGE_NO is not encrypted see fil_crypt_rotate_page() */
+  if (mach_read_from_4(TRX_SYS_DOUBLEWRITE_MAGIC + TRX_SYS_DOUBLEWRITE +
+                       read_buf) != TRX_SYS_DOUBLEWRITE_MAGIC_N)
+  {
+    /* There is no doublewrite buffer initialized in the TRX_SYS page.
+    This should normally not be possible; the doublewrite buffer should
+    be initialized when creating the database. */
+    err= DB_SUCCESS;
+    goto func_exit;
+  }
+  /* The magic number matched: set up the structure from the header. */
+  init(TRX_SYS_DOUBLEWRITE + read_buf);
+
+  const bool upgrade_to_innodb_file_per_table=
+    mach_read_from_4(TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED +
+                     TRX_SYS_DOUBLEWRITE + read_buf) !=
+    TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N;
+  /* Load block1 into the first half of write_buf, block2 into the second. */
+  auto write_buf= active_slot->write_buf;
+  /* Read the pages from the doublewrite buffer to memory */
+  err= os_file_read(IORequestRead, file, write_buf,
+                    block1.page_no() << srv_page_size_shift,
+                    size << srv_page_size_shift);
+
+  if (err != DB_SUCCESS)
+  {
+    ib::error() << "Failed to read the first double write buffer extent";
+    goto func_exit;
+  }
+
+  err= os_file_read(IORequestRead, file,
+                    write_buf + (size << srv_page_size_shift),
+                    block2.page_no() << srv_page_size_shift,
+                    size << srv_page_size_shift);
+  if (err != DB_SUCCESS)
+  {
+    ib::error() << "Failed to read the second double write buffer extent";
+    goto func_exit;
+  }
+
+  byte *page= write_buf;
+
+  if (UNIV_UNLIKELY(upgrade_to_innodb_file_per_table))
+  {
+    ib::info() << "Resetting space id's in the doublewrite buffer";
+
+    for (ulint i= 0; i < size * 2; i++, page += srv_page_size)
+    {
+      memset(page + FIL_PAGE_SPACE_ID, 0, 4);
+      /* For innodb_checksum_algorithm=innodb, we do not need to
+      calculate new checksums for the pages because the field
+      .._SPACE_ID does not affect them. Write the page back to where
+      we read it from. */
+      const ulint source_page_no= i < size
+        ? block1.page_no() + i
+        : block2.page_no() + i - size;
+      err= os_file_write(IORequestWrite, path, file, page,
+                         source_page_no << srv_page_size_shift, srv_page_size);
+      if (err != DB_SUCCESS)
+      {
+        ib::error() << "Failed to upgrade the double write buffer";
+        goto func_exit;
+      }
+    }
+    os_file_flush(file);
+  }
+  else
+    for (ulint i= 0; i < size * 2; i++, page += srv_page_size)
+      if (mach_read_from_8(my_assume_aligned<8>(page + FIL_PAGE_LSN)))
+        /* Each valid page header must contain a nonzero FIL_PAGE_LSN field. */
+        recv_sys.dblwr.add(page);
+  /* Reached on success from either branch above. */
+  err= DB_SUCCESS;
+  goto func_exit;
+}
+
+/** Process and remove the double write buffer pages for all tablespaces.
+
+Every page copy collected in recv_sys.dblwr.pages is examined. Copies
+with page number 0, copies older than recv_sys.parse_start_lsn, copies
+newer than recv_sys.scanned_lsn, and copies of pages whose tablespace is
+missing or too small are skipped. For the remaining ones, the page is read
+from its data file; if it is all zeroes or fails
+recv_sys.dblwr.validate_page(), the page returned by
+recv_sys.dblwr.find_page() (if any) is written over it.
+Finally, the collected copies are discarded and fil_flush_file_spaces()
+is invoked. */
+void buf_dblwr_t::recover()
+{
+  ut_ad(recv_sys.parse_start_lsn);
+  if (!is_initialised())
+    return;
+
+  /* Position of the current copy in recv_sys.dblwr.pages; it is only
+  used for the "beyond the end of tablespace" warning below. */
+  uint32_t page_no_dblwr= 0;
+  /* Scratch memory of 3 pages: the first page (read_buf) receives the
+  page as it is stored in the data file, and the rest (buf) is passed
+  to validate_page() and find_page() as a work area. */
+  byte *read_buf= static_cast<byte*>(aligned_malloc(3 * srv_page_size,
+                                                    srv_page_size));
+  byte *const buf= read_buf + srv_page_size;
+
+  for (recv_dblwr_t::list::iterator i= recv_sys.dblwr.pages.begin();
+       i != recv_sys.dblwr.pages.end(); ++i, ++page_no_dblwr)
+  {
+    byte *page= *i;
+    const uint32_t page_no= page_get_page_no(page);
+    if (!page_no) /* recovered via Datafile::restore_from_doublewrite() */
+      continue;
+
+    const lsn_t lsn= mach_read_from_8(page + FIL_PAGE_LSN);
+    if (recv_sys.parse_start_lsn > lsn)
+      /* Pages written before the checkpoint are not useful for recovery. */
+      continue;
+    const ulint space_id= page_get_space_id(page);
+    const page_id_t page_id(space_id, page_no);
+
+    if (recv_sys.scanned_lsn < lsn)
+    {
+      ib::info() << "Ignoring a doublewrite copy of page " << page_id
+                 << " with future log sequence number " << lsn;
+      continue;
+    }
+
+    /* From here on, every path that leaves this iteration must go
+    through next_page so that space->release() is invoked. */
+    fil_space_t *space= fil_space_t::get(space_id);
+
+    if (!space)
+      /* The tablespace that this page once belonged to does not exist */
+      continue;
+
+    if (UNIV_UNLIKELY(page_no >= space->get_size()))
+    {
+      /* Do not report the warning for undo tablespaces, because they
+      can be truncated in place. */
+      if (!srv_is_undo_tablespace(space_id))
+        ib::warn() << "A copy of page " << page_no
+                   << " in the doublewrite buffer slot " << page_no_dblwr
+                   << " is beyond the end of tablespace " << space->name
+                   << " (" << space->size << " pages)";
+      /* Common exit of an iteration that holds a tablespace reference;
+      the code further below jumps into this block with goto. */
+next_page:
+      space->release();
+      continue;
+    }
+
+    const ulint physical_size= space->physical_size();
+    ut_ad(!buf_is_zeroes(span<const byte>(page, physical_size)));
+
+    /* We want to ensure that for partial reads the unread portion of
+    the page is NUL. */
+    memset(read_buf, 0x0, physical_size);
+
+    /* Read in the actual page from the file */
+    fil_io_t fio= space->io(IORequest(IORequest::DBLWR_RECOVER),
+                            os_offset_t{page_no} * physical_size,
+                            physical_size, read_buf);
+
+    /* A failed read is only reported; whatever ended up in read_buf
+    (possibly still all zeroes) is examined below. */
+    if (UNIV_UNLIKELY(fio.err != DB_SUCCESS))
+       ib::warn() << "Double write buffer recovery: " << page_id
+                  << " (tablespace '" << space->name
+                  << "') read failed with error: " << fio.err;
+
+    if (buf_is_zeroes(span<const byte>(read_buf, physical_size)))
+    {
+      /* We will check if the copy in the doublewrite buffer is
+      valid. If not, we will ignore this page (there should be redo
+      log records to initialize it). */
+    }
+    else if (recv_sys.dblwr.validate_page(page_id, read_buf, space, buf))
+      /* The page in the data file is fine; nothing to restore. */
+      goto next_page;
+    else
+      /* We intentionally skip this message for all-zero pages. */
+      ib::info() << "Trying to recover page " << page_id
+                 << " from the doublewrite buffer.";
+
+    /* Look up the copy to restore; a null result means that there is
+    nothing usable for this page. */
+    page= recv_sys.dblwr.find_page(page_id, space, buf);
+
+    if (!page)
+      goto next_page;
+
+    /* Write the good page from the doublewrite buffer to the intended
+    position. */
+    /* NOTE(review): the extra reference is presumably consumed by the
+    write request, leaving one for next_page to release - confirm
+    against fil_space_t::io(). */
+    space->reacquire();
+    fio= space->io(IORequestWrite,
+                   os_offset_t{page_id.page_no()} * physical_size,
+                   physical_size, page);
+
+    /* A failed write is not reported here. */
+    if (fio.err == DB_SUCCESS)
+      ib::info() << "Recovered page " << page_id << " to '" << fio.node->name
+                 << "' from the doublewrite buffer.";
+    goto next_page;
+  }
+
+  recv_sys.dblwr.pages.clear();
+  fil_flush_file_spaces();
+  aligned_free(read_buf);
+}
+
+/** Release all resources of the doublewrite buffer and return the object
+to its zero-filled state. Does nothing if the buffer was never
+initialised. */
+void buf_dblwr_t::close()
+{
+  if (is_initialised())
+  {
+    /* Nothing may be buffered or in flight when we shut down. */
+    ut_ad(!active_slot->reserved);
+    ut_ad(!active_slot->first_free);
+    ut_ad(!batch_running);
+
+    pthread_cond_destroy(&cond);
+
+    /* Both slots own a page buffer and an array of write requests. */
+    for (slot *s= &slots[0]; s <= &slots[1]; s++)
+    {
+      aligned_free(s->write_buf);
+      ut_free(s->buf_block_arr);
+    }
+
+    mysql_mutex_destroy(&mutex);
+
+    /* Wipe everything, then restore the one non-zero invariant. */
+    memset((void*) this, 0, sizeof *this);
+    active_slot= &slots[0];
+  }
+}
+
+/** Note that one data page write of the batch being flushed has
+completed. When the last such write completes, the data files are
+synced, the flushed slot is made reusable and waiters are woken up. */
+void buf_dblwr_t::write_completed()
+{
+  ut_ad(this == &buf_dblwr);
+  ut_ad(srv_use_doublewrite_buf);
+  ut_ad(is_initialised());
+  ut_ad(!srv_read_only_mode);
+
+  mysql_mutex_lock(&mutex);
+
+  ut_ad(batch_running);
+  /* The slot being flushed is the one that is not active. */
+  slot *const flushed= active_slot != &slots[0] ? &slots[0] : &slots[1];
+  ut_ad(flushed->reserved);
+  ut_ad(flushed->reserved <= flushed->first_free);
+
+  if (--flushed->reserved)
+  {
+    /* Other page writes of this batch are still pending. */
+    mysql_mutex_unlock(&mutex);
+    return;
+  }
+
+  /* This was the last write of the batch. Sync the data files to disk
+  without holding the mutex. */
+  mysql_mutex_unlock(&mutex);
+  fil_flush_file_spaces();
+  mysql_mutex_lock(&mutex);
+
+  /* The doublewrite memory buffer of this slot can be reused now. */
+  flushed->first_free= 0;
+  batch_running= false;
+  pthread_cond_broadcast(&cond);
+
+  mysql_mutex_unlock(&mutex);
+}
+
+#ifdef UNIV_DEBUG
+/** Assert that the LSN stored in the page trailer agrees with the last
+4 bytes of the 8-byte FIL_PAGE_LSN field in the page header.
+@param[in] page  page to check
+@param[in] s     tablespace that the page belongs to */
+static void buf_dblwr_check_page_lsn(const page_t* page, const fil_space_t& s)
+{
+  static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment");
+  static_assert(FIL_PAGE_LSN % 4 == 0, "alignment");
+
+  /* page_compressed and encrypted pages are exempt from the check. */
+  if (!s.is_compressed() && !buf_page_get_key_version(page, s.flags))
+  {
+    /* Distance of the trailer LSN from the end of the page; it depends
+    on whether the tablespace uses the full_crc32 format. */
+    const auto trailer_offset= s.full_crc32()
+      ? FIL_PAGE_FCRC32_END_LSN
+      : FIL_PAGE_END_LSN_OLD_CHKSUM - 4;
+    const byte *const header_lsn= page + FIL_PAGE_LSN + 4;
+    const byte *const trailer_lsn= page + srv_page_size - trailer_offset;
+    ut_ad(!memcmp_aligned<4>(header_lsn, trailer_lsn, 4));
+  }
+}
+
+/** Check the LSN values on a page image, looking up the tablespace from
+the buffer page descriptor. Nothing is checked if the tablespace cannot
+be acquired.
+@param[in] b     buffer page descriptor, identifies the tablespace
+@param[in] page  page image to check */
+static void buf_dblwr_check_page_lsn(const buf_page_t &b, const byte *page)
+{
+  fil_space_t *const space= fil_space_t::get(b.id().space());
+  if (!space)
+    return;
+
+  buf_dblwr_check_page_lsn(page, *space);
+  space->release();
+}
+
+/** Run a simple consistency check on an index page that is about to be
+written, and intentionally crash the server if the check fails.
+Pages of any other type are not checked.
+@param[in] bpage  buffer page in state BUF_BLOCK_FILE_PAGE */
+static void buf_dblwr_check_block(const buf_page_t *bpage)
+{
+  ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE);
+  const page_t *const frame=
+    reinterpret_cast<const buf_block_t*>(bpage)->frame;
+
+  switch (fil_page_get_type(frame)) {
+  case FIL_PAGE_INDEX:
+  case FIL_PAGE_TYPE_INSTANT:
+  case FIL_PAGE_RTREE:
+    break;
+  default:
+    /* Only index pages are validated. */
+    return;
+  }
+
+  if (page_is_comp(frame)
+      ? page_simple_validate_new(frame)
+      : page_simple_validate_old(frame))
+    return;
+
+  /* It is possible that this is not an index page at all and merely
+  carries a wrong FIL_PAGE_TYPE. However, such pages should never be
+  modified without the page type also being adjusted, during page
+  allocation or by buf_flush_init_for_writing() or
+  fil_block_reset_type(). */
+  buf_page_print(frame);
+
+  ib::fatal() << "Apparent corruption of an index page " << bpage->id()
+              << " to be written to data file. We intentionally crash"
+              " the server to prevent corrupt data from ending up in"
+              " data files.";
+}
+#endif /* UNIV_DEBUG */
+
+/** Start writing the pages buffered in the active slot to the
+doublewrite blocks in the system tablespace.
+The caller must hold mutex. If nothing is buffered, the function returns
+false and mutex remains held. Otherwise, it waits until no other batch is
+running, switches the active slot, releases mutex and submits the
+write(s) with os_aio().
+@param size   block_size(), the number of pages in a doublewrite block
+@return whether a batch was submitted and mutex was released */
+bool buf_dblwr_t::flush_buffered_writes(const ulint size)
+{
+  mysql_mutex_assert_owner(&mutex);
+  ut_ad(size == block_size());
+
+  /* Wait for a running batch to finish. The emptiness check is repeated
+  after each wakeup, because the wait releases the mutex and the active
+  slot may have been flushed by someone else in the meantime. */
+  for (;;)
+  {
+    if (!active_slot->first_free)
+      return false;
+    if (!batch_running)
+      break;
+    my_cond_wait(&cond, &mutex.m_mutex);
+  }
+
+  ut_ad(active_slot->reserved == active_slot->first_free);
+  ut_ad(!flushing_buffered_writes);
+
+  /* Disallow anyone else to start another batch of flushing. */
+  slot *flush_slot= active_slot;
+  /* Switch the active slot */
+  active_slot= active_slot == &slots[0] ? &slots[1] : &slots[0];
+  ut_a(active_slot->first_free == 0);
+  batch_running= true;
+  const ulint old_first_free= flush_slot->first_free;
+  auto write_buf= flush_slot->write_buf;
+  /* Two separate writes are needed if block2 does not start right after
+  block1 and more pages are buffered than fit in one block. */
+  const bool multi_batch= block1 + static_cast<uint32_t>(size) != block2 &&
+    old_first_free > size;
+  /* Number of os_aio() writes submitted below; it is counted down in
+  flush_buffered_writes_completed(). */
+  flushing_buffered_writes= 1 + multi_batch;
+  pages_submitted+= old_first_free;
+  /* Now safe to release the mutex. */
+  mysql_mutex_unlock(&mutex);
+#ifdef UNIV_DEBUG
+  /* len2 is the byte offset in write_buf of the copy of page i. */
+  for (ulint len2= 0, i= 0; i < old_first_free; len2 += srv_page_size, i++)
+  {
+    buf_page_t *bpage= flush_slot->buf_block_arr[i].request.bpage;
+
+    if (bpage->zip.data)
+      /* No simple validate for ROW_FORMAT=COMPRESSED pages exists. */
+      continue;
+
+    /* Check that the actual page in the buffer pool is not corrupt
+    and the LSN values are sane. */
+    buf_dblwr_check_block(bpage);
+    ut_d(buf_dblwr_check_page_lsn(*bpage, write_buf + len2));
+  }
+#endif /* UNIV_DEBUG */
+  /* The request carries no buffer page; it targets the first file of
+  the system tablespace. */
+  const IORequest request(nullptr, fil_system.sys_space->chain.start,
+                          IORequest::DBLWR_BATCH);
+  ut_a(fil_system.sys_space->acquire());
+  if (multi_batch)
+  {
+    /* NOTE(review): presumably each submitted write holds one reference
+    on the tablespace, hence a second one is taken - confirm. */
+    fil_system.sys_space->reacquire();
+    /* The first "size" pages go to block1, the remainder to block2. */
+    os_aio(request, write_buf,
+           os_offset_t{block1.page_no()} << srv_page_size_shift,
+           size << srv_page_size_shift);
+    os_aio(request, write_buf + (size << srv_page_size_shift),
+           os_offset_t{block2.page_no()} << srv_page_size_shift,
+           (old_first_free - size) << srv_page_size_shift);
+  }
+  else
+    /* A single write starting at block1 covers all buffered pages. */
+    os_aio(request, write_buf,
+           os_offset_t{block1.page_no()} << srv_page_size_shift,
+           old_first_free << srv_page_size_shift);
+  return true;
+}
+
+/** Handle the completion of a write that was submitted by
+flush_buffered_writes(ulint).
+Once all outstanding writes of the batch have completed, the system
+tablespace is flushed and the buffered pages are written to their
+data files.
+@param request   the completed IORequest::DBLWR_BATCH request */
+void buf_dblwr_t::flush_buffered_writes_completed(const IORequest &request)
+{
+  ut_ad(this == &buf_dblwr);
+  ut_ad(srv_use_doublewrite_buf);
+  ut_ad(is_initialised());
+  ut_ad(!srv_read_only_mode);
+  ut_ad(!request.bpage);
+  ut_ad(request.node == fil_system.sys_space->chain.start);
+  ut_ad(request.type == IORequest::DBLWR_BATCH);
+  mysql_mutex_lock(&mutex);
+  ut_ad(batch_running);
+  ut_ad(flushing_buffered_writes);
+  ut_ad(flushing_buffered_writes <= 2);
+  writes_completed++;
+  /* A batch may consist of two writes; only the completion of the last
+  one proceeds to writing the data pages. */
+  if (UNIV_UNLIKELY(--flushing_buffered_writes))
+  {
+    mysql_mutex_unlock(&mutex);
+    return;
+  }
+
+  /* The slot being flushed is the one that is not active. */
+  slot *const flush_slot= active_slot == &slots[0] ? &slots[1] : &slots[0];
+  ut_ad(flush_slot->reserved == flush_slot->first_free);
+  /* increment the doublewrite flushed pages counter */
+  pages_written+= flush_slot->first_free;
+  mysql_mutex_unlock(&mutex);
+
+  /* Now flush the doublewrite buffer data to disk */
+  fil_system.sys_space->flush<false>();
+
+  /* The writes have been flushed to disk now and in recovery we will
+  find them in the doublewrite buffer blocks. Next, write the data pages. */
+  for (ulint i= 0, first_free= flush_slot->first_free; i < first_free; i++)
+  {
+    /* e is a copy of the entry that add_to_batch() stored for page i. */
+    auto e= flush_slot->buf_block_arr[i];
+    buf_page_t* bpage= e.request.bpage;
+    ut_ad(bpage->in_file());
+
+    /* We request frame here to get correct buffer in case of
+    encryption and/or page compression */
+    void *frame= buf_page_get_frame(bpage);
+
+    auto e_size= e.size;
+
+    if (UNIV_LIKELY_NULL(bpage->zip.data))
+    {
+      /* For a page with a compressed frame, write zip_size() bytes. */
+      e_size= bpage->zip_size();
+      ut_ad(e_size);
+    }
+    else
+    {
+      ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE);
+      ut_ad(!bpage->zip_size());
+      ut_d(buf_dblwr_check_page_lsn(*bpage, static_cast<const byte*>(frame)));
+    }
+
+    const lsn_t lsn= mach_read_from_8(my_assume_aligned<8>
+                                      (FIL_PAGE_LSN +
+                                       static_cast<const byte*>(frame)));
+    ut_ad(lsn);
+    ut_ad(lsn >= bpage->oldest_modification());
+    /* Write the log up to the page LSN before submitting the page
+    write itself. */
+    log_write_up_to(lsn, true);
+    e.request.node->space->io(e.request, bpage->physical_offset(), e_size,
+                              frame, bpage);
+  }
+}
+
+/** Flush possible buffered writes to persistent storage.
+This must be called after a batch of writes has been posted, and also
+before possibly waiting for a page latch; otherwise threads can
+deadlock. When the doublewrite buffer is not in use, the data files
+are flushed directly instead. */
+void buf_dblwr_t::flush_buffered_writes()
+{
+  if (is_initialised() && srv_use_doublewrite_buf)
+  {
+    ut_ad(!srv_read_only_mode);
+    const ulint n_pages= block_size();
+
+    mysql_mutex_lock(&mutex);
+    /* The callee releases the mutex only if it submitted a batch. */
+    const bool mutex_released= flush_buffered_writes(n_pages);
+    if (!mutex_released)
+      mysql_mutex_unlock(&mutex);
+    return;
+  }
+
+  fil_flush_file_spaces();
+}
+
+/** Schedule a page write. If the doublewrite memory buffer is full,
+flush_buffered_writes() will be invoked to make space.
+The page contents are copied into the write buffer of the active slot
+and the request is remembered, so that the data file write can be
+submitted after the doublewrite batch has been written.
+@param request    asynchronous write request
+@param size       payload size in bytes */
+void buf_dblwr_t::add_to_batch(const IORequest &request, size_t size)
+{
+  ut_ad(request.is_async());
+  ut_ad(request.is_write());
+  ut_ad(request.bpage);
+  ut_ad(request.bpage->in_file());
+  ut_ad(request.node);
+  ut_ad(request.node->space->id == request.bpage->id().space());
+  ut_ad(request.node->space->referenced());
+  ut_ad(!srv_read_only_mode);
+
+  /* Capacity of one slot in pages: two doublewrite blocks. */
+  const ulint buf_size= 2 * block_size();
+
+  mysql_mutex_lock(&mutex);
+
+  /* Make room in the active slot. flush_buffered_writes(ulint) returns
+  true after releasing the mutex, in which case we must reacquire it
+  and check again. */
+  for (;;)
+  {
+    ut_ad(active_slot->first_free <= buf_size);
+    if (active_slot->first_free != buf_size)
+      break;
+
+    if (flush_buffered_writes(buf_size / 2))
+      mysql_mutex_lock(&mutex);
+  }
+
+  /* Each buffered page occupies a full srv_page_size bytes. */
+  byte *p= active_slot->write_buf + srv_page_size * active_slot->first_free;
+
+  /* We request frame here to get correct buffer in case of
+  encryption and/or page compression */
+  void *frame= buf_page_get_frame(request.bpage);
+
+  /* "frame" is at least 1024-byte aligned for ROW_FORMAT=COMPRESSED pages,
+  and at least srv_page_size (4096-byte) for everything else. */
+  memcpy_aligned<UNIV_ZIP_SIZE_MIN>(p, frame, size);
+  /* fil_page_compress() for page_compressed guarantees 256-byte alignment */
+  /* Zero-fill the remainder of the srv_page_size bytes after the payload. */
+  memset_aligned<256>(p + size, 0, srv_page_size - size);
+  /* FIXME: Inform the compiler that "size" and "srv_page_size - size"
+  are integer multiples of 256, so the above can translate into simple
+  SIMD instructions. Currently, we make no such assumptions about the
+  non-pointer parameters that are passed to the _aligned templates. */
+  ut_ad(!request.bpage->zip_size() || request.bpage->zip_size() == size);
+  ut_ad(active_slot->reserved == active_slot->first_free);
+  ut_ad(active_slot->reserved < buf_size);
+  /* Remember the request and its size in the array entry that
+  corresponds to the page copy, and advance first_free. */
+  new (active_slot->buf_block_arr + active_slot->first_free++)
+    element{request, size};
+  active_slot->reserved= active_slot->first_free;
+
+  /* If this page filled the slot, start flushing right away. The mutex
+  is released here unless flush_buffered_writes(ulint) already did so. */
+  if (active_slot->first_free != buf_size ||
+      !flush_buffered_writes(buf_size / 2))
+    mysql_mutex_unlock(&mutex);
+}
diff --git a/storage/innobase/buf/buf0dump.cc b/storage/innobase/buf/buf0dump.cc
new file mode 100644
index 00000000..c6ddcb4f
--- /dev/null
+++ b/storage/innobase/buf/buf0dump.cc
@@ -0,0 +1,824 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0dump.cc
+Implements a buffer pool dump/load.
+
+Created April 08, 2011 Vasil Dimov
+*******************************************************/
+
+#include "my_global.h"
+#include "mysqld.h"
+#include "my_sys.h"
+
+#include "mysql/psi/mysql_stage.h"
+#include "mysql/psi/psi.h"
+
+#include "buf0buf.h"
+#include "buf0dump.h"
+#include "dict0dict.h"
+#include "os0file.h"
+#include "os0thread.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "sync0rw.h"
+#include "ut0byte.h"
+
+#include <algorithm>
+
+#include "mysql/service_wsrep.h" /* wsrep_recovery */
+#include <my_service_manager.h>
+
+/* Forward declaration; invoked by buf_dump_start() and buf_load_start()
+after they have set the corresponding request flag. */
+static void buf_do_load_dump();
+
+/* Severity of a status message; selects whether buf_dump_status() and
+buf_load_status() log the message via ib::info() or ib::error(). */
+enum status_severity {
+	STATUS_INFO,
+	STATUS_ERR
+};
+
+/* Whether srv_shutdown_state is anything other than SRV_SHUTDOWN_NONE. */
+#define SHUTTING_DOWN()	(srv_shutdown_state != SRV_SHUTDOWN_NONE)
+
+/* Flags that tell the buffer pool dump/load task which action it should
+take after being woken up. */
+static volatile bool	buf_dump_should_start;
+static volatile bool	buf_load_should_start;
+
+/* Request to abort a buffer pool load; buf_load() clears it when it
+starts and checks it after each page read request. */
+static bool	buf_load_abort_flag;
+
+/** Request a buffer pool dump: set the dump request flag and then invoke
+buf_do_load_dump(). The flag must be set before the call, so the order
+of the two statements matters. */
+void buf_dump_start()
+{
+  buf_dump_should_start= true;
+  buf_do_load_dump();
+}
+
+/** Request a buffer pool load: set the load request flag and then invoke
+buf_do_load_dump(). The flag must be set before the call, so the order
+of the two statements matters. */
+void buf_load_start()
+{
+  buf_load_should_start= true;
+  buf_do_load_dump();
+}
+
+/*****************************************************************//**
+Formats a message into the global variable that backs the status variable
+innodb_buffer_pool_dump_status, and writes the same message to the log
+with the requested severity. The format string and the arguments that
+follow it are interpreted as by printf(3). The current value can be
+inspected with:
+SELECT variable_value FROM information_schema.global_status WHERE
+variable_name = 'INNODB_BUFFER_POOL_DUMP_STATUS';
+or with:
+SHOW STATUS LIKE 'innodb_buffer_pool_dump_status'; */
+static MY_ATTRIBUTE((nonnull, format(printf, 2, 3)))
+void
+buf_dump_status(
+/*============*/
+	enum status_severity	severity,/*!< in: status severity */
+	const char*		fmt,	/*!< in: format */
+	...)				/*!< in: extra parameters according
+					to fmt */
+{
+	auto&	status = export_vars.innodb_buffer_pool_dump_status;
+	va_list	args;
+
+	va_start(args, fmt);
+	vsnprintf(status, sizeof(status), fmt, args);
+	va_end(args);
+
+	/* Any other severity value is stored but not logged. */
+	if (severity == STATUS_INFO) {
+		ib::info() << status;
+	} else if (severity == STATUS_ERR) {
+		ib::error() << status;
+	}
+}
+
+/*****************************************************************//**
+Formats a message into the global variable that backs the status variable
+innodb_buffer_pool_load_status, and writes the same message to the log
+with the requested severity. The format string and the arguments that
+follow it are interpreted as by printf(3). The current value can be
+inspected with:
+SELECT variable_value FROM information_schema.global_status WHERE
+variable_name = 'INNODB_BUFFER_POOL_LOAD_STATUS';
+or with:
+SHOW STATUS LIKE 'innodb_buffer_pool_load_status'; */
+static MY_ATTRIBUTE((nonnull, format(printf, 2, 3)))
+void
+buf_load_status(
+/*============*/
+	enum status_severity	severity,/*!< in: status severity */
+	const char*	fmt,	/*!< in: format */
+	...)			/*!< in: extra parameters according to fmt */
+{
+	auto&	status = export_vars.innodb_buffer_pool_load_status;
+	va_list	args;
+
+	va_start(args, fmt);
+	vsnprintf(status, sizeof(status), fmt, args);
+	va_end(args);
+
+	/* Any other severity value is stored but not logged. */
+	if (severity == STATUS_INFO) {
+		ib::info() << status;
+	} else if (severity == STATUS_ERR) {
+		ib::error() << status;
+	}
+}
+
+/** Determine the directory in which the buffer pool dump file lives.
+@return srv_data_home, or fil_path_to_mysql_datadir if srv_data_home
+is an empty string */
+static
+const char*
+get_buf_dump_dir()
+{
+	/* An empty innodb_data_home_dir means the default data directory. */
+	return(*srv_data_home ? srv_data_home : fil_path_to_mysql_datadir);
+}
+
+/** Build the full path of the buffer pool dump/load file.
+If the file already exists, its resolved real path is returned. Otherwise
+only the directory is resolved and srv_buf_dump_filename is appended to it.
+@param[out]	path		generated path
+@param[in]	path_size	size of 'path', used as in snprintf(3). */
+static void buf_dump_generate_path(char *path, size_t path_size)
+{
+	char	candidate[FN_REFLEN];
+
+	/* Compose the candidate path under the mutex that is held here
+	while reading srv_buf_dump_filename. */
+	mysql_mutex_lock(&LOCK_global_system_variables);
+	snprintf(candidate, sizeof(candidate), "%s%c%s", get_buf_dump_dir(),
+		 OS_PATH_SEPARATOR, srv_buf_dump_filename);
+	mysql_mutex_unlock(&LOCK_global_system_variables);
+
+	os_file_type_t	type;
+	bool		exists = false;
+
+	/* realpath() can only succeed on a file that exists. */
+	if (os_file_status(candidate, &exists, &type) && exists) {
+		/* my_realpath() assumes that the destination can hold
+		FN_REFLEN bytes. */
+		ut_a(path_size >= FN_REFLEN);
+
+		my_realpath(path, candidate, 0);
+		return;
+	}
+
+	/* The file is missing: resolve the directory alone and append
+	the file name, inserting a separator only if one is needed. */
+	char	dir_resolved[FN_REFLEN];
+
+	my_realpath(dir_resolved, get_buf_dump_dir(), 0);
+
+	const bool	ends_with_separator
+		= dir_resolved[strlen(dir_resolved) - 1] == OS_PATH_SEPARATOR;
+
+	if (ends_with_separator) {
+		snprintf(path, path_size, "%s%s",
+			 dir_resolved, srv_buf_dump_filename);
+	} else {
+		snprintf(path, path_size, "%s%c%s",
+			 dir_resolved, OS_PATH_SEPARATOR,
+			 srv_buf_dump_filename);
+	}
+}
+
+/*****************************************************************//**
+Perform a buffer pool dump into the file specified by
+innodb_buffer_pool_filename. If any errors occur then the value of
+innodb_buffer_pool_dump_status will be set accordingly, see buf_dump_status().
+The dump filename can be specified by (relative to srv_data_home):
+SET GLOBAL innodb_buffer_pool_filename='filename';
+
+The page identifiers are collected from buf_pool.LRU while holding
+buf_pool.mutex, written as "space,page" lines to a temporary file
+"<name>.incomplete", and the temporary file is then renamed over the
+final file. If the writing loop is interrupted by shutdown (see
+obey_shutdown), the pages written so far are still installed as the dump.
+On failure the temporary file is left behind. */
+static
+void
+buf_dump(
+/*=====*/
+	ibool	obey_shutdown)	/*!< in: quit if we are in a shutting down
+				state */
+{
+#define SHOULD_QUIT()	(SHUTTING_DOWN() && obey_shutdown)
+
+	char	full_filename[OS_FILE_MAX_PATH];
+	char	tmp_filename[OS_FILE_MAX_PATH + sizeof "incomplete"];
+	char	now[32];
+	FILE*	f;
+	int	ret;
+
+	buf_dump_generate_path(full_filename, sizeof(full_filename));
+
+	snprintf(tmp_filename, sizeof(tmp_filename),
+		 "%s.incomplete", full_filename);
+
+	buf_dump_status(STATUS_INFO, "Dumping buffer pool(s) to %s",
+			full_filename);
+
+#if defined(__GLIBC__) || defined(__WIN__) || O_CLOEXEC == 0
+	f = fopen(tmp_filename, "w" STR_O_CLOEXEC);
+#else
+	{
+		int	fd;
+		fd = open(tmp_filename, O_CREAT | O_TRUNC | O_CLOEXEC | O_WRONLY, 0640);
+		if (fd >= 0) {
+			f = fdopen(fd, "w");
+			if (f == NULL) {
+				/* Do not leak the descriptor. Preserve the
+				errno of fdopen() for the message below,
+				because close() may overwrite it. */
+				const int	fdopen_errno = errno;
+				close(fd);
+				errno = fdopen_errno;
+			}
+		}
+		else {
+			f = NULL;
+		}
+	}
+#endif
+	if (f == NULL) {
+		buf_dump_status(STATUS_ERR,
+				"Cannot open '%s' for writing: %s",
+				tmp_filename, strerror(errno));
+		return;
+	}
+	const buf_page_t*	bpage;
+	page_id_t*		dump;
+	ulint			n_pages;
+	ulint			j;
+
+	mysql_mutex_lock(&buf_pool.mutex);
+
+	n_pages = UT_LIST_GET_LEN(buf_pool.LRU);
+
+	/* skip empty buffer pools */
+	if (n_pages == 0) {
+		mysql_mutex_unlock(&buf_pool.mutex);
+		goto done;
+	}
+
+	if (srv_buf_pool_dump_pct != 100) {
+		ulint		t_pages;
+
+		/* limit the number of total pages dumped to X% of the
+		total number of pages */
+		t_pages = buf_pool.curr_size * srv_buf_pool_dump_pct / 100;
+		if (n_pages > t_pages) {
+			buf_dump_status(STATUS_INFO,
+					"Restricted to " ULINTPF
+					" pages due to "
+					"innodb_buf_pool_dump_pct=%lu",
+					t_pages, srv_buf_pool_dump_pct);
+			n_pages = t_pages;
+		}
+
+		/* Always dump at least one page. */
+		if (n_pages == 0) {
+			n_pages = 1;
+		}
+	}
+
+	dump = static_cast<page_id_t*>(ut_malloc_nokey(
+					       n_pages * sizeof(*dump)));
+
+	if (dump == NULL) {
+		/* Capture errno right away: the unlock and fclose()
+		below may overwrite it before it is reported. */
+		const int	alloc_errno = errno;
+
+		mysql_mutex_unlock(&buf_pool.mutex);
+		fclose(f);
+		buf_dump_status(STATUS_ERR,
+				"Cannot allocate " ULINTPF " bytes: %s",
+				(ulint) (n_pages * sizeof(*dump)),
+				strerror(alloc_errno));
+		/* leave tmp_filename to exist */
+		return;
+	}
+
+	/* Collect up to n_pages page identifiers, starting from the
+	first element of the LRU list. */
+	for (bpage = UT_LIST_GET_FIRST(buf_pool.LRU), j = 0;
+	     bpage != NULL && j < n_pages;
+	     bpage = UT_LIST_GET_NEXT(LRU, bpage)) {
+
+		ut_a(bpage->in_file());
+		const page_id_t id(bpage->id());
+
+		if (id.space() == SRV_TMP_SPACE_ID) {
+			/* Ignore the innodb_temporary tablespace. */
+			continue;
+		}
+
+		if (bpage->status == buf_page_t::FREED) {
+			/* Ignore pages that have been marked as freed. */
+			continue;
+		}
+
+		dump[j++] = id;
+	}
+
+	mysql_mutex_unlock(&buf_pool.mutex);
+
+	ut_a(j <= n_pages);
+	n_pages = j;
+
+	for (j = 0; j < n_pages && !SHOULD_QUIT(); j++) {
+		ret = fprintf(f, "%u,%u\n",
+			      dump[j].space(), dump[j].page_no());
+		if (ret < 0) {
+			/* Capture errno right away: ut_free() and
+			fclose() may overwrite it before it is reported. */
+			const int	write_errno = errno;
+
+			ut_free(dump);
+			fclose(f);
+			buf_dump_status(STATUS_ERR,
+					"Cannot write to '%s': %s",
+					tmp_filename, strerror(write_errno));
+			/* leave tmp_filename to exist */
+			return;
+		}
+		/* During shutdown, periodically report progress to the
+		service manager. */
+		if (SHUTTING_DOWN() && !(j & 1023)) {
+			service_manager_extend_timeout(
+				INNODB_EXTEND_TIMEOUT_INTERVAL,
+				"Dumping buffer pool page "
+				ULINTPF "/" ULINTPF, j + 1, n_pages);
+		}
+	}
+
+	ut_free(dump);
+
+done:
+	ret = fclose(f);
+	if (ret != 0) {
+		buf_dump_status(STATUS_ERR,
+				"Cannot close '%s': %s",
+				tmp_filename, strerror(errno));
+		return;
+	}
+	/* else */
+
+	ret = unlink(full_filename);
+	if (ret != 0 && errno != ENOENT) {
+		buf_dump_status(STATUS_ERR,
+				"Cannot delete '%s': %s",
+				full_filename, strerror(errno));
+		/* leave tmp_filename to exist */
+		return;
+	}
+	/* else */
+
+	ret = rename(tmp_filename, full_filename);
+	if (ret != 0) {
+		buf_dump_status(STATUS_ERR,
+				"Cannot rename '%s' to '%s': %s",
+				tmp_filename, full_filename,
+				strerror(errno));
+		/* leave tmp_filename to exist */
+		return;
+	}
+	/* else */
+
+	/* success */
+
+	ut_sprintf_timestamp(now);
+
+	buf_dump_status(STATUS_INFO,
+			"Buffer pool(s) dump completed at %s", now);
+
+	/* Though dumping is not related to an incomplete load,
+	 we reset this to 0 here to indicate that a shutdown can also perform
+	 a dump */
+	export_vars.innodb_buffer_pool_load_incomplete = 0;
+}
+
+/*****************************************************************//**
+Slow down the buffer pool load when the server is busy with other work,
+so that the load does not hog the I/O capacity and hurt normal client
+queries. The check is carried out once per srv_io_capacity I/O
+operations; on all other calls the function returns immediately. */
+UNIV_INLINE
+void
+buf_load_throttle_if_needed(
+/*========================*/
+	ulint*	last_check_time,	/*!< in/out: milliseconds since epoch
+					of the last time we did check if
+					throttling is needed, we do the check
+					every srv_io_capacity IO ops. */
+	ulint*	last_activity_count,	/*!< in/out: value of
+					srv_get_activity_count() recorded
+					at that time */
+	ulint	n_io)			/*!< in: number of IO ops done since
+					buffer pool load has started */
+{
+	if (n_io % srv_io_capacity < srv_io_capacity - 1) {
+		/* Not at a check point yet. */
+		return;
+	}
+
+	const bool	have_baseline
+		= *last_check_time != 0 && *last_activity_count != 0;
+
+	if (have_baseline) {
+		/* srv_io_capacity IO operations have been performed by
+		the buffer pool load since the previous check. */
+
+		if (srv_get_activity_count() == *last_activity_count) {
+			/* The server is otherwise idle: keep loading at
+			full speed and leave the baseline untouched. */
+			return;
+		}
+
+		/* There has been other activity, so throttle.
+
+		Note that the elapsed time is measured from the last
+		time the baseline was recorded, which is not necessarily
+		the start of the last srv_io_capacity operations: while
+		the server is idle the baseline is not refreshed. As a
+		result, the first check after other activity begins may
+		see a stale timestamp and skip the sleep; it only
+		refreshes the baseline, and throttling takes effect from
+		the next check on. Avoiding this would require calling
+		ut_time_ms() before the idle test above on every check,
+		which may turn out to be too expensive. */
+		const ulint	elapsed_ms = ut_time_ms() - *last_check_time;
+
+		if (elapsed_ms < 1000 /* 1 sec (1000 milli secs) */) {
+			/* Sleep for the rest of the second, so that at
+			most srv_io_capacity operations are done per
+			second. */
+			os_thread_sleep((1000 - elapsed_ms)
+					* 1000 /* micro secs */);
+		}
+	}
+
+	/* Record (or refresh) the baseline for the next check. */
+	*last_check_time = ut_time_ms();
+	*last_activity_count = srv_get_activity_count();
+}
+
+/*****************************************************************//**
+Perform a buffer pool load from the file specified by
+innodb_buffer_pool_filename. If any errors occur then the value of
+innodb_buffer_pool_load_status will be set accordingly, see buf_load_status().
+The dump filename can be specified by (relative to srv_data_home):
+SET GLOBAL innodb_buffer_pool_filename='filename'; */
+static
+void
+buf_load()
+/*======*/
+{
+	char		full_filename[OS_FILE_MAX_PATH];
+	char		now[32];
+	FILE*		f;
+	page_id_t*	dump;
+	ulint		dump_n;
+	ulint		i;
+	uint32_t	space_id;
+	uint32_t	page_no;
+	int		fscanf_ret;
+
+	/* Ignore any leftovers from before */
+	buf_load_abort_flag = false;
+
+	buf_dump_generate_path(full_filename, sizeof(full_filename));
+
+	buf_load_status(STATUS_INFO,
+			"Loading buffer pool(s) from %s", full_filename);
+
+	f = fopen(full_filename, "r" STR_O_CLOEXEC);
+	if (f == NULL) {
+		buf_load_status(STATUS_INFO,
+				"Cannot open '%s' for reading: %s",
+				full_filename, strerror(errno));
+		return;
+	}
+	/* else */
+
+	/* First scan the file to estimate how many entries are in it.
+	This file is tiny (approx 500KB per 1GB buffer pool), reading it
+	two times is fine. */
+	dump_n = 0;
+	while (fscanf(f, "%u,%u", &space_id, &page_no) == 2
+	       && !SHUTTING_DOWN()) {
+		dump_n++;
+	}
+
+	if (!SHUTTING_DOWN() && !feof(f)) {
+		/* fscanf() returned != 2 */
+		const char*	what;
+		if (ferror(f)) {
+			what = "reading";
+		} else {
+			what = "parsing";
+		}
+		fclose(f);
+		buf_load_status(STATUS_ERR, "Error %s '%s',"
+				" unable to load buffer pool (stage 1)",
+				what, full_filename);
+		return;
+	}
+
+	/* If dump is larger than the buffer pool(s), then we ignore the
+	extra trailing. This could happen if a dump is made, then buffer
+	pool is shrunk and then load is attempted. */
+	dump_n = std::min(dump_n, buf_pool.get_n_pages());
+
+	if (dump_n != 0) {
+		dump = static_cast<page_id_t*>(ut_malloc_nokey(
+				dump_n * sizeof(*dump)));
+	} else {
+		fclose(f);
+		ut_sprintf_timestamp(now);
+		buf_load_status(STATUS_INFO,
+				"Buffer pool(s) load completed at %s"
+				" (%s was empty)", now, full_filename);
+		return;
+	}
+
+	if (dump == NULL) {
+		fclose(f);
+		buf_load_status(STATUS_ERR,
+				"Cannot allocate " ULINTPF " bytes: %s",
+				dump_n * sizeof(*dump),
+				strerror(errno));
+		return;
+	}
+
+	rewind(f);
+
+	export_vars.innodb_buffer_pool_load_incomplete = 1;
+
+	for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) {
+		fscanf_ret = fscanf(f, "%u,%u", &space_id, &page_no);
+
+		if (fscanf_ret != 2) {
+			if (feof(f)) {
+				break;
+			}
+			/* else */
+
+			ut_free(dump);
+			fclose(f);
+			buf_load_status(STATUS_ERR,
+					"Error parsing '%s', unable"
+					" to load buffer pool (stage 2)",
+					full_filename);
+			return;
+		}
+
+		if (space_id > ULINT32_MASK || page_no > ULINT32_MASK) {
+			ut_free(dump);
+			fclose(f);
+			buf_load_status(STATUS_ERR,
+					"Error parsing '%s': bogus"
+					" space,page %u,%u at line " ULINTPF
+					", unable to load buffer pool",
+					full_filename,
+					space_id, page_no,
+					i);
+			return;
+		}
+
+		dump[i] = page_id_t(space_id, page_no);
+	}
+
+	/* Set dump_n to the actual number of initialized elements,
+	i could be smaller than dump_n here if the file got truncated after
+	we read it the first time. */
+	dump_n = i;
+
+	fclose(f);
+
+	if (dump_n == 0) {
+		ut_free(dump);
+		ut_sprintf_timestamp(now);
+		buf_load_status(STATUS_INFO,
+				"Buffer pool(s) load completed at %s"
+				" (%s was empty or had errors)", now, full_filename);
+		return;
+	}
+
+	if (!SHUTTING_DOWN()) {
+		std::sort(dump, dump + dump_n);
+	}
+
+	ulint		last_check_time = 0;
+	ulint		last_activity_cnt = 0;
+
+	/* Avoid calling the expensive fil_space_t::get() for each
+	page within the same tablespace. dump[] is sorted by (space, page),
+	so all pages from a given tablespace are consecutive. */
+	ulint		cur_space_id = dump[0].space();
+	fil_space_t*	space = fil_space_t::get(cur_space_id);
+	ulint		zip_size = space ? space->zip_size() : 0;
+
+	PSI_stage_progress*	pfs_stage_progress __attribute__((unused))
+		= mysql_set_stage(srv_stage_buffer_pool_load.m_key);
+	mysql_stage_set_work_estimated(pfs_stage_progress, dump_n);
+	mysql_stage_set_work_completed(pfs_stage_progress, 0);
+
+	for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) {
+
+		/* space_id for this iteration of the loop */
+		const ulint	this_space_id = dump[i].space();
+
+		if (this_space_id == SRV_TMP_SPACE_ID) {
+			/* Ignore the innodb_temporary tablespace. */
+			continue;
+		}
+
+		if (this_space_id != cur_space_id) {
+			if (space) {
+				space->release();
+			}
+
+			cur_space_id = this_space_id;
+			space = fil_space_t::get(cur_space_id);
+
+			if (!space) {
+				continue;
+			}
+
+			zip_size = space->zip_size();
+		}
+
+		/* JAN: TODO: As we use background page read below,
+		if tablespace is encrypted we cant use it. */
+		if (!space || dump[i].page_no() >= space->get_size() ||
+		    (space->crypt_data &&
+		     space->crypt_data->encryption != FIL_ENCRYPTION_OFF &&
+		     space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED)) {
+			continue;
+		}
+
+		if (space->is_stopping()) {
+			space->release();
+			space = nullptr;
+			continue;
+		}
+
+		space->reacquire();
+		buf_read_page_background(space, dump[i], zip_size, true);
+
+		if (buf_load_abort_flag) {
+			if (space) {
+				space->release();
+			}
+			buf_load_abort_flag = false;
+			ut_free(dump);
+			buf_load_status(
+				STATUS_INFO,
+				"Buffer pool(s) load aborted on request");
+			/* Premature end, set estimated = completed = i and
+			end the current stage event. */
+
+			mysql_stage_set_work_estimated(pfs_stage_progress, i);
+			mysql_stage_set_work_completed(pfs_stage_progress, i);
+
+			mysql_end_stage();
+			return;
+		}
+
+		buf_load_throttle_if_needed(
+			&last_check_time, &last_activity_cnt, i);
+
+#ifdef UNIV_DEBUG
+		if ((i+1) >= srv_buf_pool_load_pages_abort) {
+			buf_load_abort_flag = true;
+		}
+#endif
+	}
+
+	if (space) {
+		space->release();
+	}
+
+	ut_free(dump);
+
+	ut_sprintf_timestamp(now);
+
+	if (i == dump_n) {
+		buf_load_status(STATUS_INFO,
+			"Buffer pool(s) load completed at %s", now);
+		export_vars.innodb_buffer_pool_load_incomplete = 0;
+	} else if (!buf_load_abort_flag) {
+		buf_load_status(STATUS_INFO,
+			"Buffer pool(s) load aborted due to user instigated abort at %s",
+			now);
+		/* intentionally don't reset innodb_buffer_pool_load_incomplete
+                   as we don't want a shutdown to save the buffer pool */
+	} else {
+		buf_load_status(STATUS_INFO,
+			"Buffer pool(s) load aborted due to shutdown at %s",
+			now);
+		/* intentionally don't reset innodb_buffer_pool_load_incomplete
+                   as we want to abort without saving the buffer pool */
+	}
+
+	/* Make sure that estimated = completed when we end. */
+	mysql_stage_set_work_completed(pfs_stage_progress, dump_n);
+	/* End the stage progress event. */
+	mysql_end_stage();
+}
+
+/** Abort a running buffer pool load; the load loop polls this flag. */
+void buf_load_abort()
+{
+  buf_load_abort_flag= true;
+}
+
+/*****************************************************************//**
+This is the main task for buffer pool dump/load. When scheduled, it performs
+a dump and/or a load, depending on the server state and the request flags. */
+static void buf_dump_load_func(void *)
+{
+	ut_ad(!srv_read_only_mode);
+	static bool first_time = true;
+	if (first_time && srv_buffer_pool_load_at_startup) {
+
+#ifdef WITH_WSREP
+		if (!get_wsrep_recovery()) {
+#endif /* WITH_WSREP */
+			buf_load();
+#ifdef WITH_WSREP
+		}
+#endif /* WITH_WSREP */
+	}
+	first_time = false; /* the startup load is attempted at most once */
+
+	while (!SHUTTING_DOWN()) {
+		if (buf_dump_should_start) {
+			buf_dump_should_start = false;
+			buf_dump(true);
+		}
+		if (buf_load_should_start) {
+			buf_load_should_start = false;
+			buf_load();
+		}
+
+		if (!buf_dump_should_start && !buf_load_should_start) {
+			return; /* no further request was flagged meanwhile */
+		}
+	}
+
+	/* In shutdown: dump the buffer pool if so configured. */
+	if (srv_buffer_pool_dump_at_shutdown && srv_fast_shutdown != 2) {
+		if (export_vars.innodb_buffer_pool_load_incomplete) {
+			buf_dump_status(STATUS_INFO,
+				"Dumping of buffer pool not started"
+				" as load was incomplete");
+#ifdef WITH_WSREP
+		} else if (get_wsrep_recovery()) { /* nothing is dumped */
+#endif /* WITH_WSREP */
+		} else {
+			buf_dump(false/* do complete dump at shutdown */);
+		}
+	}
+}
+
+
+/* Task group for the dump/load task; created with max. concurrency 1 */
+static tpool::task_group tpool_group(1);
+static tpool::waitable_task buf_dump_load_task(buf_dump_load_func, &tpool_group);
+static bool load_dump_enabled; /* set by buf_load_at_startup() */
+
+/** Enable the dump/load task; submit it if srv_buffer_pool_load_at_startup. */
+void buf_load_at_startup()
+{
+  load_dump_enabled= true;
+  if (srv_buffer_pool_load_at_startup)
+    buf_do_load_dump();
+}
+
+static void buf_do_load_dump()
+{ /* Submit the dump/load task unless not yet enabled or already running. */
+  if (load_dump_enabled && !buf_dump_load_task.is_running())
+    srv_thread_pool->submit_task(&buf_dump_load_task);
+}
+
+/** Wait for a running load/dump task to finish; call only during shutdown. */
+void buf_load_dump_end()
+{
+  ut_ad(SHUTTING_DOWN());
+  buf_dump_load_task.wait();
+}
diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc
new file mode 100644
index 00000000..10a84d99
--- /dev/null
+++ b/storage/innobase/buf/buf0flu.cc
@@ -0,0 +1,2530 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2021, MariaDB Corporation.
+Copyright (c) 2013, 2014, Fusion-io
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0flu.cc
+The database buffer buf_pool flush algorithm
+
+Created 11/11/1995 Heikki Tuuri
+*******************************************************/
+
+#include "univ.i"
+#include <my_service_manager.h>
+#include <mysql/service_thd_wait.h>
+#include <sql_class.h>
+
+#include "buf0flu.h"
+#include "buf0buf.h"
+#include "buf0checksum.h"
+#include "buf0dblwr.h"
+#include "srv0start.h"
+#include "page0zip.h"
+#include "fil0fil.h"
+#include "log0crypt.h"
+#include "srv0mon.h"
+#include "fil0pagecompress.h"
+#ifdef HAVE_LZO
+# include "lzo/lzo1x.h"
+#elif defined HAVE_SNAPPY
+# include "snappy-c.h"
+#endif
+
+/** Number of pages flushed via LRU. Protected by buf_pool.mutex.
+Also included in buf_flush_page_count. */
+ulint buf_lru_flush_page_count;
+
+/** Number of pages flushed. Protected by buf_pool.mutex. */
+ulint buf_flush_page_count;
+
+/** Flag indicating whether the page cleaner is in the active state. */
+bool buf_page_cleaner_is_active;
+
+/** Factor for scan length to determine n_pages for intended oldest LSN
+progress */
+static constexpr ulint buf_flush_lsn_scan_factor = 3;
+
+/** Average redo generation rate */
+static lsn_t lsn_avg_rate = 0;
+
+/** Target oldest_modification for the page cleaner background flushing;
+writes are protected by buf_pool.flush_list_mutex */
+static Atomic_relaxed<lsn_t> buf_flush_async_lsn;
+/** Target oldest_modification for the page cleaner furious flushing;
+writes are protected by buf_pool.flush_list_mutex */
+static Atomic_relaxed<lsn_t> buf_flush_sync_lsn;
+
+#ifdef UNIV_PFS_THREAD
+mysql_pfs_key_t page_cleaner_thread_key;
+#endif /* UNIV_PFS_THREAD */
+
+/** Page cleaner statistics about adaptive flushing */
+static struct
+{
+  /** total elapsed time in adaptive flushing, in seconds */
+  ulint flush_time;
+  /** number of adaptive flushing passes */
+  ulint flush_pass;
+} page_cleaner;
+
+#ifdef UNIV_DEBUG
+my_bool innodb_page_cleaner_disabled_debug;
+#endif /* UNIV_DEBUG */
+
+/** If the LRU list of the buf_pool is shorter than this, then LRU eviction
+should not happen. This is because when we do LRU flushing we also put
+the blocks on the free list. If the LRU list is very small, we can end up
+thrashing. */
+#define BUF_LRU_MIN_LEN		256
+
+/* @} */
+
+#ifdef UNIV_DEBUG
+/** Validate the flush list. */
+static void buf_flush_validate_low();
+
+/** Validate the flush list, but only on some of the calls (debug builds). */
+static void buf_flush_validate_skip()
+{
+/** Call buf_flush_validate_low() once every this many invocations */
+# define BUF_FLUSH_VALIDATE_SKIP	23
+
+	/** The buf_flush_validate_low() call skip counter.
+	Use a signed type because of the race condition below. */
+	static int buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
+
+	/* There is a race condition below, but it does not matter,
+	because this call is only for heuristic purposes. We want to
+	reduce the call frequency of the costly buf_flush_validate_low()
+	check in debug builds. */
+	if (--buf_flush_validate_count > 0) {
+		return;
+	}
+
+	buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
+	buf_flush_validate_low();
+}
+#endif /* UNIV_DEBUG */
+
+/** Wake up the page cleaner if it is idle and there is flushing to do */
+inline void buf_pool_t::page_cleaner_wakeup()
+{
+  if (!page_cleaner_idle())
+    return;
+  double dirty_pct= double(UT_LIST_GET_LEN(buf_pool.flush_list)) * 100.0 /
+    double(UT_LIST_GET_LEN(buf_pool.LRU) + UT_LIST_GET_LEN(buf_pool.free));
+  double pct_lwm= srv_max_dirty_pages_pct_lwm;
+
+  /* If pct_lwm != 0.0, adaptive flushing is enabled.
+  Signal the buf page cleaner thread:
+  - if pct_lwm <= dirty_pct then it will invoke the adaptive flushing flow
+  - if pct_lwm > dirty_pct then it will invoke the idle flushing flow.
+
+  idle_flushing:
+  dirty_pct < innodb_max_dirty_pages_pct_lwm so it could be an
+  idle flushing use-case.
+
+  Why is last_activity_count not always updated?
+  - Let us first understand when the server activity count is updated.
+  - It is updated on commit of a transaction, in trx_t::commit(), and not
+    on adding a page to the flush list.
+  - page_cleaner_wakeup is called when a page is added to the flush list.
+
+  - Now let us say that the first user thread updates the count from X -> Y
+    but is yet to commit the transaction (so activity count is still Y).
+    Follow-up user threads will see the updated count (Y), which matches
+    the universal server activity count (Y), giving a false impression that
+    the server is idle.
+
+  How to avoid this?
+  - By allowing last_activity_count to be updated when the page cleaner is
+    made active and has work to do. This ensures that the last_activity
+    signal is consumed by the page cleaner before the next one is generated. */
+  if ((pct_lwm != 0.0 && pct_lwm <= dirty_pct) ||
+      (pct_lwm != 0.0 && last_activity_count == srv_get_activity_count()) ||
+      srv_max_buf_pool_modified_pct <= dirty_pct)
+  {
+    page_cleaner_is_idle= false;
+    pthread_cond_signal(&do_flush_list);
+  }
+}
+
+inline void buf_pool_t::delete_from_flush_list_low(buf_page_t *bpage)
+{ /* Unlink bpage from flush_list; flush_list_bytes is left to the caller. */
+  ut_ad(!fsp_is_system_temporary(bpage->id().space()));
+  mysql_mutex_assert_owner(&flush_list_mutex);
+  flush_hp.adjust(bpage); /* adjust the hazard pointer before the removal */
+  UT_LIST_REMOVE(flush_list, bpage);
+}
+
+/** Insert a modified block at the start of the flush list.
+@param block    modified block
+@param lsn      start LSN of the mini-transaction that modified the block */
+void buf_pool_t::insert_into_flush_list(buf_block_t *block, lsn_t lsn)
+{
+  mysql_mutex_assert_not_owner(&mutex);
+  mysql_mutex_assert_owner(&log_sys.flush_order_mutex);
+  ut_ad(lsn > 2);
+  ut_ad(!fsp_is_system_temporary(block->page.id().space()));
+
+  mysql_mutex_lock(&flush_list_mutex);
+  if (ut_d(const lsn_t old=) block->page.oldest_modification())
+  {
+    ut_ad(old == 1);
+    /* Unlink first; presumably already counted in stat.flush_list_bytes. */
+    delete_from_flush_list_low(&block->page);
+  }
+  else
+    stat.flush_list_bytes+= block->physical_size();
+  ut_ad(stat.flush_list_bytes <= curr_pool_size);
+
+  block->page.set_oldest_modification(lsn);
+  MEM_CHECK_DEFINED(block->page.zip.data
+                    ? block->page.zip.data : block->frame,
+                    block->physical_size());
+  UT_LIST_ADD_FIRST(flush_list, &block->page);
+  ut_d(buf_flush_validate_skip());
+  page_cleaner_wakeup(); /* called while still holding flush_list_mutex */
+  mysql_mutex_unlock(&flush_list_mutex);
+}
+
+/** Remove a block from flush_list. The caller must hold flush_list_mutex.
+@param bpage   buffer pool page
+@param clear   whether to invoke buf_page_t::clear_oldest_modification() */
+void buf_pool_t::delete_from_flush_list(buf_page_t *bpage, bool clear)
+{
+  delete_from_flush_list_low(bpage);
+  stat.flush_list_bytes-= bpage->physical_size();
+  if (clear)
+    bpage->clear_oldest_modification();
+#ifdef UNIV_DEBUG
+  buf_flush_validate_skip();
+#endif /* UNIV_DEBUG */
+}
+
+/** Remove the dirty pages of a given tablespace from buf_pool.flush_list
+when we are deleting the data file of that tablespace.
+The pages still remain a part of LRU and are evicted from
+the list as they age towards the tail of the LRU.
+@param id    tablespace identifier; asserted to be nonzero in debug builds */
+void buf_flush_remove_pages(ulint id)
+{
+  const page_id_t first(id, 0), end(id + 1, 0);
+  ut_ad(id);
+  mysql_mutex_lock(&buf_pool.mutex);
+
+  for (;;)
+  {
+    bool deferred= false;
+
+    mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+    for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); bpage; )
+    {
+      ut_d(const auto s= bpage->state());
+      ut_ad(s == BUF_BLOCK_ZIP_PAGE || s == BUF_BLOCK_FILE_PAGE ||
+            s == BUF_BLOCK_REMOVE_HASH);
+      buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);
+
+      const page_id_t bpage_id(bpage->id());
+
+      if (bpage_id < first || bpage_id >= end); /* other tablespace: skip */
+      else if (bpage->io_fix() != BUF_IO_NONE)
+        deferred= true; /* io-fixed page: retry in another pass */
+      else
+        buf_pool.delete_from_flush_list(bpage);
+
+      bpage= prev;
+    }
+
+    mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+    if (!deferred)
+      break;
+
+    mysql_mutex_unlock(&buf_pool.mutex);
+    os_thread_yield();
+    mysql_mutex_lock(&buf_pool.mutex);
+    buf_flush_wait_batch_end(false); /* called with buf_pool.mutex held */
+  }
+
+  mysql_mutex_unlock(&buf_pool.mutex);
+}
+
+/*******************************************************************//**
+Relocates a buffer control block on the flush_list.
+Note that it is assumed that the contents of bpage have already been
+copied to dpage.
+IMPORTANT: When this function is called bpage and dpage are not
+exact copies of each other. For example, they both will have different
+::state. Also the ::list pointers in dpage may be stale. We need to
+use the current list node (bpage) to do the list manipulation because
+the list pointers could have changed between the time that we copied
+the contents of bpage to the dpage and the flush list manipulation
+below. The caller must hold buf_pool.flush_list_mutex. */
+ATTRIBUTE_COLD
+void
+buf_flush_relocate_on_flush_list(
+/*=============================*/
+	buf_page_t*	bpage,	/*!< in/out: control block being moved */
+	buf_page_t*	dpage)	/*!< in/out: destination block */
+{
+	buf_page_t*	prev;
+
+	mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
+	ut_ad(!fsp_is_system_temporary(bpage->id().space()));
+
+	const lsn_t lsn = bpage->oldest_modification();
+
+	if (!lsn) {
+		return; /* nothing to relocate */
+	}
+
+	ut_ad(lsn == 1 || lsn > 2);
+	ut_ad(dpage->oldest_modification() == lsn);
+
+	/* Important that we adjust the hazard pointer before removing
+	the bpage from the flush list. */
+	buf_pool.flush_hp.adjust(bpage);
+
+	prev = UT_LIST_GET_PREV(list, bpage);
+	UT_LIST_REMOVE(buf_pool.flush_list, bpage);
+
+	bpage->clear_oldest_modification();
+
+	if (lsn == 1) {
+		/* Do not re-insert: dpage leaves the flush list as well. */
+		buf_pool.stat.flush_list_bytes -= dpage->physical_size();
+		dpage->list.prev = nullptr;
+		dpage->list.next = nullptr;
+		dpage->clear_oldest_modification();
+	} else if (prev) {
+		ut_ad(prev->oldest_modification());
+		UT_LIST_INSERT_AFTER(buf_pool.flush_list, prev, dpage);
+	} else {
+		UT_LIST_ADD_FIRST(buf_pool.flush_list, dpage);
+	}
+
+	ut_d(buf_flush_validate_low());
+}
+
+/** Complete the write of a file page from buf_pool.
+@param request write request; request.bpage is the page that was written */
+void buf_page_write_complete(const IORequest &request)
+{
+  ut_ad(request.is_write());
+  ut_ad(!srv_read_only_mode/* ||
+        request.node->space->purpose == FIL_TYPE_TEMPORARY*/);
+  buf_page_t *bpage= request.bpage;
+  ut_ad(bpage);
+  ut_ad(bpage->in_file());
+  /* bpage->io_fix() can only be changed by buf_page_write_complete()
+  and buf_page_read_complete() from BUF_IO_READ or BUF_IO_WRITE */
+  ut_ad(bpage->io_fix() == BUF_IO_WRITE);
+  ut_ad(!buf_dblwr.is_inside(bpage->id()));
+  ut_ad(request.node->space->id == bpage->id().space());
+
+  if (bpage->status == buf_page_t::INIT_ON_FLUSH)
+    bpage->status= buf_page_t::NORMAL;
+  else
+  {
+    ut_ad(bpage->status == buf_page_t::NORMAL);
+    if (request.node->space->use_doublewrite())
+    {
+      ut_ad(request.node->space != fil_system.temp_space);
+      buf_dblwr.write_completed();
+    }
+  }
+
+  if (bpage->slot)
+  {
+    bpage->slot->release(); /* slot is assigned in buf_page_encrypt() */
+    bpage->slot= nullptr;
+  }
+
+  if (UNIV_UNLIKELY(MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE)))
+    buf_page_monitor(bpage, BUF_IO_WRITE);
+  DBUG_PRINT("ib_buf", ("write page %u:%u",
+                        bpage->id().space(), bpage->id().page_no()));
+  const bool temp= fsp_is_system_temporary(bpage->id().space());
+
+  mysql_mutex_lock(&buf_pool.mutex);
+  buf_pool.stat.n_pages_written++;
+  /* While we do not need any mutex for clearing oldest_modification
+  here, we hope that it will be in the same cache line with io_fix,
+  whose changes must be protected by buf_pool.mutex. */
+  bpage->clear_oldest_modification(temp);
+  ut_ad(bpage->io_fix() == BUF_IO_WRITE);
+  bpage->set_io_fix(BUF_IO_NONE);
+
+  /* Because this thread which does the unlocking might not be the same that
+  did the locking, we use a pass value != 0 in unlock, which simply
+  removes the newest lock debug record, without checking the thread id. */
+  if (bpage->state() == BUF_BLOCK_FILE_PAGE)
+    rw_lock_sx_unlock_gen(&((buf_block_t*) bpage)->lock, BUF_IO_WRITE);
+
+  if (request.is_LRU())
+  {
+    buf_LRU_free_page(bpage, true);
+
+    ut_ad(buf_pool.n_flush_LRU_);
+    if (!--buf_pool.n_flush_LRU_) /* the last pending LRU write completed */
+    {
+      pthread_cond_broadcast(&buf_pool.done_flush_LRU);
+      pthread_cond_signal(&buf_pool.done_free);
+    }
+  }
+  else
+  {
+    ut_ad(!temp);
+    ut_ad(buf_pool.n_flush_list_);
+    if (!--buf_pool.n_flush_list_) /* the last pending list write completed */
+      pthread_cond_broadcast(&buf_pool.done_flush_list);
+  }
+
+  mysql_mutex_unlock(&buf_pool.mutex);
+}
+
+/** Compute a ROW_FORMAT=COMPRESSED page checksum and store it in the page.
+@param[in,out]	page		page to update
+@param[in]	size		compressed page size; must be nonzero */
+void buf_flush_update_zip_checksum(buf_frame_t *page, ulint size)
+{
+  ut_ad(size > 0);
+  mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
+                  page_zip_calc_checksum(page, size,
+                                         static_cast<srv_checksum_algorithm_t>
+                                         (srv_checksum_algorithm)));
+}
+
+/** Assign the full crc32 checksum for non-compressed page.
+@param[in,out]	page	page to be updated; the CRC-32 goes in its last bytes */
+void buf_flush_assign_full_crc32_checksum(byte* page)
+{
+	ut_d(bool compressed = false);
+	ut_d(bool corrupted = false);
+	ut_d(const uint size = buf_page_full_crc32_size(page, &compressed,
+							&corrupted));
+	ut_ad(!compressed);
+	ut_ad(!corrupted);
+	ut_ad(size == uint(srv_page_size));
+	const ulint payload = srv_page_size - FIL_PAGE_FCRC32_CHECKSUM;
+	mach_write_to_4(page + payload, ut_crc32(page, payload));
+}
+
+/** Initialize a page for writing to the tablespace.
+@param[in]	block			buffer block; NULL if bypassing
+					the buffer pool
+@param[in,out]	page			page frame
+@param[in,out]	page_zip_		compressed page, or NULL if
+					uncompressed
+@param[in]	use_full_checksum	whether tablespace uses full checksum */
+void
+buf_flush_init_for_writing(
+	const buf_block_t*	block,
+	byte*			page,
+	void*			page_zip_,
+	bool			use_full_checksum)
+{
+	if (block != NULL && block->frame != page) {
+		/* If the page is encrypted in full crc32 format, then the
+		checksum was already stored as a part of fil_encrypt_buf() */
+		ut_ad(use_full_checksum);
+		return;
+	}
+
+	ut_ad(block == NULL || block->frame == page);
+	ut_ad(block == NULL || page_zip_ == NULL
+	      || &block->page.zip == page_zip_);
+	ut_ad(page);
+
+	if (page_zip_) {
+		page_zip_des_t*	page_zip;
+		ulint		size;
+
+		page_zip = static_cast<page_zip_des_t*>(page_zip_);
+		size = page_zip_get_size(page_zip);
+
+		ut_ad(size);
+		ut_ad(ut_is_2pow(size));
+		ut_ad(size <= UNIV_ZIP_SIZE_MAX);
+
+		switch (fil_page_get_type(page)) {
+		case FIL_PAGE_TYPE_ALLOCATED:
+		case FIL_PAGE_INODE:
+		case FIL_PAGE_IBUF_BITMAP:
+		case FIL_PAGE_TYPE_FSP_HDR:
+		case FIL_PAGE_TYPE_XDES:
+			/* These are essentially uncompressed pages. */
+			memcpy(page_zip->data, page, size);
+			/* fall through */
+		case FIL_PAGE_TYPE_ZBLOB:
+		case FIL_PAGE_TYPE_ZBLOB2:
+		case FIL_PAGE_INDEX:
+		case FIL_PAGE_RTREE:
+			buf_flush_update_zip_checksum(page_zip->data, size);
+			return;
+		}
+
+		/* Any other page type is reported and treated as fatal. */
+		ib::error() << "The compressed page to be written"
+			" seems corrupt:";
+		ut_print_buf(stderr, page, size);
+		fputs("\nInnoDB: Possibly older version of the page:", stderr);
+		ut_print_buf(stderr, page_zip->data, size);
+		putc('\n', stderr);
+		ut_error;
+	}
+
+	if (use_full_checksum) {
+		static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "aligned");
+		static_assert(FIL_PAGE_LSN % 4 == 0, "aligned");
+		memcpy_aligned<4>(page + srv_page_size
+				  - FIL_PAGE_FCRC32_END_LSN,
+				  FIL_PAGE_LSN + 4 + page, 4);
+		return buf_flush_assign_full_crc32_checksum(page);
+	}
+
+	static_assert(FIL_PAGE_END_LSN_OLD_CHKSUM % 8 == 0, "aligned");
+	static_assert(FIL_PAGE_LSN % 8 == 0, "aligned");
+	memcpy_aligned<8>(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM,
+			  FIL_PAGE_LSN + page, 8);
+
+	if (block && srv_page_size == 16384) {
+		/* The page type could be garbage in old files
+		created before MySQL 5.5. Such files always
+		had a page size of 16 kilobytes. */
+		ulint	page_type = fil_page_get_type(page);
+		ulint	reset_type = page_type;
+
+		switch (block->page.id().page_no() % 16384) {
+		case 0:
+			reset_type = block->page.id().page_no() == 0
+				? FIL_PAGE_TYPE_FSP_HDR
+				: FIL_PAGE_TYPE_XDES;
+			break;
+		case 1:
+			reset_type = FIL_PAGE_IBUF_BITMAP;
+			break;
+		case FSP_TRX_SYS_PAGE_NO:
+			if (block->page.id()
+			    == page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO)) {
+				reset_type = FIL_PAGE_TYPE_TRX_SYS;
+				break;
+			}
+			/* fall through */
+		default:
+			switch (page_type) {
+			case FIL_PAGE_INDEX:
+			case FIL_PAGE_TYPE_INSTANT:
+			case FIL_PAGE_RTREE:
+			case FIL_PAGE_UNDO_LOG:
+			case FIL_PAGE_INODE:
+			case FIL_PAGE_IBUF_FREE_LIST:
+			case FIL_PAGE_TYPE_ALLOCATED:
+			case FIL_PAGE_TYPE_SYS:
+			case FIL_PAGE_TYPE_TRX_SYS:
+			case FIL_PAGE_TYPE_BLOB:
+			case FIL_PAGE_TYPE_ZBLOB:
+			case FIL_PAGE_TYPE_ZBLOB2:
+				break;
+			case FIL_PAGE_TYPE_FSP_HDR:
+			case FIL_PAGE_TYPE_XDES:
+			case FIL_PAGE_IBUF_BITMAP:
+				/* These pages should have
+				predetermined page numbers
+				(see above). */
+			default:
+				reset_type = FIL_PAGE_TYPE_UNKNOWN;
+				break;
+			}
+		}
+
+		if (UNIV_UNLIKELY(page_type != reset_type)) {
+			ib::info()
+				<< "Resetting invalid page "
+				<< block->page.id() << " type "
+				<< page_type << " to "
+				<< reset_type << " when flushing.";
+			fil_page_set_type(page, reset_type);
+		}
+	}
+
+	uint32_t checksum = BUF_NO_CHECKSUM_MAGIC;
+
+	switch (srv_checksum_algorithm_t(srv_checksum_algorithm)) {
+	case SRV_CHECKSUM_ALGORITHM_INNODB:
+	case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+		checksum = buf_calc_page_new_checksum(page);
+		mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
+				checksum);
+		/* With the InnoDB checksum, we overwrite the first 4 bytes of
+		the end lsn field to store the old formula checksum. Since it
+		depends also on the field FIL_PAGE_SPACE_OR_CHKSUM, it has to
+		be calculated after storing the new formula checksum. */
+		checksum = buf_calc_page_old_checksum(page);
+		break;
+	case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
+	case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
+	case SRV_CHECKSUM_ALGORITHM_CRC32:
+	case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+		/* In other cases we write the same checksum to both fields. */
+		checksum = buf_calc_page_crc32(page);
+		mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
+				checksum);
+		break;
+	case SRV_CHECKSUM_ALGORITHM_NONE:
+	case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+		mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
+				checksum);
+		break;
+		/* no default so the compiler will emit a warning if
+		new enum is added and not handled here */
+	}
+
+	mach_write_to_4(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM,
+			checksum);
+}
+
+/** Reserve a buffer for compression, unless slot->comp_buf is already set.
+@param[in,out]  slot    reserved slot */
+static void buf_tmp_reserve_compression_buf(buf_tmp_buffer_t* slot)
+{
+  if (slot->comp_buf)
+    return;
+  /* Both Snappy and LZO compression methods require that the output
+  buffer be bigger than input buffer. Adjust the allocated size. */
+  ulint size= srv_page_size;
+#ifdef HAVE_LZO
+  size+= LZO1X_1_15_MEM_COMPRESS;
+#elif defined HAVE_SNAPPY
+  size= snappy_max_compressed_length(size);
+#endif
+  slot->comp_buf= static_cast<byte*>(aligned_malloc(size, srv_page_size));
+}
+
+/** Encrypt a page of the temporary tablespace
+@param[in]      offset  Page offset
+@param[in]      s       Page to encrypt
+@param[in,out]  d       Output buffer
+@return d (the encrypted buffer), or NULL if the encryption failed */
+static byte* buf_tmp_page_encrypt(ulint offset, const byte* s, byte* d)
+{
+  /* Length to encrypt: the page minus the leading header and the checksum */
+  uint srclen= static_cast<uint>(srv_page_size) -
+    (FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION +
+     FIL_PAGE_FCRC32_CHECKSUM);
+  const byte* src= s + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION;
+  byte* dst= d + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION;
+
+  memcpy(d, s, FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION); /* copied as is */
+
+  if (!log_tmp_block_encrypt(src, srclen, dst, (offset * srv_page_size), true))
+    return NULL;
+
+  const ulint payload= srv_page_size - FIL_PAGE_FCRC32_CHECKSUM;
+  mach_write_to_4(d + payload, ut_crc32(d, payload));
+
+  srv_stats.pages_encrypted.inc();
+  srv_stats.n_temp_blocks_encrypted.inc();
+  return d;
+}
+
+/** Encryption and page_compression hook that is called just before
+a page is written to disk.
+@param[in,out]  space   tablespace
+@param[in,out]  bpage   buffer page; bpage->slot is set if a copy is made
+@param[in]      s       physical page frame that is being encrypted
+@param[in,out]  size    payload size in bytes; set if the page was compressed
+@return page frame to be written to file
+(may be s or an encrypted/compressed copy of it) */
+static byte *buf_page_encrypt(fil_space_t* space, buf_page_t* bpage, byte* s,
+                              size_t *size)
+{
+  ut_ad(bpage->status != buf_page_t::FREED);
+  ut_ad(space->id == bpage->id().space());
+
+  ut_d(fil_page_type_validate(space, s));
+  const uint32_t page_no= bpage->id().page_no();
+
+  switch (page_no) {
+  case TRX_SYS_PAGE_NO:
+    if (bpage->id().space() != TRX_SYS_SPACE)
+      break;
+    /* The TRX_SYS page is neither encrypted nor compressed, because
+    it contains the address of the doublewrite buffer. */
+    /* fall through */
+  case 0:
+    /* Page 0 of a tablespace is not encrypted/compressed */
+    return s;
+  }
+
+  fil_space_crypt_t *crypt_data= space->crypt_data;
+  bool encrypted, page_compressed;
+  if (space->purpose == FIL_TYPE_TEMPORARY)
+  {
+    ut_ad(!crypt_data);
+    encrypted= innodb_encrypt_temporary_tables;
+    page_compressed= false;
+  }
+  else
+  {
+    encrypted= crypt_data && !crypt_data->not_encrypted() &&
+      crypt_data->type != CRYPT_SCHEME_UNENCRYPTED &&
+      (!crypt_data->is_default_encryption() || srv_encrypt_tables);
+    page_compressed= space->is_compressed();
+  }
+
+  const bool full_crc32= space->full_crc32();
+
+  if (!encrypted && !page_compressed)
+  {
+    /* No need to encrypt or compress. Clear key-version & crypt-checksum. */
+    static_assert(FIL_PAGE_FCRC32_KEY_VERSION % 4 == 0, "alignment");
+    static_assert(FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION % 4 == 2,
+                  "not perfect alignment");
+    if (full_crc32)
+      memset_aligned<4>(s + FIL_PAGE_FCRC32_KEY_VERSION, 0, 4);
+    else
+      memset_aligned<2>(s + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8);
+    return s;
+  }
+
+  static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment");
+  static_assert(FIL_PAGE_LSN % 8 == 0, "alignment");
+  if (full_crc32)
+    memcpy_aligned<4>(s + srv_page_size - FIL_PAGE_FCRC32_END_LSN,
+                      FIL_PAGE_LSN + 4 + s, 4);
+
+  ut_ad(!bpage->zip_size() || !page_compressed);
+  /* Reserve a slot from the temporary memory array */
+  buf_tmp_buffer_t *slot= buf_pool.io_buf_reserve();
+  ut_a(slot);
+  slot->allocate();
+  slot->out_buf= NULL;
+  bpage->slot= slot;
+
+  byte *d= slot->crypt_buf;
+
+  if (!page_compressed)
+  {
+not_compressed: /* also reached by goto when fil_page_compress() returns 0 */
+    byte *tmp= space->purpose == FIL_TYPE_TEMPORARY
+      ? buf_tmp_page_encrypt(page_no, s, d)
+      : fil_space_encrypt(space, page_no, s, d);
+
+    slot->out_buf= d= tmp;
+
+    ut_d(fil_page_type_validate(space, tmp));
+  }
+  else
+  {
+    ut_ad(space->purpose != FIL_TYPE_TEMPORARY);
+    /* First we compress the page content */
+    buf_tmp_reserve_compression_buf(slot);
+    byte *tmp= slot->comp_buf;
+    ulint len= fil_page_compress(s, tmp, space->flags,
+                                 fil_space_get_block_size(space, page_no),
+                                 encrypted);
+
+    if (!len)
+      goto not_compressed;
+
+    *size= len;
+
+    if (full_crc32)
+    {
+      ut_d(bool compressed = false);
+      len= buf_page_full_crc32_size(tmp,
+#ifdef UNIV_DEBUG
+                                    &compressed,
+#else
+                                    NULL,
+#endif
+                                    NULL);
+      ut_ad(compressed);
+    }
+
+    /* Workaround for MDEV-15527. */
+    memset(tmp + len, 0 , srv_page_size - len);
+    ut_d(fil_page_type_validate(space, tmp));
+
+    if (encrypted)
+      tmp = fil_space_encrypt(space, page_no, tmp, d);
+
+    if (full_crc32)
+    {
+      static_assert(FIL_PAGE_FCRC32_CHECKSUM == 4, "alignment");
+      mach_write_to_4(tmp + len - 4, ut_crc32(tmp, len - 4));
+      ut_ad(!buf_page_is_corrupted(true, tmp, space->flags));
+    }
+
+    slot->out_buf= d= tmp;
+  }
+
+  ut_d(fil_page_type_validate(space, d));
+  return d;
+}
+
+/** Free a page whose underlying file page has been freed; acquires mutex. */
+inline void buf_pool_t::release_freed_page(buf_page_t *bpage)
+{
+  ut_ad(bpage->in_file());
+  const bool uncompressed= bpage->state() == BUF_BLOCK_FILE_PAGE;
+  mysql_mutex_lock(&mutex);
+  bpage->set_io_fix(BUF_IO_NONE);
+  bpage->status= buf_page_t::NORMAL;
+  mysql_mutex_lock(&flush_list_mutex);
+  ut_d(const lsn_t oldest_modification= bpage->oldest_modification();)
+  if (fsp_is_system_temporary(bpage->id().space()))
+  {
+    ut_ad(uncompressed); /* no flush_list removal for the temporary space */
+    ut_ad(oldest_modification == 2);
+  }
+  else
+  {
+    ut_ad(oldest_modification > 2);
+    delete_from_flush_list(bpage, false); /* cleared right below instead */
+  }
+  bpage->clear_oldest_modification();
+  mysql_mutex_unlock(&flush_list_mutex);
+
+  if (uncompressed)
+    rw_lock_sx_unlock_gen(&reinterpret_cast<buf_block_t*>(bpage)->lock,
+                          BUF_IO_WRITE);
+
+  buf_LRU_free_page(bpage, true);
+  mysql_mutex_unlock(&mutex);
+}
+
+/** Write a flushable page from buf_pool to a file.
+buf_pool.mutex must be held by the caller.
+
+If the page has an uncompressed frame (BUF_BLOCK_FILE_PAGE), its block
+latch is requested in SX mode without waiting; if that fails, nothing is
+done and false is returned with buf_pool.mutex still held. Otherwise the
+page is io-fixed for writing, buf_pool.mutex is released, and one of the
+following happens:
+- a page whose status is buf_page_t::FREED is handed to
+  buf_pool_t::release_freed_page() without being written;
+- the page is submitted directly via fil_space_t::io() if its status is
+  not NORMAL or the tablespace does not use the doublewrite buffer;
+- otherwise the page is added to the doublewrite batch.
+@param bpage       buffer control block
+@param lru         true=buf_pool.LRU; false=buf_pool.flush_list
+@param space       tablespace
+@return whether the page was flushed and buf_pool.mutex was released */
+static bool buf_flush_page(buf_page_t *bpage, bool lru, fil_space_t *space)
+{
+  ut_ad(bpage->in_file());
+  ut_ad(bpage->ready_for_flush());
+  ut_ad((space->purpose == FIL_TYPE_TEMPORARY) ==
+        (space == fil_system.temp_space));
+  ut_ad(space->purpose == FIL_TYPE_TABLESPACE ||
+        space->atomic_write_supported);
+  ut_ad(space->referenced());
+  ut_ad(lru || space != fil_system.temp_space);
+
+  rw_lock_t *rw_lock;
+
+  if (bpage->state() != BUF_BLOCK_FILE_PAGE)
+    rw_lock= nullptr;
+  else
+  {
+    rw_lock= &reinterpret_cast<buf_block_t*>(bpage)->lock;
+    if (!rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE))
+      return false; /* the latch is busy; buf_pool.mutex is still held */
+  }
+
+  bpage->set_io_fix(BUF_IO_WRITE);
+  /* Because bpage->status can only be changed while buf_block_t
+  exists, it cannot be modified for ROW_FORMAT=COMPRESSED pages
+  without first allocating the uncompressed page frame. Such
+  allocation cannot be completed due to our io_fix. So, bpage->status
+  is protected even if !rw_lock. */
+  const auto status= bpage->status;
+
+  if (status != buf_page_t::FREED)
+  {
+    if (lru)
+      buf_pool.n_flush_LRU_++;
+    else
+      buf_pool.n_flush_list_++;
+    buf_flush_page_count++;
+  }
+
+  mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
+
+  /* We are holding rw_lock = buf_block_t::lock in SX mode except if
+  this is a ROW_FORMAT=COMPRESSED page whose uncompressed page frame
+  has been evicted from the buffer pool.
+
+  Apart from possible rw_lock protection, bpage is also protected by
+  io_fix and oldest_modification()!=0. Thus, it cannot be relocated in
+  the buffer pool or removed from flush_list or LRU_list. */
+
+  DBUG_PRINT("ib_buf", ("%s %u page %u:%u",
+                        lru ? "LRU" : "flush_list",
+                        bpage->id().space(), bpage->id().page_no()));
+  ut_ad(bpage->io_fix() == BUF_IO_WRITE);
+  ut_d(const lsn_t oldest_modification= bpage->oldest_modification());
+  ut_ad(space == fil_system.temp_space
+        ? oldest_modification == 2
+        : oldest_modification > 2);
+  ut_ad(bpage->state() ==
+        (rw_lock ? BUF_BLOCK_FILE_PAGE : BUF_BLOCK_ZIP_PAGE));
+  ut_ad(ULINT_UNDEFINED >
+        (lru ? buf_pool.n_flush_LRU_ : buf_pool.n_flush_list_));
+  mysql_mutex_unlock(&buf_pool.mutex);
+
+  buf_block_t *block= reinterpret_cast<buf_block_t*>(bpage);
+  page_t *frame= bpage->zip.data;
+
+  if (status == buf_page_t::FREED)
+    buf_pool.release_freed_page(&block->page);
+  else
+  {
+    space->reacquire();
+    ut_ad(status == buf_page_t::NORMAL || status == buf_page_t::INIT_ON_FLUSH);
+    size_t size;
+#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
+    size_t orig_size;
+#endif
+    IORequest::Type type= lru ? IORequest::WRITE_LRU : IORequest::WRITE_ASYNC;
+
+    if (UNIV_UNLIKELY(!rw_lock)) /* ROW_FORMAT=COMPRESSED */
+    {
+      ut_ad(!space->full_crc32());
+      ut_ad(!space->is_compressed()); /* not page_compressed */
+      size= bpage->zip_size();
+#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
+      orig_size= size;
+#endif
+      buf_flush_update_zip_checksum(frame, size);
+      frame= buf_page_encrypt(space, bpage, frame, &size);
+      ut_ad(size == bpage->zip_size());
+    }
+    else
+    {
+      byte *page= block->frame;
+      size= block->physical_size();
+#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
+      orig_size= size;
+#endif
+
+      if (space->full_crc32())
+      {
+        /* innodb_checksum_algorithm=full_crc32 is not implemented for
+        ROW_FORMAT=COMPRESSED pages. */
+        ut_ad(!frame);
+        page= buf_page_encrypt(space, bpage, page, &size);
+        buf_flush_init_for_writing(block, page, nullptr, true);
+      }
+      else
+      {
+        buf_flush_init_for_writing(block, page, frame ? &bpage->zip : nullptr,
+                                   false);
+        page= buf_page_encrypt(space, bpage, frame ? frame : page, &size);
+      }
+
+#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
+      if (size != orig_size && space->punch_hole)
+        type= lru ? IORequest::PUNCH_LRU : IORequest::PUNCH;
+#endif
+      frame=page;
+    }
+
+    ut_ad(status == bpage->status);
+    ut_ad(oldest_modification == bpage->oldest_modification());
+
+    if (status != buf_page_t::NORMAL || !space->use_doublewrite())
+    {
+      if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE))
+      {
+        const lsn_t lsn= mach_read_from_8(my_assume_aligned<8>
+                                          (FIL_PAGE_LSN + (frame ? frame
+                                                           : block->frame)));
+        ut_ad(lsn >= oldest_modification);
+        if (lsn > log_sys.get_flushed_lsn())
+          log_write_up_to(lsn, true); /* write the redo log up to the
+          page LSN before submitting the page write */
+      }
+      space->io(IORequest(type, bpage),
+                bpage->physical_offset(), size, frame, bpage);
+    }
+    else
+      buf_dblwr.add_to_batch(IORequest(bpage, space->chain.start, type), size);
+  }
+
+  /* Increment the I/O operation count used for selecting LRU policy. */
+  buf_LRU_stat_inc_io();
+  return true;
+}
+
+/** Determine whether a page in the buf_pool is eligible for flushing as a
+neighbor. The caller must hold buf_pool.mutex.
+@param id          page identifier
+@param fold        id.fold()
+@param lru         true=buf_pool.LRU; false=buf_pool.flush_list
+@return whether the page can be flushed */
+static bool buf_flush_check_neighbor(const page_id_t id, ulint fold, bool lru)
+{
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+  ut_ad(fold == id.fold());
+
+  buf_page_t *candidate= buf_pool.page_hash_get_low(id, fold);
+
+  /* The page must exist and not be a watch sentinel. In an LRU flush,
+  'non-old' blocks are not flushed, because the flushed blocks are soon
+  freed. */
+  if (candidate && !buf_pool.watch_is_sentinel(*candidate) &&
+      (!lru || candidate->is_old()))
+    return candidate->oldest_modification() > 1 &&
+      candidate->ready_for_flush();
+
+  return false;
+}
+
+/** Check which neighbors of a page can be flushed from the buf_pool.
+
+The flush area is the aligned group of buf_flush_area pages that contains
+id, where buf_flush_area is the smaller of buf_pool.read_ahead_area and
+buf_pool.curr_size / 16; its upper bound is limited by
+space.last_page_number(). If !contiguous, that whole area (extended so
+that it covers id itself) is returned. Otherwise the range is narrowed,
+while holding buf_pool.mutex, to the pages adjacent to id for which
+buf_flush_check_neighbor() holds.
+@param space       tablespace
+@param id          page identifier of a dirty page; on return, the first
+                   page of the range to consider
+@param contiguous  whether to consider contiguous areas of pages
+@param lru         true=buf_pool.LRU; false=buf_pool.flush_list
+@return upper bound of the range (buf_flush_try_neighbors() iterates
+while id is less than this value) */
+static page_id_t buf_flush_check_neighbors(const fil_space_t &space,
+                                           page_id_t &id, bool contiguous,
+                                           bool lru)
+{
+  ut_ad(id.page_no() < space.size);
+  /* When flushed, dirty blocks are searched in neighborhoods of this
+  size, and flushed along with the original page. */
+  const ulint s= buf_pool.curr_size / 16;
+  const uint32_t read_ahead= buf_pool.read_ahead_area;
+  const uint32_t buf_flush_area= read_ahead > s
+    ? static_cast<uint32_t>(s) : read_ahead;
+  page_id_t low= id - (id.page_no() % buf_flush_area);
+  page_id_t high= low + buf_flush_area;
+  high.set_page_no(std::min(high.page_no(), space.last_page_number()));
+
+  if (!contiguous)
+  {
+    high= std::max(id + 1, high);
+    id= low;
+    return high;
+  }
+
+  /* Determine the contiguous dirty area around id. */
+  const ulint id_fold= id.fold();
+
+  mysql_mutex_lock(&buf_pool.mutex);
+
+  if (id > low)
+  {
+    ulint fold= id_fold;
+    for (page_id_t i= id - 1;; --i)
+    {
+      fold--; /* stepped together with i; buf_flush_check_neighbor()
+      asserts fold == i.fold() in debug builds */
+      if (!buf_flush_check_neighbor(i, fold, lru))
+      {
+        low= i + 1;
+        break;
+      }
+      if (i == low)
+        break;
+    }
+  }
+
+  page_id_t i= id;
+  id= low;
+  ulint fold= id_fold;
+  while (++i < high)
+  {
+    ++fold;
+    if (!buf_flush_check_neighbor(i, fold, lru))
+      break;
+  }
+
+  mysql_mutex_unlock(&buf_pool.mutex);
+  return i;
+}
+
+MY_ATTRIBUTE((nonnull))
+/** Punch holes for, or overwrite with zeroes, the ranges of pages that
+have been freed in a tablespace. This is done only if the tablespace has
+punch_hole set or innodb_immediate_scrub_data_uncompressed is enabled.
+Nothing is submitted while the set of freed ranges is empty or while the
+flushed log LSN is less than space->get_last_freed_lsn().
+@param space   tablespace which may contain ranges of freed pages */
+static void buf_flush_freed_pages(fil_space_t *space)
+{
+  const bool punch_hole= space->punch_hole;
+  if (!punch_hole && !srv_immediate_scrub_data_uncompressed)
+    return;
+  const lsn_t flushed_lsn= log_sys.get_flushed_lsn();
+
+  std::unique_lock<std::mutex> guard(space->freed_range_mutex);
+  if (space->freed_ranges.empty() ||
+      flushed_lsn < space->get_last_freed_lsn())
+    return; /* the destructor of guard releases the mutex */
+
+  /* Take over the ranges so that the I/O can be submitted without
+  holding freed_range_mutex. */
+  range_set pending= std::move(space->freed_ranges);
+  guard.unlock();
+
+  for (const auto &range : pending)
+  {
+    const ulint page_size= space->physical_size();
+    const auto n_pages= range.last - range.first + 1;
+
+    if (!punch_hole)
+    {
+      if (srv_immediate_scrub_data_uncompressed)
+      {
+        /* Overwrite each freed page with zeroes. */
+        for (os_offset_t page_no= range.first; page_no <= range.last;
+             page_no++)
+        {
+          space->reacquire();
+          space->io(IORequest(IORequest::WRITE_ASYNC),
+                    page_no * page_size, page_size,
+                    const_cast<byte*>(field_ref_zero));
+        }
+      }
+    }
+    else
+    {
+      /* One request covers the whole range. */
+      space->reacquire();
+      space->io(IORequest(IORequest::PUNCH_RANGE),
+                          os_offset_t{range.first} * page_size,
+                          n_pages * page_size,
+                          nullptr);
+    }
+    buf_pool.stat.n_pages_written+= n_pages;
+  }
+}
+
+/** Flushes to disk all flushable pages within the flush area
+and also write zeroes or punch the hole for the freed ranges of pages.
+
+The range to flush is determined by buf_flush_check_neighbors(). The
+caller must not hold buf_pool.mutex: it is acquired for each page, and
+released either here or by a successful buf_flush_page(). Iteration stops
+early if the tablespace is being stopped. Once the n_to_flush budget is
+exhausted, only the originally selected page may still be flushed.
+@param space       tablespace
+@param page_id     page identifier
+@param contiguous  whether to consider contiguous areas of pages
+@param lru         true=buf_pool.LRU; false=buf_pool.flush_list
+@param n_flushed   number of pages flushed so far in this batch
+@param n_to_flush  maximum number of pages we are allowed to flush
+@return number of pages flushed */
+static ulint buf_flush_try_neighbors(fil_space_t *space,
+                                     const page_id_t page_id,
+                                     bool contiguous, bool lru,
+                                     ulint n_flushed, ulint n_to_flush)
+{
+  ut_ad(space->id == page_id.space());
+
+  ulint count= 0;
+  page_id_t id= page_id;
+  page_id_t high= buf_flush_check_neighbors(*space, id, contiguous, lru);
+
+  ut_ad(page_id >= id);
+  ut_ad(page_id < high);
+
+  for (ulint id_fold= id.fold(); id < high && !space->is_stopping();
+       ++id, ++id_fold)
+  {
+    if (count + n_flushed >= n_to_flush)
+    {
+      if (id > page_id)
+        break;
+      /* If the page whose neighbors we are flushing has not been
+      flushed yet, we must flush the page that we selected originally. */
+      id= page_id;
+      id_fold= id.fold();
+    }
+
+    mysql_mutex_lock(&buf_pool.mutex);
+
+    if (buf_page_t *bpage= buf_pool.page_hash_get_low(id, id_fold))
+    {
+      ut_ad(bpage->in_file());
+      /* We avoid flushing 'non-old' blocks in an LRU flush,
+      because the flushed blocks are soon freed */
+      if (!lru || id == page_id || bpage->is_old())
+      {
+        if (!buf_pool.watch_is_sentinel(*bpage) &&
+            bpage->oldest_modification() > 1 &&
+            bpage->ready_for_flush() && buf_flush_page(bpage, lru, space))
+        {
+          /* buf_flush_page() released buf_pool.mutex. */
+          ++count;
+          continue;
+        }
+      }
+    }
+
+    mysql_mutex_unlock(&buf_pool.mutex);
+  }
+
+  /* Account only for the neighbors, that is, the pages flushed in
+  addition to the first one. The count may be 0 (for example, if the
+  tablespace is being stopped or the page could not be latched), in
+  which case count - 1 would wrap around; hence the explicit check. */
+  if (count > 1)
+  {
+    MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
+                                 MONITOR_FLUSH_NEIGHBOR_COUNT,
+                                 MONITOR_FLUSH_NEIGHBOR_PAGES, count - 1);
+  }
+
+  return count;
+}
+
+/*******************************************************************//**
+Move uncompressed frames of pages to the free list.
+No data is written to disk here: the uncompressed frames are merely
+detached from the compressed pages at the tail of buf_pool.unzip_LRU and
+put in the free list. This is a best effort attempt; there is no
+guarantee that there will be 'max' blocks in the free list afterwards.
+The caller must hold buf_pool.mutex.
+@param[in]	max		desired number of blocks in the free_list
+@return number of blocks moved to the free list. */
+static ulint buf_free_from_unzip_LRU_list_batch(ulint max)
+{
+	ulint		n_scanned = 0;
+	ulint		n_freed = 0;
+
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+
+	for (buf_block_t* cur = UT_LIST_GET_LAST(buf_pool.unzip_LRU);
+	     cur
+	     && n_freed < max
+	     && UT_LIST_GET_LEN(buf_pool.free) < srv_LRU_scan_depth
+	     && UT_LIST_GET_LEN(buf_pool.unzip_LRU)
+	     > UT_LIST_GET_LEN(buf_pool.LRU) / 10;
+	     ++n_scanned) {
+
+		if (!buf_LRU_free_page(&cur->page, false)) {
+			/* Could not free this one; try its predecessor. */
+			cur = UT_LIST_GET_PREV(unzip_LRU, cur);
+			continue;
+		}
+
+		/* Block was freed. buf_pool.mutex potentially
+		released and reacquired; restart from the tail. */
+		++n_freed;
+		cur = UT_LIST_GET_LAST(buf_pool.unzip_LRU);
+	}
+
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+
+	if (n_scanned) {
+		MONITOR_INC_VALUE_CUMULATIVE(
+			MONITOR_LRU_BATCH_SCANNED,
+			MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
+			MONITOR_LRU_BATCH_SCANNED_PER_CALL,
+			n_scanned);
+	}
+
+	return(n_freed);
+}
+
+/** Look up a tablespace in preparation for writing out its pages, and
+process its freed page ranges via buf_flush_freed_pages().
+@param id   tablespace identifier
+@return tablespace
+@retval nullptr if the pages for this tablespace should be discarded */
+static fil_space_t *buf_flush_space(const uint32_t id)
+{
+  if (fil_space_t *found= fil_space_t::get(id))
+  {
+    buf_flush_freed_pages(found);
+    return found;
+  }
+
+  return nullptr;
+}
+
+/** Page counts accumulated by an LRU flush batch */
+struct flush_counters_t
+{
+  ulint flushed; /*!< number of dirty pages flushed */
+  ulint evicted; /*!< number of clean pages evicted */
+};
+
+/** Attempt to drop a dirty page without writing it.
+The caller must hold buf_pool.mutex and must not hold
+buf_pool.flush_list_mutex. If the page has an uncompressed frame and its
+block latch cannot be acquired in SX mode without waiting, nothing is done.
+@param bpage      dirty page whose tablespace is not accessible */
+static void buf_flush_discard_page(buf_page_t *bpage)
+{
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+  mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
+  ut_ad(bpage->in_file());
+  ut_ad(bpage->oldest_modification());
+
+  rw_lock_t *latch= nullptr;
+
+  if (bpage->state() == BUF_BLOCK_FILE_PAGE)
+  {
+    latch= &reinterpret_cast<buf_block_t*>(bpage)->lock;
+    if (!rw_lock_sx_lock_nowait(latch, 0))
+      return;
+  }
+
+  bpage->status= buf_page_t::NORMAL;
+
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+  buf_pool.delete_from_flush_list(bpage);
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+  if (latch)
+    rw_lock_sx_unlock(latch);
+
+  buf_LRU_free_page(bpage, true);
+}
+
+/** Flush dirty blocks from the end of the LRU list.
+
+buf_pool.mutex must be held by the caller; it may be released and
+reacquired while pages are being written. The scan starts from the tail
+of buf_pool.LRU and uses buf_pool.lru_hp to remember the predecessor of
+the current block. Clean blocks that can be relocated are evicted. Dirty
+blocks that are ready for flush are written (together with their
+neighbors if that is enabled and the tablespace is on rotational
+storage), or discarded if buf_flush_space() returned nullptr for their
+tablespace or the tablespace is being stopped.
+@param max   maximum number of blocks to make available in buf_pool.free
+@param n     counts of flushed and evicted pages */
+static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n)
+{
+  ulint scanned= 0;
+  ulint free_limit= srv_LRU_scan_depth;
+
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+  if (buf_pool.withdraw_target && buf_pool.curr_size < buf_pool.old_size)
+    free_limit+= buf_pool.withdraw_target - UT_LIST_GET_LEN(buf_pool.withdraw);
+
+  const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN
+    ? 0 : srv_flush_neighbors;
+  fil_space_t *space= nullptr;
+  uint32_t last_space_id= FIL_NULL;
+  static_assert(FIL_NULL > SRV_TMP_SPACE_ID, "consistency");
+  static_assert(FIL_NULL > SRV_SPACE_ID_UPPER_BOUND, "consistency");
+
+  for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.LRU);
+       bpage && n->flushed + n->evicted < max &&
+       UT_LIST_GET_LEN(buf_pool.LRU) > BUF_LRU_MIN_LEN &&
+       UT_LIST_GET_LEN(buf_pool.free) < free_limit; ++scanned)
+  {
+    buf_page_t *prev= UT_LIST_GET_PREV(LRU, bpage);
+    const lsn_t oldest_modification= bpage->oldest_modification();
+    buf_pool.lru_hp.set(prev);
+
+    if (oldest_modification <= 1 && bpage->can_relocate())
+    {
+      /* block is ready for eviction i.e., it is clean and is not
+      IO-fixed or buffer fixed. */
+      if (buf_LRU_free_page(bpage, true))
+        ++n->evicted;
+    }
+    else if (oldest_modification > 1 && bpage->ready_for_flush())
+    {
+      /* Block is ready for flush. Dispatch an IO request. The IO
+      helper thread will put it on free list in IO completion routine. */
+      const page_id_t page_id(bpage->id());
+      const uint32_t space_id= page_id.space();
+      if (!space || space->id != space_id)
+      {
+        if (last_space_id != space_id)
+        {
+          if (space)
+            space->release();
+          space= buf_flush_space(space_id);
+          last_space_id= space_id;
+        }
+        else
+          ut_ad(!space); /* this tablespace was already found unavailable */
+      }
+      else if (space->is_stopping())
+      {
+        space->release();
+        space= nullptr;
+      }
+
+      if (!space)
+        buf_flush_discard_page(bpage);
+      else if (neighbors && space->is_rotational())
+      {
+        mysql_mutex_unlock(&buf_pool.mutex);
+        n->flushed+= buf_flush_try_neighbors(space, page_id, neighbors == 1,
+                                             true, n->flushed, max);
+reacquire_mutex:
+        mysql_mutex_lock(&buf_pool.mutex);
+      }
+      else if (buf_flush_page(bpage, true, space))
+      {
+        ++n->flushed;
+        goto reacquire_mutex; /* buf_flush_page() released buf_pool.mutex */
+      }
+    }
+    else
+      /* Can't evict or dispatch this block. Go to previous. */
+      ut_ad(buf_pool.lru_hp.is_hp(prev));
+    bpage= buf_pool.lru_hp.get();
+  }
+
+  buf_pool.lru_hp.set(nullptr);
+
+  if (space)
+    space->release();
+
+  /* We keep track of all flushes happening as part of LRU flush. When
+  estimating the desired rate at which flush_list should be flushed,
+  we factor in this value. */
+  buf_lru_flush_page_count+= n->flushed;
+
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+
+  if (scanned)
+    MONITOR_INC_VALUE_CUMULATIVE(MONITOR_LRU_BATCH_SCANNED,
+                                 MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
+                                 MONITOR_LRU_BATCH_SCANNED_PER_CALL,
+                                 scanned);
+}
+
+/** Flush and move pages from LRU or unzip_LRU list to the free list.
+If buf_LRU_evict_from_unzip_LRU() holds, uncompressed frames are first
+freed from buf_pool.unzip_LRU; after that, buf_pool.LRU is always
+processed.
+@param max   maximum number of blocks to make available in buf_pool.free
+@return number of flushed pages */
+static ulint buf_do_LRU_batch(ulint max)
+{
+  ulint unzip_evicted= 0;
+  if (buf_LRU_evict_from_unzip_LRU())
+    unzip_evicted= buf_free_from_unzip_LRU_list_batch(max);
+
+  /* The unzip_LRU evictions count against the same 'max' budget. */
+  flush_counters_t counters;
+  counters.evicted= unzip_evicted;
+  counters.flushed= 0;
+  buf_flush_LRU_list_batch(max, &counters);
+
+  /* Report only the evictions that were made from buf_pool.LRU. */
+  const ulint lru_evicted= counters.evicted - unzip_evicted;
+  if (lru_evicted)
+  {
+    MONITOR_INC_VALUE_CUMULATIVE(MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
+                                 MONITOR_LRU_BATCH_EVICT_COUNT,
+                                 MONITOR_LRU_BATCH_EVICT_PAGES,
+                                 lru_evicted);
+  }
+
+  return counters.flushed;
+}
+
+/** This utility flushes dirty blocks from the end of the flush_list.
+The calling thread is not allowed to own any latches on pages!
+
+buf_pool.mutex must be held by the caller; both it and
+buf_pool.flush_list_mutex may be released and reacquired during the
+scan. Blocks whose oldest_modification() is 1 are merely removed from
+buf_pool.flush_list, and blocks that are not ready for flush are skipped.
+@param max_n    maximum number of blocks to flush
+@param lsn      once an oldest_modification>=lsn is found, terminate the batch
+@return number of blocks for which the write request was queued */
+static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn)
+{
+  ulint count= 0;
+  ulint scanned= 0;
+
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+
+  const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN
+    ? 0 : srv_flush_neighbors;
+  fil_space_t *space= nullptr;
+  uint32_t last_space_id= FIL_NULL;
+  static_assert(FIL_NULL > SRV_TMP_SPACE_ID, "consistency");
+  static_assert(FIL_NULL > SRV_SPACE_ID_UPPER_BOUND, "consistency");
+
+  /* Start from the end of the list looking for a suitable block to be
+  flushed. */
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+  ulint len= UT_LIST_GET_LEN(buf_pool.flush_list);
+
+  for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list);
+       bpage && len && count < max_n; ++scanned, len--)
+  {
+    const lsn_t oldest_modification= bpage->oldest_modification();
+    if (oldest_modification >= lsn)
+      break;
+    ut_ad(bpage->in_file());
+
+    buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);
+
+    if (oldest_modification == 1)
+    {
+      buf_pool.delete_from_flush_list(bpage);
+    skip:
+      bpage= prev;
+      continue;
+    }
+
+    ut_ad(oldest_modification > 2);
+    ut_ad(bpage->in_file());
+
+    if (!bpage->ready_for_flush())
+      goto skip;
+
+    /* In order not to degenerate this scan to O(n*n) we attempt to
+    preserve the pointer position. Any thread that would remove 'prev'
+    from buf_pool.flush_list must adjust the hazard pointer.
+
+    Note: A concurrent execution of buf_flush_list_space() may
+    terminate this scan prematurely. The buf_pool.n_flush_list()
+    should prevent multiple threads from executing
+    buf_do_flush_list_batch() concurrently,
+    but buf_flush_list_space() is ignoring that. */
+    buf_pool.flush_hp.set(prev);
+    mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+    const page_id_t page_id(bpage->id());
+    const uint32_t space_id= page_id.space();
+    if (!space || space->id != space_id)
+    {
+      if (last_space_id != space_id)
+      {
+        if (space)
+          space->release();
+        space= buf_flush_space(space_id);
+        last_space_id= space_id;
+      }
+      else
+        ut_ad(!space); /* this tablespace was already found unavailable */
+    }
+    else if (space->is_stopping())
+    {
+      space->release();
+      space= nullptr;
+    }
+
+    if (!space)
+      buf_flush_discard_page(bpage);
+    else if (neighbors && space->is_rotational())
+    {
+      mysql_mutex_unlock(&buf_pool.mutex);
+      count+= buf_flush_try_neighbors(space, page_id, neighbors == 1,
+                                      false, count, max_n);
+    reacquire_mutex:
+      mysql_mutex_lock(&buf_pool.mutex);
+    }
+    else if (buf_flush_page(bpage, false, space))
+    {
+      ++count;
+      goto reacquire_mutex; /* buf_flush_page() released buf_pool.mutex */
+    }
+
+    mysql_mutex_lock(&buf_pool.flush_list_mutex);
+    bpage= buf_pool.flush_hp.get();
+  }
+
+  buf_pool.flush_hp.set(nullptr);
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+  if (space)
+    space->release();
+
+  if (scanned)
+    MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_SCANNED,
+                                 MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
+                                 MONITOR_FLUSH_BATCH_SCANNED_PER_CALL,
+                                 scanned);
+  if (count)
+    MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_TOTAL_PAGE,
+                                 MONITOR_FLUSH_BATCH_COUNT,
+                                 MONITOR_FLUSH_BATCH_PAGES,
+                                 count);
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+  return count;
+}
+
+/** Wait for the pending writes of a flush batch to complete.
+The wait is performed on buf_pool.done_flush_LRU or
+buf_pool.done_flush_list in conjunction with buf_pool.mutex, until the
+corresponding counter of pending flushes reaches zero.
+@param lru    true=buf_pool.LRU; false=buf_pool.flush_list */
+void buf_flush_wait_batch_end(bool lru)
+{
+  const auto &pending= lru ? buf_pool.n_flush_LRU_ : buf_pool.n_flush_list_;
+
+  if (!pending)
+    return;
+
+  auto done= lru ? &buf_pool.done_flush_LRU : &buf_pool.done_flush_list;
+
+  tpool::tpool_wait_begin();
+  thd_wait_begin(nullptr, THD_WAIT_DISKIO);
+
+  for (;;)
+  {
+    my_cond_wait(done, &buf_pool.mutex.m_mutex);
+    if (!pending)
+      break;
+  }
+
+  tpool::tpool_wait_end();
+  thd_wait_end(nullptr);
+  /* Pass the wakeup on to any other waiters. */
+  pthread_cond_broadcast(done);
+}
+
+/** Write out dirty blocks from buf_pool.flush_list.
+Only one such batch may run at a time; if one is already in progress,
+nothing is done.
+@param max_n    wished maximum number of blocks flushed
+@param lsn      buf_pool.get_oldest_modification(LSN_MAX) target
+@return the number of processed pages
+@retval 0 if a buf_pool.flush_list batch is already running */
+ulint buf_flush_list(ulint max_n, lsn_t lsn)
+{
+  ut_ad(lsn);
+
+  /* Quick check before acquiring buf_pool.mutex */
+  if (buf_pool.n_flush_list())
+    return 0;
+
+  mysql_mutex_lock(&buf_pool.mutex);
+
+  if (buf_pool.n_flush_list_ != 0)
+  {
+    /* Another batch is running. */
+    mysql_mutex_unlock(&buf_pool.mutex);
+    return 0;
+  }
+
+  /* FIXME: we are performing a dirty read of buf_pool.flush_list.count
+  while not holding buf_pool.flush_list_mutex */
+  if (!UT_LIST_GET_LEN(buf_pool.flush_list))
+  {
+    /* Nothing to flush; wake up anyone waiting for a batch to end. */
+    pthread_cond_broadcast(&buf_pool.done_flush_list);
+    mysql_mutex_unlock(&buf_pool.mutex);
+    return 0;
+  }
+
+  buf_pool.n_flush_list_++;
+  const ulint n_flushed= buf_do_flush_list_batch(max_n, lsn);
+  const bool no_pending= !--buf_pool.n_flush_list_;
+
+  buf_pool.try_LRU_scan= true;
+
+  mysql_mutex_unlock(&buf_pool.mutex);
+
+  if (no_pending)
+    pthread_cond_broadcast(&buf_pool.done_flush_list);
+
+  buf_dblwr.flush_buffered_writes();
+
+  DBUG_PRINT("ib_buf", ("flush_list completed, " ULINTPF " pages", n_flushed));
+  return n_flushed;
+}
+
+/** Try to flush all the dirty pages that belong to a given tablespace.
+
+buf_pool.flush_list is scanned from its tail. Pages of other tablespaces
+are ignored, pages whose oldest_modification() is 1 are merely removed
+from the list, and pages that are not ready for flush are skipped. If
+the tablespace cannot be acquired or is being stopped, its pages are
+discarded instead of being written. At most srv_io_capacity pages are
+written in one call.
+@param space       tablespace
+@param n_flushed   number of pages written; if not nullptr, incremented
+                   for each page written
+@return whether the flush for some pages might not have been initiated */
+bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed)
+{
+  const auto space_id= space->id;
+  ut_ad(space_id <= SRV_SPACE_ID_UPPER_BOUND);
+
+  bool may_have_skipped= false;
+  ulint max_n_flush= srv_io_capacity;
+
+  mysql_mutex_lock(&buf_pool.mutex);
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+  bool acquired= space->acquire();
+  buf_flush_freed_pages(space);
+
+  for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); bpage; )
+  {
+    ut_d(const auto s= bpage->state());
+    ut_ad(s == BUF_BLOCK_ZIP_PAGE || s == BUF_BLOCK_FILE_PAGE ||
+          s == BUF_BLOCK_REMOVE_HASH);
+    ut_ad(bpage->oldest_modification());
+    ut_ad(bpage->in_file());
+
+    buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);
+    if (bpage->id().space() != space_id); /* not ours: nothing to do */
+    else if (bpage->oldest_modification() == 1)
+      buf_pool.delete_from_flush_list(bpage);
+    else if (!bpage->ready_for_flush())
+      may_have_skipped= true;
+    else
+    {
+      /* In order not to degenerate this scan to O(n*n) we attempt to
+      preserve the pointer position. Any thread that would remove 'prev'
+      from buf_pool.flush_list must adjust the hazard pointer.
+
+      Note: Multiple executions of buf_flush_list_space() may be
+      interleaved, and also buf_do_flush_list_batch() may be running
+      concurrently. This may terminate our iteration prematurely,
+      leading us to return may_have_skipped=true. */
+      buf_pool.flush_hp.set(prev);
+      mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+      if (!acquired)
+      {
+      was_freed:
+        buf_flush_discard_page(bpage);
+      }
+      else
+      {
+        if (space->is_stopping())
+        {
+          space->release();
+          acquired= false;
+          goto was_freed;
+        }
+        if (!buf_flush_page(bpage, false, space))
+        {
+          /* buf_pool.mutex is still held on failure. */
+          may_have_skipped= true;
+          mysql_mutex_lock(&buf_pool.flush_list_mutex);
+          goto next_after_skip;
+        }
+        if (n_flushed)
+          ++*n_flushed;
+        if (!--max_n_flush)
+        {
+          mysql_mutex_lock(&buf_pool.mutex);
+          mysql_mutex_lock(&buf_pool.flush_list_mutex);
+          may_have_skipped= true;
+          break;
+        }
+        mysql_mutex_lock(&buf_pool.mutex);
+      }
+
+      mysql_mutex_lock(&buf_pool.flush_list_mutex);
+      if (!buf_pool.flush_hp.is_hp(prev))
+        may_have_skipped= true;
+    next_after_skip:
+      bpage= buf_pool.flush_hp.get();
+      continue;
+    }
+
+    bpage= prev;
+  }
+
+  /* Note: this loop may have been executed concurrently with
+  buf_do_flush_list_batch() as well as other threads executing
+  buf_flush_list_space(). We should always return true from
+  buf_flush_list_space() if that should be the case; in
+  buf_do_flush_list_batch() we will simply perform less work. */
+
+  buf_pool.flush_hp.set(nullptr);
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+  buf_pool.try_LRU_scan= true;
+
+  mysql_mutex_unlock(&buf_pool.mutex);
+
+  if (acquired)
+    space->release();
+
+  if (space->purpose == FIL_TYPE_IMPORT)
+    os_aio_wait_until_no_pending_writes();
+  else
+    buf_dblwr.flush_buffered_writes();
+
+  return may_have_skipped;
+}
+
+/** Write out dirty blocks from buf_pool.LRU.
+The log buffer is flushed to disk before the batch is started. Only one
+such batch may run at a time; if one is already in progress, nothing is
+done.
+@param max_n    wished maximum number of blocks flushed
+@return the number of processed pages
+@retval 0 if a buf_pool.LRU batch is already running */
+ulint buf_flush_LRU(ulint max_n)
+{
+  /* Quick check before acquiring buf_pool.mutex */
+  if (buf_pool.n_flush_LRU())
+    return 0;
+
+  log_buffer_flush_to_disk(true);
+
+  mysql_mutex_lock(&buf_pool.mutex);
+
+  if (buf_pool.n_flush_LRU_ != 0)
+  {
+    /* Another batch was started in the meantime. */
+    mysql_mutex_unlock(&buf_pool.mutex);
+    return 0;
+  }
+
+  buf_pool.n_flush_LRU_++;
+  const ulint n_flushed= buf_do_LRU_batch(max_n);
+  const bool no_pending= !--buf_pool.n_flush_LRU_;
+
+  buf_pool.try_LRU_scan= true;
+
+  mysql_mutex_unlock(&buf_pool.mutex);
+
+  if (no_pending)
+  {
+    pthread_cond_broadcast(&buf_pool.done_flush_LRU);
+    pthread_cond_signal(&buf_pool.done_free);
+  }
+
+  buf_dblwr.flush_buffered_writes();
+
+  DBUG_PRINT("ib_buf", ("LRU flush completed, " ULINTPF " pages", n_flushed));
+  return n_flushed;
+}
+
+/** Initiate a log checkpoint, discarding the start of the log.
+
+log_sys.mutex must be held by the caller; it will have been released by
+the time this function returns. The mutex may also be released and
+reacquired temporarily while the log is being written.
+@param oldest_lsn   the checkpoint LSN
+@param end_lsn      log_sys.get_lsn()
+@return true if success, false if a checkpoint write was already running */
+static bool log_checkpoint_low(lsn_t oldest_lsn, lsn_t end_lsn)
+{
+  ut_ad(!srv_read_only_mode);
+  mysql_mutex_assert_owner(&log_sys.mutex);
+  ut_ad(oldest_lsn <= end_lsn);
+  ut_ad(end_lsn == log_sys.get_lsn());
+  ut_ad(!recv_no_log_write);
+
+  ut_ad(oldest_lsn >= log_sys.last_checkpoint_lsn);
+
+  if (oldest_lsn > log_sys.last_checkpoint_lsn + SIZE_OF_FILE_CHECKPOINT)
+    /* Some log has been written since the previous checkpoint. */;
+  else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED)
+    /* MariaDB startup expects the redo log file to be logically empty
+    (not even containing a FILE_CHECKPOINT record) after a clean shutdown.
+    Perform an extra checkpoint at shutdown. */;
+  else
+  {
+    /* Do nothing, because nothing was logged (other than a
+    FILE_CHECKPOINT record) since the previous checkpoint. */
+    mysql_mutex_unlock(&log_sys.mutex);
+    return true;
+  }
+
+  /* Repeat the FILE_MODIFY records after the checkpoint, in case some
+  log records between the checkpoint and log_sys.lsn need them.
+  Finally, write a FILE_CHECKPOINT record. Redo log apply expects to
+  see a FILE_CHECKPOINT after the checkpoint, except on clean
+  shutdown, where the log will be empty after the checkpoint.
+
+  It is important that we write out the redo log before any further
+  dirty pages are flushed to the tablespace files.  At this point,
+  because we hold log_sys.mutex, mtr_t::commit() in other threads will
+  be blocked, and no pages can be added to the flush lists. */
+  lsn_t flush_lsn= oldest_lsn;
+
+  if (fil_names_clear(flush_lsn, oldest_lsn != end_lsn ||
+                      srv_shutdown_state <= SRV_SHUTDOWN_INITIATED))
+  {
+    flush_lsn= log_sys.get_lsn();
+    ut_ad(flush_lsn >= end_lsn + SIZE_OF_FILE_CHECKPOINT);
+    mysql_mutex_unlock(&log_sys.mutex);
+    log_write_up_to(flush_lsn, true, true);
+    mysql_mutex_lock(&log_sys.mutex);
+    if (log_sys.last_checkpoint_lsn >= oldest_lsn)
+    {
+      /* The checkpoint advanced at least this far while
+      log_sys.mutex was released. */
+      mysql_mutex_unlock(&log_sys.mutex);
+      return true;
+    }
+  }
+  else
+    ut_ad(oldest_lsn >= log_sys.last_checkpoint_lsn);
+
+  ut_ad(log_sys.get_flushed_lsn() >= flush_lsn);
+
+  if (log_sys.n_pending_checkpoint_writes)
+  {
+    /* A checkpoint write is running */
+    mysql_mutex_unlock(&log_sys.mutex);
+    return false;
+  }
+
+  log_sys.next_checkpoint_lsn= oldest_lsn;
+  log_write_checkpoint_info(end_lsn);
+  mysql_mutex_assert_not_owner(&log_sys.mutex);
+
+  return true;
+}
+
+/** Make a checkpoint without flushing dirty blocks from the buffer pool.
+This only determines the LSN of the oldest modification in the pool and
+passes it, together with the current log LSN, to log_checkpoint_low().
+Use log_make_checkpoint() to flush also the pool.
+@retval true if the checkpoint was or had been made
+@retval false if a checkpoint write was already running */
+static bool log_checkpoint()
+{
+  if (recv_recovery_is_on())
+    recv_sys.apply(true);
+
+  /* The data files are flushed first, except with the flush methods
+  SRV_NOSYNC and SRV_O_DIRECT_NO_FSYNC. */
+  const auto flush_method= srv_file_flush_method;
+  if (flush_method != SRV_NOSYNC && flush_method != SRV_O_DIRECT_NO_FSYNC)
+    fil_flush_file_spaces();
+
+  mysql_mutex_lock(&log_sys.mutex);
+  const lsn_t log_end= log_sys.get_lsn();
+
+  mysql_mutex_lock(&log_sys.flush_order_mutex);
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+  const lsn_t oldest= buf_pool.get_oldest_modification(log_end);
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+  mysql_mutex_unlock(&log_sys.flush_order_mutex);
+
+  /* log_checkpoint_low() takes over log_sys.mutex. */
+  const bool done= log_checkpoint_low(oldest, log_end);
+  return done;
+}
+
+/** Make a checkpoint after waiting for pages up to the current LSN. */
+ATTRIBUTE_COLD void log_make_checkpoint()
+{
+  buf_flush_wait_flushed(log_sys.get_lsn(std::memory_order_acquire));
+  do {} while (!log_checkpoint()); /* retry while a write is running */
+}
+
+/** Wait until all persistent pages are flushed up to a limit.
+@param sync_lsn   buf_pool.get_oldest_modification(LSN_MAX) to wait for */
+ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn)
+{
+  ut_ad(sync_lsn);
+  ut_ad(sync_lsn < LSN_MAX);
+  mysql_mutex_assert_not_owner(&log_sys.mutex);
+  ut_ad(!srv_read_only_mode);
+
+  if (recv_recovery_is_on())
+    recv_sys.apply(true);
+
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+  /* Wait only while buf_pool reports a modification older than sync_lsn. */
+  if (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn)
+  {
+#if 1 /* FIXME: remove this, and guarantee that the page cleaner serves us */
+    if (UNIV_UNLIKELY(!buf_page_cleaner_is_active)
+        ut_d(|| innodb_page_cleaner_disabled_debug))
+    {
+      do
+      {
+        mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+        ulint n_pages= buf_flush_list(srv_max_io_capacity, sync_lsn);
+        buf_flush_wait_batch_end_acquiring_mutex(false);
+        if (n_pages)
+        {
+          MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE,
+                                       MONITOR_FLUSH_SYNC_COUNT,
+                                       MONITOR_FLUSH_SYNC_PAGES, n_pages);
+        }
+        MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
+        mysql_mutex_lock(&buf_pool.flush_list_mutex);
+      }
+      while (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn);
+      /* flush_list_mutex is held again here; skip the wait below. */
+      goto try_checkpoint;
+    }
+#endif
+    if (buf_flush_sync_lsn < sync_lsn)
+    {
+      buf_flush_sync_lsn= sync_lsn;
+      pthread_cond_signal(&buf_pool.do_flush_list);
+    }
+    /* Sleep on done_flush_list until the target has been reached. */
+    do
+    {
+      tpool::tpool_wait_begin();
+      thd_wait_begin(nullptr, THD_WAIT_DISKIO);
+      my_cond_wait(&buf_pool.done_flush_list,
+                   &buf_pool.flush_list_mutex.m_mutex);
+      thd_wait_end(nullptr);
+      tpool::tpool_wait_end();
+
+      MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
+    }
+    while (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn);
+  }
+
+try_checkpoint:
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+  if (UNIV_UNLIKELY(log_sys.last_checkpoint_lsn < sync_lsn))
+  {
+    /* If the buffer pool was clean, no log write was guaranteed
+    to happen until now. There could be an outstanding FILE_CHECKPOINT
+    record from a previous fil_names_clear() call, which we must
+    write out before we can advance the checkpoint. */
+    if (sync_lsn > log_sys.get_flushed_lsn())
+      log_write_up_to(sync_lsn, true);
+    log_checkpoint();
+  }
+}
+
+/** Ask the page cleaner to flush more eagerly, up to a target LSN.
+@param lsn      buf_pool.get_oldest_modification(LSN_MAX) target
+@param furious  true=furious flushing, false=limit to innodb_io_capacity */
+ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious)
+{
+  mysql_mutex_assert_not_owner(&log_sys.mutex);
+  ut_ad(!srv_read_only_mode);
+
+  if (recv_recovery_is_on())
+    recv_sys.apply(true);
+
+  Atomic_relaxed<lsn_t> &target= furious
+    ? buf_flush_sync_lsn : buf_flush_async_lsn;
+
+  /* Unlocked check first; it is repeated under flush_list_mutex. */
+  if (target >= lsn)
+    return;
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+  if (target < lsn)
+    target= lsn;
+  pthread_cond_signal(&buf_pool.do_flush_list);
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+}
+
+/** Wait for a pending LRU (lru=true) or flush-list batch, if there is one. */
+void buf_flush_wait_batch_end_acquiring_mutex(bool lru)
+{
+  const auto pending= lru ? buf_pool.n_flush_LRU() : buf_pool.n_flush_list();
+  if (!pending)
+    return;
+  mysql_mutex_lock(&buf_pool.mutex);
+  buf_flush_wait_batch_end(lru);
+  mysql_mutex_unlock(&buf_pool.mutex);
+}
+
+/** Conduct checkpoint-related flushing for innodb_flush_sync=ON,
+and try to initiate checkpoints until the target is met.
+@param lsn   minimum value of buf_pool.get_oldest_modification(LSN_MAX) */
+ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn)
+{
+  ut_ad(!srv_read_only_mode);
+  /* Entered and left with buf_pool.flush_list_mutex held. */
+  for (;;)
+  {
+    mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+    if (ulint n_flushed= buf_flush_list(srv_max_io_capacity, lsn))
+    {
+      MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE,
+                                   MONITOR_FLUSH_SYNC_COUNT,
+                                   MONITOR_FLUSH_SYNC_PAGES, n_flushed);
+    }
+
+    /* Attempt to perform a log checkpoint upon completing each batch. */
+    if (recv_recovery_is_on())
+      recv_sys.apply(true);
+
+    switch (srv_file_flush_method) {
+    case SRV_NOSYNC:
+    case SRV_O_DIRECT_NO_FSYNC:
+      break;
+    default:
+      fil_flush_file_spaces();
+    }
+    /* Checkpoint candidate: oldest modification, else the current LSN. */
+    mysql_mutex_lock(&log_sys.mutex);
+    const lsn_t newest_lsn= log_sys.get_lsn();
+    mysql_mutex_lock(&log_sys.flush_order_mutex);
+    mysql_mutex_lock(&buf_pool.flush_list_mutex);
+    lsn_t measure= buf_pool.get_oldest_modification(0);
+    mysql_mutex_unlock(&log_sys.flush_order_mutex);
+    const lsn_t checkpoint_lsn= measure ? measure : newest_lsn;
+
+    if (checkpoint_lsn > log_sys.last_checkpoint_lsn + SIZE_OF_FILE_CHECKPOINT)
+    {
+      mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+      log_checkpoint_low(checkpoint_lsn, newest_lsn);
+      mysql_mutex_lock(&buf_pool.flush_list_mutex);
+      measure= buf_pool.get_oldest_modification(LSN_MAX);
+    }
+    else
+    {
+      mysql_mutex_unlock(&log_sys.mutex);
+      if (!measure)
+        measure= LSN_MAX;
+    }
+
+    mysql_mutex_assert_not_owner(&log_sys.mutex);
+
+    /* After attempting log checkpoint, check if we have reached our target. */
+    const lsn_t target= buf_flush_sync_lsn;
+
+    if (measure >= target)
+      buf_flush_sync_lsn= 0;
+    else if (measure >= buf_flush_async_lsn)
+      buf_flush_async_lsn= 0;
+
+    /* wake up buf_flush_wait_flushed() */
+    pthread_cond_broadcast(&buf_pool.done_flush_list);
+    /* Also honor the buf_flush_sync_lsn value that was read above. */
+    lsn= std::max(lsn, target);
+
+    if (measure >= lsn)
+      return;
+  }
+}
+
+/** Check whether adaptive flushing is recommended, based on how much of
+the redo log capacity is covered since the oldest modification.
+@param oldest_lsn     buf_pool.get_oldest_modification()
+@return true if adaptive flushing is recommended. */
+static bool af_needed_for_redo(lsn_t oldest_lsn)
+{
+  const lsn_t redo_age= log_sys.get_lsn() - oldest_lsn;
+  /* low-water mark: srv_adaptive_flushing_lwm percent of log_capacity */
+  const double lwm_bytes= srv_adaptive_flushing_lwm *
+    static_cast<double>(log_sys.log_capacity) / 100;
+
+  return static_cast<lsn_t>(lwm_bytes) < redo_age;
+}
+
+/** Compute the flushing effort suggested by the redo generation rate.
+@param age   current age of LSN
+@return percent of io_capacity to flush to manage redo space
+@retval 0 if the age is below the adaptive flushing low-water mark */
+static ulint af_get_pct_for_lsn(lsn_t age)
+{
+	const lsn_t	low_water_mark = static_cast<lsn_t>(
+		srv_adaptive_flushing_lwm
+		* static_cast<double>(log_sys.log_capacity) / 100);
+
+	if (age < low_water_mark) {
+		/* No adaptive flushing. */
+		return 0;
+	}
+
+	/* Age as a percentage of log_sys.max_modified_age_async. */
+	const lsn_t	age_pct = (age * 100)
+		/ log_sys.max_modified_age_async;
+
+	ut_ad(srv_max_io_capacity >= srv_io_capacity);
+	/* Note: the capacity ratio is computed by integer division. */
+	const lsn_t	scaled = srv_max_io_capacity / srv_io_capacity
+		* age_pct;
+
+	return static_cast<ulint>(static_cast<double>(scaled)
+				  * sqrt(static_cast<double>(age_pct)) / 7.5);
+}
+
+/** This function is called approximately once every second by the
+page_cleaner thread if innodb_adaptive_flushing=ON.
+Based on various factors it decides if there is a need to do flushing.
+@return number of pages recommended to be flushed
+@param last_pages_in  number of pages flushed in previous batch
+@param oldest_lsn     buf_pool.get_oldest_modification(0)
+@param dirty_blocks   UT_LIST_GET_LEN(buf_pool.flush_list)
+@param dirty_pct      100*flush_list.count / (LRU.count + free.count) */
+static ulint page_cleaner_flush_pages_recommendation(ulint last_pages_in,
+                                                     lsn_t oldest_lsn,
+                                                     ulint dirty_blocks,
+                                                     double dirty_pct)
+{
+	static	lsn_t		prev_lsn = 0;
+	static	ulint		sum_pages = 0;
+	static	ulint		avg_page_rate = 0;
+	static	ulint		n_iterations = 0;
+	static	time_t		prev_time;
+	lsn_t			lsn_rate;
+	ulint			n_pages = 0;
+
+	const lsn_t cur_lsn = log_sys.get_lsn();
+	ut_ad(oldest_lsn <= cur_lsn);
+	ulint pct_for_lsn = af_get_pct_for_lsn(cur_lsn - oldest_lsn);
+	time_t curr_time = time(nullptr);
+	const double max_pct = srv_max_buf_pool_modified_pct;
+	/* No history yet or no redo pressure: use the dirty ratio only. */
+	if (!prev_lsn || !pct_for_lsn) {
+		prev_time = curr_time;
+		prev_lsn = cur_lsn;
+		if (max_pct > 0.0) {
+			dirty_pct /= max_pct;
+		}
+
+		n_pages = ulint(dirty_pct * double(srv_io_capacity));
+		if (n_pages < dirty_blocks) {
+			n_pages= std::min<ulint>(srv_io_capacity, dirty_blocks);
+		}
+
+		return n_pages;
+	}
+
+	sum_pages += last_pages_in;
+
+	double	time_elapsed = difftime(curr_time, prev_time);
+
+	/* We update our variables every srv_flushing_avg_loops
+	iterations to smooth out transition in workload. */
+	if (++n_iterations >= srv_flushing_avg_loops
+	    || time_elapsed >= static_cast<double>(srv_flushing_avg_loops)) {
+
+		if (time_elapsed < 1) {
+			time_elapsed = 1;
+		}
+
+		avg_page_rate = static_cast<ulint>(
+			((static_cast<double>(sum_pages)
+			  / time_elapsed)
+			 + static_cast<double>(avg_page_rate)) / 2);
+
+		/* How much LSN we have generated since last call. */
+		lsn_rate = static_cast<lsn_t>(
+			static_cast<double>(cur_lsn - prev_lsn)
+			/ time_elapsed);
+
+		lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2;
+		/* Fetch and reset the accumulated flush time and passes. */
+		ulint	flush_tm = page_cleaner.flush_time;
+		ulint	flush_pass = page_cleaner.flush_pass;
+
+		page_cleaner.flush_time = 0;
+		page_cleaner.flush_pass = 0;
+
+		if (flush_pass) {
+			flush_tm /= flush_pass;
+		}
+
+		MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME, flush_tm);
+		MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS, flush_pass);
+
+		prev_lsn = cur_lsn;
+		prev_time = curr_time;
+
+		n_iterations = 0;
+
+		sum_pages = 0;
+	}
+	/* Flushing percentage suggested by the dirty page ratio. */
+	const ulint pct_for_dirty = srv_max_dirty_pages_pct_lwm == 0
+		? (dirty_pct >= max_pct ? 100 : 0)
+		: static_cast<ulint>
+		(max_pct > 0.0 ? dirty_pct / max_pct : dirty_pct);
+	ulint pct_total = std::max(pct_for_dirty, pct_for_lsn);
+
+	/* Estimate pages to be flushed for the lsn progress */
+	lsn_t	target_lsn = oldest_lsn
+		+ lsn_avg_rate * buf_flush_lsn_scan_factor;
+	ulint	pages_for_lsn = 0;
+
+	mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+	for (buf_page_t* b = UT_LIST_GET_LAST(buf_pool.flush_list);
+	     b != NULL;
+	     b = UT_LIST_GET_PREV(list, b)) {
+		if (b->oldest_modification() > target_lsn) {
+			break;
+		}
+		if (++pages_for_lsn >= srv_max_io_capacity) {
+			break;
+		}
+	}
+	mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+	/* target_lsn was scaled by the scan factor; scale the count back. */
+	pages_for_lsn /= buf_flush_lsn_scan_factor;
+	if (pages_for_lsn < 1) {
+		pages_for_lsn = 1;
+	}
+	/* Average of three estimates: percentage, page rate and LSN age. */
+	n_pages = (ulint(double(srv_io_capacity) * double(pct_total) / 100.0)
+		   + avg_page_rate + pages_for_lsn) / 3;
+
+	if (n_pages > srv_max_io_capacity) {
+		n_pages = srv_max_io_capacity;
+	}
+
+	MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, n_pages);
+
+	MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_BY_AGE, pages_for_lsn);
+
+	MONITOR_SET(MONITOR_FLUSH_AVG_PAGE_RATE, avg_page_rate);
+	MONITOR_SET(MONITOR_FLUSH_LSN_AVG_RATE, lsn_avg_rate);
+	MONITOR_SET(MONITOR_FLUSH_PCT_FOR_DIRTY, pct_for_dirty);
+	MONITOR_SET(MONITOR_FLUSH_PCT_FOR_LSN, pct_for_lsn);
+
+	return(n_pages);
+}
+
+/******************************************************************//**
+page_cleaner thread tasked with flushing dirty pages from the buffer
+pools. As of now we'll have only one coordinator.
+@return a dummy parameter */
+static os_thread_ret_t DECLARE_THREAD(buf_flush_page_cleaner)(void*)
+{
+  my_thread_init();
+#ifdef UNIV_PFS_THREAD
+  pfs_register_thread(page_cleaner_thread_key);
+#endif /* UNIV_PFS_THREAD */
+  ut_ad(!srv_read_only_mode);
+  ut_ad(buf_page_cleaner_is_active);
+
+  ulint last_pages= 0;
+  timespec abstime;
+  set_timespec(abstime, 1);
+
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+  lsn_t lsn_limit;
+  ulint last_activity_count= srv_get_activity_count();
+
+  for (;;)
+  {
+    lsn_limit= buf_flush_sync_lsn;
+
+    if (UNIV_UNLIKELY(lsn_limit != 0))
+    {
+furious_flush:
+      if (UNIV_LIKELY(srv_flush_sync))
+      {
+        buf_flush_sync_for_checkpoint(lsn_limit);
+        last_pages= 0;
+        set_timespec(abstime, 1);
+        continue;
+      }
+    }
+    else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED)
+      break;
+
+    /* If the page cleaner is idle and there is no work
+    (either dirty pages are all flushed or adaptive flushing
+    is not enabled) then opt for non-timed wait */
+    if (buf_pool.page_cleaner_idle() &&
+        (!UT_LIST_GET_LEN(buf_pool.flush_list) ||
+         srv_max_dirty_pages_pct_lwm == 0.0))
+      my_cond_wait(&buf_pool.do_flush_list, &buf_pool.flush_list_mutex.m_mutex);
+    else
+      my_cond_timedwait(&buf_pool.do_flush_list,
+                        &buf_pool.flush_list_mutex.m_mutex, &abstime);
+    /* Re-arm the timeout for the next timed wait. */
+    set_timespec(abstime, 1);
+
+    lsn_t soft_lsn_limit= buf_flush_async_lsn;
+    lsn_limit= buf_flush_sync_lsn;
+
+    if (UNIV_UNLIKELY(lsn_limit != 0))
+    {
+      if (UNIV_LIKELY(srv_flush_sync))
+        goto furious_flush;
+    }
+    else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED)
+      break;
+
+    const lsn_t oldest_lsn= buf_pool.get_oldest_modification(0);
+
+    if (!oldest_lsn)
+    {
+      if (UNIV_UNLIKELY(lsn_limit != 0))
+      {
+        buf_flush_sync_lsn= 0;
+        /* wake up buf_flush_wait_flushed() */
+        pthread_cond_broadcast(&buf_pool.done_flush_list);
+      }
+unemployed:
+      buf_flush_async_lsn= 0;
+      buf_pool.page_cleaner_set_idle(true);
+      continue;
+    }
+
+    const ulint dirty_blocks= UT_LIST_GET_LEN(buf_pool.flush_list);
+    ut_ad(dirty_blocks);
+    /* We perform dirty reads of the LRU+free list lengths here.
+    Division by zero is not possible, because buf_pool.flush_list is
+    guaranteed to be nonempty, and it is a subset of buf_pool.LRU. */
+    const double dirty_pct= double(dirty_blocks) * 100.0 /
+      double(UT_LIST_GET_LEN(buf_pool.LRU) + UT_LIST_GET_LEN(buf_pool.free));
+
+    bool idle_flush= false;
+    /* Decide whether this iteration has any flushing work to do. */
+    if (lsn_limit || soft_lsn_limit);
+    else if (af_needed_for_redo(oldest_lsn));
+    else if (srv_max_dirty_pages_pct_lwm != 0.0)
+    {
+      const ulint activity_count= srv_get_activity_count();
+      if (activity_count != last_activity_count)
+        last_activity_count= activity_count;
+      else if (buf_pool.page_cleaner_idle() && buf_pool.n_pend_reads == 0)
+      {
+         /* reaching here means 3 things:
+         - last_activity_count == activity_count: suggesting server is idle
+           (no trx_t::commit activity)
+         - page cleaner is idle (dirty_pct < srv_max_dirty_pages_pct_lwm)
+         - there are no pending reads but there are dirty pages to flush */
+        idle_flush= true;
+        buf_pool.update_last_activity_count(activity_count);
+      }
+
+      if (!idle_flush && dirty_pct < srv_max_dirty_pages_pct_lwm)
+        goto unemployed;
+    }
+    else if (dirty_pct < srv_max_buf_pool_modified_pct)
+      goto unemployed;
+    /* Clear limits that the oldest modification has already reached. */
+    if (UNIV_UNLIKELY(lsn_limit != 0) && oldest_lsn >= lsn_limit)
+      lsn_limit= buf_flush_sync_lsn= 0;
+    if (UNIV_UNLIKELY(soft_lsn_limit != 0) && oldest_lsn >= soft_lsn_limit)
+      soft_lsn_limit= buf_flush_async_lsn= 0;
+
+    buf_pool.page_cleaner_set_idle(false);
+    mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+    /* Without a sync limit, fall back to the async limit (possibly 0). */
+    if (!lsn_limit)
+      lsn_limit= soft_lsn_limit;
+
+    ulint n_flushed;
+
+    if (UNIV_UNLIKELY(lsn_limit != 0))
+    {
+      n_flushed= buf_flush_list(srv_max_io_capacity, lsn_limit);
+      /* wake up buf_flush_wait_flushed() */
+      pthread_cond_broadcast(&buf_pool.done_flush_list);
+      goto try_checkpoint;
+    }
+    else if (idle_flush || !srv_adaptive_flushing)
+    {
+      n_flushed= buf_flush_list(srv_io_capacity);
+try_checkpoint:
+      if (n_flushed)
+      {
+        MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
+                                     MONITOR_FLUSH_BACKGROUND_COUNT,
+                                     MONITOR_FLUSH_BACKGROUND_PAGES,
+                                     n_flushed);
+do_checkpoint:
+        /* The periodic log_checkpoint() call here makes it harder to
+        reproduce bugs in crash recovery or mariabackup --prepare, or
+        in code that writes the redo log records. Omitting the call
+        here should not affect correctness, because log_free_check()
+        should still be invoking checkpoints when needed. */
+        DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", goto next;);
+
+        if (!recv_recovery_is_on() && srv_operation == SRV_OPERATION_NORMAL)
+          log_checkpoint();
+      }
+    }
+    else if (ulint n= page_cleaner_flush_pages_recommendation(last_pages,
+                                                              oldest_lsn,
+                                                              dirty_blocks,
+                                                              dirty_pct))
+    {
+      page_cleaner.flush_pass++;
+      const ulint tm= ut_time_ms();
+      last_pages= n_flushed= buf_flush_list(n);
+      page_cleaner.flush_time+= ut_time_ms() - tm;
+
+      if (n_flushed)
+      {
+        MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
+                                     MONITOR_FLUSH_ADAPTIVE_COUNT,
+                                     MONITOR_FLUSH_ADAPTIVE_PAGES,
+                                     n_flushed);
+        goto do_checkpoint;
+      }
+    }
+    else if (buf_flush_async_lsn <= oldest_lsn)
+    {
+      mysql_mutex_lock(&buf_pool.flush_list_mutex);
+      goto unemployed;
+    }
+
+#ifdef UNIV_DEBUG
+    while (innodb_page_cleaner_disabled_debug && !buf_flush_sync_lsn &&
+           srv_shutdown_state == SRV_SHUTDOWN_NONE)
+      os_thread_sleep(100000);
+#endif /* UNIV_DEBUG */
+
+#ifndef DBUG_OFF
+next:
+#endif /* !DBUG_OFF */
+    mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+    /* When idle flushing kicks in, page_cleaner is marked active.
+    Reset it back to idle, since it was made active only as part of
+    the idle flushing stage. */
+    if (idle_flush)
+      buf_pool.page_cleaner_set_idle(true);
+  }
+  /* The loop above is only left when shutdown has progressed. */
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+  if (srv_fast_shutdown != 2)
+  {
+    buf_flush_wait_batch_end_acquiring_mutex(true);
+    buf_flush_wait_batch_end_acquiring_mutex(false);
+  }
+  /* A sync request may have been set meanwhile; serve it before exiting. */
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+  lsn_limit= buf_flush_sync_lsn;
+  if (UNIV_UNLIKELY(lsn_limit != 0))
+    goto furious_flush;
+  buf_page_cleaner_is_active= false;
+  pthread_cond_broadcast(&buf_pool.done_flush_list);
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+  my_thread_end();
+  /* We count the number of threads in os_thread_exit(). A created
+  thread should always use that to exit and not use return() to exit. */
+  os_thread_exit();
+
+  OS_THREAD_DUMMY_RETURN;
+}
+
+/** Initialize page_cleaner: reset the flush limits and start the thread. */
+ATTRIBUTE_COLD void buf_flush_page_cleaner_init()
+{
+  ut_ad(!buf_page_cleaner_is_active);
+  ut_ad(srv_operation == SRV_OPERATION_NORMAL ||
+        srv_operation == SRV_OPERATION_RESTORE ||
+        srv_operation == SRV_OPERATION_RESTORE_EXPORT);
+  buf_flush_async_lsn= 0;
+  buf_flush_sync_lsn= 0;
+  buf_page_cleaner_is_active= true; /* the thread asserts this on startup */
+  os_thread_create(buf_flush_page_cleaner);
+}
+
+/** @return the length of buf_pool.flush_list, read under its mutex */
+static ulint buf_flush_list_length()
+{
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+  const ulint n_dirty= UT_LIST_GET_LEN(buf_pool.flush_list);
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+  return n_dirty;
+}
+
+/** Flush the buffer pool on shutdown. */
+ATTRIBUTE_COLD void buf_flush_buffer_pool()
+{
+  ut_ad(!buf_page_cleaner_is_active);
+  ut_ad(!buf_flush_sync_lsn);
+  /* The page cleaner must already have stopped (asserted above). */
+  service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
+                                 "Waiting to flush the buffer pool");
+  /* Repeat until no flush-list batch is pending and no page is dirty. */
+  while (buf_pool.n_flush_list() || buf_flush_list_length())
+  {
+    buf_flush_list(srv_max_io_capacity);
+    timespec abstime;
+    /* While a batch is still pending, report progress and wait for it. */
+    if (buf_pool.n_flush_list())
+    {
+      service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
+                                     "Waiting to flush " ULINTPF " pages",
+                                     buf_flush_list_length());
+      set_timespec(abstime, INNODB_EXTEND_TIMEOUT_INTERVAL / 2);
+      mysql_mutex_lock(&buf_pool.mutex);
+      while (buf_pool.n_flush_list_)
+        my_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.mutex.m_mutex,
+                          &abstime);
+      mysql_mutex_unlock(&buf_pool.mutex);
+    }
+  }
+
+  ut_ad(!buf_pool.any_io_pending());
+}
+
+/** Synchronously flush dirty blocks until the flush list is empty.
+NOTE: The calling thread is not allowed to hold any buffer page latches! */
+void buf_flush_sync()
+{
+  ut_ad(!sync_check_iterate(dict_sync_check()));
+
+  ulint n_written;
+  do
+  {
+    n_written= buf_flush_list(srv_max_io_capacity);
+    buf_flush_wait_batch_end_acquiring_mutex(false);
+  }
+  while (n_written || buf_flush_list_length());
+}
+
+#ifdef UNIV_DEBUG
+/** Functor for ut_list_validate(): debug checks on each flush list element. */
+struct	Check {
+	void operator()(const buf_page_t* elem) const
+	{
+		ut_ad(elem->oldest_modification()); /* must be nonzero */
+		ut_ad(!fsp_is_system_temporary(elem->id().space()));
+	}
+};
+
+/** Validate the flush list.
+The caller must hold buf_pool.flush_list_mutex. */
+static void buf_flush_validate_low()
+{
+	mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
+
+	ut_list_validate(buf_pool.flush_list, Check());
+
+	for (buf_page_t* cur = UT_LIST_GET_FIRST(buf_pool.flush_list);
+	     cur != NULL; ) {
+		const lsn_t	cur_lsn = cur->oldest_modification();
+
+		/* The descriptor of a page that is being relocated
+		can be in BUF_BLOCK_REMOVE_HASH state and still be in
+		buf_pool.flush_list, because completing the relocation
+		requires buf_pool.flush_list_mutex. */
+		ut_d(const auto st = cur->state());
+		ut_ad(st == BUF_BLOCK_ZIP_PAGE || st == BUF_BLOCK_FILE_PAGE
+		      || st == BUF_BLOCK_REMOVE_HASH);
+		ut_ad(cur_lsn == 1 || cur_lsn > 2);
+
+		/* Compare against the successor: outside recovery,
+		oldest_modification() must not increase along the
+		list, except that the value 1 is exempt. */
+		cur = UT_LIST_GET_NEXT(list, cur);
+		ut_ad(cur_lsn == 1 || !cur || recv_recovery_is_on()
+		      || cur_lsn >= cur->oldest_modification());
+	}
+}
+
+/** Validate the flush list while holding buf_pool.flush_list_mutex. */
+void buf_flush_validate()
+{
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+  buf_flush_validate_low();
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+}
+#endif /* UNIV_DEBUG */
diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc
new file mode 100644
index 00000000..b282eb17
--- /dev/null
+++ b/storage/innobase/buf/buf0lru.cc
@@ -0,0 +1,1477 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0lru.cc
+The database buffer replacement algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "buf0lru.h"
+#include "sync0rw.h"
+#include "fil0fil.h"
+#include "btr0btr.h"
+#include "buf0buddy.h"
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "buf0rea.h"
+#include "btr0sea.h"
+#include "os0file.h"
+#include "page0zip.h"
+#include "log0recv.h"
+#include "srv0srv.h"
+#include "srv0mon.h"
+
+/** Flush this many pages in buf_LRU_get_free_block() */
+size_t innodb_lru_flush_size;
+
+/** The number of blocks from the LRU_old pointer onward, including
+the block pointed to, must be buf_pool.LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV
+of the whole LRU list length, except that the tolerance defined below
+is allowed. Note that the tolerance must be small enough such that for
+even the BUF_LRU_OLD_MIN_LEN long LRU list, the LRU_old pointer is not
+allowed to point to either end of the LRU list. */
+
+static constexpr ulint BUF_LRU_OLD_TOLERANCE = 20;
+
+/** The minimum amount of non-old blocks when the LRU_old list exists
+(that is, when there are more than BUF_LRU_OLD_MIN_LEN blocks).
+@see buf_LRU_old_adjust_len */
+#define BUF_LRU_NON_OLD_MIN_LEN	5
+
+/** If we switch on the InnoDB monitor because there are too few available
+frames in the buffer pool, we set this to true */
+static bool buf_lru_switched_on_innodb_mon = false;
+
+/** True if the diagnostic message about difficulty in finding free blocks
+in the buffer pool has already been printed. */
+static bool	buf_lru_free_blocks_error_printed;
+
+/******************************************************************//**
+These statistics are not 'of' LRU but 'for' LRU.  We keep count of I/O
+and page_zip_decompress() operations.  Based on the statistics,
+buf_LRU_evict_from_unzip_LRU() decides if we want to evict from
+unzip_LRU or the regular LRU.  From unzip_LRU, we will only evict the
+uncompressed frame (meaning we can evict dirty blocks as well).  From
+the regular LRU, we will evict the entire block (i.e.: both the
+uncompressed and compressed data), which must be clean. */
+
+/* @{ */
+
+/** Number of intervals for which we keep the history of these stats.
+Updated at SRV_MONITOR_INTERVAL (the buf_LRU_stat_update() call rate). */
+static constexpr ulint BUF_LRU_STAT_N_INTERVAL= 4;
+
+/** Co-efficient with which we multiply I/O operations to equate them
+with page_zip_decompress() operations. */
+static constexpr ulint BUF_LRU_IO_TO_UNZIP_FACTOR= 50;
+
+/** Sampled values of buf_LRU_stat_cur.
+Not protected by any mutex.  Updated by buf_LRU_stat_update(). */
+static buf_LRU_stat_t		buf_LRU_stat_arr[BUF_LRU_STAT_N_INTERVAL];
+
+/** Cursor to buf_LRU_stat_arr[] that is updated in a round-robin fashion. */
+static ulint			buf_LRU_stat_arr_ind;
+
+/** Current operation counters.  Not protected by any mutex.  Cleared
+by buf_LRU_stat_update(). */
+buf_LRU_stat_t	buf_LRU_stat_cur;
+
+/** Running sum of past values of buf_LRU_stat_cur.
+Updated by buf_LRU_stat_update().  Not Protected by any mutex. */
+buf_LRU_stat_t	buf_LRU_stat_sum;
+
+/* @} */
+
+/** @name Heuristics for detecting index scan @{ */
+/** Move blocks to "new" LRU list only if the first access was at
+least this many milliseconds ago.  Not protected by any mutex or latch. */
+uint	buf_LRU_old_threshold_ms;
+/* @} */
+
+/** Remove bpage from buf_pool.LRU and buf_pool.page_hash.
+
+If bpage->state() == BUF_BLOCK_ZIP_PAGE && bpage->oldest_modification() <= 1,
+the object will be freed.
+
+@param bpage      buffer block
+@param id         page identifier
+@param hash_lock  buf_pool.page_hash latch (will be released here)
+@param zip        whether bpage->zip of BUF_BLOCK_FILE_PAGE should be freed
+
+If a compressed page is freed other compressed pages may be relocated.
+@retval true if BUF_BLOCK_FILE_PAGE was removed from page_hash. The
+caller needs to free the page to the free list
+@retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. In
+this case the block is already returned to the buddy allocator. */
+static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
+                                        page_hash_latch *hash_lock, bool zip);
+
+/** Free a block to buf_pool: free_file_page(), then free the block itself */
+static void buf_LRU_block_free_hashed_page(buf_block_t *block)
+{
+  block->page.free_file_page();
+  buf_LRU_block_free_non_file_page(block);
+}
+
+/** Account for a page in buf_pool.stat.LRU_bytes.
+@param[in]	bpage		control block whose physical size is added */
+static inline void incr_LRU_size_in_bytes(const buf_page_t* bpage)
+{
+	/* FIXME: use atomics, not mutex */
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+
+	const auto	page_bytes = bpage->physical_size();
+	buf_pool.stat.LRU_bytes += page_bytes;
+
+	ut_ad(buf_pool.stat.LRU_bytes <= buf_pool.curr_pool_size);
+}
+
+/** Decide which list to take an eviction victim from.
+@return whether unzip_LRU should be used instead of the general LRU list */
+bool buf_LRU_evict_from_unzip_LRU()
+{
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+
+	const ulint	n_unzip = UT_LIST_GET_LEN(buf_pool.unzip_LRU);
+
+	/* With an empty unzip_LRU list, only the LRU can be used. */
+	if (n_unzip == 0) {
+		return false;
+	}
+
+	/* Keep using the LRU while unzip_LRU is at most 10% of its
+	length. This slack lets hot decompressed pages stay in the
+	buffer pool. */
+	if (n_unzip <= UT_LIST_GET_LEN(buf_pool.LRU) / 10) {
+		return false;
+	}
+
+	/* Before any eviction has happened, the workload is assumed
+	to be disk bound. */
+	if (buf_pool.freed_page_clock == 0) {
+		return true;
+	}
+
+	/* Average over the past intervals, plus the current interval. */
+	const ulint	avg_io = buf_LRU_stat_cur.io
+		+ buf_LRU_stat_sum.io / BUF_LRU_STAT_N_INTERVAL;
+	const ulint	avg_unzip = buf_LRU_stat_cur.unzip
+		+ buf_LRU_stat_sum.unzip / BUF_LRU_STAT_N_INTERVAL;
+
+	/* I/O bound load (avg_unzip does not exceed the weighted
+	avg_io): evict an uncompressed frame from unzip_LRU.
+	Otherwise the load is assumed to be CPU bound, and the
+	victim comes from the regular LRU. */
+	const ulint	weighted_io = avg_io * BUF_LRU_IO_TO_UNZIP_FACTOR;
+	return avg_unzip <= weighted_io;
+}
+
+/** Try to evict the uncompressed frame of a compressed block, starting
+at the tail of unzip_LRU. The compressed page stays and may be dirty.
+@param limit  maximum number of blocks to scan
+@return true if freed */
+static bool buf_LRU_free_from_unzip_LRU_list(ulint limit)
+{
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+
+	if (!buf_LRU_evict_from_unzip_LRU()) {
+		return false;
+	}
+
+	ulint		n_visited = 0;
+	bool		evicted = false;
+	buf_block_t*	victim = UT_LIST_GET_LAST(buf_pool.unzip_LRU);
+	while (victim && n_visited < limit) {
+		/* Fetch the predecessor before attempting to free. */
+		buf_block_t* const older = UT_LIST_GET_PREV(unzip_LRU, victim);
+		ut_ad(victim->page.state() == BUF_BLOCK_FILE_PAGE);
+		ut_ad(victim->in_unzip_LRU_list);
+		ut_ad(victim->page.in_LRU_list);
+
+		if (buf_LRU_free_page(&victim->page, false)) {
+			evicted = true;
+			break;
+		}
+
+		victim = older;
+		++n_visited;
+	}
+
+	if (n_visited) {
+		MONITOR_INC_VALUE_CUMULATIVE(
+			MONITOR_LRU_UNZIP_SEARCH_SCANNED,
+			MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL,
+			MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL,
+			n_visited);
+	}
+
+	return evicted;
+}
+
+/** Try to free a clean page from the common LRU list.
+@param limit  maximum number of blocks to scan
+@return whether a page was freed */
+static bool buf_LRU_free_from_common_LRU_list(ulint limit)
+{
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+
+	ulint		n_visited = 0;
+	bool		evicted = false;
+	buf_page_t*	candidate = buf_pool.lru_scan_itr.start();
+
+	while (candidate && n_visited < limit) {
+		/* Park the scan iterator on the predecessor first. */
+		buf_pool.lru_scan_itr.set(UT_LIST_GET_PREV(LRU, candidate));
+		const auto was_accessed = candidate->is_accessed();
+
+		if (buf_LRU_free_page(candidate, true)) {
+			/* Pages evicted without ever having been
+			accessed are counted, as a measure of the
+			effectiveness of readahead. */
+			if (!was_accessed) {
+				++buf_pool.stat.n_ra_pages_evicted;
+			}
+			evicted = true;
+			break;
+		}
+
+		++n_visited;
+		candidate = buf_pool.lru_scan_itr.get();
+	}
+
+	if (n_visited) {
+		MONITOR_INC_VALUE_CUMULATIVE(
+			MONITOR_LRU_SEARCH_SCANNED,
+			MONITOR_LRU_SEARCH_SCANNED_NUM_CALL,
+			MONITOR_LRU_SEARCH_SCANNED_PER_CALL,
+			n_visited);
+	}
+
+	return evicted;
+}
+
+/** Try to free a replaceable block: first from buf_pool.unzip_LRU and,
+if that frees nothing, from the common LRU list.
+The caller must hold buf_pool.mutex.
+@param limit  maximum number of blocks to scan (applied to each list)
+@return true if found and freed */
+bool buf_LRU_scan_and_free_block(ulint limit)
+{
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+
+  if (buf_LRU_free_from_unzip_LRU_list(limit))
+    return true;
+
+  return buf_LRU_free_from_common_LRU_list(limit);
+}
+
+/** Take a block from the head of buf_pool.free.
+Blocks that must be withdrawn (the buffer pool is shrinking, the withdraw
+target has not been reached, and the block lies in the area being
+withdrawn) are moved to buf_pool.withdraw and skipped.
+The caller must hold buf_pool.mutex.
+@return a buffer block from the buf_pool.free list, in state BUF_BLOCK_MEMORY
+@retval	NULL	if the free list is empty */
+buf_block_t* buf_LRU_get_free_only()
+{
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+
+	for (;;) {
+		buf_block_t* const	candidate
+			= reinterpret_cast<buf_block_t*>(
+				UT_LIST_GET_FIRST(buf_pool.free));
+
+		if (!candidate) {
+			/* The free list has been exhausted. */
+			return nullptr;
+		}
+
+		ut_ad(candidate->page.in_free_list);
+		ut_d(candidate->page.in_free_list = FALSE);
+		ut_ad(!candidate->page.oldest_modification());
+		ut_ad(!candidate->page.in_LRU_list);
+		ut_a(!candidate->page.in_file());
+		UT_LIST_REMOVE(buf_pool.free, &candidate->page);
+
+		const bool	must_withdraw
+			= buf_pool.curr_size < buf_pool.old_size
+			&& UT_LIST_GET_LEN(buf_pool.withdraw)
+			< buf_pool.withdraw_target
+			&& buf_pool.will_be_withdrawn(candidate->page);
+
+		if (must_withdraw) {
+			/* Park the block on the withdraw list and
+			look at the next free block. */
+			UT_LIST_ADD_LAST(
+				buf_pool.withdraw,
+				&candidate->page);
+			ut_d(candidate->in_withdraw_list = true);
+			continue;
+		}
+
+		/* No adaptive hash index entries may point to
+		a free block. */
+		assert_block_ahi_empty(candidate);
+
+		candidate->page.set_state(BUF_BLOCK_MEMORY);
+		MEM_MAKE_ADDRESSABLE(candidate->frame, srv_page_size);
+		return candidate;
+	}
+}
+
+/** Check how much of the buffer pool is occupied by non-data objects
+such as the adaptive hash index or lock heaps.
+The check is skipped during recovery and while the pool is being resized
+(buf_pool.curr_size != buf_pool.old_size). Otherwise, with s being the
+combined length of buf_pool.free and buf_pool.LRU:
+- s < curr_size / 20: a fatal error is reported;
+- s < curr_size / 3: a warning is issued once and the InnoDB Monitor is
+  switched on;
+- otherwise: a monitor that was switched on here is switched off again.
+The caller must hold buf_pool.mutex. */
+static void buf_LRU_check_size_of_non_data_objects()
+{
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+
+  if (recv_recovery_is_on())
+    return;
+
+  if (buf_pool.curr_size != buf_pool.old_size)
+    return;
+
+  const auto n_data= UT_LIST_GET_LEN(buf_pool.free) +
+    UT_LIST_GET_LEN(buf_pool.LRU);
+
+  if (n_data < buf_pool.curr_size / 20)
+    ib::fatal() << "Over 95 percent of the buffer pool is"
+            " occupied by lock heaps"
+#ifdef BTR_CUR_HASH_ADAPT
+            " or the adaptive hash index"
+#endif /* BTR_CUR_HASH_ADAPT */
+            "! Check that your transactions do not set too many"
+            " row locks, or review if innodb_buffer_pool_size="
+                << (buf_pool.curr_size >> (20U - srv_page_size_shift))
+                << "M could be bigger.";
+
+  if (n_data >= buf_pool.curr_size / 3)
+  {
+    if (buf_lru_switched_on_innodb_mon)
+    {
+      /* Switch off the InnoDB Monitor; this is a simple way to stop the
+      monitor once the situation is less urgent, but it may also
+      surprise users who did SET GLOBAL innodb_status_output=ON earlier! */
+      buf_lru_switched_on_innodb_mon= false;
+      srv_print_innodb_monitor= FALSE;
+    }
+    return;
+  }
+
+  /* Warn only once, and only if the monitor timer is available. */
+  if (buf_lru_switched_on_innodb_mon || !srv_monitor_timer)
+    return;
+
+  /* Over 67 % of the buffer pool is occupied by lock heaps or
+  the adaptive hash index. This may be a memory leak! */
+  ib::warn() << "Over 67 percent of the buffer pool is"
+          " occupied by lock heaps"
+#ifdef BTR_CUR_HASH_ADAPT
+          " or the adaptive hash index"
+#endif /* BTR_CUR_HASH_ADAPT */
+          "! Check that your transactions do not set too many row locks."
+          " innodb_buffer_pool_size="
+             << (buf_pool.curr_size >> (20U - srv_page_size_shift))
+             << "M. Starting the InnoDB Monitor to print diagnostics.";
+  buf_lru_switched_on_innodb_mon= true;
+  srv_print_innodb_monitor= TRUE;
+  srv_monitor_timer_schedule_now();
+}
+
+/** Get a block from the buf_pool.free list.
+If the list is empty, blocks will be moved from the end of buf_pool.LRU
+to buf_pool.free.
+
+This function is called from a user thread when it needs a clean
+block to read in a page. Note that we only ever get a block from
+the free list. Even when we flush a page or find a page in LRU scan
+we put it to free list to be used.
+* iteration 0:
+  * get a block from the buf_pool.free list, success:done
+  * if buf_pool.try_LRU_scan is set
+    * scan LRU up to 100 pages to free a clean block
+    * success:retry the free list
+  * flush up to innodb_lru_flush_size LRU blocks to data files
+    (until UT_LIST_GET_LEN(buf_pool.free) < innodb_lru_scan_depth)
+    * on buf_page_write_complete() the blocks will be put on buf_pool.free list
+    * success: retry the free list
+* subsequent iterations: same as iteration 0 except:
+  * scan whole LRU list
+  * scan LRU list even if buf_pool.try_LRU_scan is not set
+
+The function has no failure return: it keeps iterating until a block
+has been obtained. If have_mutex is set, buf_pool.mutex may be released
+and reacquired while flushing, and it is held again on return.
+
+@param have_mutex  whether buf_pool.mutex is already being held
+@return the free control block, in state BUF_BLOCK_MEMORY */
+buf_block_t *buf_LRU_get_free_block(bool have_mutex)
+{
+	/* Number of completed search rounds; also selects a full LRU scan. */
+	ulint		n_iterations	= 0;
+	/* Number of buf_flush_LRU() calls that reported failure. */
+	ulint		flush_failures	= 0;
+	MONITOR_INC(MONITOR_LRU_GET_FREE_SEARCH);
+	if (have_mutex) {
+		mysql_mutex_assert_owner(&buf_pool.mutex);
+		goto got_mutex;
+	}
+	mysql_mutex_lock(&buf_pool.mutex);
+got_mutex:
+	/* Also reached at the start of every subsequent iteration. */
+	buf_LRU_check_size_of_non_data_objects();
+	buf_block_t* block;
+
+	/* Debug injection: pretend that many iterations found nothing,
+	so that the warning below is exercised. */
+	DBUG_EXECUTE_IF("ib_lru_force_no_free_page",
+		if (!buf_lru_free_blocks_error_printed) {
+			n_iterations = 21;
+			goto not_found;});
+
+retry:
+	/* If there is a block in the free list, take it */
+	if ((block = buf_LRU_get_free_only()) != nullptr) {
+got_block:
+		/* Only release the mutex if this function acquired it. */
+		if (!have_mutex) {
+			mysql_mutex_unlock(&buf_pool.mutex);
+		}
+		/* Hand out the block with a cleared compressed page
+		descriptor. */
+		memset(&block->page.zip, 0, sizeof block->page.zip);
+		return block;
+	}
+
+	MONITOR_INC( MONITOR_LRU_GET_FREE_LOOPS );
+	if (n_iterations || buf_pool.try_LRU_scan) {
+		/* If no block was in the free list, search from the
+		end of the LRU list and try to free a block there.
+		If we are doing for the first time we'll scan only
+		tail of the LRU list otherwise we scan the whole LRU
+		list. */
+		if (buf_LRU_scan_and_free_block(n_iterations
+						? ULINT_UNDEFINED : 100)) {
+			goto retry;
+		}
+
+		/* Tell other threads that there is no point
+		in scanning the LRU list. */
+		buf_pool.try_LRU_scan = false;
+	}
+
+	/* Re-check the free list, waiting on buf_pool.done_free (signalled
+	when a block is put on the free list) for as long as
+	buf_pool.n_flush_LRU_ is nonzero.
+	NOTE(review): n_flush_LRU_ presumably counts LRU flush batches in
+	progress - confirm against buf_pool_t. */
+	for (;;) {
+		if ((block = buf_LRU_get_free_only()) != nullptr) {
+			goto got_block;
+		}
+		if (!buf_pool.n_flush_LRU_) {
+			break;
+		}
+		my_cond_wait(&buf_pool.done_free, &buf_pool.mutex.m_mutex);
+	}
+
+	/* The label is only referenced from DBUG_EXECUTE_IF above, hence
+	it is compiled only when DBUG is enabled. */
+#ifndef DBUG_OFF
+not_found:
+#endif
+	/* The mutex is released here even if the caller passed
+	have_mutex=true; it is reacquired before the next iteration. */
+	mysql_mutex_unlock(&buf_pool.mutex);
+
+	/* Warn once, after more than 20 fruitless iterations, unless the
+	configured buffer pool sizes differ. */
+	if (n_iterations > 20 && !buf_lru_free_blocks_error_printed
+	    && srv_buf_pool_old_size == srv_buf_pool_size) {
+
+		ib::warn() << "Difficult to find free blocks in the buffer pool"
+			" (" << n_iterations << " search iterations)! "
+			<< flush_failures << " failed attempts to"
+			" flush a page!"
+			" Consider increasing innodb_buffer_pool_size."
+			" Pending flushes (fsync) log: "
+			<< log_sys.get_pending_flushes()
+			<< "; buffer pool: "
+			<< fil_n_pending_tablespace_flushes
+			<< ". " << os_n_file_reads << " OS file reads, "
+			<< os_n_file_writes << " OS file writes, "
+			<< os_n_fsyncs
+			<< " OS fsyncs.";
+
+		buf_lru_free_blocks_error_printed = true;
+	}
+
+	if (n_iterations > 1) {
+		MONITOR_INC( MONITOR_LRU_GET_FREE_WAITS );
+	}
+
+	/* No free block was found: try to flush the LRU list.
+	The freed blocks will be up for grabs for all threads.
+
+	TODO: A more elegant way would have been to return one freed
+	up block to the caller here but the code that deals with
+	removing the block from buf_pool.page_hash and buf_pool.LRU is fairly
+	involved (particularly in case of ROW_FORMAT=COMPRESSED pages). We
+	can do that in a separate patch sometime in future. */
+
+	if (!buf_flush_LRU(innodb_lru_flush_size)) {
+		MONITOR_INC(MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT);
+		++flush_failures;
+	}
+
+	/* Start the next iteration with buf_pool.mutex held again. */
+	n_iterations++;
+	mysql_mutex_lock(&buf_pool.mutex);
+	buf_pool.stat.LRU_waits++;
+	goto got_mutex;
+}
+
+/** Move buf_pool.LRU_old, one block at a time, until the length of the
+"old" sublist is within BUF_LRU_OLD_TOLERANCE of its target length.
+The target is the smaller of
+LRU length * buf_pool.LRU_old_ratio / BUF_LRU_OLD_RATIO_DIV and
+LRU length - (BUF_LRU_OLD_TOLERANCE + BUF_LRU_NON_OLD_MIN_LEN).
+buf_pool.LRU_old must be defined and buf_pool.mutex must be held. */
+static void buf_LRU_old_adjust_len()
+{
+	ut_a(buf_pool.LRU_old);
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+	ut_ad(buf_pool.LRU_old_ratio >= BUF_LRU_OLD_RATIO_MIN);
+	ut_ad(buf_pool.LRU_old_ratio <= BUF_LRU_OLD_RATIO_MAX);
+	compile_time_assert(BUF_LRU_OLD_RATIO_MIN * BUF_LRU_OLD_MIN_LEN
+			    > BUF_LRU_OLD_RATIO_DIV
+			    * (BUF_LRU_OLD_TOLERANCE + 5));
+	compile_time_assert(BUF_LRU_NON_OLD_MIN_LEN < BUF_LRU_OLD_MIN_LEN);
+
+#ifdef UNIV_LRU_DEBUG
+	/* buf_pool.LRU_old must be the first item in the LRU list
+	whose "old" flag is set. */
+	ut_a(buf_pool.LRU_old->old);
+	ut_a(!UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)
+	     || !UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)->old);
+	ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)
+	     || UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)->old);
+#endif /* UNIV_LRU_DEBUG */
+
+	/* The list length is not changed by this function. */
+	const auto	lru_len = UT_LIST_GET_LEN(buf_pool.LRU);
+	const ulint	target_len = ut_min(
+		lru_len * buf_pool.LRU_old_ratio / BUF_LRU_OLD_RATIO_DIV,
+		lru_len - (BUF_LRU_OLD_TOLERANCE
+			   + BUF_LRU_NON_OLD_MIN_LEN));
+	ulint		cur_len = buf_pool.LRU_old_len;
+
+	for (;;) {
+		buf_page_t* const	boundary = buf_pool.LRU_old;
+
+		ut_a(boundary);
+		ut_ad(boundary->in_LRU_list);
+#ifdef UNIV_LRU_DEBUG
+		ut_a(boundary->old);
+#endif /* UNIV_LRU_DEBUG */
+
+		if (cur_len + BUF_LRU_OLD_TOLERANCE < target_len) {
+			/* Too few old blocks: extend the old sublist
+			by the block preceding the boundary. */
+			buf_page_t* const	pred = UT_LIST_GET_PREV(
+				LRU, boundary);
+
+			buf_pool.LRU_old = pred;
+#ifdef UNIV_LRU_DEBUG
+			ut_a(!pred->old);
+#endif /* UNIV_LRU_DEBUG */
+			cur_len = ++buf_pool.LRU_old_len;
+			pred->set_old(true);
+			continue;
+		}
+
+		if (cur_len <= target_len + BUF_LRU_OLD_TOLERANCE) {
+			/* Within tolerance: nothing more to do. */
+			return;
+		}
+
+		/* Too many old blocks: the boundary block becomes
+		"new" and the pointer advances to its successor. */
+		buf_pool.LRU_old = UT_LIST_GET_NEXT(LRU, boundary);
+		cur_len = --buf_pool.LRU_old_len;
+		boundary->set_old(false);
+	}
+}
+
+/** Define buf_pool.LRU_old for the first time. To be called when the LRU
+list has grown to exactly BUF_LRU_OLD_MIN_LEN blocks.
+The caller must hold buf_pool.mutex. */
+static void buf_LRU_old_init()
+{
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+	ut_a(UT_LIST_GET_LEN(buf_pool.LRU) == BUF_LRU_OLD_MIN_LEN);
+
+	/* Step 1: flag every block in the LRU list as old. The flag is
+	written directly, because this temporarily violates the
+	assertions of buf_page_t::set_old(). */
+	buf_page_t*	page = UT_LIST_GET_LAST(buf_pool.LRU);
+
+	while (page) {
+		ut_ad(page->in_LRU_list);
+		page->old = true;
+		page = UT_LIST_GET_PREV(LRU, page);
+	}
+
+	/* Step 2: declare the whole list to be the old sublist ... */
+	buf_pool.LRU_old = UT_LIST_GET_FIRST(buf_pool.LRU);
+	buf_pool.LRU_old_len = UT_LIST_GET_LEN(buf_pool.LRU);
+
+	/* ... and let the adjust function move LRU_old to the right
+	position. */
+	buf_LRU_old_adjust_len();
+}
+
+/** Take a block off buf_pool.unzip_LRU, provided that it is a member of
+that list; otherwise do nothing. The caller must hold buf_pool.mutex.
+@param[in]	bpage	control block of a file page */
+static void buf_unzip_LRU_remove_block_if_needed(buf_page_t* bpage)
+{
+	ut_ad(bpage->in_file());
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+
+	if (!bpage->belongs_to_unzip_LRU()) {
+		return;
+	}
+
+	buf_block_t* const	blk = reinterpret_cast<buf_block_t*>(bpage);
+
+	ut_ad(blk->in_unzip_LRU_list);
+	ut_d(blk->in_unzip_LRU_list = false);
+
+	UT_LIST_REMOVE(buf_pool.unzip_LRU, blk);
+}
+
+/** Unlink a block from buf_pool.LRU, keeping the "old" sublist
+(buf_pool.LRU_old, buf_pool.LRU_old_len), buf_pool.stat.LRU_bytes and
+buf_pool.unzip_LRU consistent with the removal.
+@param[in]	bpage	control block */
+static inline void buf_LRU_remove_block(buf_page_t* bpage)
+{
+	/* Important: buf_pool.LRU_remove() comes first, so that the hazard
+	pointers are adjusted before bpage leaves the LRU list. Its
+	return value is used as the block that preceded bpage. */
+	buf_page_t* const	pred = buf_pool.LRU_remove(bpage);
+
+	if (bpage == buf_pool.LRU_old) {
+		/* LRU_old pointed at the removed block: move it backward
+		one step. The previous block is guaranteed to exist,
+		because the LRU_old pointer is only allowed to differ
+		by BUF_LRU_OLD_TOLERANCE from strict
+		buf_pool.LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV of the LRU
+		list length. */
+		ut_a(pred);
+#ifdef UNIV_LRU_DEBUG
+		ut_a(!pred->old);
+#endif /* UNIV_LRU_DEBUG */
+		buf_pool.LRU_old = pred;
+		pred->set_old(true);
+
+		++buf_pool.LRU_old_len;
+	}
+
+	buf_pool.stat.LRU_bytes -= bpage->physical_size();
+
+	buf_unzip_LRU_remove_block_if_needed(bpage);
+
+	if (UT_LIST_GET_LEN(buf_pool.LRU) >= BUF_LRU_OLD_MIN_LEN) {
+		ut_ad(buf_pool.LRU_old);
+
+		/* The removed block no longer counts as an old block. */
+		if (bpage->old) {
+			--buf_pool.LRU_old_len;
+		}
+
+		/* Adjust the length of the old block list if necessary */
+		buf_LRU_old_adjust_len();
+		return;
+	}
+
+	/* The LRU list is now too short for LRU_old to be defined:
+	clear every "old" flag. The flags are written directly, because
+	this temporarily violates the assertions of
+	buf_page_t::set_old(). */
+	for (buf_page_t* p = UT_LIST_GET_FIRST(buf_pool.LRU); p;
+	     p = UT_LIST_GET_NEXT(LRU, p)) {
+		p->old = false;
+	}
+
+	buf_pool.LRU_old = nullptr;
+	buf_pool.LRU_old_len = 0;
+}
+
+/** Add a block to buf_pool.unzip_LRU, the LRU list of decompressed
+ROW_FORMAT=COMPRESSED pages. The caller must hold buf_pool.mutex.
+@param block  control block; must belong to the unzip_LRU and must not
+              be in the list yet
+@param old    TRUE if the block should be put to the end of the list,
+              else it is put to the start */
+void
+buf_unzip_LRU_add_block(
+	buf_block_t*	block,
+	ibool		old)
+{
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+	ut_a(block->page.belongs_to_unzip_LRU());
+	ut_ad(!block->in_unzip_LRU_list);
+	ut_d(block->in_unzip_LRU_list = true);
+
+	if (!old) {
+		UT_LIST_ADD_FIRST(buf_pool.unzip_LRU, block);
+		return;
+	}
+
+	UT_LIST_ADD_LAST(buf_pool.unzip_LRU, block);
+}
+
+/** Add a block to buf_pool.LRU.
+The page size must already be set in bpage when this is called, so that
+the correct size is accounted for when the block enters the LRU list.
+The caller must hold buf_pool.mutex.
+@param bpage  control block; must not be in the LRU list yet
+@param old    true if the block should be put to the old blocks in the
+              LRU list, else it is put to the start; if the LRU list is
+              very short (below BUF_LRU_OLD_MIN_LEN), the block is added
+              to the start, regardless of this parameter */
+void
+buf_LRU_add_block(
+	buf_page_t*	bpage,
+	bool		old)
+{
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+	ut_ad(!bpage->in_LRU_list);
+
+	if (old && UT_LIST_GET_LEN(buf_pool.LRU) >= BUF_LRU_OLD_MIN_LEN) {
+#ifdef UNIV_LRU_DEBUG
+		/* buf_pool.LRU_old must be the first item in the LRU list
+		whose "old" flag is set. */
+		ut_a(buf_pool.LRU_old->old);
+		ut_a(!UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)
+		     || !UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)->old);
+		ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)
+		     || UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)->old);
+#endif /* UNIV_LRU_DEBUG */
+		/* Insert right behind the first old block. */
+		UT_LIST_INSERT_AFTER(buf_pool.LRU, buf_pool.LRU_old,
+			bpage);
+
+		++buf_pool.LRU_old_len;
+	} else {
+		/* Insert at the head of the list. */
+		UT_LIST_ADD_FIRST(buf_pool.LRU, bpage);
+
+		bpage->freed_page_clock = buf_pool.freed_page_clock
+			& ((1U << 31) - 1);
+	}
+
+	ut_d(bpage->in_LRU_list = TRUE);
+
+	incr_LRU_size_in_bytes(bpage);
+
+	/* Length of the list, including the block just added. */
+	const auto	lru_len = UT_LIST_GET_LEN(buf_pool.LRU);
+
+	if (lru_len < BUF_LRU_OLD_MIN_LEN) {
+		bpage->set_old(buf_pool.LRU_old != NULL);
+	} else if (lru_len == BUF_LRU_OLD_MIN_LEN) {
+		/* The LRU list is now long enough for LRU_old to become
+		defined: init it */
+		buf_LRU_old_init();
+	} else {
+		ut_ad(buf_pool.LRU_old);
+
+		/* Adjust the length of the old block list if necessary */
+		bpage->set_old(old);
+		buf_LRU_old_adjust_len();
+	}
+
+	/* A compressed block that also has a decompressed frame
+	belongs on the unzip_LRU list as well. */
+	if (bpage->belongs_to_unzip_LRU()) {
+		buf_unzip_LRU_add_block(
+			reinterpret_cast<buf_block_t*>(bpage), old);
+	}
+}
+
+/** Move a block to the start of the LRU list, by removing it from the
+list and adding it again as a non-old block. buf_pool.mutex is acquired
+and released here.
+@param bpage  control block of a file page */
+void buf_page_make_young(buf_page_t *bpage)
+{
+  ut_ad(bpage->in_file());
+
+  mysql_mutex_lock(&buf_pool.mutex);
+
+  /* Account for blocks that leave the old sublist this way. */
+  if (UNIV_UNLIKELY(bpage->old))
+  {
+    ++buf_pool.stat.n_pages_made_young;
+  }
+
+  buf_LRU_remove_block(bpage);
+  buf_LRU_add_block(bpage, false);
+
+  mysql_mutex_unlock(&buf_pool.mutex);
+}
+
+/** Try to free a block. If bpage is a descriptor of a compressed-only
+ROW_FORMAT=COMPRESSED page, the buf_page_t object will be freed as well.
+The caller must hold buf_pool.mutex.
+
+Outline:
+1. Check bpage->can_relocate(), first without and then with the
+   exclusive page_hash latch.
+2. Refuse to completely free a block that is still dirty.
+3. If only the uncompressed frame of a compressed page is to be freed
+   (!zip, bpage->zip.data set, state BUF_BLOCK_FILE_PAGE), allocate a
+   descriptor b that takes over the compressed page in place of bpage.
+4. Remove bpage from the LRU list and page_hash.
+5. Drop any adaptive hash index on the frame (buf_pool.mutex is
+   temporarily released for this) and put the block on the free list.
+
+@param bpage      block to be freed
+@param zip        whether to remove both copies of a ROW_FORMAT=COMPRESSED page
+@retval true if freed and buf_pool.mutex may have been temporarily released
+@retval false if the page was not freed */
+bool buf_LRU_free_page(buf_page_t *bpage, bool zip)
+{
+	const page_id_t id(bpage->id());
+	/* Replacement descriptor for the compressed page; stays nullptr
+	unless only the uncompressed frame is being freed. */
+	buf_page_t*	b = nullptr;
+
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+	ut_ad(bpage->in_file());
+	ut_ad(bpage->in_LRU_list);
+
+	/* First, perform a quick check before we acquire hash_lock. */
+	if (!bpage->can_relocate()) {
+		return false;
+	}
+
+	/* We must hold an exclusive hash_lock to prevent
+	bpage->can_relocate() from changing due to a concurrent
+	execution of buf_page_get_low(). */
+	const ulint fold = id.fold();
+	page_hash_latch* hash_lock = buf_pool.page_hash.lock_get(fold);
+	hash_lock->write_lock();
+	lsn_t oldest_modification = bpage->oldest_modification_acquire();
+
+	if (UNIV_UNLIKELY(!bpage->can_relocate())) {
+		/* Do not free buffer fixed and I/O-fixed blocks. */
+		goto func_exit;
+	}
+
+	/* A block whose oldest_modification is 1 is taken off
+	buf_pool.flush_list here and is treated as clean from then on.
+	NOTE(review): 1 appears to be a marker for a block that is no
+	longer dirty but has not yet been removed from the flush list -
+	confirm against the buf_page_t documentation. */
+	if (oldest_modification == 1) {
+		mysql_mutex_lock(&buf_pool.flush_list_mutex);
+		/* Re-read under flush_list_mutex. */
+		oldest_modification = bpage->oldest_modification();
+		if (oldest_modification) {
+			ut_ad(oldest_modification == 1);
+			buf_pool.delete_from_flush_list(bpage);
+		}
+		mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+		ut_ad(!bpage->oldest_modification());
+		oldest_modification = 0;
+	}
+
+	if (zip || !bpage->zip.data) {
+		/* This would completely free the block. */
+		/* Do not completely free dirty blocks. */
+
+		if (oldest_modification) {
+			goto func_exit;
+		}
+	} else if (oldest_modification
+		   && bpage->state() != BUF_BLOCK_FILE_PAGE) {
+		/* A dirty compressed-only page cannot be freed. This
+		branch also hosts the common failure exit, which is
+		the target of the gotos above. */
+func_exit:
+		hash_lock->write_unlock();
+		return(false);
+
+	} else if (bpage->state() == BUF_BLOCK_FILE_PAGE) {
+		/* Only the uncompressed frame will be freed: clone the
+		descriptor into b, which will represent the compressed
+		page. buf_pool.flush_list_mutex is acquired here and
+		stays held until b has been relocated in the flush
+		list further below. */
+		b = buf_page_alloc_descriptor();
+		ut_a(b);
+		mysql_mutex_lock(&buf_pool.flush_list_mutex);
+		new (b) buf_page_t(*bpage);
+		b->set_state(BUF_BLOCK_ZIP_PAGE);
+	}
+
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+	ut_ad(bpage->in_file());
+	ut_ad(bpage->in_LRU_list);
+
+	DBUG_PRINT("ib_buf", ("free page %u:%u",
+			      id.space(), id.page_no()));
+
+	ut_ad(bpage->can_relocate());
+
+	if (!buf_LRU_block_remove_hashed(bpage, id, hash_lock, zip)) {
+		/* bpage was a compressed-only page; it has been freed
+		completely and hash_lock has been released by the
+		callee. */
+		ut_ad(!b);
+		mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
+		return(true);
+	}
+
+	/* We have just freed a BUF_BLOCK_FILE_PAGE. If b != nullptr
+	then it was a compressed page with an uncompressed frame and
+	we are interested in freeing only the uncompressed frame.
+	Therefore we have to reinsert the compressed page descriptor
+	into the LRU and page_hash (and possibly flush_list).
+	if !b then it was a regular page that has been freed */
+
+	if (UNIV_LIKELY_NULL(b)) {
+		/* b carries a copy of the LRU links of bpage, so this is
+		the block that used to precede bpage in the LRU list. */
+		buf_page_t*	prev_b	= UT_LIST_GET_PREV(LRU, b);
+
+		ut_ad(!buf_pool.page_hash_get_low(id, fold));
+		ut_ad(b->zip_size());
+
+		/* The field in_LRU_list of
+		the to-be-freed block descriptor should have
+		been cleared in
+		buf_LRU_block_remove_hashed(), which
+		invokes buf_LRU_remove_block(). */
+		ut_ad(!bpage->in_LRU_list);
+
+		/* bpage->state was BUF_BLOCK_FILE_PAGE because
+		b != nullptr. The type cast below is thus valid. */
+		ut_ad(!((buf_block_t*) bpage)->in_unzip_LRU_list);
+
+		/* The fields of bpage were copied to b before
+		buf_LRU_block_remove_hashed() was invoked. */
+		ut_ad(!b->in_zip_hash);
+		ut_ad(b->in_LRU_list);
+		ut_ad(b->in_page_hash);
+
+		HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, b);
+
+		/* Insert b where bpage was in the LRU list. */
+		if (prev_b) {
+			ulint	lru_len;
+
+			ut_ad(prev_b->in_LRU_list);
+			ut_ad(prev_b->in_file());
+
+			UT_LIST_INSERT_AFTER(buf_pool.LRU, prev_b, b);
+
+			incr_LRU_size_in_bytes(b);
+
+			if (b->is_old()) {
+				buf_pool.LRU_old_len++;
+				/* If b was inserted right in front of the
+				first old block, b becomes the first old
+				block. */
+				if (buf_pool.LRU_old
+				    == UT_LIST_GET_NEXT(LRU, b)) {
+
+					buf_pool.LRU_old = b;
+				}
+			}
+
+			lru_len = UT_LIST_GET_LEN(buf_pool.LRU);
+
+			if (lru_len > BUF_LRU_OLD_MIN_LEN) {
+				ut_ad(buf_pool.LRU_old);
+				/* Adjust the length of the
+				old block list if necessary */
+				buf_LRU_old_adjust_len();
+			} else if (lru_len == BUF_LRU_OLD_MIN_LEN) {
+				/* The LRU list is now long
+				enough for LRU_old to become
+				defined: init it */
+				buf_LRU_old_init();
+			}
+#ifdef UNIV_LRU_DEBUG
+			/* Check that the "old" flag is consistent
+			in the block and its neighbours. */
+			b->set_old(b->is_old());
+#endif /* UNIV_LRU_DEBUG */
+		} else {
+			/* bpage was the first block of the LRU list: add b
+			through the regular path. The copied in_LRU_list
+			flag is reset first, because buf_LRU_add_block()
+			asserts that it is not set. */
+			ut_d(b->in_LRU_list = FALSE);
+			buf_LRU_add_block(b, b->old);
+		}
+
+		/* Let b take the place of bpage in the flush list, then
+		release the flush_list_mutex that was acquired when b was
+		allocated. */
+		buf_flush_relocate_on_flush_list(bpage, b);
+		mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+		/* The compressed page now belongs to b. */
+		bpage->zip.data = nullptr;
+
+		page_zip_set_size(&bpage->zip, 0);
+
+		/* Prevent buf_page_get_gen() from
+		decompressing the block while we release
+		hash_lock. */
+		b->set_io_fix(BUF_IO_PIN);
+		hash_lock->write_unlock();
+	} else if (!zip) {
+		/* With zip=false, buf_LRU_block_remove_hashed() returns
+		with hash_lock still held; release it here. */
+		hash_lock->write_unlock();
+	}
+
+	buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage);
+
+#ifdef BTR_CUR_HASH_ADAPT
+	if (block->index) {
+		/* buf_pool.mutex is released while the adaptive hash
+		index entries are being dropped. */
+		mysql_mutex_unlock(&buf_pool.mutex);
+
+		/* Remove the adaptive hash index on the page.
+		The page was declared uninitialized by
+		buf_LRU_block_remove_hashed().  We need to flag
+		the contents of the page valid (which it still is) in
+		order to avoid bogus Valgrind or MSAN warnings.*/
+
+		MEM_MAKE_DEFINED(block->frame, srv_page_size);
+		btr_search_drop_page_hash_index(block);
+		MEM_UNDEFINED(block->frame, srv_page_size);
+
+		/* Undo the BUF_IO_PIN that was set on b above. */
+		if (UNIV_LIKELY_NULL(b)) {
+			ut_ad(b->zip_size());
+			b->io_unfix();
+		}
+
+		mysql_mutex_lock(&buf_pool.mutex);
+	} else
+#endif
+	/* Undo the BUF_IO_PIN that was set on b above. */
+	if (UNIV_LIKELY_NULL(b)) {
+		ut_ad(b->zip_size());
+		b->io_unfix();
+	}
+
+	buf_LRU_block_free_hashed_page(block);
+
+	return(true);
+}
+
+/** Put a block that does not contain a file page back to the free list
+or, while the buffer pool is shrinking and the block is due to be
+withdrawn, to buf_pool.withdraw. A compressed page that is still
+attached to the block is returned to the buddy allocator.
+@param block  block in state BUF_BLOCK_MEMORY; must not contain a
+              file page */
+void
+buf_LRU_block_free_non_file_page(
+	buf_block_t*	block)
+{
+	ut_ad(block->page.state() == BUF_BLOCK_MEMORY);
+	assert_block_ahi_empty(block);
+	ut_ad(!block->page.in_free_list);
+	ut_ad(!block->page.oldest_modification());
+	ut_ad(!block->page.in_LRU_list);
+
+	block->page.set_state(BUF_BLOCK_NOT_USED);
+
+	MEM_UNDEFINED(block->frame, srv_page_size);
+	/* Wipe page_no and space_id */
+	static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
+	memset_aligned<4>(block->frame + FIL_PAGE_OFFSET, 0xfe, 4);
+	static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2,
+		      "not perfect alignment");
+	memset_aligned<2>(block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+			  0xfe, 4);
+
+	if (void* const zip_data = block->page.zip.data) {
+		/* Detach the compressed page and free it. */
+		block->page.zip.data = NULL;
+		buf_pool_mutex_exit_forbid();
+
+		ut_ad(block->zip_size());
+
+		buf_buddy_free(zip_data, block->zip_size());
+
+		buf_pool_mutex_exit_allow();
+		page_zip_set_size(&block->page.zip, 0);
+	}
+
+	const bool	must_withdraw
+		= buf_pool.curr_size < buf_pool.old_size
+		&& UT_LIST_GET_LEN(buf_pool.withdraw)
+		< buf_pool.withdraw_target
+		&& buf_pool.will_be_withdrawn(block->page);
+
+	if (!must_withdraw) {
+		UT_LIST_ADD_FIRST(buf_pool.free, &block->page);
+		ut_d(block->page.in_free_list = true);
+		/* Wake up a waiter on buf_pool.done_free. */
+		pthread_cond_signal(&buf_pool.done_free);
+	} else {
+		/* This should be withdrawn */
+		UT_LIST_ADD_LAST(
+			buf_pool.withdraw,
+			&block->page);
+		ut_d(block->in_withdraw_list = true);
+	}
+
+	MEM_NOACCESS(block->frame, srv_page_size);
+}
+
+/** Release a memory block to the buffer pool.
+Wrapper around buf_LRU_block_free_non_file_page() that acquires and
+releases buf_pool.mutex around the call.
+@param block  block to be released; buf_LRU_block_free_non_file_page()
+              expects it to be in state BUF_BLOCK_MEMORY */
+ATTRIBUTE_COLD void buf_pool_t::free_block(buf_block_t *block)
+{
+  /* Only the global buf_pool instance is expected here. */
+  ut_ad(this == &buf_pool);
+  mysql_mutex_lock(&mutex);
+  buf_LRU_block_free_non_file_page(block);
+  mysql_mutex_unlock(&mutex);
+}
+
+
+/** Remove bpage from buf_pool.LRU and buf_pool.page_hash.
+
+If bpage->state() == BUF_BLOCK_ZIP_PAGE && !bpage->oldest_modification(),
+the object will be freed.
+
+The caller must hold buf_pool.mutex and the page_hash latch in exclusive
+mode; bpage must be neither I/O-fixed nor buffer-fixed.
+
+@param bpage      buffer block
+@param id         page identifier
+@param hash_lock  buf_pool.page_hash latch; released here, except when a
+                  BUF_BLOCK_FILE_PAGE is removed with zip=false: in that
+                  case the latch is still held on return and the caller
+                  must release it
+@param zip        whether bpage->zip of BUF_BLOCK_FILE_PAGE should be freed
+
+If a compressed page is freed other compressed pages may be relocated.
+@retval true if BUF_BLOCK_FILE_PAGE was removed from page_hash. The
+caller needs to free the page to the free list
+@retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. In
+this case the block is already returned to the buddy allocator. */
+static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
+                                        page_hash_latch *hash_lock, bool zip)
+{
+	mysql_mutex_assert_owner(&buf_pool.mutex);
+        ut_ad(hash_lock->is_write_locked());
+
+	ut_a(bpage->io_fix() == BUF_IO_NONE);
+	ut_a(!bpage->buf_fix_count());
+
+	buf_LRU_remove_block(bpage);
+
+	buf_pool.freed_page_clock += 1;
+
+	/* First pass over the state: sanity checks and, for some page
+	types, preservation of the uncompressed contents in the compressed
+	page. The actual removal follows in the second switch below. */
+	switch (bpage->state()) {
+	case BUF_BLOCK_FILE_PAGE:
+		MEM_CHECK_ADDRESSABLE(bpage, sizeof(buf_block_t));
+		MEM_CHECK_ADDRESSABLE(((buf_block_t*) bpage)->frame,
+				      srv_page_size);
+		buf_block_modify_clock_inc((buf_block_t*) bpage);
+		if (bpage->zip.data) {
+			const page_t*	page = ((buf_block_t*) bpage)->frame;
+
+			/* Both copies may only be dropped if the page
+			is clean. */
+			ut_a(!zip || !bpage->oldest_modification());
+			ut_ad(bpage->zip_size());
+
+			switch (fil_page_get_type(page)) {
+			case FIL_PAGE_TYPE_ALLOCATED:
+			case FIL_PAGE_INODE:
+			case FIL_PAGE_IBUF_BITMAP:
+			case FIL_PAGE_TYPE_FSP_HDR:
+			case FIL_PAGE_TYPE_XDES:
+				/* These are essentially uncompressed pages. */
+				if (!zip) {
+					/* InnoDB writes the data to the
+					uncompressed page frame.  Copy it
+					to the compressed page, which will
+					be preserved. */
+					memcpy(bpage->zip.data, page,
+					       bpage->zip_size());
+				}
+				break;
+			case FIL_PAGE_TYPE_ZBLOB:
+			case FIL_PAGE_TYPE_ZBLOB2:
+				break;
+			case FIL_PAGE_INDEX:
+			case FIL_PAGE_RTREE:
+#if defined UNIV_ZIP_DEBUG && defined BTR_CUR_HASH_ADAPT
+				/* During recovery, we only update the
+				compressed page, not the uncompressed one. */
+				ut_a(recv_recovery_is_on()
+				     || page_zip_validate(
+					     &bpage->zip, page,
+					     ((buf_block_t*) bpage)->index));
+#endif /* UNIV_ZIP_DEBUG && BTR_CUR_HASH_ADAPT */
+				break;
+			default:
+				/* Unexpected page type: dump both copies
+				of the page and abort. */
+				ib::error() << "The compressed page to be"
+					" evicted seems corrupt:";
+				ut_print_buf(stderr, page, srv_page_size);
+
+				ib::error() << "Possibly older version of"
+					" the page:";
+
+				ut_print_buf(stderr, bpage->zip.data,
+					     bpage->zip_size());
+				putc('\n', stderr);
+				ut_error;
+			}
+
+			break;
+		}
+		/* A file page without a compressed copy is checked like
+		a compressed-only page below. */
+		/* fall through */
+	case BUF_BLOCK_ZIP_PAGE:
+		ut_a(!bpage->oldest_modification());
+		MEM_CHECK_ADDRESSABLE(bpage->zip.data, bpage->zip_size());
+		break;
+	case BUF_BLOCK_NOT_USED:
+	case BUF_BLOCK_MEMORY:
+	case BUF_BLOCK_REMOVE_HASH:
+		/* Only pages that are in a file may be removed here. */
+		ut_error;
+		break;
+	}
+
+	ut_ad(!bpage->in_zip_hash);
+	HASH_DELETE(buf_page_t, hash, &buf_pool.page_hash, id.fold(), bpage);
+
+	/* Second pass over the state: release the memory. */
+	switch (bpage->state()) {
+	case BUF_BLOCK_ZIP_PAGE:
+		ut_ad(!bpage->in_free_list);
+		ut_ad(!bpage->in_LRU_list);
+		ut_a(bpage->zip.data);
+		ut_a(bpage->zip.ssize);
+		ut_ad(!bpage->oldest_modification());
+
+		/* Free the compressed page as well as its descriptor. */
+		hash_lock->write_unlock();
+		buf_pool_mutex_exit_forbid();
+
+		buf_buddy_free(bpage->zip.data, bpage->zip_size());
+
+		buf_pool_mutex_exit_allow();
+		buf_page_free_descriptor(bpage);
+		return(false);
+
+	case BUF_BLOCK_FILE_PAGE:
+		/* Overwrite the page number and the tablespace ID in
+		the frame with 0xff bytes. */
+		static_assert(FIL_NULL == 0xffffffffU, "fill pattern");
+		static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
+		memset_aligned<4>(reinterpret_cast<buf_block_t*>(bpage)->frame
+				  + FIL_PAGE_OFFSET, 0xff, 4);
+		static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2,
+			      "not perfect alignment");
+		memset_aligned<2>(reinterpret_cast<buf_block_t*>(bpage)->frame
+				  + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4);
+		MEM_UNDEFINED(((buf_block_t*) bpage)->frame, srv_page_size);
+		bpage->set_state(BUF_BLOCK_REMOVE_HASH);
+
+		if (!zip) {
+			/* The compressed page, if any, is kept, and
+			hash_lock remains held: the caller will release
+			it. */
+			return true;
+		}
+
+		/* Question: If we release hash_lock here
+		then what protects us against:
+		1) Some other thread buffer fixing this page
+		2) Some other thread trying to read this page and
+		not finding it in buffer pool attempting to read it
+		from the disk.
+		Answer:
+		1) Cannot happen because the page is no longer in the
+		page_hash. Only possibility is when while invalidating
+		a tablespace we buffer fix the prev_page in LRU to
+		avoid relocation during the scan. But that is not
+		possible because we are holding buf_pool mutex.
+
+		2) Not possible because in buf_page_init_for_read()
+		we do a look up of page_hash while holding buf_pool
+		mutex and since we are holding buf_pool mutex here
+		and by the time we'll release it in the caller we'd
+		have inserted the compressed only descriptor in the
+		page_hash. */
+		hash_lock->write_unlock();
+
+		if (bpage->zip.data) {
+			/* Free the compressed page. */
+			void*	data = bpage->zip.data;
+			bpage->zip.data = NULL;
+
+			ut_ad(!bpage->in_free_list);
+			ut_ad(!bpage->oldest_modification());
+			ut_ad(!bpage->in_LRU_list);
+			buf_pool_mutex_exit_forbid();
+
+			buf_buddy_free(data, bpage->zip_size());
+
+			buf_pool_mutex_exit_allow();
+
+			page_zip_set_size(&bpage->zip, 0);
+		}
+
+		return(true);
+
+	case BUF_BLOCK_NOT_USED:
+	case BUF_BLOCK_MEMORY:
+	case BUF_BLOCK_REMOVE_HASH:
+		break;
+	}
+
+	/* Not reachable for valid states; the first switch has already
+	rejected the states that end up here. */
+	ut_error;
+	return(false);
+}
+
+/** Remove one page from the LRU list and page_hash and put its block to
+the free list. Any compressed copy of the page is freed as well, because
+the removal is requested with zip=true.
+@param bpage     file page to be freed
+@param id        page identifier
+@param hash_lock buf_pool.page_hash latch (will be released here) */
+void buf_LRU_free_one_page(buf_page_t *bpage, const page_id_t id,
+                           page_hash_latch *hash_lock)
+{
+  /* Spin until all other threads have released their buffer-fixes;
+  only then may bpage be taken off the LRU list. */
+  while (bpage->buf_fix_count())
+  {
+    (void) LF_BACKOFF();
+  }
+
+  if (!buf_LRU_block_remove_hashed(bpage, id, hash_lock, true))
+    /* A compressed-only page has already been freed completely. */
+    return;
+
+  buf_LRU_block_free_hashed_page(reinterpret_cast<buf_block_t*>(bpage));
+}
+
+/** Update buf_pool.LRU_old_ratio.
+The percentage is converted to units of 1/BUF_LRU_OLD_RATIO_DIV and
+clamped to [BUF_LRU_OLD_RATIO_MIN, BUF_LRU_OLD_RATIO_MAX].
+@param[in]	old_pct		Reserve this percentage of
+				the buffer pool for "old" blocks
+@param[in]	adjust		true=adjust the LRU list (under
+				buf_pool.mutex);
+				false=just assign buf_pool.LRU_old_ratio
+				during the initialization of InnoDB
+@return updated old_pct, that is, the clamped ratio converted back to a
+percentage and rounded to the nearest integer */
+uint buf_LRU_old_ratio_update(uint old_pct, bool adjust)
+{
+	const uint	scaled = old_pct * BUF_LRU_OLD_RATIO_DIV / 100;
+	uint		ratio = scaled;
+
+	if (scaled < BUF_LRU_OLD_RATIO_MIN) {
+		ratio = BUF_LRU_OLD_RATIO_MIN;
+	} else if (scaled > BUF_LRU_OLD_RATIO_MAX) {
+		ratio = BUF_LRU_OLD_RATIO_MAX;
+	}
+
+	if (!adjust) {
+		buf_pool.LRU_old_ratio = ratio;
+	} else {
+		mysql_mutex_lock(&buf_pool.mutex);
+
+		if (ratio != buf_pool.LRU_old_ratio) {
+			buf_pool.LRU_old_ratio = ratio;
+
+			/* LRU_old exists only once the list is long
+			enough. */
+			if (UT_LIST_GET_LEN(buf_pool.LRU)
+			    >= BUF_LRU_OLD_MIN_LEN) {
+				buf_LRU_old_adjust_len();
+			}
+		}
+
+		mysql_mutex_unlock(&buf_pool.mutex);
+	}
+
+	/* the reverse of
+	ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100 */
+	return static_cast<uint>(
+		ratio * 100 / static_cast<double>(BUF_LRU_OLD_RATIO_DIV)
+		+ 0.5);
+}
+
+/** Update the historical stats that we are collecting for the LRU
+eviction policy at the end of each interval.
+If buf_pool.freed_page_clock is nonzero, the current counters replace
+the oldest entry of buf_LRU_stat_arr and buf_LRU_stat_sum is updated
+accordingly. In every case the current counters are reset to zero. */
+void
+buf_LRU_stat_update()
+{
+	if (buf_pool.freed_page_clock) {
+		/* Claim the slot to be overwritten and advance the
+		index. */
+		buf_LRU_stat_t* const	slot
+			= &buf_LRU_stat_arr[buf_LRU_stat_arr_ind];
+		buf_LRU_stat_arr_ind++;
+		buf_LRU_stat_arr_ind %= BUF_LRU_STAT_N_INTERVAL;
+
+		/* buf_LRU_stat_cur is not protected by any mutex, so it
+		can change at any time. Work on a local copy, so that
+		the same values go into buf_LRU_stat_sum and into the
+		array. */
+		const buf_LRU_stat_t	snapshot = buf_LRU_stat_cur;
+
+		/* Add the current value and subtract the obsolete
+		entry. */
+		buf_LRU_stat_sum.io += snapshot.io - slot->io;
+		buf_LRU_stat_sum.unzip += snapshot.unzip - slot->unzip;
+
+		/* Put current entry in the array. */
+		memcpy(slot, &snapshot, sizeof *slot);
+	}
+
+	/* Clear the current entry. */
+	memset(&buf_LRU_stat_cur, 0, sizeof buf_LRU_stat_cur);
+}
+
+#ifdef UNIV_DEBUG
+/** Validate the buf_pool.LRU, buf_pool.free and buf_pool.unzip_LRU lists. */
+void buf_LRU_validate()
+{
+	ulint	old_len;
+	ulint	new_len;
+
+	mysql_mutex_lock(&buf_pool.mutex);
+
+	if (UT_LIST_GET_LEN(buf_pool.LRU) >= BUF_LRU_OLD_MIN_LEN) {
+		/* LRU_old_len must be within BUF_LRU_OLD_TOLERANCE of new_len. */
+		ut_a(buf_pool.LRU_old);
+		old_len = buf_pool.LRU_old_len;
+
+		new_len = ut_min(UT_LIST_GET_LEN(buf_pool.LRU)
+				 * buf_pool.LRU_old_ratio
+				 / BUF_LRU_OLD_RATIO_DIV,
+				 UT_LIST_GET_LEN(buf_pool.LRU)
+				 - (BUF_LRU_OLD_TOLERANCE
+				    + BUF_LRU_NON_OLD_MIN_LEN));
+
+		ut_a(old_len >= new_len - BUF_LRU_OLD_TOLERANCE);
+		ut_a(old_len <= new_len + BUF_LRU_OLD_TOLERANCE);
+	}
+
+	CheckInLRUList::validate();
+
+	old_len = 0; /* from now on: number of old blocks seen in the LRU list */
+
+	for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.LRU);
+	     bpage != NULL;
+             bpage = UT_LIST_GET_NEXT(LRU, bpage)) {
+
+		switch (bpage->state()) {
+		case BUF_BLOCK_NOT_USED:
+		case BUF_BLOCK_MEMORY:
+		case BUF_BLOCK_REMOVE_HASH:
+			ut_error; /* these states must not occur in the LRU list */
+			break;
+		case BUF_BLOCK_FILE_PAGE:
+			ut_ad(reinterpret_cast<buf_block_t*>(bpage)
+			      ->in_unzip_LRU_list
+			      == bpage->belongs_to_unzip_LRU()); /* fall through */
+		case BUF_BLOCK_ZIP_PAGE:
+			break;
+		}
+
+		if (bpage->is_old()) {
+			const buf_page_t*	prev
+				= UT_LIST_GET_PREV(LRU, bpage);
+			const buf_page_t*	next
+				= UT_LIST_GET_NEXT(LRU, bpage);
+			/* Old blocks: one run from LRU_old to the list end. */
+			if (!old_len++) {
+				ut_a(buf_pool.LRU_old == bpage);
+			} else {
+				ut_a(!prev || prev->is_old());
+			}
+
+			ut_a(!next || next->is_old());
+		}
+	}
+
+	ut_a(buf_pool.LRU_old_len == old_len);
+
+	CheckInFreeList::validate();
+
+	for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.free);
+	     bpage != NULL;
+	     bpage = UT_LIST_GET_NEXT(list, bpage)) {
+		/* Every block in the free list must be unused. */
+		ut_a(bpage->state() == BUF_BLOCK_NOT_USED);
+	}
+
+	CheckUnzipLRUAndLRUList::validate();
+
+	for (buf_block_t* block = UT_LIST_GET_FIRST(buf_pool.unzip_LRU);
+	     block != NULL;
+	     block = UT_LIST_GET_NEXT(unzip_LRU, block)) {
+		/* Every unzip_LRU block must also be in the LRU list. */
+		ut_ad(block->in_unzip_LRU_list);
+		ut_ad(block->page.in_LRU_list);
+		ut_a(block->page.belongs_to_unzip_LRU());
+	}
+
+	mysql_mutex_unlock(&buf_pool.mutex);
+}
+#endif /* UNIV_DEBUG */
+
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG
+/** Print every block of buf_pool.LRU to stderr, holding buf_pool.mutex. */
+void buf_LRU_print()
+{
+	mysql_mutex_lock(&buf_pool.mutex);
+
+	buf_page_t*	cur = UT_LIST_GET_FIRST(buf_pool.LRU);
+	while (cur != NULL) {
+		const page_id_t	page(cur->id());
+		fprintf(stderr, "BLOCK space %u page %u ",
+			page.space(), page.page_no());
+		/* Attributes that are printed only when they are set. */
+		if (cur->is_old()) {
+			fputs("old ", stderr);
+		}
+
+		const uint32_t	n_fix = cur->buf_fix_count();
+		if (n_fix) {
+			fprintf(stderr, "buffix count %u ", n_fix);
+		}
+
+		const auto	fix_kind = cur->io_fix();
+		if (fix_kind) {
+			fprintf(stderr, "io_fix %d ", fix_kind);
+		}
+
+		if (cur->oldest_modification()) {
+			fputs("modif. ", stderr);
+		}
+
+		/* The page type and index id are read from the page
+		itself; where it resides depends on the block state. */
+		const auto	page_state = cur->state();
+		if (page_state == BUF_BLOCK_FILE_PAGE) {
+			const byte*	frame = buf_block_get_frame(
+				reinterpret_cast<buf_block_t*>(cur));
+			fprintf(stderr, "\ntype %u index id " IB_ID_FMT "\n",
+				fil_page_get_type(frame),
+				btr_page_get_index_id(frame));
+		} else if (page_state == BUF_BLOCK_ZIP_PAGE) {
+			const byte*	frame = cur->zip.data;
+			fprintf(stderr, "\ntype %u size " ULINTPF
+				" index id " IB_ID_FMT "\n",
+				fil_page_get_type(frame),
+				cur->zip_size(),
+				btr_page_get_index_id(frame));
+		} else {
+			fprintf(stderr, "\n!state %d!\n", page_state);
+		}
+
+		cur = UT_LIST_GET_NEXT(LRU, cur);
+	}
+
+	mysql_mutex_unlock(&buf_pool.mutex);
+}
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */
diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc
new file mode 100644
index 00000000..253a2542
--- /dev/null
+++ b/storage/innobase/buf/buf0rea.cc
@@ -0,0 +1,785 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0rea.cc
+The database buffer read
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "univ.i"
+#include <mysql/service_thd_wait.h>
+
+#include "buf0rea.h"
+#include "fil0fil.h"
+#include "mtr0mtr.h"
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "buf0lru.h"
+#include "buf0buddy.h"
+#include "buf0dblwr.h"
+#include "ibuf0ibuf.h"
+#include "log0recv.h"
+#include "trx0sys.h"
+#include "os0file.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+
+/** If more than buf_pool.curr_size divided by this number of reads are
+pending, read-ahead is not done: this is to prevent flooding the buffer
+pool with i/o-fixed buffer blocks */
+#define BUF_READ_AHEAD_PEND_LIMIT	2
+
+/** Remove the sentinel block for the watch before replacing it with a
+real block. watch_unset() or watch_occurred() will notice
+that the block has been replaced with the real block.
+@param watch   sentinel; its page_hash latch must be held in write mode */
+inline void buf_pool_t::watch_remove(buf_page_t *watch)
+{
+  ut_ad(hash_lock_get(watch->id())->is_write_locked());
+  ut_a(watch_is_sentinel(*watch));
+  if (watch->buf_fix_count()) /* the sentinel is still buffer-fixed */
+  {
+    ut_ad(watch->in_page_hash);
+    ut_d(watch->in_page_hash= false);
+    HASH_DELETE(buf_page_t, hash, &page_hash, watch->id().fold(), watch);
+    watch->set_buf_fix_count(0); /* callers copy the count to the real block first */
+  }
+  ut_ad(!watch->in_page_hash);
+  watch->set_state(BUF_BLOCK_NOT_USED);
+  watch->id_= page_id_t(~0ULL); /* reset the page id to all-ones */
+}
+
+/** Initialize a page for read to the buffer buf_pool. If the page is
+(1) already in buf_pool, or
+(2) if we specify to read only ibuf pages and the page is not an ibuf page, or
+(3) if the space is deleted or being deleted (NOTE(review): not checked here),
+then this function does nothing.
+Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock
+on the buffer frame. The io-handler must take care that the flag is cleared
+and the lock released later.
+@param[in]	mode			BUF_READ_IBUF_PAGES_ONLY, ...
+@param[in]	page_id			page id
+@param[in]	zip_size		ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	unzip			whether the uncompressed page is
+					requested (for ROW_FORMAT=COMPRESSED)
+@return pointer to the block
+@retval	NULL	if no read is to be issued (see the cases above) */
+static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
+                                          ulint zip_size, bool unzip)
+{
+  mtr_t mtr;
+
+  if (mode == BUF_READ_IBUF_PAGES_ONLY)
+  {
+    /* It is a read-ahead within an ibuf routine */
+    ut_ad(!ibuf_bitmap_page(page_id, zip_size));
+    ibuf_mtr_start(&mtr);
+
+    if (!recv_no_ibuf_operations && !ibuf_page(page_id, zip_size, &mtr))
+    {
+      ibuf_mtr_commit(&mtr);
+      return nullptr;
+    }
+  }
+  else
+    ut_ad(mode == BUF_READ_ANY_PAGE);
+
+  buf_page_t *bpage= nullptr;
+  buf_block_t *block= nullptr;
+  if (!zip_size || unzip || recv_recovery_is_on())
+  {
+    block= buf_LRU_get_free_block(false);
+    block->initialise(page_id, zip_size);
+    /* We set a pass-type x-lock on the frame because then
+    the same thread which called for the read operation
+    (and is running now at this point of code) can wait
+    for the read to complete by waiting for the x-lock on
+    the frame; if the x-lock were recursive, the same
+    thread would illegally get the x-lock before the page
+    read is completed.  The x-lock will be released
+    in buf_page_read_complete() by the io-handler thread. */
+    rw_lock_x_lock_gen(&block->lock, BUF_IO_READ);
+  }
+
+  const ulint fold= page_id.fold();
+
+  mysql_mutex_lock(&buf_pool.mutex);
+
+  /* We must acquire hash_lock this early to prevent
+  a race condition with buf_pool_t::watch_remove() */
+  page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
+  hash_lock->write_lock();
+
+  buf_page_t *hash_page= buf_pool.page_hash_get_low(page_id, fold);
+  if (hash_page && !buf_pool.watch_is_sentinel(*hash_page))
+  {
+    /* The page is already in the buffer pool. */
+    hash_lock->write_unlock();
+    if (block)
+    {
+      rw_lock_x_unlock_gen(&block->lock, BUF_IO_READ);
+      buf_LRU_block_free_non_file_page(block);
+    }
+    goto func_exit;
+  }
+
+  if (UNIV_LIKELY(block != nullptr))
+  {
+    bpage= &block->page;
+
+    /* Insert into the hash table of file pages */
+    if (hash_page)
+    {
+      /* Preserve the reference count. */
+      auto buf_fix_count= hash_page->buf_fix_count();
+      ut_a(buf_fix_count > 0);
+      block->page.add_buf_fix_count(buf_fix_count);
+      buf_pool.watch_remove(hash_page);
+    }
+
+    block->page.set_io_fix(BUF_IO_READ);
+    block->page.set_state(BUF_BLOCK_FILE_PAGE);
+    ut_ad(!block->page.in_page_hash);
+    ut_d(block->page.in_page_hash= true);
+    HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, bpage);
+    hash_lock->write_unlock();
+
+    /* The block must be put to the LRU list, to the old blocks */
+    buf_LRU_add_block(bpage, true/* to old blocks */);
+
+    if (UNIV_UNLIKELY(zip_size))
+    {
+      /* buf_pool.mutex may be released and reacquired by
+      buf_buddy_alloc(). We must defer this operation until after the
+      block descriptor has been added to buf_pool.LRU and
+      buf_pool.page_hash. */
+      block->page.zip.data= static_cast<page_zip_t*>
+        (buf_buddy_alloc(zip_size));
+
+      /* To maintain the invariant
+      block->in_unzip_LRU_list == block->page.belongs_to_unzip_LRU()
+      we have to add this block to unzip_LRU
+      after block->page.zip.data is set. */
+      ut_ad(block->page.belongs_to_unzip_LRU());
+      buf_unzip_LRU_add_block(block, TRUE);
+    }
+  }
+  else
+  {
+    hash_lock->write_unlock();
+
+    /* The compressed page must be allocated before the
+    control block (bpage), in order to avoid the
+    invocation of buf_buddy_relocate_block() on
+    uninitialized data. */
+    bool lru= false;
+    void *data= buf_buddy_alloc(zip_size, &lru);
+
+    hash_lock->write_lock();
+
+    /* If buf_buddy_alloc() allocated storage from the LRU list,
+    it released and reacquired buf_pool.mutex.  Thus, we must
+    check the page_hash again, as it may have been modified. */
+    if (UNIV_UNLIKELY(lru))
+    {
+      hash_page= buf_pool.page_hash_get_low(page_id, fold);
+
+      if (UNIV_UNLIKELY(hash_page && !buf_pool.watch_is_sentinel(*hash_page)))
+      {
+        /* The block was added by some other thread. */
+        hash_lock->write_unlock();
+        buf_buddy_free(data, zip_size);
+        goto func_exit;
+      }
+    }
+
+    bpage= buf_page_alloc_descriptor();
+
+    page_zip_des_init(&bpage->zip);
+    page_zip_set_size(&bpage->zip, zip_size);
+    bpage->zip.data = (page_zip_t*) data;
+
+    bpage->init(BUF_BLOCK_ZIP_PAGE, page_id);
+
+    if (hash_page)
+    {
+      /* Preserve the reference count. It can be 0 if
+      buf_pool_t::watch_unset() is executing concurrently,
+      waiting for buf_pool.mutex, which we are holding. */
+      bpage->add_buf_fix_count(hash_page->buf_fix_count());
+      buf_pool.watch_remove(hash_page);
+    }
+
+    ut_ad(!bpage->in_page_hash);
+    ut_d(bpage->in_page_hash= true);
+    HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, bpage);
+    bpage->set_io_fix(BUF_IO_READ);
+    hash_lock->write_unlock();
+
+    /* The block must be put to the LRU list, to the old blocks.
+    The zip size is already set into the page zip */
+    buf_LRU_add_block(bpage, true/* to old blocks */);
+  }
+
+  mysql_mutex_unlock(&buf_pool.mutex);
+  buf_pool.n_pend_reads++; /* incremented after releasing buf_pool.mutex */
+  goto func_exit_no_mutex;
+func_exit:
+  mysql_mutex_unlock(&buf_pool.mutex);
+func_exit_no_mutex:
+  if (mode == BUF_READ_IBUF_PAGES_ONLY)
+    ibuf_mtr_commit(&mtr);
+
+  ut_ad(!bpage || bpage->in_file());
+
+  return bpage;
+}
+
+/** Low-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there, in which case does nothing.
+Sets the io_fix flag and sets an exclusive lock on the buffer frame. The
+flag is cleared and the x-lock released by an i/o-handler thread.
+
+@param[out] err		DB_SUCCESS, or the error reported by space->io()
+			or, for a synchronous read, by
+			buf_page_read_complete()
+@param[in,out] space	tablespace; released here if nothing is read or when a synchronous read completes
+@param[in] sync		true if synchronous aio is desired
+@param[in] mode		BUF_READ_IBUF_PAGES_ONLY, ...,
+@param[in] page_id	page id
+@param[in] zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] unzip	true=request uncompressed page
+@return whether a read request was queued (if sync: whether the page was read and completed without error) */
+static
+bool
+buf_read_page_low(
+	dberr_t*		err,
+	fil_space_t*		space,
+	bool			sync,
+	ulint			mode,
+	const page_id_t		page_id,
+	ulint			zip_size,
+	bool			unzip)
+{
+	buf_page_t*	bpage;
+
+	*err = DB_SUCCESS;
+
+	if (buf_dblwr.is_inside(page_id)) {
+		ib::error() << "Trying to read doublewrite buffer page "
+			<< page_id;
+		ut_ad(0);
+nothing_read:
+		space->release();
+		return false;
+	}
+
+	if (sync) { /* already synchronous: nothing to decide */
+	} else if (trx_sys_hdr_page(page_id)
+		   || ibuf_bitmap_page(page_id, zip_size)
+		   || (!recv_no_ibuf_operations
+		       && ibuf_page(page_id, zip_size, nullptr))) {
+
+		/* Trx sys header is so low in the latching order that we play
+		safe and do not leave the i/o-completion to an asynchronous
+		i/o-thread. Change buffer pages must always be read with
+		synchronous i/o, to make sure they do not get involved in
+		thread deadlocks. */
+		sync = true;
+	}
+
+	/* The following call will also check if the tablespace does not exist
+	or is being dropped; if we succeed in initing the page in the buffer
+	pool for read, then DISCARD cannot proceed until the read has
+	completed */
+	bpage = buf_page_init_for_read(mode, page_id, zip_size, unzip);
+
+	if (bpage == NULL) {
+		goto nothing_read;
+	}
+
+	ut_ad(bpage->in_file());
+
+	if (sync) {
+		thd_wait_begin(nullptr, THD_WAIT_DISKIO);
+	}
+
+	DBUG_LOG("ib_buf",
+		 "read page " << page_id << " zip_size=" << zip_size
+		 << " unzip=" << unzip << ',' << (sync ? "sync" : "async"));
+
+	void*	dst;
+
+	if (zip_size) {
+		dst = bpage->zip.data;
+	} else {
+		ut_a(bpage->state() == BUF_BLOCK_FILE_PAGE);
+
+		dst = ((buf_block_t*) bpage)->frame;
+	}
+
+	const ulint len = zip_size ? zip_size : srv_page_size;
+
+	auto fio = space->io(IORequest(sync
+				       ? IORequest::READ_SYNC
+				       : IORequest::READ_ASYNC),
+			     page_id.page_no() * len, len, dst, bpage);
+	*err= fio.err;
+
+	if (UNIV_UNLIKELY(fio.err != DB_SUCCESS)) {
+		if (!sync || fio.err == DB_TABLESPACE_DELETED) {
+			buf_pool.corrupted_evict(bpage);
+			return false; /* NOTE(review): if sync, thd_wait_end() is not called on this path - confirm */
+		}
+
+		ut_error; /* any other error of a synchronous read is fatal */
+	}
+
+	if (sync) {
+		thd_wait_end(NULL);
+
+		/* The i/o was already completed in space->io() */
+		*err = buf_page_read_complete(bpage, *fio.node);
+		space->release();
+
+		if (*err != DB_SUCCESS) {
+			return false;
+		}
+	}
+
+	return true;
+}
+
+/** Applies a random read-ahead in buf_pool if there are at least a threshold
+value of accessed pages from the random read-ahead area. Does not read any
+page, not even the one at the position page_id, if the read-ahead
+mechanism is not activated. NOTE 1: the calling thread may own latches on
+pages: to avoid deadlocks this function must be written such that it cannot
+end up waiting for these latches! NOTE 2: the calling thread must want
+access to the page given: this rule is set to prevent unintended read-aheads
+performed by ibuf routines, a situation which could result in a deadlock if
+the OS does not support asynchronous i/o.
+@param[in]	page_id		page id of a page which the current thread
+wants to access
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	ibuf		whether we are inside ibuf routine
+@return number of page read requests issued; NOTE that if we read ibuf
+pages, it may happen that the page at the given page number does not
+get read even if we return a positive value! */
+ulint
+buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf)
+{
+  if (!srv_random_read_ahead)
+    return 0;
+
+  if (srv_startup_is_before_trx_rollback_phase)
+    /* No read-ahead to avoid thread deadlocks */
+    return 0;
+
+  if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id))
+    /* If it is an ibuf bitmap page or trx sys hdr, we do no
+    read-ahead, as that could break the ibuf page access order */
+    return 0;
+
+  if (buf_pool.n_pend_reads > buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT)
+    return 0;
+
+  fil_space_t* space= fil_space_t::get(page_id.space());
+  if (!space)
+    return 0;
+
+  const uint32_t buf_read_ahead_area= buf_pool.read_ahead_area;
+  ulint count= 5 + buf_read_ahead_area / 8; /* required number of recently accessed pages */
+  const page_id_t low= page_id - (page_id.page_no() % buf_read_ahead_area);
+  page_id_t high= low + buf_read_ahead_area;
+  high.set_page_no(std::min(high.page_no(), space->last_page_number()));
+
+  /* Count how many blocks in the area have been recently accessed,
+  that is, reside near the start of the LRU list. */
+
+  for (page_id_t i= low; i < high; ++i)
+  {
+    const ulint fold= i.fold();
+    page_hash_latch *hash_lock= buf_pool.page_hash.lock<false>(fold);
+    const buf_page_t *bpage= buf_pool.page_hash_get_low(i, fold);
+    bool found= bpage && bpage->is_accessed() && buf_page_peek_if_young(bpage);
+    hash_lock->read_unlock();
+    if (found && !--count)
+      goto read_ahead;
+  }
+
+no_read_ahead:
+  space->release();
+  return 0;
+
+read_ahead:
+  if (space->is_stopping())
+    goto no_read_ahead;
+
+  /* count == 0 here. Read all the suitable blocks within the area */
+  const ulint ibuf_mode= ibuf ? BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE;
+
+  for (page_id_t i= low; i < high; ++i)
+  {
+    if (ibuf_bitmap_page(i, zip_size))
+      continue;
+    if (space->is_stopping())
+      break;
+    dberr_t err;
+    space->reacquire();
+    if (buf_read_page_low(&err, space, false, ibuf_mode, i, zip_size, false))
+      count++; /* from here on, count is the number of requests issued */
+  }
+
+  if (count)
+    DBUG_PRINT("ib_buf", ("random read-ahead %zu pages from %s: %u",
+			  count, space->chain.start->name,
+			  low.page_no()));
+  space->release();
+
+  /* Read ahead is considered one I/O operation for the purpose of
+  LRU policy decision. */
+  buf_LRU_stat_inc_io();
+
+  buf_pool.stat.n_ra_pages_read_rnd+= count;
+  srv_stats.buf_pool_reads.add(count);
+  return count;
+}
+
+/** High-level function which reads a page from a file to buf_pool
+if it is not already there. Sets the io_fix and an exclusive lock
+on the buffer frame. The flag is cleared and the x-lock
+released by the i/o-handler thread.
+@param[in]	page_id		page id
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@retval DB_SUCCESS if the page was read and is not corrupted,
+@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted,
+@retval DB_DECRYPTION_FAILED if page post encryption checksum matches but
+after decryption normal page checksum does not match.
+@retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */
+dberr_t buf_read_page(const page_id_t page_id, ulint zip_size)
+{
+  if (fil_space_t *tablespace= fil_space_t::get(page_id.space()))
+  {
+    dberr_t status;
+    const bool was_read= buf_read_page_low(&status, tablespace, true,
+                                           BUF_READ_ANY_PAGE, page_id,
+                                           zip_size, false);
+    if (was_read)
+      srv_stats.buf_pool_reads.add(1);
+    /* The LRU I/O statistics are updated regardless of the outcome. */
+    buf_LRU_stat_inc_io();
+    return status;
+  }
+  ib::info() << "trying to read page " << page_id
+             << " in nonexisting or being-dropped tablespace";
+  return DB_TABLESPACE_DELETED;
+}
+
+/** High-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there. Sets the io_fix flag and sets
+an exclusive lock on the buffer frame. The flag is cleared and the x-lock
+released by the i/o-handler thread.
+@param[in,out]	space		tablespace
+@param[in]	page_id		page id
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	sync		true if synchronous aio is desired */
+void buf_read_page_background(fil_space_t *space, const page_id_t page_id,
+			      ulint zip_size, bool sync)
+{
+	dberr_t		status;
+	const bool	queued = buf_read_page_low(&status, space, sync,
+						    BUF_READ_ANY_PAGE,
+						    page_id, zip_size, false);
+
+	if (queued) {
+		srv_stats.buf_pool_reads.add(1);
+	}
+
+	/* Report the outcome according to its severity. */
+	if (status == DB_SUCCESS || status == DB_ERROR) {
+		/* Nothing to report. */
+	} else if (status == DB_TABLESPACE_DELETED) {
+		ib::info() << "trying to read page " << page_id
+			<< " in the background"
+			" in a non-existing or being-dropped tablespace";
+	} else if (status == DB_PAGE_CORRUPTED
+		   || status == DB_DECRYPTION_FAILED) {
+		ib::error()
+			<< "Background Page read failed to "
+			"read or decrypt " << page_id;
+	} else {
+		/* Any other error code is treated as fatal. */
+		ib::fatal() << "Error " << status << " in background read of "
+			<< page_id;
+	}
+
+	/* buf_LRU_stat_inc_io() is deliberately not called here. That
+	counter feeds the heuristics that decide about evicting the
+	uncompressed version of compressed pages from the buffer pool.
+	This function is called from buffer pool load; those reads are
+	deliberate and not part of the normal workload, so they are
+	ignored in the heuristics. */
+}
+
+/** Applies linear read-ahead if in the buf_pool the page is a border page of
+a linear read-ahead area and all the pages in the area have been accessed.
+Does not read any page if the read-ahead mechanism is not activated. Note
+that the algorithm looks at the 'natural' adjacent successor and
+predecessor of the page, which on the leaf level of a B-tree are the next
+and previous page in the chain of leaves. To know these, the page specified
+by page_id must already be present in the buf_pool. Thus, the
+natural way to use this function is to call it when a page in the buf_pool
+is accessed the first time, calling this function just after it has been
+bufferfixed.
+NOTE 1: as this function looks at the natural predecessor and successor
+fields on the page, what happens, if these are not initialized to any
+sensible value? No problem, before applying read-ahead we check that the
+area to read is within the span of the space, if not, read-ahead is not
+applied. An uninitialized value may result in a useless read operation, but
+only very improbably.
+NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
+function must be written such that it cannot end up waiting for these
+latches!
+NOTE 3: the calling thread must want access to the page given: this rule is
+set to prevent unintended read-aheads performed by ibuf routines, a situation
+which could result in a deadlock if the OS does not support asynchronous io.
+@param[in]	page_id		page id; see NOTE 3 above
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	ibuf		whether we are inside an ibuf routine
+@return number of page read requests issued */
+ulint
+buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf)
+{
+  /* check if readahead is disabled */
+  if (!srv_read_ahead_threshold)
+    return 0;
+
+  if (srv_startup_is_before_trx_rollback_phase)
+    /* No read-ahead to avoid thread deadlocks */
+    return 0;
+
+  if (buf_pool.n_pend_reads > buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT)
+    return 0;
+
+  const uint32_t buf_read_ahead_area= buf_pool.read_ahead_area;
+  const page_id_t low= page_id - (page_id.page_no() % buf_read_ahead_area);
+  const page_id_t high_1= low + (buf_read_ahead_area - 1);
+
+  /* We will check that almost all pages in the area, from low to high_1
+  inclusive, have been accessed in the desired order. */
+  const bool descending= page_id == low;
+
+  if (!descending && page_id != high_1)
+    /* This is not a border page of the area */
+    return 0;
+
+  if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id))
+    /* If it is an ibuf bitmap page or trx sys hdr, we do no
+    read-ahead, as that could break the ibuf page access order */
+    return 0;
+
+  fil_space_t *space= fil_space_t::get(page_id.space());
+  if (!space)
+    return 0;
+
+  if (high_1.page_no() > space->last_page_number())
+  {
+    /* The area is not whole. */
+fail:
+    space->release();
+    return 0;
+  }
+
+  /* How many out of order accessed pages can we ignore
+  when working out the access pattern for linear readahead */
+  ulint count= std::min<ulint>(buf_pool_t::READ_AHEAD_PAGES -
+                               srv_read_ahead_threshold,
+                               uint32_t{buf_pool.read_ahead_area});
+  page_id_t new_low= low, new_high_1= high_1;
+  unsigned prev_accessed= 0;
+  for (page_id_t i= low, end= low + buf_read_ahead_area; i < end; ++i)
+  {
+    const ulint fold= i.fold();
+    page_hash_latch *hash_lock= buf_pool.page_hash.lock<false>(fold);
+    const buf_page_t* bpage= buf_pool.page_hash_get_low(i, fold);
+    if (i == page_id)
+    {
+      /* Read the natural predecessor and successor page addresses from
+      the page; NOTE that because the calling thread may have an x-latch
+      on the page, we do not acquire an s-latch on the page, this is to
+      prevent deadlocks. The hash_lock is only protecting the
+      buf_pool.page_hash for page i, not the bpage contents itself. */
+      if (!bpage)
+      {
+hard_fail:
+        hash_lock->read_unlock();
+        goto fail;
+      }
+      const byte *f;
+      switch (UNIV_EXPECT(bpage->state(), BUF_BLOCK_FILE_PAGE)) {
+      case BUF_BLOCK_FILE_PAGE:
+        f= reinterpret_cast<const buf_block_t*>(bpage)->frame;
+        break;
+      case BUF_BLOCK_ZIP_PAGE:
+        f= bpage->zip.data;
+        break;
+      default:
+        goto hard_fail;
+      }
+
+      uint32_t prev= mach_read_from_4(my_assume_aligned<4>(f + FIL_PAGE_PREV));
+      uint32_t next= mach_read_from_4(my_assume_aligned<4>(f + FIL_PAGE_NEXT));
+      if (prev == FIL_NULL || next == FIL_NULL)
+        goto hard_fail;
+      page_id_t id= page_id;
+      if (descending && next - 1 == page_id.page_no())
+        id.set_page_no(prev);
+      else if (!descending && prev + 1 == page_id.page_no())
+        id.set_page_no(next);
+      else
+        goto hard_fail; /* Successor or predecessor not in the right order */
+
+      new_low= id - (id.page_no() % buf_read_ahead_area);
+      new_high_1= new_low + (buf_read_ahead_area - 1);
+
+      if (id != new_low && id != new_high_1)
+        /* This is not a border page of the area: return */
+        goto hard_fail;
+      if (new_high_1.page_no() > space->last_page_number())
+        /* The area is not whole */
+        goto hard_fail;
+    }
+    else if (!bpage)
+    {
+failed:
+      hash_lock->read_unlock();
+      if (count && --count) /* never wrap around when the tolerance is 0 */
+        continue;
+      goto fail;
+    }
+
+    const unsigned accessed= bpage->is_accessed();
+    if (!accessed)
+      goto failed;
+    /* buf_page_t::is_accessed() returns the time of the first access.
+    If some blocks of the extent already existed in the buffer pool, the
+    first access times may be nonmonotonic even though the latest
+    accesses were linear; the threshold (srv_read_ahead_threshold) helps
+    a little against this. We visit the pages in ascending order, so the
+    times must not increase if descending, or decrease otherwise. */
+    bool fail= prev_accessed &&
+      (descending ? prev_accessed < accessed : prev_accessed > accessed);
+    prev_accessed= accessed;
+    if (fail)
+      goto failed;
+    hash_lock->read_unlock();
+  }
+
+  /* If we got this far, read-ahead can be sensible: read the new area */
+  const ulint ibuf_mode= ibuf ? BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE;
+  count= 0;
+  for (page_id_t i= new_low, end= new_low + buf_read_ahead_area; i < end; ++i)
+  {
+    if (ibuf_bitmap_page(i, zip_size))
+      continue;
+    if (space->is_stopping())
+      break;
+    dberr_t err;
+    space->reacquire();
+    count+= buf_read_page_low(&err, space, false, ibuf_mode, i, zip_size,
+                              false);
+  }
+
+  if (count)
+    DBUG_PRINT("ib_buf", ("linear read-ahead %zu pages from %s: %u",
+                          count, space->chain.start->name,
+                          new_low.page_no()));
+  space->release();
+
+  /* Read ahead is considered one I/O operation for the purpose of
+  LRU policy decision. */
+  buf_LRU_stat_inc_io();
+
+  buf_pool.stat.n_ra_pages_read+= count;
+  return count;
+}
+
+/** Issues read requests for pages which recovery wants to read in.
+@param[in]	space_id	tablespace id
+@param[in]	page_nos	array of page numbers to read, with the
+highest page number the last in the array
+@param[in]	n		number of page numbers in the array */
+void buf_read_recv_pages(ulint space_id, const uint32_t* page_nos, ulint n)
+{
+	fil_space_t* space = fil_space_t::get(space_id);
+
+	if (!space) {
+		/* The tablespace is missing or unreadable: do nothing */
+		return;
+	}
+
+	const ulint zip_size = space->zip_size();
+
+	for (ulint i = 0; i < n; i++) {
+
+		/* Ignore if the page already present in freed ranges. */
+		if (space->freed_ranges.contains(page_nos[i])) {
+			continue;
+		}
+
+		const page_id_t	cur_page_id(space_id, page_nos[i]);
+		/* Throttle: wait while the pending reads reach the limit. */
+		ulint limit = 0;
+		for (ulint j = 0; j < buf_pool.n_chunks; j++) {
+			limit += buf_pool.chunks[j].size / 2;
+		}
+
+		for (ulint count = 0; buf_pool.n_pend_reads >= limit; ) {
+			os_thread_sleep(10000); /* presumably 10 ms, matching count / 100 below */
+
+			if (!(++count % 1000)) {
+
+				ib::error()
+					<< "Waited for " << count / 100
+					<< " seconds for "
+					<< buf_pool.n_pend_reads
+					<< " pending reads";
+			}
+		}
+		/* Asynchronous read (sync=false) with unzip=true. */
+		dberr_t err;
+		space->reacquire();
+		buf_read_page_low(&err, space, false,
+				  BUF_READ_ANY_PAGE, cur_page_id, zip_size,
+				  true);
+
+		if (err == DB_DECRYPTION_FAILED || err == DB_PAGE_CORRUPTED) {
+			ib::error() << "Recovery failed to read or decrypt "
+				<< cur_page_id;
+		}
+	}
+
+	/* n is ulint, hence %zu as in the other DBUG_PRINT calls here. */
+	DBUG_PRINT("ib_buf", ("recovery read (%zu pages) for %s", n,
+			      space->chain.start->name));
+	space->release();
+}