summaryrefslogtreecommitdiffstats
path: root/storage/innobase/buf/buf0buf.cc
diff options
context:
space:
mode:
Diffstat (limited to 'storage/innobase/buf/buf0buf.cc')
-rw-r--r--storage/innobase/buf/buf0buf.cc4728
1 files changed, 4728 insertions, 0 deletions
diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc
new file mode 100644
index 00000000..b658bdfc
--- /dev/null
+++ b/storage/innobase/buf/buf0buf.cc
@@ -0,0 +1,4728 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+Copyright (c) 2013, 2021, MariaDB Corporation.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0buf.cc
+The database buffer buf_pool
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "assume_aligned.h"
+#include "mtr0types.h"
+#include "mach0data.h"
+#include "buf0buf.h"
+#include "buf0checksum.h"
+#include "ut0crc32.h"
+#include <string.h>
+
+#ifndef UNIV_INNOCHECKSUM
+#include "my_cpu.h"
+#include "mem0mem.h"
+#include "btr0btr.h"
+#include "fil0fil.h"
+#include "fil0crypt.h"
+#include "buf0buddy.h"
+#include "buf0dblwr.h"
+#include "lock0lock.h"
+#include "sync0rw.h"
+#include "btr0sea.h"
+#include "ibuf0ibuf.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "log0log.h"
+#include "dict0stats_bg.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "dict0dict.h"
+#include "log0recv.h"
+#include "srv0mon.h"
+#include "log0crypt.h"
+#include "fil0pagecompress.h"
+#endif /* !UNIV_INNOCHECKSUM */
+#include "page0zip.h"
+#include "sync0sync.h"
+#include "buf0dump.h"
+#include <map>
+#include <sstream>
+
+using st_::span;
+
+#ifdef HAVE_LIBNUMA
+#include <numa.h>
+#include <numaif.h>
+struct set_numa_interleave_t
+{
+ set_numa_interleave_t()
+ {
+ if (srv_numa_interleave) {
+
+ struct bitmask *numa_mems_allowed = numa_get_mems_allowed();
+ ib::info() << "Setting NUMA memory policy to"
+ " MPOL_INTERLEAVE";
+ if (set_mempolicy(MPOL_INTERLEAVE,
+ numa_mems_allowed->maskp,
+ numa_mems_allowed->size) != 0) {
+
+ ib::warn() << "Failed to set NUMA memory"
+ " policy to MPOL_INTERLEAVE: "
+ << strerror(errno);
+ }
+ numa_bitmask_free(numa_mems_allowed);
+ }
+ }
+
+ ~set_numa_interleave_t()
+ {
+ if (srv_numa_interleave) {
+
+ ib::info() << "Setting NUMA memory policy to"
+ " MPOL_DEFAULT";
+ if (set_mempolicy(MPOL_DEFAULT, NULL, 0) != 0) {
+ ib::warn() << "Failed to set NUMA memory"
+ " policy to MPOL_DEFAULT: "
+ << strerror(errno);
+ }
+ }
+ }
+};
+
+#define NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE set_numa_interleave_t scoped_numa
+#else
+#define NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE
+#endif /* HAVE_LIBNUMA */
+
+/*
+ IMPLEMENTATION OF THE BUFFER POOL
+ =================================
+
+ Buffer frames and blocks
+ ------------------------
+Following the terminology of Gray and Reuter, we call the memory
+blocks where file pages are loaded buffer frames. For each buffer
+frame there is a control block, or shortly, a block, in the buffer
+control array. The control info which does not need to be stored
+in the file along with the file page, resides in the control block.
+
+ Buffer pool struct
+ ------------------
+The buffer buf_pool contains a single mutex which protects all the
+control data structures of the buf_pool. The content of a buffer frame is
+protected by a separate read-write lock in its control block, though.
+These locks can be locked and unlocked without owning the buf_pool.mutex.
+The OS events in the buf_pool struct can be waited for without owning the
+buf_pool.mutex.
+
+The buf_pool.mutex is a hot-spot in main memory, causing a lot of
+memory bus traffic on multiprocessor systems when processors
+alternately access the mutex. On our Pentium, the mutex is accessed
+maybe every 10 microseconds. We gave up the solution to have mutexes
+for each control block, for instance, because it seemed to be
+complicated.
+
+A solution to reduce mutex contention of the buf_pool.mutex is to
+create a separate mutex for the page hash table. On Pentium,
+accessing the hash table takes 2 microseconds, about half
+of the total buf_pool.mutex hold time.
+
+ Control blocks
+ --------------
+
+The control block contains, for instance, the bufferfix count
+which is incremented when a thread wants a file page to be fixed
+in a buffer frame. The bufferfix operation does not lock the
+contents of the frame, however. For this purpose, the control
+block contains a read-write lock.
+
+The buffer frames have to be aligned so that the start memory
+address of a frame is divisible by the universal page size, which
+is a power of two.
+
+The control blocks containing file pages are put to a hash table
+according to the file address of the page.
+We could speed up the access to an individual page by using
+"pointer swizzling": we could replace the page references on
+non-leaf index pages by direct pointers to the page, if it exists
+in the buf_pool. We could make a separate hash table where we could
+chain all the page references in non-leaf pages residing in the buf_pool,
+using the page reference as the hash key,
+and at the time of reading of a page update the pointers accordingly.
+Drawbacks of this solution are added complexity and,
+possibly, extra space required on non-leaf pages for memory pointers.
+A simpler solution is just to speed up the hash table mechanism
+in the database, using tables whose size is a power of 2.
+
+ Lists of blocks
+ ---------------
+
+There are several lists of control blocks.
+
+The free list (buf_pool.free) contains blocks which are currently not
+used.
+
+The common LRU list contains all the blocks holding a file page
+except those for which the bufferfix count is non-zero.
+The pages are in the LRU list roughly in the order of the last
+access to the page, so that the oldest pages are at the end of the
+list. We also keep a pointer to near the end of the LRU list,
+which we can use when we want to artificially age a page in the
+buf_pool. This is used if we know that some page is not needed
+again for some time: we insert the block right after the pointer,
+causing it to be replaced sooner than would normally be the case.
+Currently this aging mechanism is used for read-ahead mechanism
+of pages, and it can also be used when there is a scan of a full
+table which cannot fit in the memory. Putting the pages near the
+end of the LRU list, we make sure that most of the buf_pool stays
+in the main memory, undisturbed.
+
+The unzip_LRU list contains a subset of the common LRU list. The
+blocks on the unzip_LRU list hold a compressed file page and the
+corresponding uncompressed page frame. A block is in unzip_LRU if and
+only if the predicate block->page.belongs_to_unzip_LRU()
+holds. The blocks in unzip_LRU will be in same order as they are in
+the common LRU list. That is, each manipulation of the common LRU
+list will result in the same manipulation of the unzip_LRU list.
+
+The chain of modified blocks (buf_pool.flush_list) contains the blocks
+holding persistent file pages that have been modified in the memory
+but not written to disk yet. The block with the oldest modification
+which has not yet been written to disk is at the end of the chain.
+The access to this list is protected by buf_pool.flush_list_mutex.
+
+The control blocks for uncompressed pages are accessible via
+buf_block_t objects that are reachable via buf_pool.chunks[].
+The control blocks (buf_page_t) of those ROW_FORMAT=COMPRESSED pages
+that are not in buf_pool.flush_list and for which no uncompressed
+page has been allocated in buf_pool are only accessible via
+buf_pool.LRU.
+
+The chains of free memory blocks (buf_pool.zip_free[]) are used by
+the buddy allocator (buf0buddy.cc) to keep track of currently unused
+memory blocks of size sizeof(buf_page_t)..srv_page_size / 2. These
+blocks are inside the srv_page_size-sized memory blocks of type
+BUF_BLOCK_MEMORY that the buddy allocator requests from the buffer
+pool. The buddy allocator is solely used for allocating control
+blocks for compressed pages (buf_page_t) and compressed page frames.
+
+ Loading a file page
+ -------------------
+
+First, a victim block for replacement has to be found in the
+buf_pool. It is taken from the free list or searched for from the
+end of the LRU-list. An exclusive lock is reserved for the frame,
+the io_fix field is set in the block fixing the block in buf_pool,
+and the io-operation for loading the page is queued. The io-handler thread
+releases the X-lock on the frame and resets the io_fix field
+when the io operation completes.
+
+A thread may request the above operation using the function
+buf_page_get(). It may then continue to request a lock on the frame.
+The lock is granted when the io-handler releases the x-lock.
+
+ Read-ahead
+ ----------
+
+The read-ahead mechanism is intended to be intelligent and
+isolated from the semantically higher levels of the database
+index management. From the higher level we only need the
+information if a file page has a natural successor or
+predecessor page. On the leaf level of a B-tree index,
+these are the next and previous pages in the natural
+order of the pages.
+
+Let us first explain the read-ahead mechanism when the leafs
+of a B-tree are scanned in an ascending or descending order.
+When a read page is the first time referenced in the buf_pool,
+the buffer manager checks if it is at the border of a so-called
+linear read-ahead area. The tablespace is divided into these
+areas of size 64 blocks, for example. So if the page is at the
+border of such an area, the read-ahead mechanism checks if
+all the other blocks in the area have been accessed in an
+ascending or descending order. If this is the case, the system
+looks at the natural successor or predecessor of the page,
+checks if that is at the border of another area, and in this case
+issues read-requests for all the pages in that area. Maybe
+we could relax the condition that all the pages in the area
+have to be accessed: if data is deleted from a table, there may
+appear holes of unused pages in the area.
+
+A different read-ahead mechanism is used when there appears
+to be a random access pattern to a file.
+If a new page is referenced in the buf_pool, and several pages
+of its random access area (for instance, 32 consecutive pages
+in a tablespace) have recently been referenced, we may predict
+that the whole area may be needed in the near future, and issue
+the read requests for the whole area.
+*/
+
+#ifndef UNIV_INNOCHECKSUM
+void page_hash_latch::read_lock_wait()
+{
+ /* First, try busy spinning for a while. */
+ for (auto spin= srv_n_spin_wait_rounds; spin--; )
+ {
+ ut_delay(srv_spin_wait_delay);
+ if (read_trylock())
+ return;
+ }
+ /* Fall back to yielding to other threads. */
+ do
+ os_thread_yield();
+ while (!read_trylock());
+}
+
+void page_hash_latch::write_lock_wait()
+{
+ write_lock_wait_start();
+
+ /* First, try busy spinning for a while. */
+ for (auto spin= srv_n_spin_wait_rounds; spin--; )
+ {
+ if (write_lock_poll())
+ return;
+ ut_delay(srv_spin_wait_delay);
+ }
+
+ /* Fall back to yielding to other threads. */
+ do
+ os_thread_yield();
+ while (!write_lock_poll());
+}
+
+/** Value in microseconds */
+constexpr int WAIT_FOR_READ= 100;
+constexpr int WAIT_FOR_WRITE= 100;
+/** Number of attempts made to read in a page in the buffer pool */
+constexpr ulint BUF_PAGE_READ_MAX_RETRIES= 100;
+/** The maximum portion of the buffer pool that can be used for the
+read-ahead buffer. (Divide buf_pool size by this amount) */
+constexpr uint32_t BUF_READ_AHEAD_PORTION= 32;
+
+/** A 64KiB buffer of NUL bytes, for use in assertions and checks,
+and dummy default values of instantly dropped columns.
+Initially, BLOB field references are set to NUL bytes, in
+dtuple_convert_big_rec(). */
+const byte *field_ref_zero;
+
+/** The InnoDB buffer pool */
+buf_pool_t buf_pool;
+buf_pool_t::chunk_t::map *buf_pool_t::chunk_t::map_reg;
+buf_pool_t::chunk_t::map *buf_pool_t::chunk_t::map_ref;
+
+#ifdef UNIV_DEBUG
+/** Disable resizing buffer pool to make assertion code not expensive. */
+my_bool buf_disable_resize_buffer_pool_debug = TRUE;
+
+/** This is used to insert validation operations in execution
+in the debug version */
+static ulint buf_dbg_counter;
+#endif /* UNIV_DEBUG */
+
+/** Macro to determine whether the read of write counter is used depending
+on the io_type */
+#define MONITOR_RW_COUNTER(io_type, counter) \
+ ((io_type == BUF_IO_READ) \
+ ? (counter##_READ) \
+ : (counter##_WRITTEN))
+
+
+/** Decrypt a page for temporary tablespace.
+@param[in,out] tmp_frame Temporary buffer
+@param[in] src_frame Page to decrypt
+@return true if temporary tablespace decrypted, false if not */
+static bool buf_tmp_page_decrypt(byte* tmp_frame, byte* src_frame)
+{
+ if (buf_is_zeroes(span<const byte>(src_frame, srv_page_size))) {
+ return true;
+ }
+
+ /* read space & lsn */
+ uint header_len = FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION;
+
+ /* Copy FIL page header, it is not encrypted */
+ memcpy(tmp_frame, src_frame, header_len);
+
+ /* Calculate the offset where decryption starts */
+ const byte* src = src_frame + header_len;
+ byte* dst = tmp_frame + header_len;
+ uint srclen = uint(srv_page_size)
+ - (header_len + FIL_PAGE_FCRC32_CHECKSUM);
+ ulint offset = mach_read_from_4(src_frame + FIL_PAGE_OFFSET);
+
+ if (!log_tmp_block_decrypt(src, srclen, dst,
+ (offset * srv_page_size))) {
+ return false;
+ }
+
+ static_assert(FIL_PAGE_FCRC32_CHECKSUM == 4, "alignment");
+ memcpy_aligned<4>(tmp_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM,
+ src_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM,
+ FIL_PAGE_FCRC32_CHECKSUM);
+
+ memcpy_aligned<OS_FILE_LOG_BLOCK_SIZE>(src_frame, tmp_frame,
+ srv_page_size);
+ srv_stats.pages_decrypted.inc();
+ srv_stats.n_temp_blocks_decrypted.inc();
+
+ return true; /* page was decrypted */
+}
+
+/** Decrypt a page.
+@param[in,out] bpage Page control block
+@param[in] node data file
+@return whether the operation was successful */
+static bool buf_page_decrypt_after_read(buf_page_t *bpage,
+ const fil_node_t &node)
+{
+ ut_ad(node.space->referenced());
+ ut_ad(node.space->id == bpage->id().space());
+ const auto flags = node.space->flags;
+
+ byte* dst_frame = bpage->zip.data ? bpage->zip.data :
+ ((buf_block_t*) bpage)->frame;
+ bool page_compressed = node.space->is_compressed()
+ && buf_page_is_compressed(dst_frame, flags);
+ const page_id_t id(bpage->id());
+
+ if (id.page_no() == 0) {
+ /* File header pages are not encrypted/compressed */
+ return (true);
+ }
+
+ if (node.space->purpose == FIL_TYPE_TEMPORARY
+ && innodb_encrypt_temporary_tables) {
+ buf_tmp_buffer_t* slot = buf_pool.io_buf_reserve();
+ ut_a(slot);
+ slot->allocate();
+
+ if (!buf_tmp_page_decrypt(slot->crypt_buf, dst_frame)) {
+ slot->release();
+ ib::error() << "Encrypted page " << id
+ << " in file " << node.name;
+ return false;
+ }
+
+ slot->release();
+ return true;
+ }
+
+ /* Page is encrypted if encryption information is found from
+ tablespace and page contains used key_version. This is true
+ also for pages first compressed and then encrypted. */
+
+ buf_tmp_buffer_t* slot;
+ uint key_version = buf_page_get_key_version(dst_frame, flags);
+
+ if (page_compressed && !key_version) {
+ /* the page we read is unencrypted */
+ /* Find free slot from temporary memory array */
+decompress:
+ if (fil_space_t::full_crc32(flags)
+ && buf_page_is_corrupted(true, dst_frame, flags)) {
+ return false;
+ }
+
+ slot = buf_pool.io_buf_reserve();
+ ut_a(slot);
+ slot->allocate();
+
+decompress_with_slot:
+ ut_d(fil_page_type_validate(node.space, dst_frame));
+
+ ulint write_size = fil_page_decompress(
+ slot->crypt_buf, dst_frame, flags);
+ slot->release();
+ ut_ad(!write_size
+ || fil_page_type_validate(node.space, dst_frame));
+ ut_ad(node.space->referenced());
+ return write_size != 0;
+ }
+
+ if (key_version && node.space->crypt_data) {
+ /* Verify encryption checksum before we even try to
+ decrypt. */
+ if (!buf_page_verify_crypt_checksum(dst_frame, flags)) {
+decrypt_failed:
+ ib::error() << "Encrypted page " << id
+ << " in file " << node.name
+ << " looks corrupted; key_version="
+ << key_version;
+ return false;
+ }
+
+ slot = buf_pool.io_buf_reserve();
+ ut_a(slot);
+ slot->allocate();
+ ut_d(fil_page_type_validate(node.space, dst_frame));
+
+ /* decrypt using crypt_buf to dst_frame */
+ if (!fil_space_decrypt(node.space, slot->crypt_buf, dst_frame)) {
+ slot->release();
+ goto decrypt_failed;
+ }
+
+ ut_d(fil_page_type_validate(node.space, dst_frame));
+
+ if ((fil_space_t::full_crc32(flags) && page_compressed)
+ || fil_page_get_type(dst_frame)
+ == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) {
+ goto decompress_with_slot;
+ }
+
+ slot->release();
+ } else if (fil_page_get_type(dst_frame)
+ == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) {
+ goto decompress;
+ }
+
+ ut_ad(node.space->referenced());
+ return true;
+}
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** Checks if the page is in crc32 checksum format.
+@param[in] read_buf database page
+@param[in] checksum_field1 new checksum field
+@param[in] checksum_field2 old checksum field
+@return true if the page is in crc32 checksum format. */
+bool
+buf_page_is_checksum_valid_crc32(
+ const byte* read_buf,
+ ulint checksum_field1,
+ ulint checksum_field2)
+{
+ const uint32_t crc32 = buf_calc_page_crc32(read_buf);
+
+#ifdef UNIV_INNOCHECKSUM
+ if (log_file
+ && srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32) {
+ fprintf(log_file, "page::" UINT32PF ";"
+ " crc32 calculated = " UINT32PF ";"
+ " recorded checksum field1 = " ULINTPF " recorded"
+ " checksum field2 =" ULINTPF "\n", cur_page_num,
+ crc32, checksum_field1, checksum_field2);
+ }
+#endif /* UNIV_INNOCHECKSUM */
+
+ if (checksum_field1 != checksum_field2) {
+ return false;
+ }
+
+ return checksum_field1 == crc32;
+}
+
+/** Checks if the page is in innodb checksum format.
+@param[in] read_buf database page
+@param[in] checksum_field1 new checksum field
+@param[in] checksum_field2 old checksum field
+@return true if the page is in innodb checksum format. */
+bool
+buf_page_is_checksum_valid_innodb(
+ const byte* read_buf,
+ ulint checksum_field1,
+ ulint checksum_field2)
+{
+ /* There are 2 valid formulas for
+ checksum_field2 (old checksum field) which algo=innodb could have
+ written to the page:
+
+ 1. Very old versions of InnoDB only stored 8 byte lsn to the
+ start and the end of the page.
+
+ 2. Newer InnoDB versions store the old formula checksum
+ (buf_calc_page_old_checksum()). */
+
+ ulint old_checksum = buf_calc_page_old_checksum(read_buf);
+ ulint new_checksum = buf_calc_page_new_checksum(read_buf);
+
+#ifdef UNIV_INNOCHECKSUM
+ if (log_file
+ && srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_INNODB) {
+ fprintf(log_file, "page::" UINT32PF ";"
+ " old style: calculated ="
+ " " ULINTPF "; recorded = " ULINTPF "\n",
+ cur_page_num, old_checksum,
+ checksum_field2);
+ fprintf(log_file, "page::" UINT32PF ";"
+ " new style: calculated ="
+ " " ULINTPF "; crc32 = " UINT32PF "; recorded = " ULINTPF "\n",
+ cur_page_num, new_checksum,
+ buf_calc_page_crc32(read_buf), checksum_field1);
+ }
+
+ if (log_file
+ && srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB) {
+ fprintf(log_file, "page::" UINT32PF ";"
+ " old style: calculated ="
+ " " ULINTPF "; recorded checksum = " ULINTPF "\n",
+ cur_page_num, old_checksum,
+ checksum_field2);
+ fprintf(log_file, "page::" UINT32PF ";"
+ " new style: calculated ="
+ " " ULINTPF "; recorded checksum = " ULINTPF "\n",
+ cur_page_num, new_checksum,
+ checksum_field1);
+ }
+#endif /* UNIV_INNOCHECKSUM */
+
+
+ if (checksum_field2 != mach_read_from_4(read_buf + FIL_PAGE_LSN)
+ && checksum_field2 != old_checksum) {
+ DBUG_LOG("checksum",
+ "Page checksum crc32 not valid"
+ << " field1 " << checksum_field1
+ << " field2 " << checksum_field2
+ << " crc32 " << buf_calc_page_old_checksum(read_buf)
+ << " lsn " << mach_read_from_4(
+ read_buf + FIL_PAGE_LSN));
+ return(false);
+ }
+
+ /* old field is fine, check the new field */
+
+ /* InnoDB versions < 4.0.14 and < 4.1.1 stored the space id
+ (always equal to 0), to FIL_PAGE_SPACE_OR_CHKSUM */
+
+ if (checksum_field1 != 0 && checksum_field1 != new_checksum) {
+ DBUG_LOG("checksum",
+ "Page checksum crc32 not valid"
+ << " field1 " << checksum_field1
+ << " field2 " << checksum_field2
+ << " crc32 " << buf_calc_page_new_checksum(read_buf)
+ << " lsn " << mach_read_from_4(
+ read_buf + FIL_PAGE_LSN));
+ return(false);
+ }
+
+ return(true);
+}
+
+/** Checks if the page is in none checksum format.
+@param[in] read_buf database page
+@param[in] checksum_field1 new checksum field
+@param[in] checksum_field2 old checksum field
+@return true if the page is in none checksum format. */
+bool
+buf_page_is_checksum_valid_none(
+ const byte* read_buf,
+ ulint checksum_field1,
+ ulint checksum_field2)
+{
+#ifndef DBUG_OFF
+ if (checksum_field1 != checksum_field2
+ && checksum_field1 != BUF_NO_CHECKSUM_MAGIC) {
+ DBUG_LOG("checksum",
+ "Page checksum crc32 not valid"
+ << " field1 " << checksum_field1
+ << " field2 " << checksum_field2
+ << " crc32 " << BUF_NO_CHECKSUM_MAGIC
+ << " lsn " << mach_read_from_4(read_buf
+ + FIL_PAGE_LSN));
+ }
+#endif /* DBUG_OFF */
+
+#ifdef UNIV_INNOCHECKSUM
+ if (log_file
+ && srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_NONE) {
+ fprintf(log_file,
+ "page::" UINT32PF "; none checksum: calculated"
+ " = %lu; recorded checksum_field1 = " ULINTPF
+ " recorded checksum_field2 = " ULINTPF "\n",
+ cur_page_num, BUF_NO_CHECKSUM_MAGIC,
+ checksum_field1, checksum_field2);
+ }
+#endif /* UNIV_INNOCHECKSUM */
+
+ return(checksum_field1 == checksum_field2
+ && checksum_field1 == BUF_NO_CHECKSUM_MAGIC);
+}
+
+/** Checks whether the lsn present in the page is lesser than the
+peek current lsn.
+@param[in] check_lsn lsn to check
+@param[in] read_buf page. */
+static void buf_page_check_lsn(bool check_lsn, const byte* read_buf)
+{
+#ifndef UNIV_INNOCHECKSUM
+ if (check_lsn && recv_lsn_checks_on) {
+ const lsn_t current_lsn = log_sys.get_lsn();
+ const lsn_t page_lsn
+ = mach_read_from_8(read_buf + FIL_PAGE_LSN);
+
+ /* Since we are going to reset the page LSN during the import
+ phase it makes no sense to spam the log with error messages. */
+ if (current_lsn < page_lsn) {
+
+ const uint32_t space_id = mach_read_from_4(
+ read_buf + FIL_PAGE_SPACE_ID);
+ const uint32_t page_no = mach_read_from_4(
+ read_buf + FIL_PAGE_OFFSET);
+
+ ib::error() << "Page " << page_id_t(space_id, page_no)
+ << " log sequence number " << page_lsn
+ << " is in the future! Current system"
+ << " log sequence number "
+ << current_lsn << ".";
+
+ ib::error() << "Your database may be corrupt or"
+ " you may have copied the InnoDB"
+ " tablespace but not the InnoDB"
+ " log files. "
+ << FORCE_RECOVERY_MSG;
+
+ }
+ }
+#endif /* !UNIV_INNOCHECKSUM */
+}
+
+
+/** Check if a buffer is all zeroes.
+@param[in] buf data to check
+@return whether the buffer is all zeroes */
+bool buf_is_zeroes(span<const byte> buf)
+{
+ ut_ad(buf.size() <= UNIV_PAGE_SIZE_MAX);
+ return memcmp(buf.data(), field_ref_zero, buf.size()) == 0;
+}
+
+/** Check if a page is corrupt.
+@param[in] check_lsn whether the LSN should be checked
+@param[in] read_buf database page
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] space tablespace
+@return whether the page is corrupted */
+bool
+buf_page_is_corrupted(
+ bool check_lsn,
+ const byte* read_buf,
+ ulint fsp_flags)
+{
+#ifndef UNIV_INNOCHECKSUM
+ DBUG_EXECUTE_IF("buf_page_import_corrupt_failure", return(true); );
+#endif
+ if (fil_space_t::full_crc32(fsp_flags)) {
+ bool compressed = false, corrupted = false;
+ const uint size = buf_page_full_crc32_size(
+ read_buf, &compressed, &corrupted);
+ if (corrupted) {
+ return true;
+ }
+ const byte* end = read_buf + (size - FIL_PAGE_FCRC32_CHECKSUM);
+ uint crc32 = mach_read_from_4(end);
+
+ if (!crc32 && size == srv_page_size
+ && buf_is_zeroes(span<const byte>(read_buf, size))) {
+ return false;
+ }
+
+ DBUG_EXECUTE_IF(
+ "page_intermittent_checksum_mismatch", {
+ static int page_counter;
+ if (page_counter++ == 2) {
+ crc32++;
+ }
+ });
+
+ if (crc32 != ut_crc32(read_buf,
+ size - FIL_PAGE_FCRC32_CHECKSUM)) {
+ return true;
+ }
+ static_assert(FIL_PAGE_FCRC32_KEY_VERSION == 0, "alignment");
+ static_assert(FIL_PAGE_LSN % 4 == 0, "alignment");
+ static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment");
+ if (!compressed
+ && !mach_read_from_4(FIL_PAGE_FCRC32_KEY_VERSION
+ + read_buf)
+ && memcmp_aligned<4>(read_buf + (FIL_PAGE_LSN + 4),
+ end - (FIL_PAGE_FCRC32_END_LSN
+ - FIL_PAGE_FCRC32_CHECKSUM),
+ 4)) {
+ return true;
+ }
+
+ buf_page_check_lsn(check_lsn, read_buf);
+ return false;
+ }
+
+ size_t checksum_field1 = 0;
+ size_t checksum_field2 = 0;
+ uint32_t crc32 = 0;
+ bool crc32_inited = false;
+ bool crc32_chksum = false;
+ const ulint zip_size = fil_space_t::zip_size(fsp_flags);
+ const uint16_t page_type = fil_page_get_type(read_buf);
+
+ /* We can trust page type if page compression is set on tablespace
+ flags because page compression flag means file must have been
+ created with 10.1 (later than 5.5 code base). In 10.1 page
+ compressed tables do not contain post compression checksum and
+ FIL_PAGE_END_LSN_OLD_CHKSUM field stored. Note that space can
+ be null if we are in fil_check_first_page() and first page
+ is not compressed or encrypted. Page checksum is verified
+ after decompression (i.e. normally pages are already
+ decompressed at this stage). */
+ if ((page_type == FIL_PAGE_PAGE_COMPRESSED ||
+ page_type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED)
+#ifndef UNIV_INNOCHECKSUM
+ && FSP_FLAGS_HAS_PAGE_COMPRESSION(fsp_flags)
+#endif
+ ) {
+ return(false);
+ }
+
+ static_assert(FIL_PAGE_LSN % 4 == 0, "alignment");
+ static_assert(FIL_PAGE_END_LSN_OLD_CHKSUM % 4 == 0, "alignment");
+
+ if (!zip_size
+ && memcmp_aligned<4>(read_buf + FIL_PAGE_LSN + 4,
+ read_buf + srv_page_size
+ - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) {
+ /* Stored log sequence numbers at the start and the end
+ of page do not match */
+
+ return(true);
+ }
+
+ buf_page_check_lsn(check_lsn, read_buf);
+
+ /* Check whether the checksum fields have correct values */
+
+ const srv_checksum_algorithm_t curr_algo =
+ static_cast<srv_checksum_algorithm_t>(srv_checksum_algorithm);
+
+ if (curr_algo == SRV_CHECKSUM_ALGORITHM_NONE) {
+ return(false);
+ }
+
+ if (zip_size) {
+ return !page_zip_verify_checksum(read_buf, zip_size);
+ }
+
+ checksum_field1 = mach_read_from_4(
+ read_buf + FIL_PAGE_SPACE_OR_CHKSUM);
+
+ checksum_field2 = mach_read_from_4(
+ read_buf + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM);
+
+ static_assert(FIL_PAGE_LSN % 8 == 0, "alignment");
+
+ /* A page filled with NUL bytes is considered not corrupted.
+ Before MariaDB Server 10.1.25 (MDEV-12113) or 10.2.2 (or MySQL 5.7),
+ the FIL_PAGE_FILE_FLUSH_LSN field may have been written nonzero
+ for the first page of each file of the system tablespace.
+ We want to ignore it for the system tablespace, but because
+ we do not know the expected tablespace here, we ignore the
+ field for all data files, except for
+ innodb_checksum_algorithm=full_crc32 which we handled above. */
+ if (!checksum_field1 && !checksum_field2) {
+ /* Checksum fields can have valid value as zero.
+ If the page is not empty then do the checksum
+ calculation for the page. */
+ bool all_zeroes = true;
+ for (size_t i = 0; i < srv_page_size; i++) {
+#ifndef UNIV_INNOCHECKSUM
+ if (i == FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION) {
+ i += 8;
+ }
+#endif
+ if (read_buf[i]) {
+ all_zeroes = false;
+ break;
+ }
+ }
+
+ if (all_zeroes) {
+ return false;
+ }
+ }
+
+ switch (curr_algo) {
+ case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+ return !buf_page_is_checksum_valid_crc32(
+ read_buf, checksum_field1, checksum_field2);
+ case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+ return !buf_page_is_checksum_valid_innodb(
+ read_buf, checksum_field1, checksum_field2);
+ case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+ return !buf_page_is_checksum_valid_none(
+ read_buf, checksum_field1, checksum_field2);
+ case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_INNODB:
+ if (buf_page_is_checksum_valid_none(read_buf,
+ checksum_field1, checksum_field2)) {
+#ifdef UNIV_INNOCHECKSUM
+ if (log_file) {
+ fprintf(log_file, "page::" UINT32PF ";"
+ " old style: calculated = %u;"
+ " recorded = " ULINTPF ";\n",
+ cur_page_num,
+ buf_calc_page_old_checksum(read_buf),
+ checksum_field2);
+ fprintf(log_file, "page::" UINT32PF ";"
+ " new style: calculated = " UINT32PF ";"
+ " crc32 = " UINT32PF "; recorded = " ULINTPF ";\n",
+ cur_page_num,
+ buf_calc_page_new_checksum(read_buf),
+ buf_calc_page_crc32(read_buf),
+ checksum_field1);
+ }
+#endif /* UNIV_INNOCHECKSUM */
+ return false;
+ }
+
+ crc32_chksum = curr_algo == SRV_CHECKSUM_ALGORITHM_CRC32
+ || curr_algo == SRV_CHECKSUM_ALGORITHM_FULL_CRC32;
+
+ /* Very old versions of InnoDB only stored 8 byte lsn to the
+ start and the end of the page. */
+
+ /* Since innodb_checksum_algorithm is not strict_* allow
+ any of the algos to match for the old field */
+
+ if (checksum_field2
+ != mach_read_from_4(read_buf + FIL_PAGE_LSN)
+ && checksum_field2 != BUF_NO_CHECKSUM_MAGIC) {
+
+ if (crc32_chksum) {
+ crc32 = buf_calc_page_crc32(read_buf);
+ crc32_inited = true;
+
+ DBUG_EXECUTE_IF(
+ "page_intermittent_checksum_mismatch", {
+ static int page_counter;
+ if (page_counter++ == 2) {
+ crc32++;
+ }
+ });
+
+ if (checksum_field2 != crc32
+ && checksum_field2
+ != buf_calc_page_old_checksum(read_buf)) {
+ return true;
+ }
+ } else {
+ ut_ad(curr_algo
+ == SRV_CHECKSUM_ALGORITHM_INNODB);
+
+ if (checksum_field2
+ != buf_calc_page_old_checksum(read_buf)) {
+ crc32 = buf_calc_page_crc32(read_buf);
+ crc32_inited = true;
+
+ if (checksum_field2 != crc32) {
+ return true;
+ }
+ }
+ }
+ }
+
+ if (checksum_field1 == 0
+ || checksum_field1 == BUF_NO_CHECKSUM_MAGIC) {
+ } else if (crc32_chksum) {
+
+ if (!crc32_inited) {
+ crc32 = buf_calc_page_crc32(read_buf);
+ crc32_inited = true;
+ }
+
+ if (checksum_field1 != crc32
+ && checksum_field1
+ != buf_calc_page_new_checksum(read_buf)) {
+ return true;
+ }
+ } else {
+ ut_ad(curr_algo == SRV_CHECKSUM_ALGORITHM_INNODB);
+
+ if (checksum_field1
+ != buf_calc_page_new_checksum(read_buf)) {
+
+ if (!crc32_inited) {
+ crc32 = buf_calc_page_crc32(read_buf);
+ crc32_inited = true;
+ }
+
+ if (checksum_field1 != crc32) {
+ return true;
+ }
+ }
+ }
+
+ if (crc32_inited
+ && ((checksum_field1 == crc32
+ && checksum_field2 != crc32)
+ || (checksum_field1 != crc32
+ && checksum_field2 == crc32))) {
+ return true;
+ }
+
+ break;
+ case SRV_CHECKSUM_ALGORITHM_NONE:
+ /* should have returned false earlier */
+ break;
+ }
+
+ return false;
+}
+
+#ifndef UNIV_INNOCHECKSUM
+
+#if defined(DBUG_OFF) && defined(HAVE_MADVISE) && defined(MADV_DODUMP)
+/** Enable buffers to be dumped to core files
+
+A convience function, not called anyhwere directly however
+it is left available for gdb or any debugger to call
+in the event that you want all of the memory to be dumped
+to a core file.
+
+Returns number of errors found in madvise calls. */
+int
+buf_madvise_do_dump()
+{
+ int ret= 0;
+
+ /* mirrors allocation in log_t::create() */
+ if (log_sys.buf) {
+ ret += madvise(log_sys.buf,
+ srv_log_buffer_size,
+ MADV_DODUMP);
+ ret += madvise(log_sys.flush_buf,
+ srv_log_buffer_size,
+ MADV_DODUMP);
+ }
+ /* mirrors recv_sys_t::create() */
+ if (recv_sys.buf)
+ {
+ ret+= madvise(recv_sys.buf, recv_sys.len, MADV_DODUMP);
+ }
+
+ mysql_mutex_lock(&buf_pool.mutex);
+ auto chunk = buf_pool.chunks;
+
+ for (ulint n = buf_pool.n_chunks; n--; chunk++) {
+ ret+= madvise(chunk->mem, chunk->mem_size(), MADV_DODUMP);
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+ return ret;
+}
+#endif
+
+/** Dump a page to stderr.
+@param[in] read_buf database page
+@param[in] zip_size compressed page size, or 0 */
+void buf_page_print(const byte* read_buf, ulint zip_size)
+{
+ dict_index_t* index;
+
+#ifndef UNIV_DEBUG
+ const ulint size = zip_size ? zip_size : srv_page_size;
+ ib::info() << "Page dump in ascii and hex ("
+ << size << " bytes):";
+
+ ut_print_buf(stderr, read_buf, size);
+ fputs("\nInnoDB: End of page dump\n", stderr);
+#endif
+
+ if (zip_size) {
+ /* Print compressed page. */
+ ib::info() << "Compressed page type ("
+ << fil_page_get_type(read_buf)
+ << "); stored checksum in field1 "
+ << mach_read_from_4(
+ read_buf + FIL_PAGE_SPACE_OR_CHKSUM)
+ << "; calculated checksums for field1: "
+ << buf_checksum_algorithm_name(
+ SRV_CHECKSUM_ALGORITHM_CRC32)
+ << " "
+ << page_zip_calc_checksum(
+ read_buf, zip_size,
+ SRV_CHECKSUM_ALGORITHM_CRC32)
+ << ", "
+ << buf_checksum_algorithm_name(
+ SRV_CHECKSUM_ALGORITHM_INNODB)
+ << " "
+ << page_zip_calc_checksum(
+ read_buf, zip_size,
+ SRV_CHECKSUM_ALGORITHM_INNODB)
+ << ", "
+ << buf_checksum_algorithm_name(
+ SRV_CHECKSUM_ALGORITHM_NONE)
+ << " "
+ << page_zip_calc_checksum(
+ read_buf, zip_size,
+ SRV_CHECKSUM_ALGORITHM_NONE)
+ << "; page LSN "
+ << mach_read_from_8(read_buf + FIL_PAGE_LSN)
+ << "; page number (if stored to page"
+ << " already) "
+ << mach_read_from_4(read_buf + FIL_PAGE_OFFSET)
+ << "; space id (if stored to page already) "
+ << mach_read_from_4(
+ read_buf + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+ } else {
+ const uint32_t crc32 = buf_calc_page_crc32(read_buf);
+ ulint page_type = fil_page_get_type(read_buf);
+
+ ib::info() << "Uncompressed page, stored checksum in field1 "
+ << mach_read_from_4(
+ read_buf + FIL_PAGE_SPACE_OR_CHKSUM)
+ << ", calculated checksums for field1: "
+ << buf_checksum_algorithm_name(
+ SRV_CHECKSUM_ALGORITHM_CRC32) << " "
+ << crc32
+ << ", "
+ << buf_checksum_algorithm_name(
+ SRV_CHECKSUM_ALGORITHM_INNODB) << " "
+ << buf_calc_page_new_checksum(read_buf)
+ << ", "
+ << " page type " << page_type << " == "
+ << fil_get_page_type_name(page_type) << "."
+ << buf_checksum_algorithm_name(
+ SRV_CHECKSUM_ALGORITHM_NONE) << " "
+ << BUF_NO_CHECKSUM_MAGIC
+ << ", stored checksum in field2 "
+ << mach_read_from_4(read_buf + srv_page_size
+ - FIL_PAGE_END_LSN_OLD_CHKSUM)
+ << ", calculated checksums for field2: "
+ << buf_checksum_algorithm_name(
+ SRV_CHECKSUM_ALGORITHM_CRC32) << " "
+ << crc32
+ << ", "
+ << buf_checksum_algorithm_name(
+ SRV_CHECKSUM_ALGORITHM_INNODB) << " "
+ << buf_calc_page_old_checksum(read_buf)
+ << ", "
+ << buf_checksum_algorithm_name(
+ SRV_CHECKSUM_ALGORITHM_NONE) << " "
+ << BUF_NO_CHECKSUM_MAGIC
+ << ", page LSN "
+ << mach_read_from_4(read_buf + FIL_PAGE_LSN)
+ << " "
+ << mach_read_from_4(read_buf + FIL_PAGE_LSN + 4)
+ << ", low 4 bytes of LSN at page end "
+ << mach_read_from_4(read_buf + srv_page_size
+ - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)
+ << ", page number (if stored to page already) "
+ << mach_read_from_4(read_buf + FIL_PAGE_OFFSET)
+ << ", space id (if created with >= MySQL-4.1.1"
+ " and stored already) "
+ << mach_read_from_4(
+ read_buf + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ }
+
+ switch (fil_page_get_type(read_buf)) {
+ index_id_t index_id;
+ case FIL_PAGE_INDEX:
+ case FIL_PAGE_TYPE_INSTANT:
+ case FIL_PAGE_RTREE:
+ index_id = btr_page_get_index_id(read_buf);
+ ib::info() << "Page may be an index page where"
+ " index id is " << index_id;
+
+ index = dict_index_find_on_id_low(index_id);
+ if (index) {
+ ib::info()
+ << "Index " << index_id
+ << " is " << index->name
+ << " in table " << index->table->name;
+ }
+ break;
+ case FIL_PAGE_UNDO_LOG:
+ fputs("InnoDB: Page may be an undo log page\n", stderr);
+ break;
+ case FIL_PAGE_INODE:
+ fputs("InnoDB: Page may be an 'inode' page\n", stderr);
+ break;
+ case FIL_PAGE_IBUF_FREE_LIST:
+ fputs("InnoDB: Page may be an insert buffer free list page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_ALLOCATED:
+ fputs("InnoDB: Page may be a freshly allocated page\n",
+ stderr);
+ break;
+ case FIL_PAGE_IBUF_BITMAP:
+ fputs("InnoDB: Page may be an insert buffer bitmap page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_SYS:
+ fputs("InnoDB: Page may be a system page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_TRX_SYS:
+ fputs("InnoDB: Page may be a transaction system page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_FSP_HDR:
+ fputs("InnoDB: Page may be a file space header page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_XDES:
+ fputs("InnoDB: Page may be an extent descriptor page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_BLOB:
+ fputs("InnoDB: Page may be a BLOB page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_ZBLOB:
+ case FIL_PAGE_TYPE_ZBLOB2:
+ fputs("InnoDB: Page may be a compressed BLOB page\n",
+ stderr);
+ break;
+ }
+}
+
+/** Initialize a buffer page descriptor.
+@param[in,out] block buffer page descriptor
+@param[in] frame buffer page frame */
+static
+void
+buf_block_init(buf_block_t* block, byte* frame)
+{
+ /* This function should only be executed at database startup or by
+ buf_pool.resize(). Either way, adaptive hash index must not exist. */
+ assert_block_ahi_empty_on_init(block);
+
+ block->frame = frame;
+
+ block->modify_clock = 0;
+ block->page.init(BUF_BLOCK_NOT_USED, page_id_t(~0ULL));
+#ifdef BTR_CUR_HASH_ADAPT
+ block->index = NULL;
+#endif /* BTR_CUR_HASH_ADAPT */
+ ut_d(block->in_unzip_LRU_list = false);
+ ut_d(block->in_withdraw_list = false);
+
+ page_zip_des_init(&block->page.zip);
+
+ ut_d(block->debug_latch = (rw_lock_t *) ut_malloc_nokey(sizeof(rw_lock_t)));
+
+ rw_lock_create(PFS_NOT_INSTRUMENTED, &block->lock, SYNC_LEVEL_VARYING);
+
+ ut_d(rw_lock_create(PFS_NOT_INSTRUMENTED, block->debug_latch,
+ SYNC_LEVEL_VARYING));
+
+ block->lock.is_block_lock = 1;
+
+ ut_ad(rw_lock_validate(&(block->lock)));
+}
+
+/** Allocate a chunk of buffer frames.
+@param bytes requested size
+@return whether the allocation succeeded */
+inline bool buf_pool_t::chunk_t::create(size_t bytes)
+{
+ DBUG_EXECUTE_IF("ib_buf_chunk_init_fails", return false;);
+ /* Round down to a multiple of page size, although it already should be. */
+ bytes= ut_2pow_round<size_t>(bytes, srv_page_size);
+
+ mem= buf_pool.allocator.allocate_large_dontdump(bytes, &mem_pfx);
+
+ if (UNIV_UNLIKELY(!mem))
+ return false;
+
+ MEM_UNDEFINED(mem, mem_size());
+
+#ifdef HAVE_LIBNUMA
+ if (srv_numa_interleave)
+ {
+ struct bitmask *numa_mems_allowed= numa_get_mems_allowed();
+ if (mbind(mem, mem_size(), MPOL_INTERLEAVE,
+ numa_mems_allowed->maskp, numa_mems_allowed->size,
+ MPOL_MF_MOVE))
+ {
+ ib::warn() << "Failed to set NUMA memory policy of"
+ " buffer pool page frames to MPOL_INTERLEAVE"
+ " (error: " << strerror(errno) << ").";
+ }
+ numa_bitmask_free(numa_mems_allowed);
+ }
+#endif /* HAVE_LIBNUMA */
+
+
+ /* Allocate the block descriptors from
+ the start of the memory block. */
+ blocks= reinterpret_cast<buf_block_t*>(mem);
+
+ /* Align a pointer to the first frame. Note that when
+ opt_large_page_size is smaller than srv_page_size,
+ (with max srv_page_size at 64k don't think any hardware
+ makes this true),
+ we may allocate one fewer block than requested. When
+ it is bigger, we may allocate more blocks than requested. */
+ static_assert(sizeof(byte*) == sizeof(ulint), "pointer size");
+
+ byte *frame= reinterpret_cast<byte*>((reinterpret_cast<ulint>(mem) +
+ srv_page_size - 1) &
+ ~ulint{srv_page_size - 1});
+ size= (mem_pfx.m_size >> srv_page_size_shift) - (frame != mem);
+
+ /* Subtract the space needed for block descriptors. */
+ {
+ ulint s= size;
+
+ while (frame < reinterpret_cast<const byte*>(blocks + s))
+ {
+ frame+= srv_page_size;
+ s--;
+ }
+
+ size= s;
+ }
+
+ /* Init block structs and assign frames for them. Then we assign the
+ frames to the first blocks (we already mapped the memory above). */
+
+ buf_block_t *block= blocks;
+
+ for (auto i= size; i--; ) {
+ buf_block_init(block, frame);
+ MEM_UNDEFINED(block->frame, srv_page_size);
+ /* Add the block to the free list */
+ UT_LIST_ADD_LAST(buf_pool.free, &block->page);
+
+ ut_d(block->page.in_free_list = TRUE);
+ block++;
+ frame+= srv_page_size;
+ }
+
+ reg();
+
+ return true;
+}
+
+#ifdef UNIV_DEBUG
+/** Check that all file pages in the buffer chunk are in a replaceable state.
+@return address of a non-free block
+@retval nullptr if all freed */
+inline const buf_block_t *buf_pool_t::chunk_t::not_freed() const
+{
+ buf_block_t *block= blocks;
+ for (auto i= size; i--; block++)
+ {
+ switch (block->page.state()) {
+ case BUF_BLOCK_ZIP_PAGE:
+ /* The uncompressed buffer pool should never
+ contain ROW_FORMAT=COMPRESSED block descriptors. */
+ ut_error;
+ break;
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ /* Skip blocks that are not being used for file pages. */
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ const lsn_t lsn= block->page.oldest_modification();
+
+ if (srv_read_only_mode)
+ {
+ /* The page cleaner is disabled in read-only mode. No pages
+ can be dirtied, so all of them must be clean. */
+ ut_ad(lsn == 0 || lsn == recv_sys.recovered_lsn ||
+ srv_force_recovery == SRV_FORCE_NO_LOG_REDO);
+ ut_ad(!block->page.buf_fix_count());
+ ut_ad(block->page.io_fix() == BUF_IO_NONE);
+ break;
+ }
+
+ if (fsp_is_system_temporary(block->page.id().space()))
+ {
+ ut_ad(lsn == 0 || lsn == 2);
+ break;
+ }
+
+ if (lsn > 1 || !block->page.can_relocate())
+ return block;
+
+ break;
+ }
+ }
+
+ return nullptr;
+}
+#endif /* UNIV_DEBUG */
+
+/** Free the synchronization objects of a buffer pool block descriptor
+@param[in,out] block buffer pool block descriptor */
+static void buf_block_free_mutexes(buf_block_t* block)
+{
+ rw_lock_free(&block->lock);
+ ut_d(rw_lock_free(block->debug_latch));
+ ut_d(ut_free(block->debug_latch));
+}
+
+/** Create the hash table.
+@param n the lower bound of n_cells */
+void buf_pool_t::page_hash_table::create(ulint n)
+{
+ n_cells= ut_find_prime(n);
+ const size_t size= pad(n_cells) * sizeof *array;
+ void* v= aligned_malloc(size, CPU_LEVEL1_DCACHE_LINESIZE);
+ memset(v, 0, size);
+ array= static_cast<hash_cell_t*>(v);
+}
+
+/** Create the buffer pool.
+@return whether the creation failed */
+bool buf_pool_t::create()
+{
+ ut_ad(this == &buf_pool);
+ ut_ad(srv_buf_pool_size % srv_buf_pool_chunk_unit == 0);
+ ut_ad(!is_initialised());
+ ut_ad(srv_buf_pool_size > 0);
+ ut_ad(!resizing);
+ ut_ad(!chunks_old);
+ ut_ad(!field_ref_zero);
+
+ NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE;
+
+ if (auto b= aligned_malloc(UNIV_PAGE_SIZE_MAX, 4096))
+ field_ref_zero= static_cast<const byte*>
+ (memset_aligned<4096>(b, 0, UNIV_PAGE_SIZE_MAX));
+ else
+ return true;
+
+ chunk_t::map_reg= UT_NEW_NOKEY(chunk_t::map());
+
+ new(&allocator) ut_allocator<unsigned char>(mem_key_buf_buf_pool);
+
+ n_chunks= srv_buf_pool_size / srv_buf_pool_chunk_unit;
+ const size_t chunk_size= srv_buf_pool_chunk_unit;
+
+ chunks= static_cast<chunk_t*>(ut_zalloc_nokey(n_chunks * sizeof *chunks));
+ UT_LIST_INIT(free, &buf_page_t::list);
+ curr_size= 0;
+ auto chunk= chunks;
+
+ do
+ {
+ if (!chunk->create(chunk_size))
+ {
+ while (--chunk >= chunks)
+ {
+ buf_block_t* block= chunk->blocks;
+
+ for (auto i= chunk->size; i--; block++)
+ buf_block_free_mutexes(block);
+
+ allocator.deallocate_large_dodump(chunk->mem, &chunk->mem_pfx);
+ }
+ ut_free(chunks);
+ chunks= nullptr;
+ UT_DELETE(chunk_t::map_reg);
+ chunk_t::map_reg= nullptr;
+ aligned_free(const_cast<byte*>(field_ref_zero));
+ field_ref_zero= nullptr;
+ ut_ad(!is_initialised());
+ return true;
+ }
+
+ curr_size+= chunk->size;
+ }
+ while (++chunk < chunks + n_chunks);
+
+ ut_ad(is_initialised());
+ mysql_mutex_init(buf_pool_mutex_key, &mutex, MY_MUTEX_INIT_FAST);
+
+ UT_LIST_INIT(LRU, &buf_page_t::LRU);
+ UT_LIST_INIT(withdraw, &buf_page_t::list);
+ withdraw_target= 0;
+ UT_LIST_INIT(flush_list, &buf_page_t::list);
+ UT_LIST_INIT(unzip_LRU, &buf_block_t::unzip_LRU);
+
+ for (size_t i= 0; i < UT_ARR_SIZE(zip_free); ++i)
+ UT_LIST_INIT(zip_free[i], &buf_buddy_free_t::list);
+ ulint s= curr_size;
+ old_size= s;
+ s/= BUF_READ_AHEAD_PORTION;
+ read_ahead_area= s >= READ_AHEAD_PAGES
+ ? READ_AHEAD_PAGES
+ : my_round_up_to_next_power(static_cast<uint32_t>(s));
+ curr_pool_size= srv_buf_pool_size;
+
+ n_chunks_new= n_chunks;
+
+ page_hash.create(2 * curr_size);
+ zip_hash.create(2 * curr_size);
+ last_printout_time= time(NULL);
+
+ mysql_mutex_init(flush_list_mutex_key, &flush_list_mutex,
+ MY_MUTEX_INIT_FAST);
+
+ pthread_cond_init(&done_flush_LRU, nullptr);
+ pthread_cond_init(&done_flush_list, nullptr);
+ pthread_cond_init(&do_flush_list, nullptr);
+ pthread_cond_init(&done_free, nullptr);
+
+ try_LRU_scan= true;
+
+ ut_d(flush_hp.m_mutex= &flush_list_mutex;);
+ ut_d(lru_hp.m_mutex= &mutex);
+ ut_d(lru_scan_itr.m_mutex= &mutex);
+
+ io_buf.create((srv_n_read_io_threads + srv_n_write_io_threads) *
+ OS_AIO_N_PENDING_IOS_PER_THREAD);
+
+ /* FIXME: remove some of these variables */
+ srv_buf_pool_curr_size= curr_pool_size;
+ srv_buf_pool_old_size= srv_buf_pool_size;
+ srv_buf_pool_base_size= srv_buf_pool_size;
+
+ last_activity_count= srv_get_activity_count();
+
+ chunk_t::map_ref= chunk_t::map_reg;
+ buf_LRU_old_ratio_update(100 * 3 / 8, false);
+ btr_search_sys_create();
+ ut_ad(is_initialised());
+ return false;
+}
+
+/** Clean up after successful create() */
+void buf_pool_t::close()
+{
+ ut_ad(this == &buf_pool);
+ if (!is_initialised())
+ return;
+
+ mysql_mutex_destroy(&mutex);
+ mysql_mutex_destroy(&flush_list_mutex);
+
+ for (buf_page_t *bpage= UT_LIST_GET_LAST(LRU), *prev_bpage= nullptr; bpage;
+ bpage= prev_bpage)
+ {
+ prev_bpage= UT_LIST_GET_PREV(LRU, bpage);
+ ut_ad(bpage->in_file());
+ ut_ad(bpage->in_LRU_list);
+ /* The buffer pool must be clean during normal shutdown.
+ Only on aborted startup (with recovery) or with innodb_fast_shutdown=2
+ we may discard changes. */
+ ut_d(const lsn_t oldest= bpage->oldest_modification();)
+ ut_ad(fsp_is_system_temporary(bpage->id().space())
+ ? (oldest == 0 || oldest == 2)
+ : oldest <= 1 || srv_is_being_started || srv_fast_shutdown == 2);
+
+ if (bpage->state() != BUF_BLOCK_FILE_PAGE)
+ buf_page_free_descriptor(bpage);
+ }
+
+ for (auto chunk= chunks + n_chunks; --chunk >= chunks; )
+ {
+ buf_block_t *block= chunk->blocks;
+
+ for (auto i= chunk->size; i--; block++)
+ buf_block_free_mutexes(block);
+
+ allocator.deallocate_large_dodump(chunk->mem, &chunk->mem_pfx);
+ }
+
+ pthread_cond_destroy(&done_flush_LRU);
+ pthread_cond_destroy(&done_flush_list);
+ pthread_cond_destroy(&do_flush_list);
+ pthread_cond_destroy(&done_free);
+
+ ut_free(chunks);
+ chunks= nullptr;
+ page_hash.free();
+ zip_hash.free();
+
+ io_buf.close();
+ UT_DELETE(chunk_t::map_reg);
+ chunk_t::map_reg= chunk_t::map_ref= nullptr;
+ aligned_free(const_cast<byte*>(field_ref_zero));
+ field_ref_zero= nullptr;
+}
+
+/** Try to reallocate a control block.
+@param block control block to reallocate
+@return whether the reallocation succeeded */
+inline bool buf_pool_t::realloc(buf_block_t *block)
+{
+ buf_block_t* new_block;
+
+ mysql_mutex_assert_owner(&mutex);
+ ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);
+
+ new_block = buf_LRU_get_free_only();
+
+ if (new_block == NULL) {
+ return(false); /* free list was not enough */
+ }
+
+ const page_id_t id(block->page.id());
+ page_hash_latch* hash_lock = hash_lock_get(id);
+ hash_lock->write_lock();
+
+ if (block->page.can_relocate()) {
+ memcpy_aligned<OS_FILE_LOG_BLOCK_SIZE>(
+ new_block->frame, block->frame, srv_page_size);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ new (&new_block->page) buf_page_t(block->page);
+
+ /* relocate LRU list */
+ if (buf_page_t* prev_b = buf_pool.LRU_remove(&block->page)) {
+ UT_LIST_INSERT_AFTER(LRU, prev_b, &new_block->page);
+ } else {
+ UT_LIST_ADD_FIRST(LRU, &new_block->page);
+ }
+
+ if (LRU_old == &block->page) {
+ LRU_old = &new_block->page;
+ }
+
+ ut_ad(new_block->page.in_LRU_list);
+
+ /* relocate unzip_LRU list */
+ if (block->page.zip.data != NULL) {
+ ut_ad(block->in_unzip_LRU_list);
+ ut_d(new_block->in_unzip_LRU_list = true);
+
+ buf_block_t* prev_block = UT_LIST_GET_PREV(unzip_LRU, block);
+ UT_LIST_REMOVE(unzip_LRU, block);
+
+ ut_d(block->in_unzip_LRU_list = false);
+ block->page.zip.data = NULL;
+ page_zip_set_size(&block->page.zip, 0);
+
+ if (prev_block != NULL) {
+ UT_LIST_INSERT_AFTER(unzip_LRU, prev_block, new_block);
+ } else {
+ UT_LIST_ADD_FIRST(unzip_LRU, new_block);
+ }
+ } else {
+ ut_ad(!block->in_unzip_LRU_list);
+ ut_d(new_block->in_unzip_LRU_list = false);
+ }
+
+ /* relocate page_hash */
+ ut_ad(block->page.in_page_hash);
+ ut_ad(new_block->page.in_page_hash);
+ const ulint fold = id.fold();
+ ut_ad(&block->page == page_hash_get_low(id, fold));
+ ut_d(block->page.in_page_hash = false);
+ HASH_REPLACE(buf_page_t, hash, &page_hash, fold,
+ &block->page, &new_block->page);
+
+ buf_block_modify_clock_inc(block);
+ static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
+ memset_aligned<4>(block->frame + FIL_PAGE_OFFSET, 0xff, 4);
+ static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2,
+ "not perfect alignment");
+ memset_aligned<2>(block->frame
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4);
+ MEM_UNDEFINED(block->frame, srv_page_size);
+ block->page.set_state(BUF_BLOCK_REMOVE_HASH);
+ if (!fsp_is_system_temporary(id.space())) {
+ buf_flush_relocate_on_flush_list(&block->page,
+ &new_block->page);
+ }
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ block->page.set_corrupt_id();
+
+ /* set other flags of buf_block_t */
+
+#ifdef BTR_CUR_HASH_ADAPT
+ /* This code should only be executed by resize(),
+ while the adaptive hash index is disabled. */
+ assert_block_ahi_empty(block);
+ assert_block_ahi_empty_on_init(new_block);
+ ut_ad(!block->index);
+ new_block->index = NULL;
+ new_block->n_hash_helps = 0;
+ new_block->n_fields = 1;
+ new_block->left_side = TRUE;
+#endif /* BTR_CUR_HASH_ADAPT */
+ ut_d(block->page.set_state(BUF_BLOCK_MEMORY));
+ /* free block */
+ new_block = block;
+ }
+
+ hash_lock->write_unlock();
+ buf_LRU_block_free_non_file_page(new_block);
+ return(true); /* free_list was enough */
+}
+
+/** Sets the global variable that feeds MySQL's innodb_buffer_pool_resize_status
+to the specified string. The format and the following parameters are the
+same as the ones used for printf(3).
+@param[in] fmt format
+@param[in] ... extra parameters according to fmt */
+static
+void
+buf_resize_status(
+ const char* fmt,
+ ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+
+ vsnprintf(
+ export_vars.innodb_buffer_pool_resize_status,
+ sizeof(export_vars.innodb_buffer_pool_resize_status),
+ fmt, ap);
+
+ va_end(ap);
+
+ ib::info() << export_vars.innodb_buffer_pool_resize_status;
+}
+
+/** Withdraw blocks from the buffer pool until meeting withdraw_target.
+@return whether retry is needed */
+inline bool buf_pool_t::withdraw_blocks()
+{
+ buf_block_t* block;
+ ulint loop_count = 0;
+
+ ib::info() << "start to withdraw the last "
+ << withdraw_target << " blocks";
+
+ /* Minimize zip_free[i] lists */
+ mysql_mutex_lock(&mutex);
+ buf_buddy_condense_free();
+ mysql_mutex_unlock(&mutex);
+
+ while (UT_LIST_GET_LEN(withdraw) < withdraw_target) {
+
+ /* try to withdraw from free_list */
+ ulint count1 = 0;
+
+ mysql_mutex_lock(&mutex);
+ block = reinterpret_cast<buf_block_t*>(
+ UT_LIST_GET_FIRST(free));
+ while (block != NULL
+ && UT_LIST_GET_LEN(withdraw) < withdraw_target) {
+ ut_ad(block->page.in_free_list);
+ ut_ad(!block->page.oldest_modification());
+ ut_ad(!block->page.in_LRU_list);
+ ut_a(!block->page.in_file());
+
+ buf_block_t* next_block;
+ next_block = reinterpret_cast<buf_block_t*>(
+ UT_LIST_GET_NEXT(
+ list, &block->page));
+
+ if (will_be_withdrawn(block->page)) {
+ /* This should be withdrawn */
+ UT_LIST_REMOVE(free, &block->page);
+ UT_LIST_ADD_LAST(withdraw, &block->page);
+ ut_d(block->in_withdraw_list = true);
+ count1++;
+ }
+
+ block = next_block;
+ }
+ mysql_mutex_unlock(&mutex);
+
+ /* reserve free_list length */
+ if (UT_LIST_GET_LEN(withdraw) < withdraw_target) {
+ ulint n_flushed = buf_flush_LRU(
+ std::max<ulint>(withdraw_target
+ - UT_LIST_GET_LEN(withdraw),
+ srv_LRU_scan_depth));
+ buf_flush_wait_batch_end_acquiring_mutex(true);
+
+ if (n_flushed) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
+ MONITOR_LRU_BATCH_FLUSH_COUNT,
+ MONITOR_LRU_BATCH_FLUSH_PAGES,
+ n_flushed);
+ }
+ }
+
+ /* relocate blocks/buddies in withdrawn area */
+ ulint count2 = 0;
+
+ mysql_mutex_lock(&mutex);
+ buf_page_t* bpage;
+ bpage = UT_LIST_GET_FIRST(LRU);
+ while (bpage != NULL) {
+ buf_page_t* next_bpage = UT_LIST_GET_NEXT(LRU, bpage);
+ if (bpage->zip.data != NULL
+ && will_be_withdrawn(bpage->zip.data)
+ && bpage->can_relocate()) {
+ buf_pool_mutex_exit_forbid();
+ if (!buf_buddy_realloc(
+ bpage->zip.data,
+ page_zip_get_size(&bpage->zip))) {
+ /* failed to allocate block */
+ buf_pool_mutex_exit_allow();
+ break;
+ }
+ buf_pool_mutex_exit_allow();
+ count2++;
+ }
+
+ if (bpage->state() == BUF_BLOCK_FILE_PAGE
+ && will_be_withdrawn(*bpage)) {
+ if (bpage->can_relocate()) {
+ buf_pool_mutex_exit_forbid();
+ if (!realloc(
+ reinterpret_cast<buf_block_t*>(
+ bpage))) {
+ /* failed to allocate block */
+ buf_pool_mutex_exit_allow();
+ break;
+ }
+ buf_pool_mutex_exit_allow();
+ count2++;
+ }
+ /* NOTE: if the page is in use,
+ not relocated yet */
+ }
+
+ bpage = next_bpage;
+ }
+ mysql_mutex_unlock(&mutex);
+
+ buf_resize_status(
+ "withdrawing blocks. (" ULINTPF "/" ULINTPF ")",
+ UT_LIST_GET_LEN(withdraw),
+ withdraw_target);
+
+ ib::info() << "withdrew "
+ << count1 << " blocks from free list."
+ << " Tried to relocate " << count2 << " pages ("
+ << UT_LIST_GET_LEN(withdraw) << "/"
+ << withdraw_target << ")";
+
+ if (++loop_count >= 10) {
+ /* give up for now.
+ retried after user threads paused. */
+
+ ib::info() << "will retry to withdraw later";
+
+ /* need retry later */
+ return(true);
+ }
+ }
+
+ /* confirm withdrawn enough */
+ for (const chunk_t* chunk = chunks + n_chunks_new,
+ * const echunk = chunks + n_chunks; chunk != echunk; chunk++) {
+ block = chunk->blocks;
+ for (ulint j = chunk->size; j--; block++) {
+ ut_a(block->page.state() == BUF_BLOCK_NOT_USED);
+ ut_ad(block->in_withdraw_list);
+ }
+ }
+
+ ib::info() << "withdrawn target: " << UT_LIST_GET_LEN(withdraw)
+ << " blocks";
+
+ return(false);
+}
+
+
+
+inline void buf_pool_t::page_hash_table::write_lock_all()
+{
+ for (auto n= pad(n_cells) & ~ELEMENTS_PER_LATCH;; n-= ELEMENTS_PER_LATCH + 1)
+ {
+ reinterpret_cast<page_hash_latch&>(array[n]).write_lock();
+ if (!n)
+ break;
+ }
+}
+
+
+inline void buf_pool_t::page_hash_table::write_unlock_all()
+{
+ for (auto n= pad(n_cells) & ~ELEMENTS_PER_LATCH;; n-= ELEMENTS_PER_LATCH + 1)
+ {
+ reinterpret_cast<page_hash_latch&>(array[n]).write_unlock();
+ if (!n)
+ break;
+ }
+}
+
+
+namespace
+{
+
+struct find_interesting_trx
+{
+ void operator()(const trx_t &trx)
+ {
+ if (trx.state == TRX_STATE_NOT_STARTED)
+ return;
+ if (trx.mysql_thd == nullptr)
+ return;
+ if (withdraw_started <= trx.start_time)
+ return;
+
+ if (!found)
+ {
+ ib::warn() << "The following trx might hold "
+ "the blocks in buffer pool to "
+ "be withdrawn. Buffer pool "
+ "resizing can complete only "
+ "after all the transactions "
+ "below release the blocks.";
+ found= true;
+ }
+
+ lock_trx_print_wait_and_mvcc_state(stderr, &trx, current_time);
+ }
+
+ bool &found;
+ time_t withdraw_started;
+ time_t current_time;
+};
+
+} // namespace
+
+/** Resize from srv_buf_pool_old_size to srv_buf_pool_size. */
+inline void buf_pool_t::resize()
+{
+ ut_ad(this == &buf_pool);
+
+ bool warning = false;
+
+ NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE;
+
+ ut_ad(!resize_in_progress());
+ ut_ad(srv_buf_pool_chunk_unit > 0);
+
+ ulint new_instance_size = srv_buf_pool_size >> srv_page_size_shift;
+
+ buf_resize_status("Resizing buffer pool from " ULINTPF " to "
+ ULINTPF " (unit=" ULINTPF ").",
+ srv_buf_pool_old_size, srv_buf_pool_size,
+ srv_buf_pool_chunk_unit);
+
+ mysql_mutex_lock(&mutex);
+ ut_ad(curr_size == old_size);
+ ut_ad(n_chunks_new == n_chunks);
+ ut_ad(UT_LIST_GET_LEN(withdraw) == 0);
+
+ n_chunks_new = (new_instance_size << srv_page_size_shift)
+ / srv_buf_pool_chunk_unit;
+ curr_size = n_chunks_new * chunks->size;
+ mysql_mutex_unlock(&mutex);
+
+#ifdef BTR_CUR_HASH_ADAPT
+ /* disable AHI if needed */
+ const bool btr_search_disabled = btr_search_enabled;
+
+ buf_resize_status("Disabling adaptive hash index.");
+
+ btr_search_s_lock_all();
+ if (btr_search_disabled) {
+ btr_search_s_unlock_all();
+ } else {
+ btr_search_s_unlock_all();
+ }
+
+ btr_search_disable();
+
+ if (btr_search_disabled) {
+ ib::info() << "disabled adaptive hash index.";
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ if (curr_size < old_size) {
+ /* set withdraw target */
+ size_t w = 0;
+
+ for (const chunk_t* chunk = chunks + n_chunks_new,
+ * const echunk = chunks + n_chunks;
+ chunk != echunk; chunk++)
+ w += chunk->size;
+
+ ut_ad(withdraw_target == 0);
+ withdraw_target = w;
+ }
+
+ buf_resize_status("Withdrawing blocks to be shrunken.");
+
+ time_t withdraw_started = time(NULL);
+ double message_interval = 60;
+ ulint retry_interval = 1;
+
+withdraw_retry:
+ /* wait for the number of blocks fit to the new size (if needed)*/
+ bool should_retry_withdraw = curr_size < old_size
+ && withdraw_blocks();
+
+ if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
+ /* abort to resize for shutdown. */
+ return;
+ }
+
+ /* abort buffer pool load */
+ buf_load_abort();
+
+ const time_t current_time = time(NULL);
+
+ if (should_retry_withdraw
+ && difftime(current_time, withdraw_started) >= message_interval) {
+
+ if (message_interval > 900) {
+ message_interval = 1800;
+ } else {
+ message_interval *= 2;
+ }
+
+ lock_mutex_enter();
+ bool found = false;
+ trx_sys.trx_list.for_each(find_interesting_trx{
+ found, withdraw_started, current_time});
+ lock_mutex_exit();
+
+ withdraw_started = current_time;
+ }
+
+ if (should_retry_withdraw) {
+ ib::info() << "Will retry to withdraw " << retry_interval
+ << " seconds later.";
+ os_thread_sleep(retry_interval * 1000000);
+
+ if (retry_interval > 5) {
+ retry_interval = 10;
+ } else {
+ retry_interval *= 2;
+ }
+
+ goto withdraw_retry;
+ }
+
+ buf_resize_status("Latching whole of buffer pool.");
+
+#ifndef DBUG_OFF
+ {
+ bool should_wait = true;
+
+ while (should_wait) {
+ should_wait = false;
+ DBUG_EXECUTE_IF(
+ "ib_buf_pool_resize_wait_before_resize",
+ should_wait = true; os_thread_sleep(10000););
+ }
+ }
+#endif /* !DBUG_OFF */
+
+ if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
+ return;
+ }
+
+ /* Indicate critical path */
+ resizing.store(true, std::memory_order_relaxed);
+
+ mysql_mutex_lock(&mutex);
+ page_hash.write_lock_all();
+
+ chunk_t::map_reg = UT_NEW_NOKEY(chunk_t::map());
+
+ /* add/delete chunks */
+
+ buf_resize_status("buffer pool resizing with chunks "
+ ULINTPF " to " ULINTPF ".",
+ n_chunks, n_chunks_new);
+
+ if (n_chunks_new < n_chunks) {
+ /* delete chunks */
+ chunk_t* chunk = chunks + n_chunks_new;
+ const chunk_t* const echunk = chunks + n_chunks;
+
+ ulint sum_freed = 0;
+
+ while (chunk < echunk) {
+ /* buf_LRU_block_free_non_file_page() invokes
+ MEM_NOACCESS() on any buf_pool.free blocks.
+ We must cancel the effect of that. In
+ MemorySanitizer, MEM_NOACCESS() is no-op, so
+ we must not do anything special for it here. */
+#ifdef HAVE_valgrind
+# if !__has_feature(memory_sanitizer)
+ MEM_MAKE_DEFINED(chunk->mem, chunk->mem_size());
+# endif
+#else
+ MEM_MAKE_ADDRESSABLE(chunk->mem, chunk->size);
+#endif
+
+ buf_block_t* block = chunk->blocks;
+
+ for (ulint j = chunk->size; j--; block++) {
+ buf_block_free_mutexes(block);
+ }
+
+ allocator.deallocate_large_dodump(
+ chunk->mem, &chunk->mem_pfx);
+ sum_freed += chunk->size;
+ ++chunk;
+ }
+
+ /* discard withdraw list */
+ UT_LIST_INIT(withdraw, &buf_page_t::list);
+ withdraw_target = 0;
+
+ ib::info() << n_chunks - n_chunks_new
+ << " chunks (" << sum_freed
+ << " blocks) were freed.";
+
+ n_chunks = n_chunks_new;
+ }
+
+ {
+ /* reallocate chunks */
+ const size_t new_chunks_size
+ = n_chunks_new * sizeof(chunk_t);
+
+ chunk_t* new_chunks = static_cast<chunk_t*>(
+ ut_zalloc_nokey_nofatal(new_chunks_size));
+
+ DBUG_EXECUTE_IF("buf_pool_resize_chunk_null",
+ ut_free(new_chunks); new_chunks= nullptr; );
+
+ if (!new_chunks) {
+ ib::error() << "failed to allocate"
+ " the chunk array.";
+ n_chunks_new = n_chunks;
+ warning = true;
+ chunks_old = NULL;
+ goto calc_buf_pool_size;
+ }
+
+ ulint n_chunks_copy = ut_min(n_chunks_new,
+ n_chunks);
+
+ memcpy(new_chunks, chunks,
+ n_chunks_copy * sizeof *new_chunks);
+
+ for (ulint j = 0; j < n_chunks_copy; j++) {
+ new_chunks[j].reg();
+ }
+
+ chunks_old = chunks;
+ chunks = new_chunks;
+ }
+
+ if (n_chunks_new > n_chunks) {
+ /* add chunks */
+ ulint sum_added = 0;
+ ulint n = n_chunks;
+ const size_t unit = srv_buf_pool_chunk_unit;
+
+ for (chunk_t* chunk = chunks + n_chunks,
+ * const echunk = chunks + n_chunks_new;
+ chunk != echunk; chunk++) {
+ if (!chunk->create(unit)) {
+ ib::error() << "failed to allocate"
+ " memory for buffer pool chunk";
+
+ warning = true;
+ n_chunks_new = n_chunks;
+ break;
+ }
+
+ sum_added += chunk->size;
+ ++n;
+ }
+
+ ib::info() << n_chunks_new - n_chunks
+ << " chunks (" << sum_added
+ << " blocks) were added.";
+
+ n_chunks = n;
+ }
+calc_buf_pool_size:
+ /* recalc curr_size */
+ ulint new_size = 0;
+
+ {
+ chunk_t* chunk = chunks;
+ const chunk_t* const echunk = chunk + n_chunks;
+ do {
+ new_size += chunk->size;
+ } while (++chunk != echunk);
+ }
+
+ curr_size = new_size;
+ n_chunks_new = n_chunks;
+
+ if (chunks_old) {
+ ut_free(chunks_old);
+ chunks_old = NULL;
+ }
+
+ chunk_t::map* chunk_map_old = chunk_t::map_ref;
+ chunk_t::map_ref = chunk_t::map_reg;
+
+ /* set size */
+ ut_ad(UT_LIST_GET_LEN(withdraw) == 0);
+ ulint s= curr_size;
+ old_size= s;
+ s/= BUF_READ_AHEAD_PORTION;
+ read_ahead_area= s >= READ_AHEAD_PAGES
+ ? READ_AHEAD_PAGES
+ : my_round_up_to_next_power(static_cast<uint32_t>(s));
+ curr_pool_size= n_chunks * srv_buf_pool_chunk_unit;
+ srv_buf_pool_curr_size= curr_pool_size;/* FIXME: remove*/
+ innodb_set_buf_pool_size(buf_pool_size_align(srv_buf_pool_curr_size));
+
+ const bool new_size_too_diff
+ = srv_buf_pool_base_size > srv_buf_pool_size * 2
+ || srv_buf_pool_base_size * 2 < srv_buf_pool_size;
+
+ mysql_mutex_unlock(&mutex);
+ page_hash.write_unlock_all();
+
+ UT_DELETE(chunk_map_old);
+
+ resizing.store(false, std::memory_order_relaxed);
+
+ /* Normalize other components, if the new size is too different */
+ if (!warning && new_size_too_diff) {
+ srv_buf_pool_base_size = srv_buf_pool_size;
+
+ buf_resize_status("Resizing also other hash tables.");
+
+ srv_lock_table_size = 5
+ * (srv_buf_pool_size >> srv_page_size_shift);
+ lock_sys.resize(srv_lock_table_size);
+ dict_sys.resize();
+
+ ib::info() << "Resized hash tables at lock_sys,"
+#ifdef BTR_CUR_HASH_ADAPT
+ " adaptive hash index,"
+#endif /* BTR_CUR_HASH_ADAPT */
+ " dictionary.";
+ }
+
+ /* normalize ibuf.max_size */
+ ibuf_max_size_update(srv_change_buffer_max_size);
+
+ if (srv_buf_pool_old_size != srv_buf_pool_size) {
+
+ ib::info() << "Completed to resize buffer pool from "
+ << srv_buf_pool_old_size
+ << " to " << srv_buf_pool_size << ".";
+ srv_buf_pool_old_size = srv_buf_pool_size;
+ }
+
+#ifdef BTR_CUR_HASH_ADAPT
+ /* enable AHI if needed */
+ if (btr_search_disabled) {
+ btr_search_enable(true);
+ ib::info() << "Re-enabled adaptive hash index.";
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ char now[32];
+
+ ut_sprintf_timestamp(now);
+ if (!warning) {
+ buf_resize_status("Completed resizing buffer pool at %s.",
+ now);
+ } else {
+ buf_resize_status("Resizing buffer pool failed,"
+ " finished resizing at %s.", now);
+ }
+
+ ut_d(validate());
+
+ return;
+}
+
+/** Thread pool task invoked by innodb_buffer_pool_size changes. */
+static void buf_resize_callback(void *)
+{
+ DBUG_ENTER("buf_resize_callback");
+ ut_ad(srv_shutdown_state < SRV_SHUTDOWN_CLEANUP);
+ mysql_mutex_lock(&buf_pool.mutex);
+ const auto size= srv_buf_pool_size;
+ const bool work= srv_buf_pool_old_size != size;
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ if (work)
+ buf_pool.resize();
+ else
+ {
+ std::ostringstream sout;
+ sout << "Size did not change: old size = new size = " << size;
+ buf_resize_status(sout.str().c_str());
+ }
+ DBUG_VOID_RETURN;
+}
+
+/* Ensure that task does not run in parallel, by setting max_concurrency to 1 for the thread group */
+static tpool::task_group single_threaded_group(1);
+static tpool::waitable_task buf_resize_task(buf_resize_callback,
+ nullptr, &single_threaded_group);
+
+void buf_resize_start()
+{
+ srv_thread_pool->submit_task(&buf_resize_task);
+}
+
+void buf_resize_shutdown()
+{
+ buf_resize_task.wait();
+}
+
+
+/** Relocate a ROW_FORMAT=COMPRESSED block in the LRU list and
+buf_pool.page_hash.
+The caller must relocate bpage->list.
+@param bpage BUF_BLOCK_ZIP_PAGE block
+@param dpage destination control block */
+static void buf_relocate(buf_page_t *bpage, buf_page_t *dpage)
+{
+ const ulint fold= bpage->id().fold();
+ ut_ad(bpage->state() == BUF_BLOCK_ZIP_PAGE);
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(buf_pool.hash_lock_get(bpage->id())->is_write_locked());
+ ut_a(bpage->io_fix() == BUF_IO_NONE);
+ ut_a(!bpage->buf_fix_count());
+ ut_ad(bpage == buf_pool.page_hash_get_low(bpage->id(), fold));
+ ut_ad(!buf_pool.watch_is_sentinel(*bpage));
+ ut_ad(bpage->state() == BUF_BLOCK_ZIP_PAGE);
+
+ new (dpage) buf_page_t(*bpage);
+
+ /* Important that we adjust the hazard pointer before
+ removing bpage from LRU list. */
+ if (buf_page_t *b= buf_pool.LRU_remove(bpage))
+ UT_LIST_INSERT_AFTER(buf_pool.LRU, b, dpage);
+ else
+ UT_LIST_ADD_FIRST(buf_pool.LRU, dpage);
+
+ if (UNIV_UNLIKELY(buf_pool.LRU_old == bpage))
+ {
+ buf_pool.LRU_old= dpage;
+#ifdef UNIV_LRU_DEBUG
+ /* buf_pool.LRU_old must be the first item in the LRU list
+ whose "old" flag is set. */
+ ut_a(buf_pool.LRU_old->old);
+ ut_a(!UT_LIST_GET_PREV(LRU, buf_pool.LRU_old) ||
+ !UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)->old);
+ ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old) ||
+ UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)->old);
+ }
+ else
+ {
+ /* Check that the "old" flag is consistent in
+ the block and its neighbours. */
+ dpage->set_old(dpage->is_old());
+#endif /* UNIV_LRU_DEBUG */
+ }
+
+ ut_d(CheckInLRUList::validate());
+
+ /* relocate buf_pool.page_hash */
+ ut_ad(bpage->in_page_hash);
+ ut_ad(dpage->in_page_hash);
+ ut_d(bpage->in_page_hash= false);
+ HASH_REPLACE(buf_page_t, hash, &buf_pool.page_hash, fold, bpage, dpage);
+}
+
+/** Register a watch for a page identifier. The caller must hold an
+exclusive page hash latch. The *hash_lock may be released,
+relocated, and reacquired.
+@param id page identifier
+@param hash_lock exclusively held page_hash latch
+@return a buffer pool block corresponding to id
+@retval nullptr if the block was not present, and a watch was installed */
+inline buf_page_t *buf_pool_t::watch_set(const page_id_t id,
+ page_hash_latch **hash_lock)
+{
+ const ulint fold= id.fold();
+ ut_ad(*hash_lock == page_hash.lock_get(fold));
+ ut_ad((*hash_lock)->is_write_locked());
+
+retry:
+ if (buf_page_t *bpage= page_hash_get_low(id, fold))
+ {
+ if (!watch_is_sentinel(*bpage))
+ /* The page was loaded meanwhile. */
+ return bpage;
+ /* Add to an existing watch. */
+ bpage->fix();
+ return nullptr;
+ }
+
+ (*hash_lock)->write_unlock();
+ /* Allocate a watch[] and then try to insert it into the page_hash. */
+ mysql_mutex_lock(&mutex);
+
+ /* The maximum number of purge tasks should never exceed
+ the UT_ARR_SIZE(watch) - 1, and there is no way for a purge task to hold a
+ watch when setting another watch. */
+ for (buf_page_t *w= &watch[UT_ARR_SIZE(watch)]; w-- >= watch; )
+ {
+ ut_ad(w->access_time == 0);
+ ut_ad(!w->oldest_modification());
+ ut_ad(!w->zip.data);
+ ut_ad(!w->in_zip_hash);
+ if (w->state() == BUF_BLOCK_ZIP_PAGE)
+ /* This watch may be in use for some other page. */
+ continue;
+ ut_ad(w->state() == BUF_BLOCK_NOT_USED);
+ ut_ad(!w->buf_fix_count());
+ /* w is pointing to watch[], which is protected by mutex.
+ Normally, buf_page_t::id for objects that are reachable by
+ page_hash_get_low(id, fold) are protected by hash_lock. */
+ w->set_state(BUF_BLOCK_ZIP_PAGE);
+ w->id_= id;
+
+ *hash_lock= page_hash.lock_get(fold);
+ (*hash_lock)->write_lock();
+ mysql_mutex_unlock(&mutex);
+
+ buf_page_t *bpage= page_hash_get_low(id, fold);
+ if (UNIV_LIKELY_NULL(bpage))
+ {
+ (*hash_lock)->write_unlock();
+ mysql_mutex_lock(&mutex);
+ w->set_state(BUF_BLOCK_NOT_USED);
+ *hash_lock= page_hash.lock_get(fold);
+ (*hash_lock)->write_lock();
+ mysql_mutex_unlock(&mutex);
+ goto retry;
+ }
+
+ ut_ad(!w->buf_fix_count_);
+ w->buf_fix_count_= 1;
+ ut_ad(!w->in_page_hash);
+ ut_d(w->in_page_hash= true); /* Not holding buf_pool.mutex here! */
+ HASH_INSERT(buf_page_t, hash, &page_hash, fold, w);
+ return nullptr;
+ }
+
+ ut_error;
+ mysql_mutex_unlock(&mutex);
+ return nullptr;
+}
+
+/** Mark the page status as FREED for the given tablespace id and
+page number. If the page is not in buffer pool then ignore it.
+@param[in,out] space tablespace
+@param[in] page page number
+@param[in,out] mtr mini-transaction
+@param[in] file file name
+@param[in] line line where called */
+void buf_page_free(fil_space_t *space, uint32_t page, mtr_t *mtr,
+ const char *file, unsigned line)
+{
+ ut_ad(mtr);
+ ut_ad(mtr->is_active());
+
+ if (srv_immediate_scrub_data_uncompressed
+#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
+ || space->is_compressed()
+#endif
+ )
+ mtr->add_freed_offset(space, page);
+
+ buf_pool.stat.n_page_gets++;
+ const page_id_t page_id(space->id, page);
+ const ulint fold= page_id.fold();
+ page_hash_latch *hash_lock= buf_pool.page_hash.lock<false>(fold);
+ if (buf_block_t *block= reinterpret_cast<buf_block_t*>
+ (buf_pool.page_hash_get_low(page_id, fold)))
+ {
+ if (block->page.state() != BUF_BLOCK_FILE_PAGE)
+ /* FIXME: convert, but avoid buf_zip_decompress() */;
+ else
+ {
+ buf_block_buf_fix_inc(block, file, line);
+ ut_ad(block->page.buf_fix_count());
+ hash_lock->read_unlock();
+
+ mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX);
+ rw_lock_x_lock_inline(&block->lock, 0, file, line);
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+ block->page.status= buf_page_t::FREED;
+ return;
+ }
+ }
+
+ hash_lock->read_unlock();
+}
+
+/** Get read access to a compressed page (usually of type
+FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2).
+The page must be released with buf_page_release_zip().
+NOTE: the page is not protected by any latch. Mutual exclusion has to
+be implemented at a higher level. In other words, all possible
+accesses to a given page through this function must be protected by
+the same set of mutexes or latches.
+@param[in] page_id page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size
+@return pointer to the block */
+buf_page_t* buf_page_get_zip(const page_id_t page_id, ulint zip_size)
+{
+ ut_ad(zip_size);
+ ut_ad(ut_is_2pow(zip_size));
+ buf_pool.stat.n_page_gets++;
+
+ bool discard_attempted= false;
+ const ulint fold= page_id.fold();
+ buf_page_t *bpage;
+ page_hash_latch *hash_lock;
+
+ for (;;)
+ {
+lookup:
+ bpage= buf_pool.page_hash_get_locked<false>(page_id, fold, &hash_lock);
+ if (bpage)
+ break;
+
+ dberr_t err= buf_read_page(page_id, zip_size);
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS))
+ {
+ ib::error() << "Reading compressed page " << page_id
+ << " failed with error: " << err;
+ goto err_exit;
+ }
+
+#ifdef UNIV_DEBUG
+ if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+ }
+
+ ut_ad(hash_lock->is_read_locked());
+
+ if (!bpage->zip.data)
+ {
+ /* There is no compressed page. */
+err_exit:
+ hash_lock->read_unlock();
+ return nullptr;
+ }
+
+ ut_ad(!buf_pool.watch_is_sentinel(*bpage));
+
+ switch (bpage->state()) {
+ case BUF_BLOCK_ZIP_PAGE:
+ bpage->fix();
+ goto got_block;
+ case BUF_BLOCK_FILE_PAGE:
+ /* Discard the uncompressed page frame if possible. */
+ if (!discard_attempted)
+ {
+ discard_attempted= true;
+ hash_lock->read_unlock();
+ mysql_mutex_lock(&buf_pool.mutex);
+ if (buf_page_t *bpage= buf_pool.page_hash_get_low(page_id, fold))
+ buf_LRU_free_page(bpage, false);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ goto lookup;
+ }
+
+ buf_block_buf_fix_inc(reinterpret_cast<buf_block_t*>(bpage),
+ __FILE__, __LINE__);
+ goto got_block;
+ default:
+ break;
+ }
+
+ ut_error;
+ goto err_exit;
+
+got_block:
+ bool must_read= bpage->io_fix() == BUF_IO_READ;
+ hash_lock->read_unlock();
+
+ DBUG_ASSERT(bpage->status != buf_page_t::FREED);
+
+ bpage->set_accessed();
+ buf_page_make_young_if_needed(bpage);
+
+#ifdef UNIV_DEBUG
+ if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+ ut_ad(bpage->buf_fix_count());
+ ut_ad(bpage->in_file());
+
+ if (must_read)
+ /* Let us wait until the read operation completes */
+ while (bpage->io_fix() == BUF_IO_READ)
+ os_thread_sleep(WAIT_FOR_READ);
+
+ return bpage;
+}
+
+/********************************************************************//**
+Initialize some fields of a control block. */
+UNIV_INLINE
+void
+buf_block_init_low(
+/*===============*/
+ buf_block_t* block) /*!< in: block to init */
+{
+#ifdef BTR_CUR_HASH_ADAPT
+ /* No adaptive hash index entries may point to a previously
+ unused (and now freshly allocated) block. */
+ assert_block_ahi_empty_on_init(block);
+ block->index = NULL;
+
+ block->n_hash_helps = 0;
+ block->n_fields = 1;
+ block->n_bytes = 0;
+ block->left_side = TRUE;
+#endif /* BTR_CUR_HASH_ADAPT */
+}
+
+/********************************************************************//**
+Decompress a block.
+@return TRUE if successful */
+ibool
+buf_zip_decompress(
+/*===============*/
+ buf_block_t* block, /*!< in/out: block */
+ ibool check) /*!< in: TRUE=verify the page checksum */
+{
+ const byte* frame = block->page.zip.data;
+ ulint size = page_zip_get_size(&block->page.zip);
+ /* The tablespace will not be found if this function is called
+ during IMPORT. */
+ fil_space_t* space= fil_space_t::get(block->page.id().space());
+ const unsigned key_version = mach_read_from_4(
+ frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
+ fil_space_crypt_t* crypt_data = space ? space->crypt_data : NULL;
+ const bool encrypted = crypt_data
+ && crypt_data->type != CRYPT_SCHEME_UNENCRYPTED
+ && (!crypt_data->is_default_encryption()
+ || srv_encrypt_tables);
+
+ ut_ad(block->zip_size());
+ ut_a(block->page.id().space() != 0);
+
+ if (UNIV_UNLIKELY(check && !page_zip_verify_checksum(frame, size))) {
+
+ ib::error() << "Compressed page checksum mismatch for "
+ << (space ? space->chain.start->name : "")
+ << block->page.id() << ": stored: "
+ << mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM)
+ << ", crc32: "
+ << page_zip_calc_checksum(
+ frame, size, SRV_CHECKSUM_ALGORITHM_CRC32)
+ << " innodb: "
+ << page_zip_calc_checksum(
+ frame, size, SRV_CHECKSUM_ALGORITHM_INNODB)
+ << ", none: "
+ << page_zip_calc_checksum(
+ frame, size, SRV_CHECKSUM_ALGORITHM_NONE)
+ << " (algorithm: " << srv_checksum_algorithm << ")";
+ goto err_exit;
+ }
+
+ switch (fil_page_get_type(frame)) {
+ case FIL_PAGE_INDEX:
+ case FIL_PAGE_RTREE:
+ if (page_zip_decompress(&block->page.zip,
+ block->frame, TRUE)) {
+ if (space) {
+ space->release();
+ }
+ return(TRUE);
+ }
+
+ ib::error() << "Unable to decompress "
+ << (space ? space->chain.start->name : "")
+ << block->page.id();
+ goto err_exit;
+ case FIL_PAGE_TYPE_ALLOCATED:
+ case FIL_PAGE_INODE:
+ case FIL_PAGE_IBUF_BITMAP:
+ case FIL_PAGE_TYPE_FSP_HDR:
+ case FIL_PAGE_TYPE_XDES:
+ case FIL_PAGE_TYPE_ZBLOB:
+ case FIL_PAGE_TYPE_ZBLOB2:
+ /* Copy to uncompressed storage. */
+ memcpy(block->frame, frame, block->zip_size());
+ if (space) {
+ space->release();
+ }
+
+ return(TRUE);
+ }
+
+ ib::error() << "Unknown compressed page type "
+ << fil_page_get_type(frame)
+ << " in " << (space ? space->chain.start->name : "")
+ << block->page.id();
+
+err_exit:
+ if (encrypted) {
+ ib::info() << "Row compressed page could be encrypted"
+ " with key_version " << key_version;
+ }
+
+ if (space) {
+ if (encrypted) {
+ dict_set_encrypted_by_space(space);
+ } else {
+ dict_set_corrupted_by_space(space);
+ }
+
+ space->release();
+ }
+
+ return(FALSE);
+}
+
+/** Wait for the block to be read in.
+@param[in] block The block to check */
+static
+void
+buf_wait_for_read(
+ buf_block_t* block)
+{
+ /* Note:
+
+ We are using the block->lock to check for IO state.
+ We set the IO_READ state under the protection of the hash_lock.
+ This is safe because another thread can only
+ access the block (and check for IO state) after the block has been
+ added to the page hashtable. */
+
+ while (block->page.io_fix() == BUF_IO_READ) {
+ rw_lock_s_lock(&block->lock);
+ rw_lock_s_unlock(&block->lock);
+ }
+}
+
+#ifdef BTR_CUR_HASH_ADAPT
+/** If a stale adaptive hash index exists on the block, drop it.
+Multiple executions of btr_search_drop_page_hash_index() on the
+same block must be prevented by exclusive page latch. */
+ATTRIBUTE_COLD
+static void buf_defer_drop_ahi(buf_block_t *block, mtr_memo_type_t fix_type)
+{
+ switch (fix_type) {
+ case MTR_MEMO_BUF_FIX:
+ /* We do not drop the adaptive hash index, because safely doing
+ so would require acquiring block->lock, and that is not safe
+ to acquire in some RW_NO_LATCH access paths. Those code paths
+ should have no business accessing the adaptive hash index anyway. */
+ break;
+ case MTR_MEMO_PAGE_S_FIX:
+ /* Temporarily release our S-latch. */
+ rw_lock_s_unlock(&block->lock);
+ rw_lock_x_lock(&block->lock);
+ if (dict_index_t *index= block->index)
+ if (index->freed())
+ btr_search_drop_page_hash_index(block);
+ rw_lock_x_unlock(&block->lock);
+ rw_lock_s_lock(&block->lock);
+ break;
+ case MTR_MEMO_PAGE_SX_FIX:
+ rw_lock_sx_unlock(&block->lock);
+ rw_lock_x_lock(&block->lock);
+ if (dict_index_t *index= block->index)
+ if (index->freed())
+ btr_search_drop_page_hash_index(block);
+ rw_lock_x_unlock(&block->lock);
+ rw_lock_sx_lock(&block->lock);
+ break;
+ default:
+ ut_ad(fix_type == MTR_MEMO_PAGE_X_FIX);
+ btr_search_drop_page_hash_index(block);
+ }
+}
+#endif /* BTR_CUR_HASH_ADAPT */
+
+/** Lock the page with the given latch type.
+@param[in,out] block block to be locked
+@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
+@param[in] mtr mini-transaction
+@param[in] file file name
+@param[in] line line where called
+@return pointer to locked block */
+static buf_block_t* buf_page_mtr_lock(buf_block_t *block,
+ ulint rw_latch,
+ mtr_t* mtr,
+ const char *file,
+ unsigned line)
+{
+ mtr_memo_type_t fix_type;
+ switch (rw_latch)
+ {
+ case RW_NO_LATCH:
+ fix_type= MTR_MEMO_BUF_FIX;
+ goto done;
+ case RW_S_LATCH:
+ rw_lock_s_lock_inline(&block->lock, 0, file, line);
+ fix_type= MTR_MEMO_PAGE_S_FIX;
+ break;
+ case RW_SX_LATCH:
+ rw_lock_sx_lock_inline(&block->lock, 0, file, line);
+ fix_type= MTR_MEMO_PAGE_SX_FIX;
+ break;
+ default:
+ ut_ad(rw_latch == RW_X_LATCH);
+ rw_lock_x_lock_inline(&block->lock, 0, file, line);
+ fix_type= MTR_MEMO_PAGE_X_FIX;
+ break;
+ }
+
+#ifdef BTR_CUR_HASH_ADAPT
+ {
+ dict_index_t *index= block->index;
+ if (index && index->freed())
+ buf_defer_drop_ahi(block, fix_type);
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+done:
+ mtr_memo_push(mtr, block, fix_type);
+ return block;
+}
+
+/** Low level function used to get access to a database page.
+@param[in] page_id page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
+@param[in] guess guessed block or NULL
+@param[in] mode BUF_GET, BUF_GET_IF_IN_POOL,
+BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or BUF_GET_IF_IN_POOL_OR_WATCH
+@param[in] file file name
+@param[in] line line where called
+@param[in] mtr mini-transaction
+@param[out] err DB_SUCCESS or error code
+@param[in] allow_ibuf_merge Allow change buffer merge to happen
+while reading the page from file
+then it makes sure that it does merging of change buffer changes while
+reading the page from file.
+@return pointer to the block or NULL */
+buf_block_t*
+buf_page_get_low(
+ const page_id_t page_id,
+ ulint zip_size,
+ ulint rw_latch,
+ buf_block_t* guess,
+ ulint mode,
+ const char* file,
+ unsigned line,
+ mtr_t* mtr,
+ dberr_t* err,
+ bool allow_ibuf_merge)
+{
+ buf_block_t* block;
+ unsigned access_time;
+ ulint retries = 0;
+ const ulint fold = page_id.fold();
+
+ ut_ad((mtr == NULL) == (mode == BUF_EVICT_IF_IN_POOL));
+ ut_ad(!mtr || mtr->is_active());
+ ut_ad((rw_latch == RW_S_LATCH)
+ || (rw_latch == RW_X_LATCH)
+ || (rw_latch == RW_SX_LATCH)
+ || (rw_latch == RW_NO_LATCH));
+ ut_ad(!allow_ibuf_merge
+ || mode == BUF_GET
+ || mode == BUF_GET_POSSIBLY_FREED
+ || mode == BUF_GET_IF_IN_POOL
+ || mode == BUF_GET_IF_IN_POOL_OR_WATCH);
+
+ if (err) {
+ *err = DB_SUCCESS;
+ }
+
+#ifdef UNIV_DEBUG
+ switch (mode) {
+ case BUF_EVICT_IF_IN_POOL:
+ /* After DISCARD TABLESPACE, the tablespace would not exist,
+ but in IMPORT TABLESPACE, PageConverter::operator() must
+ replace any old pages, which were not evicted during DISCARD.
+ Skip the assertion on space_page_size. */
+ break;
+ case BUF_PEEK_IF_IN_POOL:
+ case BUF_GET_IF_IN_POOL:
+ /* The caller may pass a dummy page size,
+ because it does not really matter. */
+ break;
+ default:
+ ut_error;
+ case BUF_GET_POSSIBLY_FREED:
+ break;
+ case BUF_GET_NO_LATCH:
+ ut_ad(rw_latch == RW_NO_LATCH);
+ /* fall through */
+ case BUF_GET:
+ case BUF_GET_IF_IN_POOL_OR_WATCH:
+ fil_space_t* s = fil_space_get(page_id.space());
+ ut_ad(s);
+ ut_ad(s->zip_size() == zip_size);
+ }
+#endif /* UNIV_DEBUG */
+
+ ut_ad(!mtr || !ibuf_inside(mtr)
+ || ibuf_page_low(page_id, zip_size, FALSE, file, line, NULL));
+
+ buf_pool.stat.n_page_gets++;
+loop:
+ buf_block_t* fix_block;
+ block = guess;
+
+ page_hash_latch* hash_lock = buf_pool.page_hash.lock<false>(fold);
+
+ if (block) {
+
+ /* If the guess is a compressed page descriptor that
+ has been allocated by buf_page_alloc_descriptor(),
+ it may have been freed by buf_relocate(). */
+
+ if (!buf_pool.is_uncompressed(block)
+ || page_id != block->page.id()
+ || block->page.state() != BUF_BLOCK_FILE_PAGE) {
+ /* Our guess was bogus or things have changed
+ since. */
+ guess = nullptr;
+ goto lookup;
+ } else {
+ ut_ad(!block->page.in_zip_hash);
+ }
+ } else {
+lookup:
+ block = reinterpret_cast<buf_block_t*>(
+ buf_pool.page_hash_get_low(page_id, fold));
+ }
+
+ if (!block || buf_pool.watch_is_sentinel(block->page)) {
+ hash_lock->read_unlock();
+ block = nullptr;
+ }
+
+ if (UNIV_UNLIKELY(!block)) {
+ /* Page not in buf_pool: needs to be read from file */
+ if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
+ hash_lock = buf_pool.page_hash.lock<true>(fold);
+
+ if (buf_page_t *bpage= buf_pool.watch_set(
+ page_id, &hash_lock)) {
+ /* We can release hash_lock after we
+ increment the fix count to make
+ sure that no state change takes place. */
+ bpage->fix();
+ hash_lock->write_unlock();
+ block = reinterpret_cast<buf_block_t*>(bpage);
+ fix_block = block;
+ goto got_block;
+ }
+
+ hash_lock->write_unlock();
+ }
+
+ switch (mode) {
+ case BUF_GET_IF_IN_POOL:
+ case BUF_GET_IF_IN_POOL_OR_WATCH:
+ case BUF_PEEK_IF_IN_POOL:
+ case BUF_EVICT_IF_IN_POOL:
+ return(NULL);
+ }
+
+ /* The call path is buf_read_page() ->
+ buf_read_page_low() (fil_space_t::io()) ->
+ buf_page_read_complete() ->
+ buf_decrypt_after_read(). Here fil_space_t* is used
+ and we decrypt -> buf_page_check_corrupt() where page
+ checksums are compared. Decryption, decompression as
+ well as error handling takes place at a lower level.
+ Here we only need to know whether the page really is
+ corrupted, or if an encrypted page with a valid
+ checksum cannot be decypted. */
+
+ dberr_t local_err = buf_read_page(page_id, zip_size);
+
+ if (local_err == DB_SUCCESS) {
+ buf_read_ahead_random(page_id, zip_size,
+ ibuf_inside(mtr));
+
+ retries = 0;
+ } else if (mode == BUF_GET_POSSIBLY_FREED) {
+ if (err) {
+ *err = local_err;
+ }
+ return NULL;
+ } else if (retries < BUF_PAGE_READ_MAX_RETRIES) {
+ ++retries;
+
+ DBUG_EXECUTE_IF(
+ "innodb_page_corruption_retries",
+ retries = BUF_PAGE_READ_MAX_RETRIES;
+ );
+ } else {
+ if (err) {
+ *err = local_err;
+ }
+
+ /* Pages whose encryption key is unavailable or used
+ key, encryption algorithm or encryption method is
+ incorrect are marked as encrypted in
+ buf_page_check_corrupt(). Unencrypted page could be
+ corrupted in a way where the key_id field is
+ nonzero. There is no checksum on field
+ FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION. */
+ if (local_err == DB_DECRYPTION_FAILED) {
+ return (NULL);
+ }
+
+ if (local_err == DB_PAGE_CORRUPTED
+ && srv_force_recovery) {
+ return NULL;
+ }
+
+ /* Try to set table as corrupted instead of
+ asserting. */
+ if (page_id.space() == TRX_SYS_SPACE) {
+ } else if (page_id.space() == SRV_TMP_SPACE_ID) {
+ } else if (fil_space_t* space= fil_space_t::get(
+ page_id.space())) {
+ bool set = dict_set_corrupted_by_space(space);
+ space->release();
+ if (set) {
+ return NULL;
+ }
+ }
+
+ ib::fatal() << "Unable to read page " << page_id
+ << " into the buffer pool after "
+ << BUF_PAGE_READ_MAX_RETRIES
+ << ". The most probable cause"
+ " of this error may be that the"
+ " table has been corrupted."
+ " See https://mariadb.com/kb/en/library/innodb-recovery-modes/";
+ }
+
+#ifdef UNIV_DEBUG
+ if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+ goto loop;
+ } else {
+ fix_block = block;
+ }
+
+ fix_block->fix();
+ hash_lock->read_unlock();
+
+got_block:
+ switch (mode) {
+ default:
+ ut_ad(block->zip_size() == zip_size);
+ break;
+ case BUF_GET_IF_IN_POOL:
+ case BUF_PEEK_IF_IN_POOL:
+ case BUF_EVICT_IF_IN_POOL:
+ if (fix_block->page.io_fix() == BUF_IO_READ) {
+ /* The page is being read to buffer pool,
+ but we cannot wait around for the read to
+ complete. */
+ fix_block->unfix();
+ return(NULL);
+ }
+ }
+
+ switch (UNIV_EXPECT(fix_block->page.state(), BUF_BLOCK_FILE_PAGE)) {
+ case BUF_BLOCK_FILE_PAGE:
+ if (fsp_is_system_temporary(page_id.space())
+ && block->page.io_fix() != BUF_IO_NONE) {
+ /* This suggests that the page is being flushed.
+ Avoid returning reference to this page.
+ Instead wait for the flush action to complete. */
+ fix_block->unfix();
+ os_thread_sleep(WAIT_FOR_WRITE);
+ goto loop;
+ }
+
+ if (UNIV_UNLIKELY(mode == BUF_EVICT_IF_IN_POOL)) {
+evict_from_pool:
+ ut_ad(!fix_block->page.oldest_modification());
+ mysql_mutex_lock(&buf_pool.mutex);
+ fix_block->unfix();
+
+ if (!buf_LRU_free_page(&fix_block->page, true)) {
+ ut_ad(0);
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+ return(NULL);
+ }
+
+ break;
+ default:
+ ut_error;
+ break;
+
+ case BUF_BLOCK_ZIP_PAGE:
+ if (UNIV_UNLIKELY(mode == BUF_EVICT_IF_IN_POOL)) {
+ goto evict_from_pool;
+ }
+
+ if (mode == BUF_PEEK_IF_IN_POOL) {
+ /* This mode is only used for dropping an
+ adaptive hash index. There cannot be an
+ adaptive hash index for a compressed-only
+ page, so do not bother decompressing the page. */
+ fix_block->unfix();
+
+ return(NULL);
+ }
+
+ buf_page_t* bpage = &block->page;
+
+ /* Note: We have already buffer fixed this block. */
+ if (bpage->buf_fix_count() > 1
+ || bpage->io_fix() != BUF_IO_NONE) {
+
+ /* This condition often occurs when the buffer
+ is not buffer-fixed, but I/O-fixed by
+ buf_page_init_for_read(). */
+ fix_block->unfix();
+
+ /* The block is buffer-fixed or I/O-fixed.
+ Try again later. */
+ os_thread_sleep(WAIT_FOR_READ);
+
+ goto loop;
+ }
+
+ /* Buffer-fix the block so that it cannot be evicted
+ or relocated while we are attempting to allocate an
+ uncompressed page. */
+
+ block = buf_LRU_get_free_block(false);
+ buf_block_init_low(block);
+
+ mysql_mutex_lock(&buf_pool.mutex);
+ hash_lock = buf_pool.page_hash.lock_get(fold);
+
+ hash_lock->write_lock();
+
+ /* Buffer-fixing prevents the page_hash from changing. */
+ ut_ad(bpage == buf_pool.page_hash_get_low(page_id, fold));
+
+ fix_block->unfix(); /* hash_lock protects us after this */
+
+ if (bpage->buf_fix_count() || bpage->io_fix() != BUF_IO_NONE) {
+ /* The block was buffer-fixed or I/O-fixed while
+ buf_pool.mutex was not held by this thread.
+ Free the block that was allocated and retry.
+ This should be extremely unlikely, for example,
+ if buf_page_get_zip() was invoked. */
+
+ hash_lock->write_unlock();
+ buf_LRU_block_free_non_file_page(block);
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ /* Try again */
+ goto loop;
+ }
+
+ fix_block = block;
+
+ /* Move the compressed page from bpage to block,
+ and uncompress it. */
+
+ /* Note: this is the uncompressed block and it is not
+ accessible by other threads yet because it is not in
+ any list or hash table */
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ buf_relocate(bpage, &block->page);
+
+ /* Set after buf_relocate(). */
+ block->page.set_buf_fix_count(1);
+
+ buf_flush_relocate_on_flush_list(bpage, &block->page);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ /* Buffer-fix, I/O-fix, and X-latch the block
+ for the duration of the decompression.
+ Also add the block to the unzip_LRU list. */
+ block->page.set_state(BUF_BLOCK_FILE_PAGE);
+
+ /* Insert at the front of unzip_LRU list */
+ buf_unzip_LRU_add_block(block, FALSE);
+
+ block->page.set_io_fix(BUF_IO_READ);
+ rw_lock_x_lock_inline(&block->lock, 0, file, line);
+
+ MEM_UNDEFINED(bpage, sizeof *bpage);
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+ hash_lock->write_unlock();
+ buf_pool.n_pend_unzip++;
+
+ access_time = block->page.is_accessed();
+
+ if (!access_time && !recv_no_ibuf_operations
+ && ibuf_page_exists(block->page.id(), zip_size)) {
+ block->page.ibuf_exist = true;
+ }
+
+ buf_page_free_descriptor(bpage);
+
+ /* Decompress the page while not holding
+ buf_pool.mutex. */
+
+ if (!buf_zip_decompress(block, false)) {
+ rw_lock_x_unlock(&fix_block->lock);
+ fix_block->page.io_unfix();
+ fix_block->unfix();
+ --buf_pool.n_pend_unzip;
+
+ if (err) {
+ *err = DB_PAGE_CORRUPTED;
+ }
+ return NULL;
+ }
+
+ rw_lock_x_unlock(&block->lock);
+ fix_block->page.io_unfix();
+ --buf_pool.n_pend_unzip;
+ break;
+ }
+
+ ut_ad(block == fix_block);
+ ut_ad(fix_block->page.buf_fix_count());
+
+ ut_ad(fix_block->page.state() == BUF_BLOCK_FILE_PAGE);
+
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+re_evict:
+ if (mode != BUF_GET_IF_IN_POOL
+ && mode != BUF_GET_IF_IN_POOL_OR_WATCH) {
+ } else if (!ibuf_debug) {
+ } else if (fil_space_t* space = fil_space_t::get(page_id.space())) {
+ /* Try to evict the block from the buffer pool, to use the
+ insert buffer (change buffer) as much as possible. */
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ fix_block->unfix();
+
+ /* Blocks cannot be relocated or enter or exit the
+ buf_pool while we are holding the buf_pool.mutex. */
+ const bool evicted = buf_LRU_free_page(&fix_block->page, true);
+ space->release();
+
+ if (evicted) {
+ hash_lock = buf_pool.page_hash.lock_get(fold);
+ hash_lock->write_lock();
+ mysql_mutex_unlock(&buf_pool.mutex);
+ /* We may set the watch, as it would have
+ been set if the page were not in the
+ buffer pool in the first place. */
+ block= reinterpret_cast<buf_block_t*>(
+ mode == BUF_GET_IF_IN_POOL_OR_WATCH
+ ? buf_pool.watch_set(page_id, &hash_lock)
+ : buf_pool.page_hash_get_low(page_id, fold));
+ hash_lock->write_unlock();
+
+ if (block != NULL) {
+ /* Either the page has been read in or
+ a watch was set on that in the window
+ where we released the buf_pool.mutex
+ and before we acquire the hash_lock
+ above. Try again. */
+ guess = block;
+
+ goto loop;
+ }
+
+ return(NULL);
+ }
+
+ fix_block->fix();
+ mysql_mutex_unlock(&buf_pool.mutex);
+ buf_flush_list();
+ buf_flush_wait_batch_end_acquiring_mutex(false);
+ while (buf_flush_list_space(space));
+ os_aio_wait_until_no_pending_writes();
+
+ if (fix_block->page.buf_fix_count() == 1
+ && !fix_block->page.oldest_modification()) {
+ goto re_evict;
+ }
+
+ /* Failed to evict the page; change it directly */
+ }
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+
+ ut_ad(fix_block->page.buf_fix_count());
+
+#ifdef UNIV_DEBUG
+ /* We have already buffer fixed the page, and we are committed to
+ returning this page to the caller. Register for debugging.
+ Avoid debug latching if page/block belongs to system temporary
+ tablespace (Not much needed for table with single threaded access.). */
+ if (!fsp_is_system_temporary(page_id.space())) {
+ ibool ret;
+ ret = rw_lock_s_lock_nowait(
+ fix_block->debug_latch, file, line);
+ ut_a(ret);
+ }
+#endif /* UNIV_DEBUG */
+
+ /* While tablespace is reinited the indexes are already freed but the
+ blocks related to it still resides in buffer pool. Trying to remove
+ such blocks from buffer pool would invoke removal of AHI entries
+ associated with these blocks. Logic to remove AHI entry will try to
+ load the block but block is already in free state. Handle the said case
+ with mode = BUF_PEEK_IF_IN_POOL that is invoked from
+ "btr_search_drop_page_hash_when_freed". */
+ ut_ad(mode == BUF_GET_POSSIBLY_FREED
+ || mode == BUF_PEEK_IF_IN_POOL
+ || fix_block->page.status != buf_page_t::FREED);
+
+ const bool not_first_access = fix_block->page.set_accessed();
+
+ if (mode != BUF_PEEK_IF_IN_POOL) {
+ buf_page_make_young_if_needed(&fix_block->page);
+ }
+
+#ifdef UNIV_DEBUG
+ if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+ ut_ad(fix_block->page.state() == BUF_BLOCK_FILE_PAGE);
+
+ /* We have to wait here because the IO_READ state was set
+ under the protection of the hash_lock and not block->lock. */
+ buf_wait_for_read(fix_block);
+
+ if (fix_block->page.id() != page_id) {
+ fix_block->unfix();
+
+#ifdef UNIV_DEBUG
+ if (!fsp_is_system_temporary(page_id.space())) {
+ rw_lock_s_unlock(fix_block->debug_latch);
+ }
+#endif /* UNIV_DEBUG */
+
+ if (err) {
+ *err = DB_PAGE_CORRUPTED;
+ }
+
+ return NULL;
+ }
+
+ if (fix_block->page.status != buf_page_t::FREED
+ && allow_ibuf_merge
+ && fil_page_get_type(fix_block->frame) == FIL_PAGE_INDEX
+ && page_is_leaf(fix_block->frame)) {
+ rw_lock_x_lock_inline(&fix_block->lock, 0, file, line);
+
+ if (fix_block->page.ibuf_exist) {
+ fix_block->page.ibuf_exist = false;
+ ibuf_merge_or_delete_for_page(fix_block, page_id,
+ zip_size);
+ }
+
+ if (rw_latch == RW_X_LATCH) {
+ mtr->memo_push(fix_block, MTR_MEMO_PAGE_X_FIX);
+ } else {
+ rw_lock_x_unlock(&fix_block->lock);
+ goto get_latch;
+ }
+ } else {
+get_latch:
+ fix_block = buf_page_mtr_lock(fix_block, rw_latch, mtr,
+ file, line);
+ }
+
+ if (!not_first_access && mode != BUF_PEEK_IF_IN_POOL) {
+ /* In the case of a first access, try to apply linear
+ read-ahead */
+
+ buf_read_ahead_linear(page_id, zip_size, ibuf_inside(mtr));
+ }
+
+ return(fix_block);
+}
+
+/** Get access to a database page. Buffered redo log may be applied.
+@param[in] page_id page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
+@param[in] guess guessed block or NULL
+@param[in] mode BUF_GET, BUF_GET_IF_IN_POOL,
+BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or BUF_GET_IF_IN_POOL_OR_WATCH
+@param[in] file file name
+@param[in] line line where called
+@param[in] mtr mini-transaction
+@param[out] err DB_SUCCESS or error code
+@param[in] allow_ibuf_merge Allow change buffer merge while
+reading the pages from file.
+@return pointer to the block or NULL */
+buf_block_t*
+buf_page_get_gen(
+ const page_id_t page_id,
+ ulint zip_size,
+ ulint rw_latch,
+ buf_block_t* guess,
+ ulint mode,
+ const char* file,
+ unsigned line,
+ mtr_t* mtr,
+ dberr_t* err,
+ bool allow_ibuf_merge)
+{
+ if (buf_block_t *block= recv_sys.recover(page_id))
+ {
+ block->fix();
+ ut_ad(rw_lock_s_lock_nowait(block->debug_latch, file, line));
+ if (err)
+ *err= DB_SUCCESS;
+ const bool must_merge= allow_ibuf_merge &&
+ ibuf_page_exists(page_id, block->zip_size());
+ if (block->page.status == buf_page_t::FREED)
+ ut_ad(mode == BUF_GET_POSSIBLY_FREED || mode == BUF_PEEK_IF_IN_POOL);
+ else if (must_merge && fil_page_get_type(block->frame) == FIL_PAGE_INDEX &&
+ page_is_leaf(block->frame))
+ {
+ rw_lock_x_lock_inline(&block->lock, 0, file, line);
+ block->page.ibuf_exist= false;
+ ibuf_merge_or_delete_for_page(block, page_id, block->zip_size());
+
+ if (rw_latch == RW_X_LATCH)
+ {
+ mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX);
+ return block;
+ }
+ rw_lock_x_unlock(&block->lock);
+ }
+ block= buf_page_mtr_lock(block, rw_latch, mtr, file, line);
+ return block;
+ }
+
+ return buf_page_get_low(page_id, zip_size, rw_latch,
+ guess, mode, file, line, mtr, err, allow_ibuf_merge);
+}
+
+/********************************************************************//**
+This is the general function used to get optimistic access to a database
+page.
+@return TRUE if success */
+ibool
+buf_page_optimistic_get(
+/*====================*/
+ ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
+ buf_block_t* block, /*!< in: guessed buffer block */
+ ib_uint64_t modify_clock,/*!< in: modify clock value */
+ const char* file, /*!< in: file name */
+ unsigned line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ ibool success;
+
+ ut_ad(block);
+ ut_ad(mtr);
+ ut_ad(mtr->is_active());
+ ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH);
+
+ if (UNIV_UNLIKELY(block->page.state() != BUF_BLOCK_FILE_PAGE
+ || block->page.io_fix() != BUF_IO_NONE)) {
+ return FALSE;
+ }
+
+ const page_id_t id(block->page.id());
+
+ page_hash_latch *hash_lock = buf_pool.hash_lock_get(id);
+ hash_lock->read_lock();
+
+ if (UNIV_UNLIKELY(id != block->page.id()
+ || block->page.state() != BUF_BLOCK_FILE_PAGE
+ || block->page.io_fix() != BUF_IO_NONE)) {
+ hash_lock->read_unlock();
+ return(FALSE);
+ }
+
+ buf_block_buf_fix_inc(block, file, line);
+ hash_lock->read_unlock();
+
+ block->page.set_accessed();
+
+ buf_page_make_young_if_needed(&block->page);
+
+ ut_ad(!ibuf_inside(mtr) || ibuf_page(id, block->zip_size(), NULL));
+
+ mtr_memo_type_t fix_type;
+
+ if (rw_latch == RW_S_LATCH) {
+ fix_type = MTR_MEMO_PAGE_S_FIX;
+ success = rw_lock_s_lock_nowait(&block->lock, file, line);
+ } else {
+ fix_type = MTR_MEMO_PAGE_X_FIX;
+ success = rw_lock_x_lock_func_nowait_inline(
+ &block->lock, file, line);
+ }
+
+ ut_ad(id == block->page.id());
+
+ if (!success) {
+ buf_block_buf_fix_dec(block);
+ return(FALSE);
+ }
+
+ if (modify_clock != block->modify_clock) {
+
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+ if (rw_latch == RW_S_LATCH) {
+ rw_lock_s_unlock(&block->lock);
+ } else {
+ rw_lock_x_unlock(&block->lock);
+ }
+
+ buf_block_buf_fix_dec(block);
+ return(FALSE);
+ }
+
+ mtr_memo_push(mtr, block, fix_type);
+
+#ifdef UNIV_DEBUG
+ if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+ ut_ad(block->page.buf_fix_count());
+ ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);
+
+ buf_pool.stat.n_page_gets++;
+
+ return(TRUE);
+}
+
+/** Given a tablespace id and page number tries to get that page. If the
+page is not in the buffer pool it is not loaded and NULL is returned.
+Suitable for using when holding the lock_sys_t::mutex.
+@param[in] page_id page id
+@param[in] file file name
+@param[in] line line where called
+@param[in] mtr mini-transaction
+@return pointer to a page or NULL */
+buf_block_t*
+buf_page_try_get_func(
+ const page_id_t page_id,
+ const char* file,
+ unsigned line,
+ mtr_t* mtr)
+{
+ ut_ad(mtr);
+ ut_ad(mtr->is_active());
+
+ page_hash_latch *hash_lock;
+ buf_page_t *bpage= buf_pool.page_hash_get_locked<false>(page_id,
+ page_id.fold(),
+ &hash_lock);
+ if (!bpage)
+ return nullptr;
+ if (bpage->state() != BUF_BLOCK_FILE_PAGE)
+ {
+ hash_lock->read_unlock();
+ return nullptr;
+ }
+
+ buf_block_t *block= reinterpret_cast<buf_block_t*>(bpage);
+ buf_block_buf_fix_inc(block, file, line);
+ hash_lock->read_unlock();
+
+ mtr_memo_type_t fix_type= MTR_MEMO_PAGE_S_FIX;
+ if (!rw_lock_s_lock_nowait(&block->lock, file, line))
+ {
+ /* Let us try to get an X-latch. If the current thread
+ is holding an X-latch on the page, we cannot get an S-latch. */
+ fix_type= MTR_MEMO_PAGE_X_FIX;
+ if (!rw_lock_x_lock_func_nowait_inline(&block->lock, file, line))
+ {
+ buf_block_buf_fix_dec(block);
+ return nullptr;
+ }
+ }
+
+ mtr_memo_push(mtr, block, fix_type);
+
+#ifdef UNIV_DEBUG
+ if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+ ut_ad(bpage->buf_fix_count());
+ ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE);
+ ut_ad(bpage->id() == page_id);
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+ buf_pool.stat.n_page_gets++;
+ return block;
+}
+
+/** Initialize the block.
+@param page_id page identifier
+@param zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param fix initial buf_fix_count() */
+void buf_block_t::initialise(const page_id_t page_id, ulint zip_size,
+ uint32_t fix)
+{
+ ut_ad(page.state() != BUF_BLOCK_FILE_PAGE);
+ buf_block_init_low(this);
+ page.init(page_id, fix);
+ page_zip_set_size(&page.zip, zip_size);
+}
+
+/** Initialize a page in the buffer pool. The page is usually not read
+from a file even if it cannot be found in the buffer buf_pool. This is one
+of the functions which perform to a block a state transition NOT_USED =>
+FILE_PAGE (the other is buf_page_get_gen).
+@param[in,out] space space object
+@param[in] offset offset of the tablespace
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out] mtr mini-transaction
+@param[in,out] free_block pre-allocated buffer block
+@return pointer to the block, page bufferfixed */
+buf_block_t*
+buf_page_create(fil_space_t *space, uint32_t offset,
+ ulint zip_size, mtr_t *mtr, buf_block_t *free_block)
+{
+ page_id_t page_id(space->id, offset);
+ ut_ad(mtr->is_active());
+ ut_ad(page_id.space() != 0 || !zip_size);
+
+ space->free_page(offset, false);
+ free_block->initialise(page_id, zip_size, 1);
+
+ const ulint fold= page_id.fold();
+ mysql_mutex_lock(&buf_pool.mutex);
+
+loop:
+ buf_block_t *block= reinterpret_cast<buf_block_t*>
+ (buf_pool.page_hash_get_low(page_id, fold));
+
+ if (block && block->page.in_file() &&
+ !buf_pool.watch_is_sentinel(block->page))
+ {
+#ifdef BTR_CUR_HASH_ADAPT
+ const dict_index_t *drop_hash_entry= nullptr;
+#endif
+ switch (UNIV_EXPECT(block->page.state(), BUF_BLOCK_FILE_PAGE)) {
+ default:
+ ut_ad(0);
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ if (!mtr->have_x_latch(*block))
+ {
+ buf_block_buf_fix_inc(block, __FILE__, __LINE__);
+ while (!rw_lock_x_lock_nowait(&block->lock))
+ {
+ /* Wait for buf_page_write_complete() to release block->lock.
+ We must not hold buf_pool.mutex while waiting. */
+ timespec abstime;
+ set_timespec_nsec(abstime, 1000000);
+ my_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.mutex.m_mutex,
+ &abstime);
+ }
+ mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX);
+ }
+ else
+ {
+ ut_ad(!block->page.ibuf_exist);
+#ifdef BTR_CUR_HASH_ADAPT
+ ut_ad(!block->index);
+#endif
+ }
+#ifdef BTR_CUR_HASH_ADAPT
+ drop_hash_entry= block->index;
+#endif
+ break;
+ case BUF_BLOCK_ZIP_PAGE:
+ page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
+ hash_lock->write_lock();
+ if (block->page.io_fix() != BUF_IO_NONE)
+ {
+ hash_lock->write_unlock();
+ /* Wait for buf_page_write_complete() to release the I/O fix. */
+ timespec abstime;
+ set_timespec_nsec(abstime, 1000000);
+ my_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.mutex.m_mutex,
+ &abstime);
+ goto loop;
+ }
+
+ rw_lock_x_lock(&free_block->lock);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ buf_relocate(&block->page, &free_block->page);
+ buf_flush_relocate_on_flush_list(&block->page, &free_block->page);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ free_block->page.set_state(BUF_BLOCK_FILE_PAGE);
+ buf_unzip_LRU_add_block(free_block, FALSE);
+ hash_lock->write_unlock();
+ buf_page_free_descriptor(&block->page);
+ block= free_block;
+ buf_block_buf_fix_inc(block, __FILE__, __LINE__);
+ mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX);
+ break;
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+#ifdef BTR_CUR_HASH_ADAPT
+ if (drop_hash_entry)
+ btr_search_drop_page_hash_index(block);
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ if (block->page.ibuf_exist)
+ {
+ if (!recv_recovery_is_on())
+ ibuf_merge_or_delete_for_page(nullptr, page_id, zip_size);
+ block->page.ibuf_exist= false;
+ }
+
+ return block;
+ }
+
+ /* If we get here, the page was not in buf_pool: init it there */
+
+ DBUG_PRINT("ib_buf", ("create page %u:%u",
+ page_id.space(), page_id.page_no()));
+
+ block= free_block;
+
+ /* Duplicate buf_block_buf_fix_inc_func() */
+ ut_ad(block->page.buf_fix_count() == 1);
+ ut_ad(fsp_is_system_temporary(page_id.space()) ||
+ rw_lock_s_lock_nowait(block->debug_latch, __FILE__, __LINE__));
+
+ /* The block must be put to the LRU list */
+ buf_LRU_add_block(&block->page, false);
+ page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
+ hash_lock->write_lock();
+ block->page.set_state(BUF_BLOCK_FILE_PAGE);
+ ut_d(block->page.in_page_hash= true);
+ HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, &block->page);
+
+ rw_lock_x_lock(&block->lock);
+ if (UNIV_UNLIKELY(zip_size))
+ {
+ /* Prevent race conditions during buf_buddy_alloc(), which may
+ release and reacquire buf_pool.mutex, by IO-fixing and X-latching
+ the block. */
+ block->page.set_io_fix(BUF_IO_READ);
+ hash_lock->write_unlock();
+
+ /* buf_pool.mutex may be released and reacquired by
+ buf_buddy_alloc(). We must defer this operation until
+ after the block descriptor has been added to
+ buf_pool.LRU and buf_pool.page_hash. */
+ block->page.zip.data= buf_buddy_alloc(zip_size);
+
+ /* To maintain the invariant block->in_unzip_LRU_list ==
+ block->page.belongs_to_unzip_LRU() we have to add this
+ block to unzip_LRU after block->page.zip.data is set. */
+ ut_ad(block->page.belongs_to_unzip_LRU());
+ buf_unzip_LRU_add_block(block, FALSE);
+
+ block->page.set_io_fix(BUF_IO_NONE);
+ }
+ else
+ hash_lock->write_unlock();
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX);
+ block->page.set_accessed();
+ buf_pool.stat.n_pages_created++;
+
+ /* Delete possible entries for the page from the insert buffer:
+ such can exist if the page belonged to an index which was dropped */
+ if (page_id < page_id_t{SRV_SPACE_ID_UPPER_BOUND, 0} &&
+ !recv_recovery_is_on())
+ ibuf_merge_or_delete_for_page(nullptr, page_id, zip_size);
+
+ static_assert(FIL_PAGE_PREV + 4 == FIL_PAGE_NEXT, "adjacent");
+ memset_aligned<8>(block->frame + FIL_PAGE_PREV, 0xff, 8);
+ mach_write_to_2(block->frame + FIL_PAGE_TYPE, FIL_PAGE_TYPE_ALLOCATED);
+
+ /* FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION is only used on the
+ following pages:
+ (1) The first page of the InnoDB system tablespace (page 0:0)
+ (2) FIL_RTREE_SPLIT_SEQ_NUM on R-tree pages
+ (3) key_version on encrypted pages (not page 0:0) */
+
+ memset(block->frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8);
+ memset_aligned<8>(block->frame + FIL_PAGE_LSN, 0, 8);
+
+#ifdef UNIV_DEBUG
+ if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+ return block;
+}
+
+/** Monitor the buffer page read/write activity, and increment corresponding
+counter value in MONITOR_MODULE_BUF_PAGE.
+@param bpage buffer page whose read or write was completed
+@param io_type BUF_IO_READ or BUF_IO_WRITE */
+ATTRIBUTE_COLD __attribute__((nonnull))
+void buf_page_monitor(const buf_page_t *bpage, buf_io_fix io_type)
+{
+ const byte* frame;
+ monitor_id_t counter;
+
+ ut_ad(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE);
+
+ frame = bpage->zip.data
+ ? bpage->zip.data
+ : ((buf_block_t*) bpage)->frame;
+
+ switch (fil_page_get_type(frame)) {
+ ulint level;
+ case FIL_PAGE_TYPE_INSTANT:
+ case FIL_PAGE_INDEX:
+ case FIL_PAGE_RTREE:
+ level = btr_page_get_level(frame);
+
+ /* Check if it is an index page for insert buffer */
+ if (fil_page_get_type(frame) == FIL_PAGE_INDEX
+ && btr_page_get_index_id(frame)
+ == (index_id_t)(DICT_IBUF_ID_MIN + IBUF_SPACE_ID)) {
+ if (level == 0) {
+ counter = MONITOR_RW_COUNTER(
+ io_type, MONITOR_INDEX_IBUF_LEAF_PAGE);
+ } else {
+ counter = MONITOR_RW_COUNTER(
+ io_type,
+ MONITOR_INDEX_IBUF_NON_LEAF_PAGE);
+ }
+ } else {
+ if (level == 0) {
+ counter = MONITOR_RW_COUNTER(
+ io_type, MONITOR_INDEX_LEAF_PAGE);
+ } else {
+ counter = MONITOR_RW_COUNTER(
+ io_type, MONITOR_INDEX_NON_LEAF_PAGE);
+ }
+ }
+ break;
+
+ case FIL_PAGE_UNDO_LOG:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_UNDO_LOG_PAGE);
+ break;
+
+ case FIL_PAGE_INODE:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_INODE_PAGE);
+ break;
+
+ case FIL_PAGE_IBUF_FREE_LIST:
+ counter = MONITOR_RW_COUNTER(io_type,
+ MONITOR_IBUF_FREELIST_PAGE);
+ break;
+
+ case FIL_PAGE_IBUF_BITMAP:
+ counter = MONITOR_RW_COUNTER(io_type,
+ MONITOR_IBUF_BITMAP_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_SYS:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_SYSTEM_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_TRX_SYS:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_TRX_SYSTEM_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_FSP_HDR:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_FSP_HDR_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_XDES:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_XDES_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_BLOB:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_BLOB_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_ZBLOB:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_ZBLOB_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_ZBLOB2:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_ZBLOB2_PAGE);
+ break;
+
+ default:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_OTHER_PAGE);
+ }
+
+ MONITOR_INC_NOCHECK(counter);
+}
+
+/** Mark a table corrupted.
+@param[in] bpage corrupted page
+@param[in] space tablespace of the corrupted page */
+ATTRIBUTE_COLD
+static void buf_mark_space_corrupt(buf_page_t* bpage, const fil_space_t& space)
+{
+ /* If block is not encrypted find the table with specified
+ space id, and mark it corrupted. Encrypted tables
+ are marked unusable later e.g. in ::open(). */
+ if (!space.crypt_data
+ || space.crypt_data->type == CRYPT_SCHEME_UNENCRYPTED) {
+ dict_set_corrupted_by_space(&space);
+ } else {
+ dict_set_encrypted_by_space(&space);
+ }
+}
+
+/** Release and evict a corrupted page.
+@param bpage page that was being read */
+ATTRIBUTE_COLD void buf_pool_t::corrupted_evict(buf_page_t *bpage)
+{
+ const page_id_t id(bpage->id());
+ page_hash_latch *hash_lock= hash_lock_get(id);
+
+ mysql_mutex_lock(&mutex);
+ hash_lock->write_lock();
+
+ ut_ad(bpage->io_fix() == BUF_IO_READ);
+ ut_ad(!bpage->oldest_modification());
+ bpage->set_corrupt_id();
+
+ if (bpage->state() == BUF_BLOCK_FILE_PAGE)
+ rw_lock_x_unlock_gen(&reinterpret_cast<buf_block_t*>(bpage)->lock,
+ BUF_IO_READ);
+
+ bpage->io_unfix();
+
+ /* remove from LRU and page_hash */
+ buf_LRU_free_one_page(bpage, id, hash_lock);
+ mysql_mutex_unlock(&mutex);
+
+ ut_d(auto n=) n_pend_reads--;
+ ut_ad(n > 0);
+}
+
+/** Mark a table corrupted.
+@param[in] bpage Corrupted page
+@param[in] node data file
+Also remove the bpage from LRU list. */
+ATTRIBUTE_COLD
+static void buf_corrupt_page_release(buf_page_t *bpage, const fil_node_t &node)
+{
+ ut_ad(bpage->id().space() == node.space->id);
+ buf_pool.corrupted_evict(bpage);
+
+ if (!srv_force_recovery)
+ buf_mark_space_corrupt(bpage, *node.space);
+}
+
+/** Check if the encrypted page is corrupted for the full crc32 format.
+@param[in] space_id page belongs to space id
+@param[in] d page
+@param[in] is_compressed compressed page
+@return true if page is corrupted or false if it isn't */
+static bool buf_page_full_crc32_is_corrupted(ulint space_id, const byte* d,
+ bool is_compressed)
+{
+ if (space_id != mach_read_from_4(d + FIL_PAGE_SPACE_ID))
+ return true;
+
+ static_assert(FIL_PAGE_LSN % 4 == 0, "alignment");
+ static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment");
+
+ return !is_compressed &&
+ memcmp_aligned<4>(FIL_PAGE_LSN + 4 + d,
+ d + srv_page_size - FIL_PAGE_FCRC32_END_LSN, 4);
+}
+
+/** Check if page is maybe compressed, encrypted or both when we encounter
+corrupted page. Note that we can't be 100% sure if page is corrupted
+or decrypt/decompress just failed.
+@param[in,out] bpage page
+@param[in] node data file
+@return whether the operation succeeded
+@retval DB_SUCCESS if page has been read and is not corrupted
+@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted
+@retval DB_DECRYPTION_FAILED if page post encryption checksum matches but
+after decryption normal page checksum does not match.
+@retval DB_TABLESPACE_DELETED if accessed tablespace is not found */
+static dberr_t buf_page_check_corrupt(buf_page_t *bpage,
+ const fil_node_t &node)
+{
+ ut_ad(node.space->referenced());
+
+ byte* dst_frame = (bpage->zip.data) ? bpage->zip.data :
+ ((buf_block_t*) bpage)->frame;
+ dberr_t err = DB_SUCCESS;
+ uint key_version = buf_page_get_key_version(dst_frame,
+ node.space->flags);
+
+ /* In buf_decrypt_after_read we have either decrypted the page if
+ page post encryption checksum matches and used key_id is found
+ from the encryption plugin. If checksum did not match page was
+ not decrypted and it could be either encrypted and corrupted
+ or corrupted or good page. If we decrypted, there page could
+ still be corrupted if used key does not match. */
+ const bool seems_encrypted = !node.space->full_crc32() && key_version
+ && node.space->crypt_data
+ && node.space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED;
+ ut_ad(node.space->purpose != FIL_TYPE_TEMPORARY ||
+ node.space->full_crc32());
+
+ /* If traditional checksums match, we assume that page is
+ not anymore encrypted. */
+ if (node.space->full_crc32()
+ && !buf_is_zeroes(span<const byte>(dst_frame,
+ node.space->physical_size()))
+ && (key_version || node.space->is_compressed()
+ || node.space->purpose == FIL_TYPE_TEMPORARY)) {
+ if (buf_page_full_crc32_is_corrupted(
+ bpage->id().space(), dst_frame,
+ node.space->is_compressed())) {
+ err = DB_PAGE_CORRUPTED;
+ }
+ } else if (buf_page_is_corrupted(true, dst_frame, node.space->flags)) {
+ err = DB_PAGE_CORRUPTED;
+ }
+
+ if (seems_encrypted && err == DB_PAGE_CORRUPTED
+ && bpage->id().page_no() != 0) {
+ err = DB_DECRYPTION_FAILED;
+
+ ib::error()
+ << "The page " << bpage->id()
+ << " in file '" << node.name
+ << "' cannot be decrypted.";
+
+ ib::info()
+ << "However key management plugin or used key_version "
+ << key_version
+ << " is not found or"
+ " used encryption algorithm or method does not match.";
+
+ if (bpage->id().space() != TRX_SYS_SPACE) {
+ ib::info()
+ << "Marking tablespace as missing."
+ " You may drop this table or"
+ " install correct key management plugin"
+ " and key file.";
+ }
+ }
+
+ return (err);
+}
+
+/** Complete a read request of a file page to buf_pool.
+@param bpage recently read page
+@param node data file
+@return whether the operation succeeded
+@retval DB_SUCCESS always when writing, or if a read page was OK
+@retval DB_PAGE_CORRUPTED if the checksum fails on a page read
+@retval DB_DECRYPTION_FAILED if the page cannot be decrypted */
+dberr_t buf_page_read_complete(buf_page_t *bpage, const fil_node_t &node)
+{
+ const page_id_t id(bpage->id());
+ ut_ad(bpage->in_file());
+ ut_ad(!buf_dblwr.is_inside(id));
+ ut_ad(id.space() == node.space->id);
+ ut_ad(bpage->zip_size() == node.space->zip_size());
+
+ /* We do not need protect io_fix here by mutex to read it because
+ this and buf_page_write_complete() are the only functions where we can
+ change the value from BUF_IO_READ or BUF_IO_WRITE to some other
+ value, and our code ensures that this is the only thread that handles
+ the i/o for this block. */
+
+ ut_ad(bpage->io_fix() == BUF_IO_READ);
+ ut_ad(!!bpage->zip.ssize == !!bpage->zip.data);
+ ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE || bpage->zip.data);
+
+ const byte *frame= bpage->zip.data
+ ? bpage->zip.data
+ : reinterpret_cast<buf_block_t*>(bpage)->frame;
+ ut_ad(frame);
+
+ dberr_t err;
+ if (!buf_page_decrypt_after_read(bpage, node))
+ {
+ err= DB_DECRYPTION_FAILED;
+ goto database_corrupted;
+ }
+
+ if (bpage->zip.data && bpage->state() == BUF_BLOCK_FILE_PAGE)
+ {
+ buf_pool.n_pend_unzip++;
+ auto ok= buf_zip_decompress(reinterpret_cast<buf_block_t*>(bpage), FALSE);
+ buf_pool.n_pend_unzip--;
+
+ if (!ok)
+ {
+ ib::info() << "Page " << id << " zip_decompress failure.";
+ err= DB_PAGE_CORRUPTED;
+ goto database_corrupted;
+ }
+ }
+
+ {
+ const page_id_t read_id(mach_read_from_4(frame + FIL_PAGE_SPACE_ID),
+ mach_read_from_4(frame + FIL_PAGE_OFFSET));
+
+ if (read_id == id);
+ else if (read_id == page_id_t(0, 0))
+ /* This is likely an uninitialized page. */;
+ else if (!node.space->full_crc32() &&
+ page_id_t(0, read_id.page_no()) == id)
+ /* FIL_PAGE_SPACE_ID was written as garbage in the system tablespace
+ before MySQL 4.1.1, which introduced innodb_file_per_table. */;
+ else if (node.space->full_crc32() &&
+ *reinterpret_cast<const uint32_t*>
+ (&frame[FIL_PAGE_FCRC32_KEY_VERSION]) &&
+ node.space->crypt_data &&
+ node.space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED)
+ {
+ ib::error() << "Cannot decrypt " << id;
+ err= DB_DECRYPTION_FAILED;
+ goto release_page;
+ }
+ else
+ ib::error() << "Space id and page no stored in the page, read in are "
+ << read_id << ", should be " << id;
+ }
+
+ err= buf_page_check_corrupt(bpage, node);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS))
+ {
+database_corrupted:
+ /* Not a real corruption if it was triggered by error injection */
+ DBUG_EXECUTE_IF("buf_page_import_corrupt_failure",
+ if (!is_predefined_tablespace(id.space()))
+ {
+ buf_corrupt_page_release(bpage, node);
+ ib::info() << "Simulated IMPORT corruption";
+ return err;
+ }
+ err= DB_SUCCESS;
+ goto page_not_corrupt;);
+
+ if (bpage->zip.data && bpage->state() == BUF_BLOCK_FILE_PAGE)
+ memset(reinterpret_cast<buf_block_t*>(bpage)->frame, 0, srv_page_size);
+
+ if (err == DB_PAGE_CORRUPTED)
+ {
+ ib::error() << "Database page corruption on disk"
+ " or a failed read of file '"
+ << node.name << "' page " << id
+ << ". You may have to recover from a backup.";
+
+ buf_page_print(frame, bpage->zip_size());
+
+ ib::info() << " You can use CHECK TABLE to scan"
+ " your table for corruption. "
+ << FORCE_RECOVERY_MSG;
+ }
+
+ if (!srv_force_recovery)
+ {
+ /* If the corruption is in the system tablespace, we will
+ intentionally crash the server. */
+ if (id.space() == TRX_SYS_SPACE)
+ ib::fatal() << "Aborting because of a corrupt database page.";
+ buf_corrupt_page_release(bpage, node);
+ return err;
+ }
+ }
+
+ DBUG_EXECUTE_IF("buf_page_import_corrupt_failure",
+ page_not_corrupt: bpage= bpage; );
+
+ if (err == DB_PAGE_CORRUPTED || err == DB_DECRYPTION_FAILED)
+ {
+release_page:
+ buf_corrupt_page_release(bpage, node);
+ if (recv_recovery_is_on())
+ recv_sys.free_corrupted_page(id);
+ return err;
+ }
+
+ if (recv_recovery_is_on())
+ recv_recover_page(node.space, bpage);
+
+ if (bpage->state() == BUF_BLOCK_FILE_PAGE && !recv_no_ibuf_operations &&
+ (!id.space() || !is_predefined_tablespace(id.space())) &&
+ fil_page_get_type(frame) == FIL_PAGE_INDEX &&
+ page_is_leaf(frame))
+ bpage->ibuf_exist= true;
+
+ if (UNIV_UNLIKELY(MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE)))
+ buf_page_monitor(bpage, BUF_IO_READ);
+ DBUG_PRINT("ib_buf", ("read page %u:%u",
+ id.space(), id.page_no()));
+
+ /* Because this thread which does the unlocking might not be the same that
+ did the locking, we use a pass value != 0 in unlock, which simply
+ removes the newest lock debug record, without checking the thread id. */
+ if (bpage->state() == BUF_BLOCK_FILE_PAGE)
+ rw_lock_x_unlock_gen(&((buf_block_t*) bpage)->lock, BUF_IO_READ);
+ bpage->io_unfix();
+
+ ut_d(auto n=) buf_pool.n_pend_reads--;
+ ut_ad(n > 0);
+ buf_pool.stat.n_pages_read++;
+
+ return DB_SUCCESS;
+}
+
+#ifdef UNIV_DEBUG
+/** Check that all blocks are in a replaceable state.
+@return address of a non-free block
+@retval nullptr if all freed */
+void buf_pool_t::assert_all_freed()
+{
+ mysql_mutex_lock(&mutex);
+ const chunk_t *chunk= chunks;
+ for (auto i= n_chunks; i--; chunk++)
+ if (const buf_block_t* block= chunk->not_freed())
+ ib::fatal() << "Page " << block->page.id() << " still fixed or dirty";
+ mysql_mutex_unlock(&mutex);
+}
+#endif /* UNIV_DEBUG */
+
+/** Refresh the statistics used to print per-second averages. */
+void buf_refresh_io_stats()
+{
+ buf_pool.last_printout_time = time(NULL);
+ buf_pool.old_stat = buf_pool.stat;
+}
+
+/** Invalidate all pages in the buffer pool.
+All pages must be in a replaceable state (not modified or latched). */
+void buf_pool_invalidate()
+{
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ buf_flush_wait_batch_end(true);
+ buf_flush_wait_batch_end(false);
+
+ /* It is possible that a write batch that has been posted
+ earlier is still not complete. For buffer pool invalidation to
+ proceed we must ensure there is NO write activity happening. */
+
+ ut_d(mysql_mutex_unlock(&buf_pool.mutex));
+ ut_d(buf_pool.assert_all_freed());
+ ut_d(mysql_mutex_lock(&buf_pool.mutex));
+
+ while (buf_LRU_scan_and_free_block());
+
+ ut_ad(UT_LIST_GET_LEN(buf_pool.LRU) == 0);
+ ut_ad(UT_LIST_GET_LEN(buf_pool.unzip_LRU) == 0);
+
+ buf_pool.freed_page_clock = 0;
+ buf_pool.LRU_old = NULL;
+ buf_pool.LRU_old_len = 0;
+
+ memset(&buf_pool.stat, 0x00, sizeof(buf_pool.stat));
+ buf_refresh_io_stats();
+ mysql_mutex_unlock(&buf_pool.mutex);
+}
+
+#ifdef UNIV_DEBUG
+/** Validate the buffer pool. */
+void buf_pool_t::validate()
+{
+ ulint n_lru = 0;
+ ulint n_flushing = 0;
+ ulint n_free = 0;
+ ulint n_zip = 0;
+
+ mysql_mutex_lock(&mutex);
+
+ chunk_t* chunk = chunks;
+
+ /* Check the uncompressed blocks. */
+
+ for (auto i = n_chunks; i--; chunk++) {
+
+ ulint j;
+ buf_block_t* block = chunk->blocks;
+
+ for (j = chunk->size; j--; block++) {
+ switch (block->page.state()) {
+ case BUF_BLOCK_ZIP_PAGE:
+ /* This kind of block descriptors should
+ be allocated by malloc() only. */
+ ut_error;
+ break;
+
+ case BUF_BLOCK_NOT_USED:
+ n_free++;
+ break;
+
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ /* do nothing */
+ break;
+
+ case BUF_BLOCK_FILE_PAGE:
+ const page_id_t id = block->page.id();
+ ut_ad(page_hash_get_low(id, id.fold())
+ == &block->page);
+ n_lru++;
+ break;
+
+ }
+ }
+ }
+
+ /* Check dirty blocks. */
+
+ mysql_mutex_lock(&flush_list_mutex);
+ for (buf_page_t* b = UT_LIST_GET_FIRST(flush_list); b;
+ b = UT_LIST_GET_NEXT(list, b)) {
+ ut_ad(b->oldest_modification());
+ ut_ad(!fsp_is_system_temporary(b->id().space()));
+ n_flushing++;
+
+ switch (b->state()) {
+ case BUF_BLOCK_ZIP_PAGE:
+ n_lru++;
+ n_zip++;
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ /* uncompressed page */
+ break;
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ ut_error;
+ break;
+ }
+ const page_id_t id = b->id();
+ ut_ad(page_hash_get_low(id, id.fold()) == b);
+ }
+
+ ut_ad(UT_LIST_GET_LEN(flush_list) == n_flushing);
+
+ mysql_mutex_unlock(&flush_list_mutex);
+
+ if (curr_size == old_size
+ && n_lru + n_free > curr_size + n_zip) {
+
+ ib::fatal() << "n_LRU " << n_lru << ", n_free " << n_free
+ << ", pool " << curr_size
+ << " zip " << n_zip << ". Aborting...";
+ }
+
+ ut_ad(UT_LIST_GET_LEN(LRU) >= n_lru);
+
+ if (curr_size == old_size
+ && UT_LIST_GET_LEN(free) != n_free) {
+
+ ib::fatal() << "Free list len "
+ << UT_LIST_GET_LEN(free)
+ << ", free blocks " << n_free << ". Aborting...";
+ }
+
+ mysql_mutex_unlock(&mutex);
+
+ ut_d(buf_LRU_validate());
+ ut_d(buf_flush_validate());
+}
+#endif /* UNIV_DEBUG */
+
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG
+/** Write information of the buf_pool to the error log. */
+void buf_pool_t::print()
+{
+ index_id_t* index_ids;
+ ulint* counts;
+ ulint size;
+ ulint i;
+ ulint j;
+ index_id_t id;
+ ulint n_found;
+ chunk_t* chunk;
+ dict_index_t* index;
+
+ size = curr_size;
+
+ index_ids = static_cast<index_id_t*>(
+ ut_malloc_nokey(size * sizeof *index_ids));
+
+ counts = static_cast<ulint*>(ut_malloc_nokey(sizeof(ulint) * size));
+
+ mysql_mutex_lock(&mutex);
+ mysql_mutex_lock(&flush_list_mutex);
+
+ ib::info()
+ << "[buffer pool: size=" << curr_size
+ << ", database pages=" << UT_LIST_GET_LEN(LRU)
+ << ", free pages=" << UT_LIST_GET_LEN(free)
+ << ", modified database pages="
+ << UT_LIST_GET_LEN(flush_list)
+ << ", n pending decompressions=" << n_pend_unzip
+ << ", n pending reads=" << n_pend_reads
+ << ", n pending flush LRU=" << n_flush_LRU_
+ << " list=" << n_flush_list_
+ << ", pages made young=" << stat.n_pages_made_young
+ << ", not young=" << stat.n_pages_not_made_young
+ << ", pages read=" << stat.n_pages_read
+ << ", created=" << stat.n_pages_created
+ << ", written=" << stat.n_pages_written << "]";
+
+ mysql_mutex_unlock(&flush_list_mutex);
+
+ /* Count the number of blocks belonging to each index in the buffer */
+
+ n_found = 0;
+
+ chunk = chunks;
+
+ for (i = n_chunks; i--; chunk++) {
+ buf_block_t* block = chunk->blocks;
+ ulint n_blocks = chunk->size;
+
+ for (; n_blocks--; block++) {
+ const buf_frame_t* frame = block->frame;
+
+ if (fil_page_index_page_check(frame)) {
+
+ id = btr_page_get_index_id(frame);
+
+ /* Look for the id in the index_ids array */
+ j = 0;
+
+ while (j < n_found) {
+
+ if (index_ids[j] == id) {
+ counts[j]++;
+
+ break;
+ }
+ j++;
+ }
+
+ if (j == n_found) {
+ n_found++;
+ index_ids[j] = id;
+ counts[j] = 1;
+ }
+ }
+ }
+ }
+
+ mysql_mutex_unlock(&mutex);
+
+ for (i = 0; i < n_found; i++) {
+ index = dict_index_get_if_in_cache(index_ids[i]);
+
+ if (!index) {
+ ib::info() << "Block count for index "
+ << index_ids[i] << " in buffer is about "
+ << counts[i];
+ } else {
+ ib::info() << "Block count for index " << index_ids[i]
+ << " in buffer is about " << counts[i]
+ << ", index " << index->name
+ << " of table " << index->table->name;
+ }
+ }
+
+ ut_free(index_ids);
+ ut_free(counts);
+
+ validate();
+}
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */
+
+#ifdef UNIV_DEBUG
+/** @return the number of latched pages in the buffer pool */
+ulint buf_get_latched_pages_number()
+{
+ ulint fixed_pages_number= 0;
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ for (buf_page_t *b= UT_LIST_GET_FIRST(buf_pool.LRU); b;
+ b= UT_LIST_GET_NEXT(LRU, b))
+ if (b->in_file() && (b->buf_fix_count() || b->io_fix() != BUF_IO_NONE))
+ fixed_pages_number++;
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ return fixed_pages_number;
+}
+#endif /* UNIV_DEBUG */
+
+/** Collect buffer pool metadata.
+@param[out] pool_info buffer pool metadata */
+void buf_stats_get_pool_info(buf_pool_info_t *pool_info)
+{
+ time_t current_time;
+ double time_elapsed;
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ pool_info->pool_size = buf_pool.curr_size;
+
+ pool_info->lru_len = UT_LIST_GET_LEN(buf_pool.LRU);
+
+ pool_info->old_lru_len = buf_pool.LRU_old_len;
+
+ pool_info->free_list_len = UT_LIST_GET_LEN(buf_pool.free);
+
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ pool_info->flush_list_len = UT_LIST_GET_LEN(buf_pool.flush_list);
+
+ pool_info->n_pend_unzip = UT_LIST_GET_LEN(buf_pool.unzip_LRU);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ pool_info->n_pend_reads = buf_pool.n_pend_reads;
+
+ pool_info->n_pending_flush_lru = buf_pool.n_flush_LRU_;
+
+ pool_info->n_pending_flush_list = buf_pool.n_flush_list_;
+
+ current_time = time(NULL);
+ time_elapsed = 0.001 + difftime(current_time,
+ buf_pool.last_printout_time);
+
+ pool_info->n_pages_made_young = buf_pool.stat.n_pages_made_young;
+
+ pool_info->n_pages_not_made_young =
+ buf_pool.stat.n_pages_not_made_young;
+
+ pool_info->n_pages_read = buf_pool.stat.n_pages_read;
+
+ pool_info->n_pages_created = buf_pool.stat.n_pages_created;
+
+ pool_info->n_pages_written = buf_pool.stat.n_pages_written;
+
+ pool_info->n_page_gets = buf_pool.stat.n_page_gets;
+
+ pool_info->n_ra_pages_read_rnd = buf_pool.stat.n_ra_pages_read_rnd;
+ pool_info->n_ra_pages_read = buf_pool.stat.n_ra_pages_read;
+
+ pool_info->n_ra_pages_evicted = buf_pool.stat.n_ra_pages_evicted;
+
+ pool_info->page_made_young_rate =
+ static_cast<double>(buf_pool.stat.n_pages_made_young
+ - buf_pool.old_stat.n_pages_made_young)
+ / time_elapsed;
+
+ pool_info->page_not_made_young_rate =
+ static_cast<double>(buf_pool.stat.n_pages_not_made_young
+ - buf_pool.old_stat.n_pages_not_made_young)
+ / time_elapsed;
+
+ pool_info->pages_read_rate =
+ static_cast<double>(buf_pool.stat.n_pages_read
+ - buf_pool.old_stat.n_pages_read)
+ / time_elapsed;
+
+ pool_info->pages_created_rate =
+ static_cast<double>(buf_pool.stat.n_pages_created
+ - buf_pool.old_stat.n_pages_created)
+ / time_elapsed;
+
+ pool_info->pages_written_rate =
+ static_cast<double>(buf_pool.stat.n_pages_written
+ - buf_pool.old_stat.n_pages_written)
+ / time_elapsed;
+
+ pool_info->n_page_get_delta = buf_pool.stat.n_page_gets
+ - buf_pool.old_stat.n_page_gets;
+
+ if (pool_info->n_page_get_delta) {
+ pool_info->page_read_delta = buf_pool.stat.n_pages_read
+ - buf_pool.old_stat.n_pages_read;
+
+ pool_info->young_making_delta =
+ buf_pool.stat.n_pages_made_young
+ - buf_pool.old_stat.n_pages_made_young;
+
+ pool_info->not_young_making_delta =
+ buf_pool.stat.n_pages_not_made_young
+ - buf_pool.old_stat.n_pages_not_made_young;
+ }
+ pool_info->pages_readahead_rnd_rate =
+ static_cast<double>(buf_pool.stat.n_ra_pages_read_rnd
+ - buf_pool.old_stat.n_ra_pages_read_rnd)
+ / time_elapsed;
+
+
+ pool_info->pages_readahead_rate =
+ static_cast<double>(buf_pool.stat.n_ra_pages_read
+ - buf_pool.old_stat.n_ra_pages_read)
+ / time_elapsed;
+
+ pool_info->pages_evicted_rate =
+ static_cast<double>(buf_pool.stat.n_ra_pages_evicted
+ - buf_pool.old_stat.n_ra_pages_evicted)
+ / time_elapsed;
+
+ pool_info->unzip_lru_len = UT_LIST_GET_LEN(buf_pool.unzip_LRU);
+
+ pool_info->io_sum = buf_LRU_stat_sum.io;
+
+ pool_info->io_cur = buf_LRU_stat_cur.io;
+
+ pool_info->unzip_sum = buf_LRU_stat_sum.unzip;
+
+ pool_info->unzip_cur = buf_LRU_stat_cur.unzip;
+
+ buf_refresh_io_stats();
+ mysql_mutex_unlock(&buf_pool.mutex);
+}
+
+/*********************************************************************//**
+Prints info of the buffer i/o. */
+static
+void
+buf_print_io_instance(
+/*==================*/
+ buf_pool_info_t*pool_info, /*!< in: buffer pool info */
+ FILE* file) /*!< in/out: buffer where to print */
+{
+ ut_ad(pool_info);
+
+ fprintf(file,
+ "Buffer pool size " ULINTPF "\n"
+ "Free buffers " ULINTPF "\n"
+ "Database pages " ULINTPF "\n"
+ "Old database pages " ULINTPF "\n"
+ "Modified db pages " ULINTPF "\n"
+ "Percent of dirty pages(LRU & free pages): %.3f\n"
+ "Max dirty pages percent: %.3f\n"
+ "Pending reads " ULINTPF "\n"
+ "Pending writes: LRU " ULINTPF ", flush list " ULINTPF "\n",
+ pool_info->pool_size,
+ pool_info->free_list_len,
+ pool_info->lru_len,
+ pool_info->old_lru_len,
+ pool_info->flush_list_len,
+ static_cast<double>(pool_info->flush_list_len)
+ / (static_cast<double>(pool_info->lru_len
+ + pool_info->free_list_len) + 1.0)
+ * 100.0,
+ srv_max_buf_pool_modified_pct,
+ pool_info->n_pend_reads,
+ pool_info->n_pending_flush_lru,
+ pool_info->n_pending_flush_list);
+
+ fprintf(file,
+ "Pages made young " ULINTPF ", not young " ULINTPF "\n"
+ "%.2f youngs/s, %.2f non-youngs/s\n"
+ "Pages read " ULINTPF ", created " ULINTPF
+ ", written " ULINTPF "\n"
+ "%.2f reads/s, %.2f creates/s, %.2f writes/s\n",
+ pool_info->n_pages_made_young,
+ pool_info->n_pages_not_made_young,
+ pool_info->page_made_young_rate,
+ pool_info->page_not_made_young_rate,
+ pool_info->n_pages_read,
+ pool_info->n_pages_created,
+ pool_info->n_pages_written,
+ pool_info->pages_read_rate,
+ pool_info->pages_created_rate,
+ pool_info->pages_written_rate);
+
+ if (pool_info->n_page_get_delta) {
+ double hit_rate = static_cast<double>(
+ pool_info->page_read_delta)
+ / static_cast<double>(pool_info->n_page_get_delta);
+
+ if (hit_rate > 1) {
+ hit_rate = 1;
+ }
+
+ fprintf(file,
+ "Buffer pool hit rate " ULINTPF " / 1000,"
+ " young-making rate " ULINTPF " / 1000 not "
+ ULINTPF " / 1000\n",
+ ulint(1000 * (1 - hit_rate)),
+ ulint(1000
+ * double(pool_info->young_making_delta)
+ / double(pool_info->n_page_get_delta)),
+ ulint(1000 * double(pool_info->not_young_making_delta)
+ / double(pool_info->n_page_get_delta)));
+ } else {
+ fputs("No buffer pool page gets since the last printout\n",
+ file);
+ }
+
+ /* Statistics about read ahead algorithm */
+ fprintf(file, "Pages read ahead %.2f/s,"
+ " evicted without access %.2f/s,"
+ " Random read ahead %.2f/s\n",
+
+ pool_info->pages_readahead_rate,
+ pool_info->pages_evicted_rate,
+ pool_info->pages_readahead_rnd_rate);
+
+ /* Print some values to help us with visualizing what is
+ happening with LRU eviction. */
+ fprintf(file,
+ "LRU len: " ULINTPF ", unzip_LRU len: " ULINTPF "\n"
+ "I/O sum[" ULINTPF "]:cur[" ULINTPF "], "
+ "unzip sum[" ULINTPF "]:cur[" ULINTPF "]\n",
+ pool_info->lru_len, pool_info->unzip_lru_len,
+ pool_info->io_sum, pool_info->io_cur,
+ pool_info->unzip_sum, pool_info->unzip_cur);
+}
+
+/*********************************************************************//**
+Prints info of the buffer i/o. */
+void
+buf_print_io(
+/*=========*/
+ FILE* file) /*!< in/out: buffer where to print */
+{
+ buf_pool_info_t pool_info;
+
+ buf_stats_get_pool_info(&pool_info);
+ buf_print_io_instance(&pool_info, file);
+}
+
+/** Verify that post encryption checksum match with the calculated checksum.
+This function should be called only if tablespace contains crypt data metadata.
+@param[in] page page frame
+@param[in] fsp_flags tablespace flags
+@return true if true if page is encrypted and OK, false otherwise */
+bool buf_page_verify_crypt_checksum(const byte* page, ulint fsp_flags)
+{
+ if (!fil_space_t::full_crc32(fsp_flags)) {
+ return fil_space_verify_crypt_checksum(
+ page, fil_space_t::zip_size(fsp_flags));
+ }
+
+ return !buf_page_is_corrupted(true, page, fsp_flags);
+}
+
+/** Print the given page_id_t object.
+@param[in,out] out the output stream
+@param[in] page_id the page_id_t object to be printed
+@return the output stream */
+std::ostream& operator<<(std::ostream &out, const page_id_t page_id)
+{
+ out << "[page id: space=" << page_id.space()
+ << ", page number=" << page_id.page_no() << "]";
+ return out;
+}
+#endif /* !UNIV_INNOCHECKSUM */