author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:00:34 +0000
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:00:34 +0000
commit | 3f619478f796eddbba6e39502fe941b285dd97b1
tree | e2c7b5777f728320e5b5542b6213fd3591ba51e2 /storage/innobase/buf/buf0buf.cc
parent | Initial commit.
Adding upstream version 1:10.11.6.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'storage/innobase/buf/buf0buf.cc')
-rw-r--r-- | storage/innobase/buf/buf0buf.cc | 4180
1 file changed, 4180 insertions, 0 deletions
diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc new file mode 100644 index 00000000..8ef18ee0 --- /dev/null +++ b/storage/innobase/buf/buf0buf.cc @@ -0,0 +1,4180 @@ +/***************************************************************************** + +Copyright (c) 1995, 2018, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file buf/buf0buf.cc +The database buffer buf_pool + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#include "assume_aligned.h" +#include "mtr0types.h" +#include "mach0data.h" +#include "buf0checksum.h" +#include "mariadb_stats.h" +#include <string.h> + +#ifdef UNIV_INNOCHECKSUM +# include "my_sys.h" +# include "buf0buf.h" +#else +#include "my_cpu.h" +#include "mem0mem.h" +#include "btr0btr.h" +#include "fil0fil.h" +#include "fil0crypt.h" +#include "buf0rea.h" +#include "buf0flu.h" +#include "buf0buddy.h" +#include "buf0dblwr.h" +#include "lock0lock.h" +#include "btr0sea.h" +#include "ibuf0ibuf.h" +#include "trx0undo.h" +#include "trx0purge.h" +#include "log0log.h" +#include "dict0stats_bg.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "dict0dict.h" +#include "log0recv.h" +#include "srv0mon.h" +#include "log0crypt.h" +#include "fil0pagecompress.h" +#endif /* !UNIV_INNOCHECKSUM */ +#include "page0zip.h" +#include "buf0dump.h" +#include <map> +#include <sstream> +#include "log.h" + +using st_::span; + +#ifdef HAVE_LIBNUMA +#include <numa.h> +#include <numaif.h> +struct set_numa_interleave_t +{ + set_numa_interleave_t() + { + if (srv_numa_interleave) { + + struct bitmask *numa_mems_allowed = numa_get_mems_allowed(); + ib::info() << "Setting NUMA memory policy to" + " MPOL_INTERLEAVE"; + if (set_mempolicy(MPOL_INTERLEAVE, + numa_mems_allowed->maskp, + numa_mems_allowed->size) != 0) { + + ib::warn() << "Failed to set NUMA memory" + " policy to MPOL_INTERLEAVE: " + << strerror(errno); + } + numa_bitmask_free(numa_mems_allowed); + } + } + + ~set_numa_interleave_t() + { + if (srv_numa_interleave) { + + ib::info() << "Setting NUMA memory policy to" + " MPOL_DEFAULT"; + if (set_mempolicy(MPOL_DEFAULT, NULL, 0) != 0) { + ib::warn() << "Failed to set NUMA memory" + " policy to MPOL_DEFAULT: " + << strerror(errno); + } + } + } +}; + +#define NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE set_numa_interleave_t scoped_numa +#else +#define NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE +#endif /* HAVE_LIBNUMA */ + +/* + IMPLEMENTATION OF THE BUFFER POOL + ================================= + + Buffer frames and blocks + ------------------------ +Following the terminology of Gray and Reuter, we call the memory +blocks where file pages are loaded buffer frames. 
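As an aside on the set_numa_interleave_t helper defined above: it is ordinary C++ RAII. The constructor switches the process memory policy to MPOL_INTERLEAVE and the destructor restores MPOL_DEFAULT, so declaring one object (via NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE) scopes the policy to a single block. A minimal self-contained sketch of the same idiom, with a plain integer standing in for the real set_mempolicy()/libnuma machinery:

#include <cstdio>

// Stand-in for a process-wide setting such as the NUMA memory policy.
static int current_policy = 0; // 0 = "default", 1 = "interleave"

struct scoped_policy {
  // Constructor: switch the policy for the lifetime of this object.
  explicit scoped_policy(int policy) : saved_(current_policy) {
    current_policy = policy;
    std::printf("policy set to %d\n", current_policy);
  }
  // Destructor: restore the saved policy on any scope exit.
  ~scoped_policy() {
    current_policy = saved_;
    std::printf("policy restored to %d\n", current_policy);
  }
private:
  int saved_;
};

int main() {
  {
    scoped_policy numa_scope(1); // like NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE
    // ... allocate the buffer pool while the policy is active ...
  } // destructor runs here, policy restored
}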
For each buffer +frame there is a control block, or shortly, a block, in the buffer +control array. The control info which does not need to be stored +in the file along with the file page, resides in the control block. + + Buffer pool struct + ------------------ +The buffer buf_pool contains a single mutex which protects all the +control data structures of the buf_pool. The content of a buffer frame is +protected by a separate read-write lock in its control block, though. +These locks can be locked and unlocked without owning the buf_pool.mutex. +The OS events in the buf_pool struct can be waited for without owning the +buf_pool.mutex. + +The buf_pool.mutex is a hot-spot in main memory, causing a lot of +memory bus traffic on multiprocessor systems when processors +alternately access the mutex. On our Pentium, the mutex is accessed +maybe every 10 microseconds. We gave up the solution to have mutexes +for each control block, for instance, because it seemed to be +complicated. + +A solution to reduce mutex contention of the buf_pool.mutex is to +create a separate mutex for the page hash table. On Pentium, +accessing the hash table takes 2 microseconds, about half +of the total buf_pool.mutex hold time. + + Control blocks + -------------- + +The control block contains, for instance, the bufferfix count +which is incremented when a thread wants a file page to be fixed +in a buffer frame. The bufferfix operation does not lock the +contents of the frame, however. For this purpose, the control +block contains a read-write lock. + +The buffer frames have to be aligned so that the start memory +address of a frame is divisible by the universal page size, which +is a power of two. + +The control blocks containing file pages are put to a hash table +according to the file address of the page. +We could speed up the access to an individual page by using +"pointer swizzling": we could replace the page references on +non-leaf index pages by direct pointers to the page, if it exists +in the buf_pool. We could make a separate hash table where we could +chain all the page references in non-leaf pages residing in the buf_pool, +using the page reference as the hash key, +and at the time of reading of a page update the pointers accordingly. +Drawbacks of this solution are added complexity and, +possibly, extra space required on non-leaf pages for memory pointers. +A simpler solution is just to speed up the hash table mechanism +in the database, using tables whose size is a power of 2. + + Lists of blocks + --------------- + +There are several lists of control blocks. + +The free list (buf_pool.free) contains blocks which are currently not +used. + +The common LRU list contains all the blocks holding a file page +except those for which the bufferfix count is non-zero. +The pages are in the LRU list roughly in the order of the last +access to the page, so that the oldest pages are at the end of the +list. We also keep a pointer to near the end of the LRU list, +which we can use when we want to artificially age a page in the +buf_pool. This is used if we know that some page is not needed +again for some time: we insert the block right after the pointer, +causing it to be replaced sooner than would normally be the case. +Currently this aging mechanism is used for read-ahead mechanism +of pages, and it can also be used when there is a scan of a full +table which cannot fit in the memory. Putting the pages near the +end of the LRU list, we make sure that most of the buf_pool stays +in the main memory, undisturbed. 
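The "artificial aging" described above (inserting read-ahead or table-scan pages near the end of the LRU list, so that they are evicted before the hot working set) can be modelled with a plain doubly linked list plus a midpoint pointer. This toy is only an illustration of the policy; the real code maintains buf_pool.LRU and buf_pool.LRU_old with considerably more bookkeeping:

#include <cstdio>
#include <iterator>
#include <list>

// Toy LRU: front = most recently used, back = next eviction victim.
// "old_begin" marks the boundary of the "old" tail region where
// read-ahead and full-scan pages are inserted.
struct toy_lru {
  std::list<int> lru;
  std::list<int>::iterator old_begin;

  void make_young(int page) { lru.push_front(page); } // normal access
  void make_old(int page) {                           // artificial aging
    old_begin = lru.insert(old_begin, page); // lands near the tail
  }
  int evict() { int victim = lru.back(); lru.pop_back(); return victim; }
};

int main() {
  toy_lru l;
  l.lru = {1, 2, 3, 4};
  l.old_begin = std::next(l.lru.begin(), 2); // pages 3 and 4 are "old"
  l.make_young(5); // ordinary page access: goes to the head
  l.make_old(6);   // read-ahead page: inserted at the old boundary
  for (int p : l.lru) std::printf("%d ", p); // prints: 5 1 2 6 3 4
  std::printf("-> evict %d\n", l.evict());   // page 4 is replaced first
}

Because page 6 enters behind the hot pages, a scan that touches many pages only once cannot push pages 5, 1 and 2 out of memory.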
+ +The unzip_LRU list contains a subset of the common LRU list. The +blocks on the unzip_LRU list hold a compressed file page and the +corresponding uncompressed page frame. A block is in unzip_LRU if and +only if the predicate block->page.belongs_to_unzip_LRU() +holds. The blocks in unzip_LRU will be in same order as they are in +the common LRU list. That is, each manipulation of the common LRU +list will result in the same manipulation of the unzip_LRU list. + +The chain of modified blocks (buf_pool.flush_list) contains the blocks +holding persistent file pages that have been modified in the memory +but not written to disk yet. The block with the oldest modification +which has not yet been written to disk is at the end of the chain. +The access to this list is protected by buf_pool.flush_list_mutex. + +The control blocks for uncompressed pages are accessible via +buf_block_t objects that are reachable via buf_pool.chunks[]. +The control blocks (buf_page_t) of those ROW_FORMAT=COMPRESSED pages +that are not in buf_pool.flush_list and for which no uncompressed +page has been allocated in buf_pool are only accessible via +buf_pool.LRU. + +The chains of free memory blocks (buf_pool.zip_free[]) are used by +the buddy allocator (buf0buddy.cc) to keep track of currently unused +memory blocks of size 1024..innodb_page_size / 2. These +blocks are inside the memory blocks of size innodb_page_size and type +BUF_BLOCK_MEMORY that the buddy allocator requests from the buffer +pool. The buddy allocator is solely used for allocating +ROW_FORMAT=COMPRESSED page frames. + + Loading a file page + ------------------- + +First, a victim block for replacement has to be found in the +buf_pool. It is taken from the free list or searched for from the +end of the LRU-list. An exclusive lock is reserved for the frame, +the io_fix is set in the block fixing the block in buf_pool, +and the io-operation for loading the page is queued. The io-handler thread +releases the X-lock on the frame and releases the io_fix +when the io operation completes. + +A thread may request the above operation using the function +buf_page_get(). It may then continue to request a lock on the frame. +The lock is granted when the io-handler releases the x-lock. + + Read-ahead + ---------- + +The read-ahead mechanism is intended to be intelligent and +isolated from the semantically higher levels of the database +index management. From the higher level we only need the +information if a file page has a natural successor or +predecessor page. On the leaf level of a B-tree index, +these are the next and previous pages in the natural +order of the pages. + +Let us first explain the read-ahead mechanism when the leafs +of a B-tree are scanned in an ascending or descending order. +When a read page is the first time referenced in the buf_pool, +the buffer manager checks if it is at the border of a so-called +linear read-ahead area. The tablespace is divided into these +areas of size 64 blocks, for example. So if the page is at the +border of such an area, the read-ahead mechanism checks if +all the other blocks in the area have been accessed in an +ascending or descending order. If this is the case, the system +looks at the natural successor or predecessor of the page, +checks if that is at the border of another area, and in this case +issues read-requests for all the pages in that area. 
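A rough sketch of the trigger arithmetic: with a linear read-ahead area of 64 pages, page number P sits on a border when P % 64 == 0 or P % 64 == 63, and the read requests are only issued once every page of the area has been accessed. The helper below is illustrative only; the actual implementation (in buf0rea.cc) additionally checks the access order and buffer pool limits:

#include <bitset>
#include <cstdint>
#include <cstdio>

constexpr uint32_t AREA = 64; // pages per linear read-ahead area

// Is this page on the low or high border of its area?
static bool at_area_border(uint32_t page_no) {
  return page_no % AREA == 0 || page_no % AREA == AREA - 1;
}

// Fire read-ahead only if the page is on a border and the whole
// area has been accessed (here: one bit per page of the area).
static bool should_read_ahead(uint32_t page_no,
                              const std::bitset<AREA>& accessed) {
  return at_area_border(page_no) && accessed.all();
}

int main() {
  std::bitset<AREA> accessed;
  accessed.set(); // pretend pages 0..63 were all referenced
  std::printf("page 63: border=%d trigger=%d\n",
              at_area_border(63), should_read_ahead(63, accessed));
  // On a hit, the next area (pages 64..127) would be queued for reading.
}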
Maybe +we could relax the condition that all the pages in the area +have to be accessed: if data is deleted from a table, there may +appear holes of unused pages in the area. + +A different read-ahead mechanism is used when there appears +to be a random access pattern to a file. +If a new page is referenced in the buf_pool, and several pages +of its random access area (for instance, 32 consecutive pages +in a tablespace) have recently been referenced, we may predict +that the whole area may be needed in the near future, and issue +the read requests for the whole area. +*/ + +#ifndef UNIV_INNOCHECKSUM +# ifdef SUX_LOCK_GENERIC +void page_hash_latch::read_lock_wait() +{ + /* First, try busy spinning for a while. */ + for (auto spin= srv_n_spin_wait_rounds; spin--; ) + { + LF_BACKOFF(); + if (read_trylock()) + return; + } + /* Fall back to yielding to other threads. */ + do + std::this_thread::yield(); + while (!read_trylock()); +} + +void page_hash_latch::write_lock_wait() +{ + write_lock_wait_start(); + + /* First, try busy spinning for a while. */ + for (auto spin= srv_n_spin_wait_rounds; spin--; ) + { + if (write_lock_poll()) + return; + LF_BACKOFF(); + } + + /* Fall back to yielding to other threads. */ + do + std::this_thread::yield(); + while (!write_lock_poll()); +} +# endif + +/** Number of attempts made to read in a page in the buffer pool */ +constexpr ulint BUF_PAGE_READ_MAX_RETRIES= 100; +/** The maximum portion of the buffer pool that can be used for the +read-ahead buffer. (Divide buf_pool size by this amount) */ +constexpr uint32_t BUF_READ_AHEAD_PORTION= 32; + +/** A 64KiB buffer of NUL bytes, for use in assertions and checks, +and dummy default values of instantly dropped columns. +Initially, BLOB field references are set to NUL bytes, in +dtuple_convert_big_rec(). */ +const byte *field_ref_zero; + +/** The InnoDB buffer pool */ +buf_pool_t buf_pool; +buf_pool_t::chunk_t::map *buf_pool_t::chunk_t::map_reg; +buf_pool_t::chunk_t::map *buf_pool_t::chunk_t::map_ref; + +#ifdef UNIV_DEBUG +/** This is used to insert validation operations in execution +in the debug version */ +static Atomic_counter<size_t> buf_dbg_counter; +#endif /* UNIV_DEBUG */ + +/** Macro to determine whether the read of write counter is used depending +on the io_type */ +#define MONITOR_RW_COUNTER(read, counter) \ + (read ? (counter##_READ) : (counter##_WRITTEN)) + +/** Decrypt a page for temporary tablespace. 
+@param[in,out] tmp_frame Temporary buffer +@param[in] src_frame Page to decrypt +@return true if temporary tablespace decrypted, false if not */ +static bool buf_tmp_page_decrypt(byte* tmp_frame, byte* src_frame) +{ + if (buf_is_zeroes(span<const byte>(src_frame, srv_page_size))) { + return true; + } + + /* read space & lsn */ + uint header_len = FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION; + + /* Copy FIL page header, it is not encrypted */ + memcpy(tmp_frame, src_frame, header_len); + + /* Calculate the offset where decryption starts */ + const byte* src = src_frame + header_len; + byte* dst = tmp_frame + header_len; + uint srclen = uint(srv_page_size) + - (header_len + FIL_PAGE_FCRC32_CHECKSUM); + ulint offset = mach_read_from_4(src_frame + FIL_PAGE_OFFSET); + + if (!log_tmp_block_decrypt(src, srclen, dst, + (offset * srv_page_size))) { + return false; + } + + static_assert(FIL_PAGE_FCRC32_CHECKSUM == 4, "alignment"); + memcpy_aligned<4>(tmp_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM, + src_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM, + FIL_PAGE_FCRC32_CHECKSUM); + + memcpy_aligned<UNIV_PAGE_SIZE_MIN>(src_frame, tmp_frame, + srv_page_size); + srv_stats.pages_decrypted.inc(); + srv_stats.n_temp_blocks_decrypted.inc(); + + return true; /* page was decrypted */ +} + +/** Decrypt a page. +@param[in,out] bpage Page control block +@param[in] node data file +@return whether the operation was successful */ +static bool buf_page_decrypt_after_read(buf_page_t *bpage, + const fil_node_t &node) +{ + ut_ad(node.space->referenced()); + ut_ad(node.space->id == bpage->id().space()); + const auto flags = node.space->flags; + + byte* dst_frame = bpage->zip.data ? bpage->zip.data : bpage->frame; + bool page_compressed = node.space->is_compressed() + && buf_page_is_compressed(dst_frame, flags); + const page_id_t id(bpage->id()); + + if (id.page_no() == 0) { + /* File header pages are not encrypted/compressed */ + return (true); + } + + buf_tmp_buffer_t* slot; + + if (id.space() == SRV_TMP_SPACE_ID + && innodb_encrypt_temporary_tables) { + slot = buf_pool.io_buf_reserve(); + slot->allocate(); + bool ok = buf_tmp_page_decrypt(slot->crypt_buf, dst_frame); + slot->release(); + return ok; + } + + /* Page is encrypted if encryption information is found from + tablespace and page contains used key_version. This is true + also for pages first compressed and then encrypted. */ + + uint key_version = buf_page_get_key_version(dst_frame, flags); + + if (page_compressed && !key_version) { + /* the page we read is unencrypted */ + /* Find free slot from temporary memory array */ +decompress: + if (fil_space_t::full_crc32(flags) + && buf_page_is_corrupted(true, dst_frame, flags)) { + return false; + } + + slot = buf_pool.io_buf_reserve(); + slot->allocate(); + +decompress_with_slot: + ulint write_size = fil_page_decompress( + slot->crypt_buf, dst_frame, flags); + slot->release(); + ut_ad(node.space->referenced()); + return write_size != 0; + } + + if (key_version && node.space->crypt_data) { + /* Verify encryption checksum before we even try to + decrypt. 
*/ + if (!buf_page_verify_crypt_checksum(dst_frame, flags)) { +decrypt_failed: + ib::error() << "Encrypted page " << id + << " in file " << node.name + << " looks corrupted; key_version=" + << key_version; + return false; + } + + slot = buf_pool.io_buf_reserve(); + slot->allocate(); + + /* decrypt using crypt_buf to dst_frame */ + if (!fil_space_decrypt(node.space, slot->crypt_buf, dst_frame)) { + slot->release(); + goto decrypt_failed; + } + + if ((fil_space_t::full_crc32(flags) && page_compressed) + || fil_page_get_type(dst_frame) + == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) { + goto decompress_with_slot; + } + + slot->release(); + } else if (fil_page_get_type(dst_frame) + == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) { + goto decompress; + } + + ut_ad(node.space->referenced()); + return true; +} +#endif /* !UNIV_INNOCHECKSUM */ + +/** Checks if the page is in crc32 checksum format. +@param[in] read_buf database page +@param[in] checksum_field1 new checksum field +@param[in] checksum_field2 old checksum field +@return true if the page is in crc32 checksum format. */ +static +bool +buf_page_is_checksum_valid_crc32( + const byte* read_buf, + ulint checksum_field1, + ulint checksum_field2) +{ + const uint32_t crc32 = buf_calc_page_crc32(read_buf); + +#ifdef UNIV_INNOCHECKSUM + extern FILE* log_file; + extern uint32_t cur_page_num; + if (log_file) { + fprintf(log_file, "page::" UINT32PF ";" + " crc32 calculated = " UINT32PF ";" + " recorded checksum field1 = " ULINTPF " recorded" + " checksum field2 =" ULINTPF "\n", cur_page_num, + crc32, checksum_field1, checksum_field2); + } +#endif /* UNIV_INNOCHECKSUM */ + + if (checksum_field1 != checksum_field2) { + return false; + } + + return checksum_field1 == crc32; +} + +/** Checks whether the lsn present in the page is lesser than the +peek current lsn. +@param[in] check_lsn lsn to check +@param[in] read_buf page. */ +static void buf_page_check_lsn(bool check_lsn, const byte* read_buf) +{ +#ifndef UNIV_INNOCHECKSUM + if (check_lsn && recv_lsn_checks_on) { + const lsn_t current_lsn = log_sys.get_lsn(); + const lsn_t page_lsn + = mach_read_from_8(read_buf + FIL_PAGE_LSN); + + /* Since we are going to reset the page LSN during the import + phase it makes no sense to spam the log with error messages. */ + if (current_lsn < page_lsn) { + + const uint32_t space_id = mach_read_from_4( + read_buf + FIL_PAGE_SPACE_ID); + const uint32_t page_no = mach_read_from_4( + read_buf + FIL_PAGE_OFFSET); + + ib::error() << "Page " << page_id_t(space_id, page_no) + << " log sequence number " << page_lsn + << " is in the future! Current system" + << " log sequence number " + << current_lsn << "."; + + ib::error() << "Your database may be corrupt or" + " you may have copied the InnoDB" + " tablespace but not the InnoDB" + " log files. " + << FORCE_RECOVERY_MSG; + + } + } +#endif /* !UNIV_INNOCHECKSUM */ +} + + +/** Check if a buffer is all zeroes. +@param[in] buf data to check +@return whether the buffer is all zeroes */ +bool buf_is_zeroes(span<const byte> buf) +{ + ut_ad(buf.size() <= UNIV_PAGE_SIZE_MAX); + return memcmp(buf.data(), field_ref_zero, buf.size()) == 0; +} + +/** Check if a page is corrupt. 
+@param check_lsn whether FIL_PAGE_LSN should be checked +@param read_buf database page +@param fsp_flags contents of FIL_SPACE_FLAGS +@return whether the page is corrupted */ +bool buf_page_is_corrupted(bool check_lsn, const byte *read_buf, + uint32_t fsp_flags) +{ + if (fil_space_t::full_crc32(fsp_flags)) { + bool compressed = false, corrupted = false; + const uint size = buf_page_full_crc32_size( + read_buf, &compressed, &corrupted); + if (corrupted) { + return true; + } + const byte* end = read_buf + (size - FIL_PAGE_FCRC32_CHECKSUM); + uint crc32 = mach_read_from_4(end); + + if (!crc32 && size == srv_page_size + && buf_is_zeroes(span<const byte>(read_buf, size))) { + return false; + } + + DBUG_EXECUTE_IF( + "page_intermittent_checksum_mismatch", { + static int page_counter; + if (page_counter++ == 3) { + crc32++; + } + }); + + if (crc32 != my_crc32c(0, read_buf, + size - FIL_PAGE_FCRC32_CHECKSUM)) { + return true; + } + static_assert(FIL_PAGE_FCRC32_KEY_VERSION == 0, "alignment"); + static_assert(FIL_PAGE_LSN % 4 == 0, "alignment"); + static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment"); + if (!compressed + && !mach_read_from_4(FIL_PAGE_FCRC32_KEY_VERSION + + read_buf) + && memcmp_aligned<4>(read_buf + (FIL_PAGE_LSN + 4), + end - (FIL_PAGE_FCRC32_END_LSN + - FIL_PAGE_FCRC32_CHECKSUM), + 4)) { + return true; + } + + buf_page_check_lsn(check_lsn, read_buf); + return false; + } + + const ulint zip_size = fil_space_t::zip_size(fsp_flags); + const uint16_t page_type = fil_page_get_type(read_buf); + + /* We can trust page type if page compression is set on tablespace + flags because page compression flag means file must have been + created with 10.1 (later than 5.5 code base). In 10.1 page + compressed tables do not contain post compression checksum and + FIL_PAGE_END_LSN_OLD_CHKSUM field stored. Note that space can + be null if we are in fil_check_first_page() and first page + is not compressed or encrypted. Page checksum is verified + after decompression (i.e. normally pages are already + decompressed at this stage). */ + if ((page_type == FIL_PAGE_PAGE_COMPRESSED || + page_type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) +#ifndef UNIV_INNOCHECKSUM + && FSP_FLAGS_HAS_PAGE_COMPRESSION(fsp_flags) +#endif + ) { + return(false); + } + + static_assert(FIL_PAGE_LSN % 4 == 0, "alignment"); + static_assert(FIL_PAGE_END_LSN_OLD_CHKSUM % 4 == 0, "alignment"); + + if (!zip_size + && memcmp_aligned<4>(read_buf + FIL_PAGE_LSN + 4, + read_buf + srv_page_size + - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) { + /* Stored log sequence numbers at the start and the end + of page do not match */ + + return(true); + } + + buf_page_check_lsn(check_lsn, read_buf); + + /* Check whether the checksum fields have correct values */ + + if (zip_size) { + return !page_zip_verify_checksum(read_buf, zip_size); + } + + const uint32_t checksum_field1 = mach_read_from_4( + read_buf + FIL_PAGE_SPACE_OR_CHKSUM); + + const uint32_t checksum_field2 = mach_read_from_4( + read_buf + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM); + + static_assert(FIL_PAGE_LSN % 8 == 0, "alignment"); + + /* A page filled with NUL bytes is considered not corrupted. + Before MariaDB Server 10.1.25 (MDEV-12113) or 10.2.2 (or MySQL 5.7), + the FIL_PAGE_FILE_FLUSH_LSN field may have been written nonzero + for the first page of each file of the system tablespace. 
+ We want to ignore it for the system tablespace, but because + we do not know the expected tablespace here, we ignore the + field for all data files, except for + innodb_checksum_algorithm=full_crc32 which we handled above. */ + if (!checksum_field1 && !checksum_field2) { + /* Checksum fields can have valid value as zero. + If the page is not empty then do the checksum + calculation for the page. */ + bool all_zeroes = true; + for (size_t i = 0; i < srv_page_size; i++) { +#ifndef UNIV_INNOCHECKSUM + if (i == FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION) { + i += 8; + } +#endif + if (read_buf[i]) { + all_zeroes = false; + break; + } + } + + if (all_zeroes) { + return false; + } + } + +#ifndef UNIV_INNOCHECKSUM + switch (srv_checksum_algorithm) { + case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32: + case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32: +#endif /* !UNIV_INNOCHECKSUM */ + return !buf_page_is_checksum_valid_crc32( + read_buf, checksum_field1, checksum_field2); +#ifndef UNIV_INNOCHECKSUM + default: + if (checksum_field1 == BUF_NO_CHECKSUM_MAGIC + && checksum_field2 == BUF_NO_CHECKSUM_MAGIC) { + return false; + } + + const uint32_t crc32 = buf_calc_page_crc32(read_buf); + + /* Very old versions of InnoDB only stored 8 byte lsn to the + start and the end of the page. */ + + /* Since innodb_checksum_algorithm is not strict_* allow + any of the algos to match for the old field */ + + if (checksum_field2 + != mach_read_from_4(read_buf + FIL_PAGE_LSN) + && checksum_field2 != BUF_NO_CHECKSUM_MAGIC) { + + DBUG_EXECUTE_IF( + "page_intermittent_checksum_mismatch", { + static int page_counter; + if (page_counter++ == 3) return true; + }); + + if ((checksum_field1 != crc32 + || checksum_field2 != crc32) + && checksum_field2 + != buf_calc_page_old_checksum(read_buf)) { + return true; + } + } + + switch (checksum_field1) { + case 0: + case BUF_NO_CHECKSUM_MAGIC: + return false; + } + return (checksum_field1 != crc32 || checksum_field2 != crc32) + && checksum_field1 + != buf_calc_page_new_checksum(read_buf); + } +#endif /* !UNIV_INNOCHECKSUM */ +} + +#ifndef UNIV_INNOCHECKSUM + +#if defined(DBUG_OFF) && defined(HAVE_MADVISE) && defined(MADV_DODUMP) +/** Enable buffers to be dumped to core files + +A convience function, not called anyhwere directly however +it is left available for gdb or any debugger to call +in the event that you want all of the memory to be dumped +to a core file. + +Returns number of errors found in madvise calls. */ +MY_ATTRIBUTE((used)) +int +buf_madvise_do_dump() +{ + int ret= 0; + + /* mirrors allocation in log_t::create() */ + if (log_sys.buf) { + ret += madvise(log_sys.buf, log_sys.buf_size, MADV_DODUMP); + ret += madvise(log_sys.flush_buf, log_sys.buf_size, + MADV_DODUMP); + } + + mysql_mutex_lock(&buf_pool.mutex); + auto chunk = buf_pool.chunks; + + for (ulint n = buf_pool.n_chunks; n--; chunk++) { + ret+= madvise(chunk->mem, chunk->mem_size(), MADV_DODUMP); + } + + mysql_mutex_unlock(&buf_pool.mutex); + return ret; +} +#endif + +#ifndef UNIV_DEBUG +static inline byte hex_to_ascii(byte hex_digit) +{ + const int offset= hex_digit <= 9 ? '0' : 'a' - 10; + return byte(hex_digit + offset); +} +#endif + +/** Dump a page to stderr. +@param[in] read_buf database page +@param[in] zip_size compressed page size, or 0 */ +ATTRIBUTE_COLD +void buf_page_print(const byte *read_buf, ulint zip_size) +{ +#ifndef UNIV_DEBUG + const size_t size = zip_size ? 
zip_size : srv_page_size; + const byte * const end= read_buf + size; + sql_print_information("InnoDB: Page dump (%zu bytes):", size); + + do + { + byte row[64]; + + for (byte *r= row; r != &row[64]; r+= 2, read_buf++) + { + r[0]= hex_to_ascii(byte(*read_buf >> 4)); + r[1]= hex_to_ascii(*read_buf & 15); + } + + sql_print_information("InnoDB: %.*s", 64, row); + } + while (read_buf != end); + + sql_print_information("InnoDB: End of page dump"); +#endif +} + +/** Initialize a buffer page descriptor. +@param[in,out] block buffer page descriptor +@param[in] frame buffer page frame */ +static +void +buf_block_init(buf_block_t* block, byte* frame) +{ + /* This function should only be executed at database startup or by + buf_pool.resize(). Either way, adaptive hash index must not exist. */ + assert_block_ahi_empty_on_init(block); + + block->page.frame = frame; + + MEM_MAKE_DEFINED(&block->modify_clock, sizeof block->modify_clock); + ut_ad(!block->modify_clock); + MEM_MAKE_DEFINED(&block->page.lock, sizeof block->page.lock); + block->page.init(buf_page_t::NOT_USED, page_id_t(~0ULL)); +#ifdef BTR_CUR_HASH_ADAPT + MEM_MAKE_DEFINED(&block->index, sizeof block->index); + ut_ad(!block->index); +#endif /* BTR_CUR_HASH_ADAPT */ + ut_d(block->in_unzip_LRU_list = false); + ut_d(block->in_withdraw_list = false); + + page_zip_des_init(&block->page.zip); + + MEM_MAKE_DEFINED(&block->page.hash, sizeof block->page.hash); + ut_ad(!block->page.hash); +} + +/** Allocate a chunk of buffer frames. +@param bytes requested size +@return whether the allocation succeeded */ +inline bool buf_pool_t::chunk_t::create(size_t bytes) +{ + DBUG_EXECUTE_IF("ib_buf_chunk_init_fails", return false;); + /* Round down to a multiple of page size, although it already should be. */ + bytes= ut_2pow_round<size_t>(bytes, srv_page_size); + + mem= buf_pool.allocator.allocate_large_dontdump(bytes, &mem_pfx); + + if (UNIV_UNLIKELY(!mem)) + return false; + + MEM_UNDEFINED(mem, mem_size()); + +#ifdef HAVE_LIBNUMA + if (srv_numa_interleave) + { + struct bitmask *numa_mems_allowed= numa_get_mems_allowed(); + if (mbind(mem, mem_size(), MPOL_INTERLEAVE, + numa_mems_allowed->maskp, numa_mems_allowed->size, + MPOL_MF_MOVE)) + { + ib::warn() << "Failed to set NUMA memory policy of" + " buffer pool page frames to MPOL_INTERLEAVE" + " (error: " << strerror(errno) << ")."; + } + numa_bitmask_free(numa_mems_allowed); + } +#endif /* HAVE_LIBNUMA */ + + + /* Allocate the block descriptors from + the start of the memory block. */ + blocks= reinterpret_cast<buf_block_t*>(mem); + + /* Align a pointer to the first frame. Note that when + opt_large_page_size is smaller than srv_page_size, + (with max srv_page_size at 64k don't think any hardware + makes this true), + we may allocate one fewer block than requested. When + it is bigger, we may allocate more blocks than requested. */ + static_assert(sizeof(byte*) == sizeof(ulint), "pointer size"); + + byte *frame= reinterpret_cast<byte*>((reinterpret_cast<ulint>(mem) + + srv_page_size - 1) & + ~ulint{srv_page_size - 1}); + size= (mem_pfx.m_size >> srv_page_size_shift) - (frame != mem); + + /* Subtract the space needed for block descriptors. */ + { + ulint s= size; + + while (frame < reinterpret_cast<const byte*>(blocks + s)) + { + frame+= srv_page_size; + s--; + } + + size= s; + } + + /* Init block structs and assign frames for them. Then we assign the + frames to the first blocks (we already mapped the memory above). 
*/ + + buf_block_t *block= blocks; + + for (auto i= size; i--; ) { + buf_block_init(block, frame); + MEM_UNDEFINED(block->page.frame, srv_page_size); + /* Add the block to the free list */ + UT_LIST_ADD_LAST(buf_pool.free, &block->page); + + ut_d(block->page.in_free_list = TRUE); + block++; + frame+= srv_page_size; + } + + reg(); + + return true; +} + +#ifdef UNIV_DEBUG +/** Check that all file pages in the buffer chunk are in a replaceable state. +@return address of a non-free block +@retval nullptr if all freed */ +inline const buf_block_t *buf_pool_t::chunk_t::not_freed() const +{ + buf_block_t *block= blocks; + for (auto i= size; i--; block++) + { + if (block->page.in_file()) + { + /* The uncompressed buffer pool should never + contain ROW_FORMAT=COMPRESSED block descriptors. */ + ut_ad(block->page.frame); + const lsn_t lsn= block->page.oldest_modification(); + + if (srv_read_only_mode) + { + /* The page cleaner is disabled in read-only mode. No pages + can be dirtied, so all of them must be clean. */ + ut_ad(lsn == 0 || lsn == recv_sys.lsn || + srv_force_recovery == SRV_FORCE_NO_LOG_REDO); + break; + } + + if (fsp_is_system_temporary(block->page.id().space())) + { + ut_ad(lsn == 0 || lsn == 2); + break; + } + + if (lsn > 1 || !block->page.can_relocate()) + return block; + + break; + } + } + + return nullptr; +} +#endif /* UNIV_DEBUG */ + +/** Create the hash table. +@param n the lower bound of n_cells */ +void buf_pool_t::page_hash_table::create(ulint n) +{ + n_cells= ut_find_prime(n); + const size_t size= MY_ALIGN(pad(n_cells) * sizeof *array, + CPU_LEVEL1_DCACHE_LINESIZE); + void *v= aligned_malloc(size, CPU_LEVEL1_DCACHE_LINESIZE); + memset_aligned<CPU_LEVEL1_DCACHE_LINESIZE>(v, 0, size); + array= static_cast<hash_chain*>(v); +} + +/** Create the buffer pool. 
+@return whether the creation failed */ +bool buf_pool_t::create() +{ + ut_ad(this == &buf_pool); + ut_ad(srv_buf_pool_size % srv_buf_pool_chunk_unit == 0); + ut_ad(!is_initialised()); + ut_ad(srv_buf_pool_size > 0); + ut_ad(!resizing); + ut_ad(!chunks_old); + /* mariabackup loads tablespaces, and it requires field_ref_zero to be + allocated before innodb initialization */ + ut_ad(srv_operation >= SRV_OPERATION_RESTORE || !field_ref_zero); + + NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE; + + if (!field_ref_zero) { + if (auto b= aligned_malloc(UNIV_PAGE_SIZE_MAX, 4096)) + field_ref_zero= static_cast<const byte*> + (memset_aligned<4096>(b, 0, UNIV_PAGE_SIZE_MAX)); + else + return true; + } + + chunk_t::map_reg= UT_NEW_NOKEY(chunk_t::map()); + + new(&allocator) ut_allocator<unsigned char>(mem_key_buf_buf_pool); + + n_chunks= srv_buf_pool_size / srv_buf_pool_chunk_unit; + const size_t chunk_size= srv_buf_pool_chunk_unit; + + chunks= static_cast<chunk_t*>(ut_zalloc_nokey(n_chunks * sizeof *chunks)); + UT_LIST_INIT(free, &buf_page_t::list); + curr_size= 0; + auto chunk= chunks; + + do + { + if (!chunk->create(chunk_size)) + { + while (--chunk >= chunks) + { + buf_block_t* block= chunk->blocks; + + for (auto i= chunk->size; i--; block++) + block->page.lock.free(); + + allocator.deallocate_large_dodump(chunk->mem, &chunk->mem_pfx); + } + ut_free(chunks); + chunks= nullptr; + UT_DELETE(chunk_t::map_reg); + chunk_t::map_reg= nullptr; + aligned_free(const_cast<byte*>(field_ref_zero)); + field_ref_zero= nullptr; + ut_ad(!is_initialised()); + return true; + } + + curr_size+= chunk->size; + } + while (++chunk < chunks + n_chunks); + + ut_ad(is_initialised()); +#if defined(__aarch64__) + mysql_mutex_init(buf_pool_mutex_key, &mutex, MY_MUTEX_INIT_FAST); +#else + mysql_mutex_init(buf_pool_mutex_key, &mutex, nullptr); +#endif + + UT_LIST_INIT(LRU, &buf_page_t::LRU); + UT_LIST_INIT(withdraw, &buf_page_t::list); + withdraw_target= 0; + UT_LIST_INIT(flush_list, &buf_page_t::list); + UT_LIST_INIT(unzip_LRU, &buf_block_t::unzip_LRU); + + for (size_t i= 0; i < UT_ARR_SIZE(zip_free); ++i) + UT_LIST_INIT(zip_free[i], &buf_buddy_free_t::list); + ulint s= curr_size; + s/= BUF_READ_AHEAD_PORTION; + read_ahead_area= s >= READ_AHEAD_PAGES + ? 
READ_AHEAD_PAGES + : my_round_up_to_next_power(static_cast<uint32_t>(s)); + curr_pool_size= srv_buf_pool_size; + + n_chunks_new= n_chunks; + + page_hash.create(2 * curr_size); + zip_hash.create(2 * curr_size); + last_printout_time= time(NULL); + + mysql_mutex_init(flush_list_mutex_key, &flush_list_mutex, + MY_MUTEX_INIT_FAST); + + pthread_cond_init(&done_flush_LRU, nullptr); + pthread_cond_init(&done_flush_list, nullptr); + pthread_cond_init(&do_flush_list, nullptr); + pthread_cond_init(&done_free, nullptr); + + try_LRU_scan= true; + + ut_d(flush_hp.m_mutex= &flush_list_mutex;); + ut_d(lru_hp.m_mutex= &mutex); + ut_d(lru_scan_itr.m_mutex= &mutex); + + io_buf.create((srv_n_read_io_threads + srv_n_write_io_threads) * + OS_AIO_N_PENDING_IOS_PER_THREAD); + + /* FIXME: remove some of these variables */ + srv_buf_pool_curr_size= curr_pool_size; + srv_buf_pool_old_size= srv_buf_pool_size; + srv_buf_pool_base_size= srv_buf_pool_size; + + last_activity_count= srv_get_activity_count(); + + chunk_t::map_ref= chunk_t::map_reg; + buf_LRU_old_ratio_update(100 * 3 / 8, false); + btr_search_sys_create(); + ut_ad(is_initialised()); + return false; +} + +/** Clean up after successful create() */ +void buf_pool_t::close() +{ + ut_ad(this == &buf_pool); + if (!is_initialised()) + return; + + mysql_mutex_destroy(&mutex); + mysql_mutex_destroy(&flush_list_mutex); + + for (buf_page_t *bpage= UT_LIST_GET_LAST(LRU), *prev_bpage= nullptr; bpage; + bpage= prev_bpage) + { + prev_bpage= UT_LIST_GET_PREV(LRU, bpage); + ut_ad(bpage->in_file()); + ut_ad(bpage->in_LRU_list); + /* The buffer pool must be clean during normal shutdown. + Only on aborted startup (with recovery) or with innodb_fast_shutdown=2 + we may discard changes. */ + ut_d(const lsn_t oldest= bpage->oldest_modification();) + ut_ad(fsp_is_system_temporary(bpage->id().space()) + ? (oldest == 0 || oldest == 2) + : oldest <= 1 || srv_is_being_started || srv_fast_shutdown == 2); + + if (UNIV_UNLIKELY(!bpage->frame)) + { + bpage->lock.free(); + ut_free(bpage); + } + } + + for (auto chunk= chunks + n_chunks; --chunk >= chunks; ) + { + buf_block_t *block= chunk->blocks; + + for (auto i= chunk->size; i--; block++) + block->page.lock.free(); + + allocator.deallocate_large_dodump(chunk->mem, &chunk->mem_pfx); + } + + pthread_cond_destroy(&done_flush_LRU); + pthread_cond_destroy(&done_flush_list); + pthread_cond_destroy(&do_flush_list); + pthread_cond_destroy(&done_free); + + ut_free(chunks); + chunks= nullptr; + page_hash.free(); + zip_hash.free(); + + io_buf.close(); + UT_DELETE(chunk_t::map_reg); + chunk_t::map_reg= chunk_t::map_ref= nullptr; + aligned_free(const_cast<byte*>(field_ref_zero)); + field_ref_zero= nullptr; +} + +/** Try to reallocate a control block. 
+@param block control block to reallocate +@return whether the reallocation succeeded */ +inline bool buf_pool_t::realloc(buf_block_t *block) +{ + buf_block_t* new_block; + + mysql_mutex_assert_owner(&mutex); + ut_ad(block->page.in_file()); + ut_ad(block->page.frame); + + new_block = buf_LRU_get_free_only(); + + if (new_block == NULL) { + mysql_mutex_lock(&buf_pool.flush_list_mutex); + page_cleaner_wakeup(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + return(false); /* free list was not enough */ + } + + const page_id_t id{block->page.id()}; + hash_chain& chain = page_hash.cell_get(id.fold()); + page_hash_latch& hash_lock = page_hash.lock_get(chain); + /* It does not make sense to use transactional_lock_guard + here, because copying innodb_page_size (4096 to 65536) bytes + as well as other changes would likely make the memory + transaction too large. */ + hash_lock.lock(); + + if (block->page.can_relocate()) { + memcpy_aligned<UNIV_PAGE_SIZE_MIN>( + new_block->page.frame, block->page.frame, + srv_page_size); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + const auto frame = new_block->page.frame; + new_block->page.lock.free(); + new (&new_block->page) buf_page_t(block->page); + new_block->page.frame = frame; + + /* relocate LRU list */ + if (buf_page_t* prev_b = buf_pool.LRU_remove(&block->page)) { + UT_LIST_INSERT_AFTER(LRU, prev_b, &new_block->page); + } else { + UT_LIST_ADD_FIRST(LRU, &new_block->page); + } + + if (LRU_old == &block->page) { + LRU_old = &new_block->page; + } + + ut_ad(new_block->page.in_LRU_list); + + /* relocate unzip_LRU list */ + if (block->page.zip.data != NULL) { + ut_ad(block->in_unzip_LRU_list); + ut_d(new_block->in_unzip_LRU_list = true); + + buf_block_t* prev_block = UT_LIST_GET_PREV(unzip_LRU, block); + UT_LIST_REMOVE(unzip_LRU, block); + + ut_d(block->in_unzip_LRU_list = false); + block->page.zip.data = NULL; + page_zip_set_size(&block->page.zip, 0); + + if (prev_block != NULL) { + UT_LIST_INSERT_AFTER(unzip_LRU, prev_block, new_block); + } else { + UT_LIST_ADD_FIRST(unzip_LRU, new_block); + } + } else { + ut_ad(!block->in_unzip_LRU_list); + ut_d(new_block->in_unzip_LRU_list = false); + } + + /* relocate page_hash */ + hash_chain& chain = page_hash.cell_get(id.fold()); + ut_ad(&block->page == page_hash.get(id, chain)); + buf_pool.page_hash.replace(chain, &block->page, + &new_block->page); + buf_block_modify_clock_inc(block); + static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment"); + memset_aligned<4>(block->page.frame + + FIL_PAGE_OFFSET, 0xff, 4); + static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2, + "not perfect alignment"); + memset_aligned<2>(block->page.frame + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4); + MEM_UNDEFINED(block->page.frame, srv_page_size); + block->page.set_state(buf_page_t::REMOVE_HASH); + if (!fsp_is_system_temporary(id.space())) { + buf_flush_relocate_on_flush_list(&block->page, + &new_block->page); + } + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + block->page.set_corrupt_id(); + + /* set other flags of buf_block_t */ + +#ifdef BTR_CUR_HASH_ADAPT + /* This code should only be executed by resize(), + while the adaptive hash index is disabled. 
*/ + assert_block_ahi_empty(block); + assert_block_ahi_empty_on_init(new_block); + ut_ad(!block->index); + new_block->index = NULL; + new_block->n_hash_helps = 0; + new_block->n_fields = 1; + new_block->left_side = TRUE; +#endif /* BTR_CUR_HASH_ADAPT */ + ut_d(block->page.set_state(buf_page_t::MEMORY)); + /* free block */ + new_block = block; + } + + hash_lock.unlock(); + buf_LRU_block_free_non_file_page(new_block); + return(true); /* free_list was enough */ +} + +void buf_pool_t::io_buf_t::create(ulint n_slots) +{ + this->n_slots= n_slots; + slots= static_cast<buf_tmp_buffer_t*> + (ut_malloc_nokey(n_slots * sizeof *slots)); + memset((void*) slots, 0, n_slots * sizeof *slots); +} + +void buf_pool_t::io_buf_t::close() +{ + for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++) + { + aligned_free(s->crypt_buf); + aligned_free(s->comp_buf); + } + ut_free(slots); + slots= nullptr; + n_slots= 0; +} + +buf_tmp_buffer_t *buf_pool_t::io_buf_t::reserve() +{ + for (;;) + { + for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++) + if (s->acquire()) + return s; + os_aio_wait_until_no_pending_writes(true); + for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++) + if (s->acquire()) + return s; + os_aio_wait_until_no_pending_reads(true); + } +} + +/** Sets the global variable that feeds MySQL's innodb_buffer_pool_resize_status +to the specified string. The format and the following parameters are the +same as the ones used for printf(3). +@param[in] fmt format +@param[in] ... extra parameters according to fmt */ +static +void +buf_resize_status( + const char* fmt, + ...) +{ + va_list ap; + + va_start(ap, fmt); + + vsnprintf( + export_vars.innodb_buffer_pool_resize_status, + sizeof(export_vars.innodb_buffer_pool_resize_status), + fmt, ap); + + va_end(ap); + + ib::info() << export_vars.innodb_buffer_pool_resize_status; +} + +/** Withdraw blocks from the buffer pool until meeting withdraw_target. 
+@return whether retry is needed */ +inline bool buf_pool_t::withdraw_blocks() +{ + buf_block_t* block; + ulint loop_count = 0; + + ib::info() << "Start to withdraw the last " + << withdraw_target << " blocks."; + + while (UT_LIST_GET_LEN(withdraw) < withdraw_target) { + + /* try to withdraw from free_list */ + ulint count1 = 0; + + mysql_mutex_lock(&mutex); + buf_buddy_condense_free(); + block = reinterpret_cast<buf_block_t*>( + UT_LIST_GET_FIRST(free)); + while (block != NULL + && UT_LIST_GET_LEN(withdraw) < withdraw_target) { + ut_ad(block->page.in_free_list); + ut_ad(!block->page.oldest_modification()); + ut_ad(!block->page.in_LRU_list); + ut_a(!block->page.in_file()); + + buf_block_t* next_block; + next_block = reinterpret_cast<buf_block_t*>( + UT_LIST_GET_NEXT( + list, &block->page)); + + if (will_be_withdrawn(block->page)) { + /* This should be withdrawn */ + UT_LIST_REMOVE(free, &block->page); + UT_LIST_ADD_LAST(withdraw, &block->page); + ut_d(block->in_withdraw_list = true); + count1++; + } + + block = next_block; + } + + /* reserve free_list length */ + if (UT_LIST_GET_LEN(withdraw) < withdraw_target) { + buf_flush_LRU( + std::max<ulint>(withdraw_target + - UT_LIST_GET_LEN(withdraw), + srv_LRU_scan_depth), + true); + mysql_mutex_unlock(&buf_pool.mutex); + buf_dblwr.flush_buffered_writes(); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_flush_wait_LRU_batch_end(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + mysql_mutex_lock(&buf_pool.mutex); + } + + /* relocate blocks/buddies in withdrawn area */ + ulint count2 = 0; + + buf_pool_mutex_exit_forbid(); + for (buf_page_t* bpage = UT_LIST_GET_FIRST(LRU), *next_bpage; + bpage; bpage = next_bpage) { + ut_ad(bpage->in_file()); + next_bpage = UT_LIST_GET_NEXT(LRU, bpage); + if (UNIV_LIKELY_NULL(bpage->zip.data) + && will_be_withdrawn(bpage->zip.data) + && bpage->can_relocate()) { + if (!buf_buddy_realloc( + bpage->zip.data, + page_zip_get_size(&bpage->zip))) { + /* failed to allocate block */ + break; + } + count2++; + if (bpage->frame) { + goto realloc_frame; + } + } + + if (bpage->frame && will_be_withdrawn(*bpage) + && bpage->can_relocate()) { +realloc_frame: + if (!realloc(reinterpret_cast<buf_block_t*>( + bpage))) { + /* failed to allocate block */ + break; + } + count2++; + } + } + buf_pool_mutex_exit_allow(); + mysql_mutex_unlock(&mutex); + + buf_resize_status( + "Withdrawing blocks. (" ULINTPF "/" ULINTPF ").", + UT_LIST_GET_LEN(withdraw), + withdraw_target); + + ib::info() << "Withdrew " + << count1 << " blocks from free list." + << " Tried to relocate " << count2 << " blocks (" + << UT_LIST_GET_LEN(withdraw) << "/" + << withdraw_target << ")."; + + if (++loop_count >= 10) { + /* give up for now. + retried after user threads paused. 
*/ + + ib::info() << "will retry to withdraw later"; + + /* need retry later */ + return(true); + } + } + + /* confirm withdrawn enough */ + for (const chunk_t* chunk = chunks + n_chunks_new, + * const echunk = chunks + n_chunks; chunk != echunk; chunk++) { + block = chunk->blocks; + for (ulint j = chunk->size; j--; block++) { + ut_a(block->page.state() == buf_page_t::NOT_USED); + ut_ad(block->in_withdraw_list); + } + } + + ib::info() << "Withdrawn target: " << UT_LIST_GET_LEN(withdraw) + << " blocks."; + + return(false); +} + + + +inline void buf_pool_t::page_hash_table::write_lock_all() +{ + for (auto n= pad(n_cells) & ~ELEMENTS_PER_LATCH;; n-= ELEMENTS_PER_LATCH + 1) + { + reinterpret_cast<page_hash_latch&>(array[n]).lock(); + if (!n) + break; + } +} + + +inline void buf_pool_t::page_hash_table::write_unlock_all() +{ + for (auto n= pad(n_cells) & ~ELEMENTS_PER_LATCH;; n-= ELEMENTS_PER_LATCH + 1) + { + reinterpret_cast<page_hash_latch&>(array[n]).unlock(); + if (!n) + break; + } +} + + +namespace +{ + +struct find_interesting_trx +{ + void operator()(const trx_t &trx) + { + if (trx.state == TRX_STATE_NOT_STARTED) + return; + if (trx.mysql_thd == nullptr) + return; + if (withdraw_started <= trx.start_time_micro) + return; + + if (!found) + { + ib::warn() << "The following trx might hold " + "the blocks in buffer pool to " + "be withdrawn. Buffer pool " + "resizing can complete only " + "after all the transactions " + "below release the blocks."; + found= true; + } + + lock_trx_print_wait_and_mvcc_state(stderr, &trx, current_time); + } + + bool &found; + /** microsecond_interval_timer() */ + const ulonglong withdraw_started; + const my_hrtime_t current_time; +}; + +} // namespace + +/** Resize from srv_buf_pool_old_size to srv_buf_pool_size. */ +inline void buf_pool_t::resize() +{ + ut_ad(this == &buf_pool); + + bool warning = false; + + NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE; + + ut_ad(!resize_in_progress()); + ut_ad(srv_buf_pool_chunk_unit > 0); + + ulint new_instance_size = srv_buf_pool_size >> srv_page_size_shift; + std::ostringstream str_old_size, str_new_size, str_chunk_size; + str_old_size << ib::bytes_iec{srv_buf_pool_old_size}; + str_new_size << ib::bytes_iec{srv_buf_pool_size}; + str_chunk_size << ib::bytes_iec{srv_buf_pool_chunk_unit}; + + buf_resize_status("Resizing buffer pool from %s to %s (unit = %s).", + str_old_size.str().c_str(), + str_new_size.str().c_str(), + str_chunk_size.str().c_str()); + +#ifdef BTR_CUR_HASH_ADAPT + /* disable AHI if needed */ + buf_resize_status("Disabling adaptive hash index."); + + btr_search_s_lock_all(); + const bool btr_search_disabled = btr_search_enabled; + btr_search_s_unlock_all(); + + btr_search_disable(); + + if (btr_search_disabled) { + ib::info() << "disabled adaptive hash index."; + } +#endif /* BTR_CUR_HASH_ADAPT */ + + mysql_mutex_lock(&mutex); + ut_ad(n_chunks_new == n_chunks); + ut_ad(UT_LIST_GET_LEN(withdraw) == 0); + + n_chunks_new = (new_instance_size << srv_page_size_shift) + / srv_buf_pool_chunk_unit; + curr_size = n_chunks_new * chunks->size; + mysql_mutex_unlock(&mutex); + + if (is_shrinking()) { + /* set withdraw target */ + size_t w = 0; + + for (const chunk_t* chunk = chunks + n_chunks_new, + * const echunk = chunks + n_chunks; + chunk != echunk; chunk++) + w += chunk->size; + + ut_ad(withdraw_target == 0); + withdraw_target = w; + } + + buf_resize_status("Withdrawing blocks to be shrunken."); + + ulonglong withdraw_started = microsecond_interval_timer(); + ulonglong message_interval = 60ULL * 1000 * 1000; + ulint 
retry_interval = 1; + +withdraw_retry: + /* wait for the number of blocks fit to the new size (if needed)*/ + bool should_retry_withdraw = is_shrinking() + && withdraw_blocks(); + + if (srv_shutdown_state != SRV_SHUTDOWN_NONE) { + /* abort to resize for shutdown. */ + return; + } + + /* abort buffer pool load */ + buf_load_abort(); + + const ulonglong current_time = microsecond_interval_timer(); + + if (should_retry_withdraw + && current_time - withdraw_started >= message_interval) { + + if (message_interval > 900000000) { + message_interval = 1800000000; + } else { + message_interval *= 2; + } + + bool found= false; + find_interesting_trx f + {found, withdraw_started, my_hrtime_coarse()}; + withdraw_started = current_time; + + /* This is going to exceed the maximum size of a + memory transaction. */ + LockMutexGuard g{SRW_LOCK_CALL}; + trx_sys.trx_list.for_each(f); + } + + if (should_retry_withdraw) { + ib::info() << "Will retry to withdraw " << retry_interval + << " seconds later."; + std::this_thread::sleep_for( + std::chrono::seconds(retry_interval)); + + if (retry_interval > 5) { + retry_interval = 10; + } else { + retry_interval *= 2; + } + + goto withdraw_retry; + } + + buf_resize_status("Latching entire buffer pool."); + +#ifndef DBUG_OFF + { + bool should_wait = true; + + while (should_wait) { + should_wait = false; + DBUG_EXECUTE_IF( + "ib_buf_pool_resize_wait_before_resize", + should_wait = true; + std::this_thread::sleep_for( + std::chrono::milliseconds(10));); + } + } +#endif /* !DBUG_OFF */ + + if (srv_shutdown_state != SRV_SHUTDOWN_NONE) { + return; + } + + /* Indicate critical path */ + resizing.store(true, std::memory_order_relaxed); + + mysql_mutex_lock(&mutex); + page_hash.write_lock_all(); + + chunk_t::map_reg = UT_NEW_NOKEY(chunk_t::map()); + + /* add/delete chunks */ + + buf_resize_status("Resizing buffer pool from " + ULINTPF " chunks to " ULINTPF " chunks.", + n_chunks, n_chunks_new); + + if (is_shrinking()) { + /* delete chunks */ + chunk_t* chunk = chunks + n_chunks_new; + const chunk_t* const echunk = chunks + n_chunks; + + ulint sum_freed = 0; + + while (chunk < echunk) { + /* buf_LRU_block_free_non_file_page() invokes + MEM_NOACCESS() on any buf_pool.free blocks. + We must cancel the effect of that. In + MemorySanitizer, MEM_NOACCESS() is no-op, so + we must not do anything special for it here. 
*/ +#ifdef HAVE_valgrind +# if !__has_feature(memory_sanitizer) + MEM_MAKE_DEFINED(chunk->mem, chunk->mem_size()); +# endif +#else + MEM_MAKE_ADDRESSABLE(chunk->mem, chunk->size); +#endif + + buf_block_t* block = chunk->blocks; + + for (ulint j = chunk->size; j--; block++) { + block->page.lock.free(); + } + + allocator.deallocate_large_dodump( + chunk->mem, &chunk->mem_pfx); + sum_freed += chunk->size; + ++chunk; + } + + /* discard withdraw list */ + UT_LIST_INIT(withdraw, &buf_page_t::list); + withdraw_target = 0; + + ib::info() << n_chunks - n_chunks_new + << " Chunks (" << sum_freed + << " blocks) were freed."; + + n_chunks = n_chunks_new; + } + + { + /* reallocate chunks */ + const size_t new_chunks_size + = n_chunks_new * sizeof(chunk_t); + + chunk_t* new_chunks = static_cast<chunk_t*>( + ut_zalloc_nokey_nofatal(new_chunks_size)); + + DBUG_EXECUTE_IF("buf_pool_resize_chunk_null", + ut_free(new_chunks); new_chunks= nullptr; ); + + if (!new_chunks) { + ib::error() << "failed to allocate" + " the chunk array."; + n_chunks_new = n_chunks; + warning = true; + chunks_old = NULL; + goto calc_buf_pool_size; + } + + ulint n_chunks_copy = ut_min(n_chunks_new, n_chunks); + + memcpy(new_chunks, chunks, + n_chunks_copy * sizeof *new_chunks); + + for (ulint j = 0; j < n_chunks_copy; j++) { + new_chunks[j].reg(); + } + + chunks_old = chunks; + chunks = new_chunks; + } + + if (n_chunks_new > n_chunks) { + /* add chunks */ + ulint sum_added = 0; + ulint n = n_chunks; + const size_t unit = srv_buf_pool_chunk_unit; + + for (chunk_t* chunk = chunks + n_chunks, + * const echunk = chunks + n_chunks_new; + chunk != echunk; chunk++) { + if (!chunk->create(unit)) { + ib::error() << "failed to allocate" + " memory for buffer pool chunk"; + + warning = true; + n_chunks_new = n_chunks; + break; + } + + sum_added += chunk->size; + ++n; + } + + ib::info() << n_chunks_new - n_chunks + << " chunks (" << sum_added + << " blocks) were added."; + + n_chunks = n; + } +calc_buf_pool_size: + /* recalc curr_size */ + ulint new_size = 0; + + { + chunk_t* chunk = chunks; + const chunk_t* const echunk = chunk + n_chunks; + do { + new_size += chunk->size; + } while (++chunk != echunk); + } + + curr_size = new_size; + n_chunks_new = n_chunks; + + if (chunks_old) { + ut_free(chunks_old); + chunks_old = NULL; + } + + chunk_t::map* chunk_map_old = chunk_t::map_ref; + chunk_t::map_ref = chunk_t::map_reg; + + /* set size */ + ut_ad(UT_LIST_GET_LEN(withdraw) == 0); + ulint s= curr_size; + s/= BUF_READ_AHEAD_PORTION; + read_ahead_area= s >= READ_AHEAD_PAGES + ? 
READ_AHEAD_PAGES + : my_round_up_to_next_power(static_cast<uint32_t>(s)); + curr_pool_size= n_chunks * srv_buf_pool_chunk_unit; + srv_buf_pool_curr_size= curr_pool_size;/* FIXME: remove*/ + extern ulonglong innobase_buffer_pool_size; + innobase_buffer_pool_size= buf_pool_size_align(srv_buf_pool_curr_size); + + const bool new_size_too_diff + = srv_buf_pool_base_size > srv_buf_pool_size * 2 + || srv_buf_pool_base_size * 2 < srv_buf_pool_size; + + mysql_mutex_unlock(&mutex); + page_hash.write_unlock_all(); + + UT_DELETE(chunk_map_old); + + resizing.store(false, std::memory_order_relaxed); + + /* Normalize other components, if the new size is too different */ + if (!warning && new_size_too_diff) { + srv_buf_pool_base_size = srv_buf_pool_size; + + buf_resize_status("Resizing other hash tables."); + + srv_lock_table_size = 5 + * (srv_buf_pool_size >> srv_page_size_shift); + lock_sys.resize(srv_lock_table_size); + dict_sys.resize(); + + ib::info() << "Resized hash tables: lock_sys," +#ifdef BTR_CUR_HASH_ADAPT + " adaptive hash index," +#endif /* BTR_CUR_HASH_ADAPT */ + " and dictionary."; + } + + /* normalize ibuf.max_size */ + ibuf_max_size_update(srv_change_buffer_max_size); + + if (srv_buf_pool_old_size != srv_buf_pool_size) { + + buf_resize_status("Completed resizing buffer pool from %zu to %zu bytes." + ,srv_buf_pool_old_size, srv_buf_pool_size); + srv_buf_pool_old_size = srv_buf_pool_size; + } + +#ifdef BTR_CUR_HASH_ADAPT + /* enable AHI if needed */ + if (btr_search_disabled) { + btr_search_enable(true); + ib::info() << "Re-enabled adaptive hash index."; + } +#endif /* BTR_CUR_HASH_ADAPT */ + + if (warning) + buf_resize_status("Resizing buffer pool failed"); + + ut_d(validate()); + + return; +} + +/** Thread pool task invoked by innodb_buffer_pool_size changes. */ +static void buf_resize_callback(void *) +{ + DBUG_ENTER("buf_resize_callback"); + ut_ad(srv_shutdown_state < SRV_SHUTDOWN_CLEANUP); + mysql_mutex_lock(&buf_pool.mutex); + const auto size= srv_buf_pool_size; + const bool work= srv_buf_pool_old_size != size; + mysql_mutex_unlock(&buf_pool.mutex); + + if (work) + buf_pool.resize(); + else + { + std::ostringstream sout; + sout << "Size did not change: old size = new size = " << size; + buf_resize_status(sout.str().c_str()); + } + DBUG_VOID_RETURN; +} + +/* Ensure that task does not run in parallel, by setting max_concurrency to 1 for the thread group */ +static tpool::task_group single_threaded_group(1); +static tpool::waitable_task buf_resize_task(buf_resize_callback, + nullptr, &single_threaded_group); + +void buf_resize_start() +{ + srv_thread_pool->submit_task(&buf_resize_task); +} + +void buf_resize_shutdown() +{ + buf_resize_task.wait(); +} + + +/** Relocate a ROW_FORMAT=COMPRESSED block in the LRU list and +buf_pool.page_hash. +The caller must relocate bpage->list. 
+@param bpage ROW_FORMAT=COMPRESSED only block +@param dpage destination control block */ +static void buf_relocate(buf_page_t *bpage, buf_page_t *dpage) +{ + const page_id_t id{bpage->id()}; + buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold()); + ut_ad(!bpage->frame); + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(buf_pool.page_hash.lock_get(chain).is_write_locked()); + ut_ad(bpage == buf_pool.page_hash.get(id, chain)); + ut_ad(!buf_pool.watch_is_sentinel(*bpage)); + ut_d(const auto state= bpage->state()); + ut_ad(state >= buf_page_t::FREED); + ut_ad(state <= buf_page_t::READ_FIX); + ut_ad(bpage->lock.is_write_locked()); + const auto frame= dpage->frame; + + dpage->lock.free(); + new (dpage) buf_page_t(*bpage); + + dpage->frame= frame; + + /* Important that we adjust the hazard pointer before + removing bpage from LRU list. */ + if (buf_page_t *b= buf_pool.LRU_remove(bpage)) + UT_LIST_INSERT_AFTER(buf_pool.LRU, b, dpage); + else + UT_LIST_ADD_FIRST(buf_pool.LRU, dpage); + + if (UNIV_UNLIKELY(buf_pool.LRU_old == bpage)) + { + buf_pool.LRU_old= dpage; +#ifdef UNIV_LRU_DEBUG + /* buf_pool.LRU_old must be the first item in the LRU list + whose "old" flag is set. */ + ut_a(buf_pool.LRU_old->old); + ut_a(!UT_LIST_GET_PREV(LRU, buf_pool.LRU_old) || + !UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)->old); + ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old) || + UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)->old); + } + else + { + /* Check that the "old" flag is consistent in + the block and its neighbours. */ + dpage->set_old(dpage->is_old()); +#endif /* UNIV_LRU_DEBUG */ + } + + ut_d(CheckInLRUList::validate()); + + buf_pool.page_hash.replace(chain, bpage, dpage); +} + +buf_page_t *buf_pool_t::watch_set(const page_id_t id, + buf_pool_t::hash_chain &chain) +{ + ut_ad(&chain == &page_hash.cell_get(id.fold())); + page_hash.lock_get(chain).lock(); + + buf_page_t *bpage= page_hash.get(id, chain); + + if (bpage) + { +got_block: + bpage->fix(); + if (watch_is_sentinel(*bpage)) + bpage= nullptr; + page_hash.lock_get(chain).unlock(); + return bpage; + } + + page_hash.lock_get(chain).unlock(); + /* Allocate a watch[] and then try to insert it into the page_hash. */ + mysql_mutex_lock(&mutex); + + /* The maximum number of purge tasks should never exceed + the UT_ARR_SIZE(watch) - 1, and there is no way for a purge task to hold a + watch when setting another watch. */ + for (buf_page_t *w= &watch[UT_ARR_SIZE(watch)]; w-- >= watch; ) + { + ut_ad(w->access_time == 0); + ut_ad(!w->oldest_modification()); + ut_ad(!w->zip.data); + ut_ad(!w->in_zip_hash); + static_assert(buf_page_t::NOT_USED == 0, "efficiency"); + if (ut_d(auto s=) w->state()) + { + /* This watch may be in use for some other page. */ + ut_ad(s >= buf_page_t::UNFIXED); + continue; + } + /* w is pointing to watch[], which is protected by mutex. + Normally, buf_page_t::id for objects that are reachable by + page_hash.get(id, chain) are protected by hash_lock. */ + w->set_state(buf_page_t::UNFIXED + 1); + w->id_= id; + + page_hash.lock_get(chain).lock(); + bpage= page_hash.get(id, chain); + if (UNIV_LIKELY_NULL(bpage)) + { + w->set_state(buf_page_t::NOT_USED); + mysql_mutex_unlock(&mutex); + goto got_block; + } + + ut_ad(w->state() == buf_page_t::UNFIXED + 1); + buf_pool.page_hash.append(chain, w); + mysql_mutex_unlock(&mutex); + page_hash.lock_get(chain).unlock(); + return nullptr; + } + + ut_error; +} + +/** Stop watching whether a page has been read in. +watch_set(id) must have returned nullptr before. 
+@param id page identifier +@param chain unlocked hash table chain */ +TRANSACTIONAL_TARGET +void buf_pool_t::watch_unset(const page_id_t id, buf_pool_t::hash_chain &chain) +{ + mysql_mutex_assert_not_owner(&mutex); + buf_page_t *w; + { + transactional_lock_guard<page_hash_latch> g{page_hash.lock_get(chain)}; + /* The page must exist because watch_set() did fix(). */ + w= page_hash.get(id, chain); + ut_ad(w->in_page_hash); + if (!watch_is_sentinel(*w)) + { + no_watch: + w->unfix(); + w= nullptr; + } + else + { + const auto state= w->state(); + ut_ad(~buf_page_t::LRU_MASK & state); + ut_ad(state >= buf_page_t::UNFIXED + 1); + if (state != buf_page_t::UNFIXED + 1) + goto no_watch; + } + } + + if (!w) + return; + + const auto old= w; + /* The following is based on buf_pool_t::watch_remove(). */ + mysql_mutex_lock(&mutex); + w= page_hash.get(id, chain); + + { + transactional_lock_guard<page_hash_latch> g + {buf_pool.page_hash.lock_get(chain)}; + auto f= w->unfix(); + ut_ad(f < buf_page_t::READ_FIX || w != old); + + if (f == buf_page_t::UNFIXED && w == old) + { + page_hash.remove(chain, w); + // Now that w is detached from page_hash, release it to watch[]. + ut_ad(w->id_ == id); + ut_ad(!w->frame); + ut_ad(!w->zip.data); + w->set_state(buf_page_t::NOT_USED); + } + } + + mysql_mutex_unlock(&mutex); +} + +/** Mark the page status as FREED for the given tablespace and page number. +@param[in,out] space tablespace +@param[in] page page number +@param[in,out] mtr mini-transaction */ +TRANSACTIONAL_TARGET +void buf_page_free(fil_space_t *space, uint32_t page, mtr_t *mtr) +{ + ut_ad(mtr); + ut_ad(mtr->is_active()); + + if (srv_immediate_scrub_data_uncompressed +#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32 + || space->is_compressed() +#endif + ) + mtr->add_freed_offset(space, page); + + ++buf_pool.stat.n_page_gets; + const page_id_t page_id(space->id, page); + buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold()); + uint32_t fix; + buf_block_t *block; + { + transactional_shared_lock_guard<page_hash_latch> g + {buf_pool.page_hash.lock_get(chain)}; + block= reinterpret_cast<buf_block_t*> + (buf_pool.page_hash.get(page_id, chain)); + if (!block || !block->page.frame) + /* FIXME: convert ROW_FORMAT=COMPRESSED, without buf_zip_decompress() */ + return; + /* To avoid a deadlock with buf_LRU_free_page() of some other page + and buf_page_write_complete() of this page, we must not wait for a + page latch while holding a page_hash latch. */ + fix= block->page.fix(); + } + + if (UNIV_UNLIKELY(fix < buf_page_t::UNFIXED)) + { + block->page.unfix(); + return; + } + + block->page.lock.x_lock(); + if (block->page.is_ibuf_exist()) + ibuf_merge_or_delete_for_page(nullptr, page_id, block->page.zip_size()); +#ifdef BTR_CUR_HASH_ADAPT + if (block->index) + btr_search_drop_page_hash_index(block, false); +#endif /* BTR_CUR_HASH_ADAPT */ + block->page.set_freed(block->page.state()); + mtr->memo_push(block, MTR_MEMO_PAGE_X_MODIFY); +} + +/** Get read access to a compressed page (usually of type +FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2). +The page must be released with unfix(). +NOTE: the page is not protected by any latch. Mutual exclusion has to +be implemented at a higher level. In other words, all possible +accesses to a given page through this function must be protected by +the same set of mutexes or latches. 
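+A typical usage pattern could look as follows (an illustrative sketch
+only; the caller must release both the S-latch and the fix):
+
+	if (buf_page_t *bpage= buf_page_get_zip(id, zip_size))
+	{
+	  ... read the compressed data at bpage->zip.data ...
+	  bpage->lock.s_unlock();
+	  bpage->unfix();
+	}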
+@param page_id page identifier +@param zip_size ROW_FORMAT=COMPRESSED page size in bytes +@return pointer to the block, s-latched */ +TRANSACTIONAL_TARGET +buf_page_t* buf_page_get_zip(const page_id_t page_id, ulint zip_size) +{ + ut_ad(zip_size); + ut_ad(ut_is_2pow(zip_size)); + ++buf_pool.stat.n_page_gets; + mariadb_increment_pages_accessed(); + + buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold()); + page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain); + buf_page_t *bpage; + +lookup: + for (bool discard_attempted= false;;) + { +#ifndef NO_ELISION + if (xbegin()) + { + if (hash_lock.is_locked()) + xabort(); + bpage= buf_pool.page_hash.get(page_id, chain); + if (!bpage || buf_pool.watch_is_sentinel(*bpage)) + { + xend(); + goto must_read_page; + } + if (!bpage->zip.data) + { + /* There is no ROW_FORMAT=COMPRESSED page. */ + xend(); + return nullptr; + } + if (discard_attempted || !bpage->frame) + { + if (!bpage->lock.s_lock_try()) + xabort(); + xend(); + break; + } + xend(); + } + else +#endif + { + hash_lock.lock_shared(); + bpage= buf_pool.page_hash.get(page_id, chain); + if (!bpage || buf_pool.watch_is_sentinel(*bpage)) + { + hash_lock.unlock_shared(); + goto must_read_page; + } + + ut_ad(bpage->in_file()); + ut_ad(page_id == bpage->id()); + + if (!bpage->zip.data) + { + /* There is no ROW_FORMAT=COMPRESSED page. */ + hash_lock.unlock_shared(); + return nullptr; + } + + if (discard_attempted || !bpage->frame) + { + /* Even when we are holding a hash_lock, it should be + acceptable to wait for a page S-latch here, because + buf_page_t::read_complete() will not wait for buf_pool.mutex, + and because S-latch would not conflict with a U-latch + that would be protecting buf_page_t::write_complete(). */ + bpage->lock.s_lock(); + hash_lock.unlock_shared(); + break; + } + + hash_lock.unlock_shared(); + } + + discard_attempted= true; + mysql_mutex_lock(&buf_pool.mutex); + if (buf_page_t *bpage= buf_pool.page_hash.get(page_id, chain)) + buf_LRU_free_page(bpage, false); + mysql_mutex_unlock(&buf_pool.mutex); + } + + { + ut_d(const auto s=) bpage->fix(); + ut_ad(s >= buf_page_t::UNFIXED); + ut_ad(s < buf_page_t::READ_FIX || s >= buf_page_t::WRITE_FIX); + } + + bpage->set_accessed(); + buf_page_make_young_if_needed(bpage); + +#ifdef UNIV_DEBUG + if (!(++buf_dbg_counter % 5771)) buf_pool.validate(); +#endif /* UNIV_DEBUG */ + return bpage; + +must_read_page: + switch (dberr_t err= buf_read_page(page_id, zip_size)) { + case DB_SUCCESS: + case DB_SUCCESS_LOCKED_REC: + mariadb_increment_pages_read(); + goto lookup; + default: + ib::error() << "Reading compressed page " << page_id + << " failed with error: " << err; + return nullptr; + } +} + +/********************************************************************//** +Initialize some fields of a control block. */ +UNIV_INLINE +void +buf_block_init_low( +/*===============*/ + buf_block_t* block) /*!< in: block to init */ +{ +#ifdef BTR_CUR_HASH_ADAPT + /* No adaptive hash index entries may point to a previously + unused (and now freshly allocated) block. */ + assert_block_ahi_empty_on_init(block); + block->index = NULL; + + block->n_hash_helps = 0; + block->n_fields = 1; + block->n_bytes = 0; + block->left_side = TRUE; +#endif /* BTR_CUR_HASH_ADAPT */ +} + +/********************************************************************//** +Decompress a block. 
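+The compressed copy at block->page.zip.data is inflated into
+block->page.frame. Page types that are stored uncompressed even in a
+ROW_FORMAT=COMPRESSED tablespace (such as FIL_PAGE_TYPE_ALLOCATED) are
+simply copied over.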
+@return TRUE if successful */
+ibool
+buf_zip_decompress(
+/*===============*/
+	buf_block_t*	block,	/*!< in/out: block */
+	ibool		check)	/*!< in: TRUE=verify the page checksum */
+{
+	const byte*	frame = block->page.zip.data;
+	ulint		size = page_zip_get_size(&block->page.zip);
+	/* The tablespace will not be found if this function is called
+	during IMPORT. */
+	fil_space_t* space= fil_space_t::get(block->page.id().space());
+	const unsigned key_version = mach_read_from_4(
+		frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
+	fil_space_crypt_t* crypt_data = space ? space->crypt_data : NULL;
+	const bool encrypted = crypt_data
+		&& crypt_data->type != CRYPT_SCHEME_UNENCRYPTED
+		&& (!crypt_data->is_default_encryption()
+		    || srv_encrypt_tables);
+
+	ut_ad(block->zip_size());
+	ut_a(block->page.id().space() != 0);
+
+	if (UNIV_UNLIKELY(check && !page_zip_verify_checksum(frame, size))) {
+
+		ib::error() << "Compressed page checksum mismatch for "
+			<< (space ? space->chain.start->name : "")
+			<< block->page.id() << ": stored: "
+			<< mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM)
+			<< ", crc32: "
+			<< page_zip_calc_checksum(frame, size, false)
+			<< " adler32: "
+			<< page_zip_calc_checksum(frame, size, true);
+		goto err_exit;
+	}
+
+	switch (fil_page_get_type(frame)) {
+	case FIL_PAGE_INDEX:
+	case FIL_PAGE_RTREE:
+		if (page_zip_decompress(&block->page.zip,
+					block->page.frame, TRUE)) {
+func_exit:
+			if (space) {
+				space->release();
+			}
+			return(TRUE);
+		}
+
+		ib::error() << "Unable to decompress "
+			<< (space ? space->chain.start->name : "")
+			<< block->page.id();
+		goto err_exit;
+	case FIL_PAGE_TYPE_ALLOCATED:
+	case FIL_PAGE_INODE:
+	case FIL_PAGE_IBUF_BITMAP:
+	case FIL_PAGE_TYPE_FSP_HDR:
+	case FIL_PAGE_TYPE_XDES:
+	case FIL_PAGE_TYPE_ZBLOB:
+	case FIL_PAGE_TYPE_ZBLOB2:
+		/* Copy to uncompressed storage. */
+		memcpy(block->page.frame, frame, block->zip_size());
+		goto func_exit;
+	}
+
+	ib::error() << "Unknown compressed page type "
+		<< fil_page_get_type(frame)
+		<< " in " << (space ? space->chain.start->name : "")
+		<< block->page.id();
+
+err_exit:
+	if (encrypted) {
+		ib::info() << "Row compressed page could be encrypted"
+			" with key_version " << key_version;
+	}
+
+	if (space) {
+		space->release();
+	}
+
+	return(FALSE);
+}
+
+/** Low level function used to get access to a database page.
+@param[in]	page_id		page id
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	rw_latch	RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
+@param[in]	guess		guessed block or NULL
+@param[in]	mode		BUF_GET, BUF_GET_IF_IN_POOL,
+BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH
+@param[in]	mtr		mini-transaction
+@param[out]	err		DB_SUCCESS or error code
+@param[in]	allow_ibuf_merge	Allow change buffer merge while
+reading the page from file
+@return pointer to the block or NULL */
+TRANSACTIONAL_TARGET
+buf_block_t*
+buf_page_get_low(
+	const page_id_t		page_id,
+	ulint			zip_size,
+	ulint			rw_latch,
+	buf_block_t*		guess,
+	ulint			mode,
+	mtr_t*			mtr,
+	dberr_t*		err,
+	bool			allow_ibuf_merge)
+{
+	unsigned	access_time;
+	ulint		retries = 0;
+
+	ut_ad(!mtr || mtr->is_active());
+	ut_ad(mtr || mode == BUF_PEEK_IF_IN_POOL);
+	ut_ad((rw_latch == RW_S_LATCH)
+	      || (rw_latch == RW_X_LATCH)
+	      || (rw_latch == RW_SX_LATCH)
+	      || (rw_latch == RW_NO_LATCH));
+
+	if (err) {
+		*err = DB_SUCCESS;
+	}
+
+#ifdef UNIV_DEBUG
+	switch (mode) {
+	default:
+		ut_ad(!allow_ibuf_merge);
+		ut_ad(mode == BUF_PEEK_IF_IN_POOL);
+		break;
+	case BUF_GET_POSSIBLY_FREED:
+	case BUF_GET_IF_IN_POOL:
+		/* The caller may pass a dummy page size,
+		because it does not really matter. */
+		break;
+	case BUF_GET:
+	case BUF_GET_IF_IN_POOL_OR_WATCH:
+		ut_ad(!mtr->is_freeing_tree());
+		fil_space_t* s = fil_space_get(page_id.space());
+		ut_ad(s);
+		ut_ad(s->zip_size() == zip_size);
+	}
+#endif /* UNIV_DEBUG */
+
+	ut_ad(!mtr || !ibuf_inside(mtr)
+	      || ibuf_page_low(page_id, zip_size, FALSE, NULL));
+
+	++buf_pool.stat.n_page_gets;
+	mariadb_increment_pages_accessed();
+
+	auto& chain= buf_pool.page_hash.cell_get(page_id.fold());
+	page_hash_latch& hash_lock = buf_pool.page_hash.lock_get(chain);
+loop:
+	buf_block_t* block = guess;
+	uint32_t state;
+
+	if (block) {
+		transactional_shared_lock_guard<page_hash_latch> g{hash_lock};
+		if (buf_pool.is_uncompressed(block)
+		    && page_id == block->page.id()) {
+			ut_ad(!block->page.in_zip_hash);
+			state = block->page.state();
+			/* Ignore guesses that point to read-fixed blocks.
+			We can only avoid a race condition by
+			looking up the block via buf_pool.page_hash. */
+			if ((state >= buf_page_t::FREED
+			     && state < buf_page_t::READ_FIX)
+			    || state >= buf_page_t::WRITE_FIX) {
+				state = block->page.fix();
+				goto got_block;
+			}
+		}
+	}
+
+	guess = nullptr;
+
+	/* A memory transaction would frequently be aborted here. */
+	hash_lock.lock_shared();
+	block = reinterpret_cast<buf_block_t*>(
+		buf_pool.page_hash.get(page_id, chain));
+	if (UNIV_LIKELY(block
+			&& !buf_pool.watch_is_sentinel(block->page))) {
+		state = block->page.fix();
+		hash_lock.unlock_shared();
+		goto got_block;
+	}
+	hash_lock.unlock_shared();
+
+	/* Page not in buf_pool: needs to be read from file */
+	switch (mode) {
+	case BUF_GET_IF_IN_POOL:
+	case BUF_PEEK_IF_IN_POOL:
+		return nullptr;
+	case BUF_GET_IF_IN_POOL_OR_WATCH:
+		/* Buffer-fixing inside watch_set() will prevent eviction */
+		block = reinterpret_cast<buf_block_t*>
+			(buf_pool.watch_set(page_id, chain));
+
+		if (block) {
+			state = block->page.state();
+			goto got_block_fixed;
+		}
+
+		return nullptr;
+	}
+
+	/* The call path is buf_read_page() ->
+	buf_read_page_low() (fil_space_t::io()) ->
+	buf_page_t::read_complete() ->
+	buf_decrypt_after_read(). Here fil_space_t* is used
+	and we decrypt -> buf_page_check_corrupt() where page
+	checksums are compared. Decryption, decompression and
+	error handling take place at a lower level.
+	Here we only need to know whether the page really is
+	corrupted, or if an encrypted page with a valid
+	checksum cannot be decrypted. 
*/ + + switch (dberr_t local_err = buf_read_page(page_id, zip_size)) { + case DB_SUCCESS: + case DB_SUCCESS_LOCKED_REC: + mariadb_increment_pages_read(); + buf_read_ahead_random(page_id, zip_size, ibuf_inside(mtr)); + break; + default: + if (mode != BUF_GET_POSSIBLY_FREED + && retries++ < BUF_PAGE_READ_MAX_RETRIES) { + DBUG_EXECUTE_IF("intermittent_read_failure", + retries = BUF_PAGE_READ_MAX_RETRIES;); + } + /* fall through */ + case DB_PAGE_CORRUPTED: + if (err) { + *err = local_err; + } + return nullptr; + } + + ut_d(if (!(++buf_dbg_counter % 5771)) buf_pool.validate()); + goto loop; + +got_block: + ut_ad(!block->page.in_zip_hash); + state++; +got_block_fixed: + ut_ad(state > buf_page_t::FREED); + + if (state > buf_page_t::READ_FIX && state < buf_page_t::WRITE_FIX) { + if (mode == BUF_PEEK_IF_IN_POOL) { +ignore_block: + ut_ad(mode == BUF_GET_POSSIBLY_FREED + || mode == BUF_PEEK_IF_IN_POOL); + block->unfix(); + if (err) { + *err = DB_CORRUPTION; + } + return nullptr; + } + + if (UNIV_UNLIKELY(!block->page.frame)) { + goto wait_for_unzip; + } + /* A read-fix is released after block->page.lock + in buf_page_t::read_complete() or + buf_pool_t::corrupted_evict(), or + after buf_zip_decompress() in this function. */ + block->page.lock.s_lock(); + state = block->page.state(); + ut_ad(state < buf_page_t::READ_FIX + || state >= buf_page_t::WRITE_FIX); + const page_id_t id{block->page.id()}; + block->page.lock.s_unlock(); + + if (UNIV_UNLIKELY(id != page_id)) { + ut_ad(id == page_id_t{~0ULL}); + block->page.unfix(); + if (++retries < BUF_PAGE_READ_MAX_RETRIES) { + goto loop; + } + + if (err) { + *err = DB_PAGE_CORRUPTED; + } + + return nullptr; + } + } else if (mode != BUF_PEEK_IF_IN_POOL) { + } else if (!mtr) { + ut_ad(!block->page.oldest_modification()); + mysql_mutex_lock(&buf_pool.mutex); + block->unfix(); + +free_unfixed_block: + if (!buf_LRU_free_page(&block->page, true)) { + ut_ad(0); + } + + mysql_mutex_unlock(&buf_pool.mutex); + return nullptr; + } else if (UNIV_UNLIKELY(!block->page.frame)) { + /* The BUF_PEEK_IF_IN_POOL mode is mainly used for dropping an + adaptive hash index. There cannot be an + adaptive hash index for a compressed-only page. */ + goto ignore_block; + } + + ut_ad(mode == BUF_GET_IF_IN_POOL || mode == BUF_PEEK_IF_IN_POOL + || block->zip_size() == zip_size); + + if (UNIV_UNLIKELY(!block->page.frame)) { + if (!block->page.lock.x_lock_try()) { +wait_for_unzip: + /* The page is being read or written, or + another thread is executing buf_zip_decompress() + in buf_page_get_low() on it. */ + block->page.unfix(); + std::this_thread::sleep_for( + std::chrono::microseconds(100)); + goto loop; + } + + buf_block_t *new_block = buf_LRU_get_free_block(false); + buf_block_init_low(new_block); + +wait_for_unfix: + mysql_mutex_lock(&buf_pool.mutex); + page_hash_latch& hash_lock=buf_pool.page_hash.lock_get(chain); + + /* It does not make sense to use + transactional_lock_guard here, because buf_relocate() + would likely make a memory transaction too large. */ + hash_lock.lock(); + + /* block->page.lock implies !block->page.can_relocate() */ + ut_ad(&block->page == buf_pool.page_hash.get(page_id, chain)); + + /* Wait for any other threads to release their buffer-fix + on the compressed-only block descriptor. + FIXME: Never fix() before acquiring the lock. + Only in buf_page_get_gen(), buf_page_get_low(), buf_page_free() + we are violating that principle. 
*/ + state = block->page.state(); + + switch (state) { + case buf_page_t::UNFIXED + 1: + case buf_page_t::IBUF_EXIST + 1: + case buf_page_t::REINIT + 1: + break; + default: + ut_ad(state < buf_page_t::READ_FIX); + + if (state < buf_page_t::UNFIXED + 1) { + ut_ad(state > buf_page_t::FREED); + block->page.lock.x_unlock(); + hash_lock.unlock(); + buf_LRU_block_free_non_file_page(new_block); + mysql_mutex_unlock(&buf_pool.mutex); + goto ignore_block; + } + + mysql_mutex_unlock(&buf_pool.mutex); + hash_lock.unlock(); + std::this_thread::sleep_for( + std::chrono::microseconds(100)); + goto wait_for_unfix; + } + + /* Ensure that another buf_page_get_low() will wait for + new_block->page.lock.x_unlock(). */ + block->page.set_state(buf_page_t::READ_FIX); + + /* Move the compressed page from block->page to new_block, + and uncompress it. */ + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_relocate(&block->page, &new_block->page); + + /* X-latch the block for the duration of the decompression. */ + new_block->page.lock.x_lock(); + ut_d(block->page.lock.x_unlock()); + + buf_flush_relocate_on_flush_list(&block->page, + &new_block->page); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + /* Insert at the front of unzip_LRU list */ + buf_unzip_LRU_add_block(new_block, FALSE); + + mysql_mutex_unlock(&buf_pool.mutex); + hash_lock.unlock(); + +#if defined SUX_LOCK_GENERIC || defined UNIV_DEBUG + block->page.lock.free(); +#endif + ut_free(reinterpret_cast<buf_page_t*>(block)); + block = new_block; + + buf_pool.n_pend_unzip++; + + access_time = block->page.is_accessed(); + + if (!access_time && !recv_no_ibuf_operations + && ibuf_page_exists(block->page.id(), block->zip_size())) { + state = buf_page_t::IBUF_EXIST + 1; + } + + /* Decompress the page while not holding + buf_pool.mutex. */ + const auto ok = buf_zip_decompress(block, false); + --buf_pool.n_pend_unzip; + if (!ok) { + if (err) { + *err = DB_PAGE_CORRUPTED; + } + mysql_mutex_lock(&buf_pool.mutex); + } + state = block->page.read_unfix(state); + block->page.lock.x_unlock(); + + if (!ok) { + goto free_unfixed_block; + } + } + +#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG +re_evict: + if (mode != BUF_GET_IF_IN_POOL + && mode != BUF_GET_IF_IN_POOL_OR_WATCH) { + } else if (!ibuf_debug || recv_recovery_is_on()) { + } else if (fil_space_t* space = fil_space_t::get(page_id.space())) { + for (ulint i = 0; i < mtr->get_savepoint(); i++) { + if (buf_block_t* b = mtr->block_at_savepoint(i)) { + if (b->page.oldest_modification() > 2 + && b->page.lock.have_any()) { + /* We are holding a dirty page latch + that would hang buf_flush_sync(). */ + space->release(); + goto re_evict_fail; + } + } + } + + /* Try to evict the block from the buffer pool, to use the + insert buffer (change buffer) as much as possible. */ + + mysql_mutex_lock(&buf_pool.mutex); + + block->unfix(); + + /* Blocks cannot be relocated or enter or exit the + buf_pool while we are holding the buf_pool.mutex. 
*/ + const bool evicted = buf_LRU_free_page(&block->page, true); + space->release(); + + if (!evicted) { + block->fix(); + } + + mysql_mutex_unlock(&buf_pool.mutex); + + if (evicted) { + if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) { + buf_pool.watch_set(page_id, chain); + } + return(NULL); + } + + buf_flush_sync(); + + state = block->page.state(); + + if (state == buf_page_t::UNFIXED + 1 + && !block->page.oldest_modification()) { + goto re_evict; + } + + /* Failed to evict the page; change it directly */ + } +re_evict_fail: +#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ + + if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED)) { + goto ignore_block; + } + ut_ad((~buf_page_t::LRU_MASK) & state); + ut_ad(state > buf_page_t::WRITE_FIX || state < buf_page_t::READ_FIX); + +#ifdef UNIV_DEBUG + if (!(++buf_dbg_counter % 5771)) buf_pool.validate(); +#endif /* UNIV_DEBUG */ + ut_ad(block->page.frame); + + if (state >= buf_page_t::UNFIXED + && allow_ibuf_merge + && fil_page_get_type(block->page.frame) == FIL_PAGE_INDEX + && page_is_leaf(block->page.frame)) { + block->page.lock.x_lock(); + ut_ad(block->page.id() == page_id + || (state >= buf_page_t::READ_FIX + && state < buf_page_t::WRITE_FIX)); + +#ifdef BTR_CUR_HASH_ADAPT + btr_search_drop_page_hash_index(block, true); +#endif /* BTR_CUR_HASH_ADAPT */ + + dberr_t e; + + if (UNIV_UNLIKELY(block->page.id() != page_id)) { +page_id_mismatch: + state = block->page.state(); + e = DB_CORRUPTION; +ibuf_merge_corrupted: + if (err) { + *err = e; + } + + if (block->page.id().is_corrupted()) { + buf_pool.corrupted_evict(&block->page, state); + } + return nullptr; + } + + state = block->page.state(); + ut_ad(state < buf_page_t::READ_FIX); + + if (state >= buf_page_t::IBUF_EXIST + && state < buf_page_t::REINIT) { + block->page.clear_ibuf_exist(); + e = ibuf_merge_or_delete_for_page(block, page_id, + block->zip_size()); + if (UNIV_UNLIKELY(e != DB_SUCCESS)) { + goto ibuf_merge_corrupted; + } + } + + if (rw_latch == RW_X_LATCH) { + goto get_latch_valid; + } else { + block->page.lock.x_unlock(); + goto get_latch; + } + } else { +get_latch: + switch (rw_latch) { + case RW_NO_LATCH: + mtr->memo_push(block, MTR_MEMO_BUF_FIX); + return block; + case RW_S_LATCH: + block->page.lock.s_lock(); + ut_ad(!block->page.is_read_fixed()); + if (UNIV_UNLIKELY(block->page.id() != page_id)) { + block->page.lock.s_unlock(); + block->page.lock.x_lock(); + goto page_id_mismatch; + } +get_latch_valid: + mtr->memo_push(block, mtr_memo_type_t(rw_latch)); +#ifdef BTR_CUR_HASH_ADAPT + btr_search_drop_page_hash_index(block, true); +#endif /* BTR_CUR_HASH_ADAPT */ + break; + case RW_SX_LATCH: + block->page.lock.u_lock(); + ut_ad(!block->page.is_io_fixed()); + if (UNIV_UNLIKELY(block->page.id() != page_id)) { + block->page.lock.u_x_upgrade(); + goto page_id_mismatch; + } + goto get_latch_valid; + default: + ut_ad(rw_latch == RW_X_LATCH); + if (block->page.lock.x_lock_upgraded()) { + ut_ad(block->page.id() == page_id); + block->unfix(); + mtr->page_lock_upgrade(*block); + return block; + } + if (UNIV_UNLIKELY(block->page.id() != page_id)) { + goto page_id_mismatch; + } + goto get_latch_valid; + } + + ut_ad(page_id_t(page_get_space_id(block->page.frame), + page_get_page_no(block->page.frame)) + == page_id); + + if (mode == BUF_GET_POSSIBLY_FREED + || mode == BUF_PEEK_IF_IN_POOL) { + return block; + } + + const bool not_first_access{block->page.set_accessed()}; + buf_page_make_young_if_needed(&block->page); + if (!not_first_access) { + buf_read_ahead_linear(page_id, block->zip_size(), + ibuf_inside(mtr)); + 
} + } + + return block; +} + +/** Get access to a database page. Buffered redo log may be applied. +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH +@param[in] guess guessed block or NULL +@param[in] mode BUF_GET, BUF_GET_IF_IN_POOL, +BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH +@param[in,out] mtr mini-transaction, or NULL +@param[out] err DB_SUCCESS or error code +@param[in] allow_ibuf_merge Allow change buffer merge while +reading the pages from file. +@return pointer to the block or NULL */ +buf_block_t* +buf_page_get_gen( + const page_id_t page_id, + ulint zip_size, + ulint rw_latch, + buf_block_t* guess, + ulint mode, + mtr_t* mtr, + dberr_t* err, + bool allow_ibuf_merge) +{ + buf_block_t *block= recv_sys.recover(page_id); + if (UNIV_LIKELY(!block)) + return buf_page_get_low(page_id, zip_size, rw_latch, + guess, mode, mtr, err, allow_ibuf_merge); + else if (UNIV_UNLIKELY(block == reinterpret_cast<buf_block_t*>(-1))) + { + corrupted: + if (err) + *err= DB_CORRUPTION; + return nullptr; + } + /* Recovery is a special case; we fix() before acquiring lock. */ + auto s= block->page.fix(); + ut_ad(s >= buf_page_t::FREED); + /* The block may be write-fixed at this point because we are not + holding a lock, but it must not be read-fixed. */ + ut_ad(s < buf_page_t::READ_FIX || s >= buf_page_t::WRITE_FIX); + if (err) + *err= DB_SUCCESS; + const bool must_merge= allow_ibuf_merge && + ibuf_page_exists(page_id, block->zip_size()); + if (s < buf_page_t::UNFIXED) + { + got_freed_page: + ut_ad(mode == BUF_GET_POSSIBLY_FREED || mode == BUF_PEEK_IF_IN_POOL); + mysql_mutex_lock(&buf_pool.mutex); + block->page.unfix(); + buf_LRU_free_page(&block->page, true); + mysql_mutex_unlock(&buf_pool.mutex); + goto corrupted; + } + else if (must_merge && + fil_page_get_type(block->page.frame) == FIL_PAGE_INDEX && + page_is_leaf(block->page.frame)) + { + block->page.lock.x_lock(); + s= block->page.state(); + ut_ad(s > buf_page_t::FREED); + ut_ad(s < buf_page_t::READ_FIX); + if (s < buf_page_t::UNFIXED) + { + block->page.lock.x_unlock(); + goto got_freed_page; + } + else + { + if (block->page.is_ibuf_exist()) + block->page.clear_ibuf_exist(); + if (dberr_t e= + ibuf_merge_or_delete_for_page(block, page_id, block->zip_size())) + { + if (err) + *err= e; + buf_pool.corrupted_evict(&block->page, s); + return nullptr; + } + } + + if (rw_latch == RW_X_LATCH) + { + mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX); + return block; + } + block->page.lock.x_unlock(); + } + mtr->page_lock(block, rw_latch); + return block; +} + +/********************************************************************//** +This is the general function used to get optimistic access to a database +page. 
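+The caller passes the modify_clock value that it observed while it was
+previously holding a latch on the block. If the clock has moved on, or
+the block has been freed or relocated in the meantime, the optimistic
+access fails and the caller has to look the page up again, for example
+via buf_page_get_gen().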
+@return TRUE if success */ +TRANSACTIONAL_TARGET +bool buf_page_optimistic_get(ulint rw_latch, buf_block_t *block, + uint64_t modify_clock, mtr_t *mtr) +{ + ut_ad(block); + ut_ad(mtr); + ut_ad(mtr->is_active()); + ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH); + + if (have_transactional_memory); + else if (UNIV_UNLIKELY(!block->page.frame)) + return false; + else + { + const auto state= block->page.state(); + if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED || + state >= buf_page_t::READ_FIX)) + return false; + } + + bool success; + const page_id_t id{block->page.id()}; + buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold()); + bool have_u_not_x= false; + + { + transactional_shared_lock_guard<page_hash_latch> g + {buf_pool.page_hash.lock_get(chain)}; + if (UNIV_UNLIKELY(id != block->page.id() || !block->page.frame)) + return false; + const auto state= block->page.state(); + if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED || + state >= buf_page_t::READ_FIX)) + return false; + + if (rw_latch == RW_S_LATCH) + success= block->page.lock.s_lock_try(); + else + { + have_u_not_x= block->page.lock.have_u_not_x(); + success= have_u_not_x || block->page.lock.x_lock_try(); + } + } + + if (!success) + return false; + + if (have_u_not_x) + { + block->page.lock.u_x_upgrade(); + mtr->page_lock_upgrade(*block); + ut_ad(id == block->page.id()); + ut_ad(modify_clock == block->modify_clock); + } + else + { + ut_ad(rw_latch == RW_S_LATCH || !block->page.is_io_fixed()); + ut_ad(id == block->page.id()); + ut_ad(!ibuf_inside(mtr) || ibuf_page(id, block->zip_size(), nullptr)); + + if (modify_clock != block->modify_clock || block->page.is_freed()) + { + if (rw_latch == RW_S_LATCH) + block->page.lock.s_unlock(); + else + block->page.lock.x_unlock(); + return false; + } + + block->page.fix(); + ut_ad(!block->page.is_read_fixed()); + block->page.set_accessed(); + buf_page_make_young_if_needed(&block->page); + mtr->memo_push(block, mtr_memo_type_t(rw_latch)); + } + + ut_d(if (!(++buf_dbg_counter % 5771)) buf_pool.validate()); + ut_d(const auto state = block->page.state()); + ut_ad(state > buf_page_t::UNFIXED); + ut_ad(state < buf_page_t::READ_FIX || state > buf_page_t::WRITE_FIX); + ut_ad(~buf_page_t::LRU_MASK & state); + ut_ad(block->page.frame); + + return true; +} + +/** Try to S-latch a page. +Suitable for using when holding the lock_sys latches (as it avoids deadlock). +@param[in] page_id page identifier +@param[in,out] mtr mini-transaction +@return the block +@retval nullptr if an S-latch cannot be granted immediately */ +TRANSACTIONAL_TARGET +buf_block_t *buf_page_try_get(const page_id_t page_id, mtr_t *mtr) +{ + ut_ad(mtr); + ut_ad(mtr->is_active()); + buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold()); + buf_block_t *block; + + { + transactional_shared_lock_guard<page_hash_latch> g + {buf_pool.page_hash.lock_get(chain)}; + block= reinterpret_cast<buf_block_t*> + (buf_pool.page_hash.get(page_id, chain)); + if (!block || !block->page.frame || !block->page.lock.s_lock_try()) + return nullptr; + } + + block->page.fix(); + ut_ad(!block->page.is_read_fixed()); + mtr->memo_push(block, MTR_MEMO_PAGE_S_FIX); + +#ifdef UNIV_DEBUG + if (!(++buf_dbg_counter % 5771)) buf_pool.validate(); +#endif /* UNIV_DEBUG */ + ut_ad(block->page.buf_fix_count()); + ut_ad(block->page.id() == page_id); + + ++buf_pool.stat.n_page_gets; + mariadb_increment_pages_accessed(); + return block; +} + +/** Initialize the block. 
+@param page_id page identifier +@param zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param fix initial buf_fix_count() */ +void buf_block_t::initialise(const page_id_t page_id, ulint zip_size, + uint32_t fix) +{ + ut_ad(!page.in_file()); + buf_block_init_low(this); + page.init(fix, page_id); + page.set_os_used(); + page_zip_set_size(&page.zip, zip_size); +} + +TRANSACTIONAL_TARGET +static buf_block_t *buf_page_create_low(page_id_t page_id, ulint zip_size, + mtr_t *mtr, buf_block_t *free_block) +{ + ut_ad(mtr->is_active()); + ut_ad(page_id.space() != 0 || !zip_size); + + free_block->initialise(page_id, zip_size, buf_page_t::MEMORY); + + buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold()); +retry: + mysql_mutex_lock(&buf_pool.mutex); + + buf_page_t *bpage= buf_pool.page_hash.get(page_id, chain); + + if (bpage && !buf_pool.watch_is_sentinel(*bpage)) + { +#ifdef BTR_CUR_HASH_ADAPT + const dict_index_t *drop_hash_entry= nullptr; +#endif + bool ibuf_exist= false; + + if (!mtr->have_x_latch(reinterpret_cast<const buf_block_t&>(*bpage))) + { + const bool got= bpage->lock.x_lock_try(); + if (!got) + { + mysql_mutex_unlock(&buf_pool.mutex); + bpage->lock.x_lock(); + const page_id_t id{bpage->id()}; + if (UNIV_UNLIKELY(id != page_id)) + { + ut_ad(id.is_corrupted()); + bpage->lock.x_unlock(); + goto retry; + } + mysql_mutex_lock(&buf_pool.mutex); + } + + auto state= bpage->fix(); + ut_ad(state >= buf_page_t::FREED); + ut_ad(state < buf_page_t::READ_FIX); + + if (state < buf_page_t::UNFIXED) + bpage->set_reinit(buf_page_t::FREED); + else + { + bpage->set_reinit(state & buf_page_t::LRU_MASK); + ibuf_exist= (state & buf_page_t::LRU_MASK) == buf_page_t::IBUF_EXIST; + } + + if (UNIV_LIKELY(bpage->frame != nullptr)) + { + mysql_mutex_unlock(&buf_pool.mutex); + buf_block_t *block= reinterpret_cast<buf_block_t*>(bpage); + mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX); +#ifdef BTR_CUR_HASH_ADAPT + drop_hash_entry= block->index; +#endif + } + else + { + auto state= bpage->state(); + ut_ad(state >= buf_page_t::FREED); + ut_ad(state < buf_page_t::READ_FIX); + + page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain); + /* It does not make sense to use transactional_lock_guard here, + because buf_relocate() would likely make the memory transaction + too large. */ + hash_lock.lock(); + + if (state < buf_page_t::UNFIXED) + bpage->set_reinit(buf_page_t::FREED); + else + { + bpage->set_reinit(state & buf_page_t::LRU_MASK); + ibuf_exist= (state & buf_page_t::LRU_MASK) == buf_page_t::IBUF_EXIST; + } + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_relocate(bpage, &free_block->page); + free_block->page.lock.x_lock(); + buf_flush_relocate_on_flush_list(bpage, &free_block->page); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + buf_unzip_LRU_add_block(free_block, FALSE); + + mysql_mutex_unlock(&buf_pool.mutex); + hash_lock.unlock(); +#if defined SUX_LOCK_GENERIC || defined UNIV_DEBUG + bpage->lock.x_unlock(); + bpage->lock.free(); +#endif + ut_free(bpage); + mtr->memo_push(free_block, MTR_MEMO_PAGE_X_FIX); + bpage= &free_block->page; + } + } + else + { + mysql_mutex_unlock(&buf_pool.mutex); + ut_ad(bpage->frame); +#ifdef BTR_CUR_HASH_ADAPT + ut_ad(!reinterpret_cast<buf_block_t*>(bpage)->index); +#endif + const auto state= bpage->state(); + ut_ad(state >= buf_page_t::FREED); + bpage->set_reinit(state < buf_page_t::UNFIXED ? 
buf_page_t::FREED + : state & buf_page_t::LRU_MASK); + } + +#ifdef BTR_CUR_HASH_ADAPT + if (drop_hash_entry) + btr_search_drop_page_hash_index(reinterpret_cast<buf_block_t*>(bpage), + false); +#endif /* BTR_CUR_HASH_ADAPT */ + + if (ibuf_exist && !recv_recovery_is_on()) + ibuf_merge_or_delete_for_page(nullptr, page_id, zip_size); + + return reinterpret_cast<buf_block_t*>(bpage); + } + + /* If we get here, the page was not in buf_pool: init it there */ + + DBUG_PRINT("ib_buf", ("create page %u:%u", + page_id.space(), page_id.page_no())); + + bpage= &free_block->page; + + ut_ad(bpage->state() == buf_page_t::MEMORY); + bpage->lock.x_lock(); + + /* The block must be put to the LRU list */ + buf_LRU_add_block(bpage, false); + { + transactional_lock_guard<page_hash_latch> g + {buf_pool.page_hash.lock_get(chain)}; + bpage->set_state(buf_page_t::REINIT + 1); + buf_pool.page_hash.append(chain, bpage); + } + + if (UNIV_UNLIKELY(zip_size)) + { + bpage->zip.data= buf_buddy_alloc(zip_size); + + /* To maintain the invariant block->in_unzip_LRU_list == + block->page.belongs_to_unzip_LRU() we have to add this + block to unzip_LRU after block->page.zip.data is set. */ + ut_ad(bpage->belongs_to_unzip_LRU()); + buf_unzip_LRU_add_block(reinterpret_cast<buf_block_t*>(bpage), FALSE); + } + + buf_pool.stat.n_pages_created++; + mysql_mutex_unlock(&buf_pool.mutex); + + mtr->memo_push(reinterpret_cast<buf_block_t*>(bpage), MTR_MEMO_PAGE_X_FIX); + + bpage->set_accessed(); + + /* Delete possible entries for the page from the insert buffer: + such can exist if the page belonged to an index which was dropped */ + if (page_id < page_id_t{SRV_SPACE_ID_UPPER_BOUND, 0} && + !srv_is_undo_tablespace(page_id.space()) && + !recv_recovery_is_on()) + ibuf_merge_or_delete_for_page(nullptr, page_id, zip_size); + + static_assert(FIL_PAGE_PREV + 4 == FIL_PAGE_NEXT, "adjacent"); + memset_aligned<8>(bpage->frame + FIL_PAGE_PREV, 0xff, 8); + mach_write_to_2(bpage->frame + FIL_PAGE_TYPE, FIL_PAGE_TYPE_ALLOCATED); + + /* FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION is only used on the + following pages: + (1) The first page of the InnoDB system tablespace (page 0:0) + (2) FIL_RTREE_SPLIT_SEQ_NUM on R-tree pages + (3) key_version on encrypted pages (not page 0:0) */ + + memset(bpage->frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8); + memset_aligned<8>(bpage->frame + FIL_PAGE_LSN, 0, 8); + +#ifdef UNIV_DEBUG + if (!(++buf_dbg_counter % 5771)) buf_pool.validate(); +#endif /* UNIV_DEBUG */ + return reinterpret_cast<buf_block_t*>(bpage); +} + +/** Initialize a page in the buffer pool. The page is usually not read +from a file even if it cannot be found in the buffer buf_pool. This is one +of the functions which perform to a block a state transition NOT_USED => +FILE_PAGE (the other is buf_page_get_gen). 
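+It is typically used when a page is being (re)allocated within a
+tablespace, so that any previous on-disk contents of the page can be
+discarded; the frame is initialized as FIL_PAGE_TYPE_ALLOCATED with
+FIL_PAGE_PREV and FIL_PAGE_NEXT reset to FIL_NULL.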
+@param[in,out]	space		space object
+@param[in]	offset		page number within the tablespace,
+				or deferred space id if the space
+				object is null
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out]	mtr		mini-transaction
+@param[in,out]	free_block	pre-allocated buffer block
+@return pointer to the block, page buffer-fixed */
+buf_block_t*
+buf_page_create(fil_space_t *space, uint32_t offset,
+		ulint zip_size, mtr_t *mtr, buf_block_t *free_block)
+{
+  space->free_page(offset, false);
+  return buf_page_create_low({space->id, offset}, zip_size, mtr, free_block);
+}
+
+/** Initialize a page in the buffer pool while initializing a
+deferred tablespace
+@param space_id	space identifier
+@param zip_size	ROW_FORMAT=COMPRESSED page size or 0
+@param mtr	mini-transaction
+@param free_block	pre-allocated buffer block
+@return pointer to the block, page buffer-fixed */
+buf_block_t* buf_page_create_deferred(uint32_t space_id, ulint zip_size,
+				      mtr_t *mtr, buf_block_t *free_block)
+{
+  return buf_page_create_low({space_id, 0}, zip_size, mtr, free_block);
+}
+
+/** Monitor the buffer page read/write activity, and increment corresponding
+counter value in MONITOR_MODULE_BUF_PAGE.
+@param bpage	buffer page whose read or write was completed
+@param read	true=read, false=write */
+ATTRIBUTE_COLD void buf_page_monitor(const buf_page_t &bpage, bool read)
+{
+	monitor_id_t	counter;
+
+	const byte* frame = bpage.zip.data ? bpage.zip.data : bpage.frame;
+
+	switch (fil_page_get_type(frame)) {
+		ulint	level;
+	case FIL_PAGE_TYPE_INSTANT:
+	case FIL_PAGE_INDEX:
+	case FIL_PAGE_RTREE:
+		level = btr_page_get_level(frame);
+
+		/* Check if it is an index page for insert buffer */
+		if (fil_page_get_type(frame) == FIL_PAGE_INDEX
+		    && btr_page_get_index_id(frame)
+		    == (index_id_t)(DICT_IBUF_ID_MIN + IBUF_SPACE_ID)) {
+			if (level == 0) {
+				counter = MONITOR_RW_COUNTER(
+					read, MONITOR_INDEX_IBUF_LEAF_PAGE);
+			} else {
+				counter = MONITOR_RW_COUNTER(
+					read,
+					MONITOR_INDEX_IBUF_NON_LEAF_PAGE);
+			}
+		} else {
+			if (level == 0) {
+				counter = MONITOR_RW_COUNTER(
+					read, MONITOR_INDEX_LEAF_PAGE);
+			} else {
+				counter = MONITOR_RW_COUNTER(
+					read, MONITOR_INDEX_NON_LEAF_PAGE);
+			}
+		}
+		break;
+
+	case FIL_PAGE_UNDO_LOG:
+		counter = MONITOR_RW_COUNTER(read, MONITOR_UNDO_LOG_PAGE);
+		break;
+
+	case FIL_PAGE_INODE:
+		counter = MONITOR_RW_COUNTER(read, MONITOR_INODE_PAGE);
+		break;
+
+	case FIL_PAGE_IBUF_FREE_LIST:
+		counter = MONITOR_RW_COUNTER(read, MONITOR_IBUF_FREELIST_PAGE);
+		break;
+
+	case FIL_PAGE_IBUF_BITMAP:
+		counter = MONITOR_RW_COUNTER(read, MONITOR_IBUF_BITMAP_PAGE);
+		break;
+
+	case FIL_PAGE_TYPE_SYS:
+		counter = MONITOR_RW_COUNTER(read, MONITOR_SYSTEM_PAGE);
+		break;
+
+	case FIL_PAGE_TYPE_TRX_SYS:
+		counter = MONITOR_RW_COUNTER(read, MONITOR_TRX_SYSTEM_PAGE);
+		break;
+
+	case FIL_PAGE_TYPE_FSP_HDR:
+		counter = MONITOR_RW_COUNTER(read, MONITOR_FSP_HDR_PAGE);
+		break;
+
+	case FIL_PAGE_TYPE_XDES:
+		counter = MONITOR_RW_COUNTER(read, MONITOR_XDES_PAGE);
+		break;
+
+	case FIL_PAGE_TYPE_BLOB:
+		counter = MONITOR_RW_COUNTER(read, MONITOR_BLOB_PAGE);
+		break;
+
+	case FIL_PAGE_TYPE_ZBLOB:
+		counter = MONITOR_RW_COUNTER(read, MONITOR_ZBLOB_PAGE);
+		break;
+
+	case FIL_PAGE_TYPE_ZBLOB2:
+		counter = MONITOR_RW_COUNTER(read, MONITOR_ZBLOB2_PAGE);
+		break;
+
+	default:
+		counter = MONITOR_RW_COUNTER(read, MONITOR_OTHER_PAGE);
+	}
+
+	MONITOR_INC_NOCHECK(counter);
+}
+
+/** Check if the encrypted page is corrupted for the full crc32 format. 
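+Besides the stored tablespace identifier, this checks (for uncompressed
+pages) that the least significant 32 bits of FIL_PAGE_LSN match the
+copy that the full_crc32 format stores at the end of the page, right
+before the checksum.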
+@param[in]	space_id	tablespace id to which the page belongs
+@param[in]	d		page
+@param[in]	is_compressed	whether the page is compressed
+@return true if page is corrupted or false if it isn't */
+static bool buf_page_full_crc32_is_corrupted(ulint space_id, const byte* d,
+					     bool is_compressed)
+{
+  if (space_id != mach_read_from_4(d + FIL_PAGE_SPACE_ID))
+    return true;
+
+  static_assert(FIL_PAGE_LSN % 4 == 0, "alignment");
+  static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment");
+
+  return !is_compressed &&
+    memcmp_aligned<4>(FIL_PAGE_LSN + 4 + d,
+		      d + srv_page_size - FIL_PAGE_FCRC32_END_LSN, 4);
+}
+
+/** Check if page is maybe compressed, encrypted or both when we encounter
+corrupted page. Note that we can't be 100% sure if page is corrupted
+or decrypt/decompress just failed.
+@param[in,out]	bpage		page
+@param[in]	node		data file
+@return	whether the operation succeeded
+@retval	DB_SUCCESS	if page has been read and is not corrupted
+@retval	DB_PAGE_CORRUPTED	if page based on checksum check is corrupted
+@retval	DB_DECRYPTION_FAILED	if page post encryption checksum matches but
+after decryption normal page checksum does not match. */
+static dberr_t buf_page_check_corrupt(buf_page_t *bpage,
+				      const fil_node_t &node)
+{
+	ut_ad(node.space->referenced());
+
+	byte* dst_frame = bpage->zip.data ? bpage->zip.data : bpage->frame;
+	dberr_t err = DB_SUCCESS;
+	uint key_version = buf_page_get_key_version(dst_frame,
+						    node.space->flags);
+
+	/* In buf_decrypt_after_read we have decrypted the page if the
+	post-encryption checksum matched and the key_id that was used is
+	found in the encryption plugin. If the checksum did not match, the
+	page was not decrypted; it could be encrypted and corrupted,
+	plainly corrupted, or a good page. Even if we did decrypt, the
+	page could still be corrupted if the key that was used does not
+	match. */
+	const bool seems_encrypted = !node.space->full_crc32() && key_version
+		&& node.space->crypt_data
+		&& node.space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED;
+	ut_ad(node.space->purpose != FIL_TYPE_TEMPORARY ||
+	      node.space->full_crc32());
+
+	/* If the traditional checksum matches, we assume that the page is
+	no longer encrypted. */
+	if (node.space->full_crc32()
+	    && !buf_is_zeroes(span<const byte>(dst_frame,
+					       node.space->physical_size()))
+	    && (key_version || node.space->is_compressed()
+		|| node.space->purpose == FIL_TYPE_TEMPORARY)) {
+		if (buf_page_full_crc32_is_corrupted(
+			    bpage->id().space(), dst_frame,
+			    node.space->is_compressed())) {
+			err = DB_PAGE_CORRUPTED;
+		}
+	} else if (buf_page_is_corrupted(true, dst_frame, node.space->flags)) {
+		err = DB_PAGE_CORRUPTED;
+	}
+
+	if (seems_encrypted && err == DB_PAGE_CORRUPTED
+	    && bpage->id().page_no() != 0) {
+		err = DB_DECRYPTION_FAILED;
+
+		ib::error()
+			<< "The page " << bpage->id()
+			<< " in file '" << node.name
+			<< "' cannot be decrypted; key_version="
+			<< key_version;
+	}
+
+	return (err);
+}
+
+/** Complete a read of a page.
+@param node	data file
+@return whether the operation succeeded
+@retval DB_PAGE_CORRUPTED	if the checksum fails
+@retval DB_DECRYPTION_FAILED	if the page cannot be decrypted
+@retval DB_FAIL	if the page contains the wrong ID */
+dberr_t buf_page_t::read_complete(const fil_node_t &node)
+{
+  const page_id_t expected_id{id()};
+  ut_ad(is_read_fixed());
+  ut_ad(!buf_dblwr.is_inside(id()));
+  ut_ad(id().space() == node.space->id);
+  ut_ad(zip_size() == node.space->zip_size());
+  ut_ad(!!zip.ssize == !!zip.data);
+
+  const byte *read_frame= zip.data ? 
zip.data : frame; + ut_ad(read_frame); + + dberr_t err; + if (!buf_page_decrypt_after_read(this, node)) + { + err= DB_DECRYPTION_FAILED; + goto database_corrupted; + } + + if (belongs_to_unzip_LRU()) + { + buf_pool.n_pend_unzip++; + auto ok= buf_zip_decompress(reinterpret_cast<buf_block_t*>(this), false); + buf_pool.n_pend_unzip--; + + if (!ok) + { + ib::info() << "Page " << expected_id << " zip_decompress failure."; + err= DB_PAGE_CORRUPTED; + goto database_corrupted; + } + } + + { + const page_id_t read_id(mach_read_from_4(read_frame + FIL_PAGE_SPACE_ID), + mach_read_from_4(read_frame + FIL_PAGE_OFFSET)); + + if (read_id == expected_id); + else if (read_id == page_id_t(0, 0)) + { + /* This is likely an uninitialized (all-zero) page. */ + err= DB_FAIL; + goto release_page; + } + else if (!node.space->full_crc32() && + page_id_t(0, read_id.page_no()) == expected_id) + /* FIL_PAGE_SPACE_ID was written as garbage in the system tablespace + before MySQL 4.1.1, which introduced innodb_file_per_table. */; + else if (node.space->full_crc32() && + *reinterpret_cast<const uint32_t*> + (&read_frame[FIL_PAGE_FCRC32_KEY_VERSION]) && + node.space->crypt_data && + node.space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED) + { + ib::error() << "Cannot decrypt " << expected_id; + err= DB_DECRYPTION_FAILED; + goto release_page; + } + else + { + ib::error() << "Space id and page no stored in the page, read in are " + << read_id << ", should be " << expected_id; + err= DB_PAGE_CORRUPTED; + goto release_page; + } + } + + err= buf_page_check_corrupt(this, node); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) + { +database_corrupted: + if (belongs_to_unzip_LRU()) + memset_aligned<UNIV_PAGE_SIZE_MIN>(frame, 0, srv_page_size); + + if (err == DB_PAGE_CORRUPTED) + { + ib::error() << "Database page corruption on disk" + " or a failed read of file '" + << node.name << "' page " << expected_id + << ". You may have to recover from a backup."; + + buf_page_print(read_frame, zip_size()); + + node.space->set_corrupted(); + + ib::info() << " You can use CHECK TABLE to scan" + " your table for corruption. " + << FORCE_RECOVERY_MSG; + } + + if (!srv_force_recovery) + goto release_page; + } + + if (err == DB_PAGE_CORRUPTED || err == DB_DECRYPTION_FAILED) + { +release_page: + buf_pool.corrupted_evict(this, buf_page_t::READ_FIX); + return err; + } + + const bool recovery= recv_recovery_is_on(); + + if (recovery && !recv_recover_page(node.space, this)) + return DB_PAGE_CORRUPTED; + + const bool ibuf_may_exist= frame && !recv_no_ibuf_operations && + (!expected_id.space() || !is_predefined_tablespace(expected_id.space())) && + fil_page_get_type(read_frame) == FIL_PAGE_INDEX && + page_is_leaf(read_frame); + + if (UNIV_UNLIKELY(MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE))) + buf_page_monitor(*this, true); + DBUG_PRINT("ib_buf", ("read page %u:%u", id().space(), id().page_no())); + + if (!recovery) + { + ut_d(auto f=) zip.fix.fetch_sub(ibuf_may_exist + ? READ_FIX - IBUF_EXIST + : READ_FIX - UNFIXED); + ut_ad(f >= READ_FIX); + ut_ad(f < WRITE_FIX); + } + else if (ibuf_may_exist) + set_ibuf_exist(); + + lock.x_unlock(true); + + return DB_SUCCESS; +} + +#ifdef UNIV_DEBUG +/** Check that all blocks are in a replaceable state. 
+Aborts with a fatal error if a block is still fixed or dirty. */
+void buf_pool_t::assert_all_freed()
+{
+  mysql_mutex_lock(&mutex);
+  const chunk_t *chunk= chunks;
+  for (auto i= n_chunks; i--; chunk++)
+    if (const buf_block_t* block= chunk->not_freed())
+      ib::fatal() << "Page " << block->page.id() << " still fixed or dirty";
+  mysql_mutex_unlock(&mutex);
+}
+#endif /* UNIV_DEBUG */
+
+/** Refresh the statistics used to print per-second averages. */
+void buf_refresh_io_stats()
+{
+	buf_pool.last_printout_time = time(NULL);
+	buf_pool.old_stat = buf_pool.stat;
+}
+
+/** Invalidate all pages in the buffer pool.
+All pages must be in a replaceable state (not modified or latched). */
+void buf_pool_invalidate()
+{
+	mysql_mutex_lock(&buf_pool.mutex);
+
+	/* It is possible that a write batch that has been posted
+	earlier is still not complete. For buffer pool invalidation to
+	proceed we must ensure there is NO write activity happening. */
+
+	ut_d(mysql_mutex_unlock(&buf_pool.mutex));
+	ut_d(buf_pool.assert_all_freed());
+	ut_d(mysql_mutex_lock(&buf_pool.mutex));
+
+	while (UT_LIST_GET_LEN(buf_pool.LRU)) {
+		buf_LRU_scan_and_free_block();
+	}
+
+	ut_ad(UT_LIST_GET_LEN(buf_pool.unzip_LRU) == 0);
+
+	buf_pool.freed_page_clock = 0;
+	buf_pool.LRU_old = NULL;
+	buf_pool.LRU_old_len = 0;
+	buf_pool.stat.init();
+
+	buf_refresh_io_stats();
+	mysql_mutex_unlock(&buf_pool.mutex);
+}
+
+#ifdef UNIV_DEBUG
+/** Validate the buffer pool. */
+void buf_pool_t::validate()
+{
+	ulint		n_lru		= 0;
+	ulint		n_flushing	= 0;
+	ulint		n_free		= 0;
+	ulint		n_zip		= 0;
+
+	mysql_mutex_lock(&mutex);
+
+	chunk_t*	chunk = chunks;
+
+	/* Check the uncompressed blocks. */
+
+	for (auto i = n_chunks; i--; chunk++) {
+		buf_block_t*	block = chunk->blocks;
+
+		for (auto j = chunk->size; j--; block++) {
+			ut_ad(block->page.frame);
+			switch (const auto f = block->page.state()) {
+			case buf_page_t::NOT_USED:
+				n_free++;
+				break;
+
+			case buf_page_t::MEMORY:
+			case buf_page_t::REMOVE_HASH:
+				/* do nothing */
+				break;
+
+			default:
+				if (f >= buf_page_t::READ_FIX
+				    && f < buf_page_t::WRITE_FIX) {
+					/* A read-fixed block is not
+					necessarily in the page_hash yet. */
+					break;
+				}
+				ut_ad(f >= buf_page_t::FREED);
+				const page_id_t id{block->page.id()};
+				ut_ad(page_hash.get(
+					      id,
+					      page_hash.cell_get(id.fold()))
+				      == &block->page);
+				n_lru++;
+			}
+		}
+	}
+
+	/* Check dirty blocks. */
+
+	mysql_mutex_lock(&flush_list_mutex);
+	for (buf_page_t* b = UT_LIST_GET_FIRST(flush_list); b;
+	     b = UT_LIST_GET_NEXT(list, b)) {
+		ut_ad(b->in_file());
+		ut_ad(b->oldest_modification());
+		ut_ad(!fsp_is_system_temporary(b->id().space()));
+		n_flushing++;
+
+		if (UNIV_UNLIKELY(!b->frame)) {
+			n_lru++;
+			n_zip++;
+		}
+		const page_id_t id{b->id()};
+		ut_ad(page_hash.get(id, page_hash.cell_get(id.fold())) == b);
+	}
+
+	ut_ad(UT_LIST_GET_LEN(flush_list) == n_flushing);
+
+	mysql_mutex_unlock(&flush_list_mutex);
+
+	if (n_chunks_new == n_chunks
+	    && n_lru + n_free > curr_size + n_zip) {
+
+		ib::fatal() << "n_LRU " << n_lru << ", n_free " << n_free
+			    << ", pool " << curr_size
+			    << " zip " << n_zip << ". Aborting...";
+	}
+
+	ut_ad(UT_LIST_GET_LEN(LRU) >= n_lru);
+
+	if (n_chunks_new == n_chunks
+	    && UT_LIST_GET_LEN(free) != n_free) {
+
+		ib::fatal() << "Free list len "
+			    << UT_LIST_GET_LEN(free)
+			    << ", free blocks " << n_free << ". 
Aborting..."; + } + + mysql_mutex_unlock(&mutex); + + ut_d(buf_LRU_validate()); + ut_d(buf_flush_validate()); +} +#endif /* UNIV_DEBUG */ + +#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG +/** Write information of the buf_pool to the error log. */ +void buf_pool_t::print() +{ + index_id_t* index_ids; + ulint* counts; + ulint size; + ulint i; + ulint j; + index_id_t id; + ulint n_found; + chunk_t* chunk; + dict_index_t* index; + + size = curr_size; + + index_ids = static_cast<index_id_t*>( + ut_malloc_nokey(size * sizeof *index_ids)); + + counts = static_cast<ulint*>(ut_malloc_nokey(sizeof(ulint) * size)); + + mysql_mutex_lock(&mutex); + mysql_mutex_lock(&flush_list_mutex); + + ib::info() + << "[buffer pool: size=" << curr_size + << ", database pages=" << UT_LIST_GET_LEN(LRU) + << ", free pages=" << UT_LIST_GET_LEN(free) + << ", modified database pages=" + << UT_LIST_GET_LEN(flush_list) + << ", n pending decompressions=" << n_pend_unzip + << ", n pending flush LRU=" << n_flush() + << " list=" << os_aio_pending_writes() + << ", pages made young=" << stat.n_pages_made_young + << ", not young=" << stat.n_pages_not_made_young + << ", pages read=" << stat.n_pages_read + << ", created=" << stat.n_pages_created + << ", written=" << stat.n_pages_written << "]"; + + mysql_mutex_unlock(&flush_list_mutex); + + /* Count the number of blocks belonging to each index in the buffer */ + + n_found = 0; + + chunk = chunks; + + for (i = n_chunks; i--; chunk++) { + buf_block_t* block = chunk->blocks; + ulint n_blocks = chunk->size; + + for (; n_blocks--; block++) { + const buf_frame_t* frame = block->page.frame; + + if (fil_page_index_page_check(frame)) { + + id = btr_page_get_index_id(frame); + + /* Look for the id in the index_ids array */ + j = 0; + + while (j < n_found) { + + if (index_ids[j] == id) { + counts[j]++; + + break; + } + j++; + } + + if (j == n_found) { + n_found++; + index_ids[j] = id; + counts[j] = 1; + } + } + } + } + + mysql_mutex_unlock(&mutex); + + for (i = 0; i < n_found; i++) { + index = dict_index_get_if_in_cache(index_ids[i]); + + if (!index) { + ib::info() << "Block count for index " + << index_ids[i] << " in buffer is about " + << counts[i]; + } else { + ib::info() << "Block count for index " << index_ids[i] + << " in buffer is about " << counts[i] + << ", index " << index->name + << " of table " << index->table->name; + } + } + + ut_free(index_ids); + ut_free(counts); + + validate(); +} +#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */ + +#ifdef UNIV_DEBUG +/** @return the number of latched pages in the buffer pool */ +ulint buf_get_latched_pages_number() +{ + ulint fixed_pages_number= 0; + + mysql_mutex_lock(&buf_pool.mutex); + + for (buf_page_t *b= UT_LIST_GET_FIRST(buf_pool.LRU); b; + b= UT_LIST_GET_NEXT(LRU, b)) + if (b->state() > buf_page_t::UNFIXED) + fixed_pages_number++; + + mysql_mutex_unlock(&buf_pool.mutex); + + return fixed_pages_number; +} +#endif /* UNIV_DEBUG */ + +/** Collect buffer pool metadata. 
+@param[out] pool_info buffer pool metadata */ +void buf_stats_get_pool_info(buf_pool_info_t *pool_info) +{ + time_t current_time; + double time_elapsed; + + mysql_mutex_lock(&buf_pool.mutex); + + pool_info->pool_size = buf_pool.curr_size; + + pool_info->lru_len = UT_LIST_GET_LEN(buf_pool.LRU); + + pool_info->old_lru_len = buf_pool.LRU_old_len; + + pool_info->free_list_len = UT_LIST_GET_LEN(buf_pool.free); + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + pool_info->flush_list_len = UT_LIST_GET_LEN(buf_pool.flush_list); + + pool_info->n_pend_unzip = UT_LIST_GET_LEN(buf_pool.unzip_LRU); + + pool_info->n_pend_reads = os_aio_pending_reads_approx(); + + pool_info->n_pending_flush_lru = buf_pool.n_flush(); + + pool_info->n_pending_flush_list = os_aio_pending_writes(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + current_time = time(NULL); + time_elapsed = 0.001 + difftime(current_time, + buf_pool.last_printout_time); + + pool_info->n_pages_made_young = buf_pool.stat.n_pages_made_young; + + pool_info->n_pages_not_made_young = + buf_pool.stat.n_pages_not_made_young; + + pool_info->n_pages_read = buf_pool.stat.n_pages_read; + + pool_info->n_pages_created = buf_pool.stat.n_pages_created; + + pool_info->n_pages_written = buf_pool.stat.n_pages_written; + + pool_info->n_page_gets = buf_pool.stat.n_page_gets; + + pool_info->n_ra_pages_read_rnd = buf_pool.stat.n_ra_pages_read_rnd; + pool_info->n_ra_pages_read = buf_pool.stat.n_ra_pages_read; + + pool_info->n_ra_pages_evicted = buf_pool.stat.n_ra_pages_evicted; + + pool_info->page_made_young_rate = + static_cast<double>(buf_pool.stat.n_pages_made_young + - buf_pool.old_stat.n_pages_made_young) + / time_elapsed; + + pool_info->page_not_made_young_rate = + static_cast<double>(buf_pool.stat.n_pages_not_made_young + - buf_pool.old_stat.n_pages_not_made_young) + / time_elapsed; + + pool_info->pages_read_rate = + static_cast<double>(buf_pool.stat.n_pages_read + - buf_pool.old_stat.n_pages_read) + / time_elapsed; + + pool_info->pages_created_rate = + static_cast<double>(buf_pool.stat.n_pages_created + - buf_pool.old_stat.n_pages_created) + / time_elapsed; + + pool_info->pages_written_rate = + static_cast<double>(buf_pool.stat.n_pages_written + - buf_pool.old_stat.n_pages_written) + / time_elapsed; + + pool_info->n_page_get_delta = buf_pool.stat.n_page_gets + - buf_pool.old_stat.n_page_gets; + + if (pool_info->n_page_get_delta) { + pool_info->page_read_delta = buf_pool.stat.n_pages_read + - buf_pool.old_stat.n_pages_read; + + pool_info->young_making_delta = + buf_pool.stat.n_pages_made_young + - buf_pool.old_stat.n_pages_made_young; + + pool_info->not_young_making_delta = + buf_pool.stat.n_pages_not_made_young + - buf_pool.old_stat.n_pages_not_made_young; + } + pool_info->pages_readahead_rnd_rate = + static_cast<double>(buf_pool.stat.n_ra_pages_read_rnd + - buf_pool.old_stat.n_ra_pages_read_rnd) + / time_elapsed; + + + pool_info->pages_readahead_rate = + static_cast<double>(buf_pool.stat.n_ra_pages_read + - buf_pool.old_stat.n_ra_pages_read) + / time_elapsed; + + pool_info->pages_evicted_rate = + static_cast<double>(buf_pool.stat.n_ra_pages_evicted + - buf_pool.old_stat.n_ra_pages_evicted) + / time_elapsed; + + pool_info->unzip_lru_len = UT_LIST_GET_LEN(buf_pool.unzip_LRU); + + pool_info->io_sum = buf_LRU_stat_sum.io; + + pool_info->io_cur = buf_LRU_stat_cur.io; + + pool_info->unzip_sum = buf_LRU_stat_sum.unzip; + + pool_info->unzip_cur = buf_LRU_stat_cur.unzip; + + buf_refresh_io_stats(); + mysql_mutex_unlock(&buf_pool.mutex); +} + 
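+/* The *_rate fields filled in above are per-second averages over the
+interval since the last buf_refresh_io_stats() call, computed as, for
+example:
+
+	pages_read_rate = (stat.n_pages_read - old_stat.n_pages_read)
+			  / (0.001 + difftime(now, last_printout_time));
+
+The 0.001 term avoids a division by zero when two snapshots are taken
+within the same second. */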
+/*********************************************************************//** +Prints info of the buffer i/o. */ +static +void +buf_print_io_instance( +/*==================*/ + buf_pool_info_t*pool_info, /*!< in: buffer pool info */ + FILE* file) /*!< in/out: buffer where to print */ +{ + ut_ad(pool_info); + + fprintf(file, + "Buffer pool size " ULINTPF "\n" + "Free buffers " ULINTPF "\n" + "Database pages " ULINTPF "\n" + "Old database pages " ULINTPF "\n" + "Modified db pages " ULINTPF "\n" + "Percent of dirty pages(LRU & free pages): %.3f\n" + "Max dirty pages percent: %.3f\n" + "Pending reads " ULINTPF "\n" + "Pending writes: LRU " ULINTPF ", flush list " ULINTPF "\n", + pool_info->pool_size, + pool_info->free_list_len, + pool_info->lru_len, + pool_info->old_lru_len, + pool_info->flush_list_len, + static_cast<double>(pool_info->flush_list_len) + / (static_cast<double>(pool_info->lru_len + + pool_info->free_list_len) + 1.0) + * 100.0, + srv_max_buf_pool_modified_pct, + pool_info->n_pend_reads, + pool_info->n_pending_flush_lru, + pool_info->n_pending_flush_list); + + fprintf(file, + "Pages made young " ULINTPF ", not young " ULINTPF "\n" + "%.2f youngs/s, %.2f non-youngs/s\n" + "Pages read " ULINTPF ", created " ULINTPF + ", written " ULINTPF "\n" + "%.2f reads/s, %.2f creates/s, %.2f writes/s\n", + pool_info->n_pages_made_young, + pool_info->n_pages_not_made_young, + pool_info->page_made_young_rate, + pool_info->page_not_made_young_rate, + pool_info->n_pages_read, + pool_info->n_pages_created, + pool_info->n_pages_written, + pool_info->pages_read_rate, + pool_info->pages_created_rate, + pool_info->pages_written_rate); + + if (pool_info->n_page_get_delta) { + double hit_rate = static_cast<double>( + pool_info->page_read_delta) + / static_cast<double>(pool_info->n_page_get_delta); + + if (hit_rate > 1) { + hit_rate = 1; + } + + fprintf(file, + "Buffer pool hit rate " ULINTPF " / 1000," + " young-making rate " ULINTPF " / 1000 not " + ULINTPF " / 1000\n", + ulint(1000 * (1 - hit_rate)), + ulint(1000 + * double(pool_info->young_making_delta) + / double(pool_info->n_page_get_delta)), + ulint(1000 * double(pool_info->not_young_making_delta) + / double(pool_info->n_page_get_delta))); + } else { + fputs("No buffer pool page gets since the last printout\n", + file); + } + + /* Statistics about read ahead algorithm */ + fprintf(file, "Pages read ahead %.2f/s," + " evicted without access %.2f/s," + " Random read ahead %.2f/s\n", + + pool_info->pages_readahead_rate, + pool_info->pages_evicted_rate, + pool_info->pages_readahead_rnd_rate); + + /* Print some values to help us with visualizing what is + happening with LRU eviction. */ + fprintf(file, + "LRU len: " ULINTPF ", unzip_LRU len: " ULINTPF "\n" + "I/O sum[" ULINTPF "]:cur[" ULINTPF "], " + "unzip sum[" ULINTPF "]:cur[" ULINTPF "]\n", + pool_info->lru_len, pool_info->unzip_lru_len, + pool_info->io_sum, pool_info->io_cur, + pool_info->unzip_sum, pool_info->unzip_cur); +} + +/*********************************************************************//** +Prints info of the buffer i/o. */ +void +buf_print_io( +/*=========*/ + FILE* file) /*!< in/out: buffer where to print */ +{ + buf_pool_info_t pool_info; + + buf_stats_get_pool_info(&pool_info); + buf_print_io_instance(&pool_info, file); +} + +/** Verify that post encryption checksum match with the calculated checksum. +This function should be called only if tablespace contains crypt data metadata. 
+@param page	page frame
+@param fsp_flags	contents of FSP_SPACE_FLAGS
+@return whether the page is encrypted and valid */
+bool buf_page_verify_crypt_checksum(const byte *page, uint32_t fsp_flags)
+{
+	if (!fil_space_t::full_crc32(fsp_flags)) {
+		return fil_space_verify_crypt_checksum(
+			page, fil_space_t::zip_size(fsp_flags));
+	}
+
+	return !buf_page_is_corrupted(true, page, fsp_flags);
+}
+
+/** Print the given page_id_t object.
+@param[in,out]	out	the output stream
+@param[in]	page_id	the page_id_t object to be printed
+@return the output stream */
+std::ostream& operator<<(std::ostream &out, const page_id_t page_id)
+{
+	out << "[page id: space=" << page_id.space()
+	    << ", page number=" << page_id.page_no() << "]";
+	return out;
+}
+#endif /* !UNIV_INNOCHECKSUM */