summaryrefslogtreecommitdiffstats
path: root/storage/innobase/fil
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 18:00:34 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 18:00:34 +0000
commit3f619478f796eddbba6e39502fe941b285dd97b1 (patch)
treee2c7b5777f728320e5b5542b6213fd3591ba51e2 /storage/innobase/fil
parentInitial commit. (diff)
downloadmariadb-upstream.tar.xz
mariadb-upstream.zip
Adding upstream version 1:10.11.6.upstream/1%10.11.6upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'storage/innobase/fil')
-rw-r--r--storage/innobase/fil/fil0crypt.cc2425
-rw-r--r--storage/innobase/fil/fil0fil.cc3282
-rw-r--r--storage/innobase/fil/fil0pagecompress.cc584
3 files changed, 6291 insertions, 0 deletions
diff --git a/storage/innobase/fil/fil0crypt.cc b/storage/innobase/fil/fil0crypt.cc
new file mode 100644
index 00000000..97cb3994
--- /dev/null
+++ b/storage/innobase/fil/fil0crypt.cc
@@ -0,0 +1,2425 @@
+/*****************************************************************************
+Copyright (C) 2013, 2015, Google Inc. All Rights Reserved.
+Copyright (c) 2014, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/**************************************************//**
+@file fil0crypt.cc
+Innodb file space encrypt/decrypt
+
+Created Jonas Oreland Google
+Modified Jan Lindström jan.lindstrom@mariadb.com
+*******************************************************/
+
+#include "fil0crypt.h"
+#include "mach0data.h"
+#include "page0zip.h"
+#include "buf0checksum.h"
+#ifdef UNIV_INNOCHECKSUM
+# include "buf0buf.h"
+#else
+#include "buf0flu.h"
+#include "buf0dblwr.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+#include "ut0ut.h"
+#include "fsp0fsp.h"
+#include "fil0pagecompress.h"
+#include <my_crypt.h>
+
+/** Whether the key rotation thread infrastructure is set up. This file
+checks it before signalling the threads and before taking
+fil_crypt_threads_mutex when freeing crypt data. */
+static bool fil_crypt_threads_inited = false;
+
+/** Is encryption enabled/disabled */
+ulong srv_encrypt_tables;
+
+/** No of key rotation threads requested */
+uint srv_n_fil_crypt_threads;
+
+/** No of key rotation threads started */
+uint srv_n_fil_crypt_threads_started;
+
+/** At this age or older a space/page will be rotated */
+uint srv_fil_crypt_rotate_key_age;
+
+/** Whether the encryption plugin does key rotation */
+Atomic_relaxed<bool> srv_encrypt_rotate;
+
+/** Condition variable for srv_n_fil_crypt_threads_started */
+static pthread_cond_t fil_crypt_cond;
+
+/** Condition variable to signal the key rotation threads */
+static pthread_cond_t fil_crypt_threads_cond;
+
+/** Condition variable for interrupting sleeptime_ms sleep at the end
+of fil_crypt_rotate_page() */
+static pthread_cond_t fil_crypt_throttle_sleep_cond;
+
+/** Mutex for key rotation threads. Acquired before fil_system.mutex! */
+static mysql_mutex_t fil_crypt_threads_mutex;
+
+/** Variable ensuring only 1 thread at time does initial conversion */
+static bool fil_crypt_start_converting;
+
+/** Variables for throttling */
+uint srv_n_fil_crypt_iops; // 10ms per iop
+static constexpr uint srv_alloc_time = 3; // allocate iops for 3s at a time
+static uint n_fil_crypt_iops_allocated;
+
+#define DEBUG_KEYROTATION_THROTTLING 0
+
+/** Statistics variables */
+static fil_crypt_stat_t crypt_stat;
+/** Mutex initialized in fil_space_crypt_init() and destroyed in
+fil_space_crypt_cleanup() */
+static mysql_mutex_t crypt_stat_mutex;
+
+/** Wake up the key rotation threads waiting on fil_crypt_threads_cond.
+The condition variable is signalled while holding fil_crypt_threads_mutex.
+@param[in] broadcast true to wake all waiters, false to wake one */
+void fil_crypt_threads_signal(bool broadcast)
+{
+  mysql_mutex_lock(&fil_crypt_threads_mutex);
+
+  if (!broadcast) {
+    pthread_cond_signal(&fil_crypt_threads_cond);
+  } else {
+    pthread_cond_broadcast(&fil_crypt_threads_cond);
+  }
+
+  mysql_mutex_unlock(&fil_crypt_threads_mutex);
+}
+
+/***********************************************************************
+Check if a key needs rotation given a key_state.
+This is a forward declaration; it lets fil_crypt_get_latest_key_version()
+call the function before its definition later in this file.
+@param[in] crypt_data Encryption information
+@param[in] key_version Current key version
+@param[in] latest_key_version Latest key version
+@param[in] rotate_key_age when to rotate
+@return true if key needs rotation, false if not */
+static bool
+fil_crypt_needs_rotation(
+ const fil_space_crypt_t* crypt_data,
+ uint key_version,
+ uint latest_key_version,
+ uint rotate_key_age)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************
+Initialize the tablespace encryption subsystem: the statistics mutex and
+the condition variable used to interrupt the throttling sleep. */
+void fil_space_crypt_init()
+{
+  mysql_mutex_init(0, &crypt_stat_mutex, nullptr);
+  pthread_cond_init(&fil_crypt_throttle_sleep_cond, nullptr);
+}
+
+/*********************************************************************
+Release what fil_space_crypt_init() created: the statistics mutex and
+the throttling-sleep condition variable. */
+void fil_space_crypt_cleanup()
+{
+  mysql_mutex_destroy(&crypt_stat_mutex);
+  pthread_cond_destroy(&fil_crypt_throttle_sleep_cond);
+}
+
+/**
+Ask the encryption plugin for the latest version of this key and cache
+the answer in key_found. When is_key_found() does not hold, the plugin
+is not consulted and the cached key_found value is returned.
+@return key version or ENCRYPTION_KEY_VERSION_INVALID */
+uint
+fil_space_crypt_t::key_get_latest_version(void)
+{
+  if (!is_key_found()) {
+    return key_found;
+  }
+
+  const uint latest = encryption_key_get_latest_version(key_id);
+
+  /* InnoDB does dirty read of srv_fil_crypt_rotate_key_age.
+  It doesn't matter because srv_encrypt_rotate
+  can be set to true only once */
+  if (!srv_encrypt_rotate && latest > srv_fil_crypt_rotate_key_age) {
+    srv_encrypt_rotate = true;
+  }
+
+  srv_stats.n_key_requests.inc();
+  key_found = latest;
+
+  return latest;
+}
+
+/******************************************************************
+Get the latest key version for a tablespace. If the key is available and
+the tablespace's min_key_version calls for rotation, wake up an encryption
+thread (provided the threads have been initialized).
+@param[in,out] crypt_data Crypt data
+@return latest key version as reported by crypt_data */
+static inline
+uint
+fil_crypt_get_latest_key_version(
+  fil_space_crypt_t* crypt_data)
+{
+  ut_ad(crypt_data != NULL);
+
+  const uint latest = crypt_data->key_get_latest_version();
+
+  if (!crypt_data->is_key_found()) {
+    return latest;
+  }
+
+  const bool rotate = fil_crypt_needs_rotation(
+    crypt_data, crypt_data->min_key_version,
+    latest, srv_fil_crypt_rotate_key_age);
+
+  if (rotate && fil_crypt_threads_inited) {
+    fil_crypt_threads_signal();
+  }
+
+  return latest;
+}
+
+/******************************************************************
+Mutex helper for crypt_data->scheme: locks or unlocks the mutex of the
+fil_space_crypt_t that the given scheme belongs to.
+@param[in,out] scheme encryption scheme, actually a fil_space_crypt_t
+@param[in] exit nonzero to unlock the mutex, zero to lock it */
+void
+crypt_data_scheme_locker(
+/*=====================*/
+  st_encryption_scheme* scheme,
+  int exit)
+{
+  fil_space_crypt_t* const owner =
+    static_cast<fil_space_crypt_t*>(scheme);
+
+  if (!exit) {
+    mysql_mutex_lock(&owner->mutex);
+    return;
+  }
+
+  mysql_mutex_unlock(&owner->mutex);
+}
+
+/******************************************************************
+Allocate zero-filled memory and construct a fil_space_crypt_t in it.
+@param[in] type CRYPT_SCHEME_UNENCRYPTED or
+ CRYPT_SCHEME_1
+@param[in] encrypt_mode FIL_ENCRYPTION_DEFAULT or
+ FIL_ENCRYPTION_ON or
+ FIL_ENCRYPTION_OFF
+@param[in] min_key_version key_version or 0
+@param[in] key_id Used key id
+@return crypt object
+@retval NULL if the allocation returned no memory */
+static
+fil_space_crypt_t*
+fil_space_create_crypt_data(
+  uint type,
+  fil_encryption_t encrypt_mode,
+  uint min_key_version,
+  uint key_id)
+{
+  void* const mem = ut_zalloc_nokey(sizeof(fil_space_crypt_t));
+
+  if (!mem) {
+    return NULL;
+  }
+
+  /* Placement new: the object lives in the ut_zalloc_nokey() buffer */
+  return new(mem) fil_space_crypt_t(type, min_key_version,
+                                    key_id, encrypt_mode);
+}
+
+/******************************************************************
+Create a fil_space_crypt_t object with type 0 and min_key_version 0.
+@param[in] encrypt_mode FIL_ENCRYPTION_DEFAULT or
+ FIL_ENCRYPTION_ON or
+ FIL_ENCRYPTION_OFF
+@param[in] key_id Encryption key id
+@return crypt object */
+fil_space_crypt_t*
+fil_space_create_crypt_data(
+  fil_encryption_t encrypt_mode,
+  uint key_id)
+{
+  /* Delegate to the four-argument overload */
+  fil_space_crypt_t* const created =
+    fil_space_create_crypt_data(0, encrypt_mode, 0, key_id);
+
+  return created;
+}
+
+/******************************************************************
+Merge the encryption metadata of src into dst while holding dst->mutex.
+Both objects must be of type CRYPT_SCHEME_UNENCRYPTED or CRYPT_SCHEME_1.
+@param[in,out] dst Destination crypt data
+@param[in] src Source crypt data */
+static
+void
+fil_space_merge_crypt_data(
+  fil_space_crypt_t* dst,
+  const fil_space_crypt_t* src)
+{
+  mysql_mutex_lock(&dst->mutex);
+
+  /* Only these two schemes are mergeable */
+  ut_a(src->type == CRYPT_SCHEME_UNENCRYPTED
+       || src->type == CRYPT_SCHEME_1);
+  ut_a(dst->type == CRYPT_SCHEME_UNENCRYPTED
+       || dst->type == CRYPT_SCHEME_1);
+
+  /* The request counter accumulates; the other fields are overwritten */
+  dst->keyserver_requests += src->keyserver_requests;
+  dst->min_key_version = src->min_key_version;
+  dst->type = src->type;
+  dst->encryption = src->encryption;
+
+  mysql_mutex_unlock(&dst->mutex);
+}
+
+/** Initialize encryption parameters from a tablespace header page.
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] page first page of the tablespace
+@return crypt data from page 0
+@retval NULL if not present, not valid, or if no memory was allocated */
+fil_space_crypt_t* fil_space_read_crypt_data(ulint zip_size, const byte* page)
+{
+  const ulint offset = FSP_HEADER_OFFSET
+    + fsp_header_get_encryption_offset(zip_size);
+
+  if (memcmp(page + offset, CRYPT_MAGIC, MAGIC_SZ) != 0) {
+    /* Crypt data is not stored. */
+    return NULL;
+  }
+
+  /* Layout after the magic: type (1 byte), IV length (1 byte), IV,
+  min_key_version (4 bytes), key_id (4 bytes), encryption mode (1 byte) */
+  const byte* const meta = page + offset + MAGIC_SZ;
+
+  const uint8_t type = mach_read_from_1(meta + 0);
+  const uint8_t iv_length = mach_read_from_1(meta + 1);
+  fil_space_crypt_t* crypt_data;
+
+  if (!(type == CRYPT_SCHEME_UNENCRYPTED ||
+        type == CRYPT_SCHEME_1)
+      || iv_length != sizeof crypt_data->iv) {
+    /* Widen before streaming: an 8-bit integer would otherwise be
+    written to the log as a raw character instead of a number. */
+    ib::error() << "Found non sensible crypt scheme: "
+                << unsigned(type) << "," << unsigned(iv_length)
+                << " for space: "
+                << page_get_space_id(page);
+    return NULL;
+  }
+
+  const byte* const after_iv = meta + 2 + iv_length;
+
+  uint min_key_version = mach_read_from_4(after_iv);
+
+  uint key_id = mach_read_from_4(after_iv + 4);
+
+  fil_encryption_t encryption = (fil_encryption_t)mach_read_from_1(
+    after_iv + 8);
+
+  crypt_data = fil_space_create_crypt_data(encryption, key_id);
+
+  if (!crypt_data) {
+    /* The creator can return NULL; do not dereference it. */
+    return NULL;
+  }
+
+  /* We need to overwrite these as above function will initialize
+  members */
+  crypt_data->type = type;
+  crypt_data->min_key_version = min_key_version;
+  memcpy(crypt_data->iv, meta + 2, iv_length);
+
+  return crypt_data;
+}
+
+/******************************************************************
+Free a crypt data object and reset the caller's pointer to NULL.
+When the key rotation threads are initialized, the pointer is detached
+while holding fil_crypt_threads_mutex.
+@param[in,out] crypt_data crypt data to be freed */
+void fil_space_destroy_crypt_data(fil_space_crypt_t **crypt_data)
+{
+  if (crypt_data == NULL || *crypt_data == NULL) {
+    return;
+  }
+
+  const bool use_mutex = UNIV_LIKELY(fil_crypt_threads_inited);
+
+  if (use_mutex) {
+    mysql_mutex_lock(&fil_crypt_threads_mutex);
+  } else {
+    ut_ad(srv_read_only_mode || !srv_was_started);
+  }
+
+  /* Detach the object from the caller's pointer before destroying it */
+  fil_space_crypt_t* const detached = *crypt_data;
+  *crypt_data = NULL;
+
+  if (use_mutex) {
+    mysql_mutex_unlock(&fil_crypt_threads_mutex);
+  }
+
+  if (detached) {
+    /* Constructed with placement new, so destroy and free separately */
+    detached->~fil_space_crypt_t();
+    ut_free(detached);
+  }
+}
+
+/** Amend encryption information from redo log.
+The record layout matches what fil_space_crypt_t::write_page0() logs:
+type (1 byte), IV length (1 byte), IV, min_key_version (4 bytes),
+key_id (4 bytes), encryption mode (1 byte).
+@param[in] space tablespace
+@param[in] data encryption metadata */
+void fil_crypt_parse(fil_space_t* space, const byte* data)
+{
+  ut_ad(data[1] == MY_AES_BLOCK_SIZE);
+
+  void* const mem = ut_zalloc_nokey(sizeof(fil_space_crypt_t));
+  if (!mem) {
+    return;
+  }
+
+  const byte* const iv = data + 2;
+  const byte* const after_iv = iv + MY_AES_BLOCK_SIZE;
+
+  fil_space_crypt_t* crypt_data = new(mem)
+    fil_space_crypt_t(
+      data[0],
+      mach_read_from_4(after_iv),
+      mach_read_from_4(after_iv + 4),
+      static_cast<fil_encryption_t>(after_iv[8]));
+  memcpy(crypt_data->iv, iv, MY_AES_BLOCK_SIZE);
+
+  mysql_mutex_lock(&fil_system.mutex);
+  if (!space->crypt_data) {
+    /* First metadata for this tablespace: hand over ownership */
+    space->crypt_data = crypt_data;
+  } else {
+    /* Fold the parsed values into the existing object, then
+    discard the temporary one */
+    fil_space_merge_crypt_data(space->crypt_data, crypt_data);
+    fil_space_destroy_crypt_data(&crypt_data);
+  }
+  mysql_mutex_unlock(&fil_system.mutex);
+}
+
+/** Write encryption metadata to the first page.
+Layout written at the encryption offset of the FSP header:
+CRYPT_MAGIC (MAGIC_SZ bytes), type (1), IV length (1), IV,
+min_key_version (4), key_id (4), encryption mode (1).
+@param[in,out] block first page of the tablespace
+@param[in,out] mtr mini-transaction */
+void fil_space_crypt_t::write_page0(buf_block_t* block, mtr_t* mtr)
+{
+ const ulint offset = FSP_HEADER_OFFSET
+ + fsp_header_get_encryption_offset(block->zip_size());
+ byte* b = block->page.frame + offset;
+
+ /* The magic is written through the mini-transaction */
+ mtr->memcpy<mtr_t::MAYBE_NOP>(*block, b, CRYPT_MAGIC, MAGIC_SZ);
+
+ b += MAGIC_SZ;
+ /* Remember where the metadata starts, to compute its length below */
+ byte* const start = b;
+ *b++ = static_cast<byte>(type);
+ compile_time_assert(sizeof iv == MY_AES_BLOCK_SIZE);
+ compile_time_assert(sizeof iv == CRYPT_SCHEME_1_IV_LEN);
+ *b++ = sizeof iv;
+ memcpy(b, iv, sizeof iv);
+ b += sizeof iv;
+ mach_write_to_4(b, min_key_version);
+ b += 4;
+ mach_write_to_4(b, key_id);
+ b += 4;
+ *b++ = byte(encryption);
+ /* 1 + 1 + 4 + 4 + 1 bytes of fixed fields plus the IV */
+ ut_ad(b - start == 11 + MY_AES_BLOCK_SIZE);
+ /* We must log also any unchanged bytes, because recovery will
+ invoke fil_crypt_parse() based on this log record. */
+ mtr->memcpy(*block, offset + MAGIC_SZ, b - start);
+}
+
+/** Encrypt a buffer for non full checksum.
+The FIL page header is copied unencrypted, with the key version stored at
+FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION and a post-encryption checksum in
+the 4 bytes after it. Failures are caught by ut_a() assertions.
+@param[in,out] crypt_data Crypt data
+@param[in] space space_id
+@param[in] offset Page offset
+@param[in] lsn Log sequence number
+@param[in] src_frame Page to encrypt
+@param[in] zip_size ROW_FORMAT=COMPRESSED
+ page size, or 0
+@param[in,out] dst_frame Output buffer
+@return dst_frame, holding the encrypted page */
+static byte* fil_encrypt_buf_for_non_full_checksum(
+ fil_space_crypt_t* crypt_data,
+ ulint space,
+ ulint offset,
+ lsn_t lsn,
+ const byte* src_frame,
+ ulint zip_size,
+ byte* dst_frame)
+{
+ uint size = uint(zip_size ? zip_size : srv_page_size);
+ uint key_version = fil_crypt_get_latest_key_version(crypt_data);
+ ut_a(key_version != ENCRYPTION_KEY_VERSION_INVALID);
+ ut_ad(!ut_align_offset(src_frame, 8));
+ ut_ad(!ut_align_offset(dst_frame, 8));
+
+ const bool page_compressed = fil_page_get_type(src_frame)
+ == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED;
+ uint header_len = FIL_PAGE_DATA;
+
+ /* Page-compressed pages carry extra unencrypted metadata after
+ the FIL header */
+ if (page_compressed) {
+ header_len += FIL_PAGE_ENCRYPT_COMP_METADATA_LEN;
+ }
+
+ /* FIL page header is not encrypted */
+ memcpy(dst_frame, src_frame, header_len);
+ mach_write_to_4(dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION,
+ key_version);
+
+ /* Calculate the start offset in a page */
+ uint unencrypted_bytes = header_len + FIL_PAGE_DATA_END;
+ uint srclen = size - unencrypted_bytes;
+ const byte* src = src_frame + header_len;
+ byte* dst = dst_frame + header_len;
+ uint32 dstlen = 0;
+
+ /* For page-compressed pages, the payload length is the 2-byte
+ value stored at FIL_PAGE_DATA */
+ if (page_compressed) {
+ srclen = mach_read_from_2(src_frame + FIL_PAGE_DATA);
+ }
+
+ int rc = encryption_scheme_encrypt(src, srclen, dst, &dstlen,
+ crypt_data, key_version,
+ (uint32)space, (uint32)offset, lsn);
+ ut_a(rc == MY_AES_OK);
+ ut_a(dstlen == srclen);
+
+ /* For compressed tables we do not store the FIL header because
+ the whole page is not stored to the disk. In compressed tables only
+ the FIL header + compressed (and now encrypted) payload aligned
+ to sector boundary is written. */
+ if (!page_compressed) {
+ /* FIL page trailer is also not encrypted */
+ static_assert(FIL_PAGE_DATA_END == 8, "alignment");
+ memcpy_aligned<8>(dst_frame + size - FIL_PAGE_DATA_END,
+ src_frame + size - FIL_PAGE_DATA_END, 8);
+ } else {
+ /* Clean up rest of buffer */
+ memset(dst_frame+header_len+srclen, 0,
+ size - (header_len + srclen));
+ }
+
+ /* store the post-encryption checksum after the key-version */
+ mach_write_to_4(dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + 4,
+ zip_size
+ ? page_zip_calc_checksum(dst_frame, zip_size,
+ SRV_CHECKSUM_ALGORITHM_CRC32)
+ : buf_calc_page_crc32(dst_frame));
+
+ ut_ad(fil_space_verify_crypt_checksum(dst_frame, zip_size));
+
+ srv_stats.pages_encrypted.inc();
+
+ return dst_frame;
+}
+
+/** Encrypt a buffer for full checksum format.
+Bytes before FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION are copied
+unencrypted, the key version is stored at FIL_PAGE_FCRC32_KEY_VERSION,
+and a CRC-32C of the encrypted payload is stored at its end.
+Failures are caught by ut_a() assertions.
+@param[in,out] crypt_data Crypt data
+@param[in] space space_id
+@param[in] offset Page offset
+@param[in] lsn Log sequence number
+@param[in] src_frame Page to encrypt
+@param[in,out] dst_frame Output buffer
+@return dst_frame, holding the encrypted page */
+static byte* fil_encrypt_buf_for_full_crc32(
+ fil_space_crypt_t* crypt_data,
+ ulint space,
+ ulint offset,
+ lsn_t lsn,
+ const byte* src_frame,
+ byte* dst_frame)
+{
+ uint key_version = fil_crypt_get_latest_key_version(crypt_data);
+ /* The corruption flag is only requested and checked in debug builds */
+ ut_d(bool corrupted = false);
+ const uint size = buf_page_full_crc32_size(src_frame, NULL,
+#ifdef UNIV_DEBUG
+ &corrupted
+#else
+ NULL
+#endif
+ );
+ ut_ad(!corrupted);
+ /* Neither the leading header bytes nor the trailing checksum
+ are encrypted */
+ uint srclen = size - (FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
+ + FIL_PAGE_FCRC32_CHECKSUM);
+ const byte* src = src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION;
+ byte* dst = dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION;
+ uint dstlen = 0;
+
+ ut_a(key_version != ENCRYPTION_KEY_VERSION_INVALID);
+
+ /* Till FIL_PAGE_LSN, page is not encrypted */
+ memcpy(dst_frame, src_frame, FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
+
+ /* Write key version to the page. */
+ mach_write_to_4(dst_frame + FIL_PAGE_FCRC32_KEY_VERSION, key_version);
+
+ int rc = encryption_scheme_encrypt(src, srclen, dst, &dstlen,
+ crypt_data, key_version,
+ uint(space), uint(offset), lsn);
+ ut_a(rc == MY_AES_OK);
+ ut_a(dstlen == srclen);
+
+ /* The checksum covers everything before it and is stored right
+ after the payload */
+ const ulint payload = size - FIL_PAGE_FCRC32_CHECKSUM;
+ mach_write_to_4(dst_frame + payload, my_crc32c(0, dst_frame, payload));
+ /* Clean the rest of the buffer. FIXME: Punch holes when writing! */
+ memset(dst_frame + (payload + 4), 0, srv_page_size - (payload + 4));
+
+ srv_stats.pages_encrypted.inc();
+
+ return dst_frame;
+}
+
+/** Encrypt a buffer, dispatching on the checksum format.
+The LSN passed to the encryption routine is read from src_frame.
+@param[in,out] crypt_data Crypt data
+@param[in] space space_id
+@param[in] offset Page offset
+@param[in] src_frame Page to encrypt
+@param[in] zip_size ROW_FORMAT=COMPRESSED
+ page size, or 0
+@param[in,out] dst_frame Output buffer
+@param[in] use_full_checksum full crc32 algo is used
+@return encrypted buffer */
+byte* fil_encrypt_buf(
+  fil_space_crypt_t* crypt_data,
+  ulint space,
+  ulint offset,
+  const byte* src_frame,
+  ulint zip_size,
+  byte* dst_frame,
+  bool use_full_checksum)
+{
+  const lsn_t page_lsn = mach_read_from_8(src_frame + FIL_PAGE_LSN);
+
+  if (!use_full_checksum) {
+    return fil_encrypt_buf_for_non_full_checksum(
+      crypt_data, space, offset, page_lsn,
+      src_frame, zip_size, dst_frame);
+  }
+
+  /* The full checksum format is not combined with a zip_size here */
+  ut_ad(!zip_size);
+
+  return fil_encrypt_buf_for_full_crc32(
+    crypt_data, space, offset,
+    page_lsn, src_frame, dst_frame);
+}
+
+/** Check whether a page of this type may be encrypted.
+FIL_PAGE_TYPE_FSP_HDR and FIL_PAGE_TYPE_XDES pages are never encrypted;
+FIL_PAGE_RTREE pages only when the tablespace uses full_crc32.
+@param[in] space tablespace object
+@param[in] src_frame source page
+@return true if it is valid page type */
+static bool fil_space_encrypt_valid_page_type(
+  const fil_space_t* space,
+  const byte* src_frame)
+{
+  const auto page_type = fil_page_get_type(src_frame);
+
+  if (page_type == FIL_PAGE_RTREE) {
+    return space->full_crc32();
+  }
+
+  return page_type != FIL_PAGE_TYPE_FSP_HDR
+    && page_type != FIL_PAGE_TYPE_XDES;
+}
+
+/******************************************************************
+Encrypt a page, if its type and the tablespace call for it.
+
+@param[in] space Tablespace
+@param[in] offset Page offset
+@param[in] src_frame Page to encrypt
+@param[in,out] dst_frame Output buffer
+@return encrypted buffer, or src_frame if the page type must not be
+encrypted or the tablespace is not encrypted */
+byte* fil_space_encrypt(
+  const fil_space_t* space,
+  ulint offset,
+  byte* src_frame,
+  byte* dst_frame)
+{
+  const bool skip =
+    !fil_space_encrypt_valid_page_type(space, src_frame)
+    || !space->crypt_data
+    || !space->crypt_data->is_encrypted();
+
+  if (skip) {
+    return src_frame;
+  }
+
+  ut_ad(space->referenced());
+
+  return fil_encrypt_buf(space->crypt_data, space->id, offset,
+                         src_frame, space->zip_size(),
+                         dst_frame, space->full_crc32());
+}
+
+/** Decrypt a page for full checksum format.
+The decrypted page is assembled in tmp_frame; src_frame is only read
+by this function.
+@param[in] space space id
+@param[in] crypt_data crypt_data
+@param[in] tmp_frame Temporary buffer
+@param[in,out] src_frame Page to decrypt
+@return DB_SUCCESS or error
+@retval DB_DECRYPTION_FAILED if the page size is reported as corrupted
+or if decryption fails */
+static dberr_t fil_space_decrypt_full_crc32(
+ ulint space,
+ fil_space_crypt_t* crypt_data,
+ byte* tmp_frame,
+ byte* src_frame)
+{
+ /* Key version, LSN and page number come from the unencrypted
+ page header */
+ uint key_version = mach_read_from_4(
+ src_frame + FIL_PAGE_FCRC32_KEY_VERSION);
+ lsn_t lsn = mach_read_from_8(src_frame + FIL_PAGE_LSN);
+ uint offset = mach_read_from_4(src_frame + FIL_PAGE_OFFSET);
+
+ ut_ad(key_version != ENCRYPTION_KEY_NOT_ENCRYPTED);
+
+ /* The leading header bytes are stored unencrypted; copy them as is */
+ memcpy(tmp_frame, src_frame, FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
+
+ /* Calculate the offset where decryption starts */
+ const byte* src = src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION;
+ byte* dst = tmp_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION;
+ uint dstlen = 0;
+ bool corrupted = false;
+ uint size = buf_page_full_crc32_size(src_frame, NULL, &corrupted);
+ if (UNIV_UNLIKELY(corrupted)) {
+ return DB_DECRYPTION_FAILED;
+ }
+
+ /* Neither the leading header bytes nor the trailing checksum
+ are encrypted */
+ uint srclen = size - (FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
+ + FIL_PAGE_FCRC32_CHECKSUM);
+
+ int rc = encryption_scheme_decrypt(src, srclen, dst, &dstlen,
+ crypt_data, key_version,
+ (uint) space, offset, lsn);
+
+ if (rc != MY_AES_OK || dstlen != srclen) {
+ return DB_DECRYPTION_FAILED;
+ }
+
+ /* Copy only checksum part in the trailer */
+ memcpy(tmp_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM,
+ src_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM,
+ FIL_PAGE_FCRC32_CHECKSUM);
+
+ srv_stats.pages_decrypted.inc();
+
+ return DB_SUCCESS; /* page was decrypted */
+}
+
+/** Decrypt a page for non full checksum format.
+The decrypted page is assembled in tmp_frame; src_frame is only read
+by this function.
+@param[in] crypt_data crypt_data
+@param[in] tmp_frame Temporary buffer
+@param[in] physical_size page size
+@param[in,out] src_frame Page to decrypt
+@return DB_SUCCESS or error
+@retval DB_DECRYPTION_FAILED if decryption fails or yields an
+unexpected length */
+static dberr_t fil_space_decrypt_for_non_full_checksum(
+ fil_space_crypt_t* crypt_data,
+ byte* tmp_frame,
+ ulint physical_size,
+ byte* src_frame)
+{
+ /* Key version, page number, space id and LSN come from the
+ unencrypted page header */
+ uint key_version = mach_read_from_4(
+ src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
+ bool page_compressed = (fil_page_get_type(src_frame)
+ == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED);
+ uint offset = mach_read_from_4(src_frame + FIL_PAGE_OFFSET);
+ uint space = mach_read_from_4(
+ src_frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ ib_uint64_t lsn = mach_read_from_8(src_frame + FIL_PAGE_LSN);
+
+ ut_ad(key_version != ENCRYPTION_KEY_NOT_ENCRYPTED);
+
+ /* Length of the unencrypted header; page-compressed pages carry
+ extra metadata after the FIL header */
+ uint header_len = FIL_PAGE_DATA;
+
+ if (page_compressed) {
+ header_len += FIL_PAGE_ENCRYPT_COMP_METADATA_LEN;
+ }
+
+ /* Copy FIL page header, it is not encrypted */
+ memcpy(tmp_frame, src_frame, header_len);
+
+ /* Calculate the offset where decryption starts */
+ const byte* src = src_frame + header_len;
+ byte* dst = tmp_frame + header_len;
+ uint32 dstlen = 0;
+ uint srclen = uint(physical_size) - header_len - FIL_PAGE_DATA_END;
+
+ /* For page-compressed pages, the payload length is the 2-byte
+ value stored at FIL_PAGE_DATA */
+ if (page_compressed) {
+ srclen = mach_read_from_2(src_frame + FIL_PAGE_DATA);
+ }
+
+ int rc = encryption_scheme_decrypt(src, srclen, dst, &dstlen,
+ crypt_data, key_version,
+ space, offset, lsn);
+
+ if (! ((rc == MY_AES_OK) && ((ulint) dstlen == srclen))) {
+ return DB_DECRYPTION_FAILED;
+ }
+
+ /* For compressed tables we do not store the FIL header because
+ the whole page is not stored to the disk. In compressed tables only
+ the FIL header + compressed (and now encrypted) payload aligned
+ to sector boundary is written. */
+ if (!page_compressed) {
+ /* Copy FIL trailer */
+ memcpy(tmp_frame + physical_size - FIL_PAGE_DATA_END,
+ src_frame + physical_size - FIL_PAGE_DATA_END,
+ FIL_PAGE_DATA_END);
+ }
+
+ srv_stats.pages_decrypted.inc();
+
+ return DB_SUCCESS; /* page was decrypted */
+}
+
+/** Decrypt a page into tmp_frame, dispatching on the checksum format
+given by the tablespace flags.
+@param[in] space_id tablespace id
+@param[in] fsp_flags Tablespace flags
+@param[in] crypt_data crypt_data
+@param[in] tmp_frame Temporary buffer
+@param[in] physical_size page size
+@param[in,out] src_frame Page to decrypt
+@retval DB_SUCCESS on success
+@retval DB_DECRYPTION_FAILED on error, including when crypt_data is
+missing or does not describe an encrypted tablespace */
+dberr_t
+fil_space_decrypt(
+  uint32_t space_id,
+  uint32_t fsp_flags,
+  fil_space_crypt_t* crypt_data,
+  byte* tmp_frame,
+  ulint physical_size,
+  byte* src_frame)
+{
+  if (crypt_data && crypt_data->is_encrypted()) {
+    if (!fil_space_t::full_crc32(fsp_flags)) {
+      return fil_space_decrypt_for_non_full_checksum(
+        crypt_data, tmp_frame, physical_size, src_frame);
+    }
+
+    return fil_space_decrypt_full_crc32(
+      space_id, crypt_data, tmp_frame, src_frame);
+  }
+
+  return DB_DECRYPTION_FAILED;
+}
+
+/**
+Decrypt a page in place: decrypt into tmp_frame, then copy the result
+back over src_frame.
+@param[in] space Tablespace
+@param[in] tmp_frame Temporary buffer used for decrypting
+@param[in,out] src_frame Page to decrypt
+@return src_frame, now holding the decrypted page
+@retval nullptr on failure */
+byte*
+fil_space_decrypt(
+  const fil_space_t* space,
+  byte* tmp_frame,
+  byte* src_frame)
+{
+  const ulint physical_size = space->physical_size();
+
+  ut_ad(space->referenced());
+
+  const dberr_t err = fil_space_decrypt(space->id, space->flags,
+                                        space->crypt_data,
+                                        tmp_frame, physical_size,
+                                        src_frame);
+
+  if (err != DB_SUCCESS) {
+    return nullptr;
+  }
+
+  /* Copy the decrypted page back to page buffer, not
+  really any other options. */
+  memcpy(src_frame, tmp_frame, physical_size);
+
+  return src_frame;
+}
+
+/***********************************************************************/
+
+/** A copy of global key state */
+struct key_state_t {
+ /** Start with key id and version 0 and the rotation age currently
+ configured in srv_fil_crypt_rotate_key_age */
+ key_state_t() : key_id(0), key_version(0),
+ rotate_key_age(srv_fil_crypt_rotate_key_age) {}
+ /** Compare key_version and rotate_key_age; key_id is not compared */
+ bool operator==(const key_state_t& other) const {
+ return key_version == other.key_version &&
+ rotate_key_age == other.rotate_key_age;
+ }
+ /** Key id */
+ uint key_id;
+ /** Key version */
+ uint key_version;
+ /** Copy of srv_fil_crypt_rotate_key_age */
+ uint rotate_key_age;
+};
+
+/***********************************************************************
+Copy global key state. With srv_encrypt_tables unset, both the key
+version and the rotation age are reported as 0; otherwise the latest key
+version is taken from crypt_data.
+@param[in,out] new_state key state
+@param[in] crypt_data crypt data */
+static void
+fil_crypt_get_key_state(
+  key_state_t* new_state,
+  fil_space_crypt_t* crypt_data)
+{
+  if (!srv_encrypt_tables) {
+    new_state->key_version = 0;
+    new_state->rotate_key_age = 0;
+    return;
+  }
+
+  new_state->key_version = crypt_data->key_get_latest_version();
+  new_state->rotate_key_age = srv_fil_crypt_rotate_key_age;
+
+  ut_a(new_state->key_version != ENCRYPTION_KEY_NOT_ENCRYPTED);
+}
+
+/***********************************************************************
+Check if a key needs rotation given a key_state
+@param[in] crypt_data Encryption information
+@param[in] key_version Current key version
+@param[in] latest_key_version Latest key version
+@param[in] rotate_key_age when to rotate
+@return true if key needs rotation, false if not */
+static bool
+fil_crypt_needs_rotation(
+  const fil_space_crypt_t* crypt_data,
+  uint key_version,
+  uint latest_key_version,
+  uint rotate_key_age)
+{
+  if (key_version == ENCRYPTION_KEY_VERSION_INVALID) {
+    return false;
+  }
+
+  if (key_version == 0 && latest_key_version != 0) {
+    /* this is rotation unencrypted => encrypted
+    * ignore rotate_key_age */
+    return true;
+  }
+
+  if (latest_key_version == 0 && key_version != 0) {
+    /* this is rotation encrypted => unencrypted; it is only
+    done for tablespaces that follow the default setting */
+    return crypt_data->encryption == FIL_ENCRYPTION_DEFAULT;
+  }
+
+  if (crypt_data->encryption == FIL_ENCRYPTION_DEFAULT
+      && crypt_data->type == CRYPT_SCHEME_1
+      && !srv_encrypt_tables) {
+    /* This is rotation encrypted => unencrypted */
+    return true;
+  }
+
+  if (rotate_key_age == 0) {
+    return false;
+  }
+
+  /* this is rotation encrypted => encrypted,
+  * only reencrypt if key is sufficiently old.
+  Compare by subtraction: key_version + rotate_key_age is unsigned
+  arithmetic and would wrap around for a large rotate_key_age,
+  wrongly reporting that rotation is needed. */
+  return latest_key_version > key_version
+    && latest_key_version - key_version > rotate_key_age;
+}
+
+/** Read page 0 and possible crypt data from there.
+Does nothing if the crypt data is already present, if space->size is
+nonzero, or if space->get_size() returns 0.
+@param[in,out] space Tablespace */
+static inline void fil_crypt_read_crypt_data(fil_space_t *space)
+{
+ if (space->crypt_data || space->size || !space->get_size())
+ /* The encryption metadata has already been read, or the
+ tablespace is not encrypted and the file has been opened already,
+ or the file cannot be accessed, likely due to a concurrent DROP
+ (possibly as part of TRUNCATE or ALTER TABLE).
+
+ FIXME: The file can become unaccessible any time after this check!
+ We should really remove this function and instead make crypt_data
+ an integral part of fil_space_t. */
+ return;
+
+ const ulint zip_size= space->zip_size();
+ mtr_t mtr;
+ mtr.start();
+ /* Fetch page 0 with a shared latch; the page is only read */
+ if (buf_block_t* b= buf_page_get_gen(page_id_t{space->id, 0}, zip_size,
+ RW_S_LATCH, nullptr,
+ BUF_GET_POSSIBLY_FREED, &mtr))
+ {
+ /* Re-check under fil_system.mutex before publishing the crypt data */
+ mysql_mutex_lock(&fil_system.mutex);
+ if (!space->crypt_data && !space->is_stopping())
+ space->crypt_data= fil_space_read_crypt_data(zip_size, b->page.frame);
+ mysql_mutex_unlock(&fil_system.mutex);
+ }
+ mtr.commit();
+}
+
+/** Start encrypting a space
+Creates crypt data for a tablespace that has none, writes it to page 0
+and flushes the tablespace. fil_crypt_start_converting ensures that only
+one thread at a time performs this initial conversion.
+@param[in,out] space Tablespace
+@return true if a recheck of tablespace is needed by encryption thread. */
+static bool fil_crypt_start_encrypting_space(fil_space_t* space)
+{
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
+
+ fil_space_crypt_t *crypt_data = space->crypt_data;
+
+ /* If space is not encrypted and encryption is not enabled, then
+ do not continue encrypting the space. */
+ if (!crypt_data && !srv_encrypt_tables) {
+func_exit:
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
+ return false;
+ }
+
+ /* True if another thread is currently doing an initial conversion */
+ const bool recheck = fil_crypt_start_converting;
+
+ if (recheck || crypt_data || space->is_stopping()) {
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
+ return recheck;
+ }
+
+ /* NOTE: we need to write and flush page 0 before publishing
+ * the crypt data. This so that after restart there is no
+ * risk of finding encrypted pages without having
+ * crypt data in page 0 */
+
+ /* 1 - create crypt data */
+ crypt_data = fil_space_create_crypt_data(
+ FIL_ENCRYPTION_DEFAULT, FIL_DEFAULT_ENCRYPTION_KEY);
+
+ if (!crypt_data) {
+ /* fil_crypt_threads_mutex is still held; func_exit releases it */
+ goto func_exit;
+ }
+
+ fil_crypt_start_converting = true;
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
+
+ mtr_t mtr;
+ mtr.start();
+
+ /* 2 - get page 0 */
+ if (buf_block_t* block = buf_page_get_gen(
+ page_id_t(space->id, 0), space->zip_size(),
+ RW_X_LATCH, NULL, BUF_GET_POSSIBLY_FREED, &mtr)) {
+ crypt_data->type = CRYPT_SCHEME_1;
+ crypt_data->min_key_version = 0; // all pages are unencrypted
+ crypt_data->rotate_state.start_time = time(0);
+ crypt_data->rotate_state.starting = true;
+ crypt_data->rotate_state.active_threads = 1;
+
+ /* Attach the crypt data to the tablespace, unless the
+ tablespace is being stopped */
+ mysql_mutex_lock(&fil_system.mutex);
+ const bool stopping = space->is_stopping();
+ if (!stopping) {
+ space->crypt_data = crypt_data;
+ }
+ mysql_mutex_unlock(&fil_system.mutex);
+
+ if (stopping) {
+ goto abort;
+ }
+
+ /* 3 - write crypt data to page 0 */
+ mtr.set_named_space(space);
+ crypt_data->write_page0(block, &mtr);
+
+ mtr.commit();
+
+ /* 4 - sync tablespace before publishing crypt data */
+ while (buf_flush_list_space(space));
+
+ /* 5 - publish crypt data */
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
+ mysql_mutex_lock(&crypt_data->mutex);
+ crypt_data->type = CRYPT_SCHEME_1;
+ ut_a(crypt_data->rotate_state.active_threads == 1);
+ crypt_data->rotate_state.active_threads = 0;
+ crypt_data->rotate_state.starting = false;
+
+ fil_crypt_start_converting = false;
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
+ mysql_mutex_unlock(&crypt_data->mutex);
+
+ return false;
+ }
+
+ /* Reached when page 0 could not be fetched or the tablespace is being
+ stopped. In both cases crypt_data was never attached to the tablespace,
+ so it is destroyed and freed here. */
+abort:
+ mtr.commit();
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
+ fil_crypt_start_converting = false;
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
+
+ crypt_data->~fil_space_crypt_t();
+ ut_free(crypt_data);
+ return false;
+}
+
+/** State of a rotation thread */
+struct rotate_thread_t {
+ /** @param[in] no number of this thread, stored in thread_no */
+ explicit rotate_thread_t(uint no) : thread_no(no) {}
+
+ /** Thread number; compared against srv_n_fil_crypt_threads in
+ should_shutdown() */
+ uint thread_no;
+ bool first = true; /*!< is position before first space */
+ space_list_t::iterator space
+ = fil_system.space_list.end();/*!< current space or .end() */
+ uint32_t offset = 0; /*!< current page number */
+ ulint batch = 0; /*!< #pages to rotate */
+ uint min_key_version_found = 0; /*!< min key version found but not rotated */
+ lsn_t end_lsn = 0; /*!< max lsn when rotating this space */
+
+ uint estimated_max_iops = 20;/*!< estimation of max iops */
+ uint allocated_iops = 0; /*!< allocated iops */
+ ulint cnt_waited = 0; /*!< #times waited during this slot */
+ uintmax_t sum_waited_us = 0; /*!< wait time during this slot */
+
+ fil_crypt_stat_t crypt_stat; // statistics
+
+ /** The caller must hold fil_crypt_threads_mutex. With no shutdown
+ in progress, the thread terminates only if its number is not below
+ srv_n_fil_crypt_threads.
+ @return whether this thread should terminate */
+ bool should_shutdown() const {
+ mysql_mutex_assert_owner(&fil_crypt_threads_mutex);
+ switch (srv_shutdown_state) {
+ case SRV_SHUTDOWN_NONE:
+ return thread_no >= srv_n_fil_crypt_threads;
+ case SRV_SHUTDOWN_EXIT_THREADS:
+ /* srv_init_abort() must have been invoked */
+ case SRV_SHUTDOWN_CLEANUP:
+ case SRV_SHUTDOWN_INITIATED:
+ return true;
+ case SRV_SHUTDOWN_LAST_PHASE:
+ break;
+ }
+ /* SRV_SHUTDOWN_LAST_PHASE is not expected here */
+ ut_ad(0);
+ return true;
+ }
+};
+
+/** Decide whether a tablespace may be dropped from
+fil_system.default_encrypt_tables.
+
+The caller must hold fil_system.mutex; crypt_data->mutex is acquired
+here while the rotation state is inspected.
+
+A tablespace without crypt_data is removable only while
+srv_encrypt_tables is zero, and one whose key is not found is always
+removable. Otherwise the tablespace is removable when it is stopping or
+marked as not encrypted, AND it is neither being flushed nor worked on
+by a rotation thread, AND "min_key_version is nonzero" coincides with
+"srv_encrypt_tables is zero".
+@param space tablespace to examine (must be FIL_TYPE_TABLESPACE)
+@return true if tablespace should be removed from
+default encrypt */
+static bool fil_crypt_must_remove(const fil_space_t &space)
+{
+  mysql_mutex_assert_owner(&fil_system.mutex);
+  ut_ad(space.purpose == FIL_TYPE_TABLESPACE);
+
+  fil_space_crypt_t *const crypt= space.crypt_data;
+  const ulong encrypt_tables= srv_encrypt_tables;
+
+  if (!crypt)
+  {
+    return !encrypt_tables;
+  }
+
+  if (!crypt->is_key_found())
+  {
+    return true;
+  }
+
+  bool removable= false;
+
+  mysql_mutex_lock(&crypt->mutex);
+  if (space.is_stopping() || crypt->not_encrypted())
+  {
+    /* nobody is flushing or rotating this tablespace */
+    const bool idle= !crypt->rotate_state.flushing &&
+      !crypt->rotate_state.active_threads;
+    /* the stored key version state agrees with the global setting */
+    const bool settled= !encrypt_tables == !!crypt->min_key_version;
+    removable= idle && settled;
+  }
+  mysql_mutex_unlock(&crypt->mutex);
+
+  return removable;
+}
+
+/***********************************************************************
+Check if the tablespace held in state->space needs key rotation.
+
+If the tablespace has no crypt data yet, an attempt is made to start
+encrypting it first. Rotation is never requested while another thread
+is still starting the space (then *recheck is set), while the space is
+stopping or being flushed, when its key cannot be found, or when the
+space is marked as not encrypted.
+@param[in,out] state Key rotation state
+@param[in,out] key_state Key state; refreshed when its key_id differs
+ from the one of the tablespace
+@param[in,out] recheck needs recheck ?
+@return true if space needs key rotation */
+static
+bool
+fil_crypt_space_needs_rotation(
+  rotate_thread_t* state,
+  key_state_t* key_state,
+  bool* recheck)
+{
+  mysql_mutex_assert_not_owner(&fil_crypt_threads_mutex);
+
+  fil_space_t *const space= &*state->space;
+
+  ut_ad(space->referenced());
+  ut_ad(space->purpose == FIL_TYPE_TABLESPACE);
+
+  fil_space_crypt_t *crypt_data= space->crypt_data;
+
+  if (!crypt_data)
+  {
+    /* The space has no crypt data: try to start encrypting it. */
+    *recheck= fil_crypt_start_encrypting_space(space);
+    crypt_data= space->crypt_data;
+
+    if (!crypt_data)
+    {
+      return false;
+    }
+
+    crypt_data->key_get_latest_version();
+  }
+
+  /* Without a usable key from the encryption plugin the tablespace
+  cannot be rotated. */
+  if (!crypt_data->is_key_found())
+  {
+    return false;
+  }
+
+  bool rotate= false;
+
+  mysql_mutex_lock(&crypt_data->mutex);
+
+  if (crypt_data->rotate_state.starting)
+  {
+    /* another thread is still setting the space up; come back later */
+    *recheck= true;
+  }
+  else if (!space->is_stopping()
+           && !crypt_data->rotate_state.flushing
+           && !crypt_data->not_encrypted())
+  {
+    if (key_state->key_id != crypt_data->key_id)
+    {
+      /* cached key state belongs to another key: reload it */
+      key_state->key_id= crypt_data->key_id;
+      fil_crypt_get_key_state(key_state, crypt_data);
+    }
+
+    rotate= fil_crypt_needs_rotation(crypt_data,
+                                     crypt_data->min_key_version,
+                                     key_state->key_version,
+                                     key_state->rotate_key_age);
+  }
+
+  mysql_mutex_unlock(&crypt_data->mutex);
+  return rotate;
+}
+
+/***********************************************************************
+Fold the statistics collected by one rotation thread into the global
+crypt_stat (under crypt_stat_mutex) and reset the thread-local page
+counters. The thread's previous contribution to the global
+estimated_iops is replaced by its current estimated_max_iops, which is
+then remembered as the contribution to subtract next time.
+@param[in,out] state key rotation statistics */
+static void
+fil_crypt_update_total_stat(
+  rotate_thread_t *state)
+{
+  fil_crypt_stat_t &local= state->crypt_stat;
+  const uint new_estimate= state->estimated_max_iops;
+
+  mysql_mutex_lock(&crypt_stat_mutex);
+  crypt_stat.pages_read_from_cache+= local.pages_read_from_cache;
+  crypt_stat.pages_read_from_disk+= local.pages_read_from_disk;
+  crypt_stat.pages_modified+= local.pages_modified;
+  crypt_stat.pages_flushed+= local.pages_flushed;
+  /* remove the old estimate of this thread ... */
+  crypt_stat.estimated_iops-= local.estimated_iops;
+  /* ... and add the new one */
+  crypt_stat.estimated_iops+= new_estimate;
+  mysql_mutex_unlock(&crypt_stat_mutex);
+
+  /* the page counters have been handed over; start from zero again */
+  local.pages_read_from_cache= 0;
+  local.pages_read_from_disk= 0;
+  local.pages_modified= 0;
+  local.pages_flushed= 0;
+  /* remember what was contributed to the global estimate */
+  local.estimated_iops= new_estimate;
+}
+
+/***********************************************************************
+Allocate iops to thread from global setting,
+used before starting to rotate a space.
+
+The caller must hold fil_crypt_threads_mutex. At most
+state->estimated_max_iops are taken out of what is left of
+srv_n_fil_crypt_iops. If nothing can be granted, the function waits
+once on fil_crypt_threads_cond and reports failure so that the caller
+can re-evaluate its state and retry.
+@param[in,out] state Rotation state
+@return true if allocation succeeded, false if failed */
+static bool fil_crypt_alloc_iops(rotate_thread_t *state)
+{
+  mysql_mutex_assert_owner(&fil_crypt_threads_mutex);
+  ut_ad(state->allocated_iops == 0);
+
+  /* No tablespace has been selected yet, so state->space cannot be
+  consulted here. */
+
+  uint grant= 0;
+
+  if (n_fil_crypt_iops_allocated < srv_n_fil_crypt_iops)
+  {
+    grant= srv_n_fil_crypt_iops - n_fil_crypt_iops_allocated;
+
+    if (grant > state->estimated_max_iops)
+    {
+      grant= state->estimated_max_iops;
+    }
+  }
+
+  if (!grant)
+  {
+    /* nothing available: sleep until the budget or usage changes */
+    my_cond_wait(&fil_crypt_threads_cond,
+                 &fil_crypt_threads_mutex.m_mutex);
+    return false;
+  }
+
+  n_fil_crypt_iops_allocated+= grant;
+  state->allocated_iops= grant;
+  return true;
+}
+
+/**
+Reallocate iops to thread when processing a tablespace
+
+If the thread waited for more than one tenth of its last batch, its
+estimated_max_iops is recomputed from the average wait time and the
+wait counters are reset. Afterwards, under fil_crypt_threads_mutex,
+the allocation is either shrunk to the new estimate (waking up other
+threads) or grown by whatever is still free in srv_n_fil_crypt_iops,
+never beyond the estimate. The global statistics are refreshed too.
+@param[in,out] state Rotation state
+@return whether the thread should continue running */
+static bool fil_crypt_realloc_iops(rotate_thread_t *state)
+{
+  ut_a(state->allocated_iops > 0);
+
+  if (10 * state->cnt_waited <= state->batch)
+  {
+    DBUG_PRINT("ib_crypt",
+               ("thr_no: %u only waited " ULINTPF
+                "%% skip re-estimate.",
+                state->thread_no,
+                (100 * state->cnt_waited)
+                / (state->batch ? state->batch : 1)));
+  }
+  else
+  {
+    /* waited for more than 10% of the batch: re-estimate max_iops */
+    ulint mean_wait_us= ulint(state->sum_waited_us / state->cnt_waited);
+
+    if (!mean_wait_us)
+    {
+      mean_wait_us= 1; /* prevent division by zero */
+    }
+
+    DBUG_PRINT("ib_crypt",
+               ("thr_no: %u - update estimated_max_iops from %u to "
+                ULINTPF ".",
+                state->thread_no,
+                state->estimated_max_iops,
+                1000000 / mean_wait_us));
+
+    state->estimated_max_iops= std::max(1U, uint(1000000 / mean_wait_us));
+    state->cnt_waited= 0;
+    state->sum_waited_us= 0;
+  }
+
+  ut_ad(state->estimated_max_iops);
+
+  mysql_mutex_lock(&fil_crypt_threads_mutex);
+
+  const bool keep_running= !state->should_shutdown();
+
+  if (keep_running)
+  {
+    const uint limit= state->estimated_max_iops;
+
+    if (state->allocated_iops > limit)
+    {
+      /* give back what exceeds the estimate */
+      const uint surplus= state->allocated_iops - limit;
+      state->allocated_iops= limit;
+      ut_ad(n_fil_crypt_iops_allocated >= surplus);
+      n_fil_crypt_iops_allocated-= surplus;
+      pthread_cond_broadcast(&fil_crypt_threads_cond);
+    }
+    else if (srv_n_fil_crypt_iops > n_fil_crypt_iops_allocated)
+    {
+      /* some iops are free: take them, but not beyond our estimate */
+      uint grant= srv_n_fil_crypt_iops - n_fil_crypt_iops_allocated;
+
+      if (state->allocated_iops + grant > limit)
+      {
+        grant= limit - state->allocated_iops;
+      }
+
+      n_fil_crypt_iops_allocated+= grant;
+      state->allocated_iops+= grant;
+
+      DBUG_PRINT("ib_crypt",
+                 ("thr_no: %u increased iops from %u to %u.",
+                  state->thread_no,
+                  state->allocated_iops - grant,
+                  state->allocated_iops));
+    }
+
+    fil_crypt_update_total_stat(state);
+  }
+
+  mysql_mutex_unlock(&fil_crypt_threads_mutex);
+  return keep_running;
+}
+
+/** Return every iops the thread currently holds to the global budget
+and refresh the global statistics. The caller must hold
+fil_crypt_threads_mutex.
+@param state rotation state
+@param wake whether to wake up other threads (only done when
+something was actually returned) */
+static void fil_crypt_return_iops(rotate_thread_t *state, bool wake= true)
+{
+  mysql_mutex_assert_owner(&fil_crypt_threads_mutex);
+
+  const uint held= state->allocated_iops;
+
+  if (held != 0)
+  {
+    ut_ad(n_fil_crypt_iops_allocated >= held);
+    state->allocated_iops= 0;
+    n_fil_crypt_iops_allocated-= held;
+
+    if (wake)
+    {
+      pthread_cond_broadcast(&fil_crypt_threads_cond);
+    }
+  }
+
+  fil_crypt_update_total_stat(state);
+}
+
+/** Acquire a tablespace reference.
+The caller must hold fil_system.mutex. A tablespace flagged STOPPING is
+refused; one flagged only CLOSING is accepted if prepare_acquired()
+succeeds.
+@return whether a tablespace reference was successfully acquired */
+inline bool fil_space_t::acquire_if_not_stopped()
+{
+  mysql_mutex_assert_owner(&fil_system.mutex);
+
+  const uint32_t flags= acquire_low();
+
+  if (flags & STOPPING)
+  {
+    return false;
+  }
+
+  if (!(flags & CLOSING))
+  {
+    /* neither STOPPING nor CLOSING: the common case */
+    return true;
+  }
+
+  return prepare_acquired();
+}
+
+/** @return whether key rotation is restricted to
+fil_system.default_encrypt_tables, which is the case unless both
+srv_fil_crypt_rotate_key_age and srv_encrypt_rotate are set */
+bool fil_crypt_must_default_encrypt()
+{
+  const bool rotate_everything=
+    srv_fil_crypt_rotate_key_age && srv_encrypt_rotate;
+  return !rotate_everything;
+}
+
+/** Return the next tablespace from default_encrypt_tables list.
+
+The caller must hold fil_system.mutex. The reference held on the
+previous tablespace is released here. If the previous tablespace is no
+longer a member of the list, the search restarts at the beginning of
+the list. Entries without data files (empty chain) or that are stopping
+are skipped. The previous tablespace may be unlinked from the list, see
+fil_crypt_must_remove().
+@param space previous tablespace (nullptr to start from the start)
+@param recheck whether the removal condition needs to be rechecked after
+the encryption parameters were changed; when set, the previous
+tablespace is never removed from the list
+@param encrypt expected state of innodb_encrypt_tables
+(not consulted by this function)
+@return the next tablespace to process, with a reference acquired
+by acquire_if_not_stopped()
+@retval fil_system.temp_space if there is no work to do
+@retval nullptr upon reaching the end of the iteration */
+inline fil_space_t *fil_system_t::default_encrypt_next(fil_space_t *space,
+ bool recheck,
+ bool encrypt)
+{
+ mysql_mutex_assert_owner(&mutex);
+
+ auto it= space && space->is_in_default_encrypt
+ ? sized_ilist<fil_space_t, default_encrypt_tag_t>::iterator(space)
+ : default_encrypt_tables.begin();
+ const auto end= default_encrypt_tables.end();
+
+ if (space)
+ {
+ const bool released= !space->release(); /* NOTE(review): presumably true when no reference remains; confirm release() contract */
+
+ if (space->is_in_default_encrypt)
+ {
+ while (++it != end &&
+ (!UT_LIST_GET_LEN(it->chain) || it->is_stopping())); /* advance past the previous space and any unusable entries before it may be unlinked */
+
+ /* If one of the encryption threads already started
+ the encryption of the table then don't remove the
+ unencrypted spaces from default encrypt list.
+
+ If there is a change in innodb_encrypt_tables variables
+ value then don't remove the last processed tablespace
+ from the default encrypt list. */
+ if (released && !recheck && fil_crypt_must_remove(*space))
+ {
+ ut_a(!default_encrypt_tables.empty());
+ default_encrypt_tables.remove(*space);
+ space->is_in_default_encrypt= false;
+ }
+ }
+ }
+ else while (it != end &&
+ (!UT_LIST_GET_LEN(it->chain) || it->is_stopping()))
+ {
+ /* Find the next suitable default encrypt table if
+ beginning of default_encrypt_tables list has been scheduled
+ to be deleted */
+ it++;
+ }
+
+ if (it == end)
+ return temp_space; /* sentinel: the caller treats this as "no work to do" */
+
+ do
+ {
+ space= &*it;
+ if (space->acquire_if_not_stopped())
+ return space;
+ if (++it == end)
+ return nullptr;
+ }
+ while (!UT_LIST_GET_LEN(it->chain) || it->is_stopping()); /* NOTE(review): the loop continues while the next entry is unusable, and that entry is then tried anyway; confirm the intended condition */
+
+ return nullptr;
+}
+
+/** Determine the next tablespace for encryption key rotation.
+
+fil_system.mutex is held for the duration of the call. When
+fil_crypt_must_default_encrypt() holds, the search is delegated to
+fil_system.default_encrypt_next() and a nullptr result is mapped to
+end(). Otherwise the reference on the current tablespace is released
+and fil_system.space_list is scanned forward for the first
+FIL_TYPE_TABLESPACE entry on which a reference can be acquired.
+@param space current tablespace (end() to start from the beginning)
+@param recheck whether the removal condition needs to be rechecked after
+encryption parameters were changed
+@param encrypt expected state of innodb_encrypt_tables
+@return the next tablespace
+@retval fil_system.temp_space if there is no work to do
+@retval end() upon reaching the end of the iteration */
+space_list_t::iterator fil_space_t::next(space_list_t::iterator space,
+                                         bool recheck, bool encrypt)
+{
+  mysql_mutex_lock(&fil_system.mutex);
+
+  const bool from_start= space == fil_system.space_list.end();
+
+  if (fil_crypt_must_default_encrypt())
+  {
+    fil_space_t *const previous= from_start ? nullptr : &*space;
+    fil_space_t *const found=
+      fil_system.default_encrypt_next(previous, recheck, encrypt);
+
+    if (found)
+    {
+      space= space_list_t::iterator(found);
+    }
+    else
+    {
+      space= fil_system.space_list.end();
+    }
+  }
+  else
+  {
+    if (from_start)
+    {
+      space= fil_system.space_list.begin();
+    }
+    else
+    {
+      /* drop our reference and step to the following fil_space_t */
+      space->release();
+      ++space;
+    }
+
+    while (space != fil_system.space_list.end())
+    {
+      if (space->purpose == FIL_TYPE_TABLESPACE)
+      {
+        const uint32_t n= space->acquire_low();
+
+        /* usable unless STOPPING; a CLOSING space must first be
+        prepared successfully */
+        if (!(n & STOPPING)
+            && (!(n & CLOSING) || space->prepare_acquired()))
+        {
+          break;
+        }
+      }
+
+      ++space;
+    }
+  }
+
+  mysql_mutex_unlock(&fil_system.mutex);
+  return space;
+}
+
+/** Search for a space needing rotation
+
+fil_crypt_threads_mutex is expected to be held on entry
+(should_shutdown() and fil_crypt_alloc_iops() assert ownership). It is
+released while each candidate tablespace is examined and is held again
+on every return path.
+
+When true is returned, state->space either refers to the tablespace to
+rotate (state->min_key_version_found has then been initialised from
+key_state), or equals fil_system.space_list.end() if nothing was found,
+in which case the I/O allocation has already been given back.
+@param[in,out] key_state Key state
+@param[in,out] state Rotation state
+@param[in,out] recheck recheck of the tablespace is needed or
+ still encryption thread does write page 0
+@return whether the thread should keep running */
+static bool fil_crypt_find_space_to_rotate(
+ key_state_t* key_state,
+ rotate_thread_t* state,
+ bool* recheck)
+{
+ /* we need iops to start rotating */
+ do {
+ if (state->should_shutdown()) {
+ if (state->space != fil_system.space_list.end()) {
+ state->space->release();
+ state->space = fil_system.space_list.end();
+ }
+ return false;
+ }
+ } while (!fil_crypt_alloc_iops(state)); /* a failed allocation has already waited on the condition variable */
+
+ if (state->first) {
+ state->first = false;
+ if (state->space != fil_system.space_list.end()) {
+ state->space->release();
+ }
+ state->space = fil_system.space_list.end(); /* makes fil_space_t::next() start from the beginning */
+ }
+
+ state->space = fil_space_t::next(state->space, *recheck,
+ key_state->key_version != 0);
+
+ bool wake = true;
+ while (state->space != fil_system.space_list.end()) {
+ if (state->space
+ == space_list_t::iterator(fil_system.temp_space)) {
+ wake = false; /* "no work to do" sentinel: give iops back without waking other threads */
+ goto done;
+ }
+
+ if (state->should_shutdown()) {
+ state->space->release();
+done:
+ state->space = fil_system.space_list.end();
+ break;
+ }
+
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
+ /* If there is no crypt data and we have not yet read
+ page 0 for this tablespace, we need to read it before
+ we can continue. */
+ if (!state->space->crypt_data) {
+ fil_crypt_read_crypt_data(&*state->space);
+ }
+
+ if (fil_crypt_space_needs_rotation(state, key_state, recheck)) {
+ ut_ad(key_state->key_id);
+ /* init state->min_key_version_found before
+ * starting on a space */
+ state->min_key_version_found = key_state->key_version;
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
+ return true;
+ }
+
+ state->space = fil_space_t::next(state->space, *recheck,
+ key_state->key_version != 0);
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
+ }
+
+ /* no work to do; release our allocation of I/O capacity */
+ fil_crypt_return_iops(state, wake);
+ return true;
+}
+
+/***********************************************************************
+Start rotating a space
+
+Registers the calling thread as active in the tablespace. The first
+thread to arrive also initialises the shared rotation state: the scan
+starts at page 1 and ends at the current size of the tablespace, and an
+unencrypted scheme is switched to CRYPT_SCHEME_1 when the space is to
+be encrypted with a nonzero key version. The thread-local end_lsn and
+min_key_version_found are seeded from the shared state.
+@param[in] key_state Key state
+@param[in,out] state Rotation state */
+static
+void
+fil_crypt_start_rotate_space(
+  const key_state_t* key_state,
+  rotate_thread_t* state)
+{
+  fil_space_crypt_t *const crypt_data= state->space->crypt_data;
+
+  ut_ad(crypt_data);
+  mysql_mutex_lock(&crypt_data->mutex);
+  ut_ad(key_state->key_id == crypt_data->key_id);
+
+  auto &shared= crypt_data->rotate_state;
+
+  if (!shared.active_threads)
+  {
+    /* we are the first thread in this space: set the scan up */
+    shared.next_offset= 1; /* page 0 is skipped */
+    /* Pages beyond the current size need no rotation: when the space
+    is extended, the new pages get the newer key version anyway. */
+    /* FIXME: max_offset could be removed and instead
+    space->size consulted.*/
+    shared.max_offset= state->space->size;
+    shared.end_lsn= 0;
+    shared.min_key_version_found= key_state->key_version;
+    shared.start_time= time(0);
+
+    const bool becomes_encrypted=
+      crypt_data->type == CRYPT_SCHEME_UNENCRYPTED &&
+      crypt_data->is_encrypted() &&
+      key_state->key_version != 0;
+
+    if (becomes_encrypted)
+    {
+      /* this is rotation unencrypted => encrypted */
+      crypt_data->type= CRYPT_SCHEME_1;
+    }
+  }
+
+  /* one more thread is working on this space */
+  shared.active_threads++;
+
+  /* seed the thread-local state */
+  state->end_lsn= shared.end_lsn;
+  state->min_key_version_found= shared.min_key_version_found;
+
+  mysql_mutex_unlock(&crypt_data->mutex);
+}
+
+/***********************************************************************
+Search for batch of pages needing rotation
+
+Claims the next range of pages of the shared scan for this thread. The
+wanted batch size is srv_alloc_time * state->allocated_iops pages; the
+claimed range is cut short at rotate_state.max_offset. The shared
+next_offset is advanced by the wanted batch size in every case.
+@param[in] key_state Key state
+@param[in,out] state Rotation state; offset and batch are set on success
+@return true if page needing key rotation found, false if not found */
+static
+bool
+fil_crypt_find_page_to_rotate(
+  const key_state_t* key_state,
+  rotate_thread_t* state)
+{
+  ut_ad(state->space == fil_system.space_list.end()
+        || state->space->referenced());
+
+  /* Nothing to do without a space, or once it is about to be dropped. */
+  if (state->space == fil_system.space_list.end()
+      || state->space->is_stopping())
+  {
+    return false;
+  }
+
+  const ulint wanted= srv_alloc_time * state->allocated_iops;
+  fil_space_crypt_t *const crypt_data= state->space->crypt_data;
+
+  mysql_mutex_lock(&crypt_data->mutex);
+  ut_ad(key_state->key_id == crypt_data->key_id);
+
+  auto &shared= crypt_data->rotate_state;
+  const bool found= shared.next_offset <= shared.max_offset;
+
+  if (found)
+  {
+    const ulint remaining= shared.max_offset - shared.next_offset;
+    state->offset= shared.next_offset;
+    state->batch= std::min(wanted, remaining);
+  }
+
+  shared.next_offset+= uint32_t(wanted);
+  mysql_mutex_unlock(&crypt_data->mutex);
+
+  return found;
+}
+
+/***********************************************************************
+Get a page and compute sleep time
+
+The page is first looked up with BUF_PEEK_IF_IN_POOL; a hit is returned
+immediately and costs no sleep time. Otherwise the page is fetched with
+BUF_GET_POSSIBLY_FREED, the time the fetch took is added to the
+thread's wait statistics, and if the average fetch time is shorter than
+the time slice implied by state->allocated_iops, the difference is
+added to *sleeptime_ms.
+@param[in,out] state Rotation state
+@param[in] offset Page offset
+@param[in,out] mtr Minitransaction
+@param[in,out] sleeptime_ms Sleep time; only ever increased
+@return page or NULL*/
+static
+buf_block_t*
+fil_crypt_get_page_throttle(
+  rotate_thread_t* state,
+  uint32_t offset,
+  mtr_t* mtr,
+  ulint* sleeptime_ms)
+{
+  fil_space_t *const space= &*state->space;
+  const ulint zip_size= space->zip_size();
+  const page_id_t page_id(space->id, offset);
+  ut_ad(space->referenced());
+
+  /* Do not touch a tablespace that is about to be dropped. */
+  if (space->is_stopping())
+  {
+    return NULL;
+  }
+
+  if (buf_block_t *cached= buf_page_get_gen(page_id, zip_size, RW_X_LATCH,
+                                            NULL,
+                                            BUF_PEEK_IF_IN_POOL, mtr))
+  {
+    /* the page was already in the buffer pool */
+    state->crypt_stat.pages_read_from_cache++;
+    return cached;
+  }
+
+  if (space->is_stopping())
+  {
+    return NULL;
+  }
+
+  if (offset % (zip_size ? zip_size : srv_page_size)
+      && DB_SUCCESS_LOCKED_REC
+      != fseg_page_is_allocated(space, offset))
+  {
+    /* the page is already freed */
+    return NULL;
+  }
+
+  state->crypt_stat.pages_read_from_disk++;
+
+  const ulonglong fetch_begin= my_interval_timer();
+  buf_block_t *const block= buf_page_get_gen(page_id, zip_size,
+                                             RW_X_LATCH,
+                                             NULL, BUF_GET_POSSIBLY_FREED,
+                                             mtr);
+  const ulonglong fetch_end= my_interval_timer();
+
+  state->cnt_waited++;
+
+  if (fetch_end > fetch_begin)
+  {
+    state->sum_waited_us+= (fetch_end - fetch_begin) / 1000;
+  }
+
+  /* compare the average page load time with our time slice per page */
+  const ulint mean_wait_us= ulint(state->sum_waited_us / state->cnt_waited);
+  const ulint slice_us= 1000000 / state->allocated_iops;
+
+  if (mean_wait_us < slice_us)
+  {
+    /* we are reading faster than allocated: sleep off the rest */
+    *sleeptime_ms+= (slice_us - mean_wait_us) / 1000;
+  }
+
+  return block;
+}
+
+/***********************************************************************
+Rotate one page
+
+Fetches page state->offset of state->space inside a mini-transaction.
+If the key version stored on the page needs rotation, a forced dummy
+write of the FIL_PAGE_SPACE_ID byte is issued on it. Pages that are
+already dirty, that look unallocated, or whose tablespace is stopping
+are left alone. The thread-local end_lsn, min_key_version_found and
+statistics are updated, and the thread finally sleeps for the throttle
+time computed by fil_crypt_get_page_throttle(), if any.
+@param[in,out] key_state Key state
+@param[in,out] state Rotation state */
+static
+void
+fil_crypt_rotate_page(
+ const key_state_t* key_state,
+ rotate_thread_t* state)
+{
+ fil_space_t *space = &*state->space;
+ ulint space_id = space->id;
+ uint32_t offset = state->offset;
+ ulint sleeptime_ms = 0;
+ fil_space_crypt_t *crypt_data = space->crypt_data;
+
+ ut_ad(space->referenced());
+ ut_ad(offset > 0);
+
+ /* In fil_crypt_thread where key rotation is done we have
+ acquired space and checked that this space is not yet
+ marked to be dropped. Similarly, in fil_crypt_find_page_to_rotate().
+ Check here also to give DROP TABLE or similar a chance. */
+ if (space->is_stopping()) {
+ return;
+ }
+
+ if (space_id == TRX_SYS_SPACE && offset == TRX_SYS_PAGE_NO) {
+ /* don't encrypt this as it contains address to dblwr buffer */
+ return;
+ }
+
+ mtr_t mtr;
+ mtr.start();
+ if (buf_block_t* block = fil_crypt_get_page_throttle(state,
+ offset, &mtr,
+ &sleeptime_ms)) {
+ bool modified = false;
+ byte* frame = buf_block_get_frame(block);
+ const lsn_t block_lsn = mach_read_from_8(FIL_PAGE_LSN + frame);
+ uint kv = buf_page_get_key_version(frame, space->flags); /* key version currently stored on the page */
+
+ if (block->page.oldest_modification() > 1) {
+ /* Do not unnecessarily touch pages that are
+ already dirty. */
+ } else if (space->is_stopping()) {
+ /* The tablespace is closing (in DROP TABLE or
+ TRUNCATE TABLE or similar): avoid further access */
+ } else if (!kv && !*reinterpret_cast<uint16_t*>
+ (&frame[FIL_PAGE_TYPE])) {
+ /* It looks like this page is not
+ allocated. Because key rotation is accessing
+ pages in a pattern that is unlike the normal
+ B-tree and undo log access pattern, we cannot
+ invoke fseg_page_is_allocated() here, because that
+ could result in a deadlock. If we invoked
+ fseg_page_is_allocated() and released the
+ tablespace latch before acquiring block->lock,
+ then the fseg_page_is_allocated() information
+ could be stale already. */
+
+ /* If the data file was originally created
+ before MariaDB 10.0 or MySQL 5.6, some
+ allocated data pages could carry 0 in
+ FIL_PAGE_TYPE. The FIL_PAGE_TYPE on those
+ pages will be updated in
+ buf_flush_init_for_writing() when the page
+ is modified the next time.
+
+ Also, when the doublewrite buffer pages are
+ allocated on bootstrap in a non-debug build,
+ some dummy pages will be allocated, with 0 in
+ the FIL_PAGE_TYPE. Those pages should be
+ skipped from key rotation forever. */
+ } else if (fil_crypt_needs_rotation(
+ crypt_data,
+ kv,
+ key_state->key_version,
+ key_state->rotate_key_age)) {
+
+ mtr.set_named_space(space);
+ modified = true;
+
+ /* force rotation by dummy updating page */
+ mtr.write<1,mtr_t::FORCED>(*block,
+ &frame[FIL_PAGE_SPACE_ID],
+ frame[FIL_PAGE_SPACE_ID]);
+
+ /* statistics */
+ state->crypt_stat.pages_modified++;
+ } else {
+ if (crypt_data->is_encrypted()) {
+ if (kv < state->min_key_version_found) {
+ state->min_key_version_found = kv; /* remember the oldest key version left behind without rotation */
+ }
+ }
+ }
+
+ mtr.commit();
+ lsn_t end_lsn = mtr.commit_lsn();
+
+
+ if (modified) {
+ /* if we modified page, we take lsn from mtr */
+ ut_a(end_lsn > state->end_lsn);
+ ut_a(end_lsn > block_lsn);
+ state->end_lsn = end_lsn;
+ } else {
+ /* if we did not modify page, check for max lsn */
+ if (block_lsn > state->end_lsn) {
+ state->end_lsn = block_lsn;
+ }
+ }
+ } else {
+ /* If block read failed mtr memo and log should be empty. */
+ ut_ad(!mtr.has_modifications());
+ ut_ad(mtr.is_empty());
+ mtr.commit();
+ }
+
+ if (sleeptime_ms) {
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
+ timespec abstime;
+ set_timespec_nsec(abstime, 1000000ULL * sleeptime_ms); /* milliseconds converted to nanoseconds */
+ my_cond_timedwait(&fil_crypt_throttle_sleep_cond,
+ &fil_crypt_threads_mutex.m_mutex, &abstime); /* throttle: timed wait instead of a plain sleep */
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
+ }
+}
+
+/***********************************************************************
+Rotate a batch of pages
+
+Walks state->offset over the batch claimed by
+fil_crypt_find_page_to_rotate(), bounded by the free_limit of the
+tablespace, and rotates every page that is not inside the doublewrite
+buffer. The walk ends early when the tablespace is stopping.
+@param[in,out] key_state Key state
+@param[in,out] state Rotation state */
+static
+void
+fil_crypt_rotate_pages(
+  const key_state_t* key_state,
+  rotate_thread_t* state)
+{
+  const uint32_t space_id= state->space->id;
+  const uint32_t batch_end= state->offset + uint32_t(state->batch);
+  const uint32_t limit= std::min(batch_end, state->space->free_limit);
+
+  ut_ad(state->space->referenced());
+
+  for (; state->offset < limit; state->offset++)
+  {
+    /* Pages in the doublewrite buffer cannot be read through the
+    buffer pool because of its assertions. They are only short-lived
+    copies of real pages and get updated when the real page is. */
+    const bool in_doublewrite=
+      buf_dblwr.is_inside(page_id_t(space_id, state->offset));
+
+    if (!in_doublewrite)
+    {
+      /* Stop rotating pages once the space is marked as stopping. */
+      if (state->space->is_stopping())
+      {
+        break;
+      }
+
+      fil_crypt_rotate_page(key_state, state);
+    }
+  }
+}
+
+/***********************************************************************
+Flush rotated pages and then update page 0
+
+If the shared rotation state recorded a nonzero end_lsn and the
+tablespace is not stopping, its dirty pages are written out with
+buf_flush_list_space(). The number of flushed pages and the time spent
+are added to the thread's wait statistics, which
+fil_crypt_realloc_iops() uses to re-estimate the thread's iops.
+A tablespace whose min_key_version ended up as 0 has its scheme reset
+to CRYPT_SCHEME_UNENCRYPTED. Unless the tablespace is stopping, the
+crypt data is finally written to page 0 in a mini-transaction of its
+own.
+
+@param[in,out] state rotation state */
+static
+void
+fil_crypt_flush_space(
+  rotate_thread_t* state)
+{
+  fil_space_t* space = &*state->space;
+  fil_space_crypt_t *crypt_data = space->crypt_data;
+
+  ut_ad(space->referenced());
+
+  /* flush tablespace pages so that there are no pages left with old key */
+  lsn_t end_lsn = crypt_data->rotate_state.end_lsn;
+
+  if (end_lsn > 0 && !space->is_stopping()) {
+    ulint sum_pages = 0;
+    const ulonglong start = my_interval_timer();
+    while (buf_flush_list_space(space, &sum_pages)) {
+      /* keep flushing until nothing is left for this space */
+    }
+    if (sum_pages) {
+      const ulonglong end = my_interval_timer();
+
+      state->cnt_waited += sum_pages;
+      /* Same guard as in fil_crypt_get_page_throttle(): should the
+      timer ever step backwards, the unsigned difference would wrap
+      around and poison the average wait time. */
+      if (end > start) {
+        state->sum_waited_us += (end - start) / 1000;
+      }
+
+      /* statistics */
+      state->crypt_stat.pages_flushed += sum_pages;
+    }
+  }
+
+  if (crypt_data->min_key_version == 0) {
+    crypt_data->type = CRYPT_SCHEME_UNENCRYPTED;
+  }
+
+  if (space->is_stopping()) {
+    return;
+  }
+
+  /* update page 0 */
+  mtr_t mtr;
+  mtr.start();
+
+  if (buf_block_t* block = buf_page_get_gen(
+        page_id_t(space->id, 0), space->zip_size(),
+        RW_X_LATCH, NULL, BUF_GET_POSSIBLY_FREED, &mtr)) {
+    mtr.set_named_space(space);
+    crypt_data->write_page0(block, &mtr);
+  }
+
+  mtr.commit();
+}
+
+/***********************************************************************
+Complete rotating a space
+
+Deregisters the calling thread from the tablespace. Unless the
+tablespace is stopping, the thread's min_key_version_found and end_lsn
+are first merged into the shared rotation state; and if this was the
+last active thread and the scan has reached max_offset, the tablespace
+is flushed via fil_crypt_flush_space() with crypt_data->mutex released
+and rotate_state.flushing set for the duration.
+@param[in,out] state Rotation state */
+static void fil_crypt_complete_rotate_space(rotate_thread_t* state)
+{
+  fil_space_crypt_t *const crypt_data= state->space->crypt_data;
+
+  ut_ad(crypt_data);
+  ut_ad(state->space->referenced());
+
+  mysql_mutex_lock(&crypt_data->mutex);
+
+  auto &shared= crypt_data->rotate_state;
+
+  /* Space might already be dropped */
+  const bool stopping= state->space->is_stopping();
+
+  if (!stopping)
+  {
+    /* publish what this thread has found */
+    if (shared.min_key_version_found > state->min_key_version_found)
+    {
+      shared.min_key_version_found= state->min_key_version_found;
+    }
+
+    if (shared.end_lsn < state->end_lsn)
+    {
+      shared.end_lsn= state->end_lsn;
+    }
+  }
+
+  ut_a(shared.active_threads > 0);
+  shared.active_threads--;
+
+  /* Flush only if we are the last thread AND the scan is complete:
+  on thread shutdown we may get here before the whole space has been
+  scanned. */
+  const bool flush= !stopping
+    && shared.active_threads == 0
+    && shared.next_offset >= shared.max_offset;
+
+  if (flush)
+  {
+    shared.flushing= true;
+    crypt_data->min_key_version= shared.min_key_version_found;
+    mysql_mutex_unlock(&crypt_data->mutex);
+
+    fil_crypt_flush_space(state);
+
+    mysql_mutex_lock(&crypt_data->mutex);
+    shared.flushing= false;
+  }
+
+  mysql_mutex_unlock(&crypt_data->mutex);
+}
+
+/** A thread which monitors global key state and rotates tablespaces
+accordingly.
+
+The thread registers itself by incrementing
+srv_n_fil_crypt_threads_started under fil_crypt_threads_mutex and
+signals fil_crypt_cond; it does the reverse on exit. In between it
+waits on fil_crypt_threads_cond for work, then repeatedly picks a
+tablespace with fil_crypt_find_space_to_rotate() and rotates it batch
+by batch together with the other threads, until should_shutdown()
+tells it to stop. */
+static void fil_crypt_thread()
+{
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
+ rotate_thread_t thr(srv_n_fil_crypt_threads_started++);
+ pthread_cond_signal(&fil_crypt_cond); /* signal that we started */
+
+ if (!thr.should_shutdown()) {
+ /* if we find a tablespace that is starting, skip over it
+ and recheck it later */
+ bool recheck = false;
+
+wait_for_work:
+ if (!recheck && !thr.should_shutdown()) {
+ /* wait for key state changes
+ * i.e either new key version of change or
+ * new rotate_key_age */
+ my_cond_wait(&fil_crypt_threads_cond,
+ &fil_crypt_threads_mutex.m_mutex);
+ }
+
+ recheck = false;
+ thr.first = true; // restart from first tablespace
+
+ key_state_t new_state;
+
+ /* iterate all spaces searching for those needing rotation */
+ while (fil_crypt_find_space_to_rotate(&new_state, &thr,
+ &recheck)) {
+ if (thr.space == fil_system.space_list.end()) {
+ goto wait_for_work; /* nothing (more) to rotate in this pass */
+ }
+
+ /* we found a space to rotate */
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
+ fil_crypt_start_rotate_space(&new_state, &thr);
+
+ /* iterate all pages (cooperativly with other threads) */
+ while (fil_crypt_find_page_to_rotate(&new_state, &thr)) {
+
+ /* If space is marked as stopping, release
+ space and stop rotation. */
+ if (thr.space->is_stopping()) {
+ fil_crypt_complete_rotate_space(&thr);
+ thr.space->release();
+ thr.space = fil_system.space_list.end();
+ break;
+ }
+
+ fil_crypt_rotate_pages(&new_state, &thr);
+ /* realloc iops */
+ if (!fil_crypt_realloc_iops(&thr)) {
+ break; /* the thread has been asked to shut down */
+ }
+ }
+
+ /* complete rotation */
+ if (thr.space != fil_system.space_list.end()) {
+ fil_crypt_complete_rotate_space(&thr);
+ }
+
+ /* force key state refresh */
+ new_state.key_id = 0;
+
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
+ /* release iops */
+ fil_crypt_return_iops(&thr);
+ }
+
+ if (thr.space != fil_system.space_list.end()) {
+ thr.space->release();
+ thr.space = fil_system.space_list.end();
+ }
+ }
+
+ fil_crypt_return_iops(&thr);
+ srv_n_fil_crypt_threads_started--;
+ pthread_cond_signal(&fil_crypt_cond); /* signal that we stopped */
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
+
+#ifdef UNIV_PFS_THREAD
+ pfs_delete_thread();
+#endif
+}
+
+/*********************************************************************
+Adjust thread count for key rotation
+
+Initialises the key rotation subsystem on first use (unless shutdown
+has begun). When the count grows, the missing threads are created and
+detached; when it shrinks, the surplus threads notice this in
+should_shutdown(). The function returns only after the number of
+started threads matches the requested count.
+@param[in] new_cnt Number of threads to be used */
+void fil_crypt_set_thread_cnt(const uint new_cnt)
+{
+  if (!fil_crypt_threads_inited)
+  {
+    if (srv_shutdown_state != SRV_SHUTDOWN_NONE)
+    {
+      return;
+    }
+
+    fil_crypt_threads_init();
+  }
+
+  mysql_mutex_lock(&fil_crypt_threads_mutex);
+
+  const uint old_cnt= srv_n_fil_crypt_threads;
+  const uint to_create= new_cnt > old_cnt ? new_cnt - old_cnt : 0;
+
+  if (new_cnt != old_cnt)
+  {
+    srv_n_fil_crypt_threads= new_cnt;
+  }
+
+  for (uint i= 0; i < to_create; i++)
+  {
+    std::thread thd(fil_crypt_thread);
+    ib::info() << "Creating #"
+               << i+1 << " encryption thread id "
+               << thd.get_id()
+               << " total threads " << new_cnt << ".";
+    thd.detach();
+  }
+
+  /* let waiting threads re-evaluate should_shutdown() */
+  pthread_cond_broadcast(&fil_crypt_threads_cond);
+
+  /* threads signal fil_crypt_cond when they start and when they stop */
+  while (srv_n_fil_crypt_threads_started != srv_n_fil_crypt_threads)
+  {
+    my_cond_wait(&fil_crypt_cond,
+                 &fil_crypt_threads_mutex.m_mutex);
+  }
+
+  pthread_cond_broadcast(&fil_crypt_threads_cond);
+  mysql_mutex_unlock(&fil_crypt_threads_mutex);
+}
+
+/** Populate fil_system.default_encrypt_tables.
+
+The caller must hold fil_system.mutex. Every FIL_TYPE_TABLESPACE that
+has data files, is not in the list yet and can be referenced is
+considered. It is added if it has no crypt data or uses default
+encryption, and if its state disagrees with srv_encrypt_tables: a
+zero or absent min_key_version while srv_encrypt_tables is set, or a
+nonzero min_key_version while it is not. */
+static void fil_crypt_default_encrypt_tables_fill()
+{
+  mysql_mutex_assert_owner(&fil_system.mutex);
+
+  for (fil_space_t &space : fil_system.space_list)
+  {
+    const bool candidate= space.purpose == FIL_TYPE_TABLESPACE
+      && !space.is_in_default_encrypt
+      && UT_LIST_GET_LEN(space.chain) != 0
+      && space.acquire_if_not_stopped();
+
+    if (!candidate)
+    {
+      continue;
+    }
+
+    /* Ensure that crypt_data has been initialized. */
+    ut_ad(space.size);
+
+    fil_space_crypt_t *const crypt= space.crypt_data;
+
+    /* ENCRYPTION!=DEFAULT tablespaces are never wanted. */
+    bool wanted= !crypt || crypt->is_default_encryption();
+
+    if (wanted)
+    {
+      const bool has_key_version= crypt && crypt->min_key_version;
+
+      /* Already encrypted spaces are skipped when encrypting;
+      already unencrypted ones are skipped when decrypting. */
+      wanted= srv_encrypt_tables ? !has_key_version : has_key_version;
+    }
+
+    if (wanted)
+    {
+      fil_system.default_encrypt_tables.push_back(space);
+      space.is_in_default_encrypt= true;
+    }
+
+    space.release();
+  }
+}
+
+/*********************************************************************
+Adjust max key age
+
+The new value is stored under fil_crypt_threads_mutex and
+fil_system.mutex. A value of 0 additionally fills
+fil_system.default_encrypt_tables. The rotation threads are woken up
+afterwards.
+@param[in] val New max key age */
+void fil_crypt_set_rotate_key_age(uint val)
+{
+  mysql_mutex_lock(&fil_crypt_threads_mutex);
+
+  mysql_mutex_lock(&fil_system.mutex);
+  srv_fil_crypt_rotate_key_age= val;
+
+  if (!val)
+  {
+    fil_crypt_default_encrypt_tables_fill();
+  }
+
+  mysql_mutex_unlock(&fil_system.mutex);
+
+  pthread_cond_broadcast(&fil_crypt_threads_cond);
+  mysql_mutex_unlock(&fil_crypt_threads_mutex);
+}
+
+/** Adjust the rotation iops (srv_n_fil_crypt_iops) and wake up the
+threads waiting on fil_crypt_threads_cond.
+@param[in] val new max rotation iops */
+void fil_crypt_set_rotation_iops(uint val)
+{
+  mysql_mutex_lock(&fil_crypt_threads_mutex);
+
+  /* Publish the new limit before signalling the waiters. */
+  srv_n_fil_crypt_iops = val;
+  pthread_cond_broadcast(&fil_crypt_threads_cond);
+
+  mysql_mutex_unlock(&fil_crypt_threads_mutex);
+}
+
+/** Adjust innodb-encrypt-tables (srv_encrypt_tables) and wake up the
+threads waiting on fil_crypt_threads_cond.
+Does nothing at all unless fil_crypt_threads_inited is set.
+@param[in] val new setting for innodb-encrypt-tables */
+void fil_crypt_set_encrypt_tables(ulong val)
+{
+  if (fil_crypt_threads_inited) {
+    mysql_mutex_lock(&fil_crypt_threads_mutex);
+
+    /* fil_system.mutex is taken nested inside fil_crypt_threads_mutex. */
+    mysql_mutex_lock(&fil_system.mutex);
+    srv_encrypt_tables = val;
+    if (fil_crypt_must_default_encrypt()) {
+      fil_crypt_default_encrypt_tables_fill();
+    }
+    mysql_mutex_unlock(&fil_system.mutex);
+
+    pthread_cond_broadcast(&fil_crypt_threads_cond);
+    mysql_mutex_unlock(&fil_crypt_threads_mutex);
+  }
+}
+
+/** Initialize the key rotation thread machinery.
+Creates the two condition variables and the mutex, then hands the
+configured thread count to fil_crypt_set_thread_cnt().
+A repeated call is a no-op while fil_crypt_threads_inited is set. */
+void fil_crypt_threads_init()
+{
+  if (fil_crypt_threads_inited) {
+    return;
+  }
+
+  pthread_cond_init(&fil_crypt_cond, nullptr);
+  pthread_cond_init(&fil_crypt_threads_cond, nullptr);
+  mysql_mutex_init(0, &fil_crypt_threads_mutex, nullptr);
+
+  /* Remember the requested count and reset the global to 0 before
+  passing the request to fil_crypt_set_thread_cnt(). */
+  const uint wanted = srv_n_fil_crypt_threads;
+  srv_n_fil_crypt_threads = 0;
+  fil_crypt_threads_inited = true;
+  fil_crypt_set_thread_cnt(wanted);
+}
+
+/** Release the resources created by fil_crypt_threads_init().
+No key rotation thread may be running any more
+(srv_n_fil_crypt_threads_started must be 0).
+Does nothing unless fil_crypt_threads_inited is set. */
+void fil_crypt_threads_cleanup()
+{
+  if (fil_crypt_threads_inited) {
+    ut_a(!srv_n_fil_crypt_threads_started);
+
+    pthread_cond_destroy(&fil_crypt_cond);
+    pthread_cond_destroy(&fil_crypt_threads_cond);
+    mysql_mutex_destroy(&fil_crypt_threads_mutex);
+
+    fil_crypt_threads_inited = false;
+  }
+}
+
+/*********************************************************************
+Wait for crypt threads to stop accessing space.
+Polls every 20 ms until the tablespace has neither active rotation
+threads nor a pending flush, waking up the crypt threads on each round.
+A warning is written at most once per 30 seconds of waiting.
+Returns immediately if the tablespace has no crypt_data, no crypt
+threads are configured, or the crypt thread subsystem is not initialized.
+@param[in] space Tablespace */
+void fil_space_crypt_close_tablespace(const fil_space_t *space)
+{
+  fil_space_crypt_t* crypt_data = space->crypt_data;
+
+  if (!crypt_data || srv_n_fil_crypt_threads == 0
+      || !fil_crypt_threads_inited) {
+    return;
+  }
+
+  time_t start = time(0);
+  time_t last = start;
+
+  mysql_mutex_lock(&crypt_data->mutex);
+
+  while (crypt_data->rotate_state.active_threads
+         || crypt_data->rotate_state.flushing) {
+    /* Do not hold crypt_data->mutex while signalling and sleeping. */
+    mysql_mutex_unlock(&crypt_data->mutex);
+
+    /* wakeup throttle (all) sleepers */
+    mysql_mutex_lock(&fil_crypt_threads_mutex);
+    pthread_cond_broadcast(&fil_crypt_throttle_sleep_cond);
+    pthread_cond_broadcast(&fil_crypt_threads_cond);
+    mysql_mutex_unlock(&fil_crypt_threads_mutex);
+
+    std::this_thread::sleep_for(std::chrono::milliseconds(20));
+
+    time_t now = time(0);
+
+    if (UNIV_UNLIKELY(now >= last + 30)) {
+      /* NOTE(review): rotate_state is read here without
+      crypt_data->mutex; the values are only diagnostic. */
+      ib::warn() << "Waited "
+                 << now - start
+                 << " seconds to drop space: "
+                 << space->chain.start->name << " ("
+                 << space->id << ") active threads "
+                 << crypt_data->rotate_state.active_threads
+                 << " flushing="
+                 << crypt_data->rotate_state.flushing << ".";
+      last = now;
+    }
+
+    /* The loop condition is evaluated under the mutex again. */
+    mysql_mutex_lock(&crypt_data->mutex);
+  }
+
+  mysql_mutex_unlock(&crypt_data->mutex);
+}
+
+/*********************************************************************
+Get crypt status for a space (used by information_schema).
+The output is zero-filled first; status->space stays ULINT_UNDEFINED
+when the tablespace has no crypt_data even after attempting to read it.
+The caller must hold a reference on the tablespace.
+@param[in] space Tablespace
+@param[out] status Crypt status */
+void
+fil_space_crypt_get_status(
+  const fil_space_t* space,
+  struct fil_space_crypt_status_t* status)
+{
+  memset(status, 0, sizeof(*status));
+
+  ut_ad(space->referenced());
+
+  /* If there is no crypt data and we have not yet read
+  page 0 for this tablespace, we need to read it before
+  we can continue. */
+  if (!space->crypt_data) {
+    fil_crypt_read_crypt_data(const_cast<fil_space_t*>(space));
+  }
+
+  status->space = ULINT_UNDEFINED;
+
+  if (fil_space_crypt_t* crypt_data = space->crypt_data) {
+    status->space = space->id;
+
+    /* Take a consistent snapshot under crypt_data->mutex. */
+    mysql_mutex_lock(&crypt_data->mutex);
+    status->scheme = crypt_data->type;
+    status->keyserver_requests = crypt_data->keyserver_requests;
+    status->min_key_version = crypt_data->min_key_version;
+    status->key_id = crypt_data->key_id;
+
+    if (crypt_data->rotate_state.active_threads > 0 ||
+        crypt_data->rotate_state.flushing) {
+      status->rotating = true;
+      status->flushing =
+        crypt_data->rotate_state.flushing;
+      status->rotate_next_page_number =
+        crypt_data->rotate_state.next_offset;
+      status->rotate_max_page_number =
+        crypt_data->rotate_state.max_offset;
+    }
+
+    mysql_mutex_unlock(&crypt_data->mutex);
+
+    /* Decide on the snapshot taken above instead of re-reading
+    crypt_data->min_key_version without holding the mutex. */
+    if (srv_encrypt_tables || status->min_key_version) {
+      status->current_key_version =
+        fil_crypt_get_latest_key_version(crypt_data);
+    }
+  }
+}
+
+/** Return a copy of the global crypt statistics.
+The copy is made while holding crypt_stat_mutex.
+@param[out] stat Crypt statistics */
+void fil_crypt_total_stat(fil_crypt_stat_t *stat)
+{
+  mysql_mutex_lock(&crypt_stat_mutex);
+
+  /* Plain struct assignment of the whole counter set. */
+  *stat = crypt_stat;
+
+  mysql_mutex_unlock(&crypt_stat_mutex);
+}
+
+#endif /* UNIV_INNOCHECKSUM */
+
+/**
+Verify that the post-encryption checksum matches the calculated checksum.
+This function should be called only if tablespace contains crypt_data
+metadata (this is strong indication that tablespace is encrypted).
+
+The key version field is checked first: a page whose key version is
+ENCRYPTION_KEY_NOT_ENCRYPTED is reported as not encrypted (false).
+When UNIV_INNOCHECKSUM is defined, only the CRC-32 comparison is compiled;
+otherwise the set of accepted checksums depends on srv_checksum_algorithm.
+
+@param[in] page page frame (not modified by this function)
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@return true if page is encrypted AND OK, false otherwise */
+bool fil_space_verify_crypt_checksum(const byte* page, ulint zip_size)
+{
+ if (ENCRYPTION_KEY_NOT_ENCRYPTED == mach_read_from_4(
+ page + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION)) {
+ return false;
+ }
+
+ /* Compressed and encrypted pages do not have checksum. Assume not
+ corrupted. Page verification happens after decompression in
+ buf_page_t::read_complete() using buf_page_is_corrupted(). */
+ if (fil_page_get_type(page) == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) {
+ return true;
+ }
+
+ /* Read stored post encryption checksum. It is stored in the 4 bytes
+ that follow the key version. */
+ const ib_uint32_t checksum = mach_read_from_4(
+ page + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + 4);
+
+ /* If stored checksum matches one of the calculated checksums
+ page is not corrupted. */
+
+#ifndef UNIV_INNOCHECKSUM
+ switch (srv_checksum_algorithm) {
+ case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+#endif /* !UNIV_INNOCHECKSUM */
+ /* Strict CRC-32 modes (and every UNIV_INNOCHECKSUM build):
+ accept only the CRC-32 checksum. */
+ if (zip_size) {
+ return checksum == page_zip_calc_checksum(
+ page, zip_size, false);
+ }
+
+ return checksum == buf_calc_page_crc32(page);
+#ifndef UNIV_INNOCHECKSUM
+ default:
+ /* All other algorithms: additionally accept the "no checksum"
+ magic value and a second checksum variant. */
+ if (checksum == BUF_NO_CHECKSUM_MAGIC) {
+ return true;
+ }
+ if (zip_size) {
+ return checksum == page_zip_calc_checksum(
+ page, zip_size, false)
+ || checksum == page_zip_calc_checksum(
+ page, zip_size, true);
+ }
+
+ return checksum == buf_calc_page_crc32(page)
+ || checksum == buf_calc_page_new_checksum(page);
+ }
+#endif /* !UNIV_INNOCHECKSUM */
+}
diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc
new file mode 100644
index 00000000..8a88f4e2
--- /dev/null
+++ b/storage/innobase/fil/fil0fil.cc
@@ -0,0 +1,3282 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2021, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2014, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fil/fil0fil.cc
+The tablespace memory cache
+
+Created 10/25/1995 Heikki Tuuri
+*******************************************************/
+
+#include "fil0fil.h"
+#include "fil0crypt.h"
+
+#include "btr0btr.h"
+#include "buf0buf.h"
+#include "dict0boot.h"
+#include "dict0dict.h"
+#include "dict0load.h"
+#include "fsp0file.h"
+#include "fsp0fsp.h"
+#include "hash0hash.h"
+#include "log0log.h"
+#include "log0recv.h"
+#include "mach0data.h"
+#include "mtr0log.h"
+#include "os0file.h"
+#include "page0zip.h"
+#include "row0mysql.h"
+#include "srv0start.h"
+#include "trx0purge.h"
+#include "buf0lru.h"
+#include "buf0flu.h"
+#include "log.h"
+#ifdef __linux__
+# include <sys/types.h>
+# include <sys/sysmacros.h>
+# include <dirent.h>
+#endif
+
+#include "lz4.h"
+#include "lzo/lzo1x.h"
+#include "lzma.h"
+#include "bzlib.h"
+#include "snappy-c.h"
+
+/** Flag the tablespace as corrupted and report it once.
+The error is logged only if the tablespace is not being stopped and
+is_corrupted.test_and_set() reports that the flag was not set before,
+so repeated calls do not repeat the message. */
+ATTRIBUTE_COLD void fil_space_t::set_corrupted() const
+{
+ if (!is_stopping() && !is_corrupted.test_and_set())
+ sql_print_error("InnoDB: File '%s' is corrupted", chain.start->name);
+}
+
+/** Try to close a file to adhere to the innodb_open_files limit.
+The caller must hold fil_system.mutex. The tablespaces are visited in
+fil_system.space_list order and the call returns as soon as one file
+has been closed.
+@param print_info whether to diagnose why a file cannot be closed
+@return whether a file was closed */
+bool fil_space_t::try_to_close(bool print_info)
+{
+ mysql_mutex_assert_owner(&fil_system.mutex);
+ for (fil_space_t &space : fil_system.space_list)
+ {
+ /* Note: "continue" inside this switch advances the enclosing for loop. */
+ switch (space.purpose) {
+ case FIL_TYPE_TEMPORARY:
+ continue;
+ case FIL_TYPE_IMPORT:
+ break;
+ case FIL_TYPE_TABLESPACE:
+ if (is_predefined_tablespace(space.id))
+ continue;
+ }
+
+ /* We are using an approximation of LRU replacement policy. In
+ fil_node_open_file_low(), newly opened files are moved to the end
+ of fil_system.space_list, so that they would be less likely to be
+ closed here. */
+ fil_node_t *node= UT_LIST_GET_FIRST(space.chain);
+ if (!node)
+ /* fil_ibd_create() did not invoke fil_space_t::add() yet */
+ continue;
+ ut_ad(!UT_LIST_GET_NEXT(chain, node));
+
+ if (!node->is_open())
+ continue;
+
+ /* n is the value returned by set_closing(); the flag bits tested
+ below tell whether the file can be closed right now. */
+ const auto n= space.set_closing();
+ if (n & STOPPING)
+ /* Let fil_space_t::drop() in another thread handle this. */
+ continue;
+ if (n & (PENDING | NEEDS_FSYNC))
+ {
+ if (!print_info)
+ continue;
+ /* Diagnose at most one busy file per call. */
+ print_info= false;
+ const time_t now= time(nullptr);
+ if (now - fil_system.n_open_exceeded_time < 5)
+ continue; /* We display messages at most once in 5 seconds. */
+ fil_system.n_open_exceeded_time= now;
+
+ if (n & PENDING)
+ sql_print_information("InnoDB: Cannot close file %s because of "
+ UINT32PF " pending operations%s", node->name,
+ n & PENDING,
+ (n & NEEDS_FSYNC) ? " and pending fsync" : "");
+ else if (n & NEEDS_FSYNC)
+ sql_print_information("InnoDB: Cannot close file %s because of "
+ "pending fsync", node->name);
+ continue;
+ }
+
+ node->close();
+
+ fil_system.move_closed_last_to_space_list(node->space);
+
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ IMPLEMENTATION OF THE TABLESPACE MEMORY CACHE
+ =============================================
+
+The tablespace cache is responsible for providing fast read/write access to
+tablespaces and logs of the database. File creation and deletion is done
+in other modules which know more of the logic of the operation, however.
+
+A tablespace consists of a chain of files. The size of the files does not
+have to be divisible by the database block size, because we may just leave
+the last incomplete block unused. When a new file is appended to the
+tablespace, the maximum size of the file is also specified. At the moment,
+we think that it is best to extend the file to its maximum size already at
+the creation of the file, because then we can avoid dynamically extending
+the file when more space is needed for the tablespace.
+
+A block's position in the tablespace is specified with a 32-bit unsigned
+integer. The files in the chain are thought to be catenated, and the block
+corresponding to an address n is the nth block in the catenated file (where
+the first block is named the 0th block, and the incomplete block fragments
+at the end of files are not taken into account). A tablespace can be extended
+by appending a new file at the end of the chain.
+
+Our tablespace concept is similar to the one of Oracle.
+
+To acquire more speed in disk transfers, a technique called disk striping is
+sometimes used. This means that logical block addresses are divided in a
+round-robin fashion across several disks. Windows NT supports disk striping,
+so there we do not need to support it in the database. Disk striping is
+implemented in hardware in RAID disks. We conclude that it is not necessary
+to implement it in the database. Oracle 7 does not support disk striping,
+either.
+
+Another trick used at some database sites is replacing tablespace files by
+raw disks, that is, the whole physical disk drive, or a partition of it, is
+opened as a single file, and it is accessed through byte offsets calculated
+from the start of the disk or the partition. This is recommended in some
+books on database tuning to achieve more speed in i/o. Using raw disk
+certainly prevents the OS from fragmenting disk space, but it is not clear
+if it really adds speed. We measured on the Pentium 100 MHz + NT + NTFS file
+system + EIDE Conner disk only a negligible difference in speed when reading
+from a file, versus reading from a raw disk.
+
+To have fast access to a tablespace or a log file, we put the data structures
+to a hash table. Each tablespace and log file is given a unique 32-bit
+identifier. */
+
+/** Reference to the server data directory. Usually it is the
+current working directory ".", but in the MariaDB Embedded Server Library
+it is an absolute path. */
+const char* fil_path_to_mysql_datadir;
+
+/** Common InnoDB file extensions */
+const char* dot_ext[] = { "", ".ibd", ".isl", ".cfg" };
+
+/** Number of pending tablespace flushes */
+Atomic_counter<ulint> fil_n_pending_tablespace_flushes;
+
+/** The tablespace memory cache.
+NOTE(review): this is defined as an object, not a pointer, so it cannot
+itself be NULL; whether the module has been initialized is presumably
+what fil_system.is_initialised() reports -- confirm. */
+fil_system_t fil_system;
+
+/** At this age or older a space/page will be rotated */
+extern uint srv_fil_crypt_rotate_key_age;
+
+#ifdef UNIV_DEBUG
+/** Try fil_validate() every this many times */
+# define FIL_VALIDATE_SKIP 17
+
+/** Check the consistency of the tablespace cache, but only on every
+FIL_VALIDATE_SKIP-th invocation, because fil_validate() is costly.
+@return true if ok or the check was skipped */
+static bool fil_validate_skip()
+{
+  /** Number of calls so far; shared by all threads. */
+  static Atomic_counter<uint32_t> fil_validate_count;
+
+  if (fil_validate_count++ % FIL_VALIDATE_SKIP) {
+    /* Not this time: report success without validating. */
+    return true;
+  }
+
+  return fil_validate();
+}
+#endif /* UNIV_DEBUG */
+
+/** Look up a tablespace.
+The caller must hold fil_system.mutex, and fil_system must have been
+initialised.
+@param id tablespace identifier
+@return tablespace
+@retval nullptr if not found */
+fil_space_t *fil_space_get_by_id(uint32_t id)
+{
+ fil_space_t* space;
+
+ ut_ad(fil_system.is_initialised());
+ mysql_mutex_assert_owner(&fil_system.mutex);
+
+ /* Search the fil_system.spaces hash table for an entry whose id matches;
+ the macro stores the result in "space". */
+ HASH_SEARCH(hash, &fil_system.spaces, id,
+ fil_space_t*, space,, space->id == id);
+
+ return(space);
+}
+
+/** Look up a tablespace by identifier, acquiring and releasing
+fil_system.mutex around the lookup.
+No reference is taken on the result. Therefore the caller should hold
+an InnoDB table lock or a MDL that prevents the tablespace from being
+dropped during the operation, or the caller should be in
+single-threaded crash recovery mode (no user connections that could
+drop tablespaces).
+Normally, fil_space_t::get() should be used instead.
+@param[in] id tablespace ID
+@return tablespace, or NULL if not found */
+fil_space_t *fil_space_get(uint32_t id)
+{
+  mysql_mutex_lock(&fil_system.mutex);
+  fil_space_t *const found = fil_space_get_by_id(id);
+  mysql_mutex_unlock(&fil_system.mutex);
+
+  return found;
+}
+
+/** Check whether a page compression algorithm can be used.
+PAGE_UNCOMPRESSED and PAGE_ZLIB_ALGORITHM are always reported as loaded;
+for the others the is_loaded flag of the matching provider service
+decides. Any other value is reported as not loaded.
+@param[in] comp_algo compression algorithm
+@return whether the compression algorithm is loaded */
+bool fil_comp_algo_loaded(ulint comp_algo)
+{
+  switch (comp_algo) {
+  case PAGE_UNCOMPRESSED:
+  case PAGE_ZLIB_ALGORITHM:
+    return true;
+  case PAGE_LZ4_ALGORITHM:
+    return provider_service_lz4->is_loaded;
+  case PAGE_LZO_ALGORITHM:
+    return provider_service_lzo->is_loaded;
+  case PAGE_LZMA_ALGORITHM:
+    return provider_service_lzma->is_loaded;
+  case PAGE_BZIP2_ALGORITHM:
+    return provider_service_bzip2->is_loaded;
+  case PAGE_SNAPPY_ALGORITHM:
+    return provider_service_snappy->is_loaded;
+  default:
+    /* Not an algorithm that this function knows about. */
+    return false;
+  }
+}
+
+/** Append a file to the chain of files of a space.
+The caller must hold fil_system.mutex.
+If the handle is already open, the open-file count is incremented and,
+once it reaches srv_max_n_open_files, an attempt is made to close some
+other file.
+@param[in] name file name of a file that is not open
+@param[in] handle file handle, or OS_FILE_CLOSED
+@param[in] size file size in entire database pages
+@param[in] is_raw whether this is a raw device
+@param[in] atomic_write true if atomic write could be enabled
+@param[in] max_pages maximum number of pages in file,
+or UINT32_MAX for unlimited
+@return file object */
+fil_node_t* fil_space_t::add(const char* name, pfs_os_file_t handle,
+                             uint32_t size, bool is_raw, bool atomic_write,
+                             uint32_t max_pages)
+{
+  mysql_mutex_assert_owner(&fil_system.mutex);
+  ut_ad(name != NULL);
+  ut_ad(fil_system.is_initialised());
+
+  /* Zero-filled allocation: fields not assigned below start out as 0. */
+  fil_node_t* const file =
+    reinterpret_cast<fil_node_t*>(ut_zalloc_nokey(sizeof(*file)));
+
+  file->handle = handle;
+  file->name = mem_strdup(name);
+
+  ut_a(!is_raw || srv_start_raw_disk_in_use);
+
+  file->is_raw_disk = is_raw;
+  file->size = size;
+  file->init_size = size;
+  file->max_size = max_pages;
+  file->space = this;
+  file->atomic_write = atomic_write;
+
+  /* Account for the new file in the tablespace and link it in. */
+  this->size += size;
+  UT_LIST_ADD_LAST(chain, file);
+
+  if (!file->is_open()) {
+    return file;
+  }
+
+  clear_closing();
+  if (++fil_system.n_open >= srv_max_n_open_files) {
+    /* Hold a reference on this tablespace while trying to close
+    a file of some tablespace. */
+    reacquire();
+    try_to_close(true);
+    release();
+  }
+
+  return file;
+}
+
+__attribute__((warn_unused_result, nonnull))
+/** Open a tablespace file.
+The caller must hold fil_system.mutex; the file must be closed and the
+tablespace must be marked as closing.
+On success the tablespace is moved within fil_system.space_list by
+move_opened_last_to_space_list() and fil_system.n_open is incremented.
+On any failure the handle is left as OS_FILE_CLOSED.
+@param node data file
+@return whether the file was successfully opened */
+static bool fil_node_open_file_low(fil_node_t *node)
+{
+ ut_ad(!node->is_open());
+ ut_ad(node->space->is_closing());
+ mysql_mutex_assert_owner(&fil_system.mutex);
+ ulint type;
+ static_assert(((UNIV_ZIP_SIZE_MIN >> 1) << 3) == 4096, "compatibility");
+ /* ZIP_SSIZE values 1 and 2 select the file type without O_DIRECT.
+ NOTE(review): presumably these are the 1024-byte and 2048-byte
+ ROW_FORMAT=COMPRESSED page sizes -- confirm. */
+ switch (FSP_FLAGS_GET_ZIP_SSIZE(node->space->flags)) {
+ case 1:
+ case 2:
+ type= OS_DATA_FILE_NO_O_DIRECT;
+ break;
+ default:
+ type= OS_DATA_FILE;
+ }
+
+ /* Retry the open for as long as the failure is the one tested below
+ and another file could be closed to make room. */
+ for (;;)
+ {
+ bool success;
+ node->handle= os_file_create(innodb_data_file_key, node->name,
+ node->is_raw_disk
+ ? OS_FILE_OPEN_RAW | OS_FILE_ON_ERROR_NO_EXIT
+ : OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT,
+ OS_FILE_AIO, type,
+ srv_read_only_mode, &success);
+
+ if (success && node->is_open())
+ {
+#ifndef _WIN32
+ /* For tablespace id 0 in read-write mode with my_disable_locking set,
+ a failing os_file_lock() makes the open fail. */
+ if (!node->space->id && !srv_read_only_mode && my_disable_locking &&
+ os_file_lock(node->handle, node->name))
+ {
+ os_file_close(node->handle);
+ node->handle= OS_FILE_CLOSED;
+ return false;
+ }
+#endif
+ break;
+ }
+
+ /* The following call prints an error message */
+ /* NOTE(review): EMFILE + 100 presumably is how os_file_get_last_error()
+ encodes the OS errno EMFILE -- confirm. */
+ if (os_file_get_last_error(true) == EMFILE + 100 &&
+ fil_space_t::try_to_close(true))
+ continue;
+
+ ib::warn() << "Cannot open '" << node->name << "'.";
+ return false;
+ }
+
+ ulint comp_algo = node->space->get_compression_algo();
+ bool comp_algo_invalid = false;
+
+ /* Only when the size is not known yet (node->size == 0), page 0 is read
+ and the compression algorithm is validated. */
+ if (node->size);
+ else if (!node->read_page0() ||
+ // validate compression algorithm for full crc32 format
+ (node->space->full_crc32() &&
+ (comp_algo_invalid = !fil_comp_algo_loaded(comp_algo))))
+ {
+ if (comp_algo_invalid)
+ {
+ if (comp_algo <= PAGE_ALGORITHM_LAST)
+ ib::warn() << "'" << node->name << "' is compressed with "
+ << page_compression_algorithms[comp_algo]
+ << ", which is not currently loaded";
+ else
+ ib::warn() << "'" << node->name << "' is compressed with "
+ << "invalid algorithm: " << comp_algo;
+ }
+
+ os_file_close(node->handle);
+ node->handle= OS_FILE_CLOSED;
+ return false;
+ }
+
+ ut_ad(node->is_open());
+
+ fil_system.move_opened_last_to_space_list(node->space);
+
+ fil_system.n_open++;
+ return true;
+}
+
+/** Open a tablespace file.
+Before opening, tries to bring fil_system.n_open below
+srv_max_n_open_files by closing other files. fil_system.mutex is held
+by the caller, but it may be released and reacquired here while waiting.
+If no file can be closed, the file is opened anyway and a warning about
+the exceeded limit may be issued.
+@param node data file
+@return whether the file was successfully opened */
+static bool fil_node_open_file(fil_node_t *node)
+{
+ mysql_mutex_assert_owner(&fil_system.mutex);
+ ut_ad(!node->is_open());
+ ut_ad(!is_predefined_tablespace(node->space->id) ||
+ srv_operation == SRV_OPERATION_BACKUP ||
+ srv_operation == SRV_OPERATION_RESTORE ||
+ srv_operation == SRV_OPERATION_RESTORE_DELTA);
+ ut_ad(node->space->purpose != FIL_TYPE_TEMPORARY);
+ ut_ad(node->space->referenced());
+
+ /* try_to_close() updates n_open_exceeded_time when it prints a
+ diagnostic; comparing against this copy detects that below. */
+ const auto old_time= fil_system.n_open_exceeded_time;
+
+ /* count is the number of consecutive failed attempts to close a file. */
+ for (ulint count= 0; fil_system.n_open >= srv_max_n_open_files; count++)
+ {
+ if (fil_space_t::try_to_close(count > 1))
+ count= 0;
+ else if (count >= 2)
+ {
+ if (old_time != fil_system.n_open_exceeded_time)
+ sql_print_warning("InnoDB: innodb_open_files=" ULINTPF
+ " is exceeded (" ULINTPF " files stay open)",
+ srv_max_n_open_files, fil_system.n_open);
+ break;
+ }
+ else
+ {
+ mysql_mutex_unlock(&fil_system.mutex);
+ std::this_thread::sleep_for(std::chrono::milliseconds(20));
+ /* Flush tablespaces so that we can close modified files. */
+ fil_flush_file_spaces();
+ mysql_mutex_lock(&fil_system.mutex);
+ if (node->is_open())
+ return true;
+ }
+ }
+
+ /* The node can be opened between releasing and acquiring fil_system.mutex
+ in the above code */
+ return node->is_open() || fil_node_open_file_low(node);
+}
+
+/** Close the file handle.
+The bookkeeping and sanity checks are done by
+prepare_to_close_or_detach(); a failure to close the file is fatal. */
+void fil_node_t::close()
+{
+  prepare_to_close_or_detach();
+
+  const bool closed = os_file_close(handle);
+  ut_a(closed);
+
+  handle = OS_FILE_CLOSED;
+}
+
+/** Detach the file handle without closing it.
+The bookkeeping and sanity checks are done by
+prepare_to_close_or_detach(); afterwards this object regards the file
+as closed.
+@return the handle that was attached to this file */
+pfs_os_file_t fil_node_t::detach()
+{
+  prepare_to_close_or_detach();
+
+  const pfs_os_file_t detached = handle;
+  handle = OS_FILE_CLOSED;
+
+  return detached;
+}
+
+/** Common preparation for close() and detach().
+Checks that the file is open, is not being extended and may be closed
+in the current server state, and decrements fil_system.n_open.
+The caller must hold fil_system.mutex. */
+void fil_node_t::prepare_to_close_or_detach()
+{
+ mysql_mutex_assert_owner(&fil_system.mutex);
+ ut_ad(space->is_ready_to_close() || srv_operation == SRV_OPERATION_BACKUP ||
+ srv_operation == SRV_OPERATION_RESTORE_DELTA);
+ ut_a(is_open());
+ ut_a(!being_extended);
+ ut_a(space->is_ready_to_close() || space->purpose == FIL_TYPE_TEMPORARY ||
+ srv_fast_shutdown == 2 || !srv_was_started);
+
+ /* One fewer open file; the caller resets the handle itself. */
+ ut_a(fil_system.n_open > 0);
+ fil_system.n_open--;
+}
+
+/** Flush any writes cached by the file system.
+The caller must not hold fil_system.mutex; it is acquired here when the
+tablespace has to be removed from fil_system.unflushed_spaces.
+Returns without flushing if STOPPING_WRITES is set in n_pending. */
+void fil_space_t::flush_low()
+{
+ mysql_mutex_assert_not_owner(&fil_system.mutex);
+
+ /* Set the NEEDS_FSYNC flag in n_pending. The first attempt expects the
+ value 1; each failed compare-and-swap loads the current value into n
+ for the next attempt. */
+ uint32_t n= 1;
+ while (!n_pending.compare_exchange_strong(n, n | NEEDS_FSYNC,
+ std::memory_order_acquire,
+ std::memory_order_relaxed))
+ {
+ ut_ad(n & PENDING);
+ if (n & STOPPING_WRITES)
+ return;
+ if (n & NEEDS_FSYNC)
+ break; /* the flag is already set */
+ }
+
+ fil_n_pending_tablespace_flushes++;
+ for (fil_node_t *node= UT_LIST_GET_FIRST(chain); node;
+ node= UT_LIST_GET_NEXT(chain, node))
+ {
+ if (!node->is_open())
+ {
+ ut_ad(!is_in_unflushed_spaces);
+ continue;
+ }
+ /* On Windows, raw disks are skipped. */
+ IF_WIN(if (node->is_raw_disk) continue,);
+ os_file_flush(node->handle);
+ }
+
+ /* The flag is first tested without the mutex and then again under it
+ before the list is modified. */
+ if (is_in_unflushed_spaces)
+ {
+ mysql_mutex_lock(&fil_system.mutex);
+ if (is_in_unflushed_spaces)
+ {
+ is_in_unflushed_spaces= false;
+ fil_system.unflushed_spaces.remove(*this);
+ }
+ mysql_mutex_unlock(&fil_system.mutex);
+ }
+
+ clear_flush();
+ fil_n_pending_tablespace_flushes--;
+}
+
+/** Try to extend a tablespace.
+Locking protocol: fil_system.mutex must be held on entry. When true is
+returned (another thread is extending the file), the mutex has been
+RELEASED and the caller must reacquire it before retrying. When false
+is returned, the mutex is held, although it may have been released and
+reacquired in between.
+@param[in,out] space tablespace to be extended
+@param[in,out] node last file of the tablespace
+@param[in] size desired size in number of pages
+@param[out] success whether the operation succeeded
+@return whether the operation should be retried */
+static ATTRIBUTE_COLD __attribute__((warn_unused_result, nonnull))
+bool
+fil_space_extend_must_retry(
+ fil_space_t* space,
+ fil_node_t* node,
+ uint32_t size,
+ bool* success)
+{
+ mysql_mutex_assert_owner(&fil_system.mutex);
+ ut_ad(UT_LIST_GET_LAST(space->chain) == node);
+ ut_ad(size >= FIL_IBD_FILE_INITIAL_SIZE);
+ ut_ad(node->space == space);
+ ut_ad(space->referenced() || space->is_being_truncated);
+
+ *success = space->size >= size;
+
+ if (*success) {
+ /* Space already big enough */
+ return(false);
+ }
+
+ if (node->being_extended) {
+ /* Another thread is currently extending the file. Wait
+ for it to finish.
+ It'd have been better to use event driven mechanism but
+ the entire module is peppered with polling stuff. */
+ mysql_mutex_unlock(&fil_system.mutex);
+ std::this_thread::sleep_for(std::chrono::milliseconds(100));
+ return(true);
+ }
+
+ node->being_extended = true;
+
+ /* At this point it is safe to release fil_system.mutex. No
+ other thread can rename, delete, close or extend the file because
+ we have set the node->being_extended flag. */
+ mysql_mutex_unlock(&fil_system.mutex);
+
+ ut_ad(size >= space->size);
+
+ /* last_page_no: tablespace size in pages after the extension attempt;
+ file_start_page_no: page number at which this (last) file begins. */
+ uint32_t last_page_no = space->size;
+ const uint32_t file_start_page_no = last_page_no - node->size;
+
+ const unsigned page_size = space->physical_size();
+
+ /* Datafile::read_first_page() expects innodb_page_size bytes.
+ fil_node_t::read_page0() expects at least 4 * innodb_page_size bytes.
+ os_file_set_size() expects multiples of 4096 bytes.
+ For ROW_FORMAT=COMPRESSED tables using 1024-byte or 2048-byte
+ pages, we will preallocate up to an integer multiple of 4096 bytes,
+ and let normal writes append 1024, 2048, or 3072 bytes to the file. */
+ os_offset_t new_size = std::max(
+ (os_offset_t(size - file_start_page_no) * page_size)
+ & ~os_offset_t(4095),
+ os_offset_t(FIL_IBD_FILE_INITIAL_SIZE << srv_page_size_shift));
+
+ *success = os_file_set_size(node->name, node->handle, new_size,
+ node->punch_hole == 1);
+
+ /* NOTE(review): this stores true after a successful extension and false
+ after a failed one. Judging by its name, the flag records that a "disk
+ full" message was already issued, which would make this assignment look
+ inverted -- verify against the code that reads os_has_said_disk_full. */
+ os_has_said_disk_full = *success;
+ if (*success) {
+ os_file_flush(node->handle);
+ last_page_no = size;
+ } else {
+ /* Let us measure the size of the file
+ to determine how much we were able to
+ extend it */
+ os_offset_t fsize = os_file_get_size(node->handle);
+ ut_a(fsize != os_offset_t(-1));
+
+ last_page_no = uint32_t(fsize / page_size)
+ + file_start_page_no;
+ }
+ mysql_mutex_lock(&fil_system.mutex);
+
+ ut_a(node->being_extended);
+ node->being_extended = false;
+ ut_a(last_page_no - file_start_page_no >= node->size);
+
+ /* Publish the new file size and grow the tablespace size by the same
+ number of pages. */
+ uint32_t file_size = last_page_no - file_start_page_no;
+ space->size += file_size - node->size;
+ node->size = file_size;
+ const uint32_t pages_in_MiB = node->size
+ & ~uint32_t((1U << (20U - srv_page_size_shift)) - 1);
+
+ /* Keep the last data file size info up to date, rounded to
+ full megabytes */
+
+ switch (space->id) {
+ case TRX_SYS_SPACE:
+ srv_sys_space.set_last_file_size(pages_in_MiB);
+ do_flush:
+ /* flush_low() must be called without fil_system.mutex; hold an extra
+ reference on the tablespace across the unlocked section. */
+ space->reacquire();
+ mysql_mutex_unlock(&fil_system.mutex);
+ space->flush_low();
+ space->release();
+ mysql_mutex_lock(&fil_system.mutex);
+ break;
+ default:
+ ut_ad(space->purpose == FIL_TYPE_TABLESPACE
+ || space->purpose == FIL_TYPE_IMPORT);
+ /* Regular tablespaces are flushed too, unless being truncated. */
+ if (space->purpose == FIL_TYPE_TABLESPACE
+ && !space->is_being_truncated) {
+ goto do_flush;
+ }
+ break;
+ case SRV_TMP_SPACE_ID:
+ ut_ad(space->purpose == FIL_TYPE_TEMPORARY);
+ srv_tmp_space.set_last_file_size(pages_in_MiB);
+ break;
+ }
+
+ return false;
+}
+
+/** Prepare an already referenced tablespace for I/O: open its last file
+if needed and apply a pending extension request (recv_size).
+The caller must hold fil_system.mutex, which may be released and
+reacquired while the file is being extended.
+If the file cannot be opened, the reference is released here.
+The closing flag is cleared only if the file is not deferred and no
+newer recv_size request arrived during the extension.
+@return whether the file is usable for io() */
+ATTRIBUTE_COLD bool fil_space_t::prepare_acquired()
+{
+ ut_ad(referenced());
+ mysql_mutex_assert_owner(&fil_system.mutex);
+ fil_node_t *node= UT_LIST_GET_LAST(chain);
+ ut_ad(!id || purpose == FIL_TYPE_TEMPORARY ||
+ node == UT_LIST_GET_FIRST(chain));
+
+ const bool is_open= node && (node->is_open() || fil_node_open_file(node));
+
+ if (!is_open)
+ release();
+ else if (node->deferred);
+ else if (auto desired_size= recv_size)
+ {
+ bool success;
+ /* A true return value means that fil_system.mutex was released and
+ the attempt has to be repeated. */
+ while (fil_space_extend_must_retry(this, node, desired_size, &success))
+ mysql_mutex_lock(&fil_system.mutex);
+
+ mysql_mutex_assert_owner(&fil_system.mutex);
+ /* Crash recovery requires the file extension to succeed. */
+ ut_a(success);
+ /* InnoDB data files cannot shrink. */
+ ut_a(size >= desired_size);
+ if (desired_size > committed_size)
+ committed_size= desired_size;
+
+ /* There could be multiple concurrent I/O requests for this
+ tablespace (multiple threads trying to extend this tablespace).
+
+ Also, fil_space_set_recv_size_and_flags() may have been invoked
+ again during the file extension while fil_system.mutex was not
+ being held by us.
+
+ Only if recv_size matches what we read originally, reset the
+ field. In this way, a subsequent I/O request will handle any
+ pending fil_space_set_recv_size_and_flags(). */
+
+ if (desired_size == recv_size)
+ {
+ recv_size= 0;
+ goto clear;
+ }
+ }
+ else
+ /* The label is the body of the final else, so clear_closing() is reached
+ either from that else (recv_size == 0) or through the goto above. */
+clear:
+ clear_closing();
+
+ return is_open;
+}
+
+/** Acquire a reference and, if the tablespace is marked as closing,
+prepare it for I/O with prepare_acquired().
+fil_system.mutex is acquired and released internally.
+@return whether the file is usable for io() */
+ATTRIBUTE_COLD bool fil_space_t::acquire_and_prepare()
+{
+  mysql_mutex_lock(&fil_system.mutex);
+
+  const auto state = acquire_low() & (STOPPING | CLOSING);
+  bool usable;
+
+  if (!state) {
+    /* Neither stopping nor closing: nothing to prepare. */
+    usable = true;
+  } else if (state == CLOSING) {
+    usable = prepare_acquired();
+  } else {
+    /* STOPPING is set. */
+    usable = false;
+  }
+
+  mysql_mutex_unlock(&fil_system.mutex);
+  return usable;
+}
+
+/** Try to extend a tablespace if it is smaller than the specified size.
+Nothing is attempted (and false is returned) if the tablespace can
+neither be acquired nor is being truncated.
+@param[in,out] space tablespace
+@param[in] size desired size in pages
+@return whether the tablespace is at least as big as requested */
+bool fil_space_extend(fil_space_t *space, uint32_t size)
+{
+ ut_ad(!srv_read_only_mode || space->purpose == FIL_TYPE_TEMPORARY);
+ bool success= false;
+ const bool acquired= space->acquire();
+ mysql_mutex_lock(&fil_system.mutex);
+ if (acquired || space->is_being_truncated)
+ {
+ /* fil_space_extend_must_retry() returns true after having released
+ fil_system.mutex; reacquire it before the next attempt. */
+ while (fil_space_extend_must_retry(space, UT_LIST_GET_LAST(space->chain),
+ size, &success))
+ mysql_mutex_lock(&fil_system.mutex);
+ }
+ mysql_mutex_unlock(&fil_system.mutex);
+ /* Release only the reference that was obtained above. */
+ if (acquired)
+ space->release();
+ return success;
+}
+
+/** Prepare to free a file from fil_system.
+The caller must hold fil_system.mutex; it is released and reacquired
+here while waiting for pending references on the tablespace to go away.
+Unlike close(), this function does not adjust fil_system.n_open.
+@param detach_handle whether to return the open handle instead of
+closing it
+@return the detached handle
+@retval OS_FILE_CLOSED if the file was closed here or was not open */
+inline pfs_os_file_t fil_node_t::close_to_free(bool detach_handle)
+{
+ mysql_mutex_assert_owner(&fil_system.mutex);
+ ut_a(!being_extended);
+
+ /* Set the CLOSING flag; if the previous value shows pending operations,
+ wait (polling without the mutex) until nothing references the space. */
+ if (is_open() &&
+ (space->n_pending.fetch_or(fil_space_t::CLOSING,
+ std::memory_order_acquire) &
+ fil_space_t::PENDING))
+ {
+ mysql_mutex_unlock(&fil_system.mutex);
+ while (space->referenced())
+ std::this_thread::sleep_for(std::chrono::microseconds(100));
+ mysql_mutex_lock(&fil_system.mutex);
+ }
+
+ /* Executed at most once: every path through the body returns or breaks. */
+ while (is_open())
+ {
+ if (space->is_in_unflushed_spaces)
+ {
+ ut_ad(srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC);
+ space->is_in_unflushed_spaces= false;
+ fil_system.unflushed_spaces.remove(*space);
+ }
+
+ ut_a(!being_extended);
+ if (detach_handle)
+ {
+ auto result= handle;
+ handle= OS_FILE_CLOSED;
+ return result;
+ }
+ bool ret= os_file_close(handle);
+ ut_a(ret);
+ handle= OS_FILE_CLOSED;
+ break;
+ }
+
+ return OS_FILE_CLOSED;
+}
+
+/** Detach a tablespace from the cache and close the files.
+The tablespace is removed from the spaces hash table, from
+unflushed_spaces and default_encrypt_tables (if present) and from
+space_list; sys_space or temp_space is reset if it points to this
+tablespace. The caller must hold fil_system.mutex.
+@param space tablespace
+@param detach_handle whether to detach the handle, instead of closing
+@return detached handle
+@retval OS_FILE_CLOSED if no handle was detached */
+pfs_os_file_t fil_system_t::detach(fil_space_t *space, bool detach_handle)
+{
+ mysql_mutex_assert_owner(&fil_system.mutex);
+ HASH_DELETE(fil_space_t, hash, &spaces, space->id, space);
+
+ if (space->is_in_unflushed_spaces)
+ {
+ ut_ad(srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC);
+ space->is_in_unflushed_spaces= false;
+ unflushed_spaces.remove(*space);
+ }
+
+ if (space->is_in_default_encrypt)
+ {
+ space->is_in_default_encrypt= false;
+ default_encrypt_tables.remove(*space);
+ }
+
+ {
+ /* If space_list_last_opened points to this tablespace, move it to the
+ preceding list element (or reset it) before erasing the element. */
+ space_list_t::iterator s= space_list_t::iterator(space);
+ if (space_list_last_opened == space)
+ {
+ if (s == space_list.begin())
+ {
+ ut_ad(srv_operation > SRV_OPERATION_EXPORT_RESTORED ||
+ srv_shutdown_state > SRV_SHUTDOWN_NONE);
+ space_list_last_opened= nullptr;
+ }
+ else
+ {
+ space_list_t::iterator prev= s;
+ space_list_last_opened= &*--prev;
+ }
+ }
+ space_list.erase(s);
+ }
+
+ if (space == sys_space)
+ sys_space= nullptr;
+ else if (space == temp_space)
+ temp_space= nullptr;
+
+ /* Adjust the open file count here; close_to_free() below does not. */
+ for (fil_node_t* node= UT_LIST_GET_FIRST(space->chain); node;
+ node= UT_LIST_GET_NEXT(chain, node))
+ if (node->is_open())
+ {
+ ut_ad(n_open > 0);
+ n_open--;
+ }
+
+ ut_ad(!detach_handle || space->id);
+ ut_ad(!detach_handle || UT_LIST_GET_LEN(space->chain) <= 1);
+
+ pfs_os_file_t handle= OS_FILE_CLOSED;
+
+ /* The result of the last file is returned; with detach_handle set, the
+ assertion above expects at most one file in the chain. */
+ for (fil_node_t* node= UT_LIST_GET_FIRST(space->chain); node;
+ node= UT_LIST_GET_NEXT(chain, node))
+ handle= node->close_to_free(detach_handle);
+
+ ut_ad(!space->referenced());
+ return handle;
+}
+
+/** Release the memory of a tablespace object on which
+fil_system_t::detach() was invoked.
+There must not be any pending i/o's or flushes on the files.
+@param[in,out] space tablespace to be freed */
+static void fil_space_free_low(fil_space_t *space)
+{
+  /* The tablespace must not be in fil_system.named_spaces. */
+  ut_ad(srv_fast_shutdown == 2 || !srv_was_started || space->max_lsn == 0);
+
+  /* After fil_system_t::detach(), the tablespace cannot be found, so
+  fil_space_t::get() would return NULL. It suffices to wait for
+  fil_space_t::release() of the references that already exist. */
+  while (space->referenced())
+    std::this_thread::sleep_for(std::chrono::microseconds(100));
+
+  /* Free every file node together with its name. */
+  fil_node_t *node= UT_LIST_GET_FIRST(space->chain);
+  while (node != NULL)
+  {
+    fil_node_t *const next= UT_LIST_GET_NEXT(chain, node);
+    ut_d(space->size-= node->size);
+    ut_free(node->name);
+    ut_free(node);
+    node= next;
+  }
+
+  ut_ad(space->size == 0);
+
+  fil_space_destroy_crypt_data(&space->crypt_data);
+
+  space->~fil_space_t();
+  ut_free(space);
+}
+
+/** Remove a tablespace object from the tablespace memory cache and free it.
+The files in the chain are closed but not deleted.
+There must not be any pending i/o's or flushes on the files.
+@param id          tablespace identifier
+@param x_latched   whether the caller holds exclusive fil_space_t::latch
+@return whether the tablespace was found and freed */
+bool fil_space_free(uint32_t id, bool x_latched)
+{
+  ut_ad(id != TRX_SYS_SPACE);
+
+  mysql_mutex_lock(&fil_system.mutex);
+  fil_space_t *const space= fil_space_get_by_id(id);
+
+  if (space == NULL)
+  {
+    mysql_mutex_unlock(&fil_system.mutex);
+    return false;
+  }
+
+  fil_system.detach(space);
+  mysql_mutex_unlock(&fil_system.mutex);
+
+  if (x_latched)
+    space->x_unlock();
+
+  /* Outside of recovery, log_sys.latch is acquired here; during
+  recovery the caller is expected to be holding it already. */
+  const bool acquire_log_latch= !recv_recovery_is_on();
+
+  if (acquire_log_latch)
+    log_sys.latch.wr_lock(SRW_LOCK_CALL);
+  else
+  {
+#ifndef SUX_LOCK_GENERIC
+    ut_ad(log_sys.latch.is_write_locked());
+#endif
+  }
+
+  if (space->max_lsn)
+  {
+    ut_d(space->max_lsn= 0);
+    fil_system.named_spaces.remove(*space);
+  }
+
+  if (acquire_log_latch)
+    log_sys.latch.wr_unlock();
+
+  fil_space_free_low(space);
+  return true;
+}
+
+/** Create a tablespace in fil_system.
+The caller must hold fil_system.mutex. The mutex may be temporarily
+released in order to signal the encryption threads.
+@param id tablespace identifier
+@param flags tablespace flags
+@param purpose tablespace purpose
+@param crypt_data encryption information
+@param mode encryption mode
+@param opened true if space files are opened
+@return pointer to created tablespace, to be filled in with add()
+@retval nullptr on failure (such as when the same tablespace exists) */
+fil_space_t *fil_space_t::create(uint32_t id, uint32_t flags,
+ fil_type_t purpose,
+ fil_space_crypt_t *crypt_data,
+ fil_encryption_t mode,
+ bool opened)
+{
+ fil_space_t* space;
+
+ mysql_mutex_assert_owner(&fil_system.mutex);
+ ut_ad(fil_system.is_initialised());
+ ut_ad(fil_space_t::is_valid_flags(flags & ~FSP_FLAGS_MEM_MASK, id));
+ ut_ad(srv_page_size == UNIV_PAGE_SIZE_ORIG || flags != 0);
+
+ DBUG_EXECUTE_IF("fil_space_create_failure", return(NULL););
+
+ /* FIXME: if calloc() is defined as an inline function that calls
+ memset() or bzero(), then GCC 6 -flifetime-dse can optimize it away */
+ space= new (ut_zalloc_nokey(sizeof(*space))) fil_space_t;
+
+ space->id = id;
+
+ UT_LIST_INIT(space->chain, &fil_node_t::chain);
+
+ space->purpose = purpose;
+ space->flags = flags;
+
+ space->crypt_data = crypt_data;
+ /* Initialise the reference word with only the CLOSING flag set. */
+ space->n_pending.store(CLOSING, std::memory_order_relaxed);
+
+ DBUG_LOG("tablespace", "Created metadata for " << id);
+ if (crypt_data) {
+ DBUG_LOG("crypt",
+ "Tablespace " << id
+ << " encryption " << crypt_data->encryption
+ << " key id " << crypt_data->key_id
+ << ":" << fil_crypt_get_mode(crypt_data)
+ << " " << fil_crypt_get_type(crypt_data));
+ }
+
+ space->latch.SRW_LOCK_INIT(fil_space_latch_key);
+
+ /* Refuse to register a second tablespace with the same identifier.
+ The freshly allocated object is destroyed; crypt_data is not freed
+ here and remains owned by the caller. */
+ if (const fil_space_t *old_space = fil_space_get_by_id(id)) {
+ ib::error() << "Trying to add tablespace with id " << id
+ << " to the cache, but tablespace '"
+ << (old_space->chain.start
+ ? old_space->chain.start->name
+ : "")
+ << "' already exists in the cache!";
+ space->~fil_space_t();
+ ut_free(space);
+ return(NULL);
+ }
+
+ HASH_INSERT(fil_space_t, hash, &fil_system.spaces, id, space);
+
+ /* A tablespace whose files are open is inserted right after the
+ last opened one; otherwise it is appended to the end of the list. */
+ if (opened)
+ fil_system.add_opened_last_to_space_list(space);
+ else
+ fil_system.space_list.push_back(*space);
+
+ switch (id) {
+ case 0:
+ ut_ad(!fil_system.sys_space);
+ fil_system.sys_space = space;
+ break;
+ case SRV_TMP_SPACE_ID:
+ ut_ad(!fil_system.temp_space);
+ fil_system.temp_space = space;
+ break;
+ default:
+ /* Any other tablespace: keep fil_system.max_assigned_id
+ up to date, except during backup. */
+ ut_ad(purpose != FIL_TYPE_TEMPORARY);
+ if (UNIV_LIKELY(id <= fil_system.max_assigned_id)) {
+ break;
+ }
+ if (UNIV_UNLIKELY(srv_operation == SRV_OPERATION_BACKUP)) {
+ break;
+ }
+ if (!fil_system.space_id_reuse_warned) {
+ ib::warn() << "Allocated tablespace ID " << id
+ << ", old maximum was "
+ << fil_system.max_assigned_id;
+ }
+
+ fil_system.max_assigned_id = id;
+ }
+
+ /* Decide whether the tablespace must be added to the list of
+ tablespaces that use the default encryption settings. */
+ const bool rotate = purpose == FIL_TYPE_TABLESPACE
+ && (mode == FIL_ENCRYPTION_ON || mode == FIL_ENCRYPTION_OFF
+ || srv_encrypt_tables)
+ && fil_crypt_must_default_encrypt();
+
+ if (rotate) {
+ fil_system.default_encrypt_tables.push_back(*space);
+ space->is_in_default_encrypt = true;
+
+ if (srv_n_fil_crypt_threads_started) {
+ /* fil_system.mutex is released while signalling
+ the encryption threads. */
+ mysql_mutex_unlock(&fil_system.mutex);
+ fil_crypt_threads_signal();
+ mysql_mutex_lock(&fil_system.mutex);
+ }
+ }
+
+ return(space);
+}
+
+/*******************************************************************//**
+Assigns a new space id for a new single-table tablespace. This works simply by
+incrementing the global counter fil_system.max_assigned_id. If 4 billion id's
+is not enough, we may need to recycle id's.
+@param[in,out] space_id in: a candidate id; the assigned id will be greater
+than both this value and the current counter;
+out: the assigned id, or UINT32_MAX on failure
+@return true if assigned, false if not */
+bool fil_assign_new_space_id(uint32_t *space_id)
+{
+  uint32_t id= *space_id;
+
+  mysql_mutex_lock(&fil_system.mutex);
+
+  if (id < fil_system.max_assigned_id)
+    id= fil_system.max_assigned_id;
+
+  /* Advance the counter only while it is below the upper bound.
+  An unconditional increment would wrap UINT32_MAX (the value that
+  this function stores in *space_id on failure) around to 0, which
+  would be reported as success and would reset the counter. */
+  if (id < SRV_SPACE_ID_UPPER_BOUND)
+    id++;
+
+  if (id > (SRV_SPACE_ID_UPPER_BOUND / 2) && (id % 1000000UL == 0))
+  {
+    ib::warn() << "You are running out of new single-table"
+      " tablespace id's. Current counter is " << id
+      << " and it must not exceed " << SRV_SPACE_ID_UPPER_BOUND
+      << "! To reset the counter to zero you have to dump"
+      " all your tables and recreate the whole InnoDB"
+      " installation.";
+  }
+
+  const bool success= id < SRV_SPACE_ID_UPPER_BOUND;
+
+  if (success)
+  {
+    *space_id= fil_system.max_assigned_id= id;
+  }
+  else
+  {
+    ib::warn() << "You have run out of single-table tablespace"
+      " id's! Current counter is " << id
+      << ". To reset the counter to zero"
+      " you have to dump all your tables and"
+      " recreate the whole InnoDB installation.";
+    *space_id= UINT32_MAX;
+  }
+
+  mysql_mutex_unlock(&fil_system.mutex);
+
+  return success;
+}
+
+/** Read the first page of a data file, unless the size of the
+tablespace is already known.
+The caller must hold fil_system.mutex.
+@return whether the page was found valid */
+bool fil_space_t::read_page0()
+{
+  ut_ad(fil_system.is_initialised());
+  mysql_mutex_assert_owner(&fil_system.mutex);
+
+  /* A nonzero size means that there is nothing left to do. */
+  if (size)
+    return true;
+
+  fil_node_t *const first= UT_LIST_GET_FIRST(chain);
+  if (!first)
+    return false;
+  ut_ad(!UT_LIST_GET_NEXT(chain, first));
+
+  /* Hold a reference on the tablespace while opening the file. */
+  const auto n= acquire_low();
+  if (UNIV_UNLIKELY(n & STOPPING))
+  {
+    ut_ad("this should not happen" == 0);
+    return false;
+  }
+
+  bool valid= first->is_open();
+  if (!valid)
+    valid= fil_node_open_file(first);
+  release();
+  return valid;
+}
+
+/** Look up a tablespace and ensure that its first page has been validated.
+@param id tablespace identifier
+@return the tablespace
+@retval nullptr if the tablespace was not found or
+fil_space_t::read_page0() failed */
+static fil_space_t *fil_space_get_space(uint32_t id)
+{
+  fil_space_t *const found= fil_space_get_by_id(id);
+  return found && found->read_page0() ? found : nullptr;
+}
+
+/** Set the recovered size and the flags of a tablespace, provided that
+the tablespace can be found and its first page is valid.
+@param id     tablespace identifier
+@param size   recovered size; 0 leaves recv_size unchanged
+@param flags  tablespace flags; FSP_FLAGS_FCRC32_MASK_MARKER leaves
+              the flags unchanged */
+void fil_space_set_recv_size_and_flags(uint32_t id, uint32_t size,
+                                       uint32_t flags)
+{
+  ut_ad(id < SRV_SPACE_ID_UPPER_BOUND);
+
+  mysql_mutex_lock(&fil_system.mutex);
+
+  fil_space_t *const space= fil_space_get_space(id);
+  if (space != nullptr)
+  {
+    if (size != 0)
+      space->recv_size= size;
+
+    if (flags != FSP_FLAGS_FCRC32_MASK_MARKER)
+      space->flags= flags;
+  }
+
+  mysql_mutex_unlock(&fil_system.mutex);
+}
+
+/** Open each file. Never invoked on .ibd files.
+Acquires and releases fil_system.mutex.
+@param create_new_db whether to skip the call to fil_node_t::read_page0()
+@return whether all files were opened */
+bool fil_space_t::open(bool create_new_db)
+{
+ ut_ad(fil_system.is_initialised());
+ ut_ad(!id || create_new_db);
+
+ bool success= true;
+ /* Becomes true once page 0 has been read from one file; the
+ remaining files then only contribute their size. */
+ bool skip_read= create_new_db;
+
+ mysql_mutex_lock(&fil_system.mutex);
+
+ for (fil_node_t *node= UT_LIST_GET_FIRST(chain); node;
+ node= UT_LIST_GET_NEXT(chain, node))
+ {
+ if (!node->is_open() && !fil_node_open_file_low(node))
+ {
+ /* Also the target of the goto below: give up on the first failure. */
+err_exit:
+ success= false;
+ break;
+ }
+
+ if (create_new_db)
+ {
+ node->find_metadata(node->handle);
+ continue;
+ }
+ if (skip_read)
+ {
+ size+= node->size;
+ continue;
+ }
+
+ if (!node->read_page0())
+ {
+ /* Undo the opening of this file before reporting the failure. */
+ fil_system.n_open--;
+ os_file_close(node->handle);
+ node->handle= OS_FILE_CLOSED;
+ goto err_exit;
+ }
+
+ skip_read= true;
+ }
+
+ if (!create_new_db)
+ committed_size= size;
+ mysql_mutex_unlock(&fil_system.mutex);
+ return success;
+}
+
+/** Close each open file of the tablespace.
+Expected to be invoked on fil_system.temp_space, or while executing a
+backup or restore operation (see the assertion below).
+Does nothing if fil_system has not been initialised. */
+void fil_space_t::close()
+{
+  if (!fil_system.is_initialised())
+    return;
+
+  mysql_mutex_lock(&fil_system.mutex);
+  ut_ad(this == fil_system.temp_space
+        || srv_operation == SRV_OPERATION_BACKUP
+        || srv_operation == SRV_OPERATION_RESTORE
+        || srv_operation == SRV_OPERATION_RESTORE_DELTA);
+
+  fil_node_t *node= UT_LIST_GET_FIRST(chain);
+  while (node != NULL)
+  {
+    if (node->is_open())
+      node->close();
+    node= UT_LIST_GET_NEXT(chain, node);
+  }
+
+  mysql_mutex_unlock(&fil_system.mutex);
+}
+
+/** Initialise fil_system: the mutex, the hash table of tablespaces and
+the tablespace encryption subsystem. On Linux, additionally collect in
+ssd the device numbers of all block devices under /sys/block whose
+queue/rotational attribute reads "0".
+@param hash_size size passed to the creation of the tablespace hash table;
+must be greater than 0 */
+void fil_system_t::create(ulint hash_size)
+{
+ ut_ad(this == &fil_system);
+ ut_ad(!is_initialised());
+ ut_ad(!(srv_page_size % FSP_EXTENT_SIZE));
+ ut_ad(srv_page_size);
+ ut_ad(!spaces.array);
+
+ m_initialised = true;
+
+ compile_time_assert(!(UNIV_PAGE_SIZE_MAX % FSP_EXTENT_SIZE_MAX));
+ compile_time_assert(!(UNIV_PAGE_SIZE_MIN % FSP_EXTENT_SIZE_MIN));
+
+ ut_ad(hash_size > 0);
+
+ mysql_mutex_init(fil_system_mutex_key, &mutex, nullptr);
+
+ spaces.create(hash_size);
+
+ fil_space_crypt_init();
+#ifdef __linux__
+ ssd.clear();
+ /* fn holds "/sys/block/" followed by a device name and an attribute
+ path. sizeof "/sys/block" (which counts the terminating NUL) equals
+ the length of "/sys/block/", so the memcpy() below copies the prefix
+ including the trailing slash, and fnp points right after it. */
+ char fn[sizeof(dirent::d_name)
+ + sizeof "/sys/block/" "/queue/rotational"];
+ const size_t sizeof_fnp = (sizeof fn) - sizeof "/sys/block";
+ memcpy(fn, "/sys/block/", sizeof "/sys/block");
+ char* fnp = &fn[sizeof "/sys/block"];
+
+ if (DIR* d = opendir("/sys/block")) {
+ while (struct dirent* e = readdir(d)) {
+ if (e->d_name[0] == '.') {
+ continue;
+ }
+ /* Only consider devices whose "rotational" attribute
+ is exactly "0\n". */
+ snprintf(fnp, sizeof_fnp, "%s/queue/rotational",
+ e->d_name);
+ int f = open(fn, O_RDONLY);
+ if (f == -1) {
+ continue;
+ }
+ char b[sizeof "4294967295:4294967295\n"];
+ ssize_t l = read(f, b, sizeof b);
+ ::close(f);
+ if (l != 2 || memcmp("0\n", b, 2)) {
+ continue;
+ }
+ /* Read the device number in the form "major:minor\n". */
+ snprintf(fnp, sizeof_fnp, "%s/dev", e->d_name);
+ f = open(fn, O_RDONLY);
+ if (f == -1) {
+ continue;
+ }
+ l = read(f, b, sizeof b);
+ ::close(f);
+ if (l <= 0 || b[l - 1] != '\n') {
+ continue;
+ }
+ /* Replace the newline with a terminating NUL, so that
+ the buffer can be parsed as a string. */
+ b[l - 1] = '\0';
+ char* end = b;
+ unsigned long dev_major = strtoul(b, &end, 10);
+ if (b == end || *end != ':'
+ || dev_major != unsigned(dev_major)) {
+ continue;
+ }
+ char* c = end + 1;
+ unsigned long dev_minor = strtoul(c, &end, 10);
+ if (c == end || *end
+ || dev_minor != unsigned(dev_minor)) {
+ continue;
+ }
+ ssd.push_back(makedev(unsigned(dev_major),
+ unsigned(dev_minor)));
+ }
+ closedir(d);
+ }
+ /* fil_system_t::is_ssd() assumes the following */
+ ut_ad(makedev(0, 8) == 8);
+ ut_ad(makedev(0, 4) == 4);
+ ut_ad(makedev(0, 2) == 2);
+ ut_ad(makedev(0, 1) == 1);
+#endif
+}
+
+/** Release the resources of fil_system.
+All tablespaces must have been removed from fil_system beforehand
+(see the assertions). Does not touch the mutex, the hash table or the
+encryption subsystem if fil_system was never initialised. */
+void fil_system_t::close()
+{
+  ut_ad(this == &fil_system);
+  ut_a(unflushed_spaces.empty());
+  ut_a(space_list.empty());
+  ut_ad(!sys_space);
+  ut_ad(!temp_space);
+
+  const bool was_initialised= is_initialised();
+
+  if (was_initialised)
+  {
+    m_initialised= false;
+    spaces.free();
+    mysql_mutex_destroy(&mutex);
+    fil_space_crypt_cleanup();
+  }
+
+  ut_ad(!spaces.array);
+
+#ifdef __linux__
+  /* Discard the collected device numbers and release their memory. */
+  ssd.clear();
+  ssd.shrink_to_fit();
+#endif /* __linux__ */
+}
+
+/** Insert a tablespace into space_list right after the tablespace that
+was opened last (or at the front of the list if there is none), and
+remember it as the last opened one.
+@param space tablespace to insert */
+void fil_system_t::add_opened_last_to_space_list(fil_space_t *space)
+{
+  if (UNIV_LIKELY(space_list_last_opened != nullptr))
+  {
+    space_list_t::iterator after(space_list_last_opened);
+    ++after;
+    space_list.insert(after, *space);
+  }
+  else
+  {
+    space_list.push_front(*space);
+  }
+
+  space_list_last_opened= space;
+}
+
+/** Extend all open data files to the recovered size.
+Acquires and releases fil_system.mutex. */
+ATTRIBUTE_COLD void fil_system_t::extend_to_recv_size()
+{
+ ut_ad(is_initialised());
+ mysql_mutex_lock(&mutex);
+ for (fil_space_t &space : fil_system.space_list)
+ {
+ const uint32_t size= space.recv_size;
+
+ /* Only tablespaces whose recovered size exceeds the current size
+ need to be extended. */
+ if (size > space.size)
+ {
+ if (space.is_closing())
+ continue;
+ /* Hold a reference on the tablespace while extending its last file. */
+ space.reacquire();
+ bool success;
+ /* NOTE(review): the mutex is acquired again before each retry, which
+ suggests that fil_space_extend_must_retry() releases fil_system.mutex
+ before returning; confirm against its definition. */
+ while (fil_space_extend_must_retry(&space, UT_LIST_GET_LAST(space.chain),
+ size, &success))
+ mysql_mutex_lock(&mutex);
+ /* Crash recovery requires the file extension to succeed. */
+ ut_a(success);
+ space.release();
+ }
+ }
+ mysql_mutex_unlock(&mutex);
+}
+
+/** Close all tablespace files at shutdown, and detach and free every
+tablespace object in fil_system.space_list.
+Does nothing if fil_system has not been initialised. */
+void fil_space_t::close_all()
+{
+ if (!fil_system.is_initialised())
+ return;
+
+ /* At shutdown, we should not have any files in this list. */
+ ut_ad(srv_fast_shutdown == 2 || !srv_was_started ||
+ fil_system.named_spaces.empty());
+ fil_flush_file_spaces();
+
+ mysql_mutex_lock(&fil_system.mutex);
+
+ /* Process the first tablespace of the list until the list is empty;
+ fil_system.detach() removes the tablespace from the list. */
+ while (!fil_system.space_list.empty())
+ {
+ fil_space_t &space= fil_system.space_list.front();
+
+ for (fil_node_t *node= UT_LIST_GET_FIRST(space.chain); node != NULL;
+ node= UT_LIST_GET_NEXT(chain, node))
+ {
+
+ if (!node->is_open())
+ {
+ /* Jump target for proceeding to the next file of the tablespace. */
+ next:
+ continue;
+ }
+
+ /* Try up to 10000 times, sleeping 100 microseconds between attempts,
+ to close the file once there are no pending operations on it. */
+ for (ulint count= 10000; count--;)
+ {
+ const auto n= space.set_closing();
+ if (n & STOPPING)
+ goto next;
+ if (!(n & (PENDING | NEEDS_FSYNC)))
+ {
+ node->close();
+ goto next;
+ }
+ /* Release the mutex while sleeping, and re-check the state of
+ the file after reacquiring it. */
+ mysql_mutex_unlock(&fil_system.mutex);
+ std::this_thread::sleep_for(std::chrono::microseconds(100));
+ mysql_mutex_lock(&fil_system.mutex);
+ if (!node->is_open())
+ goto next;
+ }
+
+ /* All attempts were used up without closing the file. */
+ ib::error() << "File '" << node->name << "' has " << space.referenced()
+ << " operations";
+ }
+
+ /* fil_space_free_low() is invoked without holding the mutex. */
+ fil_system.detach(&space);
+ mysql_mutex_unlock(&fil_system.mutex);
+ fil_space_free_low(&space);
+ mysql_mutex_lock(&fil_system.mutex);
+ }
+
+ mysql_mutex_unlock(&fil_system.mutex);
+
+ ut_ad(srv_fast_shutdown == 2 || !srv_was_started ||
+ fil_system.named_spaces.empty());
+}
+
+/** Raise the maximum assigned tablespace identifier
+(fil_system.max_assigned_id) to the given value, if the given value is
+bigger than the current one.
+@param max_id  candidate for the maximum tablespace identifier;
+               must be less than SRV_SPACE_ID_UPPER_BOUND */
+void fil_set_max_space_id_if_bigger(uint32_t max_id)
+{
+  ut_a(max_id < SRV_SPACE_ID_UPPER_BOUND);
+
+  mysql_mutex_lock(&fil_system.mutex);
+  if (max_id > fil_system.max_assigned_id)
+    fil_system.max_assigned_id= max_id;
+  mysql_mutex_unlock(&fil_system.mutex);
+}
+
+/** Look up a tablespace and acquire a reference to it.
+@param id tablespace identifier
+@return tablespace
+@retval nullptr if the tablespace is missing or inaccessible */
+fil_space_t *fil_space_t::get(uint32_t id)
+{
+  mysql_mutex_lock(&fil_system.mutex);
+
+  fil_space_t *space= fil_space_get_by_id(id);
+  if (space)
+  {
+    const uint32_t n= space->acquire_low();
+    /* A stopping tablespace is never returned. A closing one is
+    only returned if prepare_acquired() succeeds. */
+    if ((n & STOPPING) || ((n & CLOSING) && !space->prepare_acquired()))
+      space= nullptr;
+  }
+
+  mysql_mutex_unlock(&fil_system.mutex);
+  return space;
+}
+
+/** Write a log record about a file operation.
+@param type file operation
+@param space_id tablespace identifier
+@param path file path
+@param new_path new file path for type=FILE_RENAME */
+inline void mtr_t::log_file_op(mfile_type_t type, uint32_t space_id,
+ const char *path, const char *new_path)
+{
+ ut_ad((new_path != nullptr) == (type == FILE_RENAME));
+ ut_ad(!(byte(type) & 15));
+
+ /* fil_name_parse() requires that there be at least one path
+ separator and that the file path end with ".ibd". */
+ ut_ad(strchr(path, '/'));
+ ut_ad(!strcmp(&path[strlen(path) - strlen(DOT_IBD)], DOT_IBD));
+
+ m_modifications= true;
+ if (!is_logged())
+ return;
+ m_last= nullptr;
+
+ const size_t len= strlen(path);
+ /* For FILE_RENAME, new_len covers new_path plus one more byte: the
+ NUL that terminates path and separates the two names in the record. */
+ const size_t new_len= type == FILE_RENAME ? 1 + strlen(new_path) : 0;
+ ut_ad(len > 0);
+ /* Reserve room for the largest possible record header. */
+ byte *const log_ptr= m_log.open(1 + 3/*length*/ + 5/*space_id*/ +
+ 1/*page_no=0*/);
+ /* First, lay out the header as if the length fit in the low 4 bits
+ of the first byte: space_id and page_no=0 directly follow that byte. */
+ byte *end= log_ptr + 1;
+ end= mlog_encode_varint(end, space_id);
+ *end++= 0;
+ if (UNIV_LIKELY(end + len + new_len >= &log_ptr[16]))
+ {
+ /* The record is too long for that: store only the type in the first
+ byte, followed by an explicitly encoded length, and then write
+ space_id and page_no=0 again after the length. */
+ *log_ptr= type;
+ size_t total_len= len + new_len + end - log_ptr - 15;
+ if (total_len >= MIN_3BYTE)
+ total_len+= 2;
+ else if (total_len >= MIN_2BYTE)
+ total_len++;
+ end= mlog_encode_varint(log_ptr + 1, total_len);
+ end= mlog_encode_varint(end, space_id);
+ *end++= 0;
+ }
+ else
+ {
+ /* Short record: combine the type with the nonzero length of what
+ follows the first byte. */
+ *log_ptr= static_cast<byte>(type | (end + len + new_len - &log_ptr[1]));
+ ut_ad(*log_ptr & 15);
+ }
+
+ m_log.close(end);
+
+ if (type == FILE_RENAME)
+ {
+ ut_ad(strchr(new_path, '/'));
+ /* path is written including its terminating NUL; new_path is
+ written without one. */
+ m_log.push(reinterpret_cast<const byte*>(path), uint32_t(len + 1));
+ m_log.push(reinterpret_cast<const byte*>(new_path), uint32_t(new_len - 1));
+ }
+ else
+ m_log.push(reinterpret_cast<const byte*>(path), uint32_t(len));
+}
+
+/** Write a FILE_MODIFY log record for a data file.
+The tablespace must not be a predefined one.
+@param[in]     space_id  tablespace identifier
+@param[in]     name      name of the tablespace file
+@param[in,out] mtr       mini-transaction that the record is written to */
+static void fil_name_write(uint32_t space_id, const char *name, mtr_t *mtr)
+{
+  ut_ad(!is_predefined_tablespace(space_id));
+
+  mtr->log_file_op(FILE_MODIFY, space_id, name);
+}
+
+/** Drop a tablespace: flag it as stopping, delete its data file (after
+writing a FILE_DELETE log record, except for undo tablespaces), wait for
+pending operations to cease, and detach it from fil_system.
+The tablespace object itself is not freed here.
+@param id tablespace identifier; must not be a system tablespace
+@param detached_handle if not nullptr, receives the detached file handle;
+if nullptr, the handle is closed here
+@return the detached tablespace object
+@retval nullptr if the tablespace was not found, or another thread
+was already dropping it (after waiting for that to complete) */
+fil_space_t *fil_space_t::drop(uint32_t id, pfs_os_file_t *detached_handle)
+{
+ ut_a(!is_system_tablespace(id));
+ mysql_mutex_lock(&fil_system.mutex);
+ fil_space_t *space= fil_space_get_by_id(id);
+
+ if (!space)
+ {
+ mysql_mutex_unlock(&fil_system.mutex);
+ return nullptr;
+ }
+
+ if (space->pending() & STOPPING)
+ {
+ /* A thread executing DDL and another thread executing purge may
+ be executing fil_delete_tablespace() concurrently for the same
+ tablespace. Wait for the other thread to complete the operation. */
+ for (ulint count= 0;; count++)
+ {
+ space= fil_space_get_by_id(id);
+ ut_ad(!space || space->is_stopping());
+ mysql_mutex_unlock(&fil_system.mutex);
+ if (!space)
+ return nullptr;
+ /* Issue a warning every 10.24 seconds, starting after 2.56 seconds */
+ if ((count & 511) == 128)
+ sql_print_warning("InnoDB: Waiting for tablespace " UINT32PF
+ " to be deleted", id);
+ std::this_thread::sleep_for(std::chrono::milliseconds(20));
+ mysql_mutex_lock(&fil_system.mutex);
+ }
+ }
+
+ /* We must be the first one to set either STOPPING flag on the .ibd file,
+ because the flags are only being set here, within a critical section of
+ fil_system.mutex. */
+ /* A single atomic addition sets STOPPING_READS and takes one reference
+ (+ 1), which is given up again by the "- 1" further below. In non-debug
+ builds ut_d() expands to nothing, so the result is simply discarded. */
+ unsigned pending;
+ ut_d(pending=)
+ space->n_pending.fetch_add(STOPPING_READS + 1, std::memory_order_relaxed);
+ ut_ad(!(pending & STOPPING));
+ mysql_mutex_unlock(&fil_system.mutex);
+
+ if (space->crypt_data)
+ fil_space_crypt_close_tablespace(space);
+
+ if (space->purpose == FIL_TYPE_TABLESPACE)
+ {
+ /* An undo tablespace file is deleted without writing a log record. */
+ if (id >= srv_undo_space_id_start &&
+ id < srv_undo_space_id_start + srv_undo_tablespaces_open)
+ {
+ os_file_delete(innodb_data_file_key, space->chain.start->name);
+ goto deleted;
+ }
+
+ /* Before deleting the file, persistently write a log record. */
+ mtr_t mtr;
+ mtr.start();
+ mtr.log_file_op(FILE_DELETE, id, space->chain.start->name);
+ mtr.commit_file(*space, nullptr);
+
+ if (FSP_FLAGS_HAS_DATA_DIR(space->flags))
+ RemoteDatafile::delete_link_file(space->name());
+
+ os_file_delete(innodb_data_file_key, space->chain.start->name);
+ }
+ else
+ ut_ad(space->purpose == FIL_TYPE_IMPORT);
+
+ /* Delete the CFG file derived from the data file name, if one exists. */
+ if (char *cfg_name= fil_make_filepath(space->chain.start->name,
+ fil_space_t::name_type{}, CFG, false))
+ {
+ os_file_delete_if_exists(innodb_data_file_key, cfg_name, nullptr);
+ ut_free(cfg_name);
+ }
+
+ deleted:
+ mysql_mutex_lock(&fil_system.mutex);
+ ut_ad(space == fil_space_get_by_id(id));
+ /* Set STOPPING_WRITES and release the reference that was taken above;
+ fetch_add() returns the value before the addition. */
+ pending=
+ space->n_pending.fetch_add(STOPPING_WRITES - 1, std::memory_order_relaxed);
+ ut_ad((pending & STOPPING) == STOPPING_READS);
+ ut_ad(pending & PENDING);
+ pending&= PENDING;
+ /* If references other than our own existed, wait until none remain. */
+ if (--pending)
+ {
+ for (ulint count= 0;; count++)
+ {
+ ut_ad(space == fil_space_get_by_id(id));
+ pending= space->n_pending.load(std::memory_order_relaxed) & PENDING;
+ if (!pending)
+ break;
+ mysql_mutex_unlock(&fil_system.mutex);
+ /* Issue a warning every 10.24 seconds, starting after 2.56 seconds */
+ if ((count & 511) == 128)
+ sql_print_warning("InnoDB: Trying to delete tablespace '%s' "
+ "but there are %u pending operations",
+ space->chain.start->name, pending);
+ std::this_thread::sleep_for(std::chrono::milliseconds(20));
+ mysql_mutex_lock(&fil_system.mutex);
+ }
+ }
+
+ /* Detach the tablespace, keeping the file handle open so that it can
+ either be handed over to the caller or closed below. */
+ pfs_os_file_t handle= fil_system.detach(space, true);
+ mysql_mutex_unlock(&fil_system.mutex);
+ if (detached_handle)
+ *detached_handle = handle;
+ else
+ os_file_close(handle);
+ return space;
+}
+
+/** Close a single-table tablespace on failed IMPORT TABLESPACE.
+The tablespace must be cached in the memory cache.
+The tablespace is dropped with fil_space_t::drop() and its memory is freed.
+Does nothing if the tablespace is not found.
+@param id tablespace identifier; must not be a system tablespace */
+void fil_close_tablespace(uint32_t id)
+{
+  ut_ad(!is_system_tablespace(id));
+
+  fil_space_t *const space= fil_space_t::drop(id, nullptr);
+  if (space == nullptr)
+    return;
+
+  space->x_lock();
+  ut_ad(space->is_stopping());
+
+  /* Invalidate in the buffer pool all pages belonging to the
+  tablespace. Since space->is_stopping() holds, readahead
+  can no longer read more pages of this tablespace to buf_pool.
+  Thus we can clean the tablespace out of buf_pool
+  completely and permanently. */
+  while (buf_flush_list_space(space)) {}
+
+  space->x_unlock();
+
+  /* Remove the tablespace from fil_system.named_spaces if it is there. */
+  log_sys.latch.wr_lock(SRW_LOCK_CALL);
+  if (space->max_lsn)
+  {
+    ut_d(space->max_lsn= 0);
+    fil_system.named_spaces.remove(*space);
+  }
+  log_sys.latch.wr_unlock();
+
+  fil_space_free_low(space);
+}
+
+/** Delete a tablespace and associated .ibd file.
+@param id tablespace identifier; must not be a system tablespace
+@return detached file handle (to be closed by the caller)
+@retval OS_FILE_CLOSED if no file existed */
+pfs_os_file_t fil_delete_tablespace(uint32_t id)
+{
+  ut_ad(!is_system_tablespace(id));
+
+  pfs_os_file_t handle= OS_FILE_CLOSED;
+  fil_space_t *const space= fil_space_t::drop(id, &handle);
+
+  if (space)
+    fil_space_free_low(space);
+
+  return handle;
+}
+
+/*******************************************************************//**
+Allocates and builds a file name from a path, a table or tablespace name
+and a suffix. The string must be freed by caller with ut_free().
+@param[in] path NULL or the directory path or the full path and filename.
+@param[in] name {} if path is full, or Table/Tablespace name
+@param[in] ext the file extension to use
+@param[in] trim_name true if the last name on the path should be trimmed.
+@return own: file name
+@retval NULL if the memory allocation failed */
+char* fil_make_filepath(const char *path, const fil_space_t::name_type &name,
+ ib_extention ext, bool trim_name)
+{
+ /* The path may contain the basename of the file, if so we do not
+ need the name. If the path is NULL, we can use the default path,
+ but there needs to be a name. */
+ ut_ad(path || name.data());
+
+ /* If we are going to strip a name off the path, there better be a
+ path and a new name to put back on. */
+ ut_ad(!trim_name || (path && name.data()));
+
+ if (path == NULL) {
+ path = fil_path_to_mysql_datadir;
+ }
+
+ ulint len = 0; /* current length */
+ ulint path_len = strlen(path);
+ const char* suffix = dot_ext[ext];
+ ulint suffix_len = strlen(suffix);
+ /* Worst case: path, a separator, the name, the suffix, and the
+ terminating null byte. */
+ ulint full_len = path_len + 1 + name.size() + suffix_len + 1;
+
+ char* full_name = static_cast<char*>(ut_malloc_nokey(full_len));
+ if (full_name == NULL) {
+ return NULL;
+ }
+
+ /* If the name is a relative or absolute path, do not prepend "./". */
+ if (path[0] == '.'
+ && (path[1] == '\0' || path[1] == '/' IF_WIN(|| path[1] == '\\',))
+ && name.size() && (name.data()[0] == '.'
+ || is_absolute_path(name.data()))) {
+ path = NULL;
+ path_len = 0;
+ }
+
+ if (path != NULL) {
+ memcpy(full_name, path, path_len);
+ len = path_len;
+ }
+
+ full_name[len] = '\0';
+
+ if (trim_name) {
+ /* Find the offset of the last DIR separator and set it to
+ null in order to strip off the old basename from this path. */
+ char* last_dir_sep = strrchr(full_name, '/');
+#ifdef _WIN32
+ if (char *last = strrchr(full_name, '\\')) {
+ if (last > last_dir_sep) {
+ last_dir_sep = last;
+ }
+ }
+#endif
+ if (last_dir_sep) {
+ last_dir_sep[0] = '\0';
+ len = strlen(full_name);
+ }
+ }
+
+ if (name.size()) {
+ if (len && full_name[len - 1] != '/') {
+ /* Add a DIR separator */
+ full_name[len] = '/';
+ full_name[++len] = '\0';
+ }
+
+ char* ptr = &full_name[len];
+ memcpy(ptr, name.data(), name.size());
+ len += name.size();
+ full_name[len] = '\0';
+ }
+
+ /* Make sure that the specified suffix is at the end of the filepath
+ string provided. This assumes that the suffix starts with '.'.
+ If the first char of the suffix is found in the filepath at the same
+ length as the suffix from the end, then we will assume that there is
+ a previous suffix that needs to be replaced. */
+ /* NOTE(review): suffix was already passed to strlen() above, so it
+ cannot be NULL at this point; the condition is always true. */
+ if (suffix != NULL) {
+ /* Need room for the trailing null byte. */
+ ut_ad(len < full_len);
+
+ if ((len > suffix_len)
+ && (full_name[len - suffix_len] == suffix[0])) {
+ /* Another suffix exists, make it the one requested. */
+ /* The existing terminating null byte at full_name[len]
+ is retained, because the length does not change. */
+ memcpy(&full_name[len - suffix_len], suffix, suffix_len);
+
+ } else {
+ /* No previous suffix, add it. */
+ ut_ad(len + suffix_len < full_len);
+ memcpy(&full_name[len], suffix, suffix_len);
+ full_name[len + suffix_len] = '\0';
+ }
+ }
+
+ return(full_name);
+}
+
+/** Allocate and build a file name from a path, a table name and a suffix.
+Convenience overload that converts the table name and forwards to the
+overload that takes a fil_space_t::name_type.
+@param[in] path        NULL or the directory path or the full path and filename
+@param[in] name        table name
+@param[in] suffix      the file extension to use
+@param[in] strip_name  true if the last name on the path should be trimmed
+@return own: file name, to be freed with ut_free() */
+char *fil_make_filepath(const char* path, const table_name_t name,
+                        ib_extention suffix, bool strip_name)
+{
+  const fil_space_t::name_type space_name{name.m_name, strlen(name.m_name)};
+  return fil_make_filepath(path, space_name, suffix, strip_name);
+}
+
+/** Rename the data file of a tablespace that consists of a single file.
+@param path new file path; must contain a '/'
+@param log whether to write a FILE_RENAME log record and commit it with
+mtr_t::commit_file(), instead of renaming the file directly
+@param replace whether to skip the check that the target file does not
+exist (only applies when log holds)
+@return DB_SUCCESS or an error code */
+dberr_t fil_space_t::rename(const char *path, bool log, bool replace)
+{
+ ut_ad(UT_LIST_GET_LEN(chain) == 1);
+ ut_ad(!is_predefined_tablespace(id));
+
+ const char *old_path= chain.start->name;
+
+ ut_ad(strchr(old_path, '/'));
+ ut_ad(strchr(path, '/'));
+
+ /* Renaming a file to its current name is a no-op. */
+ if (!strcmp(path, old_path))
+ return DB_SUCCESS;
+
+ if (!log)
+ {
+ /* Rename the file directly and replace the cached file name. */
+ if (!os_file_rename(innodb_data_file_key, old_path, path))
+ return DB_ERROR;
+ mysql_mutex_lock(&fil_system.mutex);
+ ut_free(chain.start->name);
+ chain.start->name= mem_strdup(path);
+ mysql_mutex_unlock(&fil_system.mutex);
+ return DB_SUCCESS;
+ }
+
+ bool exists= false;
+ os_file_type_t ftype;
+
+ /* Check upfront if the rename operation might succeed, because we
+ must durably write redo log before actually attempting to execute
+ the rename in the file system. */
+ if (os_file_status(old_path, &exists, &ftype) && !exists)
+ {
+ sql_print_error("InnoDB: Cannot rename '%s' to '%s'"
+ " because the source file does not exist.",
+ old_path, path);
+ return DB_TABLESPACE_NOT_FOUND;
+ }
+
+ exists= false;
+ /* With replace, the check of the target file is skipped (the empty
+ statement is intentional). Otherwise, a failure of os_file_status()
+ is reported in the same way as an existing target file. */
+ if (replace);
+ else if (!os_file_status(path, &exists, &ftype) || exists)
+ {
+ sql_print_error("InnoDB: Cannot rename '%s' to '%s'"
+ " because the target file exists.",
+ old_path, path);
+ return DB_TABLESPACE_EXISTS;
+ }
+
+ /* NOTE(review): the file system rename is presumably carried out by
+ mtr_t::commit_file() after the log record has been written; confirm
+ against its definition. */
+ mtr_t mtr;
+ mtr.start();
+ mtr.log_file_op(FILE_RENAME, id, old_path, path);
+ return mtr.commit_file(*this, path) ? DB_SUCCESS : DB_ERROR;
+}
+
+/** Create a tablespace file.
+A FILE_CREATE log record is durably written before the file is created.
+On failure after the file was created, the file is closed and deleted
+again, and the encryption metadata that was allocated here is released.
+@param[in] space_id Tablespace ID
+@param[in] name Tablespace name in dbname/tablename format.
+@param[in] path Path and filename of the datafile to create.
+@param[in] flags Tablespace flags
+@param[in] size Initial size of the tablespace file in pages,
+must be >= FIL_IBD_FILE_INITIAL_SIZE
+@param[in] mode MariaDB encryption mode
+@param[in] key_id MariaDB encryption key_id
+@param[out] err DB_SUCCESS or error code
+@return the created tablespace
+@retval NULL on error */
+fil_space_t*
+fil_ibd_create(
+ uint32_t space_id,
+ const table_name_t name,
+ const char* path,
+ uint32_t flags,
+ uint32_t size,
+ fil_encryption_t mode,
+ uint32_t key_id,
+ dberr_t* err)
+{
+ pfs_os_file_t file;
+ bool success;
+ mtr_t mtr;
+ bool has_data_dir = FSP_FLAGS_HAS_DATA_DIR(flags) != 0;
+
+ ut_ad(!is_system_tablespace(space_id));
+ ut_ad(!srv_read_only_mode);
+ ut_a(space_id < SRV_SPACE_ID_UPPER_BOUND);
+ ut_a(size >= FIL_IBD_FILE_INITIAL_SIZE);
+ ut_a(fil_space_t::is_valid_flags(flags & ~FSP_FLAGS_MEM_MASK, space_id));
+
+ /* Create the subdirectories in the path, if they are
+ not there already. */
+ *err = os_file_create_subdirs_if_needed(path);
+ if (*err != DB_SUCCESS) {
+ return NULL;
+ }
+
+ /* Write the FILE_CREATE record and make the log durable up to it
+ before creating the file. */
+ mtr.start();
+ mtr.log_file_op(FILE_CREATE, space_id, path);
+ log_sys.latch.wr_lock(SRW_LOCK_CALL);
+ auto lsn= mtr.commit_files();
+ log_sys.latch.wr_unlock();
+ mtr.flag_wr_unlock();
+ log_write_up_to(lsn, true);
+
+ /* The two smallest compressed page sizes select a file type
+ that does not use O_DIRECT. */
+ ulint type;
+ static_assert(((UNIV_ZIP_SIZE_MIN >> 1) << 3) == 4096,
+ "compatibility");
+ switch (FSP_FLAGS_GET_ZIP_SSIZE(flags)) {
+ case 1:
+ case 2:
+ type = OS_DATA_FILE_NO_O_DIRECT;
+ break;
+ default:
+ type = OS_DATA_FILE;
+ }
+
+ file = os_file_create(
+ innodb_data_file_key, path,
+ OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT,
+ OS_FILE_AIO, type, srv_read_only_mode, &success);
+
+ if (!success) {
+ /* The following call will print an error message */
+ switch (os_file_get_last_error(true)) {
+ case OS_FILE_ALREADY_EXISTS:
+ ib::info() << "The file '" << path << "'"
+ " already exists though the"
+ " corresponding table did not exist"
+ " in the InnoDB data dictionary."
+ " You can resolve the problem by removing"
+ " the file.";
+ *err = DB_TABLESPACE_EXISTS;
+ break;
+ case OS_FILE_DISK_FULL:
+ *err = DB_OUT_OF_FILE_SPACE;
+ break;
+ default:
+ *err = DB_ERROR;
+ }
+ ib::error() << "Cannot create file '" << path << "'";
+ return NULL;
+ }
+
+ const bool is_compressed = fil_space_t::is_compressed(flags);
+#ifdef _WIN32
+ const bool is_sparse = is_compressed;
+ if (is_compressed) {
+ os_file_set_sparse_win32(file);
+ }
+#else
+ const bool is_sparse = is_compressed
+ && DB_SUCCESS == os_file_punch_hole(file, 0, 4096)
+ && !my_test_if_thinly_provisioned(file);
+#endif
+
+ if (fil_space_t::full_crc32(flags)) {
+ flags |= FSP_FLAGS_FCRC32_PAGE_SSIZE();
+ } else {
+ flags |= FSP_FLAGS_PAGE_SSIZE();
+ }
+
+ /* Create crypt data if the tablespace is either encrypted or user has
+ requested it to remain unencrypted. */
+ fil_space_crypt_t* crypt_data = (mode != FIL_ENCRYPTION_DEFAULT
+ || srv_encrypt_tables)
+ ? fil_space_create_crypt_data(mode, key_id)
+ : nullptr;
+
+ if (!os_file_set_size(path, file,
+ os_offset_t(size) << srv_page_size_shift,
+ is_sparse)) {
+ *err = DB_OUT_OF_FILE_SPACE;
+ /* Common error exit, also reached by goto from below:
+ remove the file and release the encryption metadata. */
+err_exit:
+ os_file_close(file);
+ os_file_delete(innodb_data_file_key, path);
+ /* Release crypt_data in the same way as fil_space_free_low()
+ does for a tablespace that owns it, instead of passing the
+ object to free() directly. */
+ fil_space_destroy_crypt_data(&crypt_data);
+ return nullptr;
+ }
+
+ fil_space_t::name_type space_name;
+
+ if (has_data_dir) {
+ /* Make the ISL file if the IBD file is not
+ in the default location. */
+ space_name = {name.m_name, strlen(name.m_name)};
+ *err = RemoteDatafile::create_link_file(space_name, path);
+ if (*err != DB_SUCCESS) {
+ goto err_exit;
+ }
+ }
+
+ DBUG_EXECUTE_IF("checkpoint_after_file_create",
+ log_make_checkpoint(););
+
+ mysql_mutex_lock(&fil_system.mutex);
+ if (fil_space_t* space = fil_space_t::create(space_id, flags,
+ FIL_TYPE_TABLESPACE,
+ crypt_data, mode, true)) {
+ fil_node_t* node = space->add(path, file, size, false, true);
+ IF_WIN(node->find_metadata(), node->find_metadata(file, true));
+ mysql_mutex_unlock(&fil_system.mutex);
+ /* Initialise the tablespace header in its own
+ mini-transaction. */
+ mtr.start();
+ mtr.set_named_space(space);
+ ut_a(fsp_header_init(space, size, &mtr) == DB_SUCCESS);
+ mtr.commit();
+ return space;
+ } else {
+ mysql_mutex_unlock(&fil_system.mutex);
+ }
+
+ /* The tablespace could not be registered: remove the link file
+ if one was created above, and take the common error exit. */
+ if (space_name.data()) {
+ RemoteDatafile::delete_link_file(space_name);
+ }
+
+ *err = DB_ERROR;
+ goto err_exit;
+}
+
+/** Try to open a single-table tablespace and optionally check that the
+space id in it is correct. If this does not succeed, print an error message
+to the .err log. This function is used to open a tablespace when we start
+mysqld after the dictionary has been booted, and also in IMPORT TABLESPACE.
+
+NOTE that we assume this operation is used either at the database startup
+or under the protection of dict_sys.latch, so that two users cannot
+race here. This operation does not leave the file associated with the
+tablespace open, but closes it after we have looked at the space id in it.
+
+If validation is requested (validate > 1), we read the first page of the
+file and check that the space id in the file is what we expect. We assume
+that this function runs much faster if no check is made, since accessing the
+file inode probably is much faster (the OS caches them) than accessing
+the first page of the file. Even if validation was not requested, it is
+forced when a link (ISL) file is found or when path_in differs from the
+default file path.
+
+NOTE(review): this comment used to describe a fix_dict boolean, but the
+function has no such parameter.
+
+@param[in] validate 0=maybe missing, 1=do not validate, 2=validate
+@param[in] purpose FIL_TYPE_TABLESPACE or FIL_TYPE_TEMPORARY
+ (the body also tests for FIL_TYPE_IMPORT)
+@param[in] id tablespace ID
+@param[in] flags expected FSP_SPACE_FLAGS
+@param[in] name table name
+If file-per-table, it is the table name in the databasename/tablename format
+@param[in] path_in expected filepath, usually read from dictionary
+@param[out] err DB_SUCCESS or error code; may be nullptr. It is not
+ written when the tablespace is already in the cache.
+@return tablespace
+@retval NULL if the tablespace could not be opened */
+fil_space_t*
+fil_ibd_open(
+ unsigned validate,
+ fil_type_t purpose,
+ uint32_t id,
+ uint32_t flags,
+ fil_space_t::name_type name,
+ const char* path_in,
+ dberr_t* err)
+{
+ /* Fast path: the tablespace is already in the cache. */
+ mysql_mutex_lock(&fil_system.mutex);
+ fil_space_t* space = fil_space_get_by_id(id);
+ mysql_mutex_unlock(&fil_system.mutex);
+ if (space) {
+ if (validate > 1 && !srv_read_only_mode) {
+ fsp_flags_try_adjust(space,
+ flags & ~FSP_FLAGS_MEM_MASK);
+ }
+ return space;
+ }
+
+ dberr_t local_err = DB_SUCCESS;
+
+ /* Table flags can be ULINT_UNDEFINED (tested here as UINT32_MAX) if
+ dict_tf_to_fsp_flags_failure is set. */
+ if (flags == UINT32_MAX) {
+corrupted:
+ local_err = DB_CORRUPTION;
+func_exit:
+ /* Common exit: report the status if the caller asked for it. */
+ if (err) *err = local_err;
+ return space;
+ }
+
+ ut_ad(fil_space_t::is_valid_flags(flags & ~FSP_FLAGS_MEM_MASK, id));
+
+ Datafile df_default; /* default location */
+ RemoteDatafile df_remote; /* remote location */
+ ulint tablespaces_found = 0;
+ ulint valid_tablespaces_found = 0;
+
+ df_default.init(flags);
+ df_remote.init(flags);
+
+ /* Discover the correct file by looking in three possible locations
+ while avoiding unnecessary effort. */
+
+ /* We will always look for an ibd in the default location. */
+ df_default.make_filepath(nullptr, name, IBD);
+
+ /* Look for a filepath embedded in an ISL where the default file
+ would be. */
+ bool must_validate = df_remote.open_link_file(name);
+
+ if (must_validate) {
+ if (df_remote.open_read_only(true) == DB_SUCCESS) {
+ ut_ad(df_remote.is_open());
+ ++tablespaces_found;
+ } else {
+ /* The following call prints an error message */
+ os_file_get_last_error(true);
+ ib::error() << "A link file was found named '"
+ << df_remote.link_filepath()
+ << "' but the linked tablespace '"
+ << df_remote.filepath()
+ << "' could not be opened read-only.";
+ }
+ } else if (path_in && !df_default.same_filepath_as(path_in)) {
+ /* Dict path is not the default path. Always validate
+ remote files. If default is opened, it was moved. */
+ must_validate = true;
+ } else if (validate > 1) {
+ must_validate = true;
+ }
+
+ const bool operation_not_for_export =
+ srv_operation != SRV_OPERATION_RESTORE_EXPORT
+ && srv_operation != SRV_OPERATION_EXPORT_RESTORED;
+
+ /* Always look for a file at the default location. But don't log
+ an error if the tablespace is already open in remote or dict. */
+ ut_a(df_default.filepath());
+
+ /* Mariabackup will not copy files whose names start with
+ #sql-. We will suppress messages about such files missing on
+ the first server startup. The tables ought to be dropped by
+ drop_garbage_tables_after_restore() a little later. */
+
+ const bool strict = validate && !tablespaces_found
+ && operation_not_for_export
+ && !(srv_operation == SRV_OPERATION_NORMAL
+ && srv_start_after_restore
+ && srv_force_recovery < SRV_FORCE_NO_BACKGROUND
+ && dict_table_t::is_temporary_name(
+ df_default.filepath()));
+
+ if (df_default.open_read_only(strict) == DB_SUCCESS) {
+ ut_ad(df_default.is_open());
+ ++tablespaces_found;
+ }
+
+ /* Check if multiple locations point to the same file. */
+ if (tablespaces_found > 1 && df_default.same_as(df_remote)) {
+ /* A link file was found with the default path in it.
+ Use the default path and delete the link file. */
+ --tablespaces_found;
+ df_remote.delete_link_file();
+ df_remote.close();
+ }
+
+ /* We have now checked all possible tablespace locations and
+ have a count of how many unique files we found. If things are
+ normal, we only found 1. */
+ /* For encrypted tablespace, we need to check the
+ encryption in header of first page. */
+ if (!must_validate && tablespaces_found == 1) {
+ goto skip_validate;
+ }
+
+ /* Read and validate the first page of these three tablespace
+ locations, if found. */
+ valid_tablespaces_found +=
+ (df_remote.validate_to_dd(id, flags) == DB_SUCCESS);
+
+ valid_tablespaces_found +=
+ (df_default.validate_to_dd(id, flags) == DB_SUCCESS);
+
+ /* Make sense of these three possible locations.
+ First, bail out if no tablespace files were found. */
+ if (valid_tablespaces_found == 0) {
+ if (!strict
+ && IF_WIN(GetLastError() == ERROR_FILE_NOT_FOUND
+ || GetLastError() == ERROR_PATH_NOT_FOUND,
+ errno == ENOENT)) {
+ /* Suppress a message about a missing file. */
+ goto corrupted;
+ }
+
+ os_file_get_last_error(operation_not_for_export,
+ !operation_not_for_export);
+ if (!operation_not_for_export) {
+ goto corrupted;
+ }
+ sql_print_error("InnoDB: Could not find a valid tablespace"
+ " file for %.*s. %s",
+ static_cast<int>(name.size()), name.data(),
+ TROUBLESHOOT_DATADICT_MSG);
+ goto corrupted;
+ }
+ if (!must_validate) {
+ goto skip_validate;
+ }
+
+ /* Do not open any tablespaces if more than one tablespace with
+ the correct space ID and flags were found. */
+ if (df_default.is_open() && df_remote.is_open()) {
+ ib::error()
+ << "A tablespace has been found in multiple places: "
+ << df_default.filepath()
+ << "(Space ID=" << df_default.space_id()
+ << ", Flags=" << df_default.flags()
+ << ") and "
+ << df_remote.filepath()
+ << "(Space ID=" << df_remote.space_id()
+ << ", Flags=" << df_remote.flags()
+ << (valid_tablespaces_found > 1 || srv_force_recovery
+ ? "); will not open"
+ : ")");
+
+ /* Force-recovery will allow some tablespaces to be
+ skipped by REDO if there was more than one file found.
+ Unlike during the REDO phase of recovery, we now know
+ if the tablespace is valid according to the dictionary,
+ which was not available then. So if we did not force
+ recovery and there is only one good tablespace, ignore
+ any bad tablespaces. */
+ if (valid_tablespaces_found > 1 || srv_force_recovery > 0) {
+ /* If the file is not open it cannot be valid. */
+ ut_ad(df_default.is_open() || !df_default.is_valid());
+ ut_ad(df_remote.is_open() || !df_remote.is_valid());
+
+ /* Having established that, this is an easy way to
+ look for corrupted data files. */
+ if (df_default.is_open() != df_default.is_valid()
+ || df_remote.is_open() != df_remote.is_valid()) {
+ goto corrupted;
+ }
+error:
+ /* Also reached from below when fil_space_t::create() fails. */
+ local_err = DB_ERROR;
+ goto func_exit;
+ }
+
+ /* There is only one valid tablespace found and we did
+ not use srv_force_recovery during REDO. Use this one
+ tablespace and clean up invalid tablespace pointers */
+ if (df_default.is_open() && !df_default.is_valid()) {
+ df_default.close();
+ tablespaces_found--;
+ }
+
+ if (df_remote.is_open() && !df_remote.is_valid()) {
+ df_remote.close();
+ tablespaces_found--;
+ }
+ }
+
+ /* At this point, there should be only one filepath. */
+ ut_a(tablespaces_found == 1);
+ ut_a(valid_tablespaces_found == 1);
+
+skip_validate:
+ /* Take the first page from whichever file is open; the default
+ location is preferred. */
+ const byte* first_page =
+ df_default.is_open() ? df_default.get_first_page() :
+ df_remote.get_first_page();
+
+ fil_space_crypt_t* crypt_data = first_page
+ ? fil_space_read_crypt_data(fil_space_t::zip_size(flags),
+ first_page)
+ : NULL;
+
+ mysql_mutex_lock(&fil_system.mutex);
+ space = fil_space_t::create(id, flags, purpose, crypt_data);
+ if (!space) {
+ /* NOTE(review): crypt_data is not released on this path;
+ confirm whether fil_space_t::create() frees it on failure. */
+ mysql_mutex_unlock(&fil_system.mutex);
+ goto error;
+ }
+
+ /* We do not measure the size of the file, that is why
+ we pass the 0 below */
+
+ space->add(
+ df_remote.is_open() ? df_remote.filepath() :
+ df_default.filepath(), OS_FILE_CLOSED, 0, false, true);
+ mysql_mutex_unlock(&fil_system.mutex);
+
+ if (must_validate && !srv_read_only_mode) {
+ /* Close our read-only handles before adjusting the flags
+ through the tablespace itself. */
+ df_remote.close();
+ df_default.close();
+ if (space->acquire()) {
+ if (purpose != FIL_TYPE_IMPORT) {
+ fsp_flags_try_adjust(space, flags
+ & ~FSP_FLAGS_MEM_MASK);
+ }
+ space->release();
+ }
+ }
+
+ goto func_exit;
+}
+
+/** Discover the correct IBD file to open given a remote or missing
+filepath from the REDO log. Administrators can move a crashed
+database to another location on the same machine and try to recover it.
+Remote IBD files might be moved as well to the new location.
+ The problem with this is that the REDO log contains the old location
+which may be still accessible. During recovery, if files are found in
+both locations, we can choose one based on these priorities:
+1. Default location
+2. ISL location
+3. REDO location
+@param[in] space_id tablespace ID
+@param[in,out] df Datafile object with path from redo; if the file is
+ found elsewhere, its file path is replaced and it is reopened
+@return true if a valid datafile was found, false if not */
+static
+bool
+fil_ibd_discover(
+ ulint space_id,
+ Datafile& df)
+{
+ Datafile df_def_per; /* default file-per-table datafile */
+ RemoteDatafile df_rem_per; /* remote file-per-table datafile */
+
+ /* Look for the datafile in the default location. */
+ const char* filename = df.filepath();
+ const char* basename = base_name(filename);
+
+ /* If this datafile is file-per-table it will have a schema dir. */
+ /* Scan backwards from the base name until two path separators
+ have been seen or the start of the path is reached. */
+ ulint sep_found = 0;
+ const char* db = basename;
+ for (; db > filename && sep_found < 2; db--) {
+ switch (db[0]) {
+#ifdef _WIN32
+ case '\\':
+#endif
+ case '/':
+ sep_found++;
+ }
+ }
+ if (sep_found == 2) {
+ /* The loop decremented db once more after the second
+ separator; skip forward to the character that follows it,
+ so that db is "schema/file name". */
+ db += 2;
+ df_def_per.init(0);
+ df_def_per.set_filepath(db);
+ if (df_def_per.open_read_only(false) == DB_SUCCESS
+ && df_def_per.validate_for_recovery() == DB_SUCCESS
+ && df_def_per.space_id() == space_id) {
+ df.set_filepath(df_def_per.filepath());
+ /* The result is not checked here; the caller in
+ fil_ibd_load() tests df.is_open(). */
+ df.open_read_only(false);
+ return(true);
+ }
+
+ /* Look for a remote file-per-table tablespace. */
+
+ switch (srv_operation) {
+ case SRV_OPERATION_BACKUP:
+ case SRV_OPERATION_RESTORE_DELTA:
+ case SRV_OPERATION_BACKUP_NO_DEFER:
+ ut_ad(0);
+ break;
+ case SRV_OPERATION_RESTORE_EXPORT:
+ case SRV_OPERATION_RESTORE:
+ break;
+ case SRV_OPERATION_NORMAL:
+ case SRV_OPERATION_EXPORT_RESTORED:
+ /* Only names longer than the 4-character extension
+ that end in dot_ext[IBD] are considered. */
+ size_t len= strlen(db);
+ if (len <= 4 || strcmp(db + len - 4, dot_ext[IBD])) {
+ break;
+ }
+ /* Look up the link file by the name without
+ the extension. */
+ df_rem_per.open_link_file({db, len - 4});
+
+ if (!df_rem_per.filepath()) {
+ break;
+ }
+
+ /* An ISL file was found with contents. */
+ if (df_rem_per.open_read_only(false) != DB_SUCCESS
+ || df_rem_per.validate_for_recovery()
+ != DB_SUCCESS) {
+
+ /* Assume that this ISL file is intended to
+ be used. Do not continue looking for another
+ if this file cannot be opened or is not
+ a valid IBD file. */
+ ib::error() << "ISL file '"
+ << df_rem_per.link_filepath()
+ << "' was found but the linked file '"
+ << df_rem_per.filepath()
+ << "' could not be opened or is"
+ " not correct.";
+ return(false);
+ }
+
+ /* Use this file if it has the space_id from the
+ FILE_ record. */
+ if (df_rem_per.space_id() == space_id) {
+ df.set_filepath(df_rem_per.filepath());
+ df.open_read_only(false);
+ return(true);
+ }
+
+ /* Since old MLOG records can use the same basename
+ in multiple CREATE/DROP TABLE sequences, this ISL
+ file could be pointing to a later version of this
+ basename.ibd file which has a different space_id.
+ Keep looking. */
+ }
+ }
+
+ /* No ISL files were found in the default location. Use the location
+ given in the redo log. */
+ if (df.open_read_only(false) == DB_SUCCESS
+ && df.validate_for_recovery() == DB_SUCCESS
+ && df.space_id() == space_id) {
+ return(true);
+ }
+
+ /* A datafile was not discovered for the filename given. */
+ return(false);
+}
+
+/** Check that the encryption key of a tablespace is available.
+If it is not, report an error and destroy and free crypt_data.
+@param crypt_data encryption metadata; must not be used after a
+                  false return
+@param f_name     file name to mention in the error message
+@return whether the encryption key was found */
+bool fil_crypt_check(fil_space_crypt_t *crypt_data, const char *f_name)
+{
+  const bool key_found= crypt_data->is_key_found();
+  if (!key_found)
+  {
+    sql_print_error("InnoDB: Encryption key is not found for %s", f_name);
+    crypt_data->~fil_space_crypt_t();
+    ut_free(crypt_data);
+  }
+  return key_found;
+}
+
+/** Open an ibd tablespace and add it to the InnoDB data structures.
+This is similar to fil_ibd_open() except that it is used while processing
+the REDO log, so the data dictionary is not available and very little
+validation is done. The tablespace name is extracted from the
+dbname/tablename.ibd portion of the filename, which assumes that the file
+is a file-per-table tablespace. Any name will do for now. General
+tablespace names will be read from the dictionary after it has been
+recovered. The tablespace flags are read at this time from the first page
+of the file in validate_for_recovery().
+@param[in] space_id tablespace ID
+@param[in] filename path/to/databasename/tablename.ibd
+@param[out] space the tablespace, or NULL on error
+@return status of the operation */
+enum fil_load_status
+fil_ibd_load(uint32_t space_id, const char *filename, fil_space_t *&space)
+{
+ /* If a space is already in the file system cache with this
+ space ID, then there is nothing to do. */
+ mysql_mutex_lock(&fil_system.mutex);
+ space = fil_space_get_by_id(space_id);
+ mysql_mutex_unlock(&fil_system.mutex);
+
+ if (space) {
+ /* Compare the filename we are trying to open with the
+ filename from the first node of the tablespace we opened
+ previously. Fail if it is different. */
+ fil_node_t* node = UT_LIST_GET_FIRST(space->chain);
+ if (0 != strcmp(innobase_basename(filename),
+ innobase_basename(node->name))) {
+ ib::info()
+ << "Ignoring data file '" << filename
+ << "' with space ID " << space->id
+ << ". Another data file called " << node->name
+ << " exists with the same space ID.";
+ space = NULL;
+ return(FIL_LOAD_ID_CHANGED);
+ }
+ return(FIL_LOAD_OK);
+ }
+
+ if (srv_operation == SRV_OPERATION_RESTORE) {
+ /* Replace absolute DATA DIRECTORY file paths with
+ short names relative to the backup directory. */
+ const char* name = strrchr(filename, '/');
+#ifdef _WIN32
+ if (const char *last = strrchr(filename, '\\')) {
+ if (last > name) {
+ name = last;
+ }
+ }
+#endif
+ if (name) {
+ /* Step back from the last separator to the one
+ before it, so that only the last directory and
+ the file name remain. */
+ while (--name > filename
+#ifdef _WIN32
+ && *name != '\\'
+#endif
+ && *name != '/');
+ if (name > filename) {
+ filename = name + 1;
+ }
+ }
+ }
+
+ Datafile file;
+ file.set_filepath(filename);
+ file.open_read_only(false);
+
+ if (!file.is_open()) {
+ /* The file has been moved or it is a remote datafile. */
+ if (!fil_ibd_discover(space_id, file)
+ || !file.is_open()) {
+ return(FIL_LOAD_NOT_FOUND);
+ }
+ }
+
+ os_offset_t size;
+ bool deferred_space = false;
+
+ /* Read and validate the first page of the tablespace.
+ Assign a tablespace name based on the tablespace type. */
+ switch (file.validate_for_recovery()) {
+ os_offset_t minimum_size;
+ case DB_SUCCESS:
+ deferred_space = file.m_defer;
+
+ /* For a deferred file, the space ID check is skipped. */
+ if (deferred_space) {
+ goto tablespace_check;
+ }
+
+ if (file.space_id() != space_id) {
+ return(FIL_LOAD_ID_CHANGED);
+ }
+tablespace_check:
+ /* Get and test the file size. */
+ size = os_file_get_size(file.handle());
+
+ /* Every .ibd file is created >= 4 pages in size.
+ Smaller files cannot be OK. */
+ minimum_size = os_offset_t(FIL_IBD_FILE_INITIAL_SIZE)
+ << srv_page_size_shift;
+
+ if (size == static_cast<os_offset_t>(-1)) {
+ /* The following call prints an error message */
+ os_file_get_last_error(true);
+
+ ib::error() << "Could not measure the size of"
+ " single-table tablespace file '"
+ << file.filepath() << "'";
+ } else if (deferred_space) {
+ return FIL_LOAD_DEFER;
+ } else if (size < minimum_size) {
+ ib::error() << "The size of tablespace file '"
+ << file.filepath() << "' is only " << size
+ << ", should be at least " << minimum_size
+ << "!";
+ } else {
+ /* Everything is fine so far. */
+ break;
+ }
+
+ /* The size could not be measured or is too small. */
+ /* fall through */
+
+ case DB_TABLESPACE_EXISTS:
+ return(FIL_LOAD_INVALID);
+
+ default:
+ return(FIL_LOAD_NOT_FOUND);
+ }
+
+ ut_ad(space == NULL);
+
+ /* Adjust the memory-based flags that would normally be set by
+ dict_tf_to_fsp_flags(). In recovery, we have no data dictionary. */
+ uint32_t flags = file.flags();
+ if (fil_space_t::is_compressed(flags)) {
+ flags |= page_zip_level
+ << FSP_FLAGS_MEM_COMPRESSION_LEVEL;
+ }
+
+ const byte* first_page = file.get_first_page();
+ fil_space_crypt_t* crypt_data = first_page
+ ? fil_space_read_crypt_data(fil_space_t::zip_size(flags),
+ first_page)
+ : NULL;
+
+ /* fil_crypt_check() destroys and frees crypt_data when it
+ returns false. */
+ if (crypt_data && !fil_crypt_check(crypt_data, filename)) {
+ return FIL_LOAD_INVALID;
+ }
+
+ mysql_mutex_lock(&fil_system.mutex);
+
+ space = fil_space_t::create(
+ space_id, flags, FIL_TYPE_TABLESPACE, crypt_data);
+
+ if (space == NULL) {
+ /* NOTE(review): crypt_data is not released on this path;
+ confirm whether fil_space_t::create() frees it on failure. */
+ mysql_mutex_unlock(&fil_system.mutex);
+ return(FIL_LOAD_INVALID);
+ }
+
+ ut_ad(space->id == file.space_id());
+ ut_ad(space->id == space_id);
+
+ /* We do not use the size information we have about the file, because
+ the rounding formula for extents and pages is somewhat complex; we
+ let fil_node_open() do that task. */
+
+ space->add(file.filepath(), OS_FILE_CLOSED, 0, false, false);
+ mysql_mutex_unlock(&fil_system.mutex);
+
+ return(FIL_LOAD_OK);
+}
+
+/** Rewrite FSP_SPACE_FLAGS on page 0 if the stored value differs from
+what is expected.
+(Typically when upgrading from MariaDB 10.1.0..10.1.20.)
+Nothing is changed for full_crc32 tablespaces, or when no size is known
+for the tablespace.
+@param[in,out] space tablespace
+@param[in]     flags desired tablespace flags */
+void fsp_flags_try_adjust(fil_space_t *space, uint32_t flags)
+{
+  ut_ad(!srv_read_only_mode);
+  ut_ad(fil_space_t::is_valid_flags(flags, space->id));
+
+  if (space->full_crc32() || fil_space_t::full_crc32(flags))
+  {
+    return;
+  }
+
+  if (!space->size
+      && (space->purpose != FIL_TYPE_TABLESPACE || !space->get_size()))
+  {
+    return;
+  }
+
+  /* This code is executed during server startup while no
+  connections are allowed. We do not need to protect against
+  DROP TABLE by fil_space_acquire(). */
+  mtr_t mtr;
+  mtr.start();
+
+  if (buf_block_t *block= buf_page_get(page_id_t(space->id, 0),
+                                       space->zip_size(),
+                                       RW_X_LATCH, &mtr))
+  {
+    const uint32_t stored= fsp_header_get_flags(block->page.frame);
+
+    /* Only touch the page if it is not in full_crc32 format and
+    the stored flags are not already equivalent to the desired ones. */
+    if (!fil_space_t::full_crc32(stored)
+        && !fil_space_t::is_flags_equal(stored, flags))
+    {
+      /* Stay silent if nothing but the FSP_FLAGS_POS_RESERVED
+      bit differs. */
+      const uint32_t reserved_bit= 1U << FSP_FLAGS_POS_RESERVED;
+      if ((stored ^ flags) & ~reserved_bit)
+      {
+        ib::warn()
+          << "adjusting FSP_SPACE_FLAGS of file '"
+          << UT_LIST_GET_FIRST(space->chain)->name
+          << "' from " << ib::hex(stored)
+          << " to " << ib::hex(flags);
+      }
+
+      mtr.set_named_space(space);
+      mtr.write<4,mtr_t::FORCED>(*block,
+                                 FSP_HEADER_OFFSET + FSP_SPACE_FLAGS
+                                 + block->page.frame, flags);
+    }
+  }
+
+  mtr.commit();
+}
+
+/** Look up a tablespace in the InnoDB tablespace memory cache and check
+that its flags are compatible with the given table flags. Note that if we
+have not done a crash recovery at the database startup, there may be many
+tablespaces which are not yet in the memory cache.
+On a match, the FSP_FLAGS_MEM_MASK bits of the cached flags are replaced
+with the expected ones, and unless the server is read-only,
+fsp_flags_try_adjust() is invoked.
+@param[in] id          Tablespace ID
+@param[in] table_flags table flags
+@return the tablespace
+@retval NULL if no matching tablespace exists in the memory cache */
+fil_space_t *fil_space_for_table_exists_in_mem(uint32_t id,
+                                               uint32_t table_flags)
+{
+  const uint32_t expected_flags= dict_tf_to_fsp_flags(table_flags);
+  const uint32_t wanted= expected_flags & ~FSP_FLAGS_MEM_MASK;
+
+  mysql_mutex_lock(&fil_system.mutex);
+  fil_space_t *space= fil_space_get_by_id(id);
+
+  if (space)
+  {
+    const uint32_t stored= space->flags & ~FSP_FLAGS_MEM_MASK;
+
+    if (fil_space_t::is_flags_equal(wanted, stored)
+        || fil_space_t::is_flags_equal(stored, wanted))
+    {
+      /* Adjust the flags that are in FSP_FLAGS_MEM_MASK.
+      FSP_SPACE_FLAGS will not be written back here. */
+      space->flags= stored | (expected_flags & FSP_FLAGS_MEM_MASK);
+    }
+    else
+    {
+      /* The flags are incompatible in both directions. */
+      space= nullptr;
+    }
+  }
+
+  mysql_mutex_unlock(&fil_system.mutex);
+
+  if (space && !srv_read_only_mode)
+  {
+    fsp_flags_try_adjust(space, wanted);
+  }
+
+  return space;
+}
+
+/*============================ FILE I/O ================================*/
+
+/** Report an access outside the bounds of a data file.
+An out-of-bounds write additionally aborts the process.
+@param name    file name
+@param offset  byte offset of the access
+@param len     number of bytes
+@param is_read whether the access is a read (otherwise a write) */
+ATTRIBUTE_COLD
+static void fil_invalid_page_access_msg(const char *name,
+                                        os_offset_t offset, ulint len,
+                                        bool is_read)
+{
+  const char *const action= is_read
+    ? "InnoDB: Trying to read"
+    : "[FATAL] InnoDB: Trying to write";
+
+  sql_print_error("%s %zu bytes at " UINT64PF
+                  " outside the bounds of the file: %s",
+                  action, len, offset, name);
+
+  if (is_read)
+    return;
+
+  abort();
+}
+
+/** Update the data structures on write completion: if the tablespace
+newly needs flushing, make sure it is in fil_system.unflushed_spaces.
+Nothing is done for temporary tablespaces or with
+SRV_O_DIRECT_NO_FSYNC. */
+inline void fil_node_t::complete_write()
+{
+  mysql_mutex_assert_not_owner(&fil_system.mutex);
+
+  if (space->purpose == FIL_TYPE_TEMPORARY ||
+      srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC ||
+      !space->set_needs_flush())
+    return;
+
+  mysql_mutex_lock(&fil_system.mutex);
+  const bool already_listed= space->is_in_unflushed_spaces;
+  if (!already_listed)
+  {
+    space->is_in_unflushed_spaces= true;
+    fil_system.unflushed_spaces.push_front(*space);
+  }
+  mysql_mutex_unlock(&fil_system.mutex);
+}
+
+/** Read or write data.
+The caller must hold a reference to the tablespace. The reference is
+released here on error, after a synchronous write and after
+IORequest::PUNCH_RANGE; for asynchronous requests it is released by the
+completion callback, and after a successful synchronous read it is left
+to the caller.
+@param type I/O context
+@param offset offset in bytes
+@param len number of bytes
+@param buf the data to be read or written
+@param bpage buffer block (for type.is_async() completion callback)
+@return status and file descriptor (the node is nullptr when the request
+was refused with DB_TABLESPACE_DELETED or DB_CORRUPTION) */
+fil_io_t fil_space_t::io(const IORequest &type, os_offset_t offset, size_t len,
+                         void *buf, buf_page_t *bpage)
+{
+  ut_ad(referenced());
+  ut_ad(offset % UNIV_ZIP_SIZE_MIN == 0);
+  ut_ad(len % 512 == 0); /* page_compressed */
+  ut_ad(fil_validate_skip());
+  ut_ad(type.is_read() || type.is_write());
+  ut_ad(type.type != IORequest::DBLWR_BATCH);
+
+  if (type.is_read()) {
+    srv_stats.data_read.add(len);
+  } else {
+    ut_ad(!srv_read_only_mode || this == fil_system.temp_space);
+    srv_stats.data_written.add(len);
+  }
+
+  fil_node_t* node= UT_LIST_GET_FIRST(chain);
+  ut_ad(node);
+  /* Page number of the request, relative to the start of the
+  tablespace (and later relative to the start of the chosen file). */
+  ulint p = static_cast<ulint>(offset >> srv_page_size_shift);
+  dberr_t err;
+
+  if (type.type == IORequest::READ_ASYNC && is_stopping()) {
+    err = DB_TABLESPACE_DELETED;
+    node = nullptr;
+    goto release;
+  }
+
+  DBUG_EXECUTE_IF("intermittent_recovery_failure",
+                  if (type.is_read() && !(~get_rnd_value() & 0x3ff0))
+                    goto io_error;);
+
+  DBUG_EXECUTE_IF("intermittent_read_failure",
+                  if (srv_was_started && type.is_read() &&
+                      !(~get_rnd_value() & 0x3ff0)) goto io_error;);
+
+  if (UNIV_LIKELY_NULL(UT_LIST_GET_NEXT(chain, node))) {
+    ut_ad(this == fil_system.sys_space
+          || this == fil_system.temp_space);
+    ut_ad(!(offset & ((1 << srv_page_size_shift) - 1)));
+
+    /* Find the file that contains page p. */
+    while (node->size <= p) {
+      p -= node->size;
+      /* Test the successor before advancing, so that
+      node still points to a valid file (the last one in
+      the chain) when the page is beyond the end of the
+      tablespace. Advancing first would make the error
+      report below dereference a null pointer. */
+      if (!UT_LIST_GET_NEXT(chain, node)) {
+fail:
+        if (type.type != IORequest::READ_ASYNC) {
+          fil_invalid_page_access_msg(
+            node->name,
+            offset, len,
+            type.is_read());
+        }
+#ifndef DBUG_OFF
+io_error:
+#endif
+        set_corrupted();
+        err = DB_CORRUPTION;
+        node = nullptr;
+        goto release;
+      }
+      node = UT_LIST_GET_NEXT(chain, node);
+    }
+
+    /* Make the offset relative to the start of the file. */
+    offset = os_offset_t{p} << srv_page_size_shift;
+  }
+
+  if (UNIV_UNLIKELY(node->size <= p)) {
+    goto fail;
+  }
+
+  if (type.type == IORequest::PUNCH_RANGE) {
+    err = os_file_punch_hole(node->handle, offset, len);
+    /* Punch hole is not supported, make space not to
+    support punch hole */
+    if (UNIV_UNLIKELY(err == DB_IO_NO_PUNCH_HOLE)) {
+      node->punch_hole = false;
+      err = DB_SUCCESS;
+    }
+    goto release_sync_write;
+  } else {
+    /* Queue the aio request */
+    err = os_aio(IORequest{bpage, type.slot, node, type.type},
+                 buf, offset, len);
+  }
+
+  if (!type.is_async()) {
+    if (type.is_write()) {
+release_sync_write:
+      node->complete_write();
+release:
+      release();
+      goto func_exit;
+    }
+    ut_ad(fil_validate_skip());
+  }
+  if (err != DB_SUCCESS) {
+    goto release;
+  }
+func_exit:
+  return {err, node};
+}
+
+#include <tpool.h>
+
+/** Handle the completion of a write request: update the flush
+bookkeeping of the file, notify the doublewrite buffer or the buffer pool,
+and release the tablespace reference.
+@param io_error error status passed on to buf_page_write_complete() */
+void IORequest::write_complete(int io_error) const
+{
+  ut_ad(fil_validate_skip());
+  ut_ad(node);
+  ut_ad(is_write());
+  node->complete_write();
+
+  if (bpage)
+  {
+    buf_page_write_complete(*this, io_error);
+  }
+  else
+  {
+    /* A write without a buffer page is either a doublewrite batch
+    or a plain asynchronous write. */
+    ut_ad(!srv_read_only_mode);
+    if (type != IORequest::DBLWR_BATCH)
+    {
+      ut_ad(type == IORequest::WRITE_ASYNC);
+    }
+    else
+    {
+      buf_dblwr.flush_buffered_writes_completed(*this);
+    }
+  }
+
+  node->space->release();
+}
+
+/** Handle the completion of a page read request, and release the
+tablespace reference.
+@param io_error nonzero if the read itself failed */
+void IORequest::read_complete(int io_error) const
+{
+  ut_ad(fil_validate_skip());
+  ut_ad(node);
+  ut_ad(is_read());
+  ut_ad(bpage);
+
+  /* IMPORTANT: since i/o handling for reads will read also the insert
+  buffer in fil_system.sys_space, we have to be very careful not to
+  introduce deadlocks. We never close fil_system.sys_space data files
+  and never issue asynchronous reads of change buffer pages. */
+  const page_id_t id(bpage->id());
+  bool failed= false;
+
+  if (UNIV_UNLIKELY(io_error != 0))
+  {
+    sql_print_error("InnoDB: Read error %d of page " UINT32PF " in file %s",
+                    io_error, id.page_no(), node->name);
+    buf_pool.corrupted_evict(bpage, buf_page_t::READ_FIX);
+    failed= true;
+  }
+  else
+  {
+    const dberr_t err= bpage->read_complete(*node);
+    if (err)
+    {
+      failed= true;
+      /* DB_FAIL is not reported here. */
+      if (err != DB_FAIL)
+        ib::error() << "Failed to read page " << id.page_no()
+                    << " from file '" << node->name << "': " << err;
+    }
+  }
+
+  /* During recovery without innodb_force_recovery, a failed read
+  is flagged in recv_sys. */
+  if (failed && recv_recovery_is_on() && !srv_force_recovery)
+  {
+    mysql_mutex_lock(&recv_sys.mutex);
+    recv_sys.set_corrupt_fs();
+    mysql_mutex_unlock(&recv_sys.mutex);
+  }
+
+  node->space->release();
+}
+
+/** Flush to disk the writes possibly cached by the OS, for every
+tablespace in fil_system.unflushed_spaces for which
+needs_flush_not_stopping() holds. */
+void fil_flush_file_spaces()
+{
+  if (srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)
+  {
+    ut_d(mysql_mutex_lock(&fil_system.mutex));
+    ut_ad(fil_system.unflushed_spaces.empty());
+    ut_d(mysql_mutex_unlock(&fil_system.mutex));
+    return;
+  }
+
+  for (;;)
+  {
+    fil_space_t *pending= nullptr;
+
+    mysql_mutex_lock(&fil_system.mutex);
+    for (fil_space_t &space : fil_system.unflushed_spaces)
+    {
+      if (space.needs_flush_not_stopping())
+      {
+        /* Pin the tablespace before the mutex is released. */
+        space.reacquire();
+        pending= &space;
+        break;
+      }
+    }
+    mysql_mutex_unlock(&fil_system.mutex);
+
+    if (!pending)
+      return;
+
+    /* The list may change while the mutex is not held;
+    scan it again from the start after each flush. */
+    pending->flush_low();
+    pending->release();
+  }
+}
+
+/** Visitor that accumulates statistics over the file nodes of a
+tablespace while the node list is being validated. */
+struct Check {
+  /** Sum of the sizes of the file nodes visited so far */
+  ulint size;
+  /** Number of visited file nodes that are open */
+  ulint n_open;
+
+  /** Start with empty totals */
+  Check() : size(0), n_open(0) {}
+
+  /** Account for one file node
+  @param[in] elem file node to visit */
+  void operator()(const fil_node_t* elem)
+  {
+    size += elem->size;
+    n_open += elem->is_open();
+  }
+
+  /** Validate the node list of a tablespace, and check that the
+  node sizes add up to the tablespace size.
+  @param[in] space tablespace to validate
+  @return number of open file nodes */
+  static ulint validate(const fil_space_t* space)
+  {
+    mysql_mutex_assert_owner(&fil_system.mutex);
+
+    Check visitor;
+    ut_list_validate(space->chain, visitor);
+    ut_a(space->size == visitor.size);
+
+    /* The predefined tablespaces must be the registered ones. */
+    if (space->id == TRX_SYS_SPACE) {
+      ut_ad(fil_system.sys_space == NULL
+            || fil_system.sys_space == space);
+    } else if (space->id == SRV_TMP_SPACE_ID) {
+      ut_ad(fil_system.temp_space == NULL
+            || fil_system.temp_space == space);
+    }
+
+    return visitor.n_open;
+  }
+};
+
+/** Check the consistency of the tablespace cache: validate every
+tablespace in fil_system.space_list and compare the number of open file
+nodes with fil_system.n_open.
+@return true if ok */
+bool fil_validate()
+{
+  mysql_mutex_lock(&fil_system.mutex);
+
+  ulint open_nodes = 0;
+  for (fil_space_t &space : fil_system.space_list) {
+    open_nodes += Check::validate(&space);
+  }
+
+  ut_a(fil_system.n_open == open_nodes);
+
+  mysql_mutex_unlock(&fil_system.mutex);
+
+  return true;
+}
+
+/** Set the file page type.
+@param[in,out] page file page
+@param[in]     type page type, written as 2 bytes at FIL_PAGE_TYPE */
+void
+fil_page_set_type(
+  byte* page,
+  ulint type)
+{
+  ut_ad(page);
+
+  byte* const type_field = page + FIL_PAGE_TYPE;
+  mach_write_to_2(type_field, type);
+}
+
+/** Delete the tablespace file and any related files like .cfg.
+This should not be called for temporary tables.
+@param[in] ibd_filepath File path of the IBD tablespace */
+void fil_delete_file(const char *ibd_filepath)
+{
+  ib::info() << "Deleting " << ibd_filepath;
+  os_file_delete_if_exists(innodb_data_file_key, ibd_filepath, nullptr);
+
+  /* Derive the name of the accompanying CFG file from the IBD path. */
+  char *cfg_filepath= fil_make_filepath(ibd_filepath,
+                                        fil_space_t::name_type{}, CFG,
+                                        false);
+  if (!cfg_filepath)
+    return;
+
+  os_file_delete_if_exists(innodb_data_file_key, cfg_filepath, nullptr);
+  ut_free(cfg_filepath);
+}
+
+#ifdef UNIV_DEBUG
+/** Debug check that a tablespace is valid for mtr_commit().
+@param[in] space persistent tablespace that has been changed */
+static
+void
+fil_space_validate_for_mtr_commit(
+  const fil_space_t* space)
+{
+  mysql_mutex_assert_not_owner(&fil_system.mutex);
+  ut_ad(space);
+  ut_ad(space->purpose == FIL_TYPE_TABLESPACE);
+  ut_ad(!is_predefined_tablespace(space->id));
+
+  /* We are serving mtr_commit(). While there is an active
+  mini-transaction, the tablespace must not be stopping, unless it
+  is being truncated (fil_truncate_prepare()) or is still referenced.
+  This is guaranteed by meta-data locks or transactional locks. */
+  const bool usable = !space->is_stopping()
+    || space->is_being_truncated
+    || space->referenced();
+  ut_ad(usable);
+}
+#endif /* UNIV_DEBUG */
+
+/** Register a non-predefined persistent tablespace as modified by redo
+log: append it to fil_system.named_spaces and stamp it with the current
+LSN. Only used while recovery is on.
+@param[in,out] space tablespace */
+void
+fil_names_dirty(
+  fil_space_t* space)
+{
+#ifndef SUX_LOCK_GENERIC
+  ut_ad(log_sys.latch.is_write_locked());
+#endif
+  ut_ad(recv_recovery_is_on());
+  ut_ad(log_sys.get_lsn() != 0);
+  ut_ad(!space->max_lsn);
+  ut_d(fil_space_validate_for_mtr_commit(space));
+
+  fil_space_t& dirtied = *space;
+  fil_system.named_spaces.push_back(dirtied);
+  dirtied.max_lsn = log_sys.get_lsn();
+}
+
+/** Write a FILE_MODIFY record when a non-predefined persistent
+tablespace was modified for the first time since fil_names_clear().
+The tablespace is stamped with the current LSN and appended to
+fil_system.named_spaces; the record is written in a separate
+mini-transaction. */
+ATTRIBUTE_NOINLINE ATTRIBUTE_COLD void mtr_t::name_write()
+{
+#ifndef SUX_LOCK_GENERIC
+  ut_ad(log_sys.latch.is_write_locked());
+#endif
+  ut_d(fil_space_validate_for_mtr_commit(m_user_space));
+  ut_ad(m_user_space->max_lsn == 0);
+
+  m_user_space->max_lsn= log_sys.get_lsn();
+  fil_system.named_spaces.push_back(*m_user_space);
+
+  /* Only the name of the single data file is logged. */
+  ut_ad(UT_LIST_GET_LEN(m_user_space->chain) == 1);
+  const char *const file_name= UT_LIST_GET_FIRST(m_user_space->chain)->name;
+
+  mtr_t name_mtr;
+  name_mtr.start();
+  fil_name_write(m_user_space->id, file_name, &name_mtr);
+  name_mtr.commit_files();
+}
+
+/** On a log checkpoint, reset fil_names_dirty_and_write() flags
+and write out FILE_MODIFY if needed, and write FILE_CHECKPOINT.
+@param lsn checkpoint LSN
+@return current LSN */
+lsn_t fil_names_clear(lsn_t lsn)
+{
+  mtr_t mtr;
+
+#ifndef SUX_LOCK_GENERIC
+  ut_ad(log_sys.latch.is_write_locked());
+#endif
+  ut_ad(lsn);
+  ut_ad(log_sys.is_latest());
+
+  mtr.start();
+
+  auto it = fil_system.named_spaces.begin();
+  while (it != fil_system.named_spaces.end()) {
+    /* Bind the element now: it is still used after it may
+    have been removed from the list below.
+    NOTE(review): this relies on erase() only unlinking the
+    element without destroying it - confirm against the
+    list implementation. */
+    auto& space = *it;
+
+    if (mtr.get_log_size() + strlen(space.chain.start->name)
+        >= recv_sys.MTR_SIZE_MAX - (3 + 5)) {
+      /* Prevent log parse buffer overflow */
+      mtr.commit_files();
+      mtr.start();
+    }
+
+    /* Remember the successor before a possible erase. */
+    const auto next = std::next(it);
+
+    ut_ad(space.max_lsn > 0);
+    if (space.max_lsn < lsn) {
+      /* The tablespace was last dirtied before the
+      checkpoint LSN. Remove it from the list, so
+      that if the tablespace is not going to be
+      modified any more, subsequent checkpoints will
+      avoid calling fil_names_write() on it. */
+      space.max_lsn = 0;
+      fil_system.named_spaces.erase(it);
+    }
+
+    /* max_lsn is the last LSN where fil_names_dirty_and_write()
+    was called. If we kept track of "min_lsn" (the first LSN
+    where max_lsn turned nonzero), we could avoid the
+    fil_names_write() call if min_lsn > lsn. */
+    ut_ad(UT_LIST_GET_LEN(space.chain) == 1);
+    fil_name_write(space.id, UT_LIST_GET_FIRST(space.chain)->name,
+                   &mtr);
+    it = next;
+  }
+
+  return mtr.commit_files(lsn);
+}
+
+/* Unit Tests */
+/* Only compiled when UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH is defined. */
+#ifdef UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH
+/* Shorthand for the function under test. */
+#define MF fil_make_filepath
+/* Log the most recently produced path at info level. */
+#define DISPLAY ib::info() << path
+/** Exercise fil_make_filepath() with a range of path, name, extension
+and flag combinations and log each result. Nothing is asserted; the
+output has to be inspected manually.
+NOTE(review): the returned strings are never released here; presumably
+fil_make_filepath() allocates them - confirm. */
+void
+test_make_filepath()
+{
+ char* path;
+ /* A path with a very long component, used for the last three calls. */
+ const char* long_path =
+ "this/is/a/very/long/path/including/a/very/"
+ "looooooooooooooooooooooooooooooooooooooooooooooooo"
+ "oooooooooooooooooooooooooooooooooooooooooooooooooo"
+ "oooooooooooooooooooooooooooooooooooooooooooooooooo"
+ "oooooooooooooooooooooooooooooooooooooooooooooooooo"
+ "oooooooooooooooooooooooooooooooooooooooooooooooooo"
+ "oooooooooooooooooooooooooooooooooooooooooooooooooo"
+ "oooooooooooooooooooooooooooooooooooooooooooooooooo"
+ "oooooooooooooooooooooooooooooooooooooooooooooooooo"
+ "oooooooooooooooooooooooooooooooooooooooooooooooooo"
+ "oooooooooooooooooooooooooooooooooooooooooooooooong"
+ "/folder/name";
+ /* Path only, with and without an existing extension.
+ (The two ".ibd" calls below are identical.) */
+ path = MF("/this/is/a/path/with/a/filename", NULL, IBD, false); DISPLAY;
+ path = MF("/this/is/a/path/with/a/filename", NULL, ISL, false); DISPLAY;
+ path = MF("/this/is/a/path/with/a/filename", NULL, CFG, false); DISPLAY;
+ path = MF("/this/is/a/path/with/a/filename.ibd", NULL, IBD, false); DISPLAY;
+ path = MF("/this/is/a/path/with/a/filename.ibd", NULL, IBD, false); DISPLAY;
+ path = MF("/this/is/a/path/with/a/filename.dat", NULL, IBD, false); DISPLAY;
+ /* Name only, with '/' and '\\' separators. */
+ path = MF(NULL, "tablespacename", NO_EXT, false); DISPLAY;
+ path = MF(NULL, "tablespacename", IBD, false); DISPLAY;
+ path = MF(NULL, "dbname/tablespacename", NO_EXT, false); DISPLAY;
+ path = MF(NULL, "dbname/tablespacename", IBD, false); DISPLAY;
+ path = MF(NULL, "dbname/tablespacename", ISL, false); DISPLAY;
+ path = MF(NULL, "dbname/tablespacename", CFG, false); DISPLAY;
+ path = MF(NULL, "dbname\\tablespacename", NO_EXT, false); DISPLAY;
+ path = MF(NULL, "dbname\\tablespacename", IBD, false); DISPLAY;
+ /* Path and name combined, with the last flag both off and on. */
+ path = MF("/this/is/a/path", "dbname/tablespacename", IBD, false); DISPLAY;
+ path = MF("/this/is/a/path", "dbname/tablespacename", IBD, true); DISPLAY;
+ path = MF("./this/is/a/path", "dbname/tablespacename.ibd", IBD, true); DISPLAY;
+ path = MF("this\\is\\a\\path", "dbname/tablespacename", IBD, true); DISPLAY;
+ path = MF("/this/is/a/path", "dbname\\tablespacename", IBD, true); DISPLAY;
+ /* Very long path. */
+ path = MF(long_path, NULL, IBD, false); DISPLAY;
+ path = MF(long_path, "tablespacename", IBD, false); DISPLAY;
+ path = MF(long_path, "tablespacename", IBD, true); DISPLAY;
+}
+#endif /* UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH */
+/* @} */
+
+/** Look up the block size of the data file that contains a page.
+The files of the tablespace are visited in chain order; the page number
+is reduced by the size of each file that lies entirely before the page.
+If the page is beyond all files, the block size of the last file is used
+(512 if the tablespace has no files).
+@param[in]	space	tablespace
+@param[in]	offset	page number
+@return block size; 512 if the file reports more than 4096 */
+ulint fil_space_get_block_size(const fil_space_t *space, unsigned offset)
+{
+  ulint result= 512;
+  fil_node_t *file= UT_LIST_GET_FIRST(space->chain);
+
+  while (file != NULL)
+  {
+    result= file->block_size;
+    if (offset < file->size)
+    {
+      /* The page resides in this file. */
+      ut_ad(file->size <= 0xFFFFFFFFU);
+      break;
+    }
+    offset-= static_cast<unsigned>(file->size);
+    file= UT_LIST_GET_NEXT(chain, file);
+  }
+
+  /* Block sizes up to 4K are supported; anything bigger
+  falls back to the default. */
+  return result > 4096 ? 512 : result;
+}
+
+/** Derive the tablespace name from the name of its first data file.
+For id 0 and SRV_TMP_SPACE_ID fixed names are returned. For tablespaces
+without any file, and for undo tablespaces, an empty name is returned.
+Otherwise the result points into the file name of the first file: it
+starts after the second-to-last '/' (or at the beginning of the file name
+if there is only one '/') and excludes the 4-character suffix, which is
+asserted to be DOT_IBD in debug builds. No copy is made.
+@return the tablespace name (databasename/tablename) */
+fil_space_t::name_type fil_space_t::name() const
+{
+ switch (id) {
+ case 0:
+ return name_type{"innodb_system", 13};
+ case SRV_TMP_SPACE_ID:
+ return name_type{"innodb_temporary", 16};
+ }
+
+ if (!UT_LIST_GET_FIRST(chain) || srv_is_undo_tablespace(id))
+ return name_type{};
+
+ ut_ad(purpose != FIL_TYPE_TEMPORARY);
+ ut_ad(UT_LIST_GET_LEN(chain) == 1);
+
+ const char *path= UT_LIST_GET_FIRST(chain)->name;
+ /* The file name is expected to contain at least one '/'; this is only
+ checked by the debug assertion below. */
+ const char *sep= strchr(path, '/');
+ ut_ad(sep);
+
+ /* Advance until sep is the last '/'; path then follows the
+ second-to-last '/' (if there is one). */
+ while (const char *next_sep= strchr(sep + 1, '/'))
+ path= sep + 1, sep= next_sep;
+
+#ifdef _WIN32
+ /* NOTE(review): this looks for the FIRST '\\' in path and, if it
+ precedes the last '/', makes path point AT that backslash (not after
+ it). The variable name suggests the last backslash was intended -
+ confirm the expected result for paths that mix '\\' and '/'. */
+ if (const char *last_sep= strchr(path, '\\'))
+ if (last_sep < sep)
+ path= last_sep;
+#endif
+
+ /* Strip the 4-character file name extension. */
+ size_t len= strlen(path);
+ ut_ad(len > 4);
+ len-= 4;
+ ut_ad(!strcmp(&path[len], DOT_IBD));
+
+ return name_type{path, len};
+}
+
+#ifdef UNIV_DEBUG
+
+/** @return the tablespace following this one in fil_system.space_list
+@retval nullptr if there is no following tablespace */
+fil_space_t *fil_space_t::next_in_space_list()
+{
+  auto list_end= fil_system.space_list.end();
+  space_list_t::iterator pos(this);
+  if (!(pos == list_end))
+  {
+    ++pos;
+    if (!(pos == list_end))
+      return &*pos;
+  }
+  return nullptr;
+}
+
+/** @return the tablespace preceding this one in fil_system.space_list
+@retval nullptr if this is the first tablespace of the list */
+fil_space_t *fil_space_t::prev_in_space_list()
+{
+  space_list_t::iterator pos(this);
+  if (!(pos == fil_system.space_list.begin()))
+  {
+    --pos;
+    return &*pos;
+  }
+  return nullptr;
+}
+
+/** @return the tablespace following this one in
+fil_system.unflushed_spaces
+@retval nullptr if there is no following tablespace */
+fil_space_t *fil_space_t::next_in_unflushed_spaces()
+{
+  auto list_end= fil_system.unflushed_spaces.end();
+  sized_ilist<fil_space_t, unflushed_spaces_tag_t>::iterator pos(this);
+  if (!(pos == list_end))
+  {
+    ++pos;
+    if (!(pos == list_end))
+      return &*pos;
+  }
+  return nullptr;
+}
+
+/** @return the tablespace preceding this one in
+fil_system.unflushed_spaces
+@retval nullptr if this is the first tablespace of the list */
+fil_space_t *fil_space_t::prev_in_unflushed_spaces()
+{
+  sized_ilist<fil_space_t, unflushed_spaces_tag_t>::iterator pos(this);
+  if (!(pos == fil_system.unflushed_spaces.begin()))
+  {
+    --pos;
+    return &*pos;
+  }
+  return nullptr;
+}
+
+#endif
diff --git a/storage/innobase/fil/fil0pagecompress.cc b/storage/innobase/fil/fil0pagecompress.cc
new file mode 100644
index 00000000..16aea2a7
--- /dev/null
+++ b/storage/innobase/fil/fil0pagecompress.cc
@@ -0,0 +1,584 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file fil/fil0pagecompress.cc
+Implementation for page compressed file spaces.
+
+Created 11/12/2013 Jan Lindström jan.lindstrom@mariadb.com
+Updated 14/02/2015
+***********************************************************************/
+
+#include "fil0fil.h"
+#include "fil0pagecompress.h"
+
+#include <my_dbug.h>
+
+#include "mem0mem.h"
+#include "hash0hash.h"
+#include "os0file.h"
+#include "mach0data.h"
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "log0recv.h"
+#include "fsp0fsp.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+#include "dict0dict.h"
+#include "page0page.h"
+#include "page0zip.h"
+#include "trx0sys.h"
+#include "row0mysql.h"
+#include "buf0lru.h"
+#include "ibuf0ibuf.h"
+#include "zlib.h"
+#ifdef __linux__
+#include <linux/fs.h>
+#include <sys/ioctl.h>
+#include <fcntl.h>
+#endif
+#include "row0mysql.h"
+#include "lz4.h"
+#include "lzo/lzo1x.h"
+#include "lzma.h"
+#include "bzlib.h"
+#include "snappy-c.h"
+
+/** Compress a page for the given compression algorithm.
+The whole page (srv_page_size bytes of buf) is compressed, and the
+result is stored at out_buf + header_len. The first header_len bytes
+of out_buf are not written by this function. At most
+srv_page_size - header_len bytes of compressed data are accepted.
+@param[in] buf page to be compressed
+@param[out] out_buf compressed page
+@param[in] header_len header length of the page
+@param[in] comp_algo compression algorithm
+@param[in] comp_level compression level (only passed to zlib and LZMA)
+@return actual length of compressed page data
+@retval 0 if the page was not compressed */
+static ulint fil_page_compress_low(
+ const byte* buf,
+ byte* out_buf,
+ ulint header_len,
+ ulint comp_algo,
+ unsigned comp_level)
+{
+ /* Space available for the compressed payload */
+ ulint write_size = srv_page_size - header_len;
+
+ switch (comp_algo) {
+ default:
+ ut_ad("unknown compression method" == 0);
+ /* fall through */
+ case PAGE_UNCOMPRESSED:
+ return 0;
+
+ case PAGE_ZLIB_ALGORITHM:
+ {
+ /* len is in/out: capacity on input,
+ compressed length on output */
+ ulong len = uLong(write_size);
+ if (Z_OK == compress2(
+ out_buf + header_len, &len, buf,
+ uLong(srv_page_size), int(comp_level))) {
+ return len;
+ }
+ }
+ break;
+
+ case PAGE_LZ4_ALGORITHM:
+ /* The return value of LZ4_compress_default() is passed
+ through as is. NOTE(review): the LZ4 API documents 0 as
+ the failure value, which would match "not compressed". */
+ write_size = LZ4_compress_default(
+ reinterpret_cast<const char*>(buf),
+ reinterpret_cast<char*>(out_buf) + header_len,
+ int(srv_page_size), int(write_size));
+
+ return write_size;
+
+ case PAGE_LZO_ALGORITHM: {
+ lzo_uint len = write_size;
+
+ /* NOTE(review): out_buf + srv_page_size is passed as the
+ last argument (presumably the LZO work memory), so out_buf
+ is assumed to extend beyond srv_page_size - verify
+ against the buffer allocation. */
+ if (LZO_E_OK == lzo1x_1_15_compress(
+ buf, srv_page_size,
+ out_buf + header_len, &len,
+ out_buf + srv_page_size)
+ && len <= write_size) {
+ return len;
+ }
+ break;
+ }
+
+ case PAGE_LZMA_ALGORITHM: {
+ size_t out_pos = 0;
+
+ if (LZMA_OK == lzma_easy_buffer_encode(
+ comp_level, LZMA_CHECK_NONE, NULL,
+ buf, srv_page_size, out_buf + header_len,
+ &out_pos, write_size)
+ && out_pos <= write_size) {
+ return out_pos;
+ }
+ break;
+ }
+
+ case PAGE_BZIP2_ALGORITHM: {
+ unsigned len = unsigned(write_size);
+ /* The input pointer is cast to non-const only because
+ the called function takes char*. */
+ if (BZ_OK == BZ2_bzBuffToBuffCompress(
+ reinterpret_cast<char*>(out_buf + header_len),
+ &len,
+ const_cast<char*>(
+ reinterpret_cast<const char*>(buf)),
+ unsigned(srv_page_size), 1, 0, 0)
+ && len <= write_size) {
+ return len;
+ }
+ break;
+ }
+
+ case PAGE_SNAPPY_ALGORITHM: {
+ /* NOTE(review): the output capacity given to snappy is the
+ worst-case compressed length, not write_size, so out_buf
+ is presumably allocated with that much room - verify
+ against the caller. Results longer than write_size are
+ rejected below. */
+ size_t len = snappy_max_compressed_length(srv_page_size);
+
+ if (SNAPPY_OK == snappy_compress(
+ reinterpret_cast<const char*>(buf),
+ srv_page_size,
+ reinterpret_cast<char*>(out_buf) + header_len,
+ &len)
+ && len <= write_size) {
+ return len;
+ }
+ break;
+ }
+ }
+
+ /* Compression failed or did not fit */
+ return 0;
+}
+
+/** Compress a page_compressed page for full crc32 format.
+Layout produced in out_buf: the first FIL_PAGE_COMP_ALGO bytes are the
+page header copied from buf (with the page type replaced), followed by
+the compressed payload, zero padding, optionally one byte holding the
+low 8 bits of the unpadded length, and 4 bytes that are left for the
+checksum. The padded length is a multiple of 256, and its upper byte is
+stored in the second byte of the page type field.
+@param[in] buf page to be compressed
+@param[out] out_buf compressed page
+@param[in] flags tablespace flags
+@param[in] block_size file system block size (0 means 512)
+@param[in] encrypted not used by this function
+@return actual length of compressed page
+@retval 0 if the page was not compressed */
+static ulint fil_page_compress_for_full_crc32(
+ const byte* buf,
+ byte* out_buf,
+ uint32_t flags,
+ ulint block_size,
+ bool encrypted)
+{
+ ulint comp_level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags);
+ ulint comp_algo = fil_space_t::get_compression_algo(flags);
+
+ /* If the flags carry no compression level, use page_zip_level */
+ if (comp_level == 0) {
+ comp_level = page_zip_level;
+ }
+
+ const ulint header_len = FIL_PAGE_COMP_ALGO;
+
+ ulint write_size = fil_page_compress_low(
+ buf, out_buf, header_len,
+ comp_algo,
+ static_cast<unsigned>(comp_level));
+
+ if (write_size == 0) {
+fail:
+ /* Only count an error if compression was actually requested */
+ if (comp_algo != PAGE_UNCOMPRESSED)
+ srv_stats.pages_page_compression_error.inc();
+ return 0;
+ }
+
+ write_size += header_len;
+ /* Header plus payload, before any padding */
+ const ulint actual_size = write_size;
+ /* Write the actual length of the data & page type
+ for full crc32 format. */
+ const bool lsb = fil_space_t::full_crc32_page_compressed_len(flags);
+ /* In the MSB, store the rounded-up page size. */
+ /* Room for the optional length byte and the 4-byte checksum is
+ added, and the sum is rounded up to a multiple of 256. */
+ write_size = (write_size + lsb + (4 + 255)) & ~255;
+ /* Give up if nothing would be saved */
+ if (write_size >= srv_page_size) {
+ goto fail;
+ }
+
+ /* Set up the page header */
+ memcpy(out_buf, buf, header_len);
+ out_buf[FIL_PAGE_TYPE] = 1U << (FIL_PAGE_COMPRESS_FCRC32_MARKER - 8);
+ out_buf[FIL_PAGE_TYPE + 1] = byte(write_size >> 8);
+ /* Clean up the buffer for the remaining write_size (except checksum) */
+ memset(out_buf + actual_size, 0, write_size - actual_size - 4);
+ if (lsb) {
+ /* Store the LSB */
+ /* That is, the low byte of actual_size plus the 1 + 4
+ trailer bytes, placed right before the checksum. */
+ out_buf[write_size - 5] = byte(actual_size + (1 + 4));
+ }
+
+ if (!block_size) {
+ block_size = 512;
+ }
+
+ ut_ad(write_size);
+ /* Round up to the block size (assumed to be a power of 2 by the
+ mask arithmetic) and zero-fill the added bytes */
+ if (write_size & (block_size - 1)) {
+ size_t tmp = write_size;
+ write_size = (write_size + (block_size - 1))
+ & ~(block_size - 1);
+ memset(out_buf + tmp, 0, write_size - tmp);
+ }
+
+ srv_stats.page_compression_saved.add(srv_page_size - write_size);
+ srv_stats.pages_page_compressed.inc();
+
+ return write_size;
+}
+
+/** Compress a page_compressed page for non full crc32 format.
+The compression algorithm is taken from the global
+innodb_compression_algorithm, not from the tablespace flags; only the
+compression level is read from the flags. The output consists of the
+FIL_PAGE_DATA byte page header (copied from buf and then patched),
+the compression metadata, and the compressed payload, zero-padded to a
+multiple of the block size.
+@param[in] buf page to be compressed
+@param[out] out_buf compressed page
+@param[in] flags tablespace flags
+@param[in] block_size file system block size (0 means 512)
+@param[in] encrypted whether the page will be subsequently encrypted
+@return actual length of compressed page
+@retval 0 if the page was not compressed */
+static ulint fil_page_compress_for_non_full_crc32(
+ const byte* buf,
+ byte* out_buf,
+ ulint flags,
+ ulint block_size,
+ bool encrypted)
+{
+ uint comp_level = static_cast<uint>(
+ FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags));
+ ulint header_len = FIL_PAGE_DATA + FIL_PAGE_COMP_METADATA_LEN;
+ /* Cache to avoid change during function execution */
+ ulint comp_algo = innodb_compression_algorithm;
+
+ /* Pages that will be encrypted carry extra metadata
+ in front of the payload */
+ if (encrypted) {
+ header_len += FIL_PAGE_ENCRYPT_COMP_ALGO;
+ }
+
+ /* If no compression level was provided to this table, use system
+ default level */
+ if (comp_level == 0) {
+ comp_level = page_zip_level;
+ }
+
+ ulint write_size = fil_page_compress_low(
+ buf, out_buf,
+ header_len, comp_algo, comp_level);
+
+ if (write_size == 0) {
+ /* Only count an error if compression was actually requested */
+ if (comp_algo != PAGE_UNCOMPRESSED)
+ srv_stats.pages_page_compression_error.inc();
+ return 0;
+ }
+
+ /* Set up the page header */
+ memcpy(out_buf, buf, FIL_PAGE_DATA);
+ /* Set up the checksum */
+ mach_write_to_4(out_buf + FIL_PAGE_SPACE_OR_CHKSUM, BUF_NO_CHECKSUM_MAGIC);
+
+ /* Set up the compression algorithm */
+ mach_write_to_8(out_buf + FIL_PAGE_COMP_ALGO, comp_algo);
+
+ if (encrypted) {
+ /* Set up the correct page type */
+ mach_write_to_2(out_buf + FIL_PAGE_TYPE,
+ FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED);
+
+ /* The algorithm is additionally stored in the metadata
+ that follows the page header */
+ mach_write_to_2(out_buf + FIL_PAGE_DATA
+ + FIL_PAGE_ENCRYPT_COMP_ALGO, comp_algo);
+ } else {
+ /* Set up the correct page type */
+ mach_write_to_2(out_buf + FIL_PAGE_TYPE, FIL_PAGE_PAGE_COMPRESSED);
+ }
+
+ /* Set up the actual payload length */
+ mach_write_to_2(out_buf + FIL_PAGE_DATA + FIL_PAGE_COMP_SIZE,
+ write_size);
+
+ /* Debug-only read-back checks of the fields written above */
+ ut_ad(mach_read_from_4(out_buf + FIL_PAGE_SPACE_OR_CHKSUM)
+ == BUF_NO_CHECKSUM_MAGIC);
+
+ ut_ad(mach_read_from_2(out_buf + FIL_PAGE_DATA + FIL_PAGE_COMP_SIZE)
+ == write_size);
+
+#ifdef UNIV_DEBUG
+ bool is_compressed = (mach_read_from_8(out_buf + FIL_PAGE_COMP_ALGO)
+ == (ulint) comp_algo);
+
+ bool is_encrypted_compressed =
+ (mach_read_from_2(out_buf + FIL_PAGE_DATA
+ + FIL_PAGE_ENCRYPT_COMP_ALGO)
+ == (ulint) comp_algo);
+#endif /* UNIV_DEBUG */
+
+ /* NOTE(review): the two variables exist only under UNIV_DEBUG, so
+ ut_ad() presumably expands to nothing otherwise - confirm. */
+ ut_ad(is_compressed || is_encrypted_compressed);
+
+ write_size+=header_len;
+
+ /* NOTE(review): if ulint is unsigned, this is simply a zero check */
+ if (block_size <= 0) {
+ block_size = 512;
+ }
+
+ ut_ad(write_size > 0 && block_size > 0);
+
+ /* Actual write needs to be aligned on block size */
+ if (write_size % block_size) {
+ size_t tmp = write_size;
+ write_size = (size_t)ut_uint64_align_up(
+ (ib_uint64_t)write_size, block_size);
+ /* Clean up the end of buffer */
+ memset(out_buf+tmp, 0, write_size - tmp);
+#ifdef UNIV_DEBUG
+ ut_a(write_size > 0 && ((write_size % block_size) == 0));
+ ut_a(write_size >= tmp);
+#endif
+ }
+
+ srv_stats.page_compression_saved.add(srv_page_size - write_size);
+ srv_stats.pages_page_compressed.inc();
+
+ return write_size;
+}
+
+/** Compress a page_compressed page before it is written to a data file.
+Pages of type 0, FIL_PAGE_TYPE_FSP_HDR, FIL_PAGE_TYPE_XDES and
+FIL_PAGE_PAGE_COMPRESSED are never compressed. All other pages are
+handed to the routine that matches the tablespace format.
+@param[in]	buf		page to be compressed
+@param[out]	out_buf		compressed page
+@param[in]	flags		tablespace flags
+@param[in]	block_size	file system block size
+@param[in]	encrypted	whether the page will be subsequently encrypted
+@return actual length of compressed page
+@retval 0 if the page was not compressed */
+ulint fil_page_compress(
+  const byte* buf,
+  byte* out_buf,
+  uint32_t flags,
+  ulint block_size,
+  bool encrypted)
+{
+  /* The full_crc32 page_compressed format assumes this. */
+  ut_ad(!(block_size & 255));
+  ut_ad(ut_is_2pow(block_size));
+
+  switch (fil_page_get_type(buf)) {
+  case 0:
+  case FIL_PAGE_TYPE_FSP_HDR:
+  case FIL_PAGE_TYPE_XDES:
+  case FIL_PAGE_PAGE_COMPRESSED:
+    /* File space headers and extent descriptors are
+    left alone, as are the other types listed above. */
+    return 0;
+  default:
+    break;
+  }
+
+  return fil_space_t::full_crc32(flags)
+    ? fil_page_compress_for_full_crc32(
+        buf, out_buf, flags, block_size, encrypted)
+    : fil_page_compress_for_non_full_crc32(
+        buf, out_buf, flags, block_size, encrypted);
+}
+
+/** Decompress a page that may be subject to page_compressed compression.
+The compressed payload is read from buf + header_len and the result is
+written to tmp_buf; buf itself is not modified by this function.
+Decompression only counts as successful if exactly srv_page_size bytes
+were produced.
+@param[in,out] tmp_buf temporary buffer (of innodb_page_size)
+@param[in,out] buf possibly compressed page buffer
+@param[in] comp_algo compression algorithm
+@param[in] header_len header length of the page
+@param[in] actual_size length of the compressed payload
+@return whether the page was decompressed successfully */
+static bool fil_page_decompress_low(
+ byte* tmp_buf,
+ byte* buf,
+ ulint comp_algo,
+ ulint header_len,
+ ulint actual_size)
+{
+ switch (comp_algo) {
+ default:
+ ib::error() << "Unknown compression algorithm "
+ << comp_algo;
+ return false;
+ case PAGE_ZLIB_ALGORITHM:
+ {
+ /* len is in/out: capacity on input,
+ decompressed length on output */
+ uLong len = srv_page_size;
+ return (Z_OK == uncompress(tmp_buf, &len,
+ buf + header_len,
+ uLong(actual_size))
+ && len == srv_page_size);
+ }
+
+ case PAGE_LZ4_ALGORITHM:
+ /* The return value is compared against the page size */
+ return LZ4_decompress_safe(
+ reinterpret_cast<const char*>(buf) + header_len,
+ reinterpret_cast<char*>(tmp_buf),
+ static_cast<int>(actual_size),
+ static_cast<int>(srv_page_size)) ==
+ static_cast<int>(srv_page_size);
+
+ case PAGE_LZO_ALGORITHM:
+ {
+ lzo_uint len_lzo = srv_page_size;
+ return (LZO_E_OK == lzo1x_decompress_safe(
+ buf + header_len,
+ actual_size, tmp_buf, &len_lzo, NULL)
+ && len_lzo == srv_page_size);
+ }
+
+ case PAGE_LZMA_ALGORITHM:
+ {
+ size_t src_pos = 0;
+ size_t dst_pos = 0;
+ /* No memory usage limit is imposed on the decoder */
+ uint64_t memlimit = UINT64_MAX;
+
+ return LZMA_OK == lzma_stream_buffer_decode(
+ &memlimit, 0, NULL, buf + header_len,
+ &src_pos, actual_size, tmp_buf, &dst_pos,
+ srv_page_size)
+ && dst_pos == srv_page_size;
+ }
+
+ case PAGE_BZIP2_ALGORITHM:
+ {
+ uint dst_pos = static_cast<uint>(srv_page_size);
+ return BZ_OK == BZ2_bzBuffToBuffDecompress(
+ reinterpret_cast<char*>(tmp_buf),
+ &dst_pos,
+ reinterpret_cast<char*>(buf) + header_len,
+ static_cast<uint>(actual_size), 1, 0)
+ && dst_pos == srv_page_size;
+ }
+
+ case PAGE_SNAPPY_ALGORITHM:
+ {
+ size_t olen = srv_page_size;
+
+ return SNAPPY_OK == snappy_uncompress(
+ reinterpret_cast<const char*>(buf)
+ + header_len,
+ actual_size,
+ reinterpret_cast<char*>(tmp_buf), &olen)
+ && olen == srv_page_size;
+ }
+ }
+
+ /* Every case above returns; this is only a fallback */
+ return false;
+}
+
+/** Decompress a page for full crc32 format.
+The compressed size is taken from the page itself and is therefore
+untrusted: every subtraction applied to it is range-checked, and a page
+whose size fields are inconsistent is reported as a decompression
+failure instead of being passed on with a wrapped-around length.
+On success the decompressed page replaces the contents of buf.
+@param[in,out]	tmp_buf	temporary buffer (of innodb_page_size)
+@param[in,out]	buf	possibly compressed page buffer
+@param[in]	flags	tablespace flags
+@return size of the compressed data
+@retval 0 if decompression failed
+@retval srv_page_size if the page was not compressed */
+static size_t fil_page_decompress_for_full_crc32(byte *tmp_buf, byte *buf,
+                                                 uint32_t flags)
+{
+  ut_ad(fil_space_t::full_crc32(flags));
+  bool compressed = false;
+  size_t size = buf_page_full_crc32_size(buf, &compressed, NULL);
+  if (!compressed) {
+    ut_ad(size == srv_page_size);
+    return size;
+  }
+
+  /* A compressed page in a tablespace that is not page_compressed */
+  if (!fil_space_t::is_compressed(flags)) {
+    return 0;
+  }
+
+  if (size >= srv_page_size) {
+    return 0;
+  }
+
+  const size_t header_len = FIL_PAGE_COMP_ALGO;
+
+  if (fil_space_t::full_crc32_page_compressed_len(flags)) {
+    compile_time_assert(FIL_PAGE_FCRC32_CHECKSUM == 4);
+    /* The trailer is 1 length byte followed by the 4-byte
+    checksum; a smaller size would index before buf. */
+    if (size < 5) {
+      return 0;
+    }
+    if (size_t lsb = buf[size - 5]) {
+      /* A nonzero byte replaces the low 8 bits of the
+      rounded-up size: size = size - 0x100 + lsb. */
+      if (size + lsb < 0x100) {
+        return 0;
+      }
+      size += lsb - 0x100;
+    }
+    if (size < 5) {
+      return 0;
+    }
+    size -= 5;
+  }
+
+  /* There must be at least one payload byte after the header;
+  otherwise size - header_len would be 0 or wrap around. */
+  if (size <= header_len) {
+    return 0;
+  }
+
+  if (!fil_page_decompress_low(tmp_buf, buf,
+                               fil_space_t::get_compression_algo(flags),
+                               header_len, size - header_len)) {
+    return 0;
+  }
+
+  srv_stats.pages_page_decompressed.inc();
+  memcpy(buf, tmp_buf, srv_page_size);
+  return size;
+}
+
+/** Decompress a page for non full crc32 format.
+The page type selects the header layout. Pages whose type is neither
+FIL_PAGE_PAGE_COMPRESSED nor FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED are left
+untouched. On success the decompressed page replaces the contents of buf.
+@param[in,out]	tmp_buf	temporary buffer (of innodb_page_size)
+@param[in,out]	buf	possibly compressed page buffer
+@return size of the compressed data
+@retval 0 if decompression failed
+@retval srv_page_size if the page was not compressed */
+static size_t fil_page_decompress_for_non_full_crc32(byte *tmp_buf, byte *buf)
+{
+  uint algo;
+  ulint hdr_len;
+
+  switch (fil_page_get_type(buf)) {
+  default:
+    /* Not a page_compressed page */
+    return srv_page_size;
+  case FIL_PAGE_PAGE_COMPRESSED:
+    hdr_len = FIL_PAGE_DATA + FIL_PAGE_COMP_METADATA_LEN;
+    /* The algorithm occupies the last 2 bytes of an 8-byte
+    field; the 6 bytes before them must be zero. */
+    if (mach_read_from_6(buf + FIL_PAGE_COMP_ALGO)) {
+      return 0;
+    }
+    algo = mach_read_from_2(buf + FIL_PAGE_COMP_ALGO + 6);
+    break;
+  case FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED:
+    hdr_len = FIL_PAGE_DATA + FIL_PAGE_ENCRYPT_COMP_METADATA_LEN;
+    algo = mach_read_from_2(
+      buf + FIL_PAGE_DATA + FIL_PAGE_ENCRYPT_COMP_ALGO);
+    break;
+  }
+
+  /* The checksum field must hold the "no checksum" magic */
+  const bool magic_ok = mach_read_from_4(buf + FIL_PAGE_SPACE_OR_CHKSUM)
+    == BUF_NO_CHECKSUM_MAGIC;
+  if (!magic_ok) {
+    return 0;
+  }
+
+  const ulint payload = mach_read_from_2(
+    buf + FIL_PAGE_DATA + FIL_PAGE_COMP_SIZE);
+
+  /* Reject a payload size that is empty or cannot fit in the page */
+  if (payload == 0 || payload > srv_page_size - hdr_len) {
+    return 0;
+  }
+
+  if (!fil_page_decompress_low(tmp_buf, buf, algo, hdr_len, payload)) {
+    return 0;
+  }
+
+  srv_stats.pages_page_decompressed.inc();
+  memcpy(buf, tmp_buf, srv_page_size);
+  return payload;
+}
+
+/** Decompress a page that may be subject to page_compressed compression,
+using the routine that matches the tablespace format.
+@param[in,out]	tmp_buf	temporary buffer (of innodb_page_size)
+@param[in,out]	buf	possibly compressed page buffer
+@param[in]	flags	tablespace flags
+@return size of the compressed data
+@retval 0 if decompression failed
+@retval srv_page_size if the page was not compressed */
+ulint fil_page_decompress(byte *tmp_buf, byte *buf, uint32_t flags)
+{
+  return fil_space_t::full_crc32(flags)
+    ? fil_page_decompress_for_full_crc32(tmp_buf, buf, flags)
+    : fil_page_decompress_for_non_full_crc32(tmp_buf, buf);
+}