/*****************************************************************************

Copyright (c) 1995, 2021, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2014, 2022, MariaDB Corporation.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA

*****************************************************************************/

/**************************************************//**
@file fil/fil0fil.cc
The tablespace memory cache

Created 10/25/1995 Heikki Tuuri
*******************************************************/

#include "fil0fil.h"
#include "fil0crypt.h"

#include "btr0btr.h"
#include "buf0buf.h"
#include "dict0boot.h"
#include "dict0dict.h"
#include "dict0load.h"
#include "fsp0file.h"
#include "fsp0fsp.h"
#include "hash0hash.h"
#include "log0log.h"
#include "log0recv.h"
#include "mach0data.h"
#include "mtr0log.h"
#include "os0file.h"
#include "page0zip.h"
#include "row0mysql.h"
#include "srv0start.h"
#include "trx0purge.h"
#include "buf0lru.h"
#include "buf0flu.h"
#include "log.h"
#ifdef __linux__
/* NOTE(review): the header names of the next three directives are missing
in this copy of the file (they look like they were stripped together with
their angle brackets). The Linux-only code in fil_system_t::create() uses
opendir()/readdir() and makedev(); restore the names before building. */
# include
# include
# include
#endif

#include "lz4.h"
#include "lzo/lzo1x.h"
#include "lzma.h"
#include "bzlib.h"
#include "snappy-c.h"

/** Flag the tablespace as corrupted and report it once.
Nothing is done if is_stopping() holds. Otherwise is_corrupted is set with
test_and_set(), and only the call that finds it previously unset writes
the error message, which names the first file of the chain. */
ATTRIBUTE_COLD void fil_space_t::set_corrupted() const
{
  if (!is_stopping() && !is_corrupted.test_and_set())
    sql_print_error("InnoDB: File '%s' is corrupted", chain.start->name);
}

/** Try to close a file to adhere to the innodb_open_files limit.
@param print_info   whether to diagnose why a file cannot be closed
@return whether a file was closed */
bool fil_space_t::try_to_close(bool print_info)
{
  mysql_mutex_assert_owner(&fil_system.mutex);
  /* Walk fil_system.space_list from the front and close the first file
  that qualifies; at most one file is closed per call. */
  for (fil_space_t &space : fil_system.space_list)
  {
    /* Temporary tablespaces and predefined tablespaces are skipped;
    FIL_TYPE_IMPORT and all other FIL_TYPE_TABLESPACE are candidates. */
    switch (space.purpose) {
    case FIL_TYPE_TEMPORARY:
      continue;
    case FIL_TYPE_IMPORT:
      break;
    case FIL_TYPE_TABLESPACE:
      if (is_predefined_tablespace(space.id))
        continue;
    }

    /* We are using an approximation of LRU replacement policy. In
    fil_node_open_file_low(), newly opened files are moved to the end
    of fil_system.space_list, so that they would be less likely to be
    closed here. */
    fil_node_t *node= UT_LIST_GET_FIRST(space.chain);
    if (!node)
      /* fil_ibd_create() did not invoke fil_space_t::add() yet */
      continue;
    /* The candidates handled here consist of a single file. */
    ut_ad(!UT_LIST_GET_NEXT(chain, node));

    if (!node->is_open())
      continue;

    /* n is the value returned by set_closing(); its STOPPING, PENDING and
    NEEDS_FSYNC bits decide below whether the file may be closed now. */
    const auto n= space.set_closing();
    if (n & STOPPING)
      /* Let fil_space_t::drop() in another thread handle this. */
      continue;
    if (n & (PENDING | NEEDS_FSYNC))
    {
      /* The file is busy. Optionally explain why, but only for the first
      busy file of this call (print_info is cleared right away). */
      if (!print_info)
        continue;
      print_info= false;
      const time_t now= time(nullptr);
      if (now - fil_system.n_open_exceeded_time < 5)
        continue; /* We display messages at most once in 5 seconds. */
      fil_system.n_open_exceeded_time= now;

      if (n & PENDING)
        sql_print_information("InnoDB: Cannot close file %s because of "
                              UINT32PF " pending operations%s", node->name,
                              n & PENDING,
                              (n & NEEDS_FSYNC) ? " and pending fsync" : "");
      else if (n & NEEDS_FSYNC)
        sql_print_information("InnoDB: Cannot close file %s because of "
                              "pending fsync", node->name);
      continue;
    }
    node->close();
    fil_system.move_closed_last_to_space_list(node->space);

    return true;
  }

  /* No file could be closed. */
  return false;
}

/*
		IMPLEMENTATION OF THE TABLESPACE MEMORY CACHE
		=============================================

The tablespace cache is responsible for providing fast read/write access to
tablespaces and logs of the database. File creation and deletion is done
in other modules which know more of the logic of the operation, however.

A tablespace consists of a chain of files.
The size of the files does not have to be divisible by the database block size, because we may just leave the last incomplete block unused. When a new file is appended to the tablespace, the maximum size of the file is also specified. At the moment, we think that it is best to extend the file to its maximum size already at the creation of the file, because then we can avoid dynamically extending the file when more space is needed for the tablespace. A block's position in the tablespace is specified with a 32-bit unsigned integer. The files in the chain are thought to be catenated, and the block corresponding to an address n is the nth block in the catenated file (where the first block is named the 0th block, and the incomplete block fragments at the end of files are not taken into account). A tablespace can be extended by appending a new file at the end of the chain. Our tablespace concept is similar to the one of Oracle. To acquire more speed in disk transfers, a technique called disk striping is sometimes used. This means that logical block addresses are divided in a round-robin fashion across several disks. Windows NT supports disk striping, so there we do not need to support it in the database. Disk striping is implemented in hardware in RAID disks. We conclude that it is not necessary to implement it in the database. Oracle 7 does not support disk striping, either. Another trick used at some database sites is replacing tablespace files by raw disks, that is, the whole physical disk drive, or a partition of it, is opened as a single file, and it is accessed through byte offsets calculated from the start of the disk or the partition. This is recommended in some books on database tuning to achieve more speed in i/o. Using raw disk certainly prevents the OS from fragmenting disk space, but it is not clear if it really adds speed. 
We measured on the Pentium 100 MHz + NT + NTFS file system + EIDE Conner disk
only a negligible difference in speed when reading from a file, versus reading
from a raw disk.

To have fast access to a tablespace or a log file, we put the data structures
to a hash table. Each tablespace and log file is given an unique 32-bit
identifier. */

/** Reference to the server data directory. Usually it is the
current working directory ".", but in the MariaDB Embedded Server Library
it is an absolute path. */
const char* fil_path_to_mysql_datadir;

/** Common InnoDB file extensions */
const char* dot_ext[] = { "", ".ibd", ".isl", ".cfg" };

/** Number of pending tablespace flushes.
NOTE(review): the template argument of Atomic_counter appears to be missing
in this copy of the file; restore it to match the declaration in the header. */
Atomic_counter fil_n_pending_tablespace_flushes;

/** The tablespace memory cache.
NOTE(review): this is an object, not a pointer, so it is never NULL; code in
this file checks fil_system.is_initialised() to find out whether the module
has been initialized. */
fil_system_t fil_system;

/** At this age or older a space/page will be rotated */
extern uint srv_fil_crypt_rotate_key_age;

#ifdef UNIV_DEBUG
/** Try fil_validate() every this many times */
# define FIL_VALIDATE_SKIP 17

/******************************************************************//**
Checks the consistency of the tablespace cache some of the time.
@return true if ok or the check was skipped */
static
bool
fil_validate_skip(void)
/*===================*/
{
  /** The fil_validate() call skip counter.
  NOTE(review): the template argument of Atomic_counter appears to be
  missing here as well. */
  static Atomic_counter fil_validate_count;

  /* We want to reduce the call frequency of the costly fil_validate()
  check in debug builds. fil_validate() only runs when the counter value
  before the increment is a multiple of FIL_VALIDATE_SKIP. */
  return (fil_validate_count++ % FIL_VALIDATE_SKIP) || fil_validate();
}
#endif /* UNIV_DEBUG */

/** Look up a tablespace.
The caller must hold fil_system.mutex.
@param id   tablespace identifier
@return tablespace
@retval nullptr if not found */
fil_space_t *fil_space_get_by_id(uint32_t id)
{
  fil_space_t* space;

  ut_ad(fil_system.is_initialised());
  mysql_mutex_assert_owner(&fil_system.mutex);

  /* Search fil_system.spaces for an element whose id matches. */
  HASH_SEARCH(hash, &fil_system.spaces, id,
              fil_space_t*, space,, space->id == id);

  return(space);
}

/** Look up a tablespace.
The caller should hold an InnoDB table lock or a MDL that prevents
the tablespace from being dropped during the operation,
or the caller should be in single-threaded crash recovery mode
(no user connections that could drop tablespaces).
Normally, fil_space_t::get() should be used instead.
@param[in] id tablespace ID
@return tablespace, or NULL if not found */
fil_space_t *fil_space_get(uint32_t id)
{
  fil_space_t *found;

  /* The lookup itself requires fil_system.mutex; the mutex is released
  again before the result is handed back to the caller. */
  mysql_mutex_lock(&fil_system.mutex);
  found= fil_space_get_by_id(id);
  mysql_mutex_unlock(&fil_system.mutex);

  return found;
}

/** Tell whether a page compression algorithm can be used.
@param[in] comp_algo compression algorithm identifier
@return whether the compression algorithm is loaded
@retval false for an identifier that is not handled here */
bool fil_comp_algo_loaded(ulint comp_algo)
{
  bool loaded= false;

  switch (comp_algo) {
  case PAGE_UNCOMPRESSED:
  case PAGE_ZLIB_ALGORITHM:
    /* These two never depend on a provider. */
    loaded= true;
    break;
  case PAGE_LZ4_ALGORITHM:
    loaded= provider_service_lz4->is_loaded;
    break;
  case PAGE_LZO_ALGORITHM:
    loaded= provider_service_lzo->is_loaded;
    break;
  case PAGE_LZMA_ALGORITHM:
    loaded= provider_service_lzma->is_loaded;
    break;
  case PAGE_BZIP2_ALGORITHM:
    loaded= provider_service_bzip2->is_loaded;
    break;
  case PAGE_SNAPPY_ALGORITHM:
    loaded= provider_service_snappy->is_loaded;
    break;
  }

  return loaded;
}

/** Append a file to the chain of files of a space.
@param[in] name file name of a file that is not open @param[in] handle file handle, or OS_FILE_CLOSED @param[in] size file size in entire database pages @param[in] is_raw whether this is a raw device @param[in] atomic_write true if atomic write could be enabled @param[in] max_pages maximum number of pages in file, or UINT32_MAX for unlimited @return file object */ fil_node_t* fil_space_t::add(const char* name, pfs_os_file_t handle, uint32_t size, bool is_raw, bool atomic_write, uint32_t max_pages) { mysql_mutex_assert_owner(&fil_system.mutex); fil_node_t* node; ut_ad(name != NULL); ut_ad(fil_system.is_initialised()); node = reinterpret_cast(ut_zalloc_nokey(sizeof(*node))); node->handle = handle; node->name = mem_strdup(name); ut_a(!is_raw || srv_start_raw_disk_in_use); node->is_raw_disk = is_raw; node->size = size; node->init_size = size; node->max_size = max_pages; node->space = this; node->atomic_write = atomic_write; this->size += size; UT_LIST_ADD_LAST(chain, node); if (node->is_open()) { clear_closing(); if (++fil_system.n_open >= srv_max_n_open_files) { reacquire(); try_to_close(true); release(); } } return node; } __attribute__((warn_unused_result, nonnull)) /** Open a tablespace file. @param node data file @return whether the file was successfully opened */ static bool fil_node_open_file_low(fil_node_t *node) { ut_ad(!node->is_open()); ut_ad(node->space->is_closing()); mysql_mutex_assert_owner(&fil_system.mutex); ulint type; static_assert(((UNIV_ZIP_SIZE_MIN >> 1) << 3) == 4096, "compatibility"); switch (FSP_FLAGS_GET_ZIP_SSIZE(node->space->flags)) { case 1: case 2: type= OS_DATA_FILE_NO_O_DIRECT; break; default: type= OS_DATA_FILE; } for (;;) { bool success; node->handle= os_file_create(innodb_data_file_key, node->name, node->is_raw_disk ? 
OS_FILE_OPEN_RAW | OS_FILE_ON_ERROR_NO_EXIT : OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT, OS_FILE_AIO, type, srv_read_only_mode, &success); if (success && node->is_open()) { #ifndef _WIN32 if (!node->space->id && !srv_read_only_mode && my_disable_locking && os_file_lock(node->handle, node->name)) { os_file_close(node->handle); node->handle= OS_FILE_CLOSED; return false; } #endif break; } /* The following call prints an error message */ if (os_file_get_last_error(true) == EMFILE + 100 && fil_space_t::try_to_close(true)) continue; ib::warn() << "Cannot open '" << node->name << "'."; return false; } ulint comp_algo = node->space->get_compression_algo(); bool comp_algo_invalid = false; if (node->size); else if (!node->read_page0() || // validate compression algorithm for full crc32 format (node->space->full_crc32() && (comp_algo_invalid = !fil_comp_algo_loaded(comp_algo)))) { if (comp_algo_invalid) { if (comp_algo <= PAGE_ALGORITHM_LAST) ib::warn() << "'" << node->name << "' is compressed with " << page_compression_algorithms[comp_algo] << ", which is not currently loaded"; else ib::warn() << "'" << node->name << "' is compressed with " << "invalid algorithm: " << comp_algo; } os_file_close(node->handle); node->handle= OS_FILE_CLOSED; return false; } ut_ad(node->is_open()); fil_system.move_opened_last_to_space_list(node->space); fil_system.n_open++; return true; } /** Open a tablespace file. 
@param node  data file
@return whether the file was successfully opened */
static bool fil_node_open_file(fil_node_t *node)
{
  mysql_mutex_assert_owner(&fil_system.mutex);
  ut_ad(!node->is_open());
  ut_ad(!is_predefined_tablespace(node->space->id) ||
        srv_operation == SRV_OPERATION_BACKUP ||
        srv_operation == SRV_OPERATION_RESTORE ||
        srv_operation == SRV_OPERATION_RESTORE_DELTA);
  ut_ad(node->space->purpose != FIL_TYPE_TEMPORARY);
  ut_ad(node->space->referenced());

  /* Remembered so that we can tell below whether try_to_close() updated
  fil_system.n_open_exceeded_time during this call. */
  const auto old_time= fil_system.n_open_exceeded_time;

  /* While the open file limit is reached, try to close another file.
  count is the number of consecutive failed attempts: the first failure
  sleeps and flushes, the second enables diagnostics in try_to_close(),
  and after that we give up and open the file anyway. */
  for (ulint count= 0; fil_system.n_open >= srv_max_n_open_files; count++)
  {
    if (fil_space_t::try_to_close(count > 1))
      count= 0;
    else if (count >= 2)
    {
      if (old_time != fil_system.n_open_exceeded_time)
        sql_print_warning("InnoDB: innodb_open_files=" ULINTPF
                          " is exceeded (" ULINTPF " files stay open)",
                          srv_max_n_open_files, fil_system.n_open);
      break;
    }
    else
    {
      mysql_mutex_unlock(&fil_system.mutex);
      std::this_thread::sleep_for(std::chrono::milliseconds(20));
      /* Flush tablespaces so that we can close modified files. */
      fil_flush_file_spaces();
      mysql_mutex_lock(&fil_system.mutex);
      if (node->is_open())
        return true;
    }
  }

  /* The node can be opened between releasing and acquiring
  fil_system.mutex in the above code */
  return node->is_open() || fil_node_open_file_low(node);
}

/** Close the file handle.
*/
void fil_node_t::close()
{
  /* Checks the preconditions and decrements fil_system.n_open. */
  prepare_to_close_or_detach();

  /* printf("Closing file %s\n", name); */
  int ret= os_file_close(handle);
  ut_a(ret);
  handle= OS_FILE_CLOSED;
}

/** Give up the file handle without closing it.
The node is marked closed and the open file count is decremented, as in
close(), but the handle itself is returned to the caller.
@return the handle that was detached */
pfs_os_file_t fil_node_t::detach()
{
  prepare_to_close_or_detach();

  pfs_os_file_t result= handle;
  handle= OS_FILE_CLOSED;
  return result;
}

/** Common part of close() and detach(): assert that the file is open,
is not being extended and may be closed, and decrement fil_system.n_open.
The caller must hold fil_system.mutex. */
void fil_node_t::prepare_to_close_or_detach()
{
  mysql_mutex_assert_owner(&fil_system.mutex);
  ut_ad(space->is_ready_to_close() || srv_operation == SRV_OPERATION_BACKUP ||
        srv_operation == SRV_OPERATION_RESTORE_DELTA);
  ut_a(is_open());
  ut_a(!being_extended);
  ut_a(space->is_ready_to_close() || space->purpose == FIL_TYPE_TEMPORARY ||
       srv_fast_shutdown == 2 || !srv_was_started);

  ut_a(fil_system.n_open > 0);
  fil_system.n_open--;
}

/** Flush any writes cached by the file system.
The caller must not hold fil_system.mutex. */
void fil_space_t::flush_low()
{
  mysql_mutex_assert_not_owner(&fil_system.mutex);

  /* Set the NEEDS_FSYNC flag in n_pending. The first attempt expects the
  value 1; on failure, compare_exchange_strong() reloads n with the current
  value and the attempt is repeated with that value. */
  uint32_t n= 1;
  while (!n_pending.compare_exchange_strong(n, n | NEEDS_FSYNC,
                                            std::memory_order_acquire,
                                            std::memory_order_relaxed))
  {
    ut_ad(n & PENDING);
    if (n & STOPPING_WRITES)
      /* Nothing is flushed in this state. */
      return;
    if (n & NEEDS_FSYNC)
      /* The flag is already set; go on to flush. */
      break;
  }

  fil_n_pending_tablespace_flushes++;
  for (fil_node_t *node= UT_LIST_GET_FIRST(chain); node;
       node= UT_LIST_GET_NEXT(chain, node))
  {
    if (!node->is_open())
    {
      ut_ad(!is_in_unflushed_spaces);
      continue;
    }
    /* On Windows only, raw devices are skipped. */
    IF_WIN(if (node->is_raw_disk) continue,);
    os_file_flush(node->handle);
  }

  /* Checked once without the mutex and again under it, so that the mutex
  is only acquired when there may be something to remove. */
  if (is_in_unflushed_spaces)
  {
    mysql_mutex_lock(&fil_system.mutex);
    if (is_in_unflushed_spaces)
    {
      is_in_unflushed_spaces= false;
      fil_system.unflushed_spaces.remove(*this);
    }
    mysql_mutex_unlock(&fil_system.mutex);
  }

  clear_flush();
  fil_n_pending_tablespace_flushes--;
}

/** Try to extend a tablespace.
@param[in,out] space tablespace to be extended
@param[in,out] node last file of the tablespace
@param[in] size desired size in number of pages
@param[out] success whether the operation succeeded
@return whether the operation should be retried */
/* Locking: the caller holds fil_system.mutex. When false is returned, the
mutex is held on return. When true is returned, the mutex has been released,
and the caller must acquire it again before retrying. */
static ATTRIBUTE_COLD __attribute__((warn_unused_result, nonnull))
bool
fil_space_extend_must_retry(
  fil_space_t* space,
  fil_node_t* node,
  uint32_t size,
  bool* success)
{
  mysql_mutex_assert_owner(&fil_system.mutex);
  ut_ad(UT_LIST_GET_LAST(space->chain) == node);
  ut_ad(size >= FIL_IBD_FILE_INITIAL_SIZE);
  ut_ad(node->space == space);
  ut_ad(space->referenced() || space->is_being_truncated);

  *success = space->size >= size;

  if (*success) {
    /* Space already big enough */
    return(false);
  }

  if (node->being_extended) {
    /* Another thread is currently extending the file. Wait
    for it to finish.  It'd have been better to use event
    driven mechanism but the entire module is peppered with
    polling stuff. */
    mysql_mutex_unlock(&fil_system.mutex);
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
    return(true);
  }

  node->being_extended = true;

  /* At this point it is safe to release fil_system.mutex. No
  other thread can rename, delete, close or extend the file because
  we have set the node->being_extended flag. */
  mysql_mutex_unlock(&fil_system.mutex);

  ut_ad(size >= space->size);

  /* Page numbers are relative to the tablespace; file_start_page_no is
  the tablespace page number at which this (last) file begins. */
  uint32_t last_page_no = space->size;
  const uint32_t file_start_page_no = last_page_no - node->size;

  const unsigned page_size = space->physical_size();

  /* Datafile::read_first_page() expects innodb_page_size bytes.
  fil_node_t::read_page0() expects at least 4 * innodb_page_size bytes.
  os_file_set_size() expects multiples of 4096 bytes.
  For ROW_FORMAT=COMPRESSED tables using 1024-byte or 2048-byte
  pages, we will preallocate up to an integer multiple of 4096 bytes,
  and let normal writes append 1024, 2048, or 3072 bytes to the file. */
  os_offset_t new_size = std::max(
    (os_offset_t(size - file_start_page_no) * page_size)
    & ~os_offset_t(4095),
    os_offset_t(FIL_IBD_FILE_INITIAL_SIZE << srv_page_size_shift));

  *success = os_file_set_size(node->name, node->handle, new_size,
                              node->punch_hole == 1);

  /* NOTE(review): this stores true after a successful extension and false
  after a failed one; confirm that this is the intended meaning of
  os_has_said_disk_full. */
  os_has_said_disk_full = *success;
  if (*success) {
    os_file_flush(node->handle);
    last_page_no = size;
  } else {
    /* Let us measure the size of the file
    to determine how much we were able to
    extend it */
    os_offset_t fsize = os_file_get_size(node->handle);
    ut_a(fsize != os_offset_t(-1));

    last_page_no = uint32_t(fsize / page_size)
      + file_start_page_no;
  }
  mysql_mutex_lock(&fil_system.mutex);

  ut_a(node->being_extended);
  node->being_extended = false;
  ut_a(last_page_no - file_start_page_no >= node->size);

  /* Publish the new size of the file and of the tablespace. */
  uint32_t file_size = last_page_no - file_start_page_no;
  space->size += file_size - node->size;
  node->size = file_size;
  /* The file size in pages, rounded down to a multiple of the number of
  pages in one MiB (1 << (20 - srv_page_size_shift)). */
  const uint32_t pages_in_MiB = node->size
    & ~uint32_t((1U << (20U - srv_page_size_shift)) - 1);

  /* Keep the last data file size info up to date, rounded to
  full megabytes */

  switch (space->id) {
  case TRX_SYS_SPACE:
    srv_sys_space.set_last_file_size(pages_in_MiB);
  do_flush:
    /* flush_low() must be called without fil_system.mutex; hold a
    reference to the tablespace while the mutex is released. */
    space->reacquire();
    mysql_mutex_unlock(&fil_system.mutex);
    space->flush_low();
    space->release();
    mysql_mutex_lock(&fil_system.mutex);
    break;
  default:
    ut_ad(space->purpose == FIL_TYPE_TABLESPACE
          || space->purpose == FIL_TYPE_IMPORT);
    if (space->purpose == FIL_TYPE_TABLESPACE
        && !space->is_being_truncated) {
      goto do_flush;
    }
    break;
  case SRV_TMP_SPACE_ID:
    /* The temporary tablespace is not flushed here. */
    ut_ad(space->purpose == FIL_TYPE_TEMPORARY);
    srv_tmp_space.set_last_file_size(pages_in_MiB);
    break;
  }

  return false;
}

/** Open the last file of the tablespace if needed and, if recv_size is
set, extend the tablespace to that size. The caller must hold
fil_system.mutex and a reference to the tablespace. If the file cannot be
opened, the reference is released with release().
@return whether the file is usable for io() */
ATTRIBUTE_COLD bool fil_space_t::prepare_acquired()
{
  ut_ad(referenced());
  mysql_mutex_assert_owner(&fil_system.mutex);
  fil_node_t *node= UT_LIST_GET_LAST(chain);
  ut_ad(!id || purpose == FIL_TYPE_TEMPORARY ||
        node == UT_LIST_GET_FIRST(chain));

  const bool is_open= node &&
    (node->is_open() || fil_node_open_file(node));

  if (!is_open)
    release();
  else if (node->deferred);
    /* Nothing more is done for a deferred file; note that
    clear_closing() is not invoked in this case. */
  else if (auto desired_size= recv_size)
  {
    bool success;
    /* A return value of true means that fil_system.mutex was released;
    reacquire it and retry. */
    while (fil_space_extend_must_retry(this, node, desired_size, &success))
      mysql_mutex_lock(&fil_system.mutex);

    mysql_mutex_assert_owner(&fil_system.mutex);
    /* Crash recovery requires the file extension to succeed. */
    ut_a(success);
    /* InnoDB data files cannot shrink. */
    ut_a(size >= desired_size);
    if (desired_size > committed_size)
      committed_size= desired_size;

    /* There could be multiple concurrent I/O requests for this
    tablespace (multiple threads trying to extend this tablespace).

    Also, fil_space_set_recv_size_and_flags() may have been invoked
    again during the file extension while fil_system.mutex was not
    being held by us.

    Only if recv_size matches what we read originally, reset the
    field. In this way, a subsequent I/O request will handle any
    pending fil_space_set_recv_size_and_flags(). */

    if (desired_size == recv_size)
    {
      recv_size= 0;
      goto clear;
    }
  }
  else
clear:
    clear_closing();

  return is_open;
}

/** Acquire a reference under fil_system.mutex and, if the tablespace is
in the closing state, prepare it with prepare_acquired().
The result is true when neither STOPPING nor CLOSING is set, false when
STOPPING is set, and the outcome of prepare_acquired() when only CLOSING
is set.
@return whether the file is usable for io() */
ATTRIBUTE_COLD bool fil_space_t::acquire_and_prepare()
{
  mysql_mutex_lock(&fil_system.mutex);
  const auto flags= acquire_low() & (STOPPING | CLOSING);
  const bool is_open= !flags || (flags == CLOSING && prepare_acquired());
  mysql_mutex_unlock(&fil_system.mutex);
  return is_open;
}

/** Try to extend a tablespace if it is smaller than the specified size.
@param[in,out] space tablespace @param[in] size desired size in pages @return whether the tablespace is at least as big as requested */ bool fil_space_extend(fil_space_t *space, uint32_t size) { ut_ad(!srv_read_only_mode || space->purpose == FIL_TYPE_TEMPORARY); bool success= false; const bool acquired= space->acquire(); mysql_mutex_lock(&fil_system.mutex); if (acquired || space->is_being_truncated) { while (fil_space_extend_must_retry(space, UT_LIST_GET_LAST(space->chain), size, &success)) mysql_mutex_lock(&fil_system.mutex); } mysql_mutex_unlock(&fil_system.mutex); if (acquired) space->release(); return success; } /** Prepare to free a file from fil_system. */ inline pfs_os_file_t fil_node_t::close_to_free(bool detach_handle) { mysql_mutex_assert_owner(&fil_system.mutex); ut_a(!being_extended); if (is_open() && (space->n_pending.fetch_or(fil_space_t::CLOSING, std::memory_order_acquire) & fil_space_t::PENDING)) { mysql_mutex_unlock(&fil_system.mutex); while (space->referenced()) std::this_thread::sleep_for(std::chrono::microseconds(100)); mysql_mutex_lock(&fil_system.mutex); } while (is_open()) { if (space->is_in_unflushed_spaces) { ut_ad(srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC); space->is_in_unflushed_spaces= false; fil_system.unflushed_spaces.remove(*space); } ut_a(!being_extended); if (detach_handle) { auto result= handle; handle= OS_FILE_CLOSED; return result; } bool ret= os_file_close(handle); ut_a(ret); handle= OS_FILE_CLOSED; break; } return OS_FILE_CLOSED; } /** Detach a tablespace from the cache and close the files. 
@param space tablespace
@param detach_handle whether to detach the handle, instead of closing
@return detached handle
@retval OS_FILE_CLOSED if no handle was detached */
pfs_os_file_t fil_system_t::detach(fil_space_t *space, bool detach_handle)
{
  mysql_mutex_assert_owner(&fil_system.mutex);
  /* After this, lookups by identifier no longer find the tablespace. */
  HASH_DELETE(fil_space_t, hash, &spaces, space->id, space);

  if (space->is_in_unflushed_spaces)
  {
    ut_ad(srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC);
    space->is_in_unflushed_spaces= false;
    unflushed_spaces.remove(*space);
  }

  if (space->is_in_default_encrypt)
  {
    space->is_in_default_encrypt= false;
    default_encrypt_tables.remove(*space);
  }

  {
    /* Remove the tablespace from space_list. If space_list_last_opened
    points to it, move that pointer to the preceding element, or reset
    it if the tablespace is the first element. */
    space_list_t::iterator s= space_list_t::iterator(space);
    if (space_list_last_opened == space)
    {
      if (s == space_list.begin())
      {
        ut_ad(srv_operation > SRV_OPERATION_EXPORT_RESTORED ||
              srv_shutdown_state > SRV_SHUTDOWN_NONE);
        space_list_last_opened= nullptr;
      }
      else
      {
        space_list_t::iterator prev= s;
        space_list_last_opened= &*--prev;
      }
    }
    space_list.erase(s);
  }

  if (space == sys_space)
    sys_space= nullptr;
  else if (space == temp_space)
    temp_space= nullptr;

  /* Account for the open files here; fil_node_t::close_to_free() below
  does not update n_open. */
  for (fil_node_t* node= UT_LIST_GET_FIRST(space->chain); node;
       node= UT_LIST_GET_NEXT(chain, node))
    if (node->is_open())
    {
      ut_ad(n_open > 0);
      n_open--;
    }

  /* Detaching a handle is only expected for a tablespace other than 0
  that consists of at most one file. */
  ut_ad(!detach_handle || space->id);
  ut_ad(!detach_handle || UT_LIST_GET_LEN(space->chain) <= 1);

  pfs_os_file_t handle= OS_FILE_CLOSED;

  for (fil_node_t* node= UT_LIST_GET_FIRST(space->chain); node;
       node= UT_LIST_GET_NEXT(chain, node))
    handle= node->close_to_free(detach_handle);

  ut_ad(!space->referenced());
  return handle;
}

/** Free a tablespace object on which fil_system_t::detach() was invoked.
There must not be any pending i/o's or flushes on the files.
The file names, the file nodes, the encryption metadata and the tablespace
object itself are freed.
@param[in,out] space tablespace */
static
void
fil_space_free_low(
  fil_space_t* space)
{
  /* The tablespace must not be in fil_system.named_spaces. */
  ut_ad(srv_fast_shutdown == 2 || !srv_was_started
        || space->max_lsn == 0);

  /* Wait for fil_space_t::release() after
  fil_system_t::detach(), the tablespace cannot be found, so
  fil_space_t::get() would return NULL */
  while (space->referenced()) {
    std::this_thread::sleep_for(std::chrono::microseconds(100));
  }

  for (fil_node_t* node = UT_LIST_GET_FIRST(space->chain);
       node != NULL; ) {
    /* Debug builds subtract each file size so that the total can be
    asserted to be 0 below. */
    ut_d(space->size -= node->size);
    ut_free(node->name);
    /* Fetch the successor before the node is freed. */
    fil_node_t* old_node = node;
    node = UT_LIST_GET_NEXT(chain, node);
    ut_free(old_node);
  }

  ut_ad(space->size == 0);

  fil_space_destroy_crypt_data(&space->crypt_data);

  /* The object was created with placement new in zero-filled memory
  (see fil_space_t::create()), so destroy and free it separately. */
  space->~fil_space_t();
  ut_free(space);
}

/** Frees a space object from the tablespace memory cache.
Closes the files in the chain but does not delete them.
There must not be any pending i/o's or flushes on the files.
@param id          tablespace identifier
@param x_latched   whether the caller holds exclusive fil_space_t::latch
@return true if success */
bool fil_space_free(uint32_t id, bool x_latched)
{
  ut_ad(id != TRX_SYS_SPACE);

  mysql_mutex_lock(&fil_system.mutex);
  fil_space_t* space = fil_space_get_by_id(id);

  if (space != NULL) {
    fil_system.detach(space);
  }

  mysql_mutex_unlock(&fil_system.mutex);

  if (space != NULL) {
    if (x_latched) {
      space->x_unlock();
    }

    /* Remove the tablespace from fil_system.named_spaces if max_lsn is
    set. Outside of recovery, log_sys.latch is acquired for this; during
    recovery it is asserted to be held already. */
    if (!recv_recovery_is_on()) {
      log_sys.latch.wr_lock(SRW_LOCK_CALL);

      if (space->max_lsn) {
        ut_d(space->max_lsn = 0);
        fil_system.named_spaces.remove(*space);
      }

      log_sys.latch.wr_unlock();
    } else {
#ifndef SUX_LOCK_GENERIC
      ut_ad(log_sys.latch.is_write_locked());
#endif
      if (space->max_lsn) {
        ut_d(space->max_lsn = 0);
        fil_system.named_spaces.remove(*space);
      }
    }

    fil_space_free_low(space);
  }

  /* False means that no tablespace with this identifier was found. */
  return(space != NULL);
}

/** Create a tablespace in fil_system.
@param name       tablespace name
@param id         tablespace identifier
@param flags      tablespace flags
@param purpose    tablespace purpose
@param crypt_data encryption information
@param mode       encryption mode
@param opened     true if space files are opened
@return pointer to created tablespace, to be filled in with add()
@retval nullptr on failure (such as when the same tablespace exists) */
/* NOTE(review): the comment above documents a "name" parameter that the
signature below does not have. The caller must hold fil_system.mutex; the
mutex is released and reacquired while signalling the encryption threads. */
fil_space_t *fil_space_t::create(uint32_t id, uint32_t flags,
                                 fil_type_t purpose,
                                 fil_space_crypt_t *crypt_data,
                                 fil_encryption_t mode,
                                 bool opened)
{
  fil_space_t* space;

  mysql_mutex_assert_owner(&fil_system.mutex);
  ut_ad(fil_system.is_initialised());
  ut_ad(fil_space_t::is_valid_flags(flags & ~FSP_FLAGS_MEM_MASK, id));
  ut_ad(srv_page_size == UNIV_PAGE_SIZE_ORIG || flags != 0);

  DBUG_EXECUTE_IF("fil_space_create_failure", return(NULL););

  /* FIXME: if calloc() is defined as an inline function that calls
  memset() or bzero(), then GCC 6 -flifetime-dse can optimize it away */
  space= new (ut_zalloc_nokey(sizeof(*space))) fil_space_t;

  space->id = id;

  UT_LIST_INIT(space->chain, &fil_node_t::chain);

  space->purpose = purpose;
  space->flags = flags;

  space->crypt_data = crypt_data;
  /* A new tablespace starts in the CLOSING state;
  fil_space_t::add() clears it when an open file is attached. */
  space->n_pending.store(CLOSING, std::memory_order_relaxed);

  DBUG_LOG("tablespace", "Created metadata for " << id);
  if (crypt_data) {
    DBUG_LOG("crypt",
             "Tablespace " << id
             << " encryption " << crypt_data->encryption
             << " key id " << crypt_data->key_id
             << ":" << fil_crypt_get_mode(crypt_data)
             << " " << fil_crypt_get_type(crypt_data));
  }

  space->latch.SRW_LOCK_INIT(fil_space_latch_key);

  /* Refuse to register a second tablespace with the same identifier;
  the object that was just constructed is destroyed again. */
  if (const fil_space_t *old_space = fil_space_get_by_id(id)) {
    ib::error() << "Trying to add tablespace with id " << id
                << " to the cache, but tablespace '"
                << (old_space->chain.start
                    ? old_space->chain.start->name
                    : "")
                << "' already exists in the cache!";
    space->~fil_space_t();
    ut_free(space);
    return(NULL);
  }

  HASH_INSERT(fil_space_t, hash, &fil_system.spaces, id, space);

  if (opened)
    fil_system.add_opened_last_to_space_list(space);
  else
    fil_system.space_list.push_back(*space);

  switch (id) {
  case 0:
    ut_ad(!fil_system.sys_space);
    fil_system.sys_space = space;
    break;
  case SRV_TMP_SPACE_ID:
    ut_ad(!fil_system.temp_space);
    fil_system.temp_space = space;
    break;
  default:
    ut_ad(purpose != FIL_TYPE_TEMPORARY);
    if (UNIV_LIKELY(id <= fil_system.max_assigned_id)) {
      break;
    }
    if (UNIV_UNLIKELY(srv_operation == SRV_OPERATION_BACKUP)) {
      break;
    }
    /* The identifier exceeds the previous maximum: remember it,
    and warn unless space_id_reuse_warned is set. */
    if (!fil_system.space_id_reuse_warned) {
      ib::warn() << "Allocated tablespace ID " << id
                 << ", old maximum was "
                 << fil_system.max_assigned_id;
    }

    fil_system.max_assigned_id = id;
  }

  const bool rotate = purpose == FIL_TYPE_TABLESPACE
    && (mode == FIL_ENCRYPTION_ON || mode == FIL_ENCRYPTION_OFF
        || srv_encrypt_tables)
    && fil_crypt_must_default_encrypt();

  if (rotate) {
    fil_system.default_encrypt_tables.push_back(*space);
    space->is_in_default_encrypt = true;

    if (srv_n_fil_crypt_threads_started) {
      /* fil_crypt_threads_signal() is invoked without holding
      fil_system.mutex. */
      mysql_mutex_unlock(&fil_system.mutex);
      fil_crypt_threads_signal();
      mysql_mutex_lock(&fil_system.mutex);
    }
  }

  return(space);
}

/* NOTE(review): this copy of the file is damaged from the middle of the
warning message in fil_assign_new_space_id() onwards: the rest of that
function and the beginning of the following function (which ends in
"release(); return ok;") are missing. The remaining text is reproduced as
found and does not compile; restore both functions from a pristine copy. */
/*******************************************************************//**
Assigns a new space id for a new single-table tablespace. This works simply by
incrementing the global counter. If 4 billion id's is not enough, we may need
to recycle id's.
@return true if assigned, false if not */
bool fil_assign_new_space_id(uint32_t *space_id)
{
  uint32_t id = *space_id;
  bool success;

  mysql_mutex_lock(&fil_system.mutex);

  /* Continue from the highest identifier assigned so far. */
  if (id < fil_system.max_assigned_id) {
    id = fil_system.max_assigned_id;
  }

  id++;

  if (id > (SRV_SPACE_ID_UPPER_BOUND / 2) && (id % 1000000UL == 0)) {
    ib::warn() << "You are running out of new single-table"
      " tablespace id's. Current counter is " << id
      << " and it must not exceed" <is_open() || fil_node_open_file(node);
  release();
  return ok;
}

/** Look up a tablespace and ensure that its first page has been validated.
Invokes fil_space_get_by_id(), which requires fil_system.mutex.
@param id  tablespace identifier
@return tablespace
@retval nullptr if not found or if read_page0() failed */
static fil_space_t *fil_space_get_space(uint32_t id)
{
  if (fil_space_t *space= fil_space_get_by_id(id))
    if (space->read_page0())
      return space;
  return nullptr;
}

/** Set the recovered size and the flags of a tablespace.
Nothing is done if fil_space_get_space() does not return the tablespace.
@param id     tablespace identifier
@param size   recovered size to store in recv_size; 0 leaves it unchanged
@param flags  tablespace flags to store; FSP_FLAGS_FCRC32_MASK_MARKER
              leaves the flags unchanged */
void fil_space_set_recv_size_and_flags(uint32_t id, uint32_t size,
                                       uint32_t flags)
{
  ut_ad(id < SRV_SPACE_ID_UPPER_BOUND);
  mysql_mutex_lock(&fil_system.mutex);
  if (fil_space_t *space= fil_space_get_space(id))
  {
    if (size)
      space->recv_size= size;
    if (flags != FSP_FLAGS_FCRC32_MASK_MARKER)
      space->flags= flags;
  }
  mysql_mutex_unlock(&fil_system.mutex);
}

/** Open each file. Never invoked on .ibd files.
@param create_new_db    whether to skip the call to fil_node_t::read_page0()
@return whether all files were opened */
bool fil_space_t::open(bool create_new_db)
{
  ut_ad(fil_system.is_initialised());
  ut_ad(!id || create_new_db);

  bool success= true;
  /* read_page0() is invoked on at most one file: the first one, and not
  at all when creating a new database. */
  bool skip_read= create_new_db;

  mysql_mutex_lock(&fil_system.mutex);

  for (fil_node_t *node= UT_LIST_GET_FIRST(chain); node;
       node= UT_LIST_GET_NEXT(chain, node))
  {
    if (!node->is_open() && !fil_node_open_file_low(node))
    {
err_exit:
      success= false;
      break;
    }

    if (create_new_db)
    {
      node->find_metadata(node->handle);
      continue;
    }
    if (skip_read)
    {
      /* Files after the first one only contribute their size. */
      size+= node->size;
      continue;
    }

    if (!node->read_page0())
    {
      /* Undo the effect of opening the file before reporting failure. */
      fil_system.n_open--;
      os_file_close(node->handle);
      node->handle= OS_FILE_CLOSED;
      goto err_exit;
    }

    skip_read= true;
  }

  if (!create_new_db)
    committed_size= size;
  mysql_mutex_unlock(&fil_system.mutex);
  return success;
}

/** Close each file. Only invoked on fil_system.temp_space.
*/ void fil_space_t::close() { if (!fil_system.is_initialised()) { return; } mysql_mutex_lock(&fil_system.mutex); ut_ad(this == fil_system.temp_space || srv_operation == SRV_OPERATION_BACKUP || srv_operation == SRV_OPERATION_RESTORE || srv_operation == SRV_OPERATION_RESTORE_DELTA); for (fil_node_t* node = UT_LIST_GET_FIRST(chain); node != NULL; node = UT_LIST_GET_NEXT(chain, node)) { if (node->is_open()) { node->close(); } } mysql_mutex_unlock(&fil_system.mutex); } void fil_system_t::create(ulint hash_size) { ut_ad(this == &fil_system); ut_ad(!is_initialised()); ut_ad(!(srv_page_size % FSP_EXTENT_SIZE)); ut_ad(srv_page_size); ut_ad(!spaces.array); m_initialised = true; compile_time_assert(!(UNIV_PAGE_SIZE_MAX % FSP_EXTENT_SIZE_MAX)); compile_time_assert(!(UNIV_PAGE_SIZE_MIN % FSP_EXTENT_SIZE_MIN)); ut_ad(hash_size > 0); mysql_mutex_init(fil_system_mutex_key, &mutex, nullptr); spaces.create(hash_size); fil_space_crypt_init(); #ifdef __linux__ ssd.clear(); char fn[sizeof(dirent::d_name) + sizeof "/sys/block/" "/queue/rotational"]; const size_t sizeof_fnp = (sizeof fn) - sizeof "/sys/block"; memcpy(fn, "/sys/block/", sizeof "/sys/block"); char* fnp = &fn[sizeof "/sys/block"]; std::set ssd_devices; if (DIR* d = opendir("/sys/block")) { while (struct dirent* e = readdir(d)) { if (e->d_name[0] == '.') { continue; } snprintf(fnp, sizeof_fnp, "%s/queue/rotational", e->d_name); int f = open(fn, O_RDONLY); if (f == -1) { continue; } char b[sizeof "4294967295:4294967295\n"]; ssize_t l = read(f, b, sizeof b); ::close(f); if (l != 2 || memcmp("0\n", b, 2)) { continue; } snprintf(fnp, sizeof_fnp, "%s/dev", e->d_name); f = open(fn, O_RDONLY); if (f == -1) { continue; } l = read(f, b, sizeof b); ::close(f); if (l <= 0 || b[l - 1] != '\n') { continue; } b[l - 1] = '\0'; char* end = b; unsigned long dev_major = strtoul(b, &end, 10); if (b == end || *end != ':' || dev_major != unsigned(dev_major)) { continue; } char* c = end + 1; unsigned long dev_minor = strtoul(c, &end, 10); if (c 
== end || *end || dev_minor != unsigned(dev_minor)) { continue; } ssd.push_back(makedev(unsigned(dev_major), unsigned(dev_minor))); } closedir(d); } /* fil_system_t::is_ssd() assumes the following */ ut_ad(makedev(0, 8) == 8); ut_ad(makedev(0, 4) == 4); ut_ad(makedev(0, 2) == 2); ut_ad(makedev(0, 1) == 1); #endif } void fil_system_t::close() { ut_ad(this == &fil_system); ut_a(unflushed_spaces.empty()); ut_a(space_list.empty()); ut_ad(!sys_space); ut_ad(!temp_space); if (is_initialised()) { m_initialised= false; spaces.free(); mysql_mutex_destroy(&mutex); fil_space_crypt_cleanup(); } ut_ad(!spaces.array); #ifdef __linux__ ssd.clear(); ssd.shrink_to_fit(); #endif /* __linux__ */ } void fil_system_t::add_opened_last_to_space_list(fil_space_t *space) { if (UNIV_LIKELY(space_list_last_opened != nullptr)) space_list.insert(++space_list_t::iterator(space_list_last_opened), *space); else space_list.push_front(*space); space_list_last_opened= space; } /** Extend all open data files to the recovered size */ ATTRIBUTE_COLD void fil_system_t::extend_to_recv_size() { ut_ad(is_initialised()); mysql_mutex_lock(&mutex); for (fil_space_t &space : fil_system.space_list) { const uint32_t size= space.recv_size; if (size > space.size) { if (space.is_closing()) continue; space.reacquire(); bool success; while (fil_space_extend_must_retry(&space, UT_LIST_GET_LAST(space.chain), size, &success)) mysql_mutex_lock(&mutex); /* Crash recovery requires the file extension to succeed. */ ut_a(success); space.release(); } } mysql_mutex_unlock(&mutex); } /** Close all tablespace files at shutdown */ void fil_space_t::close_all() { if (!fil_system.is_initialised()) return; /* At shutdown, we should not have any files in this list. 
*/ ut_ad(srv_fast_shutdown == 2 || !srv_was_started || fil_system.named_spaces.empty()); fil_flush_file_spaces(); mysql_mutex_lock(&fil_system.mutex); while (!fil_system.space_list.empty()) { fil_space_t &space= fil_system.space_list.front(); for (fil_node_t *node= UT_LIST_GET_FIRST(space.chain); node != NULL; node= UT_LIST_GET_NEXT(chain, node)) { if (!node->is_open()) { next: continue; } for (ulint count= 10000; count--;) { const auto n= space.set_closing(); if (n & STOPPING) goto next; if (!(n & (PENDING | NEEDS_FSYNC))) { node->close(); goto next; } mysql_mutex_unlock(&fil_system.mutex); std::this_thread::sleep_for(std::chrono::microseconds(100)); mysql_mutex_lock(&fil_system.mutex); if (!node->is_open()) goto next; } ib::error() << "File '" << node->name << "' has " << space.referenced() << " operations"; } fil_system.detach(&space); mysql_mutex_unlock(&fil_system.mutex); fil_space_free_low(&space); mysql_mutex_lock(&fil_system.mutex); } mysql_mutex_unlock(&fil_system.mutex); ut_ad(srv_fast_shutdown == 2 || !srv_was_started || fil_system.named_spaces.empty()); } /*******************************************************************//** Sets the max tablespace id counter if the given number is bigger than the previous value. */ void fil_set_max_space_id_if_bigger(uint32_t max_id) { ut_a(max_id < SRV_SPACE_ID_UPPER_BOUND); mysql_mutex_lock(&fil_system.mutex); if (fil_system.max_assigned_id < max_id) { fil_system.max_assigned_id = max_id; } mysql_mutex_unlock(&fil_system.mutex); } /** Acquire a tablespace reference. @param id tablespace identifier @return tablespace @retval nullptr if the tablespace is missing or inaccessible */ fil_space_t *fil_space_t::get(uint32_t id) { mysql_mutex_lock(&fil_system.mutex); fil_space_t *space= fil_space_get_by_id(id); const uint32_t n= space ? 
space->acquire_low() : 0; if (n & STOPPING) space= nullptr; else if ((n & CLOSING) && !space->prepare_acquired()) space= nullptr; mysql_mutex_unlock(&fil_system.mutex); return space; } /** Write a log record about a file operation. @param type file operation @param first_page_no first page number in the file @param path file path @param new_path new file path for type=FILE_RENAME */ inline void mtr_t::log_file_op(mfile_type_t type, uint32_t space_id, const char *path, const char *new_path) { ut_ad((new_path != nullptr) == (type == FILE_RENAME)); ut_ad(!(byte(type) & 15)); /* fil_name_parse() requires that there be at least one path separator and that the file path end with ".ibd". */ ut_ad(strchr(path, '/')); ut_ad(!strcmp(&path[strlen(path) - strlen(DOT_IBD)], DOT_IBD)); m_modifications= true; if (!is_logged()) return; m_last= nullptr; const size_t len= strlen(path); const size_t new_len= type == FILE_RENAME ? 1 + strlen(new_path) : 0; ut_ad(len > 0); byte *const log_ptr= m_log.open(1 + 3/*length*/ + 5/*space_id*/ + 1/*page_no=0*/); byte *end= log_ptr + 1; end= mlog_encode_varint(end, space_id); *end++= 0; if (UNIV_LIKELY(end + len + new_len >= &log_ptr[16])) { *log_ptr= type; size_t total_len= len + new_len + end - log_ptr - 15; if (total_len >= MIN_3BYTE) total_len+= 2; else if (total_len >= MIN_2BYTE) total_len++; end= mlog_encode_varint(log_ptr + 1, total_len); end= mlog_encode_varint(end, space_id); *end++= 0; } else { *log_ptr= static_cast(type | (end + len + new_len - &log_ptr[1])); ut_ad(*log_ptr & 15); } m_log.close(end); if (type == FILE_RENAME) { ut_ad(strchr(new_path, '/')); m_log.push(reinterpret_cast(path), uint32_t(len + 1)); m_log.push(reinterpret_cast(new_path), uint32_t(new_len - 1)); } else m_log.push(reinterpret_cast(path), uint32_t(len)); } /** Write FILE_MODIFY for a file. 
@param[in] space_id tablespace id @param[in] name tablespace file name @param[in,out] mtr mini-transaction */ static void fil_name_write(uint32_t space_id, const char *name, mtr_t *mtr) { ut_ad(!is_predefined_tablespace(space_id)); mtr->log_file_op(FILE_MODIFY, space_id, name); } fil_space_t *fil_space_t::drop(uint32_t id, pfs_os_file_t *detached_handle) { ut_a(!is_system_tablespace(id)); mysql_mutex_lock(&fil_system.mutex); fil_space_t *space= fil_space_get_by_id(id); if (!space) { mysql_mutex_unlock(&fil_system.mutex); return nullptr; } if (space->pending() & STOPPING) { /* A thread executing DDL and another thread executing purge may be executing fil_delete_tablespace() concurrently for the same tablespace. Wait for the other thread to complete the operation. */ for (ulint count= 0;; count++) { space= fil_space_get_by_id(id); ut_ad(!space || space->is_stopping()); mysql_mutex_unlock(&fil_system.mutex); if (!space) return nullptr; /* Issue a warning every 10.24 seconds, starting after 2.56 seconds */ if ((count & 511) == 128) sql_print_warning("InnoDB: Waiting for tablespace " UINT32PF " to be deleted", id); std::this_thread::sleep_for(std::chrono::milliseconds(20)); mysql_mutex_lock(&fil_system.mutex); } } /* We must be the first one to set either STOPPING flag on the .ibd file, because the flags are only being set here, within a critical section of fil_system.mutex. */ unsigned pending; ut_d(pending=) space->n_pending.fetch_add(STOPPING_READS + 1, std::memory_order_relaxed); ut_ad(!(pending & STOPPING)); mysql_mutex_unlock(&fil_system.mutex); if (space->crypt_data) fil_space_crypt_close_tablespace(space); if (space->purpose == FIL_TYPE_TABLESPACE) { if (id >= srv_undo_space_id_start && id < srv_undo_space_id_start + srv_undo_tablespaces_open) { os_file_delete(innodb_data_file_key, space->chain.start->name); goto deleted; } /* Before deleting the file, persistently write a log record. 
*/ mtr_t mtr; mtr.start(); mtr.log_file_op(FILE_DELETE, id, space->chain.start->name); mtr.commit_file(*space, nullptr); if (FSP_FLAGS_HAS_DATA_DIR(space->flags)) RemoteDatafile::delete_link_file(space->name()); os_file_delete(innodb_data_file_key, space->chain.start->name); } else ut_ad(space->purpose == FIL_TYPE_IMPORT); if (char *cfg_name= fil_make_filepath(space->chain.start->name, fil_space_t::name_type{}, CFG, false)) { os_file_delete_if_exists(innodb_data_file_key, cfg_name, nullptr); ut_free(cfg_name); } deleted: mysql_mutex_lock(&fil_system.mutex); ut_ad(space == fil_space_get_by_id(id)); pending= space->n_pending.fetch_add(STOPPING_WRITES - 1, std::memory_order_relaxed); ut_ad((pending & STOPPING) == STOPPING_READS); ut_ad(pending & PENDING); pending&= PENDING; if (--pending) { for (ulint count= 0;; count++) { ut_ad(space == fil_space_get_by_id(id)); pending= space->n_pending.load(std::memory_order_relaxed) & PENDING; if (!pending) break; mysql_mutex_unlock(&fil_system.mutex); /* Issue a warning every 10.24 seconds, starting after 2.56 seconds */ if ((count & 511) == 128) sql_print_warning("InnoDB: Trying to delete tablespace '%s' " "but there are %u pending operations", space->chain.start->name, pending); std::this_thread::sleep_for(std::chrono::milliseconds(20)); mysql_mutex_lock(&fil_system.mutex); } } pfs_os_file_t handle= fil_system.detach(space, true); mysql_mutex_unlock(&fil_system.mutex); if (detached_handle) *detached_handle = handle; else os_file_close(handle); return space; } /** Close a single-table tablespace on failed IMPORT TABLESPACE. The tablespace must be cached in the memory cache. Free all pages used by the tablespace. */ void fil_close_tablespace(uint32_t id) { ut_ad(!is_system_tablespace(id)); fil_space_t* space = fil_space_t::drop(id, nullptr); if (!space) { return; } space->x_lock(); ut_ad(space->is_stopping()); /* Invalidate in the buffer pool all pages belonging to the tablespace. 
Since space->is_stopping() holds, readahead can no longer read more pages of this tablespace to buf_pool. Thus we can clean the tablespace out of buf_pool completely and permanently. */ while (buf_flush_list_space(space)); space->x_unlock(); log_sys.latch.wr_lock(SRW_LOCK_CALL); if (space->max_lsn != 0) { ut_d(space->max_lsn = 0); fil_system.named_spaces.remove(*space); } log_sys.latch.wr_unlock(); fil_space_free_low(space); } /** Delete a tablespace and associated .ibd file. @param id tablespace identifier @return detached file handle (to be closed by the caller) @return OS_FILE_CLOSED if no file existed */ pfs_os_file_t fil_delete_tablespace(uint32_t id) { ut_ad(!is_system_tablespace(id)); pfs_os_file_t handle= OS_FILE_CLOSED; if (fil_space_t *space= fil_space_t::drop(id, &handle)) fil_space_free_low(space); return handle; } /*******************************************************************//** Allocates and builds a file name from a path, a table or tablespace name and a suffix. The string must be freed by caller with ut_free(). @param[in] path NULL or the directory path or the full path and filename. @param[in] name {} if path is full, or Table/Tablespace name @param[in] ext the file extension to use @param[in] trim_name true if the last name on the path should be trimmed. @return own: file name */ char* fil_make_filepath(const char *path, const fil_space_t::name_type &name, ib_extention ext, bool trim_name) { /* The path may contain the basename of the file, if so we do not need the name. If the path is NULL, we can use the default path, but there needs to be a name. */ ut_ad(path || name.data()); /* If we are going to strip a name off the path, there better be a path and a new name to put back on. 
*/ ut_ad(!trim_name || (path && name.data())); if (path == NULL) { path = fil_path_to_mysql_datadir; } ulint len = 0; /* current length */ ulint path_len = strlen(path); const char* suffix = dot_ext[ext]; ulint suffix_len = strlen(suffix); ulint full_len = path_len + 1 + name.size() + suffix_len + 1; char* full_name = static_cast(ut_malloc_nokey(full_len)); if (full_name == NULL) { return NULL; } /* If the name is a relative or absolute path, do not prepend "./". */ if (path[0] == '.' && (path[1] == '\0' || path[1] == '/' IF_WIN(|| path[1] == '\\',)) && name.size() && (name.data()[0] == '.' || is_absolute_path(name.data()))) { path = NULL; path_len = 0; } if (path != NULL) { memcpy(full_name, path, path_len); len = path_len; } full_name[len] = '\0'; if (trim_name) { /* Find the offset of the last DIR separator and set it to null in order to strip off the old basename from this path. */ char* last_dir_sep = strrchr(full_name, '/'); #ifdef _WIN32 if (char *last = strrchr(full_name, '\\')) { if (last > last_dir_sep) { last_dir_sep = last; } } #endif if (last_dir_sep) { last_dir_sep[0] = '\0'; len = strlen(full_name); } } if (name.size()) { if (len && full_name[len - 1] != '/') { /* Add a DIR separator */ full_name[len] = '/'; full_name[++len] = '\0'; } char* ptr = &full_name[len]; memcpy(ptr, name.data(), name.size()); len += name.size(); full_name[len] = '\0'; } /* Make sure that the specified suffix is at the end of the filepath string provided. This assumes that the suffix starts with '.'. If the first char of the suffix is found in the filepath at the same length as the suffix from the end, then we will assume that there is a previous suffix that needs to be replaced. */ if (suffix != NULL) { /* Need room for the trailing null byte. */ ut_ad(len < full_len); if ((len > suffix_len) && (full_name[len - suffix_len] == suffix[0])) { /* Another suffix exists, make it the one requested. 
*/ memcpy(&full_name[len - suffix_len], suffix, suffix_len); } else { /* No previous suffix, add it. */ ut_ad(len + suffix_len < full_len); memcpy(&full_name[len], suffix, suffix_len); full_name[len + suffix_len] = '\0'; } } return(full_name); } char *fil_make_filepath(const char* path, const table_name_t name, ib_extention suffix, bool strip_name) { return fil_make_filepath(path, {name.m_name, strlen(name.m_name)}, suffix, strip_name); } dberr_t fil_space_t::rename(const char *path, bool log, bool replace) { ut_ad(UT_LIST_GET_LEN(chain) == 1); ut_ad(!is_predefined_tablespace(id)); const char *old_path= chain.start->name; ut_ad(strchr(old_path, '/')); ut_ad(strchr(path, '/')); if (!strcmp(path, old_path)) return DB_SUCCESS; if (!log) { if (!os_file_rename(innodb_data_file_key, old_path, path)) return DB_ERROR; mysql_mutex_lock(&fil_system.mutex); ut_free(chain.start->name); chain.start->name= mem_strdup(path); mysql_mutex_unlock(&fil_system.mutex); return DB_SUCCESS; } bool exists= false; os_file_type_t ftype; /* Check upfront if the rename operation might succeed, because we must durably write redo log before actually attempting to execute the rename in the file system. */ if (os_file_status(old_path, &exists, &ftype) && !exists) { sql_print_error("InnoDB: Cannot rename '%s' to '%s'" " because the source file does not exist.", old_path, path); return DB_TABLESPACE_NOT_FOUND; } exists= false; if (replace); else if (!os_file_status(path, &exists, &ftype) || exists) { sql_print_error("InnoDB: Cannot rename '%s' to '%s'" " because the target file exists.", old_path, path); return DB_TABLESPACE_EXISTS; } mtr_t mtr; mtr.start(); mtr.log_file_op(FILE_RENAME, id, old_path, path); return mtr.commit_file(*this, path) ? DB_SUCCESS : DB_ERROR; } /** Create a tablespace file. @param[in] space_id Tablespace ID @param[in] name Tablespace name in dbname/tablename format. @param[in] path Path and filename of the datafile to create. 
@param[in] flags Tablespace flags @param[in] size Initial size of the tablespace file in pages, must be >= FIL_IBD_FILE_INITIAL_SIZE @param[in] mode MariaDB encryption mode @param[in] key_id MariaDB encryption key_id @param[out] err DB_SUCCESS or error code @return the created tablespace @retval NULL on error */ fil_space_t* fil_ibd_create( uint32_t space_id, const table_name_t name, const char* path, uint32_t flags, uint32_t size, fil_encryption_t mode, uint32_t key_id, dberr_t* err) { pfs_os_file_t file; bool success; mtr_t mtr; bool has_data_dir = FSP_FLAGS_HAS_DATA_DIR(flags) != 0; ut_ad(!is_system_tablespace(space_id)); ut_ad(!srv_read_only_mode); ut_a(space_id < SRV_SPACE_ID_UPPER_BOUND); ut_a(size >= FIL_IBD_FILE_INITIAL_SIZE); ut_a(fil_space_t::is_valid_flags(flags & ~FSP_FLAGS_MEM_MASK, space_id)); /* Create the subdirectories in the path, if they are not there already. */ *err = os_file_create_subdirs_if_needed(path); if (*err != DB_SUCCESS) { return NULL; } mtr.start(); mtr.log_file_op(FILE_CREATE, space_id, path); log_sys.latch.wr_lock(SRW_LOCK_CALL); auto lsn= mtr.commit_files(); log_sys.latch.wr_unlock(); mtr.flag_wr_unlock(); log_write_up_to(lsn, true); ulint type; static_assert(((UNIV_ZIP_SIZE_MIN >> 1) << 3) == 4096, "compatibility"); switch (FSP_FLAGS_GET_ZIP_SSIZE(flags)) { case 1: case 2: type = OS_DATA_FILE_NO_O_DIRECT; break; default: type = OS_DATA_FILE; } file = os_file_create( innodb_data_file_key, path, OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT, OS_FILE_AIO, type, srv_read_only_mode, &success); if (!success) { /* The following call will print an error message */ switch (os_file_get_last_error(true)) { case OS_FILE_ALREADY_EXISTS: ib::info() << "The file '" << path << "'" " already exists though the" " corresponding table did not exist" " in the InnoDB data dictionary." 
" You can resolve the problem by removing" " the file."; *err = DB_TABLESPACE_EXISTS; break; case OS_FILE_DISK_FULL: *err = DB_OUT_OF_FILE_SPACE; break; default: *err = DB_ERROR; } ib::error() << "Cannot create file '" << path << "'"; return NULL; } const bool is_compressed = fil_space_t::is_compressed(flags); #ifdef _WIN32 const bool is_sparse = is_compressed; if (is_compressed) { os_file_set_sparse_win32(file); } #else const bool is_sparse = is_compressed && DB_SUCCESS == os_file_punch_hole(file, 0, 4096) && !my_test_if_thinly_provisioned(file); #endif if (fil_space_t::full_crc32(flags)) { flags |= FSP_FLAGS_FCRC32_PAGE_SSIZE(); } else { flags |= FSP_FLAGS_PAGE_SSIZE(); } /* Create crypt data if the tablespace is either encrypted or user has requested it to remain unencrypted. */ fil_space_crypt_t* crypt_data = (mode != FIL_ENCRYPTION_DEFAULT || srv_encrypt_tables) ? fil_space_create_crypt_data(mode, key_id) : nullptr; if (!os_file_set_size(path, file, os_offset_t(size) << srv_page_size_shift, is_sparse)) { *err = DB_OUT_OF_FILE_SPACE; err_exit: os_file_close(file); os_file_delete(innodb_data_file_key, path); free(crypt_data); return nullptr; } fil_space_t::name_type space_name; if (has_data_dir) { /* Make the ISL file if the IBD file is not in the default location. 
*/ space_name = {name.m_name, strlen(name.m_name)}; *err = RemoteDatafile::create_link_file(space_name, path); if (*err != DB_SUCCESS) { goto err_exit; } } DBUG_EXECUTE_IF("checkpoint_after_file_create", log_make_checkpoint();); mysql_mutex_lock(&fil_system.mutex); if (fil_space_t* space = fil_space_t::create(space_id, flags, FIL_TYPE_TABLESPACE, crypt_data, mode, true)) { fil_node_t* node = space->add(path, file, size, false, true); IF_WIN(node->find_metadata(), node->find_metadata(file, true)); mysql_mutex_unlock(&fil_system.mutex); mtr.start(); mtr.set_named_space(space); ut_a(fsp_header_init(space, size, &mtr) == DB_SUCCESS); mtr.commit(); return space; } else { mysql_mutex_unlock(&fil_system.mutex); } if (space_name.data()) { RemoteDatafile::delete_link_file(space_name); } *err = DB_ERROR; goto err_exit; } /** Try to open a single-table tablespace and optionally check that the space id in it is correct. If this does not succeed, print an error message to the .err log. This function is used to open a tablespace when we start mysqld after the dictionary has been booted, and also in IMPORT TABLESPACE. NOTE that we assume this operation is used either at the database startup or under the protection of dict_sys.latch, so that two users cannot race here. This operation does not leave the file associated with the tablespace open, but closes it after we have looked at the space id in it. If the validate boolean is set, we read the first page of the file and check that the space id in the file is what we expect. We assume that this function runs much faster if no check is made, since accessing the file inode probably is much faster (the OS caches them) than accessing the first page of the file. This boolean may be initially false, but if a remote tablespace is found it will be changed to true. If the fix_dict boolean is set, then it is safe to use an internal SQL statement to update the dictionary tables if they are incorrect. 
@param[in] validate 0=maybe missing, 1=do not validate, 2=validate @param[in] purpose FIL_TYPE_TABLESPACE or FIL_TYPE_TEMPORARY @param[in] id tablespace ID @param[in] flags expected FSP_SPACE_FLAGS @param[in] name table name If file-per-table, it is the table name in the databasename/tablename format @param[in] path_in expected filepath, usually read from dictionary @param[out] err DB_SUCCESS or error code @return tablespace @retval NULL if the tablespace could not be opened */ fil_space_t* fil_ibd_open( unsigned validate, fil_type_t purpose, uint32_t id, uint32_t flags, fil_space_t::name_type name, const char* path_in, dberr_t* err) { mysql_mutex_lock(&fil_system.mutex); fil_space_t* space = fil_space_get_by_id(id); mysql_mutex_unlock(&fil_system.mutex); if (space) { if (validate > 1 && !srv_read_only_mode) { fsp_flags_try_adjust(space, flags & ~FSP_FLAGS_MEM_MASK); } return space; } dberr_t local_err = DB_SUCCESS; /* Table flags can be ULINT_UNDEFINED if dict_tf_to_fsp_flags_failure is set. */ if (flags == UINT32_MAX) { corrupted: local_err = DB_CORRUPTION; func_exit: if (err) *err = local_err; return space; } ut_ad(fil_space_t::is_valid_flags(flags & ~FSP_FLAGS_MEM_MASK, id)); Datafile df_default; /* default location */ RemoteDatafile df_remote; /* remote location */ ulint tablespaces_found = 0; ulint valid_tablespaces_found = 0; df_default.init(flags); df_remote.init(flags); /* Discover the correct file by looking in three possible locations while avoiding unecessary effort. */ /* We will always look for an ibd in the default location. */ df_default.make_filepath(nullptr, name, IBD); /* Look for a filepath embedded in an ISL where the default file would be. 
*/ bool must_validate = df_remote.open_link_file(name); if (must_validate) { if (df_remote.open_read_only(true) == DB_SUCCESS) { ut_ad(df_remote.is_open()); ++tablespaces_found; } else { /* The following call prints an error message */ os_file_get_last_error(true); ib::error() << "A link file was found named '" << df_remote.link_filepath() << "' but the linked tablespace '" << df_remote.filepath() << "' could not be opened read-only."; } } else if (path_in && !df_default.same_filepath_as(path_in)) { /* Dict path is not the default path. Always validate remote files. If default is opened, it was moved. */ must_validate = true; } else if (validate > 1) { must_validate = true; } const bool operation_not_for_export = srv_operation != SRV_OPERATION_RESTORE_EXPORT && srv_operation != SRV_OPERATION_EXPORT_RESTORED; /* Always look for a file at the default location. But don't log an error if the tablespace is already open in remote or dict. */ ut_a(df_default.filepath()); /* Mariabackup will not copy files whose names start with #sql-. We will suppress messages about such files missing on the first server startup. The tables ought to be dropped by drop_garbage_tables_after_restore() a little later. */ const bool strict = validate && !tablespaces_found && operation_not_for_export && !(srv_operation == SRV_OPERATION_NORMAL && srv_start_after_restore && srv_force_recovery < SRV_FORCE_NO_BACKGROUND && dict_table_t::is_temporary_name( df_default.filepath())); if (df_default.open_read_only(strict) == DB_SUCCESS) { ut_ad(df_default.is_open()); ++tablespaces_found; } /* Check if multiple locations point to the same file. */ if (tablespaces_found > 1 && df_default.same_as(df_remote)) { /* A link file was found with the default path in it. Use the default path and delete the link file. */ --tablespaces_found; df_remote.delete_link_file(); df_remote.close(); } /* We have now checked all possible tablespace locations and have a count of how many unique files we found. 
If things are normal, we only found 1. */ /* For encrypted tablespace, we need to check the encryption in header of first page. */ if (!must_validate && tablespaces_found == 1) { goto skip_validate; } /* Read and validate the first page of these three tablespace locations, if found. */ valid_tablespaces_found += (df_remote.validate_to_dd(id, flags) == DB_SUCCESS); valid_tablespaces_found += (df_default.validate_to_dd(id, flags) == DB_SUCCESS); /* Make sense of these three possible locations. First, bail out if no tablespace files were found. */ if (valid_tablespaces_found == 0) { if (!strict && IF_WIN(GetLastError() == ERROR_FILE_NOT_FOUND || GetLastError() == ERROR_PATH_NOT_FOUND, errno == ENOENT)) { /* Suppress a message about a missing file. */ goto corrupted; } os_file_get_last_error(operation_not_for_export, !operation_not_for_export); if (!operation_not_for_export) { goto corrupted; } sql_print_error("InnoDB: Could not find a valid tablespace" " file for %.*s. %s", static_cast(name.size()), name.data(), TROUBLESHOOT_DATADICT_MSG); goto corrupted; } if (!must_validate) { goto skip_validate; } /* Do not open any tablespaces if more than one tablespace with the correct space ID and flags were found. */ if (df_default.is_open() && df_remote.is_open()) { ib::error() << "A tablespace has been found in multiple places: " << df_default.filepath() << "(Space ID=" << df_default.space_id() << ", Flags=" << df_default.flags() << ") and " << df_remote.filepath() << "(Space ID=" << df_remote.space_id() << ", Flags=" << df_remote.flags() << (valid_tablespaces_found > 1 || srv_force_recovery ? "); will not open" : ")"); /* Force-recovery will allow some tablespaces to be skipped by REDO if there was more than one file found. Unlike during the REDO phase of recovery, we now know if the tablespace is valid according to the dictionary, which was not available then. So if we did not force recovery and there is only one good tablespace, ignore any bad tablespaces. 
*/ if (valid_tablespaces_found > 1 || srv_force_recovery > 0) { /* If the file is not open it cannot be valid. */ ut_ad(df_default.is_open() || !df_default.is_valid()); ut_ad(df_remote.is_open() || !df_remote.is_valid()); /* Having established that, this is an easy way to look for corrupted data files. */ if (df_default.is_open() != df_default.is_valid() || df_remote.is_open() != df_remote.is_valid()) { goto corrupted; } error: local_err = DB_ERROR; goto func_exit; } /* There is only one valid tablespace found and we did not use srv_force_recovery during REDO. Use this one tablespace and clean up invalid tablespace pointers */ if (df_default.is_open() && !df_default.is_valid()) { df_default.close(); tablespaces_found--; } if (df_remote.is_open() && !df_remote.is_valid()) { df_remote.close(); tablespaces_found--; } } /* At this point, there should be only one filepath. */ ut_a(tablespaces_found == 1); ut_a(valid_tablespaces_found == 1); skip_validate: const byte* first_page = df_default.is_open() ? df_default.get_first_page() : df_remote.get_first_page(); fil_space_crypt_t* crypt_data = first_page ? fil_space_read_crypt_data(fil_space_t::zip_size(flags), first_page) : NULL; mysql_mutex_lock(&fil_system.mutex); space = fil_space_t::create(id, flags, purpose, crypt_data); if (!space) { mysql_mutex_unlock(&fil_system.mutex); goto error; } /* We do not measure the size of the file, that is why we pass the 0 below */ space->add( df_remote.is_open() ? df_remote.filepath() : df_default.filepath(), OS_FILE_CLOSED, 0, false, true); mysql_mutex_unlock(&fil_system.mutex); if (must_validate && !srv_read_only_mode) { df_remote.close(); df_default.close(); if (space->acquire()) { if (purpose != FIL_TYPE_IMPORT) { fsp_flags_try_adjust(space, flags & ~FSP_FLAGS_MEM_MASK); } space->release(); } } goto func_exit; } /** Discover the correct IBD file to open given a remote or missing filepath from the REDO log. 
Administrators can move a crashed database to another location on the
same machine and try to recover it. Remote IBD files might be moved as
well to the new location.
    The problem with this is that the REDO log contains the old location
which may be still accessible.  During recovery, if files are found in
both locations, we can choose one based on these priorities:
1. Default location
2. ISL location
3. REDO location
@param[in]	space_id	tablespace ID
@param[in]	df		Datafile object with path from redo;
on success it refers to the discovered file
@return	true if a valid datafile was found, false if not */
static
bool
fil_ibd_discover(
	ulint		space_id,
	Datafile&	df)
{
	Datafile	df_def_per;	/* default file-per-table datafile */
	RemoteDatafile	df_rem_per;	/* remote file-per-table datafile */

	/* Look for the datafile in the default location. */
	const char*	filename = df.filepath();
	const char*	basename = base_name(filename);

	/* If this datafile is file-per-table it will have a schema dir. */
	ulint		sep_found = 0;
	const char*	db = basename;
	/* Walk backwards from the basename until two directory
	separators have been passed or the start of the path is reached. */
	for (; db > filename && sep_found < 2; db--) {
		switch (db[0]) {
#ifdef _WIN32
		case '\\':
#endif
		case '/':
			sep_found++;
		}
	}
	if (sep_found == 2) {
		/* The loop stopped one character before the second
		separator; skip over it so that db is the trailing
		"schema/table.ibd" part of the path. */
		db += 2;
		df_def_per.init(0);
		df_def_per.set_filepath(db);
		if (df_def_per.open_read_only(false) == DB_SUCCESS
		    && df_def_per.validate_for_recovery() == DB_SUCCESS
		    && df_def_per.space_id() == space_id) {
			df.set_filepath(df_def_per.filepath());
			df.open_read_only(false);
			return(true);
		}

		/* Look for a remote file-per-table tablespace. */

		switch (srv_operation) {
		case SRV_OPERATION_BACKUP:
		case SRV_OPERATION_RESTORE_DELTA:
		case SRV_OPERATION_BACKUP_NO_DEFER:
			/* Not expected to be reached in these modes. */
			ut_ad(0);
			break;
		case SRV_OPERATION_RESTORE_EXPORT:
		case SRV_OPERATION_RESTORE:
			break;
		case SRV_OPERATION_NORMAL:
		case SRV_OPERATION_EXPORT_RESTORED:
			size_t len= strlen(db);
			/* Only names that end in the IBD suffix and have
			something in front of it are considered. */
			if (len <= 4 || strcmp(db + len - 4, dot_ext[IBD])) {
				break;
			}
			/* The link file is looked up by the name without
			the 4-character suffix. */
			df_rem_per.open_link_file({db, len - 4});

			if (!df_rem_per.filepath()) {
				break;
			}

			/* An ISL file was found with contents. */
			if (df_rem_per.open_read_only(false) != DB_SUCCESS
				|| df_rem_per.validate_for_recovery()
				   != DB_SUCCESS) {

				/* Assume that this ISL file is intended to
				be used. Do not continue looking for another
				if this file cannot be opened or is not
				a valid IBD file. */
				ib::error() << "ISL file '"
					<< df_rem_per.link_filepath()
					<< "' was found but the linked file '"
					<< df_rem_per.filepath()
					<< "' could not be opened or is"
					" not correct.";
				return(false);
			}

			/* Use this file if it has the space_id from the
			FILE_ record. */
			if (df_rem_per.space_id() == space_id) {
				df.set_filepath(df_rem_per.filepath());
				df.open_read_only(false);
				return(true);
			}

			/* Since old MLOG records can use the same basename
			in multiple CREATE/DROP TABLE sequences, this ISL
			file could be pointing to a later version of this
			basename.ibd file which has a different space_id.
			Keep looking. */
		}
	}

	/* No ISL files were found in the default location. Use the location
	given in the redo log. */
	if (df.open_read_only(false) == DB_SUCCESS
	    && df.validate_for_recovery() == DB_SUCCESS
	    && df.space_id() == space_id) {
		return(true);
	}

	/* A datafile was not discovered for the filename given. */
	return(false);
}

/** Check that the encryption key of a tablespace is available.
If it is not, report an error and destroy and free crypt_data, so that
the caller must not use the object afterwards.
@param crypt_data	encryption metadata; must not be nullptr
@param f_name		file name to mention in the error message
@return whether the encryption key was found */
bool fil_crypt_check(fil_space_crypt_t *crypt_data, const char *f_name)
{
  if (crypt_data->is_key_found())
    return true;
  sql_print_error("InnoDB: Encryption key is not found for %s", f_name);
  crypt_data->~fil_space_crypt_t();
  ut_free(crypt_data);
  return false;
}

/** Open an ibd tablespace and add it to the InnoDB data structures.
This is similar to fil_ibd_open() except that it is used while processing
the REDO log, so the data dictionary is not available and very little
validation is done. The tablespace name is extracted from the
dbname/tablename.ibd portion of the filename, which assumes that the file
is a file-per-table tablespace.  Any name will do for now.  General
tablespace names will be read from the dictionary after it has been
recovered.
The tablespace flags are read at this time from the first page of the file in validate_for_recovery(). @param[in] space_id tablespace ID @param[in] filename path/to/databasename/tablename.ibd @param[out] space the tablespace, or NULL on error @return status of the operation */ enum fil_load_status fil_ibd_load(uint32_t space_id, const char *filename, fil_space_t *&space) { /* If the a space is already in the file system cache with this space ID, then there is nothing to do. */ mysql_mutex_lock(&fil_system.mutex); space = fil_space_get_by_id(space_id); mysql_mutex_unlock(&fil_system.mutex); if (space) { /* Compare the filename we are trying to open with the filename from the first node of the tablespace we opened previously. Fail if it is different. */ fil_node_t* node = UT_LIST_GET_FIRST(space->chain); if (0 != strcmp(innobase_basename(filename), innobase_basename(node->name))) { ib::info() << "Ignoring data file '" << filename << "' with space ID " << space->id << ". Another data file called " << node->name << " exists with the same space ID."; space = NULL; return(FIL_LOAD_ID_CHANGED); } return(FIL_LOAD_OK); } if (srv_operation == SRV_OPERATION_RESTORE) { /* Replace absolute DATA DIRECTORY file paths with short names relative to the backup directory. */ const char* name = strrchr(filename, '/'); #ifdef _WIN32 if (const char *last = strrchr(filename, '\\')) { if (last > name) { name = last; } } #endif if (name) { while (--name > filename #ifdef _WIN32 && *name != '\\' #endif && *name != '/'); if (name > filename) { filename = name + 1; } } } Datafile file; file.set_filepath(filename); file.open_read_only(false); if (!file.is_open()) { /* The file has been moved or it is a remote datafile. */ if (!fil_ibd_discover(space_id, file) || !file.is_open()) { return(FIL_LOAD_NOT_FOUND); } } os_offset_t size; bool deferred_space = false; /* Read and validate the first page of the tablespace. Assign a tablespace name based on the tablespace type. 
*/ switch (file.validate_for_recovery()) { os_offset_t minimum_size; case DB_SUCCESS: deferred_space = file.m_defer; if (deferred_space) { goto tablespace_check; } if (file.space_id() != space_id) { return(FIL_LOAD_ID_CHANGED); } tablespace_check: /* Get and test the file size. */ size = os_file_get_size(file.handle()); /* Every .ibd file is created >= 4 pages in size. Smaller files cannot be OK. */ minimum_size = os_offset_t(FIL_IBD_FILE_INITIAL_SIZE) << srv_page_size_shift; if (size == static_cast(-1)) { /* The following call prints an error message */ os_file_get_last_error(true); ib::error() << "Could not measure the size of" " single-table tablespace file '" << file.filepath() << "'"; } else if (deferred_space) { return FIL_LOAD_DEFER; } else if (size < minimum_size) { ib::error() << "The size of tablespace file '" << file.filepath() << "' is only " << size << ", should be at least " << minimum_size << "!"; } else { /* Everything is fine so far. */ break; } /* fall through */ case DB_TABLESPACE_EXISTS: return(FIL_LOAD_INVALID); default: return(FIL_LOAD_NOT_FOUND); } ut_ad(space == NULL); /* Adjust the memory-based flags that would normally be set by dict_tf_to_fsp_flags(). In recovery, we have no data dictionary. */ uint32_t flags = file.flags(); if (fil_space_t::is_compressed(flags)) { flags |= page_zip_level << FSP_FLAGS_MEM_COMPRESSION_LEVEL; } const byte* first_page = file.get_first_page(); fil_space_crypt_t* crypt_data = first_page ? 
fil_space_read_crypt_data(fil_space_t::zip_size(flags), first_page) : NULL; if (crypt_data && !fil_crypt_check(crypt_data, filename)) { return FIL_LOAD_INVALID; } mysql_mutex_lock(&fil_system.mutex); space = fil_space_t::create( space_id, flags, FIL_TYPE_TABLESPACE, crypt_data); if (space == NULL) { mysql_mutex_unlock(&fil_system.mutex); return(FIL_LOAD_INVALID); } ut_ad(space->id == file.space_id()); ut_ad(space->id == space_id); /* We do not use the size information we have about the file, because the rounding formula for extents and pages is somewhat complex; we let fil_node_open() do that task. */ space->add(file.filepath(), OS_FILE_CLOSED, 0, false, false); mysql_mutex_unlock(&fil_system.mutex); return(FIL_LOAD_OK); } /** Try to adjust FSP_SPACE_FLAGS if they differ from the expectations. (Typically when upgrading from MariaDB 10.1.0..10.1.20.) @param[in,out] space tablespace @param[in] flags desired tablespace flags */ void fsp_flags_try_adjust(fil_space_t *space, uint32_t flags) { ut_ad(!srv_read_only_mode); ut_ad(fil_space_t::is_valid_flags(flags, space->id)); if (space->full_crc32() || fil_space_t::full_crc32(flags)) { return; } if (!space->size && (space->purpose != FIL_TYPE_TABLESPACE || !space->get_size())) { return; } /* This code is executed during server startup while no connections are allowed. We do not need to protect against DROP TABLE by fil_space_acquire(). */ mtr_t mtr; mtr.start(); if (buf_block_t* b = buf_page_get( page_id_t(space->id, 0), space->zip_size(), RW_X_LATCH, &mtr)) { uint32_t f = fsp_header_get_flags(b->page.frame); if (fil_space_t::full_crc32(f)) { goto func_exit; } if (fil_space_t::is_flags_equal(f, flags)) { goto func_exit; } /* Suppress the message if only the DATA_DIR flag to differs. 
*/ if ((f ^ flags) & ~(1U << FSP_FLAGS_POS_RESERVED)) { ib::warn() << "adjusting FSP_SPACE_FLAGS of file '" << UT_LIST_GET_FIRST(space->chain)->name << "' from " << ib::hex(f) << " to " << ib::hex(flags); } mtr.set_named_space(space); mtr.write<4,mtr_t::FORCED>(*b, FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + b->page.frame, flags); } func_exit: mtr.commit(); } /** Determine if a matching tablespace exists in the InnoDB tablespace memory cache. Note that if we have not done a crash recovery at the database startup, there may be many tablespaces which are not yet in the memory cache. @param[in] id Tablespace ID @param[in] table_flags table flags @return the tablespace @retval NULL if no matching tablespace exists in the memory cache */ fil_space_t *fil_space_for_table_exists_in_mem(uint32_t id, uint32_t table_flags) { const uint32_t expected_flags = dict_tf_to_fsp_flags(table_flags); mysql_mutex_lock(&fil_system.mutex); if (fil_space_t* space = fil_space_get_by_id(id)) { uint32_t tf = expected_flags & ~FSP_FLAGS_MEM_MASK; uint32_t sf = space->flags & ~FSP_FLAGS_MEM_MASK; if (!fil_space_t::is_flags_equal(tf, sf) && !fil_space_t::is_flags_equal(sf, tf)) { goto func_exit; } /* Adjust the flags that are in FSP_FLAGS_MEM_MASK. FSP_SPACE_FLAGS will not be written back here. */ space->flags = (space->flags & ~FSP_FLAGS_MEM_MASK) | (expected_flags & FSP_FLAGS_MEM_MASK); mysql_mutex_unlock(&fil_system.mutex); if (!srv_read_only_mode) { fsp_flags_try_adjust(space, expected_flags & ~FSP_FLAGS_MEM_MASK); } return space; } func_exit: mysql_mutex_unlock(&fil_system.mutex); return NULL; } /*============================ FILE I/O ================================*/ /** Report information about an invalid page access. */ ATTRIBUTE_COLD static void fil_invalid_page_access_msg(const char *name, os_offset_t offset, ulint len, bool is_read) { sql_print_error("%s %zu bytes at " UINT64PF " outside the bounds of the file: %s", is_read ? 
"InnoDB: Trying to read" : "[FATAL] InnoDB: Trying to write", len, offset, name); if (!is_read) abort(); } /** Update the data structures on write completion */ inline void fil_node_t::complete_write() { mysql_mutex_assert_not_owner(&fil_system.mutex); if (space->purpose != FIL_TYPE_TEMPORARY && srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC && space->set_needs_flush()) { mysql_mutex_lock(&fil_system.mutex); if (!space->is_in_unflushed_spaces) { space->is_in_unflushed_spaces= true; fil_system.unflushed_spaces.push_front(*space); } mysql_mutex_unlock(&fil_system.mutex); } } /** Read or write data. @param type I/O context @param offset offset in bytes @param len number of bytes @param buf the data to be read or written @param bpage buffer block (for type.is_async() completion callback) @return status and file descriptor */ fil_io_t fil_space_t::io(const IORequest &type, os_offset_t offset, size_t len, void *buf, buf_page_t *bpage) { ut_ad(referenced()); ut_ad(offset % UNIV_ZIP_SIZE_MIN == 0); ut_ad(len % 512 == 0); /* page_compressed */ ut_ad(fil_validate_skip()); ut_ad(type.is_read() || type.is_write()); ut_ad(type.type != IORequest::DBLWR_BATCH); if (type.is_read()) { srv_stats.data_read.add(len); } else { ut_ad(!srv_read_only_mode || this == fil_system.temp_space); srv_stats.data_written.add(len); } fil_node_t* node= UT_LIST_GET_FIRST(chain); ut_ad(node); ulint p = static_cast(offset >> srv_page_size_shift); dberr_t err; if (type.type == IORequest::READ_ASYNC && is_stopping()) { err = DB_TABLESPACE_DELETED; node = nullptr; goto release; } DBUG_EXECUTE_IF("intermittent_recovery_failure", if (type.is_read() && !(~get_rnd_value() & 0x3ff0)) goto io_error;); DBUG_EXECUTE_IF("intermittent_read_failure", if (srv_was_started && type.is_read() && !(~get_rnd_value() & 0x3ff0)) goto io_error;); if (UNIV_LIKELY_NULL(UT_LIST_GET_NEXT(chain, node))) { ut_ad(this == fil_system.sys_space || this == fil_system.temp_space); ut_ad(!(offset & ((1 << srv_page_size_shift) - 1))); 
while (node->size <= p) { p -= node->size; node = UT_LIST_GET_NEXT(chain, node); if (!node) { fail: if (type.type != IORequest::READ_ASYNC) { fil_invalid_page_access_msg( node->name, offset, len, type.is_read()); } #ifndef DBUG_OFF io_error: #endif set_corrupted(); err = DB_CORRUPTION; node = nullptr; goto release; } } offset = os_offset_t{p} << srv_page_size_shift; } if (UNIV_UNLIKELY(node->size <= p)) { goto fail; } if (type.type == IORequest::PUNCH_RANGE) { err = os_file_punch_hole(node->handle, offset, len); /* Punch hole is not supported, make space not to support punch hole */ if (UNIV_UNLIKELY(err == DB_IO_NO_PUNCH_HOLE)) { node->punch_hole = false; err = DB_SUCCESS; } goto release_sync_write; } else { /* Queue the aio request */ err = os_aio(IORequest{bpage, type.slot, node, type.type}, buf, offset, len); } if (!type.is_async()) { if (type.is_write()) { release_sync_write: node->complete_write(); release: release(); goto func_exit; } ut_ad(fil_validate_skip()); } if (err != DB_SUCCESS) { goto release; } func_exit: return {err, node}; } #include void IORequest::write_complete(int io_error) const { ut_ad(fil_validate_skip()); ut_ad(node); ut_ad(is_write()); node->complete_write(); if (!bpage) { ut_ad(!srv_read_only_mode); if (type == IORequest::DBLWR_BATCH) buf_dblwr.flush_buffered_writes_completed(*this); else ut_ad(type == IORequest::WRITE_ASYNC); } else buf_page_write_complete(*this, io_error); node->space->release(); } void IORequest::read_complete(int io_error) const { ut_ad(fil_validate_skip()); ut_ad(node); ut_ad(is_read()); ut_ad(bpage); /* IMPORTANT: since i/o handling for reads will read also the insert buffer in fil_system.sys_space, we have to be very careful not to introduce deadlocks. We never close fil_system.sys_space data files and never issue asynchronous reads of change buffer pages. 
*/
  /* Copy the page identifier up front for use in the messages below. */
  const page_id_t id(bpage->id());

  if (UNIV_UNLIKELY(io_error != 0))
  {
    sql_print_error("InnoDB: Read error %d of page " UINT32PF " in file %s",
                    io_error, id.page_no(), node->name);
    buf_pool.corrupted_evict(bpage, buf_page_t::READ_FIX);
  corrupted:
    /* Also reached via goto when bpage->read_complete() fails.
    During recovery, unless srv_force_recovery is set, the failure
    is recorded in recv_sys. */
    if (recv_recovery_is_on() && !srv_force_recovery)
    {
      mysql_mutex_lock(&recv_sys.mutex);
      recv_sys.set_corrupt_fs();
      mysql_mutex_unlock(&recv_sys.mutex);
    }
  }
  else if (dberr_t err= bpage->read_complete(*node))
  {
    /* DB_FAIL is the only failure that is not logged here. */
    if (err != DB_FAIL)
      ib::error() << "Failed to read page " << id.page_no()
                  << " from file '" << node->name << "': " << err;
    goto corrupted;
  }

  /* Drop the tablespace reference in every outcome. */
  node->space->release();
}

/** Flush to disk the writes of all tablespaces listed in
fil_system.unflushed_spaces, which may still be cached by the OS.
Nothing is flushed when the flush method is SRV_O_DIRECT_NO_FSYNC. */
void fil_flush_file_spaces()
{
  if (srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)
  {
    /* Debug builds only: the list is expected to be empty. */
    ut_d(mysql_mutex_lock(&fil_system.mutex));
    ut_ad(fil_system.unflushed_spaces.empty());
    ut_d(mysql_mutex_unlock(&fil_system.mutex));
    return;
  }

rescan:
  mysql_mutex_lock(&fil_system.mutex);

  for (fil_space_t &space : fil_system.unflushed_spaces)
  {
    if (space.needs_flush_not_stopping())
    {
      /* Take a reference so that the mutex can be released while
      flushing, then restart the scan from the beginning because
      the list is no longer protected during the flush. */
      space.reacquire();
      mysql_mutex_unlock(&fil_system.mutex);
      space.flush_low();
      space.release();
      goto rescan;
    }
  }

  mysql_mutex_unlock(&fil_system.mutex);
}

/** Functor to validate the file node list of a tablespace. */
struct	Check {
  /** Total size of file nodes visited so far */
  ulint	size;
  /** Total number of open files visited so far */
  ulint	n_open;

  /** Constructor */
  Check() : size(0), n_open(0) {}

  /** Visit a file node: accumulate its size and whether it is open.
  @param[in]	elem	file node to visit */
  void operator()(const fil_node_t* elem)
  {
    n_open += elem->is_open();
    size += elem->size;
  }

  /** Validate a tablespace.
  The caller must hold fil_system.mutex.
  @param[in]	space	tablespace to validate
  @return number of open file nodes */
  static ulint validate(const fil_space_t* space)
  {
    mysql_mutex_assert_owner(&fil_system.mutex);
    Check	check;
    /* Presumably applies 'check' to every node of the chain;
    the totals are read back from it below. */
    ut_list_validate(space->chain, check);
    /* The cached tablespace size must equal the sum of its files. */
    ut_a(space->size == check.size);

    switch (space->id) {
    case TRX_SYS_SPACE:
      ut_ad(fil_system.sys_space == NULL
            || fil_system.sys_space == space);
      break;
    case SRV_TMP_SPACE_ID:
      ut_ad(fil_system.temp_space == NULL
            || fil_system.temp_space == space);
      break;
    default:
      break;
    }

    return(check.n_open);
  }
};

/******************************************************************//**
Checks the consistency of the tablespace cache: validates every tablespace
in fil_system.space_list and compares the number of open files found with
fil_system.n_open.
@return true if ok (an inconsistency fails an assertion instead) */
bool fil_validate()
{
  ulint		n_open		= 0;

  mysql_mutex_lock(&fil_system.mutex);

  for (fil_space_t &space : fil_system.space_list) {
    n_open += Check::validate(&space);
  }

  ut_a(fil_system.n_open == n_open);

  mysql_mutex_unlock(&fil_system.mutex);

  return(true);
}

/*********************************************************************//**
Sets the file page type: writes the type as a 2-byte value at
FIL_PAGE_TYPE in the page. */
void
fil_page_set_type(
/*==============*/
	byte*	page,	/*!< in/out: file page */
	ulint	type)	/*!< in: type */
{
  ut_ad(page);

  mach_write_to_2(page + FIL_PAGE_TYPE, type);
}

/********************************************************************//**
Delete the tablespace file and any related files like .cfg.
This should not be called for temporary tables.
@param[in] ibd_filepath File path of the IBD tablespace */
void fil_delete_file(const char *ibd_filepath)
{
  ib::info() << "Deleting " << ibd_filepath;
  os_file_delete_if_exists(innodb_data_file_key, ibd_filepath, nullptr);

  /* The .cfg path is allocated by fil_make_filepath() and must be
  released with ut_free() once the file has been removed. */
  if (char *cfg_filepath= fil_make_filepath(ibd_filepath,
                                            fil_space_t::name_type{}, CFG,
                                            false))
  {
    os_file_delete_if_exists(innodb_data_file_key, cfg_filepath, nullptr);
    ut_free(cfg_filepath);
  }
}

#ifdef UNIV_DEBUG
/** Check that a tablespace is valid for mtr_commit().
@param[in] space persistent tablespace that has been changed */
static
void
fil_space_validate_for_mtr_commit(
	const fil_space_t*	space)
{
  mysql_mutex_assert_not_owner(&fil_system.mutex);
  ut_ad(space != NULL);
  ut_ad(space->purpose == FIL_TYPE_TABLESPACE);
  ut_ad(!is_predefined_tablespace(space->id));

  /* We are serving mtr_commit(). While there is an active
  mini-transaction, the tablespace must not be stopping unless it is
  being truncated or is still referenced. This is guaranteed by
  meta-data locks or transactional locks. */
  ut_ad(!space->is_stopping()
        || space->is_being_truncated /* fil_truncate_prepare() */
        || space->referenced());
}
#endif /* UNIV_DEBUG */

/** Note that a non-predefined persistent tablespace has been modified
by redo log: append it to fil_system.named_spaces and stamp it with the
current LSN. Only to be called while recovery is on and while the
tablespace is not yet marked (max_lsn == 0).
@param[in,out]	space	tablespace */
void
fil_names_dirty(
	fil_space_t*	space)
{
#ifndef SUX_LOCK_GENERIC
  ut_ad(log_sys.latch.is_write_locked());
#endif
  ut_ad(recv_recovery_is_on());
  ut_ad(log_sys.get_lsn() != 0);
  ut_ad(space->max_lsn == 0);
  ut_d(fil_space_validate_for_mtr_commit(space));

  fil_system.named_spaces.push_back(*space);
  /* A nonzero max_lsn marks the tablespace as being in the list. */
  space->max_lsn = log_sys.get_lsn();
}

/** Write a FILE_MODIFY record when a non-predefined persistent
tablespace was modified for the first time since fil_names_clear().
The tablespace m_user_space is stamped with the current LSN, appended to
fil_system.named_spaces, and its file name is logged in a separate
mini-transaction. */
ATTRIBUTE_NOINLINE ATTRIBUTE_COLD void mtr_t::name_write()
{
#ifndef SUX_LOCK_GENERIC
  ut_ad(log_sys.latch.is_write_locked());
#endif
  ut_d(fil_space_validate_for_mtr_commit(m_user_space));
  ut_ad(!m_user_space->max_lsn);
  m_user_space->max_lsn= log_sys.get_lsn();

  fil_system.named_spaces.push_back(*m_user_space);
  /* Only single-file tablespaces are expected here. */
  ut_ad(UT_LIST_GET_LEN(m_user_space->chain) == 1);

  /* The name record goes into its own local mini-transaction. */
  mtr_t mtr;
  mtr.start();
  fil_name_write(m_user_space->id,
                 UT_LIST_GET_FIRST(m_user_space->chain)->name,
                 &mtr);
  mtr.commit_files();
}

/** On a log checkpoint, reset fil_names_dirty_and_write() flags
and write out FILE_MODIFY if needed, and write FILE_CHECKPOINT.
@param lsn checkpoint LSN @return current LSN */ lsn_t fil_names_clear(lsn_t lsn) { mtr_t mtr; #ifndef SUX_LOCK_GENERIC ut_ad(log_sys.latch.is_write_locked()); #endif ut_ad(lsn); ut_ad(log_sys.is_latest()); mtr.start(); for (auto it = fil_system.named_spaces.begin(); it != fil_system.named_spaces.end(); ) { if (mtr.get_log_size() + strlen(it->chain.start->name) >= recv_sys.MTR_SIZE_MAX - (3 + 5)) { /* Prevent log parse buffer overflow */ mtr.commit_files(); mtr.start(); } auto next = std::next(it); ut_ad(it->max_lsn > 0); if (it->max_lsn < lsn) { /* The tablespace was last dirtied before the checkpoint LSN. Remove it from the list, so that if the tablespace is not going to be modified any more, subsequent checkpoints will avoid calling fil_names_write() on it. */ it->max_lsn = 0; fil_system.named_spaces.erase(it); } /* max_lsn is the last LSN where fil_names_dirty_and_write() was called. If we kept track of "min_lsn" (the first LSN where max_lsn turned nonzero), we could avoid the fil_names_write() call if min_lsn > lsn. 
*/ ut_ad(UT_LIST_GET_LEN((*it).chain) == 1); fil_name_write((*it).id, UT_LIST_GET_FIRST((*it).chain)->name, &mtr); it = next; } return mtr.commit_files(lsn); } /* Unit Tests */ #ifdef UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH #define MF fil_make_filepath #define DISPLAY ib::info() << path void test_make_filepath() { char* path; const char* long_path = "this/is/a/very/long/path/including/a/very/" "looooooooooooooooooooooooooooooooooooooooooooooooo" "oooooooooooooooooooooooooooooooooooooooooooooooooo" "oooooooooooooooooooooooooooooooooooooooooooooooooo" "oooooooooooooooooooooooooooooooooooooooooooooooooo" "oooooooooooooooooooooooooooooooooooooooooooooooooo" "oooooooooooooooooooooooooooooooooooooooooooooooooo" "oooooooooooooooooooooooooooooooooooooooooooooooooo" "oooooooooooooooooooooooooooooooooooooooooooooooooo" "oooooooooooooooooooooooooooooooooooooooooooooooooo" "oooooooooooooooooooooooooooooooooooooooooooooooong" "/folder/name"; path = MF("/this/is/a/path/with/a/filename", NULL, IBD, false); DISPLAY; path = MF("/this/is/a/path/with/a/filename", NULL, ISL, false); DISPLAY; path = MF("/this/is/a/path/with/a/filename", NULL, CFG, false); DISPLAY; path = MF("/this/is/a/path/with/a/filename.ibd", NULL, IBD, false); DISPLAY; path = MF("/this/is/a/path/with/a/filename.ibd", NULL, IBD, false); DISPLAY; path = MF("/this/is/a/path/with/a/filename.dat", NULL, IBD, false); DISPLAY; path = MF(NULL, "tablespacename", NO_EXT, false); DISPLAY; path = MF(NULL, "tablespacename", IBD, false); DISPLAY; path = MF(NULL, "dbname/tablespacename", NO_EXT, false); DISPLAY; path = MF(NULL, "dbname/tablespacename", IBD, false); DISPLAY; path = MF(NULL, "dbname/tablespacename", ISL, false); DISPLAY; path = MF(NULL, "dbname/tablespacename", CFG, false); DISPLAY; path = MF(NULL, "dbname\\tablespacename", NO_EXT, false); DISPLAY; path = MF(NULL, "dbname\\tablespacename", IBD, false); DISPLAY; path = MF("/this/is/a/path", "dbname/tablespacename", IBD, false); DISPLAY; path = MF("/this/is/a/path", 
"dbname/tablespacename", IBD, true); DISPLAY; path = MF("./this/is/a/path", "dbname/tablespacename.ibd", IBD, true); DISPLAY; path = MF("this\\is\\a\\path", "dbname/tablespacename", IBD, true); DISPLAY; path = MF("/this/is/a/path", "dbname\\tablespacename", IBD, true); DISPLAY; path = MF(long_path, NULL, IBD, false); DISPLAY; path = MF(long_path, "tablespacename", IBD, false); DISPLAY; path = MF(long_path, "tablespacename", IBD, true); DISPLAY; } #endif /* UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH */ /* @} */ /** Determine the block size of the data file. @param[in] space tablespace @param[in] offset page number @return block size */ ulint fil_space_get_block_size(const fil_space_t *space, unsigned offset) { ulint block_size = 512; for (fil_node_t* node = UT_LIST_GET_FIRST(space->chain); node != NULL; node = UT_LIST_GET_NEXT(chain, node)) { block_size = node->block_size; if (node->size > offset) { ut_ad(node->size <= 0xFFFFFFFFU); break; } offset -= static_cast(node->size); } /* Currently supporting block size up to 4K, fall back to default if bigger requested. 
*/ if (block_size > 4096) { block_size = 512; } return block_size; } /** @return the tablespace name (databasename/tablename) */ fil_space_t::name_type fil_space_t::name() const { switch (id) { case 0: return name_type{"innodb_system", 13}; case SRV_TMP_SPACE_ID: return name_type{"innodb_temporary", 16}; } if (!UT_LIST_GET_FIRST(chain) || srv_is_undo_tablespace(id)) return name_type{}; ut_ad(purpose != FIL_TYPE_TEMPORARY); ut_ad(UT_LIST_GET_LEN(chain) == 1); const char *path= UT_LIST_GET_FIRST(chain)->name; const char *sep= strchr(path, '/'); ut_ad(sep); while (const char *next_sep= strchr(sep + 1, '/')) path= sep + 1, sep= next_sep; #ifdef _WIN32 if (const char *last_sep= strchr(path, '\\')) if (last_sep < sep) path= last_sep; #endif size_t len= strlen(path); ut_ad(len > 4); len-= 4; ut_ad(!strcmp(&path[len], DOT_IBD)); return name_type{path, len}; } #ifdef UNIV_DEBUG fil_space_t *fil_space_t::next_in_space_list() { space_list_t::iterator it(this); auto end= fil_system.space_list.end(); if (it == end) return nullptr; ++it; return it == end ? nullptr : &*it; } fil_space_t *fil_space_t::prev_in_space_list() { space_list_t::iterator it(this); if (it == fil_system.space_list.begin()) return nullptr; --it; return &*it; } fil_space_t *fil_space_t::next_in_unflushed_spaces() { sized_ilist::iterator it(this); auto end= fil_system.unflushed_spaces.end(); if (it == end) return nullptr; ++it; return it == end ? nullptr : &*it; } fil_space_t *fil_space_t::prev_in_unflushed_spaces() { sized_ilist::iterator it(this); if (it == fil_system.unflushed_spaces.begin()) return nullptr; --it; return &*it; } #endif