diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:00:34 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:00:34 +0000 |
commit | 3f619478f796eddbba6e39502fe941b285dd97b1 (patch) | |
tree | e2c7b5777f728320e5b5542b6213fd3591ba51e2 /storage/innobase/fsp | |
parent | Initial commit. (diff) | |
download | mariadb-upstream.tar.xz mariadb-upstream.zip |
Adding upstream version 1:10.11.6.upstream/1%10.11.6upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'storage/innobase/fsp')
-rw-r--r-- | storage/innobase/fsp/fsp0file.cc | 936 | ||||
-rw-r--r-- | storage/innobase/fsp/fsp0fsp.cc | 3070 | ||||
-rw-r--r-- | storage/innobase/fsp/fsp0space.cc | 224 | ||||
-rw-r--r-- | storage/innobase/fsp/fsp0sysspace.cc | 1019 |
4 files changed, 5249 insertions, 0 deletions
diff --git a/storage/innobase/fsp/fsp0file.cc b/storage/innobase/fsp/fsp0file.cc new file mode 100644 index 00000000..cafff419 --- /dev/null +++ b/storage/innobase/fsp/fsp0file.cc @@ -0,0 +1,936 @@ +/***************************************************************************** + +Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file fsp/fsp0file.cc +Tablespace data file implementation + +Created 2013-7-26 by Kevin Lewis +*******************************************************/ + +#include "fil0fil.h" +#include "fsp0types.h" +#include "os0file.h" +#include "page0page.h" +#include "srv0start.h" +#include "log.h" + +/** Release the resources. */ +void +Datafile::shutdown() +{ + close(); + + free_filepath(); + free_first_page(); +} + +/** Create/open a data file. +@param[in] read_only_mode if true, then readonly mode checks are enforced. 
+@return DB_SUCCESS or error code */ +dberr_t +Datafile::open_or_create(bool read_only_mode) +{ + bool success; + ut_a(m_filepath != NULL); + ut_ad(m_handle == OS_FILE_CLOSED); + + m_handle = os_file_create( + innodb_data_file_key, m_filepath, m_open_flags, + OS_FILE_NORMAL, OS_DATA_FILE, read_only_mode, &success); + + if (!success) { + m_last_os_error = os_file_get_last_error(true); + ib::error() << "Cannot open datafile '" << m_filepath << "'"; + return(DB_CANNOT_OPEN_FILE); + } + + return(DB_SUCCESS); +} + +/** Open a data file in read-only mode to check if it exists so that it +can be validated. +@param[in] strict whether to issue error messages +@return DB_SUCCESS or error code */ +dberr_t +Datafile::open_read_only(bool strict) +{ + bool success = false; + ut_ad(m_handle == OS_FILE_CLOSED); + + /* This function can be called for file objects that do not need + to be opened, which is the case when the m_filepath is NULL */ + if (m_filepath == NULL) { + return(DB_ERROR); + } + + set_open_flags(OS_FILE_OPEN); + m_handle = os_file_create_simple_no_error_handling( + innodb_data_file_key, m_filepath, m_open_flags, + OS_FILE_READ_ONLY, true, &success); + + if (success) { + m_exists = true; + init_file_info(); + + return(DB_SUCCESS); + } + + if (strict) { + m_last_os_error = os_file_get_last_error(true); + ib::error() << "Cannot open datafile for read-only: '" + << m_filepath << "' OS error: " << m_last_os_error; + } + + return(DB_CANNOT_OPEN_FILE); +} + +/** Open a data file in read-write mode during start-up so that +doublewrite pages can be restored and then it can be validated.* +@return DB_SUCCESS or error code */ +inline dberr_t Datafile::open_read_write() +{ + bool success = false; + ut_ad(m_handle == OS_FILE_CLOSED); + ut_ad(!srv_read_only_mode); + + /* This function can be called for file objects that do not need + to be opened, which is the case when the m_filepath is NULL */ + if (m_filepath == NULL) { + return(DB_ERROR); + } + + 
set_open_flags(OS_FILE_OPEN); + m_handle = os_file_create_simple_no_error_handling( + innodb_data_file_key, m_filepath, m_open_flags, + OS_FILE_READ_WRITE, false, &success); + + if (!success) { + m_last_os_error = os_file_get_last_error(true); + ib::error() << "Cannot open datafile for read-write: '" + << m_filepath << "'"; + return(DB_CANNOT_OPEN_FILE); + } + + m_exists = true; + + init_file_info(); + + return(DB_SUCCESS); +} + +/** Initialize OS specific file info. */ +void +Datafile::init_file_info() +{ +#ifdef _WIN32 + GetFileInformationByHandle((os_file_t)m_handle, &m_file_info); +#else + fstat(m_handle, &m_file_info); +#endif /* WIN32 */ +} + +/** Close a data file. +@return DB_SUCCESS or error code */ +dberr_t +Datafile::close() +{ + if (m_handle != OS_FILE_CLOSED) { + ibool success = os_file_close(m_handle); + ut_a(success); + + m_handle = OS_FILE_CLOSED; + } + + return(DB_SUCCESS); +} + +/** Make a full filepath from a directory path and a filename. +Prepend the dirpath to filename using the extension given. +If dirpath is NULL, prepend the default datadir to filepath. +Store the result in m_filepath. +@param dirpath directory path +@param name tablespace (table) name +@param ext filename extension */ +void Datafile::make_filepath(const char *dirpath, fil_space_t::name_type name, + ib_extention ext) +{ + ut_ad(dirpath || name.size()); + free_filepath(); + m_filepath= fil_make_filepath(dirpath, name, ext, false); + ut_ad(m_filepath); + set_filename(); +} + +/** Set the filepath by duplicating the filepath sent in. This is the +name of the file with its extension and absolute or relative path. +@param[in] filepath filepath to set */ +void +Datafile::set_filepath(const char* filepath) +{ + free_filepath(); + m_filepath = static_cast<char*>(ut_malloc_nokey(strlen(filepath) + 1)); + ::strcpy(m_filepath, filepath); + set_filename(); +} + +/** Free the filepath buffer. 
*/ +void +Datafile::free_filepath() +{ + if (m_filepath != NULL) { + ut_free(m_filepath); + m_filepath = NULL; + m_filename = NULL; + } +} + +/** Do a quick test if the filepath provided looks the same as this filepath +byte by byte. If they are two different looking paths to the same file, +same_as() will be used to show that after the files are opened. +@param[in] other filepath to compare with +@retval true if it is the same filename by byte comparison +@retval false if it looks different */ +bool +Datafile::same_filepath_as( + const char* other) const +{ + return(0 == strcmp(m_filepath, other)); +} + +/** Test if another opened datafile is the same file as this object. +@param[in] other Datafile to compare with +@return true if it is the same file, else false */ +bool +Datafile::same_as( + const Datafile& other) const +{ +#ifdef _WIN32 + return(m_file_info.dwVolumeSerialNumber + == other.m_file_info.dwVolumeSerialNumber + && m_file_info.nFileIndexHigh + == other.m_file_info.nFileIndexHigh + && m_file_info.nFileIndexLow + == other.m_file_info.nFileIndexLow); +#else + return(m_file_info.st_ino == other.m_file_info.st_ino + && m_file_info.st_dev == other.m_file_info.st_dev); +#endif /* WIN32 */ +} + +/** Reads a few significant fields from the first page of the first +datafile. The Datafile must already be open. +@param[in] read_only_mode If true, then readonly mode checks are enforced. +@return DB_SUCCESS or DB_IO_ERROR if page cannot be read */ +dberr_t +Datafile::read_first_page(bool read_only_mode) +{ + if (m_handle == OS_FILE_CLOSED) { + + dberr_t err = open_or_create(read_only_mode); + + if (err != DB_SUCCESS) { + return(err); + } + } + + /* Align the memory for a possible read from a raw device */ + + m_first_page = static_cast<byte*>( + aligned_malloc(UNIV_PAGE_SIZE_MAX, srv_page_size)); + + dberr_t err = DB_ERROR; + size_t page_size = UNIV_PAGE_SIZE_MAX; + + /* Don't want unnecessary complaints about partial reads. 
*/ + + while (page_size >= UNIV_PAGE_SIZE_MIN) { + + ulint n_read = 0; + + err = os_file_read( + IORequestReadPartial, m_handle, m_first_page, 0, + page_size, &n_read); + + if (err == DB_SUCCESS) { + break; + } + + if (err == DB_IO_ERROR && n_read == 0) { + break; + } + if (err == DB_IO_ERROR && n_read >= UNIV_PAGE_SIZE_MIN) { + page_size >>= 1; + } else if (srv_operation == SRV_OPERATION_BACKUP) { + break; + } else { + ib::info() << "Cannot read first page of '" + << m_filepath << "': " << err; + break; + } + } + + if (err != DB_SUCCESS) { + return(err); + } + + if (m_order == 0) { + if (memcmp_aligned<2>(FIL_PAGE_SPACE_ID + m_first_page, + FSP_HEADER_OFFSET + FSP_SPACE_ID + + m_first_page, 4)) { + ib::error() + << "Inconsistent tablespace ID in " + << m_filepath; + return DB_CORRUPTION; + } + + m_space_id = mach_read_from_4(FIL_PAGE_SPACE_ID + + m_first_page); + m_flags = fsp_header_get_flags(m_first_page); + if (!fil_space_t::is_valid_flags(m_flags, m_space_id)) { + uint32_t cflags = fsp_flags_convert_from_101(m_flags); + if (cflags == UINT32_MAX) { + switch (fsp_flags_is_incompatible_mysql(m_flags)) { + case 0: + sql_print_error("InnoDB: Invalid flags 0x%" PRIx32 " in %s", + m_flags, m_filepath); + return DB_CORRUPTION; + case 3: + case 2: + sql_print_error("InnoDB: MySQL-8.0 tablespace in %s", + m_filepath); + break; + case 1: + sql_print_error("InnoDB: MySQL Encrypted tablespace in %s", + m_filepath); + break; + } + sql_print_error("InnoDB: Restart in MySQL for migration/recovery."); + return DB_UNSUPPORTED; + } else { + m_flags = cflags; + } + } + } + + const size_t physical_size = fil_space_t::physical_size(m_flags); + + if (physical_size > page_size) { + ib::error() << "File " << m_filepath + << " should be longer than " + << page_size << " bytes"; + return(DB_CORRUPTION); + } + + return(err); +} + +/** Free the first page from memory when it is no longer needed. 
*/ +void Datafile::free_first_page() +{ + aligned_free(m_first_page); + m_first_page= nullptr; +} + +/** Validates the datafile and checks that it conforms with the expected +space ID and flags. The file should exist and be successfully opened +in order for this function to validate it. +@param[in] space_id The expected tablespace ID. +@param[in] flags The expected tablespace flags. +@retval DB_SUCCESS if tablespace is valid, DB_ERROR if not. +m_is_valid is also set true on success, else false. */ +dberr_t Datafile::validate_to_dd(uint32_t space_id, uint32_t flags) +{ + dberr_t err; + + if (!is_open()) { + return DB_ERROR; + } + + /* Validate this single-table-tablespace with the data dictionary, + but do not compare the DATA_DIR flag, in case the tablespace was + remotely located. */ + err = validate_first_page(); + if (err != DB_SUCCESS) { + return(err); + } + + flags &= ~FSP_FLAGS_MEM_MASK; + + /* Make sure the datafile we found matched the space ID. + If the datafile is a file-per-table tablespace then also match + the row format and zip page size. */ + if (m_space_id == space_id + && (fil_space_t::is_flags_equal(flags, m_flags) + || fil_space_t::is_flags_equal(m_flags, flags))) { + /* Datafile matches the tablespace expected. */ + return(DB_SUCCESS); + } + + /* else do not use this tablespace. */ + m_is_valid = false; + + ib::error() << "Refusing to load '" << m_filepath << "' (id=" + << m_space_id << ", flags=" << ib::hex(m_flags) + << "); dictionary contains id=" + << space_id << ", flags=" << ib::hex(flags); + + return(DB_ERROR); +} + +/** Validates this datafile for the purpose of recovery. The file should +exist and be successfully opened. We initially open it in read-only mode +because we just want to read the SpaceID. However, if the first page is +corrupt and needs to be restored from the doublewrite buffer, we will +reopen it in write mode and ry to restore that page. +@retval DB_SUCCESS if tablespace is valid, DB_ERROR if not. 
+m_is_valid is also set true on success, else false. */ +dberr_t +Datafile::validate_for_recovery() +{ + dberr_t err; + + ut_ad(is_open()); + ut_ad(!srv_read_only_mode); + + err = validate_first_page(); + + switch (err) { + case DB_TABLESPACE_EXISTS: + break; + case DB_SUCCESS: + if (!m_defer || !m_space_id) { + break; + } + /* InnoDB should check whether the deferred + tablespace page0 can be recovered from + double write buffer. InnoDB should try + to recover only if m_space_id exists because + dblwr pages can be searched via {space_id, 0}. + m_space_id is set in read_first_page(). */ + /* fall through */ + default: + /* Re-open the file in read-write mode Attempt to restore + page 0 from doublewrite and read the space ID from a survey + of the first few pages. */ + close(); + err = open_read_write(); + if (err != DB_SUCCESS) { + return(err); + } + + if (!m_defer) { + err = find_space_id(); + if (err != DB_SUCCESS || m_space_id == 0) { + ib::error() << "Datafile '" << m_filepath + << "' is corrupted. Cannot determine " + "the space ID from the first 64 pages."; + return(err); + } + } + + if (m_space_id == UINT32_MAX) { + return DB_SUCCESS; /* empty file */ + } + + if (recv_sys.dblwr.restore_first_page( + m_space_id, m_filepath, m_handle)) { + return m_defer ? err : DB_CORRUPTION; + } + + /* Free the previously read first page and then re-validate. */ + free_first_page(); + m_defer = false; + err = validate_first_page(); + } + + return(err); +} + +/** Check the consistency of the first page of a datafile when the +tablespace is opened. This occurs before the fil_space_t is created +so the Space ID found here must not already be open. +m_is_valid is set true on success, else false. 
+@retval DB_SUCCESS on if the datafile is valid +@retval DB_CORRUPTION if the datafile is not readable +@retval DB_TABLESPACE_EXISTS if there is a duplicate space_id */ +dberr_t Datafile::validate_first_page() +{ + const char* error_txt = NULL; + + m_is_valid = true; + + if (m_first_page == NULL + && read_first_page(srv_read_only_mode) != DB_SUCCESS) { + + error_txt = "Cannot read first page"; + } + + if (error_txt != NULL) { +err_exit: + free_first_page(); + + if (recv_recovery_is_on() + || srv_operation == SRV_OPERATION_BACKUP) { + m_defer= true; + return DB_SUCCESS; + } + + ib::info() << error_txt << " in datafile: " << m_filepath + << ", Space ID:" << m_space_id << ", Flags: " + << m_flags; + m_is_valid = false; + return(DB_CORRUPTION); + } + + /* Check if the whole page is blank. */ + if (!m_space_id && !m_flags) { + const byte* b = m_first_page; + ulint nonzero_bytes = srv_page_size; + + while (*b == '\0' && --nonzero_bytes != 0) { + + b++; + } + + if (nonzero_bytes == 0) { + error_txt = "Header page consists of zero bytes"; + goto err_exit; + } + } + + if (!fil_space_t::is_valid_flags(m_flags, m_space_id)) { + /* Tablespace flags must be valid. */ + error_txt = "Tablespace flags are invalid"; + goto err_exit; + } + + ulint logical_size = fil_space_t::logical_size(m_flags); + + if (srv_page_size != logical_size) { + free_first_page(); + if (recv_recovery_is_on() + || srv_operation == SRV_OPERATION_BACKUP) { + m_defer= true; + return DB_SUCCESS; + } + /* Logical size must be innodb_page_size. 
*/ + ib::error() + << "Data file '" << m_filepath << "' uses page size " + << logical_size << ", but the innodb_page_size" + " start-up parameter is " + << srv_page_size; + return(DB_ERROR); + } + + if (page_get_page_no(m_first_page) != 0) { + /* First page must be number 0 */ + error_txt = "Header page contains inconsistent data"; + goto err_exit; + } + + if (m_space_id >= SRV_SPACE_ID_UPPER_BOUND) { + error_txt = "A bad Space ID was found"; + goto err_exit; + } + + if (buf_page_is_corrupted(false, m_first_page, m_flags)) { + /* Look for checksum and other corruptions. */ + error_txt = "Checksum mismatch"; + goto err_exit; + } + + mysql_mutex_lock(&fil_system.mutex); + + fil_space_t* space = fil_space_get_by_id(m_space_id); + + if (space) { + fil_node_t* node = UT_LIST_GET_FIRST(space->chain); + + if (node && !strcmp(m_filepath, node->name)) { +ok_exit: + mysql_mutex_unlock(&fil_system.mutex); + return DB_SUCCESS; + } + + if (!m_space_id + && (recv_recovery_is_on() + || srv_operation == SRV_OPERATION_BACKUP)) { + m_defer= true; + goto ok_exit; + } + + /* Make sure the space_id has not already been opened. */ + ib::error() << "Attempted to open a previously opened" + " tablespace. Previous tablespace: " + << (node ? node->name : "(unknown)") + << " uses space ID: " << m_space_id + << ". Cannot open filepath: " << m_filepath + << " which uses the same space ID."; + } + + mysql_mutex_unlock(&fil_system.mutex); + + if (space) { + m_is_valid = false; + + free_first_page(); + + return(is_predefined_tablespace(m_space_id) + ? DB_CORRUPTION + : DB_TABLESPACE_EXISTS); + } + + return(DB_SUCCESS); +} + +/** Determine the space id of the given file descriptor by reading a few +pages from the beginning of the .ibd file. +@return DB_SUCCESS if space id was successfully identified, else DB_ERROR. 
*/ +dberr_t +Datafile::find_space_id() +{ + os_offset_t file_size; + + ut_ad(m_handle != OS_FILE_CLOSED); + + file_size = os_file_get_size(m_handle); + + if (!file_size) { + return DB_SUCCESS; + } + + if (file_size == (os_offset_t) -1) { + ib::error() << "Could not get file size of datafile '" + << m_filepath << "'"; + return(DB_CORRUPTION); + } + + /* Assuming a page size, read the space_id from each page and store it + in a map. Find out which space_id is agreed on by majority of the + pages. Choose that space_id. */ + for (ulint page_size = UNIV_ZIP_SIZE_MIN; + page_size <= UNIV_PAGE_SIZE_MAX; + page_size <<= 1) { + /* map[space_id] = count of pages */ + typedef std::map< + uint32_t, + uint32_t, + std::less<uint32_t>, + ut_allocator<std::pair<const uint32_t, uint32_t> > > + Pages; + + Pages verify; + uint32_t page_count = 64; + uint32_t valid_pages = 0; + + /* Adjust the number of pages to analyze based on file size */ + while ((page_count * page_size) > file_size) { + --page_count; + } + + ib::info() + << "Page size:" << page_size + << ". 
Pages to analyze:" << page_count; + + byte* page = static_cast<byte*>( + aligned_malloc(page_size, page_size)); + + uint32_t fsp_flags; + /* provide dummy value if the first os_file_read() fails */ + switch (srv_checksum_algorithm) { + case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32: + case SRV_CHECKSUM_ALGORITHM_FULL_CRC32: + fsp_flags = 1U << FSP_FLAGS_FCRC32_POS_MARKER + | FSP_FLAGS_FCRC32_PAGE_SSIZE() + | uint(innodb_compression_algorithm) + << FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO; + break; + default: + fsp_flags = 0; + } + + for (ulint j = 0; j < page_count; ++j) { + if (os_file_read(IORequestRead, m_handle, page, + j * page_size, page_size, nullptr)) { + ib::info() + << "READ FAIL: page_no:" << j; + continue; + } + + if (j == 0) { + fsp_flags = mach_read_from_4( + page + FSP_HEADER_OFFSET + FSP_SPACE_FLAGS); + } + + bool noncompressed_ok = false; + + /* For noncompressed pages, the page size must be + equal to srv_page_size. */ + if (page_size == srv_page_size + && !fil_space_t::zip_size(fsp_flags)) { + noncompressed_ok = !buf_page_is_corrupted( + false, page, fsp_flags); + } + + bool compressed_ok = false; + + if (srv_page_size <= UNIV_PAGE_SIZE_DEF + && page_size == fil_space_t::zip_size(fsp_flags)) { + compressed_ok = !buf_page_is_corrupted( + false, page, fsp_flags); + } + + if (noncompressed_ok || compressed_ok) { + + uint32_t space_id = mach_read_from_4(page + + FIL_PAGE_SPACE_ID); + + if (space_id > 0) { + + ib::info() + << "VALID: space:" + << space_id << " page_no:" << j + << " page_size:" << page_size; + + ++valid_pages; + + ++verify[space_id]; + } + } + } + + aligned_free(page); + + ib::info() + << "Page size: " << page_size + << ". 
Possible space_id count:" << verify.size(); + + const ulint pages_corrupted = 3; + + for (ulint missed = 0; missed <= pages_corrupted; ++missed) { + + for (Pages::const_iterator it = verify.begin(); + it != verify.end(); + ++it) { + + ib::info() << "space_id:" << it->first + << ", Number of pages matched: " + << it->second << "/" << valid_pages + << " (" << page_size << ")"; + + if (it->second == (valid_pages - missed)) { + ib::info() << "Chosen space:" + << it->first; + + m_space_id = it->first; + return(DB_SUCCESS); + } + } + + } + } + + return(DB_CORRUPTION); +} + +/** Read an InnoDB Symbolic Link (ISL) file by name. +@param link_filepath filepath of the ISL file +@return data file name (must be freed by the caller) +@retval nullptr on error */ +static char *read_link_file(const char *link_filepath) +{ + if (FILE* file= fopen(link_filepath, "r+b" STR_O_CLOEXEC)) + { + char *filepath= static_cast<char*>(ut_malloc_nokey(OS_FILE_MAX_PATH)); + + os_file_read_string(file, filepath, OS_FILE_MAX_PATH); + fclose(file); + + if (size_t len= strlen(filepath)) + { + /* Trim whitespace from end of filepath */ + len--; + while (static_cast<byte>(filepath[len]) <= 0x20) + { + if (!len) + return nullptr; + filepath[len--]= 0; + } + /* Ensure that the last 2 path separators are forward slashes, + because elsewhere we are assuming that tablespace file names end + in "/databasename/tablename.ibd". */ + unsigned trailing_slashes= 0; + for (; len; len--) + { + switch (filepath[len]) { +#ifdef _WIN32 + case '\\': + filepath[len]= '/'; + /* fall through */ +#endif + case '/': + if (++trailing_slashes >= 2) + return filepath; + } + } + } + } + + return nullptr; +} + +/** Create a link filename, +open that file, and read the contents into m_filepath. 
+@param name table name +@return filepath() +@retval nullptr if the .isl file does not exist or cannot be read */ +const char *RemoteDatafile::open_link_file(const fil_space_t::name_type name) +{ + if (!m_link_filepath) + m_link_filepath= fil_make_filepath(nullptr, name, ISL, false); + m_filepath= read_link_file(m_link_filepath); + return m_filepath; +} + +/** Release the resources. */ +void +RemoteDatafile::shutdown() +{ + Datafile::shutdown(); + + if (m_link_filepath != 0) { + ut_free(m_link_filepath); + m_link_filepath = 0; + } +} + +/** Create InnoDB Symbolic Link (ISL) file. +@param name tablespace name +@param filepath full file name +@return DB_SUCCESS or error code */ +dberr_t RemoteDatafile::create_link_file(fil_space_t::name_type name, + const char *filepath) +{ + bool success; + dberr_t err = DB_SUCCESS; + char* link_filepath = NULL; + char* prev_filepath = NULL; + + ut_ad(!srv_read_only_mode); + + link_filepath = fil_make_filepath(NULL, name, ISL, false); + + if (link_filepath == NULL) { + return(DB_ERROR); + } + + prev_filepath = read_link_file(link_filepath); + if (prev_filepath) { + /* Truncate (starting with MySQL 5.6, probably no + longer since MariaDB Server 10.2.19) used to call this + with an existing link file which contains the same filepath. */ + bool same = !strncmp(prev_filepath, name.data(), name.size()) + && !strcmp(prev_filepath + name.size(), DOT_IBD); + ut_free(prev_filepath); + if (same) { + ut_free(link_filepath); + return(DB_SUCCESS); + } + } + + /** Check if the file already exists. 
*/ + FILE* file = NULL; + bool exists; + os_file_type_t ftype; + + success = os_file_status(link_filepath, &exists, &ftype); + ulint error = 0; + + if (success && !exists) { + + file = fopen(link_filepath, "w"); + if (file == NULL) { + /* This call will print its own error message */ + error = os_file_get_last_error(true); + } + } else { + error = OS_FILE_ALREADY_EXISTS; + } + + if (error != 0) { + + ib::error() << "Cannot create file " << link_filepath << "."; + + if (error == OS_FILE_ALREADY_EXISTS) { + ib::error() << "The link file: " << link_filepath + << " already exists."; + err = DB_TABLESPACE_EXISTS; + + } else if (error == OS_FILE_DISK_FULL) { + err = DB_OUT_OF_FILE_SPACE; + + } else { + err = DB_ERROR; + } + + /* file is not open, no need to close it. */ + ut_free(link_filepath); + return(err); + } + + const size_t len = strlen(filepath); + if (fwrite(filepath, 1, len, file) != len) { + error = os_file_get_last_error(true); + ib::error() << + "Cannot write link file: " + << link_filepath << " filepath: " << filepath; + err = DB_ERROR; + } + + /* Close the file, we only need it at startup */ + fclose(file); + + ut_free(link_filepath); + + return(err); +} + +/** Delete an InnoDB Symbolic Link (ISL) file. */ +void +RemoteDatafile::delete_link_file(void) +{ + ut_ad(m_link_filepath != NULL); + + if (m_link_filepath != NULL) { + os_file_delete_if_exists(innodb_data_file_key, + m_link_filepath, NULL); + } +} + +/** Delete an InnoDB Symbolic Link (ISL) file by name. 
+@param name tablespace name */ +void RemoteDatafile::delete_link_file(fil_space_t::name_type name) +{ + if (char *link_filepath= fil_make_filepath(NULL, name, ISL, false)) + { + os_file_delete_if_exists(innodb_data_file_key, link_filepath, nullptr); + ut_free(link_filepath); + } +} diff --git a/storage/innobase/fsp/fsp0fsp.cc b/storage/innobase/fsp/fsp0fsp.cc new file mode 100644 index 00000000..6c5c354e --- /dev/null +++ b/storage/innobase/fsp/fsp0fsp.cc @@ -0,0 +1,3070 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file fsp/fsp0fsp.cc +File space management + +Created 11/29/1995 Heikki Tuuri +***********************************************************************/ + +#include "fsp0fsp.h" +#include "buf0buf.h" +#include "fil0fil.h" +#include "fil0crypt.h" +#include "mtr0log.h" +#include "ut0byte.h" +#include "page0page.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "ibuf0ibuf.h" +#include "btr0btr.h" +#include "btr0sea.h" +#include "dict0boot.h" +#include "log0log.h" +#include "dict0mem.h" +#include "fsp0types.h" +#include "log.h" + +typedef uint32_t page_no_t; + +/** Returns the first extent descriptor for a segment. +We think of the extent lists of the segment catenated in the order +FSEG_FULL -> FSEG_NOT_FULL -> FSEG_FREE. +@param[in] inode segment inode +@param[in] space tablespace +@param[in,out] mtr mini-transaction +@param[out] err error code +@return the first extent descriptor, or NULL if none */ +MY_ATTRIBUTE((nonnull, warn_unused_result)) +static +xdes_t* +fseg_get_first_extent( + fseg_inode_t* inode, + const fil_space_t* space, + mtr_t* mtr, + dberr_t* err); + +ATTRIBUTE_COLD MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** Put new extents to the free list if there are free extents above the free +limit. If an extent happens to contain an extent descriptor page, the extent +is put to the FSP_FREE_FRAG list with the page marked as used. 
+@param[in] init_space true if this is a single-table tablespace +and we are only initializing the first extent and the first bitmap pages; +then we will not allocate more extents +@param[in,out] space tablespace +@param[in,out] header tablespace header +@param[in,out] mtr mini-transaction */ +static +dberr_t +fsp_fill_free_list( + bool init_space, + fil_space_t* space, + buf_block_t* header, + mtr_t* mtr); + +/** Allocates a single free page from a segment. +This function implements the intelligent allocation strategy which tries to +minimize file space fragmentation. +@param[in,out] space tablespace +@param[in,out] seg_inode segment inode +@param[in,out] iblock segment inode page +@param[in] hint hint of which page would be desirable +@param[in] direction if the new page is needed because of +an index page split, and records are inserted there in order, into which +direction they go alphabetically: FSP_DOWN, FSP_UP, FSP_NO_DIR +@param[in,out] mtr mini-transaction +@param[in,out] init_mtr mtr or another mini-transaction in +which the page should be initialized. 
+@param[out] err error code +@return the allocated page +@retval nullptr if no page could be allocated */ +static +buf_block_t* +fseg_alloc_free_page_low( + fil_space_t* space, + fseg_inode_t* seg_inode, + buf_block_t* iblock, + uint32_t hint, + byte direction, +#ifdef UNIV_DEBUG + bool has_done_reservation, + /*!< whether the space has already been reserved */ +#endif /* UNIV_DEBUG */ + mtr_t* mtr, + mtr_t* init_mtr, + dberr_t* err) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** Get the tablespace header block, SX-latched +@param[in] space tablespace +@param[in,out] mtr mini-transaction +@param[out] err error code +@return pointer to the space header, page x-locked +@retval nullptr if the page cannot be retrieved or is corrupted */ +static buf_block_t *fsp_get_header(const fil_space_t *space, mtr_t *mtr, + dberr_t *err) +{ + const page_id_t id{space->id, 0}; + buf_block_t *block= mtr->get_already_latched(id, MTR_MEMO_PAGE_SX_FIX); + if (block) + *err= DB_SUCCESS; + else + { + block= buf_page_get_gen(id, space->zip_size(), RW_SX_LATCH, + nullptr, BUF_GET_POSSIBLY_FREED, + mtr, err); + if (block && + space->id != mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_ID + + block->page.frame)) + { + *err= DB_CORRUPTION; + block= nullptr; + } + } + return block; +} + +/** Set the XDES_FREE_BIT of a page. 
+@tparam free desired value of XDES_FREE_BIT +@param[in] block extent descriptor block +@param[in,out] descr extent descriptor +@param[in] offset page offset within the extent +@param[in,out] mtr mini-transaction */ +template<bool free> +inline void xdes_set_free(const buf_block_t &block, xdes_t *descr, + ulint offset, mtr_t *mtr) +{ + ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_SX_FIX | + MTR_MEMO_PAGE_X_FIX)); + ut_ad(offset < FSP_EXTENT_SIZE); + ut_ad(page_align(descr) == block.page.frame); + compile_time_assert(XDES_BITS_PER_PAGE == 2); + compile_time_assert(XDES_FREE_BIT == 0); + compile_time_assert(XDES_CLEAN_BIT == 1); + + ulint index= XDES_BITS_PER_PAGE * offset; + byte *b= &descr[XDES_BITMAP + (index >> 3)]; + /* xdes_init() should have set all XDES_CLEAN_BIT. */ + ut_ad(!(~*b & 0xaa)); + /* Clear or set XDES_FREE_BIT. */ + byte val= free + ? static_cast<byte>(*b | 1 << (index & 7)) + : static_cast<byte>(*b & ~(1 << (index & 7))); + mtr->write<1>(block, b, val); +} + +/** +Find a free page. +@param descr extent descriptor +@param hint page offset to start searching from (towards larger pages) +@return free page offset +@retval FIL_NULL if no page is free */ +inline uint32_t xdes_find_free(const xdes_t *descr, uint32_t hint= 0) +{ + const uint32_t extent_size= FSP_EXTENT_SIZE; + ut_ad(hint < extent_size); + for (uint32_t i= hint; i < extent_size; i++) + if (xdes_is_free(descr, i)) + return i; + for (uint32_t i= 0; i < hint; i++) + if (xdes_is_free(descr, i)) + return i; + return FIL_NULL; +} + +/** +Determine the number of used pages in a descriptor. +@param descr file descriptor +@return number of pages used */ +inline uint32_t xdes_get_n_used(const xdes_t *descr) +{ + uint32_t count= 0; + + for (uint32_t i= FSP_EXTENT_SIZE; i--; ) + if (!xdes_is_free(descr, i)) + count++; + + return count; +} + +/** +Determine whether a file extent is full. 
+@param descr file descriptor +@return whether all pages have been allocated */ +inline bool xdes_is_full(const xdes_t *descr) +{ + return FSP_EXTENT_SIZE == xdes_get_n_used(descr); +} + +/** Set the state of an extent descriptor. +@param[in] block extent descriptor block +@param[in,out] descr extent descriptor +@param[in] state the state +@param[in,out] mtr mini-transaction */ +inline void xdes_set_state(const buf_block_t &block, xdes_t *descr, + byte state, mtr_t *mtr) +{ + ut_ad(descr && mtr); + ut_ad(state >= XDES_FREE); + ut_ad(state <= XDES_FSEG); + ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_SX_FIX | + MTR_MEMO_PAGE_X_FIX)); + ut_ad(page_align(descr) == block.page.frame); + ut_ad(mach_read_from_4(descr + XDES_STATE) <= XDES_FSEG); + mtr->write<1>(block, XDES_STATE + 3 + descr, state); +} + +/**********************************************************************//** +Gets the state of an xdes. +@return state */ +UNIV_INLINE +ulint +xdes_get_state( +/*===========*/ + const xdes_t* descr) /*!< in: descriptor */ +{ + ulint state; + + ut_ad(descr); + state = mach_read_from_4(descr + XDES_STATE); + ut_ad(state - 1 < XDES_FSEG); + return(state); +} + +/**********************************************************************//** +Inits an extent descriptor to the free and clean state. */ +inline void xdes_init(const buf_block_t &block, xdes_t *descr, mtr_t *mtr) +{ + ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_SX_FIX | + MTR_MEMO_PAGE_X_FIX)); + mtr->memset(&block, uint16_t(descr - block.page.frame) + XDES_BITMAP, + XDES_SIZE - XDES_BITMAP, 0xff); + xdes_set_state(block, descr, XDES_FREE, mtr); +} + +/** Mark a page used in an extent descriptor. 
+@param[in,out] seg_inode segment inode +@param[in,out] iblock segment inode page +@param[in] page page number +@param[in,out] descr extent descriptor +@param[in,out] xdes extent descriptor page +@param[in,out] mtr mini-transaction +@return error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fseg_mark_page_used(fseg_inode_t *seg_inode, buf_block_t *iblock, + ulint page, xdes_t *descr, buf_block_t *xdes, mtr_t *mtr) +{ + ut_ad(fil_page_get_type(iblock->page.frame) == FIL_PAGE_INODE); + ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE)); + ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + seg_inode, 4)); + ut_ad(!memcmp(seg_inode + FSEG_ID, descr + XDES_ID, 4)); + + const uint16_t xoffset= uint16_t(descr - xdes->page.frame + XDES_FLST_NODE); + const uint16_t ioffset= uint16_t(seg_inode - iblock->page.frame); + + if (!xdes_get_n_used(descr)) + { + /* We move the extent from the free list to the NOT_FULL list */ + if (dberr_t err= flst_remove(iblock, uint16_t(FSEG_FREE + ioffset), + xdes, xoffset, mtr)) + return err; + if (dberr_t err= flst_add_last(iblock, uint16_t(FSEG_NOT_FULL + ioffset), + xdes, xoffset, mtr)) + return err; + } + + if (UNIV_UNLIKELY(!xdes_is_free(descr, page % FSP_EXTENT_SIZE))) + return DB_CORRUPTION; + + /* We mark the page as used */ + xdes_set_free<false>(*xdes, descr, page % FSP_EXTENT_SIZE, mtr); + + byte* p_not_full= seg_inode + FSEG_NOT_FULL_N_USED; + const uint32_t not_full_n_used= mach_read_from_4(p_not_full) + 1; + mtr->write<4>(*iblock, p_not_full, not_full_n_used); + if (xdes_is_full(descr)) + { + /* We move the extent from the NOT_FULL list to the FULL list */ + if (dberr_t err= flst_remove(iblock, uint16_t(FSEG_NOT_FULL + ioffset), + xdes, xoffset, mtr)) + return err; + if (dberr_t err= flst_add_last(iblock, uint16_t(FSEG_FULL + ioffset), + xdes, xoffset, mtr)) + return err; + mtr->write<4>(*iblock, seg_inode + FSEG_NOT_FULL_N_USED, + not_full_n_used - FSP_EXTENT_SIZE); + } + + return 
DB_SUCCESS; +} + +/** Get pointer to a the extent descriptor of a page. +@param[in,out] sp_header tablespace header page, x-latched +@param[in] space tablespace +@param[in] offset page offset +@param[in,out] mtr mini-transaction +@param[out] err error code +@param[out] desc_block descriptor block +@param[in] init_space whether the tablespace is being initialized +@return pointer to the extent descriptor, NULL if the page does not +exist in the space or if the offset exceeds free limit */ +UNIV_INLINE MY_ATTRIBUTE((warn_unused_result)) +xdes_t* +xdes_get_descriptor_with_space_hdr( + buf_block_t* header, + const fil_space_t* space, + page_no_t offset, + mtr_t* mtr, + dberr_t* err = nullptr, + buf_block_t** desc_block = nullptr, + bool init_space = false) +{ + ut_ad(space->is_owner()); + ut_ad(mtr->memo_contains_flagged(header, MTR_MEMO_PAGE_SX_FIX + | MTR_MEMO_PAGE_X_FIX)); + /* Read free limit and space size */ + uint32_t limit = mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT + + header->page.frame); + uint32_t size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + + header->page.frame); + ut_ad(limit == space->free_limit + || (space->free_limit == 0 + && (init_space + || space->purpose == FIL_TYPE_TEMPORARY + || (srv_startup_is_before_trx_rollback_phase + && (space->id == TRX_SYS_SPACE + || srv_is_undo_tablespace(space->id)))))); + ut_ad(size == space->size_in_header); + + if (offset >= size || offset >= limit) { + return nullptr; + } + + const unsigned zip_size = space->zip_size(); + + uint32_t descr_page_no = xdes_calc_descriptor_page(zip_size, offset); + + buf_block_t* block = header; + + if (descr_page_no) { + block = buf_page_get_gen(page_id_t(space->id, descr_page_no), + zip_size, RW_SX_LATCH, nullptr, + BUF_GET_POSSIBLY_FREED, mtr, err); + } + + if (desc_block) { + *desc_block = block; + } + + return block + ? 
XDES_ARR_OFFSET + XDES_SIZE + * xdes_calc_descriptor_index(zip_size, offset) + + block->page.frame + : nullptr; +} + +MY_ATTRIBUTE((nonnull(1,3), warn_unused_result)) +/** Get the extent descriptor of a page. +The page where the extent descriptor resides is x-locked. If the page +offset is equal to the free limit of the space, we will add new +extents from above the free limit to the space free list, if not free +limit == space size. This adding is necessary to make the descriptor +defined, as they are uninitialized above the free limit. +@param[in] space tablespace +@param[in] offset page offset; if equal to the free limit, we +try to add new extents to the space free list +@param[in,out] mtr mini-transaction +@param[out] err error code +@param[out] xdes extent descriptor page +@return the extent descriptor */ +static xdes_t *xdes_get_descriptor(const fil_space_t *space, page_no_t offset, + mtr_t *mtr, dberr_t *err= nullptr, + buf_block_t **xdes= nullptr) +{ + if (buf_block_t *block= + buf_page_get_gen(page_id_t(space->id, 0), space->zip_size(), RW_SX_LATCH, + nullptr, BUF_GET_POSSIBLY_FREED, mtr, err)) + return xdes_get_descriptor_with_space_hdr(block, space, offset, mtr, + err, xdes); + return nullptr; +} + +MY_ATTRIBUTE((nonnull(3), warn_unused_result)) +/** Get a pointer to the extent descriptor. The page where the +extent descriptor resides is x-locked. 
+@param space tablespace +@param lst_node file address of the list node contained in the descriptor +@param mtr mini-transaction +@param err error code +@param block extent descriptor block +@return pointer to the extent descriptor */ +static inline +xdes_t *xdes_lst_get_descriptor(const fil_space_t &space, fil_addr_t lst_node, + mtr_t *mtr, buf_block_t **block= nullptr, + dberr_t *err= nullptr) +{ + ut_ad(mtr->memo_contains(space)); + ut_ad(lst_node.boffset < space.physical_size()); + buf_block_t *b; + if (!block) + block= &b; + *block= buf_page_get_gen(page_id_t{space.id, lst_node.page}, + space.zip_size(), RW_SX_LATCH, + nullptr, BUF_GET_POSSIBLY_FREED, mtr, err); + if (*block) + return (*block)->page.frame + lst_node.boffset - XDES_FLST_NODE; + + space.set_corrupted(); + return nullptr; +} + +/********************************************************************//** +Returns page offset of the first page in extent described by a descriptor. +@return offset of the first page in extent */ +static uint32_t xdes_get_offset(const xdes_t *descr) +{ + ut_ad(descr); + return page_get_page_no(page_align(descr)) + + uint32_t(((page_offset(descr) - XDES_ARR_OFFSET) / XDES_SIZE) * + FSP_EXTENT_SIZE); +} + +/** Initialize a file page whose prior contents should be ignored. 
+@param[in,out] block buffer pool block */ +void fsp_apply_init_file_page(buf_block_t *block) +{ + memset_aligned<UNIV_PAGE_SIZE_MIN>(block->page.frame, 0, srv_page_size); + const page_id_t id(block->page.id()); + + mach_write_to_4(block->page.frame + FIL_PAGE_OFFSET, id.page_no()); + memset_aligned<8>(block->page.frame + FIL_PAGE_PREV, 0xff, 8); + mach_write_to_4(block->page.frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, + id.space()); + if (page_zip_des_t* page_zip= buf_block_get_page_zip(block)) + { + memset_aligned<UNIV_ZIP_SIZE_MIN>(page_zip->data, 0, + page_zip_get_size(page_zip)); + static_assert(FIL_PAGE_OFFSET == 4, "compatibility"); + memcpy_aligned<4>(page_zip->data + FIL_PAGE_OFFSET, + block->page.frame + FIL_PAGE_OFFSET, 4); + memset_aligned<8>(page_zip->data + FIL_PAGE_PREV, 0xff, 8); + static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2, + "not perfect alignment"); + memcpy_aligned<2>(page_zip->data + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, + block->page.frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 4); + } +} + +#ifdef UNIV_DEBUG +/** Assert that the mini-transaction is compatible with +updating an allocation bitmap page. +@param[in] mtr mini-transaction */ +void fil_space_t::modify_check(const mtr_t& mtr) const +{ + switch (mtr.get_log_mode()) { + case MTR_LOG_NONE: + /* These modes are only allowed within a non-bitmap page + when there is a higher-level redo log record written. */ + ut_ad(purpose == FIL_TYPE_TABLESPACE || purpose == FIL_TYPE_TEMPORARY); + break; + case MTR_LOG_NO_REDO: + ut_ad(purpose == FIL_TYPE_TEMPORARY || purpose == FIL_TYPE_IMPORT); + break; + default: + /* We may only write redo log for a persistent tablespace. */ + ut_ad(purpose == FIL_TYPE_TABLESPACE); + ut_ad(mtr.is_named_space(id)); + } +} +#endif + +/** Initialize a tablespace header. 
+@param[in,out] space tablespace +@param[in] size current size in blocks +@param[in,out] mtr mini-transaction +@return error code */ +dberr_t fsp_header_init(fil_space_t *space, uint32_t size, mtr_t *mtr) +{ + const page_id_t page_id(space->id, 0); + const ulint zip_size = space->zip_size(); + + buf_block_t *free_block = buf_LRU_get_free_block(false); + + mtr->x_lock_space(space); + + buf_block_t* block = buf_page_create(space, 0, zip_size, mtr, + free_block); + if (UNIV_UNLIKELY(block != free_block)) { + buf_pool.free_block(free_block); + } + + space->size_in_header = size; + space->free_len = 0; + space->free_limit = 0; + + /* The prior contents of the file page should be ignored */ + + fsp_init_file_page(space, block, mtr); + + mtr->write<2>(*block, block->page.frame + FIL_PAGE_TYPE, + FIL_PAGE_TYPE_FSP_HDR); + + mtr->write<4,mtr_t::MAYBE_NOP>(*block, FSP_HEADER_OFFSET + FSP_SPACE_ID + + block->page.frame, space->id); + ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_NOT_USED + + block->page.frame)); + /* recv_sys_t::parse() expects to find a WRITE record that + covers all 4 bytes. Therefore, we must specify mtr_t::FORCED + in order to avoid optimizing away any unchanged most + significant bytes of FSP_SIZE. 
*/ + mtr->write<4,mtr_t::FORCED>(*block, FSP_HEADER_OFFSET + FSP_SIZE + + block->page.frame, size); + ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT + + block->page.frame)); + if (auto f = space->flags & ~FSP_FLAGS_MEM_MASK) { + mtr->write<4,mtr_t::FORCED>(*block, + FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + + block->page.frame, f); + } + ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_FRAG_N_USED + + block->page.frame)); + + flst_init(block, FSP_HEADER_OFFSET + FSP_FREE, mtr); + flst_init(block, FSP_HEADER_OFFSET + FSP_FREE_FRAG, mtr); + flst_init(block, FSP_HEADER_OFFSET + FSP_FULL_FRAG, mtr); + flst_init(block, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL, mtr); + flst_init(block, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE, mtr); + + mtr->write<8>(*block, FSP_HEADER_OFFSET + FSP_SEG_ID + + block->page.frame, + 1U); + + if (dberr_t err = fsp_fill_free_list(!is_system_tablespace(space->id), + space, block, mtr)) { + return err; + } + + /* Write encryption metadata to page 0 if tablespace is + encrypted or encryption is disabled by table option. */ + if (space->crypt_data && + (space->crypt_data->should_encrypt() || + space->crypt_data->not_encrypted())) { + space->crypt_data->write_page0(block, mtr); + } + + return DB_SUCCESS; +} + +/** Try to extend a single-table tablespace so that a page would fit in the +data file. 
+@param[in,out] space tablespace +@param[in] page_no page number +@param[in,out] header tablespace header +@param[in,out] mtr mini-transaction +@return true if success */ +static ATTRIBUTE_COLD __attribute__((warn_unused_result)) +bool +fsp_try_extend_data_file_with_pages( + fil_space_t* space, + uint32_t page_no, + buf_block_t* header, + mtr_t* mtr) +{ + bool success; + ulint size; + + ut_ad(!is_system_tablespace(space->id)); + ut_d(space->modify_check(*mtr)); + + size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + + header->page.frame); + ut_ad(size == space->size_in_header); + + ut_a(page_no >= size); + + success = fil_space_extend(space, page_no + 1); + /* The size may be less than we wanted if we ran out of disk space. */ + /* recv_sys_t::parse() expects to find a WRITE record that + covers all 4 bytes. Therefore, we must specify mtr_t::FORCED + in order to avoid optimizing away any unchanged most + significant bytes of FSP_SIZE. */ + mtr->write<4,mtr_t::FORCED>(*header, FSP_HEADER_OFFSET + FSP_SIZE + + header->page.frame, space->size); + space->size_in_header = space->size; + + return(success); +} + +/** Calculate the number of physical pages in an extent for this file. +@param[in] physical_size page_size of the datafile +@return number of pages in an extent for this file */ +inline uint32_t fsp_get_extent_size_in_pages(ulint physical_size) +{ + return uint32_t((FSP_EXTENT_SIZE << srv_page_size_shift) / physical_size); +} + + +/** Calculate the number of pages to extend a datafile. +We extend single-table tablespaces first one extent at a time, +but 4 at a time for bigger tablespaces. It is not enough to extend always +by one extent, because we need to add at least one extent to FSP_FREE. +A single extent descriptor page will track many extents. And the extent +that uses its extent descriptor page is put onto the FSP_FREE_FRAG list. +Extents that do not use their extent descriptor page are added to FSP_FREE. 
+The physical page size is used to determine how many extents are tracked +on one extent descriptor page. See xdes_calc_descriptor_page(). +@param[in] physical_size page size in data file +@param[in] size current number of pages in the datafile +@return number of pages to extend the file. */ +static uint32_t fsp_get_pages_to_extend_ibd(unsigned physical_size, + uint32_t size) +{ + uint32_t extent_size = fsp_get_extent_size_in_pages(physical_size); + /* The threshold is set at 32MiB except when the physical page + size is small enough that it must be done sooner. */ + uint32_t threshold = std::min(32 * extent_size, physical_size); + + if (size >= threshold) { + /* Below in fsp_fill_free_list() we assume + that we add at most FSP_FREE_ADD extents at + a time */ + extent_size *= FSP_FREE_ADD; + } + + return extent_size; +} + +/** Try to extend the last data file of a tablespace if it is auto-extending. +@param[in,out] space tablespace +@param[in,out] header tablespace header +@param[in,out] mtr mini-transaction +@return number of pages added +@retval 0 if the tablespace was not extended */ +ATTRIBUTE_COLD __attribute__((nonnull)) +static +ulint +fsp_try_extend_data_file(fil_space_t *space, buf_block_t *header, mtr_t *mtr) +{ + const char* OUT_OF_SPACE_MSG = + "ran out of space. Please add another file or use" + " 'autoextend' for the last file in setting"; + + ut_d(space->modify_check(*mtr)); + + if (space->id == TRX_SYS_SPACE + && !srv_sys_space.can_auto_extend_last_file()) { + + /* We print the error message only once to avoid + spamming the error log. Note that we don't need + to reset the flag to false as dealing with this + error requires server restart. 
*/ + if (!srv_sys_space.get_tablespace_full_status()) { + sql_print_error("InnoDB: The InnoDB system tablespace " + "%s" " innodb_data_file_path.", + OUT_OF_SPACE_MSG); + srv_sys_space.set_tablespace_full_status(true); + } + return(0); + } else if (space->id == SRV_TMP_SPACE_ID + && !srv_tmp_space.can_auto_extend_last_file()) { + + /* We print the error message only once to avoid + spamming the error log. Note that we don't need + to reset the flag to false as dealing with this + error requires server restart. */ + if (!srv_tmp_space.get_tablespace_full_status()) { + sql_print_error("InnoDB: The InnoDB temporary" + " tablespace %s" + " innodb_temp_data_file_path.", + OUT_OF_SPACE_MSG); + srv_tmp_space.set_tablespace_full_status(true); + } + return(0); + } + + uint32_t size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + + header->page.frame); + ut_ad(size == space->size_in_header); + uint32_t size_increase; + + const unsigned ps = space->physical_size(); + + switch (space->id) { + case TRX_SYS_SPACE: + size_increase = srv_sys_space.get_increment(); + break; + case SRV_TMP_SPACE_ID: + size_increase = srv_tmp_space.get_increment(); + break; + default: + uint32_t extent_pages = fsp_get_extent_size_in_pages(ps); + if (size < extent_pages) { + /* Let us first extend the file to extent_size */ + if (!fsp_try_extend_data_file_with_pages( + space, extent_pages - 1, header, mtr)) { + return(0); + } + + size = extent_pages; + } + + size_increase = fsp_get_pages_to_extend_ibd(ps, size); + } + + if (size_increase == 0) { + return(0); + } + + if (!fil_space_extend(space, size + size_increase)) { + return(0); + } + + /* For the system tablespace, we ignore any fragments of a + full megabyte when storing the size to the space header */ + + space->size_in_header = space->id + ? space->size + : ut_2pow_round(space->size, (1024 * 1024) / ps); + + /* recv_sys_t::parse() expects to find a WRITE record that + covers all 4 bytes. 
Therefore, we must specify mtr_t::FORCED + in order to avoid optimizing away any unchanged most + significant bytes of FSP_SIZE. */ + mtr->write<4,mtr_t::FORCED>(*header, FSP_HEADER_OFFSET + FSP_SIZE + + header->page.frame, + space->size_in_header); + + return(size_increase); +} + +/** Reset the page type. +Data files created before MySQL 5.1.48 may contain garbage in FIL_PAGE_TYPE. +In MySQL 3.23.53, only undo log pages and index pages were tagged. +Any other pages were written with uninitialized bytes in FIL_PAGE_TYPE. +@param[in] block block with invalid FIL_PAGE_TYPE +@param[in] type expected page type +@param[in,out] mtr mini-transaction */ +ATTRIBUTE_COLD +void fil_block_reset_type(const buf_block_t& block, ulint type, mtr_t* mtr) +{ + ib::info() << "Resetting invalid page " << block.page.id() << " type " + << fil_page_get_type(block.page.frame) << " to " << type << "."; + mtr->write<2>(block, block.page.frame + FIL_PAGE_TYPE, type); +} + +/** Put new extents to the free list if there are free extents above the free +limit. If an extent happens to contain an extent descriptor page, the extent +is put to the FSP_FREE_FRAG list with the page marked as used. 
+@param[in] init_space true if this is a single-table tablespace +and we are only initializing the first extent and the first bitmap pages; +then we will not allocate more extents +@param[in,out] space tablespace +@param[in,out] header tablespace header +@param[in,out] mtr mini-transaction +@return error code */ +static +dberr_t +fsp_fill_free_list( + bool init_space, + fil_space_t* space, + buf_block_t* header, + mtr_t* mtr) +{ + ut_d(space->modify_check(*mtr)); + + /* Check if we can fill free list from above the free list limit */ + uint32_t size= + mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + header->page.frame); + uint32_t limit= + mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT + header->page.frame); + + ut_ad(size == space->size_in_header); + ut_ad(limit == space->free_limit); + + const auto zip_size= space->zip_size(); + + if (size < limit + FSP_EXTENT_SIZE * FSP_FREE_ADD) + { + bool skip_resize= init_space; + switch (space->id) { + case TRX_SYS_SPACE: + skip_resize= !srv_sys_space.can_auto_extend_last_file(); + break; + case SRV_TMP_SPACE_ID: + skip_resize= !srv_tmp_space.can_auto_extend_last_file(); + break; + } + + if (!skip_resize) + { + fsp_try_extend_data_file(space, header, mtr); + size= space->size_in_header; + } + } + + uint32_t count= 0; + for (uint32_t i= limit, extent_size= FSP_EXTENT_SIZE, + physical_size= space->physical_size(); + (init_space && i < 1) || + (i + extent_size <= size && count < FSP_FREE_ADD); + i += extent_size) + { + const bool init_xdes= !ut_2pow_remainder(i, physical_size); + space->free_limit= i + extent_size; + mtr->write<4>(*header, FSP_HEADER_OFFSET + FSP_FREE_LIMIT + + header->page.frame, i + extent_size); + + if (init_xdes) + { + /* We are going to initialize a new descriptor page + and a new ibuf bitmap page: the prior contents of the + pages should be ignored. 
*/ + + if (i) + { + buf_block_t *f= buf_LRU_get_free_block(false); + buf_block_t *block= buf_page_create(space, static_cast<uint32_t>(i), + zip_size, mtr, f); + if (UNIV_UNLIKELY(block != f)) + buf_pool.free_block(f); + fsp_init_file_page(space, block, mtr); + mtr->write<2>(*block, FIL_PAGE_TYPE + block->page.frame, + FIL_PAGE_TYPE_XDES); + } + + if (space->purpose != FIL_TYPE_TEMPORARY) + { + buf_block_t *f= buf_LRU_get_free_block(false); + buf_block_t *block= + buf_page_create(space, + static_cast<uint32_t>(i + FSP_IBUF_BITMAP_OFFSET), + zip_size, mtr, f); + if (UNIV_UNLIKELY(block != f)) + buf_pool.free_block(f); + fsp_init_file_page(space, block, mtr); + mtr->write<2>(*block, FIL_PAGE_TYPE + block->page.frame, + FIL_PAGE_IBUF_BITMAP); + } + } + + buf_block_t *xdes= nullptr; + xdes_t *descr; + { + dberr_t err= DB_SUCCESS; + descr= xdes_get_descriptor_with_space_hdr(header, space, i, mtr, + &err, &xdes, init_space); + if (!descr) + return err; + } + + if (xdes != header && !space->full_crc32()) + fil_block_check_type(*xdes, FIL_PAGE_TYPE_XDES, mtr); + xdes_init(*xdes, descr, mtr); + const uint16_t xoffset= + static_cast<uint16_t>(descr - xdes->page.frame + XDES_FLST_NODE); + if (UNIV_UNLIKELY(init_xdes)) + { + /* The first page in the extent is a descriptor page and the + second is an ibuf bitmap page: mark them used */ + xdes_set_free<false>(*xdes, descr, 0, mtr); + xdes_set_free<false>(*xdes, descr, FSP_IBUF_BITMAP_OFFSET, mtr); + xdes_set_state(*xdes, descr, XDES_FREE_FRAG, mtr); + if (dberr_t err= flst_add_last(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG, + xdes, xoffset, mtr)) + return err; + byte *n_used= FSP_HEADER_OFFSET + FSP_FRAG_N_USED + header->page.frame; + mtr->write<4>(*header, n_used, 2U + mach_read_from_4(n_used)); + } + else + { + if (dberr_t err= + flst_add_last(header, FSP_HEADER_OFFSET + FSP_FREE, + xdes, xoffset, mtr)) + return err; + count++; + } + } + + space->free_len+= count; + return DB_SUCCESS; +} + +MY_ATTRIBUTE((nonnull, 
warn_unused_result)) +/** Allocates a new free extent. +@param[in,out] space tablespace +@param[in] hint hint of which extent would be desirable: any +page offset in the extent goes; the hint must not be > FSP_FREE_LIMIT +@param[out] xdes extent descriptor page +@param[in,out] mtr mini-transaction +@return extent descriptor +@retval nullptr if cannot be allocated */ +static xdes_t *fsp_alloc_free_extent(fil_space_t *space, uint32_t hint, + buf_block_t **xdes, mtr_t *mtr, + dberr_t *err) +{ + fil_addr_t first; + xdes_t* descr; + buf_block_t* desc_block; + + buf_block_t* header = fsp_get_header(space, mtr, err); + if (!header) { +corrupted: + space->set_corrupted(); + return nullptr; + } + + descr = xdes_get_descriptor_with_space_hdr( + header, space, hint, mtr, err, &desc_block); + if (!descr) { + goto corrupted; + } + + if (desc_block != header && !space->full_crc32()) { + fil_block_check_type(*desc_block, FIL_PAGE_TYPE_XDES, mtr); + } + + if (xdes_get_state(descr) == XDES_FREE) { + /* Ok, we can take this extent */ + } else { + /* Take the first extent in the free list */ + first = flst_get_first(FSP_HEADER_OFFSET + FSP_FREE + + header->page.frame); + + if (first.page == FIL_NULL) { + *err = fsp_fill_free_list(false, space, header, mtr); + if (UNIV_UNLIKELY(*err != DB_SUCCESS)) { + goto corrupted; + } + + first = flst_get_first(FSP_HEADER_OFFSET + FSP_FREE + + header->page.frame); + if (first.page == FIL_NULL) { + return nullptr; /* No free extents left */ + } + } + + descr = xdes_lst_get_descriptor(*space, first, mtr, + &desc_block, err); + if (!descr) { + return descr; + } + } + + *err = flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE, desc_block, + static_cast<uint16_t>(descr - desc_block->page.frame + + XDES_FLST_NODE), + mtr); + if (UNIV_UNLIKELY(*err != DB_SUCCESS)) { + return nullptr; + } + + space->free_len--; + *xdes = desc_block; + + return(descr); +} + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** Allocate a single free page. 
+@param[in,out] header tablespace header +@param[in,out] xdes extent descriptor page +@param[in,out] descr extent descriptor +@param[in] bit slot to allocate in the extent +@param[in,out] mtr mini-transaction +@return error code */ +static dberr_t +fsp_alloc_from_free_frag(buf_block_t *header, buf_block_t *xdes, xdes_t *descr, + ulint bit, mtr_t *mtr) +{ + if (UNIV_UNLIKELY(xdes_get_state(descr) != XDES_FREE_FRAG || + !xdes_is_free(descr, bit))) + return DB_CORRUPTION; + xdes_set_free<false>(*xdes, descr, bit, mtr); + + /* Update the FRAG_N_USED field */ + byte *n_used_p= FSP_HEADER_OFFSET + FSP_FRAG_N_USED + header->page.frame; + uint32_t n_used = mach_read_from_4(n_used_p) + 1; + + if (xdes_is_full(descr)) + { + /* The fragment is full: move it to another list */ + const uint16_t xoffset= + static_cast<uint16_t>(descr - xdes->page.frame + XDES_FLST_NODE); + if (dberr_t err= flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG, + xdes, xoffset, mtr)) + return err; + if (dberr_t err= flst_add_last(header, FSP_HEADER_OFFSET + FSP_FULL_FRAG, + xdes, xoffset, mtr)) + return err; + xdes_set_state(*xdes, descr, XDES_FULL_FRAG, mtr); + n_used-= FSP_EXTENT_SIZE; + } + + mtr->write<4>(*header, n_used_p, n_used); + return DB_SUCCESS; +} + +/** Gets a buffer block for an allocated page. 
+@param[in,out] space tablespace +@param[in] offset page number of the allocated page +@param[in,out] mtr mini-transaction +@return block, initialized */ +static +buf_block_t* +fsp_page_create(fil_space_t *space, page_no_t offset, mtr_t *mtr) +{ + buf_block_t *block, *free_block; + + if (UNIV_UNLIKELY(space->is_being_truncated)) + { + const page_id_t page_id{space->id, offset}; + buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold()); + mysql_mutex_lock(&buf_pool.mutex); + block= reinterpret_cast<buf_block_t*> + (buf_pool.page_hash.get(page_id, chain)); + if (block && block->page.oldest_modification() <= 1) + block= nullptr; + mysql_mutex_unlock(&buf_pool.mutex); + + if (block) + { + ut_ad(block->page.buf_fix_count() >= 1); + ut_ad(block->page.lock.x_lock_count() == 1); + ut_ad(mtr->have_x_latch(*block)); + free_block= block; + goto got_free_block; + } + } + + free_block= buf_LRU_get_free_block(false); +got_free_block: + block= buf_page_create(space, static_cast<uint32_t>(offset), + space->zip_size(), mtr, free_block); + if (UNIV_UNLIKELY(block != free_block)) + buf_pool.free_block(free_block); + + fsp_init_file_page(space, block, mtr); + return block; +} + +/** Allocates a single free page from a space. +The page is marked as used. 
+@param[in,out] space tablespace +@param[in] hint hint of which page would be desirable +@param[in,out] mtr mini-transaction +@param[in,out] init_mtr mini-transaction in which the page should be +initialized (may be the same as mtr) +@param[out] err error code; DB_CORRUPTION and DB_OUT_OF_FILE_SPACE are assigned directly on the failure paths below, other values come from the called helpers +@return allocated block +@retval nullptr if no page could be allocated */ +static MY_ATTRIBUTE((warn_unused_result, nonnull)) +buf_block_t *fsp_alloc_free_page(fil_space_t *space, uint32_t hint, + mtr_t *mtr, mtr_t *init_mtr, dberr_t *err) +{ + ut_d(space->modify_check(*mtr)); + buf_block_t *block= fsp_get_header(space, mtr, err); + if (!block) + return block; + + buf_block_t *xdes; + /* Get the hinted descriptor */ + xdes_t *descr= xdes_get_descriptor_with_space_hdr(block, space, hint, mtr, + err, &xdes); + if (descr && xdes_get_state(descr) == XDES_FREE_FRAG) + /* Ok, we can take this extent */; /* Otherwise distinguish a failed lookup (error set) from a hinted extent that is merely unavailable or not in the FREE_FRAG state. */ + else if (*err != DB_SUCCESS) + { + err_exit: /* common exit for all paths that flag the tablespace as corrupted; also reached via the "corrupted" label further below */ + space->set_corrupted(); + return nullptr; + } + else + { + /* Else take the first extent in free_frag list */ + fil_addr_t first = flst_get_first(FSP_HEADER_OFFSET + FSP_FREE_FRAG + + block->page.frame); + if (first.page == FIL_NULL) + { + /* There are no partially full fragments: allocate a free extent + and add it to the FREE_FRAG list. NOTE that the allocation may + have as a side-effect that an extent containing a descriptor + page is added to the FREE_FRAG list. But we will allocate our + page from the free extent anyway.
*/ + descr= fsp_alloc_free_extent(space, hint, &xdes, mtr, err); + if (!descr) + return nullptr; + *err= flst_add_last(block, FSP_HEADER_OFFSET + FSP_FREE_FRAG, xdes, + static_cast<uint16_t>(descr - xdes->page.frame + + XDES_FLST_NODE), mtr); + if (UNIV_UNLIKELY(*err != DB_SUCCESS)) + return nullptr; + xdes_set_state(*xdes, descr, XDES_FREE_FRAG, mtr); + } + else + { + descr= xdes_lst_get_descriptor(*space, first, mtr, &xdes, err); + if (!descr) + return nullptr; + /* Reset the hint: the extent taken from the list is not the hinted one, so the search below starts at its first page */ + hint= 0; + } + } + + /* Now we have in descr an extent with at least one free page. Look + for a free page in the extent. */ + uint32_t free= xdes_find_free(descr, hint % FSP_EXTENT_SIZE); + if (free == FIL_NULL) + { + corrupted: /* entered both by falling through and by the backward gotos below; always reports DB_CORRUPTION */ + *err= DB_CORRUPTION; + goto err_exit; + } + + uint32_t page_no= xdes_get_offset(descr) + free; + uint32_t space_size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + + block->page.frame); + ut_ad(space_size == space->size_in_header || + (space->id == TRX_SYS_SPACE && + srv_startup_is_before_trx_rollback_phase)); + + if (space_size <= page_no) + { + /* It must be that we are extending a single-table tablespace + whose size is still < 64 pages */ + ut_ad(!is_system_tablespace(space->id)); + if (page_no >= FSP_EXTENT_SIZE) + { + sql_print_error("InnoDB: Trying to extend %s" + " by single page(s) though the size is " UINT32PF "." + " Page no " UINT32PF ".", + space->chain.start->name, space_size, page_no); + goto corrupted; + } + + if (!fsp_try_extend_data_file_with_pages(space, page_no, block, mtr)) + { + *err= DB_OUT_OF_FILE_SPACE; + return nullptr; + } + } + + *err= fsp_alloc_from_free_frag(block, xdes, descr, free, mtr); /* NOTE: on failure the "corrupted" label overwrites *err, so any error from fsp_alloc_from_free_frag() is reported as DB_CORRUPTION */ + if (UNIV_UNLIKELY(*err != DB_SUCCESS)) + goto corrupted; + return fsp_page_create(space, page_no, init_mtr); +} + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** Return an extent to the free list of a space.
+@param[in,out] space tablespace +@param[in] offset page number in the extent +@param[in,out] mtr mini-transaction +@return error code */ +static dberr_t fsp_free_extent(fil_space_t* space, page_no_t offset, + mtr_t* mtr) +{ + ut_ad(space->is_owner()); + dberr_t err; + buf_block_t *block= fsp_get_header(space, mtr, &err); + if (!block) + return err; + buf_block_t *xdes; + xdes_t *descr= xdes_get_descriptor_with_space_hdr(block, space, offset, mtr, + &err, &xdes); + if (!descr) + { + ut_ad(err || space->is_stopping()); + return err; + } + + if (UNIV_UNLIKELY(xdes_get_state(descr) == XDES_FREE)) + { + space->set_corrupted(); + return DB_CORRUPTION; + } + + xdes_init(*xdes, descr, mtr); + space->free_len++; + return flst_add_last(block, FSP_HEADER_OFFSET + FSP_FREE, + xdes, static_cast<uint16_t>(descr - xdes->page.frame + + XDES_FLST_NODE), mtr); +} + +MY_ATTRIBUTE((nonnull)) +/** Frees a single page of a space. +The page is marked as free and clean. +@param[in,out] space tablespace +@param[in] offset page number +@param[in,out] mtr mini-transaction +@return error code */ +static dberr_t fsp_free_page(fil_space_t *space, page_no_t offset, mtr_t *mtr) +{ + xdes_t* descr; + ulint frag_n_used; + + ut_ad(mtr); + ut_d(space->modify_check(*mtr)); + + /* fprintf(stderr, "Freeing page %lu in space %lu\n", page, space); */ + + dberr_t err; + buf_block_t* header = fsp_get_header(space, mtr, &err); + if (!header) { + ut_ad(space->is_stopping()); + return err; + } + buf_block_t* xdes; + + descr = xdes_get_descriptor_with_space_hdr(header, space, offset, mtr, + &err, &xdes); + if (!descr) { + ut_ad(err || space->is_stopping()); + return err; + } + + const auto state = xdes_get_state(descr); + + switch (state) { + case XDES_FREE_FRAG: + case XDES_FULL_FRAG: + if (!xdes_is_free(descr, offset % FSP_EXTENT_SIZE)) { + break; + } + /* fall through */ + default: + space->set_corrupted(); + return DB_CORRUPTION; + } + + frag_n_used = mach_read_from_4(FSP_HEADER_OFFSET + FSP_FRAG_N_USED + 
+ header->page.frame); + + const uint16_t xoffset= static_cast<uint16_t>(descr - xdes->page.frame + + XDES_FLST_NODE); + + if (state == XDES_FULL_FRAG) { + /* The fragment was full: move it to another list */ + err = flst_remove(header, FSP_HEADER_OFFSET + FSP_FULL_FRAG, + xdes, xoffset, mtr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + return err; + } + err = flst_add_last(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG, + xdes, xoffset, mtr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + return err; + } + xdes_set_state(*xdes, descr, XDES_FREE_FRAG, mtr); + mtr->write<4>(*header, FSP_HEADER_OFFSET + FSP_FRAG_N_USED + + header->page.frame, + frag_n_used + FSP_EXTENT_SIZE - 1); + } else if (UNIV_UNLIKELY(!frag_n_used)) { + return DB_CORRUPTION; + } else { + mtr->write<4>(*header, FSP_HEADER_OFFSET + FSP_FRAG_N_USED + + header->page.frame, frag_n_used - 1); + } + + mtr->free(*space, static_cast<uint32_t>(offset)); + xdes_set_free<true>(*xdes, descr, offset % FSP_EXTENT_SIZE, mtr); + ut_ad(err == DB_SUCCESS); + + if (!xdes_get_n_used(descr)) { + /* The extent has become free: move it to another list */ + err = flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG, + xdes, xoffset, mtr); + if (err == DB_SUCCESS) { + err = fsp_free_extent(space, offset, mtr); + } + } + + return err; +} + +/** @return Number of segment inodes which fit on a single page */ +inline ulint FSP_SEG_INODES_PER_PAGE(ulint physical_size) +{ + return (physical_size - FSEG_ARR_OFFSET - 10) / FSEG_INODE_SIZE; +} + +/** Returns the nth inode slot on an inode page. +@param[in] page segment inode page +@param[in] i inode index on page +@return segment inode */ +#define fsp_seg_inode_page_get_nth_inode(page, i) \ + FSEG_ARR_OFFSET + FSEG_INODE_SIZE * i + page + +/** Looks for a used segment inode on a segment inode page. 
+@param page segment inode page +@param physical_size page size +@return segment inode index +@retval ULINT_UNDEFINED if not found */ +static +ulint +fsp_seg_inode_page_find_used(const page_t *page, ulint physical_size) +{ + for (ulint i= 0; i < FSP_SEG_INODES_PER_PAGE(physical_size); i++) + { + const byte *inode= fsp_seg_inode_page_get_nth_inode(page, i); + if (mach_read_from_8(FSEG_ID + inode)) + { + ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4)); + return i; + } + } + + return ULINT_UNDEFINED; +} + +/** Looks for an unused segment inode on a segment inode page. +@param[in] page segment inode page +@param[in] i search forward starting from this index +@param[in] physical_size page size +@return segment inode index +@retval ULINT_UNDEFINED if not found */ +static +ulint +fsp_seg_inode_page_find_free(const page_t *page, ulint i, ulint physical_size) +{ + for (; i < FSP_SEG_INODES_PER_PAGE(physical_size); i++) + { + const byte *inode= fsp_seg_inode_page_get_nth_inode(page, i); + if (mach_read_from_8(FSEG_ID + inode)) + ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4)); + else + /* This is unused */ + return i; + } + return ULINT_UNDEFINED; +} + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** Allocate a file segment inode page. 
+@param[in,out] space tablespace +@param[in,out] header tablespace header +@param[in,out] mtr mini-transaction +@return error code */ +static dberr_t fsp_alloc_seg_inode_page(fil_space_t *space, + buf_block_t *header, mtr_t *mtr) +{ + ut_ad(header->page.id().space() == space->id); + dberr_t err; + buf_block_t *block= fsp_alloc_free_page(space, 0, mtr, mtr, &err); + + if (!block) + return err; + + ut_ad(block->page.lock.not_recursive()); + + mtr->write<2>(*block, block->page.frame + FIL_PAGE_TYPE, FIL_PAGE_INODE); + +#ifdef UNIV_DEBUG + const byte *inode= FSEG_ID + FSEG_ARR_OFFSET + block->page.frame; + for (ulint i= FSP_SEG_INODES_PER_PAGE(space->physical_size()); i--; + inode += FSEG_INODE_SIZE) + ut_ad(!mach_read_from_8(inode)); +#endif + + return flst_add_last(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE, + block, FSEG_INODE_PAGE_NODE, mtr); +} + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** Allocate a file segment inode. +@param[in,out] space tablespace +@param[in,out] header tablespace header +@param[out] iblock segment inode page +@param[in,out] mtr mini-transaction +@param[out] err error code +@return segment inode +@retval nullptr on failure */ +static fseg_inode_t* +fsp_alloc_seg_inode(fil_space_t *space, buf_block_t *header, + buf_block_t **iblock, mtr_t *mtr, dberr_t *err) +{ + /* Allocate a new segment inode page if needed. 
*/ + if (!flst_get_len(FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE + + header->page.frame)) + { + *err= fsp_alloc_seg_inode_page(space, header, mtr); + if (*err != DB_SUCCESS) + return nullptr; + } + + const page_id_t page_id + { + space->id, + mach_read_from_4(FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE + FLST_FIRST + + FIL_ADDR_PAGE + header->page.frame) + }; + + buf_block_t *block= + buf_page_get_gen(page_id, space->zip_size(), RW_SX_LATCH, + nullptr, BUF_GET_POSSIBLY_FREED, mtr, err); + if (!block) + return nullptr; + + if (!space->full_crc32()) + fil_block_check_type(*block, FIL_PAGE_INODE, mtr); + + const ulint physical_size= space->physical_size(); + ulint n= fsp_seg_inode_page_find_free(block->page.frame, 0, physical_size); + + if (UNIV_UNLIKELY(n >= FSP_SEG_INODES_PER_PAGE(physical_size))) + { + *err= DB_CORRUPTION; + return nullptr; + } + fseg_inode_t *inode= fsp_seg_inode_page_get_nth_inode(block->page.frame, n); + + if (ULINT_UNDEFINED == fsp_seg_inode_page_find_free(block->page.frame, n + 1, + physical_size)) + { + /* There are no other unused headers left on the page: move it + to another list */ + *err= flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE, + block, FSEG_INODE_PAGE_NODE, mtr); + if (UNIV_UNLIKELY(*err != DB_SUCCESS)) + return nullptr; + *err= flst_add_last(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL, + block, FSEG_INODE_PAGE_NODE, mtr); + if (UNIV_UNLIKELY(*err != DB_SUCCESS)) + return nullptr; + } + + ut_ad(!mach_read_from_8(inode + FSEG_ID) || + !memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4)); + *iblock= block; + return inode; +} + +MY_ATTRIBUTE((nonnull)) +/** Frees a file segment inode. 
+@param[in,out] space tablespace +@param[in,out] inode segment inode +@param[in,out] iblock segment inode page +@param[in,out] mtr mini-transaction */ +static void fsp_free_seg_inode(fil_space_t *space, fseg_inode_t *inode, + buf_block_t *iblock, mtr_t *mtr) +{ + ut_d(space->modify_check(*mtr)); + + dberr_t err; + buf_block_t *header= fsp_get_header(space, mtr, &err); + if (!header) + return; + if (UNIV_UNLIKELY(memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4))) + { + space->set_corrupted(); + return; + } + + const ulint physical_size= space->physical_size(); + + if (ULINT_UNDEFINED == fsp_seg_inode_page_find_free(iblock->page.frame, 0, + physical_size)) + { + /* Move the page to another list */ + if (flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL, + iblock, FSEG_INODE_PAGE_NODE, mtr) != DB_SUCCESS) + return; + if (flst_add_last(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE, + iblock, FSEG_INODE_PAGE_NODE, mtr) != DB_SUCCESS) + return; + } + + mtr->memset(iblock, page_offset(inode) + FSEG_ID, FSEG_INODE_SIZE, 0); + + if (ULINT_UNDEFINED != fsp_seg_inode_page_find_used(iblock->page.frame, + physical_size)) + return; + + /* There are no other used headers left on the page: free it */ + if (flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE, + iblock, FSEG_INODE_PAGE_NODE, mtr) == DB_SUCCESS) + fsp_free_page(space, iblock->page.id().page_no(), mtr); +} + +MY_ATTRIBUTE((nonnull(1,4,5), warn_unused_result)) +/** Returns the file segment inode, page x-latched. 
+@param[in] header segment header +@param[in] space space id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] mtr mini-transaction +@param[out] block inode block +@param[out] err error code +@return segment inode, page x-latched +@retrval nullptr if the inode is free or corruption was noticed */ +static +fseg_inode_t* +fseg_inode_try_get( + const fseg_header_t* header, + uint32_t space, + ulint zip_size, + mtr_t* mtr, + buf_block_t** block, + dberr_t* err = nullptr) +{ + if (UNIV_UNLIKELY(space != mach_read_from_4(header + FSEG_HDR_SPACE))) + { + corrupted: + if (err) + *err= DB_CORRUPTION; + return nullptr; + } + + *block= + buf_page_get_gen(page_id_t(space, + mach_read_from_4(header + FSEG_HDR_PAGE_NO)), + zip_size, RW_SX_LATCH, nullptr, BUF_GET_POSSIBLY_FREED, + mtr, err); + if (!*block) + return nullptr; + + const uint16_t offset= mach_read_from_2(header + FSEG_HDR_OFFSET); + if (UNIV_UNLIKELY(offset >= (*block)->physical_size())) + goto corrupted; + + fseg_inode_t *inode= (*block)->page.frame + offset; + if (UNIV_UNLIKELY(!mach_read_from_8(inode + FSEG_ID) || + memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4))) + goto corrupted; + + return inode; +} + +/** Get the page number from the nth fragment page slot. +@param inode file segment findex +@param n slot index +@return page number +@retval FIL_NULL if not in use */ +static uint32_t fseg_get_nth_frag_page_no(const fseg_inode_t *inode, ulint n) +{ + ut_ad(inode); + ut_ad(n < FSEG_FRAG_ARR_N_SLOTS); + ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4)); + return(mach_read_from_4(inode + FSEG_FRAG_ARR + + n * FSEG_FRAG_SLOT_SIZE)); +} + +/** Set the page number in the nth fragment page slot. 
+@param[in,out] inode segment inode +@param[in,out] iblock segment inode page +@param[in] n slot index +@param[in] page_no page number to set +@param[in,out] mtr mini-transaction */ +inline void fseg_set_nth_frag_page_no(fseg_inode_t *inode, buf_block_t *iblock, + ulint n, ulint page_no, mtr_t *mtr) +{ + ut_ad(n < FSEG_FRAG_ARR_N_SLOTS); + ut_ad(mtr->memo_contains_flagged(iblock, MTR_MEMO_PAGE_SX_FIX)); + ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4)); + + mtr->write<4>(*iblock, inode + FSEG_FRAG_ARR + n * FSEG_FRAG_SLOT_SIZE, + page_no); +} + +/**********************************************************************//** +Finds a fragment page slot which is free. +@return slot index; ULINT_UNDEFINED if none found */ +static +ulint +fseg_find_free_frag_page_slot( +/*==========================*/ + fseg_inode_t* inode) /*!< in: segment inode */ +{ + ulint i; + ulint page_no; + + for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) { + page_no = fseg_get_nth_frag_page_no(inode, i); + + if (page_no == FIL_NULL) { + + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +/**********************************************************************//** +Finds a fragment page slot which is used and last in the array. +@return slot index; ULINT_UNDEFINED if none found */ +static +ulint +fseg_find_last_used_frag_page_slot( +/*===============================*/ + fseg_inode_t* inode) /*!< in: segment inode */ +{ + ulint i; + ulint page_no; + + for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) { + page_no = fseg_get_nth_frag_page_no( + inode, FSEG_FRAG_ARR_N_SLOTS - i - 1); + + if (page_no != FIL_NULL) { + + return(FSEG_FRAG_ARR_N_SLOTS - i - 1); + } + } + + return(ULINT_UNDEFINED); +} + +/** Calculate reserved fragment page slots. 
+@param inode file segment index +@return number of fragment pages */ +static ulint fseg_get_n_frag_pages(const fseg_inode_t *inode) +{ + ulint i; + ulint count = 0; + + for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) { + if (FIL_NULL != fseg_get_nth_frag_page_no(inode, i)) { + count++; + } + } + + return(count); +} + +/** Create a new segment. +@param space tablespace +@param byte_offset byte offset of the created segment header +@param mtr mini-transaction +@param err error code +@param has_done_reservation whether fsp_reserve_free_extents() was invoked +@param block block where segment header is placed, + or NULL to allocate an additional page for that +@return the block where the segment header is placed, x-latched +@retval nullptr if could not create segment */ +buf_block_t* +fseg_create(fil_space_t *space, ulint byte_offset, mtr_t *mtr, dberr_t *err, + bool has_done_reservation, buf_block_t *block) +{ + fseg_inode_t* inode; + ib_id_t seg_id; + uint32_t n_reserved; + bool reserved_extent = false; + + DBUG_ENTER("fseg_create"); + + ut_ad(mtr); + ut_ad(byte_offset >= FIL_PAGE_DATA); + ut_ad(byte_offset + FSEG_HEADER_SIZE + <= srv_page_size - FIL_PAGE_DATA_END); + + mtr->x_lock_space(space); + ut_d(space->modify_check(*mtr)); + + ut_ad(!block || block->page.id().space() == space->id); + + buf_block_t* header = fsp_get_header(space, mtr, err); + if (!header) { + block = nullptr; + goto funct_exit; + } + + buf_block_t* iblock; + +inode_alloc: + inode = fsp_alloc_seg_inode(space, header, &iblock, mtr, err); + + if (!inode) { + block = nullptr; +reserve_extent: + if (!has_done_reservation && !reserved_extent) { + *err = fsp_reserve_free_extents(&n_reserved, space, 2, + FSP_NORMAL, mtr); + if (UNIV_UNLIKELY(*err != DB_SUCCESS)) { + DBUG_RETURN(nullptr); + } + + /* Extents reserved successfully. 
So + try allocating the page or inode */ + reserved_extent = true; + if (inode) { + goto page_alloc; + } + + goto inode_alloc; + } + + if (inode) { + fsp_free_seg_inode(space, inode, iblock, mtr); + } + goto funct_exit; + } + + /* Read the next segment id from space header and increment the + value in space header */ + + seg_id = mach_read_from_8(FSP_HEADER_OFFSET + FSP_SEG_ID + + header->page.frame); + + mtr->write<8>(*header, + FSP_HEADER_OFFSET + FSP_SEG_ID + header->page.frame, + seg_id + 1); + mtr->write<8>(*iblock, inode + FSEG_ID, seg_id); + ut_ad(!mach_read_from_4(inode + FSEG_NOT_FULL_N_USED)); + + flst_init(*iblock, inode + FSEG_FREE, mtr); + flst_init(*iblock, inode + FSEG_NOT_FULL, mtr); + flst_init(*iblock, inode + FSEG_FULL, mtr); + + mtr->memcpy(*iblock, inode + FSEG_MAGIC_N, FSEG_MAGIC_N_BYTES, 4); + compile_time_assert(FSEG_FRAG_SLOT_SIZE == 4); + compile_time_assert(FIL_NULL == 0xffffffff); + mtr->memset(iblock, + uint16_t(inode - iblock->page.frame) + FSEG_FRAG_ARR, + FSEG_FRAG_SLOT_SIZE * FSEG_FRAG_ARR_N_SLOTS, 0xff); + + if (!block) { +page_alloc: + block = fseg_alloc_free_page_low(space, + inode, iblock, 0, FSP_UP, +#ifdef UNIV_DEBUG + has_done_reservation, +#endif /* UNIV_DEBUG */ + mtr, mtr, err); + + if (!block) { + ut_ad(!has_done_reservation); + goto reserve_extent; + } + + ut_d(const auto x = block->page.lock.x_lock_count()); + ut_ad(x || block->page.lock.not_recursive()); + ut_ad(x == 1 || space->is_being_truncated); + ut_ad(x <= 2); + ut_ad(!fil_page_get_type(block->page.frame)); + mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->page.frame, + FIL_PAGE_TYPE_SYS); + } + + mtr->write<2>(*block, byte_offset + FSEG_HDR_OFFSET + + block->page.frame, page_offset(inode)); + + mtr->write<4>(*block, byte_offset + FSEG_HDR_PAGE_NO + + block->page.frame, iblock->page.id().page_no()); + + mtr->write<4,mtr_t::MAYBE_NOP>(*block, byte_offset + FSEG_HDR_SPACE + + block->page.frame, space->id); + +funct_exit: + if (!has_done_reservation && 
reserved_extent) { + space->release_free_extents(n_reserved); + } + + DBUG_RETURN(block); +} + +/**********************************************************************//** +Calculates the number of pages reserved by a segment, and how many pages are +currently used. +@return number of reserved pages */ +static +ulint +fseg_n_reserved_pages_low( +/*======================*/ + const fseg_inode_t* inode, /*!< in: segment inode */ + ulint* used) /*!< out: number of pages used (not + more than reserved) */ +{ + *used = mach_read_from_4(inode + FSEG_NOT_FULL_N_USED) + + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FULL) + + fseg_get_n_frag_pages(inode); + + return fseg_get_n_frag_pages(inode) + + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FREE) + + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_NOT_FULL) + + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FULL); +} + +/** Calculate the number of pages reserved by a segment, +and how many pages are currently used. +@param[in] block buffer block containing the file segment header +@param[in] header file segment header +@param[out] used number of pages that are used (not more than reserved) +@param[in,out] mtr mini-transaction +@return number of reserved pages */ +ulint fseg_n_reserved_pages(const buf_block_t &block, + const fseg_header_t *header, ulint *used, + mtr_t *mtr) +{ + ut_ad(page_align(header) == block.page.frame); + buf_block_t *iblock; + if (fseg_inode_t *inode= + fseg_inode_try_get(header, block.page.id().space(), block.zip_size(), + mtr, &iblock)) + return fseg_n_reserved_pages_low(inode, used); + return *used= 0; +} + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** Tries to fill the free list of a segment with consecutive free extents. +This happens if the segment is big enough to allow extents in the free list, +the free list is empty, and the extents can be allocated consecutively from +the hint onward. 
+@param[in] inode segment inode +@param[in,out] iblock segment inode page +@param[in] space tablespace +@param[in] hint hint which extent would be good as the first extent +@param[in,out] mtr mini-transaction */ +static dberr_t fseg_fill_free_list(const fseg_inode_t *inode, + buf_block_t *iblock, fil_space_t *space, + uint32_t hint, mtr_t *mtr) +{ + ulint used; + + ut_ad(!((page_offset(inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE)); + ut_d(space->modify_check(*mtr)); + + if (fseg_n_reserved_pages_low(inode, &used) < + FSEG_FREE_LIST_LIMIT * FSP_EXTENT_SIZE) + /* The segment is too small to allow extents in free list */ + return DB_SUCCESS; + + if (UNIV_UNLIKELY(memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4))) + { + space->set_corrupted(); + return DB_CORRUPTION; + } + + if (flst_get_len(inode + FSEG_FREE) > 0) + /* Free list is not empty */ + return DB_SUCCESS; + + for (ulint i= 0; i < FSEG_FREE_LIST_MAX_LEN; i++, hint += FSP_EXTENT_SIZE) + { + buf_block_t *xdes; + dberr_t err; + xdes_t *descr= xdes_get_descriptor(space, hint, mtr, &err, &xdes); + if (!descr || XDES_FREE != xdes_get_state(descr)) + /* We cannot allocate the desired extent: stop */ + return err; + + descr= fsp_alloc_free_extent(space, hint, &xdes, mtr, &err); + if (UNIV_UNLIKELY(!descr)) + return err; + + if (dberr_t err= + flst_add_last(iblock, + static_cast<uint16_t>(inode - iblock->page.frame + + FSEG_FREE), xdes, + static_cast<uint16_t>(descr - xdes->page.frame + + XDES_FLST_NODE), mtr)) + return err; + xdes_set_state(*xdes, descr, XDES_FSEG, mtr); + mtr->memcpy(*xdes, descr + XDES_ID, inode + FSEG_ID, 8); + } + + return DB_SUCCESS; +} + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** Allocates a free extent for the segment: looks first in the free list of +the segment, then tries to allocate from the space free list. +NOTE that the extent returned still resides in the segment free list, it is +not yet taken off it! 
+@param[in] inode segment inode +@param[in,out] iblock segment inode page +@param[out] xdes extent descriptor page +@param[in,out] space tablespace +@param[in,out] mtr mini-transaction +@param[out] err error code +@retval nullptr if no page could be allocated */ +static +xdes_t* +fseg_alloc_free_extent( + const fseg_inode_t* inode, + buf_block_t* iblock, + buf_block_t** xdes, + fil_space_t* space, + mtr_t* mtr, + dberr_t* err) +{ + ut_ad(!((page_offset(inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE)); + ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4)); + ut_d(space->modify_check(*mtr)); + + if (flst_get_len(inode + FSEG_FREE)) + { + /* Segment free list is not empty, allocate from it */ + return xdes_lst_get_descriptor(*space, flst_get_first(inode + FSEG_FREE), + mtr, xdes, err); + } + + xdes_t* descr= fsp_alloc_free_extent(space, 0, xdes, mtr, err); + if (UNIV_UNLIKELY(!descr)) + return descr; + xdes_set_state(**xdes, descr, XDES_FSEG, mtr); + mtr->memcpy<mtr_t::MAYBE_NOP>(**xdes, descr + XDES_ID, inode + FSEG_ID, 8); + *err= flst_add_last(iblock, + static_cast<uint16_t>(inode - iblock->page.frame + + FSEG_FREE), *xdes, + static_cast<uint16_t>(descr - (*xdes)->page.frame + + XDES_FLST_NODE), mtr); + if (UNIV_LIKELY(*err != DB_SUCCESS)) + return nullptr; + /* Try to fill the segment free list */ + *err= fseg_fill_free_list(inode, iblock, space, + xdes_get_offset(descr) + FSP_EXTENT_SIZE, mtr); + if (UNIV_UNLIKELY(*err != DB_SUCCESS)) + return nullptr; + + return descr; +} + +/** Allocates a single free page from a segment. +This function implements the intelligent allocation strategy which tries to +minimize file space fragmentation. 
+@param[in,out] space tablespace +@param[in,out] seg_inode segment inode +@param[in,out] iblock segment inode page +@param[in] hint hint of which page would be desirable +@param[in] direction if the new page is needed because of +an index page split, and records are inserted there in order, into which +direction they go alphabetically: FSP_DOWN, FSP_UP, FSP_NO_DIR +@param[in,out] mtr mini-transaction +@param[in,out] init_mtr mtr or another mini-transaction in +which the page should be initialized. +@param[out] err error code +@return the allocated page +@retval nullptr if no page could be allocated */ +static +buf_block_t* +fseg_alloc_free_page_low( + fil_space_t* space, + fseg_inode_t* seg_inode, + buf_block_t* iblock, + uint32_t hint, + byte direction, +#ifdef UNIV_DEBUG + bool has_done_reservation, + /*!< whether the space has already been reserved */ +#endif /* UNIV_DEBUG */ + mtr_t* mtr, + mtr_t* init_mtr, + dberr_t* err) +{ + ib_id_t seg_id; + ulint used; + ulint reserved; + xdes_t* descr; /*!< extent of the hinted page */ + uint32_t ret_page; /*!< the allocated page offset, FIL_NULL + if could not be allocated */ + xdes_t* ret_descr; /*!< the extent of the allocated page */ + buf_block_t* xdes; + ulint n; + + ut_ad((direction >= FSP_UP) && (direction <= FSP_NO_DIR)); + ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + seg_inode, 4)); + ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE)); + seg_id = mach_read_from_8(seg_inode + FSEG_ID); + + ut_ad(seg_id); + ut_d(space->modify_check(*mtr)); + ut_ad(fil_page_get_type(page_align(seg_inode)) == FIL_PAGE_INODE); + + reserved = fseg_n_reserved_pages_low(seg_inode, &used); + + buf_block_t* header = fsp_get_header(space, mtr, err); + if (!header) { + return header; + } + + descr = xdes_get_descriptor_with_space_hdr(header, space, hint, mtr, + err, &xdes); + if (!descr) { + if (*err != DB_SUCCESS) { + return nullptr; + } + /* Hint outside space or too high above free limit: reset + hint */ + /* The 
file space header page is always allocated. */ + hint = 0; + descr = xdes_get_descriptor(space, hint, mtr, err, &xdes); + if (!descr) { + return nullptr; + } + } + + /* In the big if-else below we look for ret_page and ret_descr */ + /*-------------------------------------------------------------*/ + if ((xdes_get_state(descr) == XDES_FSEG) + && mach_read_from_8(descr + XDES_ID) == seg_id + && xdes_is_free(descr, hint % FSP_EXTENT_SIZE)) { +take_hinted_page: + /* 1. We can take the hinted page + =================================*/ + ret_descr = descr; + ret_page = hint; + /* Skip the check for extending the tablespace. If the + page hint were not within the size of the tablespace, + we would have got (descr == NULL) above and reset the hint. */ + goto got_hinted_page; + /*-----------------------------------------------------------*/ + } else if (xdes_get_state(descr) == XDES_FREE + && reserved - used < reserved / FSEG_FILLFACTOR + && used >= FSEG_FRAG_LIMIT) { + + /* 2. We allocate the free extent from space and can take + ========================================================= + the hinted page + ===============*/ + ret_descr = fsp_alloc_free_extent(space, hint, &xdes, + mtr, err); + + if (UNIV_UNLIKELY(ret_descr != descr)) { + if (*err != DB_SUCCESS) { + *err = DB_CORRUPTION; + } + return nullptr; + } + + xdes_set_state(*xdes, ret_descr, XDES_FSEG, mtr); + mtr->write<8,mtr_t::MAYBE_NOP>(*xdes, ret_descr + XDES_ID, + seg_id); + *err = flst_add_last( + iblock, + static_cast<uint16_t>(seg_inode - iblock->page.frame + + FSEG_FREE), xdes, + static_cast<uint16_t>(ret_descr + - xdes->page.frame + + XDES_FLST_NODE), mtr); + if (UNIV_UNLIKELY(*err != DB_SUCCESS)) { + return nullptr; + } + + /* Try to fill the segment free list */ + *err = fseg_fill_free_list(seg_inode, iblock, space, + hint + FSP_EXTENT_SIZE, mtr); + if (UNIV_UNLIKELY(*err != DB_SUCCESS)) { + return nullptr; + } + goto take_hinted_page; + /*-----------------------------------------------------------*/ + 
} else if ((direction != FSP_NO_DIR) + && ((reserved - used) < reserved / FSEG_FILLFACTOR) + && (used >= FSEG_FRAG_LIMIT) + && (ret_descr = fseg_alloc_free_extent(seg_inode, iblock, + &xdes, space, + mtr, err))) { + /* 3. We take any free extent (which was already assigned above + =============================================================== + in the if-condition to ret_descr) and take the lowest or + ======================================================== + highest page in it, depending on the direction + ==============================================*/ + ret_page = xdes_get_offset(ret_descr); + + if (direction == FSP_DOWN) { + ret_page += FSP_EXTENT_SIZE - 1; + } + ut_ad(!has_done_reservation || ret_page != FIL_NULL); + /*-----------------------------------------------------------*/ + } else if (UNIV_UNLIKELY(*err != DB_SUCCESS)) { + return nullptr; + } else if ((xdes_get_state(descr) == XDES_FSEG) + && mach_read_from_8(descr + XDES_ID) == seg_id + && (!xdes_is_full(descr))) { + + /* 4. We can take the page from the same extent as the + ====================================================== + hinted page (and the extent already belongs to the + ================================================== + segment) + ========*/ + ret_descr = descr; + ret_page = xdes_find_free(ret_descr, hint % FSP_EXTENT_SIZE); + if (ret_page == FIL_NULL) { + ut_ad(!has_done_reservation); + } else { + ret_page += xdes_get_offset(ret_descr); + } + /*-----------------------------------------------------------*/ + } else if (reserved - used > 0) { + /* 5. 
We take any unused page from the segment + ==============================================*/ + fil_addr_t first; + + if (flst_get_len(seg_inode + FSEG_NOT_FULL) > 0) { + first = flst_get_first(seg_inode + FSEG_NOT_FULL); + } else if (flst_get_len(seg_inode + FSEG_FREE) > 0) { + first = flst_get_first(seg_inode + FSEG_FREE); + } else { + ut_ad(!has_done_reservation); + return(NULL); + } + + ret_descr = xdes_lst_get_descriptor(*space, first, mtr, &xdes); + if (!ret_descr) { + return nullptr; + } + + ret_page = xdes_find_free(ret_descr); + if (ret_page == FIL_NULL) { + ut_ad(!has_done_reservation); + } else { + ret_page += xdes_get_offset(ret_descr); + } + /*-----------------------------------------------------------*/ + } else if (used < FSEG_FRAG_LIMIT) { + /* 6. We allocate an individual page from the space + ===================================================*/ + buf_block_t* block = fsp_alloc_free_page( + space, hint, mtr, init_mtr, err); + + ut_ad(block || !has_done_reservation || *err); + + if (block) { + /* Put the page in the fragment page array of the + segment */ + n = fseg_find_free_frag_page_slot(seg_inode); + if (UNIV_UNLIKELY(n == ULINT_UNDEFINED)) { + *err = DB_CORRUPTION; + return nullptr; + } + + fseg_set_nth_frag_page_no( + seg_inode, iblock, n, + block->page.id().page_no(), mtr); + } + + /* fsp_alloc_free_page() invoked fsp_init_file_page() + already. */ + return(block); + /*-----------------------------------------------------------*/ + } else { + /* 7. 
We allocate a new extent and take its first page + ======================================================*/ + ret_descr = fseg_alloc_free_extent(seg_inode, iblock, &xdes, + space, mtr, err); + + if (!ret_descr) { + ut_ad(!has_done_reservation || *err); + return nullptr; + } else { + ret_page = xdes_get_offset(ret_descr); + } + } + + if (ret_page == FIL_NULL) { + /* Page could not be allocated */ + + ut_ad(!has_done_reservation); + return(NULL); + } + + if (space->size <= ret_page && !is_predefined_tablespace(space->id)) { + /* It must be that we are extending a single-table + tablespace whose size is still < 64 pages */ + + if (ret_page >= FSP_EXTENT_SIZE) { + sql_print_error("InnoDB: Trying to extend '%s'" + " by single page(s) though the" + " space size " UINT32PF "." + " Page no " UINT32PF ".", + space->chain.start->name, space->size, + ret_page); + ut_ad(!has_done_reservation); + return(NULL); + } + + if (!fsp_try_extend_data_file_with_pages( + space, ret_page, header, mtr)) { + /* No disk space left */ + ut_ad(!has_done_reservation); + return(NULL); + } + } + +got_hinted_page: + /* ret_descr == NULL if the block was allocated from free_frag + (XDES_FREE_FRAG) */ + if (ret_descr != NULL) { + /* At this point we know the extent and the page offset. + The extent is still in the appropriate list (FSEG_NOT_FULL + or FSEG_FREE), and the page is not yet marked as used. */ + + ut_d(buf_block_t* xxdes); + ut_ad(xdes_get_descriptor(space, ret_page, mtr, err, &xxdes) + == ret_descr); + ut_ad(xdes == xxdes); + ut_ad(xdes_is_free(ret_descr, ret_page % FSP_EXTENT_SIZE)); + + *err = fseg_mark_page_used(seg_inode, iblock, ret_page, + ret_descr, xdes, mtr); + if (UNIV_UNLIKELY(*err != DB_SUCCESS)) { + return nullptr; + } + } + + return fsp_page_create(space, ret_page, init_mtr); +} + +/**********************************************************************//** +Allocates a single free page from a segment. 
This function implements +the intelligent allocation strategy which tries to minimize file space +fragmentation. +@retval NULL if no page could be allocated */ +buf_block_t* +fseg_alloc_free_page_general( +/*=========================*/ + fseg_header_t* seg_header,/*!< in/out: segment header */ + uint32_t hint, /*!< in: hint of which page would be + desirable */ + byte direction,/*!< in: if the new page is needed because + of an index page split, and records are + inserted there in order, into which + direction they go alphabetically: FSP_DOWN, + FSP_UP, FSP_NO_DIR */ + bool has_done_reservation, /*!< in: true if the caller has + already done the reservation for the page + with fsp_reserve_free_extents, then there + is no need to do the check for this individual + page */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + mtr_t* init_mtr,/*!< in/out: mtr or another mini-transaction + in which the page should be initialized. */ + dberr_t* err) /*!< out: error code */ +{ + fseg_inode_t* inode; + fil_space_t* space; + buf_block_t* iblock; + buf_block_t* block; + uint32_t n_reserved; + + const uint32_t space_id = page_get_space_id(page_align(seg_header)); + space = mtr->x_lock_space(space_id); + inode = fseg_inode_try_get(seg_header, space_id, space->zip_size(), + mtr, &iblock, err); + if (!inode) { + return nullptr; + } + if (!space->full_crc32()) { + fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr); + } + + if (!has_done_reservation) { + *err = fsp_reserve_free_extents(&n_reserved, space, 2, + FSP_NORMAL, mtr); + if (*err != DB_SUCCESS) { + return nullptr; + } + } + + block = fseg_alloc_free_page_low(space, + inode, iblock, hint, direction, +#ifdef UNIV_DEBUG + has_done_reservation, +#endif /* UNIV_DEBUG */ + mtr, init_mtr, err); + + /* The allocation cannot fail if we have already reserved a + space for the page. 
*/ + ut_ad(block || !has_done_reservation || *err); + + if (!has_done_reservation) { + space->release_free_extents(n_reserved); + } + + return(block); +} + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** Check that we have at least n_pages frag pages free in the first extent +of a single-table tablespace, and they are also physically initialized to +the data file. That is we have already extended the data file so that those +pages are inside the data file. If not, this function extends the tablespace +with pages. +@param[in,out] space tablespace +@param[in,out] header tablespace header, x-latched +@param[in] size tablespace size in pages, less than FSP_EXTENT_SIZE +@param[in,out] mtr mini-transaction +@param[in] n_pages number of pages to reserve +@return error code */ +static +dberr_t +fsp_reserve_free_pages( + fil_space_t* space, + buf_block_t* header, + ulint size, + mtr_t* mtr, + uint32_t n_pages) +{ + ut_ad(space != fil_system.sys_space && space != fil_system.temp_space); + ut_ad(size < FSP_EXTENT_SIZE); + + dberr_t err= DB_OUT_OF_FILE_SPACE; + const xdes_t *descr= + xdes_get_descriptor_with_space_hdr(header, space, 0, mtr, &err); + if (!descr) + return err; + const uint32_t n_used= xdes_get_n_used(descr); + if (size >= n_used + n_pages) + return DB_SUCCESS; + if (n_used > size) + return DB_CORRUPTION; + return fsp_try_extend_data_file_with_pages(space, n_used + n_pages - 1, + header, mtr) + ? DB_SUCCESS + : DB_OUT_OF_FILE_SPACE; +} + +/** Reserves free pages from a tablespace. All mini-transactions which may +use several pages from the tablespace should call this function beforehand +and reserve enough free extents so that they certainly will be able +to do their operation, like a B-tree page split, fully. Reservations +must be released with function fil_space_t::release_free_extents()! 
+ +The alloc_type below has the following meaning: FSP_NORMAL means an +operation which will probably result in more space usage, like an +insert in a B-tree; FSP_UNDO means allocation to undo logs: if we are +deleting rows, then this allocation will in the long run result in +less space usage (after a purge); FSP_CLEANING means allocation done +in a physical record delete (like in a purge) or other cleaning operation +which will result in less space usage in the long run. We prefer the latter +two types of allocation: when space is scarce, FSP_NORMAL allocations +will not succeed, but the latter two allocations will succeed, if possible. +The purpose is to avoid a dead end where the database is full but the +user cannot free any space because these freeing operations temporarily +reserve some space. + +Single-table tablespaces whose size is < FSP_EXTENT_SIZE pages are a special +case. In this function we would liberally reserve several extents for +every page split or merge in a B-tree. But we do not want to waste disk space +if the table only occupies < FSP_EXTENT_SIZE pages. That is why we apply +different rules in that special case, just ensuring that there are n_pages +free pages available. + +@param[out] n_reserved number of extents actually reserved; if we + return DB_SUCCESS and the tablespace size is < + FSP_EXTENT_SIZE pages, then this can be 0, + otherwise it is n_ext +@param[in,out] space tablespace +@param[in] n_ext number of extents to reserve +@param[in] alloc_type page reservation type (FSP_BLOB, etc) +@param[in,out] mtr the mini transaction +@param[in] n_pages for small tablespaces (tablespace size is + less than FSP_EXTENT_SIZE), number of free + pages to reserve.
+@return error code +@retval DB_SUCCESS if we were able to make the reservation */ +dberr_t +fsp_reserve_free_extents( + uint32_t* n_reserved, + fil_space_t* space, + uint32_t n_ext, + fsp_reserve_t alloc_type, + mtr_t* mtr, + uint32_t n_pages) +{ + ulint reserve; + + ut_ad(mtr); + *n_reserved = n_ext; + + const uint32_t extent_size = FSP_EXTENT_SIZE; + + mtr->x_lock_space(space); + const unsigned physical_size = space->physical_size(); + + dberr_t err; + buf_block_t* header = fsp_get_header(space, mtr, &err); + if (!header) { + return err; + } +try_again: + uint32_t size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + + header->page.frame); + ut_ad(size == space->size_in_header); + + if (size < extent_size && n_pages < extent_size / 2) { + /* Use different rules for small single-table tablespaces */ + *n_reserved = 0; + return fsp_reserve_free_pages(space, header, size, + mtr, n_pages); + } + + uint32_t n_free_list_ext = flst_get_len(FSP_HEADER_OFFSET + FSP_FREE + + header->page.frame); + ut_ad(space->free_len == n_free_list_ext); + + uint32_t free_limit = mach_read_from_4(FSP_HEADER_OFFSET + + FSP_FREE_LIMIT + + header->page.frame); + ut_ad(space->free_limit == free_limit); + + /* Below we play safe when counting free extents above the free limit: + some of them will contain extent descriptor pages, and therefore + will not be free extents */ + + uint32_t n_free_up; + + if (size >= free_limit) { + n_free_up = (size - free_limit) / extent_size; + if (n_free_up) { + n_free_up--; + n_free_up -= n_free_up / (physical_size / extent_size); + } + } else { + ut_ad(alloc_type == FSP_BLOB); + n_free_up = 0; + } + + uint32_t n_free = n_free_list_ext + n_free_up; + + switch (alloc_type) { + case FSP_NORMAL: + /* We reserve 1 extent + 0.5 % of the space size to undo logs + and 1 extent + 0.5 % to cleaning operations; NOTE: this source + code is duplicated in the function below! 
*/ + + reserve = 2 + ((size / extent_size) * 2) / 200; + + if (n_free <= reserve + n_ext) { + + goto try_to_extend; + } + break; + case FSP_UNDO: + /* We reserve 0.5 % of the space size to cleaning operations */ + + reserve = 1 + ((size / extent_size) * 1) / 200; + + if (n_free <= reserve + n_ext) { + + goto try_to_extend; + } + break; + case FSP_CLEANING: + case FSP_BLOB: + reserve = 0; + break; + default: + ut_error; + } + + if (space->reserve_free_extents(n_free, n_ext)) { + return DB_SUCCESS; + } +try_to_extend: + if (fsp_try_extend_data_file(space, header, mtr)) { + goto try_again; + } + + return DB_OUT_OF_FILE_SPACE; +} + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** Frees a single page of a segment. +@param[in] seg_inode segment inode +@param[in,out] space tablespace +@param[in] offset page number +@param[in,out] mtr mini-transaction +@param[in] ahi Drop adaptive hash index +@return error code */ +static +dberr_t +fseg_free_page_low( + fseg_inode_t* seg_inode, + buf_block_t* iblock, + fil_space_t* space, + page_no_t offset, + mtr_t* mtr +#ifdef BTR_CUR_HASH_ADAPT + ,bool ahi=false +#endif /* BTR_CUR_HASH_ADAPT */ + ) +{ + ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + seg_inode, 4)); + ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE)); + ut_ad(iblock->page.frame == page_align(seg_inode)); + ut_d(space->modify_check(*mtr)); + +#ifdef BTR_CUR_HASH_ADAPT + if (ahi) { + btr_search_drop_page_hash_when_freed( + page_id_t(space->id, offset)); + } +#endif /* BTR_CUR_HASH_ADAPT */ + + const uint32_t extent_size = FSP_EXTENT_SIZE; + ut_ad(ut_is_2pow(extent_size)); + buf_block_t* xdes; + dberr_t err; + xdes_t* descr = xdes_get_descriptor(space, offset, mtr, &err, &xdes); + + if (!descr) { + return err; + } + if (UNIV_UNLIKELY(xdes_is_free(descr, offset & (extent_size - 1)))) { +corrupted: + space->set_corrupted(); + return DB_CORRUPTION; + } + + if (xdes_get_state(descr) != XDES_FSEG) { + /* The page is in the fragment pages of the segment */ 
+ for (ulint i = 0;; i++) { + if (fseg_get_nth_frag_page_no(seg_inode, i) + != offset) { + continue; + } + + compile_time_assert(FIL_NULL == 0xffffffff); + mtr->memset(iblock, uint16_t(seg_inode + - iblock->page.frame) + + FSEG_FRAG_ARR + + i * FSEG_FRAG_SLOT_SIZE, 4, 0xff); + break; + } + + return fsp_free_page(space, offset, mtr); + } + + /* If we get here, the page is in some extent of the segment */ + + if (UNIV_UNLIKELY(memcmp(descr + XDES_ID, seg_inode + FSEG_ID, 8))) { + goto corrupted; + } + + byte* p_not_full = seg_inode + FSEG_NOT_FULL_N_USED; + uint32_t not_full_n_used = mach_read_from_4(p_not_full); + const uint16_t xoffset= uint16_t(descr - xdes->page.frame + + XDES_FLST_NODE); + const uint16_t ioffset= uint16_t(seg_inode - iblock->page.frame); + + if (xdes_is_full(descr)) { + /* The fragment is full: move it to another list */ + err = flst_remove(iblock, + static_cast<uint16_t>(FSEG_FULL + ioffset), + xdes, xoffset, mtr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + return err; + } + err = flst_add_last(iblock, static_cast<uint16_t>(FSEG_NOT_FULL + + ioffset), + xdes, xoffset, mtr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + return err; + } + not_full_n_used += extent_size - 1; + } else { + if (!not_full_n_used) { + goto corrupted; + } + not_full_n_used--; + } + + mtr->write<4>(*iblock, p_not_full, not_full_n_used); + xdes_set_free<true>(*xdes, descr, offset & (extent_size - 1), mtr); + + if (!xdes_get_n_used(descr)) { + err = flst_remove(iblock, static_cast<uint16_t>(FSEG_NOT_FULL + + ioffset), + xdes, xoffset, mtr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + return err; + } + err = fsp_free_extent(space, offset, mtr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + return err; + } + } + + mtr->free(*space, static_cast<uint32_t>(offset)); + return DB_SUCCESS; +} + +/** Free a page in a file segment. 
+@param[in,out] seg_header file segment header +@param[in,out] space tablespace +@param[in] offset page number +@param[in,out] mtr mini-transaction +@param[in] have_latch whether space->x_lock() was already called +@return error code */ +dberr_t fseg_free_page(fseg_header_t *seg_header, fil_space_t *space, + uint32_t offset, mtr_t *mtr, bool have_latch) +{ + buf_block_t *iblock; + if (have_latch) + ut_ad(space->is_owner()); + else + mtr->x_lock_space(space); + + DBUG_PRINT("fseg_free_page", + ("space_id: " ULINTPF ", page_no: %u", space->id, offset)); + + dberr_t err; + if (fseg_inode_t *seg_inode= fseg_inode_try_get(seg_header, + space->id, space->zip_size(), + mtr, &iblock, &err)) + { + if (!space->full_crc32()) + fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr); + return fseg_free_page_low(seg_inode, iblock, space, offset, mtr); + } + + return err; +} + +/** Determine whether a page is allocated. +@param space tablespace +@param page page number +@return error code +@retval DB_SUCCESS if the page is marked as free +@retval DB_SUCCESS_LOCKED_REC if the page is marked as allocated */ +dberr_t fseg_page_is_allocated(fil_space_t *space, unsigned page) +{ + mtr_t mtr; + uint32_t dpage= xdes_calc_descriptor_page(space->zip_size(), page); + const unsigned zip_size= space->zip_size(); + dberr_t err= DB_SUCCESS; + + mtr.start(); + if (!space->is_owner()) + mtr.x_lock_space(space); + + if (page >= space->free_limit || page >= space->size_in_header); + else if (const buf_block_t *b= + buf_page_get_gen(page_id_t(space->id, dpage), space->zip_size(), + RW_S_LATCH, nullptr, BUF_GET_POSSIBLY_FREED, + &mtr, &err)) + { + if (!dpage && + (space->free_limit != + mach_read_from_4(FSP_FREE_LIMIT + FSP_HEADER_OFFSET + + b->page.frame) || + space->size_in_header != + mach_read_from_4(FSP_SIZE + FSP_HEADER_OFFSET + b->page.frame))) + err= DB_CORRUPTION; + else + err= xdes_is_free(b->page.frame + XDES_ARR_OFFSET + XDES_SIZE + * xdes_calc_descriptor_index(zip_size, page), + page & 
(FSP_EXTENT_SIZE - 1)) + ? DB_SUCCESS + : DB_SUCCESS_LOCKED_REC; + } + + mtr.commit(); + return err; +} + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** Free an extent of a segment to the space free list. +@param[in,out] seg_inode segment inode +@param[in,out] space tablespace +@param[in] page page number in the extent +@param[in,out] mtr mini-transaction +@return error code */ +static +dberr_t +fseg_free_extent( + fseg_inode_t* seg_inode, + buf_block_t* iblock, + fil_space_t* space, + uint32_t page, + mtr_t* mtr +#ifdef BTR_CUR_HASH_ADAPT + ,bool ahi=false +#endif /* BTR_CUR_HASH_ADAPT */ + ) +{ + buf_block_t* xdes; + dberr_t err; + xdes_t* descr = xdes_get_descriptor(space, page, mtr, &err, &xdes); + + if (!descr) { + return err; + } + + if (UNIV_UNLIKELY(xdes_get_state(descr) != XDES_FSEG + || memcmp(descr + XDES_ID, seg_inode + FSEG_ID, 8) + || memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + + seg_inode, 4))) { + return DB_CORRUPTION; + } + ut_d(space->modify_check(*mtr)); + const uint32_t first_page_in_extent = page - (page % FSP_EXTENT_SIZE); + + const uint16_t xoffset= uint16_t(descr - xdes->page.frame + + XDES_FLST_NODE); + const uint16_t ioffset= uint16_t(seg_inode - iblock->page.frame); + +#ifdef BTR_CUR_HASH_ADAPT + if (ahi) { + for (uint32_t i = 0; i < FSP_EXTENT_SIZE; i++) { + if (!xdes_is_free(descr, i)) { + /* Drop search system page hash index + if the page is found in the pool and + is hashed */ + btr_search_drop_page_hash_when_freed( + page_id_t(space->id, + first_page_in_extent + i)); + } + } + } +#endif /* BTR_CUR_HASH_ADAPT */ + + uint16_t lst; + + if (xdes_is_full(descr)) { + lst = static_cast<uint16_t>(FSEG_FULL + ioffset); +remove: + err = flst_remove(iblock, lst, xdes, xoffset, mtr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + return err; + } + } else if (!xdes_get_n_used(descr)) { + lst = static_cast<uint16_t>(FSEG_FREE + ioffset); + goto remove; + } else { + err = flst_remove( + iblock, static_cast<uint16_t>(FSEG_NOT_FULL + ioffset), + xdes, 
xoffset, mtr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + return err; + } + uint32_t not_full_n_used = mach_read_from_4( + FSEG_NOT_FULL_N_USED + seg_inode); + uint32_t descr_n_used = xdes_get_n_used(descr); + if (not_full_n_used < descr_n_used) { + return DB_CORRUPTION; + } + mtr->write<4>(*iblock, seg_inode + FSEG_NOT_FULL_N_USED, + not_full_n_used - descr_n_used); + } + + std::vector<uint8_t> going_to_free; + static_assert(FSP_EXTENT_SIZE_MIN == 256, "compatibility"); + static_assert(FSP_EXTENT_SIZE_MAX == 64, "compatibility"); + + for (uint32_t i = 0; i < FSP_EXTENT_SIZE; i++) { + if (!xdes_is_free(descr, i)) { + going_to_free.emplace_back(uint8_t(i)); + } + } + + if (dberr_t err = fsp_free_extent(space, page, mtr)) { + return err; + } + + for (uint32_t i : going_to_free) { + mtr->free(*space, first_page_in_extent + i); + buf_page_free(space, first_page_in_extent + i, mtr); + } + + return DB_SUCCESS; +} + +/** Frees part of a segment. This function can be used to free +a segment by repeatedly calling this function in different +mini-transactions. Doing the freeing in a single mini-transaction +might result in too big a mini-transaction. 
+@param header segment header; NOTE: if the header resides on first + page of the frag list of the segment, this pointer + becomes obsolete after the last freeing step +@param mtr mini-transaction +@param ahi Drop the adaptive hash index +@return whether the freeing was completed */ +bool +fseg_free_step( + fseg_header_t* header, + mtr_t* mtr +#ifdef BTR_CUR_HASH_ADAPT + ,bool ahi +#endif /* BTR_CUR_HASH_ADAPT */ + ) +{ + ulint n; + fseg_inode_t* inode; + + const uint32_t space_id = page_get_space_id(page_align(header)); + const uint32_t header_page = page_get_page_no(page_align(header)); + + fil_space_t* space = mtr->x_lock_space(space_id); + xdes_t* descr = xdes_get_descriptor(space, header_page, mtr); + + if (!descr) { + return true; + } + + /* Check that the header resides on a page which has not been + freed yet */ + + if (UNIV_UNLIKELY(xdes_is_free(descr, + header_page & (FSP_EXTENT_SIZE - 1)))) { + /* Some corruption was detected: stop the freeing + in order to prevent a crash. */ + return true; + } + buf_block_t* iblock; + const ulint zip_size = space->zip_size(); + inode = fseg_inode_try_get(header, space_id, zip_size, mtr, &iblock); + if (!inode || space->is_stopping()) { + return true; + } + + if (!space->full_crc32()) { + fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr); + } + + dberr_t err; + descr = fseg_get_first_extent(inode, space, mtr, &err); + + if (descr) { + /* Free the extent held by the segment */ + return fseg_free_extent(inode, iblock, space, + xdes_get_offset(descr), mtr +#ifdef BTR_CUR_HASH_ADAPT + , ahi +#endif /* BTR_CUR_HASH_ADAPT */ + ) != DB_SUCCESS; + } + + if (err != DB_SUCCESS || space->is_stopping()) { + return true; + } + + /* Free a frag page */ + n = fseg_find_last_used_frag_page_slot(inode); + + if (n == ULINT_UNDEFINED) { + /* Freeing completed: free the segment inode */ + fsp_free_seg_inode(space, inode, iblock, mtr); + return true; + } + + page_no_t page_no = fseg_get_nth_frag_page_no(inode, n); + + if 
(fseg_free_page_low(inode, iblock, space, page_no, mtr +#ifdef BTR_CUR_HASH_ADAPT + , ahi +#endif /* BTR_CUR_HASH_ADAPT */ + ) != DB_SUCCESS) { + return true; + } + + buf_page_free(space, page_no, mtr); + + n = fseg_find_last_used_frag_page_slot(inode); + + if (n == ULINT_UNDEFINED) { + /* Freeing completed: free the segment inode */ + fsp_free_seg_inode(space, inode, iblock, mtr); + + return true; + } + + return false; +} + +bool +fseg_free_step_not_header( + fseg_header_t* header, + mtr_t* mtr +#ifdef BTR_CUR_HASH_ADAPT + ,bool ahi +#endif /* BTR_CUR_HASH_ADAPT */ + ) +{ + fseg_inode_t* inode; + + const uint32_t space_id = page_get_space_id(page_align(header)); + ut_ad(mtr->is_named_space(space_id)); + + fil_space_t* space = mtr->x_lock_space(space_id); + buf_block_t* iblock; + + inode = fseg_inode_try_get(header, space_id, space->zip_size(), + mtr, &iblock); + if (space->is_stopping()) { + return true; + } + + if (!inode) { + ib::warn() << "Double free of " + << page_id_t(space_id, + page_get_page_no(page_align(header))); + return true; + } + + if (!space->full_crc32()) { + fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr); + } + + dberr_t err; + if (xdes_t* descr = fseg_get_first_extent(inode, space, mtr, &err)) { + /* Free the extent held by the segment */ + return fseg_free_extent(inode, iblock, space, + xdes_get_offset(descr), + mtr +#ifdef BTR_CUR_HASH_ADAPT + , ahi +#endif /* BTR_CUR_HASH_ADAPT */ + ) != DB_SUCCESS; + } else if (err != DB_SUCCESS) { + return true; + } + + /* Free a frag page */ + + ulint n = fseg_find_last_used_frag_page_slot(inode); + + if (UNIV_UNLIKELY(n == ULINT_UNDEFINED)) { + return true; + } + + uint32_t page_no = fseg_get_nth_frag_page_no(inode, n); + + if (page_no == page_get_page_no(page_align(header))) { + return true; + } + + if (fseg_free_page_low(inode, iblock, space, page_no, mtr +#ifdef BTR_CUR_HASH_ADAPT + , ahi +#endif /* BTR_CUR_HASH_ADAPT */ + ) != DB_SUCCESS) { + return true; + } + buf_page_free(space, page_no, mtr); 
+ return false; +} + +/** Returns the first extent descriptor for a segment. +We think of the extent lists of the segment catenated in the order +FSEG_FULL -> FSEG_NOT_FULL -> FSEG_FREE. +@param[in] inode segment inode +@param[in] space tablespace +@param[in,out] mtr mini-transaction +@return the first extent descriptor +@retval nullptr if none, or on corruption */ +MY_ATTRIBUTE((nonnull, warn_unused_result)) +static +xdes_t* +fseg_get_first_extent( + fseg_inode_t* inode, + const fil_space_t* space, + mtr_t* mtr, + dberr_t* err) +{ + if (UNIV_UNLIKELY(space->id != page_get_space_id(page_align(inode)) || + memcmp(inode + FSEG_MAGIC_N, FSEG_MAGIC_N_BYTES, 4))) + { + corrupted: + *err= DB_CORRUPTION; + return nullptr; + } + + fil_addr_t first; + + if (flst_get_len(inode + FSEG_FULL)) + first= flst_get_first(inode + FSEG_FULL); + else if (flst_get_len(inode + FSEG_NOT_FULL)) + first= flst_get_first(inode + FSEG_NOT_FULL); + else if (flst_get_len(inode + FSEG_FREE)) + first= flst_get_first(inode + FSEG_FREE); + else + { + *err= DB_SUCCESS; + return nullptr; + } + + if (first.page == FIL_NULL) + goto corrupted; + + return xdes_lst_get_descriptor(*space, first, mtr, nullptr, err); +} + +#ifdef UNIV_BTR_PRINT +/*******************************************************************//** +Writes info of a segment. 
*/ +static void fseg_print_low(const fseg_inode_t *inode) +{ + ulint space; + ulint n_used; + ulint n_frag; + ulint n_free; + ulint n_not_full; + ulint n_full; + ulint reserved; + ulint used; + ulint page_no; + ib_id_t seg_id; + + space = page_get_space_id(page_align(inode)); + page_no = page_get_page_no(page_align(inode)); + + reserved = fseg_n_reserved_pages_low(inode, &used); + + seg_id = mach_read_from_8(inode + FSEG_ID); + n_used = mach_read_from_4(inode + FSEG_NOT_FULL_N_USED); + n_frag = fseg_get_n_frag_pages(inode); + n_free = flst_get_len(inode + FSEG_FREE); + n_not_full = flst_get_len(inode + FSEG_NOT_FULL); + n_full = flst_get_len(inode + FSEG_FULL); + + ib::info() << "SEGMENT id " << seg_id + << " space " << space << ";" + << " page " << page_no << ";" + << " res " << reserved << " used " << used << ";" + << " full ext " << n_full << ";" + << " fragm pages " << n_frag << ";" + << " free extents " << n_free << ";" + << " not full extents " << n_not_full << ": pages " << n_used; + + ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4)); +} + +/*******************************************************************//** +Writes info of a segment. 
*/ +void +fseg_print( +/*=======*/ + fseg_header_t* header, /*!< in: segment header */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + const fil_space_t *space= + mtr->x_lock_space(page_get_space_id(page_align(header))); + buf_block_t *block; + if (fseg_inode_t *inode= + fseg_inode_try_get(header, space->id, space->zip_size(), mtr, &block)) + fseg_print_low(inode); +} +#endif /* UNIV_BTR_PRINT */ + +#ifdef UNIV_DEBUG +std::ostream &fseg_header::to_stream(std::ostream &out) const +{ + out << "[fseg_header_t: space=" + << mach_read_from_4(m_header + FSEG_HDR_SPACE) + << ", page=" << mach_read_from_4(m_header + FSEG_HDR_PAGE_NO) + << ", offset=" << mach_read_from_2(m_header + FSEG_HDR_OFFSET) << "]"; + return out; +} +#endif /* UNIV_DEBUG */ diff --git a/storage/innobase/fsp/fsp0space.cc b/storage/innobase/fsp/fsp0space.cc new file mode 100644 index 00000000..c2152b08 --- /dev/null +++ b/storage/innobase/fsp/fsp0space.cc @@ -0,0 +1,224 @@ +/***************************************************************************** + +Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file fsp/fsp0space.cc +Shared tablespace implementation. + +Created 2012-11-16 by Sunny Bains as srv/srv0space.cc +*******************************************************/ + +#include "fsp0sysspace.h" +#include "fsp0fsp.h" +#include "os0file.h" +#include "my_sys.h" + +/** Check if two tablespaces have common data file names. +@param other_space Tablespace to check against this. +@return true if they have the same data filenames and paths */ +bool +Tablespace::intersection( + const Tablespace* other_space) +{ + for (files_t::const_iterator it(other_space->begin()), + end(other_space->end()); it != end; ++it) { + + if (find(it->m_filename)) { + + return(true); + } + } + + return(false); +} + +/** Frees the memory allocated by the SysTablespace object. */ +void +Tablespace::shutdown() +{ + for (iterator it = begin(); it != end(); ++it) { + it->shutdown(); + } + + m_files.clear(); + ut_free(m_path); + m_path = NULL; + m_space_id = UINT32_MAX; +} + +/** Note that the data file was found. +@param[in,out] file Data file object to set */ +void +Tablespace::file_found(Datafile& file) +{ + /* Note that the file exists and can be opened + in the appropriate mode. */ + file.m_exists = true; + + file.set_open_flags( + &file == &m_files.front() + ? OS_FILE_OPEN_RETRY : OS_FILE_OPEN); +} + +/** Open or Create the data files if they do not exist. 
+@param[in] is_temp whether this is a temporary tablespace +@return DB_SUCCESS or error code */ +dberr_t +Tablespace::open_or_create(bool is_temp) +{ + fil_space_t* space = NULL; + dberr_t err = DB_SUCCESS; + + ut_ad(!m_files.empty()); + + for (iterator it = begin(); it != end(); ++it) { + if (it->m_exists) { + err = it->open_or_create( + m_ignore_read_only + ? false : srv_read_only_mode); + if (err != DB_SUCCESS) { + return err; + } + } else { + err = it->open_or_create( + m_ignore_read_only + ? false : srv_read_only_mode); + + if (err != DB_SUCCESS) { + return err; + } + + /* Set the correct open flags now that we have + successfully created the file. */ + file_found(*it); + } + + /* We can close the handle now and open the tablespace + the proper way. */ + it->close(); + + if (it == begin()) { + /* First data file. */ + + /* Create the tablespace entry for the multi-file + tablespace in the tablespace manager. */ + uint32_t fsp_flags; + + switch (srv_checksum_algorithm) { + case SRV_CHECKSUM_ALGORITHM_FULL_CRC32: + case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32: + fsp_flags = (FSP_FLAGS_FCRC32_MASK_MARKER + | FSP_FLAGS_FCRC32_PAGE_SSIZE()); + break; + default: + fsp_flags = FSP_FLAGS_PAGE_SSIZE(); + } + + mysql_mutex_lock(&fil_system.mutex); + space = fil_space_t::create( + m_space_id, fsp_flags, + is_temp + ? 
FIL_TYPE_TEMPORARY : FIL_TYPE_TABLESPACE, + NULL); + if (!space) { + mysql_mutex_unlock(&fil_system.mutex); + return DB_ERROR; + } + } else { + mysql_mutex_lock(&fil_system.mutex); + } + space->add(it->m_filepath, OS_FILE_CLOSED, it->m_size, + false, true); + mysql_mutex_unlock(&fil_system.mutex); + } + + return(err); +} + +/** Find a filename in the list of Datafiles for a tablespace +@return true if the filename exists in the data files */ +bool +Tablespace::find(const char* filename) const +{ + for (const_iterator it = begin(); it != end(); ++it) { + + if (innobase_strcasecmp(filename, it->m_filename) == 0) { + return(true); + } + } + + return(false); +} + +/** Delete all the data files. */ +void +Tablespace::delete_files() +{ + for (iterator it = begin(); it != end(); ++it) { + + it->close(); + + bool file_pre_exists; + bool success = os_file_delete_if_exists( + innodb_data_file_key, it->m_filepath, &file_pre_exists); + + if (success && file_pre_exists) { + ib::info() << "Removed temporary tablespace data" + " file: \"" << it->m_filepath << "\""; + } + } +} + +/** Use the ADD DATAFILE path to create a Datafile object and add it to the +front of m_files. +Parse the datafile path into a path and a filename with extension 'ibd'. +This datafile_path provided may or may not be an absolute path, but it +must end with the extension .ibd and have a basename of at least 1 byte. + +Set tablespace m_path member and add a Datafile with the filename. +@param[in] datafile_path full path of the tablespace file. */ +dberr_t Tablespace::add_datafile(const char *filepath) +{ + /* The path provided ends in ".ibd". This was assured by + validate_create_tablespace_info() */ + ut_d(const char* dot = strrchr(filepath, '.')); + ut_ad(dot != NULL && 0 == strcmp(dot, DOT_IBD)); + + /* If the path is an absolute path, separate it onto m_path and a + basename. For relative paths, make the whole thing a basename so that + it can be appended to the datadir. 
*/ + bool is_abs_path = is_absolute_path(filepath); + size_t dirlen = (is_abs_path ? dirname_length(filepath) : 0); + const char* basename = filepath + dirlen; + + /* If the pathname contains a directory separator, fill the + m_path member which is the default directory for files in this + tablespace. Leave it null otherwise. */ + if (dirlen > 0) { + set_path(filepath, dirlen); + } + + /* Now add a new Datafile and set the filepath + using the m_path created above. */ + m_files.push_back(Datafile(m_flags, FIL_IBD_FILE_INITIAL_SIZE, 0)); + m_files.back().make_filepath(m_path, {basename, strlen(basename) - 4}, + IBD); + + return(DB_SUCCESS); +} diff --git a/storage/innobase/fsp/fsp0sysspace.cc b/storage/innobase/fsp/fsp0sysspace.cc new file mode 100644 index 00000000..e4a43e48 --- /dev/null +++ b/storage/innobase/fsp/fsp0sysspace.cc @@ -0,0 +1,1019 @@ +/***************************************************************************** + +Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file fsp/fsp0space.cc +Multi file, shared, system tablespace implementation. 
+ +Created 2012-11-16 by Sunny Bains as srv/srv0space.cc +Refactored 2013-7-26 by Kevin Lewis +*******************************************************/ + +#include "fsp0sysspace.h" +#include "srv0start.h" +#include "trx0sys.h" +#include "dict0load.h" +#include "mem0mem.h" +#include "os0file.h" +#include "row0mysql.h" +#include "buf0dblwr.h" + +/** The server header file is included to access opt_initialize global variable. +If server passes the option for create/open DB to SE, we should remove such +direct reference to server header and global variable */ +#include "mysqld.h" + +/** The control info of the system tablespace. */ +SysTablespace srv_sys_space; + +/** The control info of a temporary table shared tablespace. */ +SysTablespace srv_tmp_space; + +/** If the last data file is auto-extended, we add this many pages to it +at a time. We have to make this public because it is a config variable. */ +uint sys_tablespace_auto_extend_increment; + +/** Convert a numeric string that optionally ends in G or M or K, + to a number containing megabytes. +@param[in] str String with a quantity in bytes +@param[out] megs The number in megabytes +@return next character in string */ +char* +SysTablespace::parse_units( + char* ptr, + ulint* megs) +{ + char* endp; + + *megs = strtoul(ptr, &endp, 10); + + ptr = endp; + + switch (*ptr) { + case 'G': case 'g': + *megs *= 1024; + /* fall through */ + case 'M': case 'm': + ++ptr; + break; + case 'K': case 'k': + *megs /= 1024; + ++ptr; + break; + default: + *megs /= 1024 * 1024; + break; + } + + return(ptr); +} + +/** Parse the input params and populate member variables. 
+@param[in] filepath path to data files +@param[in] supports_raw true if the tablespace supports raw devices +@return true on success parse */ +bool +SysTablespace::parse_params( + const char* filepath_spec, + bool supports_raw) +{ + char* filepath; + ulint size; + char* input_str; + ulint n_files = 0; + + ut_ad(m_last_file_size_max == 0); + ut_ad(!m_auto_extend_last_file); + + char* new_str = mem_strdup(filepath_spec); + char* str = new_str; + + input_str = str; + + /*---------------------- PASS 1 ---------------------------*/ + /* First calculate the number of data files and check syntax: + filepath:size[K |M | G];filepath:size[K |M | G]... . + Note that a Windows path may contain a drive name and a ':'. */ + while (*str != '\0') { + filepath = str; + + while ((*str != ':' && *str != '\0') + || (*str == ':' + && (*(str + 1) == '\\' || *(str + 1) == '/' + || *(str + 1) == ':'))) { + str++; + } + + if (*str == '\0') { + ut_free(new_str); + + ib::error() + << "syntax error in file path or size" + " specified is less than 1 megabyte"; + return(false); + } + + str++; + + str = parse_units(str, &size); + + if (0 == strncmp(str, ":autoextend", + (sizeof ":autoextend") - 1)) { + + str += (sizeof ":autoextend") - 1; + + if (0 == strncmp(str, ":max:", + (sizeof ":max:") - 1)) { + + str += (sizeof ":max:") - 1; + + str = parse_units(str, &size); + } + + if (*str != '\0') { + ut_free(new_str); + ib::error() + << "syntax error in file path or" + << " size specified is less than" + << " 1 megabyte"; + return(false); + } + } + + if (::strlen(str) >= 6 + && *str == 'n' + && *(str + 1) == 'e' + && *(str + 2) == 'w') { + + if (!supports_raw) { + ib::error() + << "Tablespace doesn't support raw" + " devices"; + ut_free(new_str); + return(false); + } + + str += 3; + } + + if (*str == 'r' && *(str + 1) == 'a' && *(str + 2) == 'w') { + str += 3; + + if (!supports_raw) { + ib::error() + << "Tablespace doesn't support raw" + " devices"; + ut_free(new_str); + return(false); + } + } + + 
if (size == 0) { + + ut_free(new_str); + + ib::error() + << "syntax error in file path or size" + " specified is less than 1 megabyte"; + + return(false); + } + + ++n_files; + + if (*str == ';') { + str++; + } else if (*str != '\0') { + ut_free(new_str); + + ib::error() + << "syntax error in file path or size" + " specified is less than 1 megabyte"; + return(false); + } + } + + if (n_files == 0) { + + /* filepath_spec must contain at least one data file + definition */ + + ut_free(new_str); + + ib::error() + << "syntax error in file path or size specified" + " is less than 1 megabyte"; + + return(false); + } + + /*---------------------- PASS 2 ---------------------------*/ + /* Then store the actual values to our arrays */ + str = input_str; + ulint order = 0; + + while (*str != '\0') { + filepath = str; + + /* Note that we must step over the ':' in a Windows filepath; + a Windows path normally looks like C:\ibdata\ibdata1:1G, but + a Windows raw partition may have a specification like + \\.\C::1Gnewraw or \\.\PHYSICALDRIVE2:1Gnewraw */ + + while ((*str != ':' && *str != '\0') + || (*str == ':' + && (*(str + 1) == '\\' || *(str + 1) == '/' + || *(str + 1) == ':'))) { + str++; + } + + if (*str == ':') { + /* Make filepath a null-terminated string */ + *str = '\0'; + str++; + } + + str = parse_units(str, &size); + + if (0 == strncmp(str, ":autoextend", + (sizeof ":autoextend") - 1)) { + + m_auto_extend_last_file = true; + + str += (sizeof ":autoextend") - 1; + + if (0 == strncmp(str, ":max:", + (sizeof ":max:") - 1)) { + + str += (sizeof ":max:") - 1; + + str = parse_units(str, &m_last_file_size_max); + } + + if (*str != '\0') { + ut_free(new_str); + ib::error() << "syntax error in file path or" + " size specified is less than 1" + " megabyte"; + return(false); + } + } + + m_files.push_back(Datafile(flags(), uint32_t(size), order)); + m_files.back().make_filepath(path(), + {filepath, strlen(filepath)}, + NO_EXT); + + if (::strlen(str) >= 6 + && *str == 'n' + && *(str 
+ 1) == 'e' + && *(str + 2) == 'w') { + + ut_a(supports_raw); + + str += 3; + + /* Initialize new raw device only during initialize */ + /* JAN: TODO: MySQL 5.7 used opt_initialize */ + m_files.back().m_type = + opt_bootstrap ? SRV_NEW_RAW : SRV_OLD_RAW; + } + + if (*str == 'r' && *(str + 1) == 'a' && *(str + 2) == 'w') { + + ut_a(supports_raw); + + str += 3; + + /* Initialize new raw device only during initialize */ + if (m_files.back().m_type == SRV_NOT_RAW) { + /* JAN: TODO: MySQL 5.7 used opt_initialize */ + m_files.back().m_type = + opt_bootstrap ? SRV_NEW_RAW : SRV_OLD_RAW; + } + } + + if (*str == ';') { + ++str; + } + order++; + } + + ut_ad(n_files == ulint(m_files.size())); + + ut_free(new_str); + + return(true); +} + +/** Frees the memory allocated by the parse method. */ +void +SysTablespace::shutdown() +{ + Tablespace::shutdown(); + + m_auto_extend_last_file = 0; + m_last_file_size_max = 0; + m_created_new_raw = 0; + m_is_tablespace_full = false; + m_sanity_checks_done = false; +} + +/** Verify the size of the physical file. +@param[in] file data file object +@return DB_SUCCESS if OK else error code. */ +dberr_t +SysTablespace::check_size( + Datafile& file) +{ + os_offset_t size = os_file_get_size(file.m_handle); + ut_a(size != (os_offset_t) -1); + + /* Under some error conditions like disk full scenarios + or file size reaching filesystem limit the data file + could contain an incomplete extent at the end. When we + extend a data file and if some failure happens, then + also the data file could contain an incomplete extent. 
+ So we need to round the size downward to a megabyte.*/ + + const uint32_t rounded_size_pages = static_cast<uint32_t>( + size >> srv_page_size_shift); + + /* If last file */ + if (&file == &m_files.back() && m_auto_extend_last_file) { + + if (file.m_size > rounded_size_pages + || (m_last_file_size_max > 0 + && m_last_file_size_max < rounded_size_pages)) { + ib::error() << "The Auto-extending data file '" + << file.filepath() + << "' is of a different size " + << rounded_size_pages + << " pages than specified" + " by innodb_data_file_path"; + return(DB_ERROR); + } + + file.m_size = rounded_size_pages; + } + + if (rounded_size_pages != file.m_size) { + ib::error() << "The data file '" + << file.filepath() << "' is of a different size " + << rounded_size_pages << " pages" + " than the " << file.m_size << " pages specified by" + " innodb_data_file_path"; + return(DB_ERROR); + } + + return(DB_SUCCESS); +} + +/** Set the size of the file. +@param[in] file data file object +@return DB_SUCCESS or error code */ +dberr_t +SysTablespace::set_size( + Datafile& file) +{ + ut_ad(!srv_read_only_mode || m_ignore_read_only); + const ib::bytes_iec b{uint64_t{file.m_size} << srv_page_size_shift}; + + /* We created the data file and now write it full of zeros */ + ib::info() << "Setting file '" << file.filepath() << "' size to " << b + << ". Physically writing the file full; Please wait ..."; + + bool success = os_file_set_size( + file.m_filepath, file.m_handle, + static_cast<os_offset_t>(file.m_size) << srv_page_size_shift); + + if (success) { + ib::info() << "File '" << file.filepath() << "' size is now " + << b + << "."; + } else { + ib::error() << "Could not set the file size of '" + << file.filepath() << "'. Probably out of disk space"; + + return(DB_ERROR); + } + + return(DB_SUCCESS); +} + +/** Create a data file. 
+@param[in] file data file object +@return DB_SUCCESS or error code */ +dberr_t +SysTablespace::create_file( + Datafile& file) +{ + dberr_t err = DB_SUCCESS; + + ut_a(!file.m_exists); + ut_ad(!srv_read_only_mode || m_ignore_read_only); + + switch (file.m_type) { + case SRV_NEW_RAW: + + /* The partition is opened, not created; then it is + written over */ + m_created_new_raw = true; + + /* Fall through. */ + + case SRV_OLD_RAW: + + srv_start_raw_disk_in_use = TRUE; + + /* Fall through. */ + + case SRV_NOT_RAW: + err = file.open_or_create( + !m_ignore_read_only && srv_read_only_mode); + break; + } + + if (err != DB_SUCCESS) { + return err; + } + + switch (file.m_type) { + case SRV_OLD_RAW: + break; + case SRV_NOT_RAW: +#ifndef _WIN32 + if (!space_id() && my_disable_locking + && os_file_lock(file.m_handle, file.m_filepath)) { + err = DB_ERROR; + break; + } +#endif + /* fall through */ + case SRV_NEW_RAW: + err = set_size(file); + } + + return(err); +} + +/** Open a data file. +@param[in] file data file object +@return DB_SUCCESS or error code */ +dberr_t +SysTablespace::open_file( + Datafile& file) +{ + dberr_t err = DB_SUCCESS; + + ut_a(file.m_exists); + + switch (file.m_type) { + case SRV_NEW_RAW: + /* The partition is opened, not created; then it is + written over */ + m_created_new_raw = true; + + /* Fall through */ + + case SRV_OLD_RAW: + srv_start_raw_disk_in_use = TRUE; + + if (srv_read_only_mode && !m_ignore_read_only) { + ib::error() << "Can't open a raw device '" + << file.m_filepath << "' when" + " --innodb-read-only is set"; + + return(DB_ERROR); + } + + /* Fall through */ + + case SRV_NOT_RAW: + err = file.open_or_create( + !m_ignore_read_only && srv_read_only_mode); + + if (err != DB_SUCCESS) { + return(err); + } + break; + } + + switch (file.m_type) { + case SRV_NEW_RAW: + /* Set file size for new raw device. 
*/ + err = set_size(file); + break; + + case SRV_NOT_RAW: +#ifndef _WIN32 + if (!space_id() && (m_ignore_read_only || !srv_read_only_mode) + && my_disable_locking + && os_file_lock(file.m_handle, file.m_filepath)) { + err = DB_ERROR; + break; + } +#endif + /* Check file size for existing file. */ + err = check_size(file); + break; + + case SRV_OLD_RAW: + err = DB_SUCCESS; + break; + + } + + if (err != DB_SUCCESS) { + file.close(); + } + + return(err); +} + +/** Check the tablespace header for this tablespace. +@return DB_SUCCESS or error code */ +inline dberr_t SysTablespace::read_lsn_and_check_flags() +{ + dberr_t err; + + files_t::iterator it = m_files.begin(); + + ut_a(it->m_exists); + + if (it->m_handle == OS_FILE_CLOSED) { + + err = it->open_or_create( + m_ignore_read_only ? false : srv_read_only_mode); + + if (err != DB_SUCCESS) { + return(err); + } + } + + err = it->read_first_page( + m_ignore_read_only ? false : srv_read_only_mode); + + if (err != DB_SUCCESS) { + return(err); + } + + ut_a(it->order() == 0); + + if (srv_operation <= SRV_OPERATION_EXPORT_RESTORED) { + buf_dblwr.init_or_load_pages(it->handle(), it->filepath()); + } + + /* Check the contents of the first page of the + first datafile. */ + for (int retry = 0; retry < 2; ++retry) { + + err = it->validate_first_page(); + + if (err != DB_SUCCESS + && (retry == 1 + || recv_sys.dblwr.restore_first_page( + it->m_space_id, it->m_filepath, + it->handle()))) { + + it->close(); + + return(err); + } + } + + /* Make sure the tablespace space ID matches the + space ID on the first page of the first datafile. */ + if (space_id() != it->m_space_id) { + + ib::error() + << "The data file '" << it->filepath() + << "' has the wrong space ID. It should be " + << space_id() << ", but " << it->m_space_id + << " was found"; + + it->close(); + + return(err); + } + + if (srv_operation == SRV_OPERATION_NORMAL) { + /* Prepare for possible upgrade from 0-sized ib_logfile0. 
*/ + ut_ad(!log_sys.next_checkpoint_lsn); + log_sys.next_checkpoint_lsn = mach_read_from_8( + it->m_first_page + 26/*FIL_PAGE_FILE_FLUSH_LSN*/); + } + + it->close(); + + return(DB_SUCCESS); +} + +/** Check if a file can be opened in the correct mode. +@param[in] file data file object +@param[out] reason exact reason if file_status check failed. +@return DB_SUCCESS or error code. */ +dberr_t +SysTablespace::check_file_status( + const Datafile& file, + file_status_t& reason) +{ + os_file_stat_t stat; + + memset(&stat, 0x0, sizeof(stat)); + + dberr_t err = os_file_get_status( + file.m_filepath, &stat, true, + m_ignore_read_only ? false : srv_read_only_mode); + + reason = FILE_STATUS_VOID; + /* File exists but we can't read the rw-permission settings. */ + switch (err) { + case DB_FAIL: + ib::error() << "os_file_get_status() failed on '" + << file.filepath() + << "'. Can't determine file permissions"; + err = DB_ERROR; + reason = FILE_STATUS_RW_PERMISSION_ERROR; + break; + + case DB_SUCCESS: + /* Note: stat.rw_perm is only valid for "regular" files */ + + if (stat.type == OS_FILE_TYPE_FILE) { + if (!stat.rw_perm) { + ib::error() << "The data file" + << " '" << file.filepath() + << ((!srv_read_only_mode + || m_ignore_read_only) + ? "' must be writable" + : "' must be readable"); + + err = DB_ERROR; + reason = FILE_STATUS_READ_WRITE_ERROR; + } + + } else { + /* Not a regular file, bail out. */ + ib::error() << "The data file '" << file.filepath() + << "' is not a regular file."; + + err = DB_ERROR; + reason = FILE_STATUS_NOT_REGULAR_FILE_ERROR; + } + break; + + case DB_NOT_FOUND: + break; + + default: + ut_ad(0); + } + + return(err); +} + +/** Note that the data file was not found. 
+@param[in] file data file object +@param[out] create_new_db true if a new instance to be created +@return DB_SUCESS or error code */ +dberr_t +SysTablespace::file_not_found( + Datafile& file, + bool* create_new_db) +{ + file.m_exists = false; + + if (m_ignore_read_only) { + } else if (srv_read_only_mode) { + ib::error() << "Can't create file '" << file.filepath() + << "' when --innodb-read-only is set"; + return(DB_ERROR); + } else if (srv_force_recovery && space_id() == TRX_SYS_SPACE) { + ib::error() << "Can't create file '" << file.filepath() + << "' when --innodb-force-recovery is set"; + return DB_ERROR; + } + + if (&file == &m_files.front()) { + + /* First data file. */ + ut_a(!*create_new_db); + *create_new_db = TRUE; + + if (space_id() == TRX_SYS_SPACE) { + ib::info() << "The first data file '" + << file.filepath() << "' did not exist." + " A new tablespace will be created!"; + } + + } else { + ib::info() << "Need to create a new data file '" + << file.filepath() << "'."; + } + + /* Set the file create mode. */ + switch (file.m_type) { + case SRV_NOT_RAW: + file.set_open_flags(OS_FILE_CREATE); + break; + + case SRV_NEW_RAW: + case SRV_OLD_RAW: + file.set_open_flags(OS_FILE_OPEN_RAW); + break; + } + + return(DB_SUCCESS); +} + +/** Note that the data file was found. +@param[in,out] file data file object +@return true if a new instance to be created */ +bool +SysTablespace::file_found( + Datafile& file) +{ + /* Note that the file exists and can be opened + in the appropriate mode. */ + file.m_exists = true; + + /* Set the file open mode */ + switch (file.m_type) { + case SRV_NOT_RAW: + file.set_open_flags( + &file == &m_files.front() + ? OS_FILE_OPEN_RETRY : OS_FILE_OPEN); + break; + + case SRV_NEW_RAW: + case SRV_OLD_RAW: + file.set_open_flags(OS_FILE_OPEN_RAW); + break; + } + + /* Need to create the system tablespace for new raw device. */ + return(file.m_type == SRV_NEW_RAW); +} + +/** Check the data file specification. 
+@param[out] create_new_db true if a new database is to be created +@param[in] min_expected_size Minimum expected tablespace size in bytes +@return DB_SUCCESS if all OK else error code */ +dberr_t +SysTablespace::check_file_spec( + bool* create_new_db, + ulint min_expected_size) +{ + *create_new_db = FALSE; + + if (m_files.size() >= 1000) { + ib::error() << "There must be < 1000 data files " + " but " << m_files.size() << " have been" + " defined."; + + return(DB_ERROR); + } + + if (!m_auto_extend_last_file + && get_sum_of_sizes() + < (min_expected_size >> srv_page_size_shift)) { + ib::error() << "Tablespace size must be at least " + << (min_expected_size >> 20) << " MB"; + return(DB_ERROR); + } + + dberr_t err = DB_SUCCESS; + + ut_a(!m_files.empty()); + + /* If there is more than one data file and the last data file + doesn't exist, that is OK. We allow adding of new data files. */ + + files_t::iterator begin = m_files.begin(); + files_t::iterator end = m_files.end(); + + for (files_t::iterator it = begin; it != end; ++it) { + + file_status_t reason_if_failed; + err = check_file_status(*it, reason_if_failed); + + if (err == DB_NOT_FOUND) { + + err = file_not_found(*it, create_new_db); + + if (err != DB_SUCCESS) { + break; + } + + } else if (err != DB_SUCCESS) { + if (reason_if_failed == FILE_STATUS_READ_WRITE_ERROR) { + ib::error() << "The data file '" + << it->filepath() + << ((!srv_read_only_mode + || m_ignore_read_only) + ? 
"' must be writable" + : "' must be readable"); + } + + ut_a(err != DB_FAIL); + break; + + } else if (*create_new_db) { + ib::error() << "The data file '" + << begin->filepath() + << "' was not found but" + " one of the other data files '" + << it->filepath() << "' exists."; + + err = DB_ERROR; + break; + + } else { + *create_new_db = file_found(*it); + } + } + + return(err); +} + +/** Open or create the data files +@param[in] is_temp whether this is a temporary tablespace +@param[in] create_new_db whether we are creating a new database +@param[out] sum_new_sizes sum of sizes of the new files added +@return DB_SUCCESS or error code */ +dberr_t +SysTablespace::open_or_create( + bool is_temp, + bool create_new_db, + ulint* sum_new_sizes) +{ + dberr_t err = DB_SUCCESS; + fil_space_t* space = NULL; + + ut_ad(!m_files.empty()); + + if (sum_new_sizes) { + *sum_new_sizes = 0; + } + + files_t::iterator begin = m_files.begin(); + files_t::iterator end = m_files.end(); + + ut_ad(begin->order() == 0); + + for (files_t::iterator it = begin; it != end; ++it) { + + if (it->m_exists) { + err = open_file(*it); + + /* For new raw device increment new size. */ + if (sum_new_sizes && it->m_type == SRV_NEW_RAW) { + + *sum_new_sizes += it->m_size; + } + + } else { + err = create_file(*it); + + if (sum_new_sizes) { + *sum_new_sizes += it->m_size; + } + + /* Set the correct open flags now that we have + successfully created the file. */ + if (err == DB_SUCCESS) { + /* We ignore new_db OUT parameter here + as the information is known at this stage */ + file_found(*it); + } + } + + if (err != DB_SUCCESS) { + return(err); + } + + } + + if (!create_new_db && space_id() == TRX_SYS_SPACE) { + /* Validate the header page in the first datafile. 
*/ + err = read_lsn_and_check_flags(); + if (err != DB_SUCCESS) { + return(err); + } + } + + /* Close the curent handles, add space and file info to the + fil_system cache and the Data Dictionary, and re-open them + in file_system cache so that they stay open until shutdown. */ + mysql_mutex_lock(&fil_system.mutex); + ulint node_counter = 0; + for (files_t::iterator it = begin; it != end; ++it) { + it->close(); + it->m_exists = true; + + if (it != begin) { + } else if (is_temp) { + ut_ad(space_id() == SRV_TMP_SPACE_ID); + space = fil_space_t::create( + SRV_TMP_SPACE_ID, flags(), + FIL_TYPE_TEMPORARY, NULL); + ut_ad(space == fil_system.temp_space); + if (!space) { + err = DB_ERROR; + break; + } + ut_ad(!space->is_compressed()); + ut_ad(space->full_crc32()); + } else { + ut_ad(space_id() == TRX_SYS_SPACE); + space = fil_space_t::create( + TRX_SYS_SPACE, it->flags(), + FIL_TYPE_TABLESPACE, NULL); + ut_ad(space == fil_system.sys_space); + if (!space) { + err = DB_ERROR; + break; + } + } + + uint32_t max_size = (++node_counter == m_files.size() + ? (m_last_file_size_max == 0 + ? UINT32_MAX + : uint32_t(m_last_file_size_max)) + : it->m_size); + + space->add(it->m_filepath, OS_FILE_CLOSED, it->m_size, + it->m_type != SRV_NOT_RAW, true, max_size); + } + + mysql_mutex_unlock(&fil_system.mutex); + return(err); +} + +/** Normalize the file size, convert from megabytes to number of pages. 
*/ +void +SysTablespace::normalize_size() +{ + files_t::iterator end = m_files.end(); + + for (files_t::iterator it = m_files.begin(); it != end; ++it) { + + it->m_size <<= (20U - srv_page_size_shift); + } + + m_last_file_size_max <<= (20U - srv_page_size_shift); +} + + +/** +@return next increment size */ +uint32_t SysTablespace::get_increment() const +{ + if (m_last_file_size_max == 0) + return get_autoextend_increment(); + + if (!is_valid_size()) + { + ib::error() << "The last data file has a size of " << last_file_size() + << " but the max size allowed is " + << m_last_file_size_max; + } + + return std::min(uint32_t(m_last_file_size_max) - last_file_size(), + get_autoextend_increment()); +} + + +/** +@return true if configured to use raw devices */ +bool +SysTablespace::has_raw_device() +{ + files_t::iterator end = m_files.end(); + + for (files_t::iterator it = m_files.begin(); it != end; ++it) { + + if (it->is_raw_device()) { + return(true); + } + } + + return(false); +} |