summaryrefslogtreecommitdiffstats
path: root/storage/innobase/fsp
diff options
context:
space:
mode:
Diffstat (limited to 'storage/innobase/fsp')
-rw-r--r--storage/innobase/fsp/fsp0file.cc936
-rw-r--r--storage/innobase/fsp/fsp0fsp.cc3070
-rw-r--r--storage/innobase/fsp/fsp0space.cc224
-rw-r--r--storage/innobase/fsp/fsp0sysspace.cc1019
4 files changed, 5249 insertions, 0 deletions
diff --git a/storage/innobase/fsp/fsp0file.cc b/storage/innobase/fsp/fsp0file.cc
new file mode 100644
index 00000000..cafff419
--- /dev/null
+++ b/storage/innobase/fsp/fsp0file.cc
@@ -0,0 +1,936 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fsp/fsp0file.cc
+Tablespace data file implementation
+
+Created 2013-7-26 by Kevin Lewis
+*******************************************************/
+
+#include "fil0fil.h"
+#include "fsp0types.h"
+#include "os0file.h"
+#include "page0page.h"
+#include "srv0start.h"
+#include "log.h"
+
+/** Release the resources. */
+void
+Datafile::shutdown()
+{
+ close();
+
+ free_filepath();
+ free_first_page();
+}
+
+/** Create/open a data file.
+@param[in] read_only_mode if true, then readonly mode checks are enforced.
+@return DB_SUCCESS or error code */
+dberr_t
+Datafile::open_or_create(bool read_only_mode)
+{
+ bool success;
+ ut_a(m_filepath != NULL);
+ ut_ad(m_handle == OS_FILE_CLOSED);
+
+ m_handle = os_file_create(
+ innodb_data_file_key, m_filepath, m_open_flags,
+ OS_FILE_NORMAL, OS_DATA_FILE, read_only_mode, &success);
+
+ if (!success) {
+ m_last_os_error = os_file_get_last_error(true);
+ ib::error() << "Cannot open datafile '" << m_filepath << "'";
+ return(DB_CANNOT_OPEN_FILE);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/** Open a data file in read-only mode to check if it exists so that it
+can be validated.
+@param[in] strict whether to issue error messages
+@return DB_SUCCESS or error code */
+dberr_t
+Datafile::open_read_only(bool strict)
+{
+ bool success = false;
+ ut_ad(m_handle == OS_FILE_CLOSED);
+
+ /* This function can be called for file objects that do not need
+ to be opened, which is the case when the m_filepath is NULL */
+ if (m_filepath == NULL) {
+ return(DB_ERROR);
+ }
+
+ set_open_flags(OS_FILE_OPEN);
+ m_handle = os_file_create_simple_no_error_handling(
+ innodb_data_file_key, m_filepath, m_open_flags,
+ OS_FILE_READ_ONLY, true, &success);
+
+ if (success) {
+ m_exists = true;
+ init_file_info();
+
+ return(DB_SUCCESS);
+ }
+
+ if (strict) {
+ m_last_os_error = os_file_get_last_error(true);
+ ib::error() << "Cannot open datafile for read-only: '"
+ << m_filepath << "' OS error: " << m_last_os_error;
+ }
+
+ return(DB_CANNOT_OPEN_FILE);
+}
+
+/** Open a data file in read-write mode during start-up so that
+doublewrite pages can be restored and then it can be validated.*
+@return DB_SUCCESS or error code */
+inline dberr_t Datafile::open_read_write()
+{
+ bool success = false;
+ ut_ad(m_handle == OS_FILE_CLOSED);
+ ut_ad(!srv_read_only_mode);
+
+ /* This function can be called for file objects that do not need
+ to be opened, which is the case when the m_filepath is NULL */
+ if (m_filepath == NULL) {
+ return(DB_ERROR);
+ }
+
+ set_open_flags(OS_FILE_OPEN);
+ m_handle = os_file_create_simple_no_error_handling(
+ innodb_data_file_key, m_filepath, m_open_flags,
+ OS_FILE_READ_WRITE, false, &success);
+
+ if (!success) {
+ m_last_os_error = os_file_get_last_error(true);
+ ib::error() << "Cannot open datafile for read-write: '"
+ << m_filepath << "'";
+ return(DB_CANNOT_OPEN_FILE);
+ }
+
+ m_exists = true;
+
+ init_file_info();
+
+ return(DB_SUCCESS);
+}
+
+/** Initialize OS specific file info. */
+void
+Datafile::init_file_info()
+{
+#ifdef _WIN32
+ GetFileInformationByHandle((os_file_t)m_handle, &m_file_info);
+#else
+ fstat(m_handle, &m_file_info);
+#endif /* WIN32 */
+}
+
+/** Close a data file.
+@return DB_SUCCESS or error code */
+dberr_t
+Datafile::close()
+{
+ if (m_handle != OS_FILE_CLOSED) {
+ ibool success = os_file_close(m_handle);
+ ut_a(success);
+
+ m_handle = OS_FILE_CLOSED;
+ }
+
+ return(DB_SUCCESS);
+}
+
+/** Make a full filepath from a directory path and a filename.
+Prepend the dirpath to filename using the extension given.
+If dirpath is NULL, prepend the default datadir to filepath.
+Store the result in m_filepath.
+@param dirpath directory path
+@param name tablespace (table) name
+@param ext filename extension */
+void Datafile::make_filepath(const char *dirpath, fil_space_t::name_type name,
+ ib_extention ext)
+{
+ ut_ad(dirpath || name.size());
+ free_filepath();
+ m_filepath= fil_make_filepath(dirpath, name, ext, false);
+ ut_ad(m_filepath);
+ set_filename();
+}
+
+/** Set the filepath by duplicating the filepath sent in. This is the
+name of the file with its extension and absolute or relative path.
+@param[in] filepath filepath to set */
+void
+Datafile::set_filepath(const char* filepath)
+{
+ free_filepath();
+ m_filepath = static_cast<char*>(ut_malloc_nokey(strlen(filepath) + 1));
+ ::strcpy(m_filepath, filepath);
+ set_filename();
+}
+
+/** Free the filepath buffer. */
+void
+Datafile::free_filepath()
+{
+ if (m_filepath != NULL) {
+ ut_free(m_filepath);
+ m_filepath = NULL;
+ m_filename = NULL;
+ }
+}
+
+/** Do a quick test if the filepath provided looks the same as this filepath
+byte by byte. If they are two different looking paths to the same file,
+same_as() will be used to show that after the files are opened.
+@param[in] other filepath to compare with
+@retval true if it is the same filename by byte comparison
+@retval false if it looks different */
+bool
+Datafile::same_filepath_as(
+ const char* other) const
+{
+ return(0 == strcmp(m_filepath, other));
+}
+
+/** Test if another opened datafile is the same file as this object.
+@param[in] other Datafile to compare with
+@return true if it is the same file, else false */
+bool
+Datafile::same_as(
+ const Datafile& other) const
+{
+#ifdef _WIN32
+ return(m_file_info.dwVolumeSerialNumber
+ == other.m_file_info.dwVolumeSerialNumber
+ && m_file_info.nFileIndexHigh
+ == other.m_file_info.nFileIndexHigh
+ && m_file_info.nFileIndexLow
+ == other.m_file_info.nFileIndexLow);
+#else
+ return(m_file_info.st_ino == other.m_file_info.st_ino
+ && m_file_info.st_dev == other.m_file_info.st_dev);
+#endif /* WIN32 */
+}
+
+/** Reads a few significant fields from the first page of the first
+datafile. The Datafile must already be open.
+@param[in] read_only_mode If true, then readonly mode checks are enforced.
+@return DB_SUCCESS or DB_IO_ERROR if page cannot be read */
+dberr_t
+Datafile::read_first_page(bool read_only_mode)
+{
+ if (m_handle == OS_FILE_CLOSED) {
+
+ dberr_t err = open_or_create(read_only_mode);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ }
+
+ /* Align the memory for a possible read from a raw device */
+
+ m_first_page = static_cast<byte*>(
+ aligned_malloc(UNIV_PAGE_SIZE_MAX, srv_page_size));
+
+ dberr_t err = DB_ERROR;
+ size_t page_size = UNIV_PAGE_SIZE_MAX;
+
+ /* Don't want unnecessary complaints about partial reads. */
+
+ while (page_size >= UNIV_PAGE_SIZE_MIN) {
+
+ ulint n_read = 0;
+
+ err = os_file_read(
+ IORequestReadPartial, m_handle, m_first_page, 0,
+ page_size, &n_read);
+
+ if (err == DB_SUCCESS) {
+ break;
+ }
+
+ if (err == DB_IO_ERROR && n_read == 0) {
+ break;
+ }
+ if (err == DB_IO_ERROR && n_read >= UNIV_PAGE_SIZE_MIN) {
+ page_size >>= 1;
+ } else if (srv_operation == SRV_OPERATION_BACKUP) {
+ break;
+ } else {
+ ib::info() << "Cannot read first page of '"
+ << m_filepath << "': " << err;
+ break;
+ }
+ }
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ if (m_order == 0) {
+ if (memcmp_aligned<2>(FIL_PAGE_SPACE_ID + m_first_page,
+ FSP_HEADER_OFFSET + FSP_SPACE_ID
+ + m_first_page, 4)) {
+ ib::error()
+ << "Inconsistent tablespace ID in "
+ << m_filepath;
+ return DB_CORRUPTION;
+ }
+
+ m_space_id = mach_read_from_4(FIL_PAGE_SPACE_ID
+ + m_first_page);
+ m_flags = fsp_header_get_flags(m_first_page);
+ if (!fil_space_t::is_valid_flags(m_flags, m_space_id)) {
+ uint32_t cflags = fsp_flags_convert_from_101(m_flags);
+ if (cflags == UINT32_MAX) {
+ switch (fsp_flags_is_incompatible_mysql(m_flags)) {
+ case 0:
+ sql_print_error("InnoDB: Invalid flags 0x%" PRIx32 " in %s",
+ m_flags, m_filepath);
+ return DB_CORRUPTION;
+ case 3:
+ case 2:
+ sql_print_error("InnoDB: MySQL-8.0 tablespace in %s",
+ m_filepath);
+ break;
+ case 1:
+ sql_print_error("InnoDB: MySQL Encrypted tablespace in %s",
+ m_filepath);
+ break;
+ }
+ sql_print_error("InnoDB: Restart in MySQL for migration/recovery.");
+ return DB_UNSUPPORTED;
+ } else {
+ m_flags = cflags;
+ }
+ }
+ }
+
+ const size_t physical_size = fil_space_t::physical_size(m_flags);
+
+ if (physical_size > page_size) {
+ ib::error() << "File " << m_filepath
+ << " should be longer than "
+ << page_size << " bytes";
+ return(DB_CORRUPTION);
+ }
+
+ return(err);
+}
+
+/** Free the first page from memory when it is no longer needed. */
+void Datafile::free_first_page()
+{
+ aligned_free(m_first_page);
+ m_first_page= nullptr;
+}
+
+/** Validates the datafile and checks that it conforms with the expected
+space ID and flags. The file should exist and be successfully opened
+in order for this function to validate it.
+@param[in] space_id The expected tablespace ID.
+@param[in] flags The expected tablespace flags.
+@retval DB_SUCCESS if tablespace is valid, DB_ERROR if not.
+m_is_valid is also set true on success, else false. */
+dberr_t Datafile::validate_to_dd(uint32_t space_id, uint32_t flags)
+{
+ dberr_t err;
+
+ if (!is_open()) {
+ return DB_ERROR;
+ }
+
+ /* Validate this single-table-tablespace with the data dictionary,
+ but do not compare the DATA_DIR flag, in case the tablespace was
+ remotely located. */
+ err = validate_first_page();
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ flags &= ~FSP_FLAGS_MEM_MASK;
+
+ /* Make sure the datafile we found matched the space ID.
+ If the datafile is a file-per-table tablespace then also match
+ the row format and zip page size. */
+ if (m_space_id == space_id
+ && (fil_space_t::is_flags_equal(flags, m_flags)
+ || fil_space_t::is_flags_equal(m_flags, flags))) {
+ /* Datafile matches the tablespace expected. */
+ return(DB_SUCCESS);
+ }
+
+ /* else do not use this tablespace. */
+ m_is_valid = false;
+
+ ib::error() << "Refusing to load '" << m_filepath << "' (id="
+ << m_space_id << ", flags=" << ib::hex(m_flags)
+ << "); dictionary contains id="
+ << space_id << ", flags=" << ib::hex(flags);
+
+ return(DB_ERROR);
+}
+
+/** Validates this datafile for the purpose of recovery. The file should
+exist and be successfully opened. We initially open it in read-only mode
+because we just want to read the SpaceID. However, if the first page is
+corrupt and needs to be restored from the doublewrite buffer, we will
+reopen it in write mode and ry to restore that page.
+@retval DB_SUCCESS if tablespace is valid, DB_ERROR if not.
+m_is_valid is also set true on success, else false. */
+dberr_t
+Datafile::validate_for_recovery()
+{
+ dberr_t err;
+
+ ut_ad(is_open());
+ ut_ad(!srv_read_only_mode);
+
+ err = validate_first_page();
+
+ switch (err) {
+ case DB_TABLESPACE_EXISTS:
+ break;
+ case DB_SUCCESS:
+ if (!m_defer || !m_space_id) {
+ break;
+ }
+ /* InnoDB should check whether the deferred
+ tablespace page0 can be recovered from
+ double write buffer. InnoDB should try
+ to recover only if m_space_id exists because
+ dblwr pages can be searched via {space_id, 0}.
+ m_space_id is set in read_first_page(). */
+ /* fall through */
+ default:
+ /* Re-open the file in read-write mode Attempt to restore
+ page 0 from doublewrite and read the space ID from a survey
+ of the first few pages. */
+ close();
+ err = open_read_write();
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ if (!m_defer) {
+ err = find_space_id();
+ if (err != DB_SUCCESS || m_space_id == 0) {
+ ib::error() << "Datafile '" << m_filepath
+ << "' is corrupted. Cannot determine "
+ "the space ID from the first 64 pages.";
+ return(err);
+ }
+ }
+
+ if (m_space_id == UINT32_MAX) {
+ return DB_SUCCESS; /* empty file */
+ }
+
+ if (recv_sys.dblwr.restore_first_page(
+ m_space_id, m_filepath, m_handle)) {
+ return m_defer ? err : DB_CORRUPTION;
+ }
+
+ /* Free the previously read first page and then re-validate. */
+ free_first_page();
+ m_defer = false;
+ err = validate_first_page();
+ }
+
+ return(err);
+}
+
+/** Check the consistency of the first page of a datafile when the
+tablespace is opened. This occurs before the fil_space_t is created
+so the Space ID found here must not already be open.
+m_is_valid is set true on success, else false.
+@retval DB_SUCCESS on if the datafile is valid
+@retval DB_CORRUPTION if the datafile is not readable
+@retval DB_TABLESPACE_EXISTS if there is a duplicate space_id */
+dberr_t Datafile::validate_first_page()
+{
+ const char* error_txt = NULL;
+
+ m_is_valid = true;
+
+ if (m_first_page == NULL
+ && read_first_page(srv_read_only_mode) != DB_SUCCESS) {
+
+ error_txt = "Cannot read first page";
+ }
+
+ if (error_txt != NULL) {
+err_exit:
+ free_first_page();
+
+ if (recv_recovery_is_on()
+ || srv_operation == SRV_OPERATION_BACKUP) {
+ m_defer= true;
+ return DB_SUCCESS;
+ }
+
+ ib::info() << error_txt << " in datafile: " << m_filepath
+ << ", Space ID:" << m_space_id << ", Flags: "
+ << m_flags;
+ m_is_valid = false;
+ return(DB_CORRUPTION);
+ }
+
+ /* Check if the whole page is blank. */
+ if (!m_space_id && !m_flags) {
+ const byte* b = m_first_page;
+ ulint nonzero_bytes = srv_page_size;
+
+ while (*b == '\0' && --nonzero_bytes != 0) {
+
+ b++;
+ }
+
+ if (nonzero_bytes == 0) {
+ error_txt = "Header page consists of zero bytes";
+ goto err_exit;
+ }
+ }
+
+ if (!fil_space_t::is_valid_flags(m_flags, m_space_id)) {
+ /* Tablespace flags must be valid. */
+ error_txt = "Tablespace flags are invalid";
+ goto err_exit;
+ }
+
+ ulint logical_size = fil_space_t::logical_size(m_flags);
+
+ if (srv_page_size != logical_size) {
+ free_first_page();
+ if (recv_recovery_is_on()
+ || srv_operation == SRV_OPERATION_BACKUP) {
+ m_defer= true;
+ return DB_SUCCESS;
+ }
+ /* Logical size must be innodb_page_size. */
+ ib::error()
+ << "Data file '" << m_filepath << "' uses page size "
+ << logical_size << ", but the innodb_page_size"
+ " start-up parameter is "
+ << srv_page_size;
+ return(DB_ERROR);
+ }
+
+ if (page_get_page_no(m_first_page) != 0) {
+ /* First page must be number 0 */
+ error_txt = "Header page contains inconsistent data";
+ goto err_exit;
+ }
+
+ if (m_space_id >= SRV_SPACE_ID_UPPER_BOUND) {
+ error_txt = "A bad Space ID was found";
+ goto err_exit;
+ }
+
+ if (buf_page_is_corrupted(false, m_first_page, m_flags)) {
+ /* Look for checksum and other corruptions. */
+ error_txt = "Checksum mismatch";
+ goto err_exit;
+ }
+
+ mysql_mutex_lock(&fil_system.mutex);
+
+ fil_space_t* space = fil_space_get_by_id(m_space_id);
+
+ if (space) {
+ fil_node_t* node = UT_LIST_GET_FIRST(space->chain);
+
+ if (node && !strcmp(m_filepath, node->name)) {
+ok_exit:
+ mysql_mutex_unlock(&fil_system.mutex);
+ return DB_SUCCESS;
+ }
+
+ if (!m_space_id
+ && (recv_recovery_is_on()
+ || srv_operation == SRV_OPERATION_BACKUP)) {
+ m_defer= true;
+ goto ok_exit;
+ }
+
+ /* Make sure the space_id has not already been opened. */
+ ib::error() << "Attempted to open a previously opened"
+ " tablespace. Previous tablespace: "
+ << (node ? node->name : "(unknown)")
+ << " uses space ID: " << m_space_id
+ << ". Cannot open filepath: " << m_filepath
+ << " which uses the same space ID.";
+ }
+
+ mysql_mutex_unlock(&fil_system.mutex);
+
+ if (space) {
+ m_is_valid = false;
+
+ free_first_page();
+
+ return(is_predefined_tablespace(m_space_id)
+ ? DB_CORRUPTION
+ : DB_TABLESPACE_EXISTS);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/** Determine the space id of the given file descriptor by reading a few
+pages from the beginning of the .ibd file.
+@return DB_SUCCESS if space id was successfully identified, else DB_ERROR. */
+dberr_t
+Datafile::find_space_id()
+{
+ os_offset_t file_size;
+
+ ut_ad(m_handle != OS_FILE_CLOSED);
+
+ file_size = os_file_get_size(m_handle);
+
+ if (!file_size) {
+ return DB_SUCCESS;
+ }
+
+ if (file_size == (os_offset_t) -1) {
+ ib::error() << "Could not get file size of datafile '"
+ << m_filepath << "'";
+ return(DB_CORRUPTION);
+ }
+
+ /* Assuming a page size, read the space_id from each page and store it
+ in a map. Find out which space_id is agreed on by majority of the
+ pages. Choose that space_id. */
+ for (ulint page_size = UNIV_ZIP_SIZE_MIN;
+ page_size <= UNIV_PAGE_SIZE_MAX;
+ page_size <<= 1) {
+ /* map[space_id] = count of pages */
+ typedef std::map<
+ uint32_t,
+ uint32_t,
+ std::less<uint32_t>,
+ ut_allocator<std::pair<const uint32_t, uint32_t> > >
+ Pages;
+
+ Pages verify;
+ uint32_t page_count = 64;
+ uint32_t valid_pages = 0;
+
+ /* Adjust the number of pages to analyze based on file size */
+ while ((page_count * page_size) > file_size) {
+ --page_count;
+ }
+
+ ib::info()
+ << "Page size:" << page_size
+ << ". Pages to analyze:" << page_count;
+
+ byte* page = static_cast<byte*>(
+ aligned_malloc(page_size, page_size));
+
+ uint32_t fsp_flags;
+ /* provide dummy value if the first os_file_read() fails */
+ switch (srv_checksum_algorithm) {
+ case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
+ fsp_flags = 1U << FSP_FLAGS_FCRC32_POS_MARKER
+ | FSP_FLAGS_FCRC32_PAGE_SSIZE()
+ | uint(innodb_compression_algorithm)
+ << FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO;
+ break;
+ default:
+ fsp_flags = 0;
+ }
+
+ for (ulint j = 0; j < page_count; ++j) {
+ if (os_file_read(IORequestRead, m_handle, page,
+ j * page_size, page_size, nullptr)) {
+ ib::info()
+ << "READ FAIL: page_no:" << j;
+ continue;
+ }
+
+ if (j == 0) {
+ fsp_flags = mach_read_from_4(
+ page + FSP_HEADER_OFFSET + FSP_SPACE_FLAGS);
+ }
+
+ bool noncompressed_ok = false;
+
+ /* For noncompressed pages, the page size must be
+ equal to srv_page_size. */
+ if (page_size == srv_page_size
+ && !fil_space_t::zip_size(fsp_flags)) {
+ noncompressed_ok = !buf_page_is_corrupted(
+ false, page, fsp_flags);
+ }
+
+ bool compressed_ok = false;
+
+ if (srv_page_size <= UNIV_PAGE_SIZE_DEF
+ && page_size == fil_space_t::zip_size(fsp_flags)) {
+ compressed_ok = !buf_page_is_corrupted(
+ false, page, fsp_flags);
+ }
+
+ if (noncompressed_ok || compressed_ok) {
+
+ uint32_t space_id = mach_read_from_4(page
+ + FIL_PAGE_SPACE_ID);
+
+ if (space_id > 0) {
+
+ ib::info()
+ << "VALID: space:"
+ << space_id << " page_no:" << j
+ << " page_size:" << page_size;
+
+ ++valid_pages;
+
+ ++verify[space_id];
+ }
+ }
+ }
+
+ aligned_free(page);
+
+ ib::info()
+ << "Page size: " << page_size
+ << ". Possible space_id count:" << verify.size();
+
+ const ulint pages_corrupted = 3;
+
+ for (ulint missed = 0; missed <= pages_corrupted; ++missed) {
+
+ for (Pages::const_iterator it = verify.begin();
+ it != verify.end();
+ ++it) {
+
+ ib::info() << "space_id:" << it->first
+ << ", Number of pages matched: "
+ << it->second << "/" << valid_pages
+ << " (" << page_size << ")";
+
+ if (it->second == (valid_pages - missed)) {
+ ib::info() << "Chosen space:"
+ << it->first;
+
+ m_space_id = it->first;
+ return(DB_SUCCESS);
+ }
+ }
+
+ }
+ }
+
+ return(DB_CORRUPTION);
+}
+
+/** Read an InnoDB Symbolic Link (ISL) file by name.
+@param link_filepath filepath of the ISL file
+@return data file name (must be freed by the caller)
+@retval nullptr on error */
+static char *read_link_file(const char *link_filepath)
+{
+ if (FILE* file= fopen(link_filepath, "r+b" STR_O_CLOEXEC))
+ {
+ char *filepath= static_cast<char*>(ut_malloc_nokey(OS_FILE_MAX_PATH));
+
+ os_file_read_string(file, filepath, OS_FILE_MAX_PATH);
+ fclose(file);
+
+ if (size_t len= strlen(filepath))
+ {
+ /* Trim whitespace from end of filepath */
+ len--;
+ while (static_cast<byte>(filepath[len]) <= 0x20)
+ {
+ if (!len)
+ return nullptr;
+ filepath[len--]= 0;
+ }
+ /* Ensure that the last 2 path separators are forward slashes,
+ because elsewhere we are assuming that tablespace file names end
+ in "/databasename/tablename.ibd". */
+ unsigned trailing_slashes= 0;
+ for (; len; len--)
+ {
+ switch (filepath[len]) {
+#ifdef _WIN32
+ case '\\':
+ filepath[len]= '/';
+ /* fall through */
+#endif
+ case '/':
+ if (++trailing_slashes >= 2)
+ return filepath;
+ }
+ }
+ }
+ }
+
+ return nullptr;
+}
+
+/** Create a link filename,
+open that file, and read the contents into m_filepath.
+@param name table name
+@return filepath()
+@retval nullptr if the .isl file does not exist or cannot be read */
+const char *RemoteDatafile::open_link_file(const fil_space_t::name_type name)
+{
+ if (!m_link_filepath)
+ m_link_filepath= fil_make_filepath(nullptr, name, ISL, false);
+ m_filepath= read_link_file(m_link_filepath);
+ return m_filepath;
+}
+
+/** Release the resources. */
+void
+RemoteDatafile::shutdown()
+{
+ Datafile::shutdown();
+
+ if (m_link_filepath != 0) {
+ ut_free(m_link_filepath);
+ m_link_filepath = 0;
+ }
+}
+
+/** Create InnoDB Symbolic Link (ISL) file.
+@param name tablespace name
+@param filepath full file name
+@return DB_SUCCESS or error code */
+dberr_t RemoteDatafile::create_link_file(fil_space_t::name_type name,
+ const char *filepath)
+{
+ bool success;
+ dberr_t err = DB_SUCCESS;
+ char* link_filepath = NULL;
+ char* prev_filepath = NULL;
+
+ ut_ad(!srv_read_only_mode);
+
+ link_filepath = fil_make_filepath(NULL, name, ISL, false);
+
+ if (link_filepath == NULL) {
+ return(DB_ERROR);
+ }
+
+ prev_filepath = read_link_file(link_filepath);
+ if (prev_filepath) {
+ /* Truncate (starting with MySQL 5.6, probably no
+ longer since MariaDB Server 10.2.19) used to call this
+ with an existing link file which contains the same filepath. */
+ bool same = !strncmp(prev_filepath, name.data(), name.size())
+ && !strcmp(prev_filepath + name.size(), DOT_IBD);
+ ut_free(prev_filepath);
+ if (same) {
+ ut_free(link_filepath);
+ return(DB_SUCCESS);
+ }
+ }
+
+ /** Check if the file already exists. */
+ FILE* file = NULL;
+ bool exists;
+ os_file_type_t ftype;
+
+ success = os_file_status(link_filepath, &exists, &ftype);
+ ulint error = 0;
+
+ if (success && !exists) {
+
+ file = fopen(link_filepath, "w");
+ if (file == NULL) {
+ /* This call will print its own error message */
+ error = os_file_get_last_error(true);
+ }
+ } else {
+ error = OS_FILE_ALREADY_EXISTS;
+ }
+
+ if (error != 0) {
+
+ ib::error() << "Cannot create file " << link_filepath << ".";
+
+ if (error == OS_FILE_ALREADY_EXISTS) {
+ ib::error() << "The link file: " << link_filepath
+ << " already exists.";
+ err = DB_TABLESPACE_EXISTS;
+
+ } else if (error == OS_FILE_DISK_FULL) {
+ err = DB_OUT_OF_FILE_SPACE;
+
+ } else {
+ err = DB_ERROR;
+ }
+
+ /* file is not open, no need to close it. */
+ ut_free(link_filepath);
+ return(err);
+ }
+
+ const size_t len = strlen(filepath);
+ if (fwrite(filepath, 1, len, file) != len) {
+ error = os_file_get_last_error(true);
+ ib::error() <<
+ "Cannot write link file: "
+ << link_filepath << " filepath: " << filepath;
+ err = DB_ERROR;
+ }
+
+ /* Close the file, we only need it at startup */
+ fclose(file);
+
+ ut_free(link_filepath);
+
+ return(err);
+}
+
+/** Delete an InnoDB Symbolic Link (ISL) file. */
+void
+RemoteDatafile::delete_link_file(void)
+{
+ ut_ad(m_link_filepath != NULL);
+
+ if (m_link_filepath != NULL) {
+ os_file_delete_if_exists(innodb_data_file_key,
+ m_link_filepath, NULL);
+ }
+}
+
+/** Delete an InnoDB Symbolic Link (ISL) file by name.
+@param name tablespace name */
+void RemoteDatafile::delete_link_file(fil_space_t::name_type name)
+{
+ if (char *link_filepath= fil_make_filepath(NULL, name, ISL, false))
+ {
+ os_file_delete_if_exists(innodb_data_file_key, link_filepath, nullptr);
+ ut_free(link_filepath);
+ }
+}
diff --git a/storage/innobase/fsp/fsp0fsp.cc b/storage/innobase/fsp/fsp0fsp.cc
new file mode 100644
index 00000000..6c5c354e
--- /dev/null
+++ b/storage/innobase/fsp/fsp0fsp.cc
@@ -0,0 +1,3070 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file fsp/fsp0fsp.cc
+File space management
+
+Created 11/29/1995 Heikki Tuuri
+***********************************************************************/
+
+#include "fsp0fsp.h"
+#include "buf0buf.h"
+#include "fil0fil.h"
+#include "fil0crypt.h"
+#include "mtr0log.h"
+#include "ut0byte.h"
+#include "page0page.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "ibuf0ibuf.h"
+#include "btr0btr.h"
+#include "btr0sea.h"
+#include "dict0boot.h"
+#include "log0log.h"
+#include "dict0mem.h"
+#include "fsp0types.h"
+#include "log.h"
+
+typedef uint32_t page_no_t;
+
+/** Returns the first extent descriptor for a segment.
+We think of the extent lists of the segment catenated in the order
+FSEG_FULL -> FSEG_NOT_FULL -> FSEG_FREE.
+@param[in] inode segment inode
+@param[in] space tablespace
+@param[in,out] mtr mini-transaction
+@param[out] err error code
+@return the first extent descriptor, or NULL if none */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+static
+xdes_t*
+fseg_get_first_extent(
+ fseg_inode_t* inode,
+ const fil_space_t* space,
+ mtr_t* mtr,
+ dberr_t* err);
+
+ATTRIBUTE_COLD MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Put new extents to the free list if there are free extents above the free
+limit. If an extent happens to contain an extent descriptor page, the extent
+is put to the FSP_FREE_FRAG list with the page marked as used.
+@param[in] init_space true if this is a single-table tablespace
+and we are only initializing the first extent and the first bitmap pages;
+then we will not allocate more extents
+@param[in,out] space tablespace
+@param[in,out] header tablespace header
+@param[in,out] mtr mini-transaction */
+static
+dberr_t
+fsp_fill_free_list(
+ bool init_space,
+ fil_space_t* space,
+ buf_block_t* header,
+ mtr_t* mtr);
+
+/** Allocates a single free page from a segment.
+This function implements the intelligent allocation strategy which tries to
+minimize file space fragmentation.
+@param[in,out] space tablespace
+@param[in,out] seg_inode segment inode
+@param[in,out] iblock segment inode page
+@param[in] hint hint of which page would be desirable
+@param[in] direction if the new page is needed because of
+an index page split, and records are inserted there in order, into which
+direction they go alphabetically: FSP_DOWN, FSP_UP, FSP_NO_DIR
+@param[in,out] mtr mini-transaction
+@param[in,out] init_mtr mtr or another mini-transaction in
+which the page should be initialized.
+@param[out] err error code
+@return the allocated page
+@retval nullptr if no page could be allocated */
+static
+buf_block_t*
+fseg_alloc_free_page_low(
+ fil_space_t* space,
+ fseg_inode_t* seg_inode,
+ buf_block_t* iblock,
+ uint32_t hint,
+ byte direction,
+#ifdef UNIV_DEBUG
+ bool has_done_reservation,
+ /*!< whether the space has already been reserved */
+#endif /* UNIV_DEBUG */
+ mtr_t* mtr,
+ mtr_t* init_mtr,
+ dberr_t* err)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Get the tablespace header block, SX-latched
+@param[in] space tablespace
+@param[in,out] mtr mini-transaction
+@param[out] err error code
+@return pointer to the space header, page x-locked
+@retval nullptr if the page cannot be retrieved or is corrupted */
+static buf_block_t *fsp_get_header(const fil_space_t *space, mtr_t *mtr,
+ dberr_t *err)
+{
+ const page_id_t id{space->id, 0};
+ buf_block_t *block= mtr->get_already_latched(id, MTR_MEMO_PAGE_SX_FIX);
+ if (block)
+ *err= DB_SUCCESS;
+ else
+ {
+ block= buf_page_get_gen(id, space->zip_size(), RW_SX_LATCH,
+ nullptr, BUF_GET_POSSIBLY_FREED,
+ mtr, err);
+ if (block &&
+ space->id != mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_ID +
+ block->page.frame))
+ {
+ *err= DB_CORRUPTION;
+ block= nullptr;
+ }
+ }
+ return block;
+}
+
+/** Set the XDES_FREE_BIT of a page.
+@tparam free desired value of XDES_FREE_BIT
+@param[in] block extent descriptor block
+@param[in,out] descr extent descriptor
+@param[in] offset page offset within the extent
+@param[in,out] mtr mini-transaction */
+template<bool free>
+inline void xdes_set_free(const buf_block_t &block, xdes_t *descr,
+ ulint offset, mtr_t *mtr)
+{
+ ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_SX_FIX |
+ MTR_MEMO_PAGE_X_FIX));
+ ut_ad(offset < FSP_EXTENT_SIZE);
+ ut_ad(page_align(descr) == block.page.frame);
+ compile_time_assert(XDES_BITS_PER_PAGE == 2);
+ compile_time_assert(XDES_FREE_BIT == 0);
+ compile_time_assert(XDES_CLEAN_BIT == 1);
+
+ ulint index= XDES_BITS_PER_PAGE * offset;
+ byte *b= &descr[XDES_BITMAP + (index >> 3)];
+ /* xdes_init() should have set all XDES_CLEAN_BIT. */
+ ut_ad(!(~*b & 0xaa));
+ /* Clear or set XDES_FREE_BIT. */
+ byte val= free
+ ? static_cast<byte>(*b | 1 << (index & 7))
+ : static_cast<byte>(*b & ~(1 << (index & 7)));
+ mtr->write<1>(block, b, val);
+}
+
+/**
+Find a free page.
+@param descr extent descriptor
+@param hint page offset to start searching from (towards larger pages)
+@return free page offset
+@retval FIL_NULL if no page is free */
+inline uint32_t xdes_find_free(const xdes_t *descr, uint32_t hint= 0)
+{
+ const uint32_t extent_size= FSP_EXTENT_SIZE;
+ ut_ad(hint < extent_size);
+ for (uint32_t i= hint; i < extent_size; i++)
+ if (xdes_is_free(descr, i))
+ return i;
+ for (uint32_t i= 0; i < hint; i++)
+ if (xdes_is_free(descr, i))
+ return i;
+ return FIL_NULL;
+}
+
+/**
+Determine the number of used pages in a descriptor.
+@param descr file descriptor
+@return number of pages used */
+inline uint32_t xdes_get_n_used(const xdes_t *descr)
+{
+ uint32_t count= 0;
+
+ for (uint32_t i= FSP_EXTENT_SIZE; i--; )
+ if (!xdes_is_free(descr, i))
+ count++;
+
+ return count;
+}
+
+/**
+Determine whether a file extent is full.
+@param descr file descriptor
+@return whether all pages have been allocated */
+inline bool xdes_is_full(const xdes_t *descr)
+{
+ return FSP_EXTENT_SIZE == xdes_get_n_used(descr);
+}
+
+/** Set the state of an extent descriptor.
+@param[in] block extent descriptor block
+@param[in,out] descr extent descriptor
+@param[in] state the state
+@param[in,out] mtr mini-transaction */
+inline void xdes_set_state(const buf_block_t &block, xdes_t *descr,
+ byte state, mtr_t *mtr)
+{
+ ut_ad(descr && mtr);
+ ut_ad(state >= XDES_FREE);
+ ut_ad(state <= XDES_FSEG);
+ ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_SX_FIX |
+ MTR_MEMO_PAGE_X_FIX));
+ ut_ad(page_align(descr) == block.page.frame);
+ ut_ad(mach_read_from_4(descr + XDES_STATE) <= XDES_FSEG);
+ mtr->write<1>(block, XDES_STATE + 3 + descr, state);
+}
+
+/**********************************************************************//**
+Gets the state of an xdes.
+@return state */
+UNIV_INLINE
+ulint
+xdes_get_state(
+/*===========*/
+ const xdes_t* descr) /*!< in: descriptor */
+{
+ ulint state;
+
+ ut_ad(descr);
+ state = mach_read_from_4(descr + XDES_STATE);
+ ut_ad(state - 1 < XDES_FSEG);
+ return(state);
+}
+
+/**********************************************************************//**
+Inits an extent descriptor to the free and clean state. */
+inline void xdes_init(const buf_block_t &block, xdes_t *descr, mtr_t *mtr)
+{
+ ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_SX_FIX |
+ MTR_MEMO_PAGE_X_FIX));
+ mtr->memset(&block, uint16_t(descr - block.page.frame) + XDES_BITMAP,
+ XDES_SIZE - XDES_BITMAP, 0xff);
+ xdes_set_state(block, descr, XDES_FREE, mtr);
+}
+
+/** Mark a page used in an extent descriptor.
+@param[in,out] seg_inode segment inode
+@param[in,out] iblock segment inode page
+@param[in] page page number
+@param[in,out] descr extent descriptor
+@param[in,out] xdes extent descriptor page
+@param[in,out] mtr mini-transaction
+@return error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fseg_mark_page_used(fseg_inode_t *seg_inode, buf_block_t *iblock,
+ ulint page, xdes_t *descr, buf_block_t *xdes, mtr_t *mtr)
+{
+ ut_ad(fil_page_get_type(iblock->page.frame) == FIL_PAGE_INODE);
+ ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+ ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + seg_inode, 4));
+ ut_ad(!memcmp(seg_inode + FSEG_ID, descr + XDES_ID, 4));
+
+ const uint16_t xoffset= uint16_t(descr - xdes->page.frame + XDES_FLST_NODE);
+ const uint16_t ioffset= uint16_t(seg_inode - iblock->page.frame);
+
+ if (!xdes_get_n_used(descr))
+ {
+ /* We move the extent from the free list to the NOT_FULL list */
+ if (dberr_t err= flst_remove(iblock, uint16_t(FSEG_FREE + ioffset),
+ xdes, xoffset, mtr))
+ return err;
+ if (dberr_t err= flst_add_last(iblock, uint16_t(FSEG_NOT_FULL + ioffset),
+ xdes, xoffset, mtr))
+ return err;
+ }
+
+ if (UNIV_UNLIKELY(!xdes_is_free(descr, page % FSP_EXTENT_SIZE)))
+ return DB_CORRUPTION;
+
+ /* We mark the page as used */
+ xdes_set_free<false>(*xdes, descr, page % FSP_EXTENT_SIZE, mtr);
+
+ byte* p_not_full= seg_inode + FSEG_NOT_FULL_N_USED;
+ const uint32_t not_full_n_used= mach_read_from_4(p_not_full) + 1;
+ mtr->write<4>(*iblock, p_not_full, not_full_n_used);
+ if (xdes_is_full(descr))
+ {
+ /* We move the extent from the NOT_FULL list to the FULL list */
+ if (dberr_t err= flst_remove(iblock, uint16_t(FSEG_NOT_FULL + ioffset),
+ xdes, xoffset, mtr))
+ return err;
+ if (dberr_t err= flst_add_last(iblock, uint16_t(FSEG_FULL + ioffset),
+ xdes, xoffset, mtr))
+ return err;
+ mtr->write<4>(*iblock, seg_inode + FSEG_NOT_FULL_N_USED,
+ not_full_n_used - FSP_EXTENT_SIZE);
+ }
+
+ return DB_SUCCESS;
+}
+
+/** Get pointer to a the extent descriptor of a page.
+@param[in,out] sp_header tablespace header page, x-latched
+@param[in] space tablespace
+@param[in] offset page offset
+@param[in,out] mtr mini-transaction
+@param[out] err error code
+@param[out] desc_block descriptor block
+@param[in] init_space whether the tablespace is being initialized
+@return pointer to the extent descriptor, NULL if the page does not
+exist in the space or if the offset exceeds free limit */
+UNIV_INLINE MY_ATTRIBUTE((warn_unused_result))
+xdes_t*
+xdes_get_descriptor_with_space_hdr(
+ buf_block_t* header,
+ const fil_space_t* space,
+ page_no_t offset,
+ mtr_t* mtr,
+ dberr_t* err = nullptr,
+ buf_block_t** desc_block = nullptr,
+ bool init_space = false)
+{
+ ut_ad(space->is_owner());
+ ut_ad(mtr->memo_contains_flagged(header, MTR_MEMO_PAGE_SX_FIX
+ | MTR_MEMO_PAGE_X_FIX));
+ /* Read free limit and space size */
+ uint32_t limit = mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT
+ + header->page.frame);
+ uint32_t size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE
+ + header->page.frame);
+ ut_ad(limit == space->free_limit
+ || (space->free_limit == 0
+ && (init_space
+ || space->purpose == FIL_TYPE_TEMPORARY
+ || (srv_startup_is_before_trx_rollback_phase
+ && (space->id == TRX_SYS_SPACE
+ || srv_is_undo_tablespace(space->id))))));
+ ut_ad(size == space->size_in_header);
+
+ if (offset >= size || offset >= limit) {
+ return nullptr;
+ }
+
+ const unsigned zip_size = space->zip_size();
+
+ uint32_t descr_page_no = xdes_calc_descriptor_page(zip_size, offset);
+
+ buf_block_t* block = header;
+
+ if (descr_page_no) {
+ block = buf_page_get_gen(page_id_t(space->id, descr_page_no),
+ zip_size, RW_SX_LATCH, nullptr,
+ BUF_GET_POSSIBLY_FREED, mtr, err);
+ }
+
+ if (desc_block) {
+ *desc_block = block;
+ }
+
+ return block
+ ? XDES_ARR_OFFSET + XDES_SIZE
+ * xdes_calc_descriptor_index(zip_size, offset)
+ + block->page.frame
+ : nullptr;
+}
+
+MY_ATTRIBUTE((nonnull(1,3), warn_unused_result))
+/** Get the extent descriptor of a page.
+The page where the extent descriptor resides is x-locked. If the page
+offset is equal to the free limit of the space, we will add new
+extents from above the free limit to the space free list, if not free
+limit == space size. This adding is necessary to make the descriptor
+defined, as they are uninitialized above the free limit.
+@param[in] space tablespace
+@param[in] offset page offset; if equal to the free limit, we
+try to add new extents to the space free list
+@param[in,out] mtr mini-transaction
+@param[out] err error code
+@param[out] xdes extent descriptor page
+@return the extent descriptor */
+static xdes_t *xdes_get_descriptor(const fil_space_t *space, page_no_t offset,
+ mtr_t *mtr, dberr_t *err= nullptr,
+ buf_block_t **xdes= nullptr)
+{
+ if (buf_block_t *block=
+ buf_page_get_gen(page_id_t(space->id, 0), space->zip_size(), RW_SX_LATCH,
+ nullptr, BUF_GET_POSSIBLY_FREED, mtr, err))
+ return xdes_get_descriptor_with_space_hdr(block, space, offset, mtr,
+ err, xdes);
+ return nullptr;
+}
+
+MY_ATTRIBUTE((nonnull(3), warn_unused_result))
+/** Get a pointer to the extent descriptor. The page where the
+extent descriptor resides is x-locked.
+@param space tablespace
+@param lst_node file address of the list node contained in the descriptor
+@param mtr mini-transaction
+@param err error code
+@param block extent descriptor block
+@return pointer to the extent descriptor */
+static inline
+xdes_t *xdes_lst_get_descriptor(const fil_space_t &space, fil_addr_t lst_node,
+ mtr_t *mtr, buf_block_t **block= nullptr,
+ dberr_t *err= nullptr)
+{
+ ut_ad(mtr->memo_contains(space));
+ ut_ad(lst_node.boffset < space.physical_size());
+ buf_block_t *b;
+ if (!block)
+ block= &b;
+ *block= buf_page_get_gen(page_id_t{space.id, lst_node.page},
+ space.zip_size(), RW_SX_LATCH,
+ nullptr, BUF_GET_POSSIBLY_FREED, mtr, err);
+ if (*block)
+ return (*block)->page.frame + lst_node.boffset - XDES_FLST_NODE;
+
+ space.set_corrupted();
+ return nullptr;
+}
+
+/********************************************************************//**
+Returns page offset of the first page in extent described by a descriptor.
+@return offset of the first page in extent */
+static uint32_t xdes_get_offset(const xdes_t *descr)
+{
+ ut_ad(descr);
+ return page_get_page_no(page_align(descr)) +
+ uint32_t(((page_offset(descr) - XDES_ARR_OFFSET) / XDES_SIZE) *
+ FSP_EXTENT_SIZE);
+}
+
+/** Initialize a file page whose prior contents should be ignored.
+@param[in,out] block buffer pool block */
+void fsp_apply_init_file_page(buf_block_t *block)
+{
+ memset_aligned<UNIV_PAGE_SIZE_MIN>(block->page.frame, 0, srv_page_size);
+ const page_id_t id(block->page.id());
+
+ mach_write_to_4(block->page.frame + FIL_PAGE_OFFSET, id.page_no());
+ memset_aligned<8>(block->page.frame + FIL_PAGE_PREV, 0xff, 8);
+ mach_write_to_4(block->page.frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+ id.space());
+ if (page_zip_des_t* page_zip= buf_block_get_page_zip(block))
+ {
+ memset_aligned<UNIV_ZIP_SIZE_MIN>(page_zip->data, 0,
+ page_zip_get_size(page_zip));
+ static_assert(FIL_PAGE_OFFSET == 4, "compatibility");
+ memcpy_aligned<4>(page_zip->data + FIL_PAGE_OFFSET,
+ block->page.frame + FIL_PAGE_OFFSET, 4);
+ memset_aligned<8>(page_zip->data + FIL_PAGE_PREV, 0xff, 8);
+ static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2,
+ "not perfect alignment");
+ memcpy_aligned<2>(page_zip->data + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+ block->page.frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 4);
+ }
+}
+
+#ifdef UNIV_DEBUG
+/** Assert that the mini-transaction is compatible with
+updating an allocation bitmap page.
+@param[in] mtr mini-transaction */
+void fil_space_t::modify_check(const mtr_t& mtr) const
+{
+ switch (mtr.get_log_mode()) {
+ case MTR_LOG_NONE:
+ /* These modes are only allowed within a non-bitmap page
+ when there is a higher-level redo log record written. */
+ ut_ad(purpose == FIL_TYPE_TABLESPACE || purpose == FIL_TYPE_TEMPORARY);
+ break;
+ case MTR_LOG_NO_REDO:
+ ut_ad(purpose == FIL_TYPE_TEMPORARY || purpose == FIL_TYPE_IMPORT);
+ break;
+ default:
+ /* We may only write redo log for a persistent tablespace. */
+ ut_ad(purpose == FIL_TYPE_TABLESPACE);
+ ut_ad(mtr.is_named_space(id));
+ }
+}
+#endif
+
+/** Initialize a tablespace header.
+@param[in,out] space tablespace
+@param[in] size current size in blocks
+@param[in,out] mtr mini-transaction
+@return error code */
+dberr_t fsp_header_init(fil_space_t *space, uint32_t size, mtr_t *mtr)
+{
+ const page_id_t page_id(space->id, 0);
+ const ulint zip_size = space->zip_size();
+
+ buf_block_t *free_block = buf_LRU_get_free_block(false);
+
+ mtr->x_lock_space(space);
+
+ buf_block_t* block = buf_page_create(space, 0, zip_size, mtr,
+ free_block);
+ if (UNIV_UNLIKELY(block != free_block)) {
+ buf_pool.free_block(free_block);
+ }
+
+ space->size_in_header = size;
+ space->free_len = 0;
+ space->free_limit = 0;
+
+ /* The prior contents of the file page should be ignored */
+
+ fsp_init_file_page(space, block, mtr);
+
+ mtr->write<2>(*block, block->page.frame + FIL_PAGE_TYPE,
+ FIL_PAGE_TYPE_FSP_HDR);
+
+ mtr->write<4,mtr_t::MAYBE_NOP>(*block, FSP_HEADER_OFFSET + FSP_SPACE_ID
+ + block->page.frame, space->id);
+ ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_NOT_USED
+ + block->page.frame));
+ /* recv_sys_t::parse() expects to find a WRITE record that
+ covers all 4 bytes. Therefore, we must specify mtr_t::FORCED
+ in order to avoid optimizing away any unchanged most
+ significant bytes of FSP_SIZE. */
+ mtr->write<4,mtr_t::FORCED>(*block, FSP_HEADER_OFFSET + FSP_SIZE
+ + block->page.frame, size);
+ ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT
+ + block->page.frame));
+ if (auto f = space->flags & ~FSP_FLAGS_MEM_MASK) {
+ mtr->write<4,mtr_t::FORCED>(*block,
+ FSP_HEADER_OFFSET + FSP_SPACE_FLAGS
+ + block->page.frame, f);
+ }
+ ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_FRAG_N_USED
+ + block->page.frame));
+
+ flst_init(block, FSP_HEADER_OFFSET + FSP_FREE, mtr);
+ flst_init(block, FSP_HEADER_OFFSET + FSP_FREE_FRAG, mtr);
+ flst_init(block, FSP_HEADER_OFFSET + FSP_FULL_FRAG, mtr);
+ flst_init(block, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL, mtr);
+ flst_init(block, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE, mtr);
+
+ mtr->write<8>(*block, FSP_HEADER_OFFSET + FSP_SEG_ID
+ + block->page.frame,
+ 1U);
+
+ if (dberr_t err = fsp_fill_free_list(!is_system_tablespace(space->id),
+ space, block, mtr)) {
+ return err;
+ }
+
+ /* Write encryption metadata to page 0 if tablespace is
+ encrypted or encryption is disabled by table option. */
+ if (space->crypt_data &&
+ (space->crypt_data->should_encrypt() ||
+ space->crypt_data->not_encrypted())) {
+ space->crypt_data->write_page0(block, mtr);
+ }
+
+ return DB_SUCCESS;
+}
+
+/** Try to extend a single-table tablespace so that a page would fit in the
+data file.
+@param[in,out] space tablespace
+@param[in] page_no page number
+@param[in,out] header tablespace header
+@param[in,out] mtr mini-transaction
+@return true if success */
+static ATTRIBUTE_COLD __attribute__((warn_unused_result))
+bool
+fsp_try_extend_data_file_with_pages(
+ fil_space_t* space,
+ uint32_t page_no,
+ buf_block_t* header,
+ mtr_t* mtr)
+{
+ bool success;
+ ulint size;
+
+ ut_ad(!is_system_tablespace(space->id));
+ ut_d(space->modify_check(*mtr));
+
+ size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE
+ + header->page.frame);
+ ut_ad(size == space->size_in_header);
+
+ ut_a(page_no >= size);
+
+ success = fil_space_extend(space, page_no + 1);
+ /* The size may be less than we wanted if we ran out of disk space. */
+ /* recv_sys_t::parse() expects to find a WRITE record that
+ covers all 4 bytes. Therefore, we must specify mtr_t::FORCED
+ in order to avoid optimizing away any unchanged most
+ significant bytes of FSP_SIZE. */
+ mtr->write<4,mtr_t::FORCED>(*header, FSP_HEADER_OFFSET + FSP_SIZE
+ + header->page.frame, space->size);
+ space->size_in_header = space->size;
+
+ return(success);
+}
+
+/** Calculate the number of physical pages in an extent for this file.
+@param[in] physical_size page_size of the datafile
+@return number of pages in an extent for this file */
+inline uint32_t fsp_get_extent_size_in_pages(ulint physical_size)
+{
+ return uint32_t((FSP_EXTENT_SIZE << srv_page_size_shift) / physical_size);
+}
+
+
+/** Calculate the number of pages to extend a datafile.
+We extend single-table tablespaces first one extent at a time,
+but 4 at a time for bigger tablespaces. It is not enough to extend always
+by one extent, because we need to add at least one extent to FSP_FREE.
+A single extent descriptor page will track many extents. And the extent
+that uses its extent descriptor page is put onto the FSP_FREE_FRAG list.
+Extents that do not use their extent descriptor page are added to FSP_FREE.
+The physical page size is used to determine how many extents are tracked
+on one extent descriptor page. See xdes_calc_descriptor_page().
+@param[in] physical_size page size in data file
+@param[in] size current number of pages in the datafile
+@return number of pages to extend the file. */
+static uint32_t fsp_get_pages_to_extend_ibd(unsigned physical_size,
+ uint32_t size)
+{
+ uint32_t extent_size = fsp_get_extent_size_in_pages(physical_size);
+ /* The threshold is set at 32MiB except when the physical page
+ size is small enough that it must be done sooner. */
+ uint32_t threshold = std::min(32 * extent_size, physical_size);
+
+ if (size >= threshold) {
+ /* Below in fsp_fill_free_list() we assume
+ that we add at most FSP_FREE_ADD extents at
+ a time */
+ extent_size *= FSP_FREE_ADD;
+ }
+
+ return extent_size;
+}
+
+/** Try to extend the last data file of a tablespace if it is auto-extending.
+@param[in,out] space tablespace
+@param[in,out] header tablespace header
+@param[in,out] mtr mini-transaction
+@return number of pages added
+@retval 0 if the tablespace was not extended */
+ATTRIBUTE_COLD __attribute__((nonnull))
+static
+ulint
+fsp_try_extend_data_file(fil_space_t *space, buf_block_t *header, mtr_t *mtr)
+{
+ const char* OUT_OF_SPACE_MSG =
+ "ran out of space. Please add another file or use"
+ " 'autoextend' for the last file in setting";
+
+ ut_d(space->modify_check(*mtr));
+
+ if (space->id == TRX_SYS_SPACE
+ && !srv_sys_space.can_auto_extend_last_file()) {
+
+ /* We print the error message only once to avoid
+ spamming the error log. Note that we don't need
+ to reset the flag to false as dealing with this
+ error requires server restart. */
+ if (!srv_sys_space.get_tablespace_full_status()) {
+ sql_print_error("InnoDB: The InnoDB system tablespace "
+ "%s" " innodb_data_file_path.",
+ OUT_OF_SPACE_MSG);
+ srv_sys_space.set_tablespace_full_status(true);
+ }
+ return(0);
+ } else if (space->id == SRV_TMP_SPACE_ID
+ && !srv_tmp_space.can_auto_extend_last_file()) {
+
+ /* We print the error message only once to avoid
+ spamming the error log. Note that we don't need
+ to reset the flag to false as dealing with this
+ error requires server restart. */
+ if (!srv_tmp_space.get_tablespace_full_status()) {
+ sql_print_error("InnoDB: The InnoDB temporary"
+ " tablespace %s"
+ " innodb_temp_data_file_path.",
+ OUT_OF_SPACE_MSG);
+ srv_tmp_space.set_tablespace_full_status(true);
+ }
+ return(0);
+ }
+
+ uint32_t size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE
+ + header->page.frame);
+ ut_ad(size == space->size_in_header);
+ uint32_t size_increase;
+
+ const unsigned ps = space->physical_size();
+
+ switch (space->id) {
+ case TRX_SYS_SPACE:
+ size_increase = srv_sys_space.get_increment();
+ break;
+ case SRV_TMP_SPACE_ID:
+ size_increase = srv_tmp_space.get_increment();
+ break;
+ default:
+ uint32_t extent_pages = fsp_get_extent_size_in_pages(ps);
+ if (size < extent_pages) {
+ /* Let us first extend the file to extent_size */
+ if (!fsp_try_extend_data_file_with_pages(
+ space, extent_pages - 1, header, mtr)) {
+ return(0);
+ }
+
+ size = extent_pages;
+ }
+
+ size_increase = fsp_get_pages_to_extend_ibd(ps, size);
+ }
+
+ if (size_increase == 0) {
+ return(0);
+ }
+
+ if (!fil_space_extend(space, size + size_increase)) {
+ return(0);
+ }
+
+ /* For the system tablespace, we ignore any fragments of a
+ full megabyte when storing the size to the space header */
+
+ space->size_in_header = space->id
+ ? space->size
+ : ut_2pow_round(space->size, (1024 * 1024) / ps);
+
+ /* recv_sys_t::parse() expects to find a WRITE record that
+ covers all 4 bytes. Therefore, we must specify mtr_t::FORCED
+ in order to avoid optimizing away any unchanged most
+ significant bytes of FSP_SIZE. */
+ mtr->write<4,mtr_t::FORCED>(*header, FSP_HEADER_OFFSET + FSP_SIZE
+ + header->page.frame,
+ space->size_in_header);
+
+ return(size_increase);
+}
+
+/** Reset the page type.
+Data files created before MySQL 5.1.48 may contain garbage in FIL_PAGE_TYPE.
+In MySQL 3.23.53, only undo log pages and index pages were tagged.
+Any other pages were written with uninitialized bytes in FIL_PAGE_TYPE.
+@param[in] block block with invalid FIL_PAGE_TYPE
+@param[in] type expected page type
+@param[in,out] mtr mini-transaction */
+ATTRIBUTE_COLD
+void fil_block_reset_type(const buf_block_t& block, ulint type, mtr_t* mtr)
+{
+ ib::info() << "Resetting invalid page " << block.page.id() << " type "
+ << fil_page_get_type(block.page.frame) << " to " << type << ".";
+ mtr->write<2>(block, block.page.frame + FIL_PAGE_TYPE, type);
+}
+
+/** Put new extents to the free list if there are free extents above the free
+limit. If an extent happens to contain an extent descriptor page, the extent
+is put to the FSP_FREE_FRAG list with the page marked as used.
+@param[in] init_space true if this is a single-table tablespace
+and we are only initializing the first extent and the first bitmap pages;
+then we will not allocate more extents
+@param[in,out] space tablespace
+@param[in,out] header tablespace header
+@param[in,out] mtr mini-transaction
+@return error code */
+static
+dberr_t
+fsp_fill_free_list(
+ bool init_space,
+ fil_space_t* space,
+ buf_block_t* header,
+ mtr_t* mtr)
+{
+ ut_d(space->modify_check(*mtr));
+
+ /* Check if we can fill free list from above the free list limit */
+ uint32_t size=
+ mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + header->page.frame);
+ uint32_t limit=
+ mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT + header->page.frame);
+
+ ut_ad(size == space->size_in_header);
+ ut_ad(limit == space->free_limit);
+
+ const auto zip_size= space->zip_size();
+
+ if (size < limit + FSP_EXTENT_SIZE * FSP_FREE_ADD)
+ {
+ bool skip_resize= init_space;
+ switch (space->id) {
+ case TRX_SYS_SPACE:
+ skip_resize= !srv_sys_space.can_auto_extend_last_file();
+ break;
+ case SRV_TMP_SPACE_ID:
+ skip_resize= !srv_tmp_space.can_auto_extend_last_file();
+ break;
+ }
+
+ if (!skip_resize)
+ {
+ fsp_try_extend_data_file(space, header, mtr);
+ size= space->size_in_header;
+ }
+ }
+
+ uint32_t count= 0;
+ for (uint32_t i= limit, extent_size= FSP_EXTENT_SIZE,
+ physical_size= space->physical_size();
+ (init_space && i < 1) ||
+ (i + extent_size <= size && count < FSP_FREE_ADD);
+ i += extent_size)
+ {
+ const bool init_xdes= !ut_2pow_remainder(i, physical_size);
+ space->free_limit= i + extent_size;
+ mtr->write<4>(*header, FSP_HEADER_OFFSET + FSP_FREE_LIMIT +
+ header->page.frame, i + extent_size);
+
+ if (init_xdes)
+ {
+ /* We are going to initialize a new descriptor page
+ and a new ibuf bitmap page: the prior contents of the
+ pages should be ignored. */
+
+ if (i)
+ {
+ buf_block_t *f= buf_LRU_get_free_block(false);
+ buf_block_t *block= buf_page_create(space, static_cast<uint32_t>(i),
+ zip_size, mtr, f);
+ if (UNIV_UNLIKELY(block != f))
+ buf_pool.free_block(f);
+ fsp_init_file_page(space, block, mtr);
+ mtr->write<2>(*block, FIL_PAGE_TYPE + block->page.frame,
+ FIL_PAGE_TYPE_XDES);
+ }
+
+ if (space->purpose != FIL_TYPE_TEMPORARY)
+ {
+ buf_block_t *f= buf_LRU_get_free_block(false);
+ buf_block_t *block=
+ buf_page_create(space,
+ static_cast<uint32_t>(i + FSP_IBUF_BITMAP_OFFSET),
+ zip_size, mtr, f);
+ if (UNIV_UNLIKELY(block != f))
+ buf_pool.free_block(f);
+ fsp_init_file_page(space, block, mtr);
+ mtr->write<2>(*block, FIL_PAGE_TYPE + block->page.frame,
+ FIL_PAGE_IBUF_BITMAP);
+ }
+ }
+
+ buf_block_t *xdes= nullptr;
+ xdes_t *descr;
+ {
+ dberr_t err= DB_SUCCESS;
+ descr= xdes_get_descriptor_with_space_hdr(header, space, i, mtr,
+ &err, &xdes, init_space);
+ if (!descr)
+ return err;
+ }
+
+ if (xdes != header && !space->full_crc32())
+ fil_block_check_type(*xdes, FIL_PAGE_TYPE_XDES, mtr);
+ xdes_init(*xdes, descr, mtr);
+ const uint16_t xoffset=
+ static_cast<uint16_t>(descr - xdes->page.frame + XDES_FLST_NODE);
+ if (UNIV_UNLIKELY(init_xdes))
+ {
+ /* The first page in the extent is a descriptor page and the
+ second is an ibuf bitmap page: mark them used */
+ xdes_set_free<false>(*xdes, descr, 0, mtr);
+ xdes_set_free<false>(*xdes, descr, FSP_IBUF_BITMAP_OFFSET, mtr);
+ xdes_set_state(*xdes, descr, XDES_FREE_FRAG, mtr);
+ if (dberr_t err= flst_add_last(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
+ xdes, xoffset, mtr))
+ return err;
+ byte *n_used= FSP_HEADER_OFFSET + FSP_FRAG_N_USED + header->page.frame;
+ mtr->write<4>(*header, n_used, 2U + mach_read_from_4(n_used));
+ }
+ else
+ {
+ if (dberr_t err=
+ flst_add_last(header, FSP_HEADER_OFFSET + FSP_FREE,
+ xdes, xoffset, mtr))
+ return err;
+ count++;
+ }
+ }
+
+ space->free_len+= count;
+ return DB_SUCCESS;
+}
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Allocates a new free extent.
+@param[in,out] space tablespace
+@param[in] hint hint of which extent would be desirable: any
+page offset in the extent goes; the hint must not be > FSP_FREE_LIMIT
+@param[out] xdes extent descriptor page
+@param[in,out] mtr mini-transaction
+@return extent descriptor
+@retval nullptr if cannot be allocated */
+static xdes_t *fsp_alloc_free_extent(fil_space_t *space, uint32_t hint,
+ buf_block_t **xdes, mtr_t *mtr,
+ dberr_t *err)
+{
+ fil_addr_t first;
+ xdes_t* descr;
+ buf_block_t* desc_block;
+
+ buf_block_t* header = fsp_get_header(space, mtr, err);
+ if (!header) {
+corrupted:
+ space->set_corrupted();
+ return nullptr;
+ }
+
+ descr = xdes_get_descriptor_with_space_hdr(
+ header, space, hint, mtr, err, &desc_block);
+ if (!descr) {
+ goto corrupted;
+ }
+
+ if (desc_block != header && !space->full_crc32()) {
+ fil_block_check_type(*desc_block, FIL_PAGE_TYPE_XDES, mtr);
+ }
+
+ if (xdes_get_state(descr) == XDES_FREE) {
+ /* Ok, we can take this extent */
+ } else {
+ /* Take the first extent in the free list */
+ first = flst_get_first(FSP_HEADER_OFFSET + FSP_FREE
+ + header->page.frame);
+
+ if (first.page == FIL_NULL) {
+ *err = fsp_fill_free_list(false, space, header, mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+ goto corrupted;
+ }
+
+ first = flst_get_first(FSP_HEADER_OFFSET + FSP_FREE
+ + header->page.frame);
+ if (first.page == FIL_NULL) {
+ return nullptr; /* No free extents left */
+ }
+ }
+
+ descr = xdes_lst_get_descriptor(*space, first, mtr,
+ &desc_block, err);
+ if (!descr) {
+ return descr;
+ }
+ }
+
+ *err = flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE, desc_block,
+ static_cast<uint16_t>(descr - desc_block->page.frame
+ + XDES_FLST_NODE),
+ mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+ return nullptr;
+ }
+
+ space->free_len--;
+ *xdes = desc_block;
+
+ return(descr);
+}
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Allocate a single free page.
+@param[in,out] header tablespace header
+@param[in,out] xdes extent descriptor page
+@param[in,out] descr extent descriptor
+@param[in] bit slot to allocate in the extent
+@param[in,out] mtr mini-transaction
+@return error code */
+static dberr_t
+fsp_alloc_from_free_frag(buf_block_t *header, buf_block_t *xdes, xdes_t *descr,
+ ulint bit, mtr_t *mtr)
+{
+ if (UNIV_UNLIKELY(xdes_get_state(descr) != XDES_FREE_FRAG ||
+ !xdes_is_free(descr, bit)))
+ return DB_CORRUPTION;
+ xdes_set_free<false>(*xdes, descr, bit, mtr);
+
+ /* Update the FRAG_N_USED field */
+ byte *n_used_p= FSP_HEADER_OFFSET + FSP_FRAG_N_USED + header->page.frame;
+ uint32_t n_used = mach_read_from_4(n_used_p) + 1;
+
+ if (xdes_is_full(descr))
+ {
+ /* The fragment is full: move it to another list */
+ const uint16_t xoffset=
+ static_cast<uint16_t>(descr - xdes->page.frame + XDES_FLST_NODE);
+ if (dberr_t err= flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
+ xdes, xoffset, mtr))
+ return err;
+ if (dberr_t err= flst_add_last(header, FSP_HEADER_OFFSET + FSP_FULL_FRAG,
+ xdes, xoffset, mtr))
+ return err;
+ xdes_set_state(*xdes, descr, XDES_FULL_FRAG, mtr);
+ n_used-= FSP_EXTENT_SIZE;
+ }
+
+ mtr->write<4>(*header, n_used_p, n_used);
+ return DB_SUCCESS;
+}
+
+/** Gets a buffer block for an allocated page.
+@param[in,out] space tablespace
+@param[in] offset page number of the allocated page
+@param[in,out] mtr mini-transaction
+@return block, initialized */
+static
+buf_block_t*
+fsp_page_create(fil_space_t *space, page_no_t offset, mtr_t *mtr)
+{
+ buf_block_t *block, *free_block;
+
+ if (UNIV_UNLIKELY(space->is_being_truncated))
+ {
+ const page_id_t page_id{space->id, offset};
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
+ mysql_mutex_lock(&buf_pool.mutex);
+ block= reinterpret_cast<buf_block_t*>
+ (buf_pool.page_hash.get(page_id, chain));
+ if (block && block->page.oldest_modification() <= 1)
+ block= nullptr;
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ if (block)
+ {
+ ut_ad(block->page.buf_fix_count() >= 1);
+ ut_ad(block->page.lock.x_lock_count() == 1);
+ ut_ad(mtr->have_x_latch(*block));
+ free_block= block;
+ goto got_free_block;
+ }
+ }
+
+ free_block= buf_LRU_get_free_block(false);
+got_free_block:
+ block= buf_page_create(space, static_cast<uint32_t>(offset),
+ space->zip_size(), mtr, free_block);
+ if (UNIV_UNLIKELY(block != free_block))
+ buf_pool.free_block(free_block);
+
+ fsp_init_file_page(space, block, mtr);
+ return block;
+}
+
+/** Allocates a single free page from a space.
+The page is marked as used.
+@param[in,out] space tablespace
+@param[in] hint hint of which page would be desirable
+@param[in,out] mtr mini-transaction
+@param[in,out] init_mtr mini-transaction in which the page should be
+initialized (may be the same as mtr)
+@param[out] err error code
+@return allocated block
+@retval nullptr if no page could be allocated */
+static MY_ATTRIBUTE((warn_unused_result, nonnull))
+buf_block_t *fsp_alloc_free_page(fil_space_t *space, uint32_t hint,
+ mtr_t *mtr, mtr_t *init_mtr, dberr_t *err)
+{
+ ut_d(space->modify_check(*mtr));
+ buf_block_t *block= fsp_get_header(space, mtr, err);
+ if (!block)
+ return block;
+
+ buf_block_t *xdes;
+ /* Get the hinted descriptor */
+ xdes_t *descr= xdes_get_descriptor_with_space_hdr(block, space, hint, mtr,
+ err, &xdes);
+ if (descr && xdes_get_state(descr) == XDES_FREE_FRAG)
+ /* Ok, we can take this extent */;
+ else if (*err != DB_SUCCESS)
+ {
+ err_exit:
+ space->set_corrupted();
+ return nullptr;
+ }
+ else
+ {
+ /* Else take the first extent in free_frag list */
+ fil_addr_t first = flst_get_first(FSP_HEADER_OFFSET + FSP_FREE_FRAG +
+ block->page.frame);
+ if (first.page == FIL_NULL)
+ {
+ /* There are no partially full fragments: allocate a free extent
+ and add it to the FREE_FRAG list. NOTE that the allocation may
+ have as a side-effect that an extent containing a descriptor
+ page is added to the FREE_FRAG list. But we will allocate our
+ page from the the free extent anyway. */
+ descr= fsp_alloc_free_extent(space, hint, &xdes, mtr, err);
+ if (!descr)
+ return nullptr;
+ *err= flst_add_last(block, FSP_HEADER_OFFSET + FSP_FREE_FRAG, xdes,
+ static_cast<uint16_t>(descr - xdes->page.frame +
+ XDES_FLST_NODE), mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS))
+ return nullptr;
+ xdes_set_state(*xdes, descr, XDES_FREE_FRAG, mtr);
+ }
+ else
+ {
+ descr= xdes_lst_get_descriptor(*space, first, mtr, &xdes, err);
+ if (!descr)
+ return nullptr;
+ /* Reset the hint */
+ hint= 0;
+ }
+ }
+
+ /* Now we have in descr an extent with at least one free page. Look
+ for a free page in the extent. */
+ uint32_t free= xdes_find_free(descr, hint % FSP_EXTENT_SIZE);
+ if (free == FIL_NULL)
+ {
+ corrupted:
+ *err= DB_CORRUPTION;
+ goto err_exit;
+ }
+
+ uint32_t page_no= xdes_get_offset(descr) + free;
+ uint32_t space_size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE +
+ block->page.frame);
+ ut_ad(space_size == space->size_in_header ||
+ (space->id == TRX_SYS_SPACE &&
+ srv_startup_is_before_trx_rollback_phase));
+
+ if (space_size <= page_no)
+ {
+ /* It must be that we are extending a single-table tablespace
+ whose size is still < 64 pages */
+ ut_ad(!is_system_tablespace(space->id));
+ if (page_no >= FSP_EXTENT_SIZE)
+ {
+ sql_print_error("InnoDB: Trying to extend %s"
+ " by single page(s) though the size is " UINT32PF "."
+ " Page no " UINT32PF ".",
+ space->chain.start->name, space_size, page_no);
+ goto corrupted;
+ }
+
+ if (!fsp_try_extend_data_file_with_pages(space, page_no, block, mtr))
+ {
+ *err= DB_OUT_OF_FILE_SPACE;
+ return nullptr;
+ }
+ }
+
+ *err= fsp_alloc_from_free_frag(block, xdes, descr, free, mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS))
+ goto corrupted;
+ return fsp_page_create(space, page_no, init_mtr);
+}
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Return an extent to the free list of a space.
+@param[in,out] space tablespace
+@param[in] offset page number in the extent
+@param[in,out] mtr mini-transaction
+@return error code */
+static dberr_t fsp_free_extent(fil_space_t* space, page_no_t offset,
+ mtr_t* mtr)
+{
+ ut_ad(space->is_owner());
+ dberr_t err;
+ buf_block_t *block= fsp_get_header(space, mtr, &err);
+ if (!block)
+ return err;
+ buf_block_t *xdes;
+ xdes_t *descr= xdes_get_descriptor_with_space_hdr(block, space, offset, mtr,
+ &err, &xdes);
+ if (!descr)
+ {
+ ut_ad(err || space->is_stopping());
+ return err;
+ }
+
+ if (UNIV_UNLIKELY(xdes_get_state(descr) == XDES_FREE))
+ {
+ space->set_corrupted();
+ return DB_CORRUPTION;
+ }
+
+ xdes_init(*xdes, descr, mtr);
+ space->free_len++;
+ return flst_add_last(block, FSP_HEADER_OFFSET + FSP_FREE,
+ xdes, static_cast<uint16_t>(descr - xdes->page.frame +
+ XDES_FLST_NODE), mtr);
+}
+
+MY_ATTRIBUTE((nonnull))
+/** Frees a single page of a space.
+The page is marked as free and clean.
+@param[in,out] space tablespace
+@param[in] offset page number
+@param[in,out] mtr mini-transaction
+@return error code */
+static dberr_t fsp_free_page(fil_space_t *space, page_no_t offset, mtr_t *mtr)
+{
+ xdes_t* descr;
+ ulint frag_n_used;
+
+ ut_ad(mtr);
+ ut_d(space->modify_check(*mtr));
+
+ /* fprintf(stderr, "Freeing page %lu in space %lu\n", page, space); */
+
+ dberr_t err;
+ buf_block_t* header = fsp_get_header(space, mtr, &err);
+ if (!header) {
+ ut_ad(space->is_stopping());
+ return err;
+ }
+ buf_block_t* xdes;
+
+ descr = xdes_get_descriptor_with_space_hdr(header, space, offset, mtr,
+ &err, &xdes);
+ if (!descr) {
+ ut_ad(err || space->is_stopping());
+ return err;
+ }
+
+ const auto state = xdes_get_state(descr);
+
+ switch (state) {
+ case XDES_FREE_FRAG:
+ case XDES_FULL_FRAG:
+ if (!xdes_is_free(descr, offset % FSP_EXTENT_SIZE)) {
+ break;
+ }
+ /* fall through */
+ default:
+ space->set_corrupted();
+ return DB_CORRUPTION;
+ }
+
+ frag_n_used = mach_read_from_4(FSP_HEADER_OFFSET + FSP_FRAG_N_USED
+ + header->page.frame);
+
+ const uint16_t xoffset= static_cast<uint16_t>(descr - xdes->page.frame
+ + XDES_FLST_NODE);
+
+ if (state == XDES_FULL_FRAG) {
+ /* The fragment was full: move it to another list */
+ err = flst_remove(header, FSP_HEADER_OFFSET + FSP_FULL_FRAG,
+ xdes, xoffset, mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ return err;
+ }
+ err = flst_add_last(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
+ xdes, xoffset, mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ return err;
+ }
+ xdes_set_state(*xdes, descr, XDES_FREE_FRAG, mtr);
+ mtr->write<4>(*header, FSP_HEADER_OFFSET + FSP_FRAG_N_USED
+ + header->page.frame,
+ frag_n_used + FSP_EXTENT_SIZE - 1);
+ } else if (UNIV_UNLIKELY(!frag_n_used)) {
+ return DB_CORRUPTION;
+ } else {
+ mtr->write<4>(*header, FSP_HEADER_OFFSET + FSP_FRAG_N_USED
+ + header->page.frame, frag_n_used - 1);
+ }
+
+ mtr->free(*space, static_cast<uint32_t>(offset));
+ xdes_set_free<true>(*xdes, descr, offset % FSP_EXTENT_SIZE, mtr);
+ ut_ad(err == DB_SUCCESS);
+
+ if (!xdes_get_n_used(descr)) {
+ /* The extent has become free: move it to another list */
+ err = flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
+ xdes, xoffset, mtr);
+ if (err == DB_SUCCESS) {
+ err = fsp_free_extent(space, offset, mtr);
+ }
+ }
+
+ return err;
+}
+
+/** @return Number of segment inodes which fit on a single page */
+inline ulint FSP_SEG_INODES_PER_PAGE(ulint physical_size)
+{
+ return (physical_size - FSEG_ARR_OFFSET - 10) / FSEG_INODE_SIZE;
+}
+
+/** Returns the nth inode slot on an inode page.
+@param[in] page segment inode page
+@param[in] i inode index on page
+@return segment inode */
+#define fsp_seg_inode_page_get_nth_inode(page, i) \
+ FSEG_ARR_OFFSET + FSEG_INODE_SIZE * i + page
+
+/** Looks for a used segment inode on a segment inode page.
+@param page segment inode page
+@param physical_size page size
+@return segment inode index
+@retval ULINT_UNDEFINED if not found */
+static
+ulint
+fsp_seg_inode_page_find_used(const page_t *page, ulint physical_size)
+{
+ for (ulint i= 0; i < FSP_SEG_INODES_PER_PAGE(physical_size); i++)
+ {
+ const byte *inode= fsp_seg_inode_page_get_nth_inode(page, i);
+ if (mach_read_from_8(FSEG_ID + inode))
+ {
+ ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4));
+ return i;
+ }
+ }
+
+ return ULINT_UNDEFINED;
+}
+
+/** Looks for an unused segment inode on a segment inode page.
+@param[in] page segment inode page
+@param[in] i search forward starting from this index
+@param[in] physical_size page size
+@return segment inode index
+@retval ULINT_UNDEFINED if not found */
+static
+ulint
+fsp_seg_inode_page_find_free(const page_t *page, ulint i, ulint physical_size)
+{
+ for (; i < FSP_SEG_INODES_PER_PAGE(physical_size); i++)
+ {
+ const byte *inode= fsp_seg_inode_page_get_nth_inode(page, i);
+ if (mach_read_from_8(FSEG_ID + inode))
+ ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4));
+ else
+ /* This is unused */
+ return i;
+ }
+ return ULINT_UNDEFINED;
+}
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Allocate a file segment inode page.
+@param[in,out] space tablespace
+@param[in,out] header tablespace header
+@param[in,out] mtr mini-transaction
+@return error code */
+static dberr_t fsp_alloc_seg_inode_page(fil_space_t *space,
+ buf_block_t *header, mtr_t *mtr)
+{
+ ut_ad(header->page.id().space() == space->id);
+ dberr_t err;
+ buf_block_t *block= fsp_alloc_free_page(space, 0, mtr, mtr, &err);
+
+ if (!block)
+ return err;
+
+ ut_ad(block->page.lock.not_recursive());
+
+ mtr->write<2>(*block, block->page.frame + FIL_PAGE_TYPE, FIL_PAGE_INODE);
+
+#ifdef UNIV_DEBUG
+ const byte *inode= FSEG_ID + FSEG_ARR_OFFSET + block->page.frame;
+ for (ulint i= FSP_SEG_INODES_PER_PAGE(space->physical_size()); i--;
+ inode += FSEG_INODE_SIZE)
+ ut_ad(!mach_read_from_8(inode));
+#endif
+
+ return flst_add_last(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE,
+ block, FSEG_INODE_PAGE_NODE, mtr);
+}
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Allocate a file segment inode.
+@param[in,out] space tablespace
+@param[in,out] header tablespace header
+@param[out] iblock segment inode page
+@param[in,out] mtr mini-transaction
+@param[out] err error code
+@return segment inode
+@retval nullptr on failure */
+static fseg_inode_t*
+fsp_alloc_seg_inode(fil_space_t *space, buf_block_t *header,
+ buf_block_t **iblock, mtr_t *mtr, dberr_t *err)
+{
+ /* Allocate a new segment inode page if needed. */
+ if (!flst_get_len(FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE +
+ header->page.frame))
+ {
+ *err= fsp_alloc_seg_inode_page(space, header, mtr);
+ if (*err != DB_SUCCESS)
+ return nullptr;
+ }
+
+ const page_id_t page_id
+ {
+ space->id,
+ mach_read_from_4(FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE + FLST_FIRST +
+ FIL_ADDR_PAGE + header->page.frame)
+ };
+
+ buf_block_t *block=
+ buf_page_get_gen(page_id, space->zip_size(), RW_SX_LATCH,
+ nullptr, BUF_GET_POSSIBLY_FREED, mtr, err);
+ if (!block)
+ return nullptr;
+
+ if (!space->full_crc32())
+ fil_block_check_type(*block, FIL_PAGE_INODE, mtr);
+
+ const ulint physical_size= space->physical_size();
+ ulint n= fsp_seg_inode_page_find_free(block->page.frame, 0, physical_size);
+
+ if (UNIV_UNLIKELY(n >= FSP_SEG_INODES_PER_PAGE(physical_size)))
+ {
+ *err= DB_CORRUPTION;
+ return nullptr;
+ }
+ fseg_inode_t *inode= fsp_seg_inode_page_get_nth_inode(block->page.frame, n);
+
+ if (ULINT_UNDEFINED == fsp_seg_inode_page_find_free(block->page.frame, n + 1,
+ physical_size))
+ {
+ /* There are no other unused headers left on the page: move it
+ to another list */
+ *err= flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE,
+ block, FSEG_INODE_PAGE_NODE, mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS))
+ return nullptr;
+ *err= flst_add_last(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL,
+ block, FSEG_INODE_PAGE_NODE, mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS))
+ return nullptr;
+ }
+
+ ut_ad(!mach_read_from_8(inode + FSEG_ID) ||
+ !memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4));
+ *iblock= block;
+ return inode;
+}
+
+MY_ATTRIBUTE((nonnull))
+/** Frees a file segment inode.
+@param[in,out] space tablespace
+@param[in,out] inode segment inode
+@param[in,out] iblock segment inode page
+@param[in,out] mtr mini-transaction */
+static void fsp_free_seg_inode(fil_space_t *space, fseg_inode_t *inode,
+ buf_block_t *iblock, mtr_t *mtr)
+{
+ ut_d(space->modify_check(*mtr));
+
+ dberr_t err;
+ buf_block_t *header= fsp_get_header(space, mtr, &err);
+ if (!header)
+ return;
+ if (UNIV_UNLIKELY(memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4)))
+ {
+ space->set_corrupted();
+ return;
+ }
+
+ const ulint physical_size= space->physical_size();
+
+ if (ULINT_UNDEFINED == fsp_seg_inode_page_find_free(iblock->page.frame, 0,
+ physical_size))
+ {
+ /* Move the page to another list */
+ if (flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL,
+ iblock, FSEG_INODE_PAGE_NODE, mtr) != DB_SUCCESS)
+ return;
+ if (flst_add_last(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE,
+ iblock, FSEG_INODE_PAGE_NODE, mtr) != DB_SUCCESS)
+ return;
+ }
+
+ mtr->memset(iblock, page_offset(inode) + FSEG_ID, FSEG_INODE_SIZE, 0);
+
+ if (ULINT_UNDEFINED != fsp_seg_inode_page_find_used(iblock->page.frame,
+ physical_size))
+ return;
+
+ /* There are no other used headers left on the page: free it */
+ if (flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE,
+ iblock, FSEG_INODE_PAGE_NODE, mtr) == DB_SUCCESS)
+ fsp_free_page(space, iblock->page.id().page_no(), mtr);
+}
+
+MY_ATTRIBUTE((nonnull(1,4,5), warn_unused_result))
+/** Returns the file segment inode, page x-latched.
+@param[in] header segment header
+@param[in] space space id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out] mtr mini-transaction
+@param[out] block inode block
+@param[out] err error code
+@return segment inode, page x-latched
+@retrval nullptr if the inode is free or corruption was noticed */
+static
+fseg_inode_t*
+fseg_inode_try_get(
+ const fseg_header_t* header,
+ uint32_t space,
+ ulint zip_size,
+ mtr_t* mtr,
+ buf_block_t** block,
+ dberr_t* err = nullptr)
+{
+ if (UNIV_UNLIKELY(space != mach_read_from_4(header + FSEG_HDR_SPACE)))
+ {
+ corrupted:
+ if (err)
+ *err= DB_CORRUPTION;
+ return nullptr;
+ }
+
+ *block=
+ buf_page_get_gen(page_id_t(space,
+ mach_read_from_4(header + FSEG_HDR_PAGE_NO)),
+ zip_size, RW_SX_LATCH, nullptr, BUF_GET_POSSIBLY_FREED,
+ mtr, err);
+ if (!*block)
+ return nullptr;
+
+ const uint16_t offset= mach_read_from_2(header + FSEG_HDR_OFFSET);
+ if (UNIV_UNLIKELY(offset >= (*block)->physical_size()))
+ goto corrupted;
+
+ fseg_inode_t *inode= (*block)->page.frame + offset;
+ if (UNIV_UNLIKELY(!mach_read_from_8(inode + FSEG_ID) ||
+ memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4)))
+ goto corrupted;
+
+ return inode;
+}
+
+/** Get the page number from the nth fragment page slot.
+@param inode file segment findex
+@param n slot index
+@return page number
+@retval FIL_NULL if not in use */
+static uint32_t fseg_get_nth_frag_page_no(const fseg_inode_t *inode, ulint n)
+{
+ ut_ad(inode);
+ ut_ad(n < FSEG_FRAG_ARR_N_SLOTS);
+ ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4));
+ return(mach_read_from_4(inode + FSEG_FRAG_ARR
+ + n * FSEG_FRAG_SLOT_SIZE));
+}
+
+/** Set the page number in the nth fragment page slot.
+@param[in,out] inode segment inode
+@param[in,out] iblock segment inode page
+@param[in] n slot index
+@param[in] page_no page number to set
+@param[in,out] mtr mini-transaction */
+inline void fseg_set_nth_frag_page_no(fseg_inode_t *inode, buf_block_t *iblock,
+ ulint n, ulint page_no, mtr_t *mtr)
+{
+ ut_ad(n < FSEG_FRAG_ARR_N_SLOTS);
+ ut_ad(mtr->memo_contains_flagged(iblock, MTR_MEMO_PAGE_SX_FIX));
+ ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4));
+
+ mtr->write<4>(*iblock, inode + FSEG_FRAG_ARR + n * FSEG_FRAG_SLOT_SIZE,
+ page_no);
+}
+
+/**********************************************************************//**
+Finds a fragment page slot which is free.
+@return slot index; ULINT_UNDEFINED if none found */
+static
+ulint
+fseg_find_free_frag_page_slot(
+/*==========================*/
+ fseg_inode_t* inode) /*!< in: segment inode */
+{
+ ulint i;
+ ulint page_no;
+
+ for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) {
+ page_no = fseg_get_nth_frag_page_no(inode, i);
+
+ if (page_no == FIL_NULL) {
+
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Finds a fragment page slot which is used and last in the array.
+@return slot index; ULINT_UNDEFINED if none found */
+static
+ulint
+fseg_find_last_used_frag_page_slot(
+/*===============================*/
+ fseg_inode_t* inode) /*!< in: segment inode */
+{
+ ulint i;
+ ulint page_no;
+
+ for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) {
+ page_no = fseg_get_nth_frag_page_no(
+ inode, FSEG_FRAG_ARR_N_SLOTS - i - 1);
+
+ if (page_no != FIL_NULL) {
+
+ return(FSEG_FRAG_ARR_N_SLOTS - i - 1);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/** Calculate reserved fragment page slots.
+@param inode file segment index
+@return number of fragment pages */
+static ulint fseg_get_n_frag_pages(const fseg_inode_t *inode)
+{
+ ulint i;
+ ulint count = 0;
+
+ for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) {
+ if (FIL_NULL != fseg_get_nth_frag_page_no(inode, i)) {
+ count++;
+ }
+ }
+
+ return(count);
+}
+
+/** Create a new segment.
+@param space tablespace
+@param byte_offset byte offset of the created segment header
+@param mtr mini-transaction
+@param err error code
+@param has_done_reservation whether fsp_reserve_free_extents() was invoked
+@param block block where segment header is placed,
+ or NULL to allocate an additional page for that
+@return the block where the segment header is placed, x-latched
+@retval nullptr if could not create segment */
+buf_block_t*
+fseg_create(fil_space_t *space, ulint byte_offset, mtr_t *mtr, dberr_t *err,
+ bool has_done_reservation, buf_block_t *block)
+{
+ fseg_inode_t* inode;
+ ib_id_t seg_id;
+ uint32_t n_reserved;
+ bool reserved_extent = false;
+
+ DBUG_ENTER("fseg_create");
+
+ ut_ad(mtr);
+ ut_ad(byte_offset >= FIL_PAGE_DATA);
+ ut_ad(byte_offset + FSEG_HEADER_SIZE
+ <= srv_page_size - FIL_PAGE_DATA_END);
+
+ mtr->x_lock_space(space);
+ ut_d(space->modify_check(*mtr));
+
+ ut_ad(!block || block->page.id().space() == space->id);
+
+ buf_block_t* header = fsp_get_header(space, mtr, err);
+ if (!header) {
+ block = nullptr;
+ goto funct_exit;
+ }
+
+ buf_block_t* iblock;
+
+inode_alloc:
+ inode = fsp_alloc_seg_inode(space, header, &iblock, mtr, err);
+
+ if (!inode) {
+ block = nullptr;
+reserve_extent:
+ if (!has_done_reservation && !reserved_extent) {
+ *err = fsp_reserve_free_extents(&n_reserved, space, 2,
+ FSP_NORMAL, mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+ DBUG_RETURN(nullptr);
+ }
+
+ /* Extents reserved successfully. So
+ try allocating the page or inode */
+ reserved_extent = true;
+ if (inode) {
+ goto page_alloc;
+ }
+
+ goto inode_alloc;
+ }
+
+ if (inode) {
+ fsp_free_seg_inode(space, inode, iblock, mtr);
+ }
+ goto funct_exit;
+ }
+
+ /* Read the next segment id from space header and increment the
+ value in space header */
+
+ seg_id = mach_read_from_8(FSP_HEADER_OFFSET + FSP_SEG_ID
+ + header->page.frame);
+
+ mtr->write<8>(*header,
+ FSP_HEADER_OFFSET + FSP_SEG_ID + header->page.frame,
+ seg_id + 1);
+ mtr->write<8>(*iblock, inode + FSEG_ID, seg_id);
+ ut_ad(!mach_read_from_4(inode + FSEG_NOT_FULL_N_USED));
+
+ flst_init(*iblock, inode + FSEG_FREE, mtr);
+ flst_init(*iblock, inode + FSEG_NOT_FULL, mtr);
+ flst_init(*iblock, inode + FSEG_FULL, mtr);
+
+ mtr->memcpy(*iblock, inode + FSEG_MAGIC_N, FSEG_MAGIC_N_BYTES, 4);
+ compile_time_assert(FSEG_FRAG_SLOT_SIZE == 4);
+ compile_time_assert(FIL_NULL == 0xffffffff);
+ mtr->memset(iblock,
+ uint16_t(inode - iblock->page.frame) + FSEG_FRAG_ARR,
+ FSEG_FRAG_SLOT_SIZE * FSEG_FRAG_ARR_N_SLOTS, 0xff);
+
+ if (!block) {
+page_alloc:
+ block = fseg_alloc_free_page_low(space,
+ inode, iblock, 0, FSP_UP,
+#ifdef UNIV_DEBUG
+ has_done_reservation,
+#endif /* UNIV_DEBUG */
+ mtr, mtr, err);
+
+ if (!block) {
+ ut_ad(!has_done_reservation);
+ goto reserve_extent;
+ }
+
+ ut_d(const auto x = block->page.lock.x_lock_count());
+ ut_ad(x || block->page.lock.not_recursive());
+ ut_ad(x == 1 || space->is_being_truncated);
+ ut_ad(x <= 2);
+ ut_ad(!fil_page_get_type(block->page.frame));
+ mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->page.frame,
+ FIL_PAGE_TYPE_SYS);
+ }
+
+ mtr->write<2>(*block, byte_offset + FSEG_HDR_OFFSET
+ + block->page.frame, page_offset(inode));
+
+ mtr->write<4>(*block, byte_offset + FSEG_HDR_PAGE_NO
+ + block->page.frame, iblock->page.id().page_no());
+
+ mtr->write<4,mtr_t::MAYBE_NOP>(*block, byte_offset + FSEG_HDR_SPACE
+ + block->page.frame, space->id);
+
+funct_exit:
+ if (!has_done_reservation && reserved_extent) {
+ space->release_free_extents(n_reserved);
+ }
+
+ DBUG_RETURN(block);
+}
+
+/**********************************************************************//**
+Calculates the number of pages reserved by a segment, and how many pages are
+currently used.
+@return number of reserved pages */
+static
+ulint
+fseg_n_reserved_pages_low(
+/*======================*/
+ const fseg_inode_t* inode, /*!< in: segment inode */
+ ulint* used) /*!< out: number of pages used (not
+ more than reserved) */
+{
+ *used = mach_read_from_4(inode + FSEG_NOT_FULL_N_USED)
+ + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FULL)
+ + fseg_get_n_frag_pages(inode);
+
+ return fseg_get_n_frag_pages(inode)
+ + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FREE)
+ + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_NOT_FULL)
+ + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FULL);
+}
+
+/** Calculate the number of pages reserved by a segment,
+and how many pages are currently used.
+@param[in] block buffer block containing the file segment header
+@param[in] header file segment header
+@param[out] used number of pages that are used (not more than reserved)
+@param[in,out] mtr mini-transaction
+@return number of reserved pages */
+ulint fseg_n_reserved_pages(const buf_block_t &block,
+ const fseg_header_t *header, ulint *used,
+ mtr_t *mtr)
+{
+ ut_ad(page_align(header) == block.page.frame);
+ buf_block_t *iblock;
+ if (fseg_inode_t *inode=
+ fseg_inode_try_get(header, block.page.id().space(), block.zip_size(),
+ mtr, &iblock))
+ return fseg_n_reserved_pages_low(inode, used);
+ return *used= 0;
+}
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Tries to fill the free list of a segment with consecutive free extents.
+This happens if the segment is big enough to allow extents in the free list,
+the free list is empty, and the extents can be allocated consecutively from
+the hint onward.
+@param[in] inode segment inode
+@param[in,out] iblock segment inode page
+@param[in] space tablespace
+@param[in] hint hint which extent would be good as the first extent
+@param[in,out] mtr mini-transaction */
+static dberr_t fseg_fill_free_list(const fseg_inode_t *inode,
+ buf_block_t *iblock, fil_space_t *space,
+ uint32_t hint, mtr_t *mtr)
+{
+ ulint used;
+
+ ut_ad(!((page_offset(inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+ ut_d(space->modify_check(*mtr));
+
+ if (fseg_n_reserved_pages_low(inode, &used) <
+ FSEG_FREE_LIST_LIMIT * FSP_EXTENT_SIZE)
+ /* The segment is too small to allow extents in free list */
+ return DB_SUCCESS;
+
+ if (UNIV_UNLIKELY(memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4)))
+ {
+ space->set_corrupted();
+ return DB_CORRUPTION;
+ }
+
+ if (flst_get_len(inode + FSEG_FREE) > 0)
+ /* Free list is not empty */
+ return DB_SUCCESS;
+
+ for (ulint i= 0; i < FSEG_FREE_LIST_MAX_LEN; i++, hint += FSP_EXTENT_SIZE)
+ {
+ buf_block_t *xdes;
+ dberr_t err;
+ xdes_t *descr= xdes_get_descriptor(space, hint, mtr, &err, &xdes);
+ if (!descr || XDES_FREE != xdes_get_state(descr))
+ /* We cannot allocate the desired extent: stop */
+ return err;
+
+ descr= fsp_alloc_free_extent(space, hint, &xdes, mtr, &err);
+ if (UNIV_UNLIKELY(!descr))
+ return err;
+
+ if (dberr_t err=
+ flst_add_last(iblock,
+ static_cast<uint16_t>(inode - iblock->page.frame +
+ FSEG_FREE), xdes,
+ static_cast<uint16_t>(descr - xdes->page.frame +
+ XDES_FLST_NODE), mtr))
+ return err;
+ xdes_set_state(*xdes, descr, XDES_FSEG, mtr);
+ mtr->memcpy(*xdes, descr + XDES_ID, inode + FSEG_ID, 8);
+ }
+
+ return DB_SUCCESS;
+}
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Allocates a free extent for the segment: looks first in the free list of
+the segment, then tries to allocate from the space free list.
+NOTE that the extent returned still resides in the segment free list, it is
+not yet taken off it!
+@param[in] inode segment inode
+@param[in,out] iblock segment inode page
+@param[out] xdes extent descriptor page
+@param[in,out] space tablespace
+@param[in,out] mtr mini-transaction
+@param[out] err error code
+@retval nullptr if no page could be allocated */
+static
+xdes_t*
+fseg_alloc_free_extent(
+ const fseg_inode_t* inode,
+ buf_block_t* iblock,
+ buf_block_t** xdes,
+ fil_space_t* space,
+ mtr_t* mtr,
+ dberr_t* err)
+{
+ ut_ad(!((page_offset(inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+ ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4));
+ ut_d(space->modify_check(*mtr));
+
+ if (flst_get_len(inode + FSEG_FREE))
+ {
+ /* Segment free list is not empty, allocate from it */
+ return xdes_lst_get_descriptor(*space, flst_get_first(inode + FSEG_FREE),
+ mtr, xdes, err);
+ }
+
+ xdes_t* descr= fsp_alloc_free_extent(space, 0, xdes, mtr, err);
+ if (UNIV_UNLIKELY(!descr))
+ return descr;
+ xdes_set_state(**xdes, descr, XDES_FSEG, mtr);
+ mtr->memcpy<mtr_t::MAYBE_NOP>(**xdes, descr + XDES_ID, inode + FSEG_ID, 8);
+ *err= flst_add_last(iblock,
+ static_cast<uint16_t>(inode - iblock->page.frame +
+ FSEG_FREE), *xdes,
+ static_cast<uint16_t>(descr - (*xdes)->page.frame +
+ XDES_FLST_NODE), mtr);
+ if (UNIV_LIKELY(*err != DB_SUCCESS))
+ return nullptr;
+ /* Try to fill the segment free list */
+ *err= fseg_fill_free_list(inode, iblock, space,
+ xdes_get_offset(descr) + FSP_EXTENT_SIZE, mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS))
+ return nullptr;
+
+ return descr;
+}
+
+/** Allocates a single free page from a segment.
+This function implements the intelligent allocation strategy which tries to
+minimize file space fragmentation.
+@param[in,out] space tablespace
+@param[in,out] seg_inode segment inode
+@param[in,out] iblock segment inode page
+@param[in] hint hint of which page would be desirable
+@param[in] direction if the new page is needed because of
+an index page split, and records are inserted there in order, into which
+direction they go alphabetically: FSP_DOWN, FSP_UP, FSP_NO_DIR
+@param[in,out] mtr mini-transaction
+@param[in,out] init_mtr mtr or another mini-transaction in
+which the page should be initialized.
+@param[out] err error code
+@return the allocated page
+@retval nullptr if no page could be allocated */
+static
+buf_block_t*
+fseg_alloc_free_page_low(
+ fil_space_t* space,
+ fseg_inode_t* seg_inode,
+ buf_block_t* iblock,
+ uint32_t hint,
+ byte direction,
+#ifdef UNIV_DEBUG
+ bool has_done_reservation,
+ /*!< whether the space has already been reserved */
+#endif /* UNIV_DEBUG */
+ mtr_t* mtr,
+ mtr_t* init_mtr,
+ dberr_t* err)
+{
+ ib_id_t seg_id;
+ ulint used;
+ ulint reserved;
+ xdes_t* descr; /*!< extent of the hinted page */
+ uint32_t ret_page; /*!< the allocated page offset, FIL_NULL
+ if could not be allocated */
+ xdes_t* ret_descr; /*!< the extent of the allocated page */
+ buf_block_t* xdes;
+ ulint n;
+
+ ut_ad((direction >= FSP_UP) && (direction <= FSP_NO_DIR));
+ ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + seg_inode, 4));
+ ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+ seg_id = mach_read_from_8(seg_inode + FSEG_ID);
+
+ ut_ad(seg_id);
+ ut_d(space->modify_check(*mtr));
+ ut_ad(fil_page_get_type(page_align(seg_inode)) == FIL_PAGE_INODE);
+
+ reserved = fseg_n_reserved_pages_low(seg_inode, &used);
+
+ buf_block_t* header = fsp_get_header(space, mtr, err);
+ if (!header) {
+ return header;
+ }
+
+ descr = xdes_get_descriptor_with_space_hdr(header, space, hint, mtr,
+ err, &xdes);
+ if (!descr) {
+ if (*err != DB_SUCCESS) {
+ return nullptr;
+ }
+ /* Hint outside space or too high above free limit: reset
+ hint */
+ /* The file space header page is always allocated. */
+ hint = 0;
+ descr = xdes_get_descriptor(space, hint, mtr, err, &xdes);
+ if (!descr) {
+ return nullptr;
+ }
+ }
+
+ /* In the big if-else below we look for ret_page and ret_descr */
+ /*-------------------------------------------------------------*/
+ if ((xdes_get_state(descr) == XDES_FSEG)
+ && mach_read_from_8(descr + XDES_ID) == seg_id
+ && xdes_is_free(descr, hint % FSP_EXTENT_SIZE)) {
+take_hinted_page:
+ /* 1. We can take the hinted page
+ =================================*/
+ ret_descr = descr;
+ ret_page = hint;
+ /* Skip the check for extending the tablespace. If the
+ page hint were not within the size of the tablespace,
+ we would have got (descr == NULL) above and reset the hint. */
+ goto got_hinted_page;
+ /*-----------------------------------------------------------*/
+ } else if (xdes_get_state(descr) == XDES_FREE
+ && reserved - used < reserved / FSEG_FILLFACTOR
+ && used >= FSEG_FRAG_LIMIT) {
+
+ /* 2. We allocate the free extent from space and can take
+ =========================================================
+ the hinted page
+ ===============*/
+ ret_descr = fsp_alloc_free_extent(space, hint, &xdes,
+ mtr, err);
+
+ if (UNIV_UNLIKELY(ret_descr != descr)) {
+ if (*err != DB_SUCCESS) {
+ *err = DB_CORRUPTION;
+ }
+ return nullptr;
+ }
+
+ xdes_set_state(*xdes, ret_descr, XDES_FSEG, mtr);
+ mtr->write<8,mtr_t::MAYBE_NOP>(*xdes, ret_descr + XDES_ID,
+ seg_id);
+ *err = flst_add_last(
+ iblock,
+ static_cast<uint16_t>(seg_inode - iblock->page.frame
+ + FSEG_FREE), xdes,
+ static_cast<uint16_t>(ret_descr
+ - xdes->page.frame
+ + XDES_FLST_NODE), mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+ return nullptr;
+ }
+
+ /* Try to fill the segment free list */
+ *err = fseg_fill_free_list(seg_inode, iblock, space,
+ hint + FSP_EXTENT_SIZE, mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+ return nullptr;
+ }
+ goto take_hinted_page;
+ /*-----------------------------------------------------------*/
+ } else if ((direction != FSP_NO_DIR)
+ && ((reserved - used) < reserved / FSEG_FILLFACTOR)
+ && (used >= FSEG_FRAG_LIMIT)
+ && (ret_descr = fseg_alloc_free_extent(seg_inode, iblock,
+ &xdes, space,
+ mtr, err))) {
+ /* 3. We take any free extent (which was already assigned above
+ ===============================================================
+ in the if-condition to ret_descr) and take the lowest or
+ ========================================================
+ highest page in it, depending on the direction
+ ==============================================*/
+ ret_page = xdes_get_offset(ret_descr);
+
+ if (direction == FSP_DOWN) {
+ ret_page += FSP_EXTENT_SIZE - 1;
+ }
+ ut_ad(!has_done_reservation || ret_page != FIL_NULL);
+ /*-----------------------------------------------------------*/
+ } else if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+ return nullptr;
+ } else if ((xdes_get_state(descr) == XDES_FSEG)
+ && mach_read_from_8(descr + XDES_ID) == seg_id
+ && (!xdes_is_full(descr))) {
+
+ /* 4. We can take the page from the same extent as the
+ ======================================================
+ hinted page (and the extent already belongs to the
+ ==================================================
+ segment)
+ ========*/
+ ret_descr = descr;
+ ret_page = xdes_find_free(ret_descr, hint % FSP_EXTENT_SIZE);
+ if (ret_page == FIL_NULL) {
+ ut_ad(!has_done_reservation);
+ } else {
+ ret_page += xdes_get_offset(ret_descr);
+ }
+ /*-----------------------------------------------------------*/
+ } else if (reserved - used > 0) {
+ /* 5. We take any unused page from the segment
+ ==============================================*/
+ fil_addr_t first;
+
+ if (flst_get_len(seg_inode + FSEG_NOT_FULL) > 0) {
+ first = flst_get_first(seg_inode + FSEG_NOT_FULL);
+ } else if (flst_get_len(seg_inode + FSEG_FREE) > 0) {
+ first = flst_get_first(seg_inode + FSEG_FREE);
+ } else {
+ ut_ad(!has_done_reservation);
+ return(NULL);
+ }
+
+ ret_descr = xdes_lst_get_descriptor(*space, first, mtr, &xdes);
+ if (!ret_descr) {
+ return nullptr;
+ }
+
+ ret_page = xdes_find_free(ret_descr);
+ if (ret_page == FIL_NULL) {
+ ut_ad(!has_done_reservation);
+ } else {
+ ret_page += xdes_get_offset(ret_descr);
+ }
+ /*-----------------------------------------------------------*/
+ } else if (used < FSEG_FRAG_LIMIT) {
+ /* 6. We allocate an individual page from the space
+ ===================================================*/
+ buf_block_t* block = fsp_alloc_free_page(
+ space, hint, mtr, init_mtr, err);
+
+ ut_ad(block || !has_done_reservation || *err);
+
+ if (block) {
+ /* Put the page in the fragment page array of the
+ segment */
+ n = fseg_find_free_frag_page_slot(seg_inode);
+ if (UNIV_UNLIKELY(n == ULINT_UNDEFINED)) {
+ *err = DB_CORRUPTION;
+ return nullptr;
+ }
+
+ fseg_set_nth_frag_page_no(
+ seg_inode, iblock, n,
+ block->page.id().page_no(), mtr);
+ }
+
+ /* fsp_alloc_free_page() invoked fsp_init_file_page()
+ already. */
+ return(block);
+ /*-----------------------------------------------------------*/
+ } else {
+ /* 7. We allocate a new extent and take its first page
+ ======================================================*/
+ ret_descr = fseg_alloc_free_extent(seg_inode, iblock, &xdes,
+ space, mtr, err);
+
+ if (!ret_descr) {
+ ut_ad(!has_done_reservation || *err);
+ return nullptr;
+ } else {
+ ret_page = xdes_get_offset(ret_descr);
+ }
+ }
+
+ if (ret_page == FIL_NULL) {
+ /* Page could not be allocated */
+
+ ut_ad(!has_done_reservation);
+ return(NULL);
+ }
+
+ if (space->size <= ret_page && !is_predefined_tablespace(space->id)) {
+ /* It must be that we are extending a single-table
+ tablespace whose size is still < 64 pages */
+
+ if (ret_page >= FSP_EXTENT_SIZE) {
+ sql_print_error("InnoDB: Trying to extend '%s'"
+ " by single page(s) though the"
+ " space size " UINT32PF "."
+ " Page no " UINT32PF ".",
+ space->chain.start->name, space->size,
+ ret_page);
+ ut_ad(!has_done_reservation);
+ return(NULL);
+ }
+
+ if (!fsp_try_extend_data_file_with_pages(
+ space, ret_page, header, mtr)) {
+ /* No disk space left */
+ ut_ad(!has_done_reservation);
+ return(NULL);
+ }
+ }
+
+got_hinted_page:
+ /* ret_descr == NULL if the block was allocated from free_frag
+ (XDES_FREE_FRAG) */
+ if (ret_descr != NULL) {
+ /* At this point we know the extent and the page offset.
+ The extent is still in the appropriate list (FSEG_NOT_FULL
+ or FSEG_FREE), and the page is not yet marked as used. */
+
+ ut_d(buf_block_t* xxdes);
+ ut_ad(xdes_get_descriptor(space, ret_page, mtr, err, &xxdes)
+ == ret_descr);
+ ut_ad(xdes == xxdes);
+ ut_ad(xdes_is_free(ret_descr, ret_page % FSP_EXTENT_SIZE));
+
+ *err = fseg_mark_page_used(seg_inode, iblock, ret_page,
+ ret_descr, xdes, mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+ return nullptr;
+ }
+ }
+
+ return fsp_page_create(space, ret_page, init_mtr);
+}
+
+/**********************************************************************//**
+Allocates a single free page from a segment. This function implements
+the intelligent allocation strategy which tries to minimize file space
+fragmentation.
+@retval NULL if no page could be allocated */
+buf_block_t*
+fseg_alloc_free_page_general(
+/*=========================*/
+ fseg_header_t* seg_header,/*!< in/out: segment header */
+ uint32_t hint, /*!< in: hint of which page would be
+ desirable */
+ byte direction,/*!< in: if the new page is needed because
+ of an index page split, and records are
+ inserted there in order, into which
+ direction they go alphabetically: FSP_DOWN,
+ FSP_UP, FSP_NO_DIR */
+ bool has_done_reservation, /*!< in: true if the caller has
+ already done the reservation for the page
+ with fsp_reserve_free_extents, then there
+ is no need to do the check for this individual
+ page */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ mtr_t* init_mtr,/*!< in/out: mtr or another mini-transaction
+ in which the page should be initialized. */
+ dberr_t* err) /*!< out: error code */
+{
+ fseg_inode_t* inode;
+ fil_space_t* space;
+ buf_block_t* iblock;
+ buf_block_t* block;
+ uint32_t n_reserved;
+
+ const uint32_t space_id = page_get_space_id(page_align(seg_header));
+ space = mtr->x_lock_space(space_id);
+ inode = fseg_inode_try_get(seg_header, space_id, space->zip_size(),
+ mtr, &iblock, err);
+ if (!inode) {
+ return nullptr;
+ }
+ if (!space->full_crc32()) {
+ fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr);
+ }
+
+ if (!has_done_reservation) {
+ *err = fsp_reserve_free_extents(&n_reserved, space, 2,
+ FSP_NORMAL, mtr);
+ if (*err != DB_SUCCESS) {
+ return nullptr;
+ }
+ }
+
+ block = fseg_alloc_free_page_low(space,
+ inode, iblock, hint, direction,
+#ifdef UNIV_DEBUG
+ has_done_reservation,
+#endif /* UNIV_DEBUG */
+ mtr, init_mtr, err);
+
+ /* The allocation cannot fail if we have already reserved a
+ space for the page. */
+ ut_ad(block || !has_done_reservation || *err);
+
+ if (!has_done_reservation) {
+ space->release_free_extents(n_reserved);
+ }
+
+ return(block);
+}
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Check that we have at least n_pages frag pages free in the first extent
+of a single-table tablespace, and they are also physically initialized to
+the data file. That is we have already extended the data file so that those
+pages are inside the data file. If not, this function extends the tablespace
+with pages.
+@param[in,out] space tablespace
+@param[in,out] header tablespace header, x-latched
+@param[in] size tablespace size in pages, less than FSP_EXTENT_SIZE
+@param[in,out] mtr mini-transaction
+@param[in] n_pages number of pages to reserve
+@return error code */
+static
+dberr_t
+fsp_reserve_free_pages(
+ fil_space_t* space,
+ buf_block_t* header,
+ ulint size,
+ mtr_t* mtr,
+ uint32_t n_pages)
+{
+ ut_ad(space != fil_system.sys_space && space != fil_system.temp_space);
+ ut_ad(size < FSP_EXTENT_SIZE);
+
+ dberr_t err= DB_OUT_OF_FILE_SPACE;
+ const xdes_t *descr=
+ xdes_get_descriptor_with_space_hdr(header, space, 0, mtr, &err);
+ if (!descr)
+ return err;
+ const uint32_t n_used= xdes_get_n_used(descr);
+ if (size >= n_used + n_pages)
+ return DB_SUCCESS;
+ if (n_used > size)
+ return DB_CORRUPTION;
+ return fsp_try_extend_data_file_with_pages(space, n_used + n_pages - 1,
+ header, mtr)
+ ? DB_SUCCESS
+ : DB_OUT_OF_FILE_SPACE;
+}
+
+/** Reserves free pages from a tablespace. All mini-transactions which may
+use several pages from the tablespace should call this function beforehand
+and reserve enough free extents so that they certainly will be able
+to do their operation, like a B-tree page split, fully. Reservations
+must be released with function fil_space_t::release_free_extents()!
+
+The alloc_type below has the following meaning: FSP_NORMAL means an
+operation which will probably result in more space usage, like an
+insert in a B-tree; FSP_UNDO means allocation to undo logs: if we are
+deleting rows, then this allocation will in the long run result in
+less space usage (after a purge); FSP_CLEANING means allocation done
+in a physical record delete (like in a purge) or other cleaning operation
+which will result in less space usage in the long run. We prefer the latter
+two types of allocation: when space is scarce, FSP_NORMAL allocations
+will not succeed, but the latter two allocations will succeed, if possible.
+The purpose is to avoid dead end where the database is full but the
+user cannot free any space because these freeing operations temporarily
+reserve some space.
+
+Single-table tablespaces whose size is < FSP_EXTENT_SIZE pages are a special
+case. In this function we would liberally reserve several extents for
+every page split or merge in a B-tree. But we do not want to waste disk space
+if the table only occupies < FSP_EXTENT_SIZE pages. That is why we apply
+different rules in that special case, just ensuring that there are n_pages
+free pages available.
+
+@param[out] n_reserved number of extents actually reserved; if we
+ return true and the tablespace size is <
+ FSP_EXTENT_SIZE pages, then this can be 0,
+ otherwise it is n_ext
+@param[in,out] space tablespace
+@param[in] n_ext number of extents to reserve
+@param[in] alloc_type page reservation type (FSP_BLOB, etc)
+@param[in,out] mtr the mini transaction
+@param[in] n_pages for small tablespaces (tablespace size is
+ less than FSP_EXTENT_SIZE), number of free
+ pages to reserve.
+@return error code
+@retval DB_SUCCESS if we were able to make the reservation */
+dberr_t
+fsp_reserve_free_extents(
+ uint32_t* n_reserved,
+ fil_space_t* space,
+ uint32_t n_ext,
+ fsp_reserve_t alloc_type,
+ mtr_t* mtr,
+ uint32_t n_pages)
+{
+ ulint reserve;
+
+ ut_ad(mtr);
+ *n_reserved = n_ext;
+
+ const uint32_t extent_size = FSP_EXTENT_SIZE;
+
+ mtr->x_lock_space(space);
+ const unsigned physical_size = space->physical_size();
+
+ dberr_t err;
+ buf_block_t* header = fsp_get_header(space, mtr, &err);
+ if (!header) {
+ return err;
+ }
+try_again:
+ uint32_t size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE
+ + header->page.frame);
+ ut_ad(size == space->size_in_header);
+
+ if (size < extent_size && n_pages < extent_size / 2) {
+ /* Use different rules for small single-table tablespaces */
+ *n_reserved = 0;
+ return fsp_reserve_free_pages(space, header, size,
+ mtr, n_pages);
+ }
+
+ uint32_t n_free_list_ext = flst_get_len(FSP_HEADER_OFFSET + FSP_FREE
+ + header->page.frame);
+ ut_ad(space->free_len == n_free_list_ext);
+
+ uint32_t free_limit = mach_read_from_4(FSP_HEADER_OFFSET
+ + FSP_FREE_LIMIT
+ + header->page.frame);
+ ut_ad(space->free_limit == free_limit);
+
+ /* Below we play safe when counting free extents above the free limit:
+ some of them will contain extent descriptor pages, and therefore
+ will not be free extents */
+
+ uint32_t n_free_up;
+
+ if (size >= free_limit) {
+ n_free_up = (size - free_limit) / extent_size;
+ if (n_free_up) {
+ n_free_up--;
+ n_free_up -= n_free_up / (physical_size / extent_size);
+ }
+ } else {
+ ut_ad(alloc_type == FSP_BLOB);
+ n_free_up = 0;
+ }
+
+ uint32_t n_free = n_free_list_ext + n_free_up;
+
+ switch (alloc_type) {
+ case FSP_NORMAL:
+ /* We reserve 1 extent + 0.5 % of the space size to undo logs
+ and 1 extent + 0.5 % to cleaning operations; NOTE: this source
+ code is duplicated in the function below! */
+
+ reserve = 2 + ((size / extent_size) * 2) / 200;
+
+ if (n_free <= reserve + n_ext) {
+
+ goto try_to_extend;
+ }
+ break;
+ case FSP_UNDO:
+ /* We reserve 0.5 % of the space size to cleaning operations */
+
+ reserve = 1 + ((size / extent_size) * 1) / 200;
+
+ if (n_free <= reserve + n_ext) {
+
+ goto try_to_extend;
+ }
+ break;
+ case FSP_CLEANING:
+ case FSP_BLOB:
+ reserve = 0;
+ break;
+ default:
+ ut_error;
+ }
+
+ if (space->reserve_free_extents(n_free, n_ext)) {
+ return DB_SUCCESS;
+ }
+try_to_extend:
+ if (fsp_try_extend_data_file(space, header, mtr)) {
+ goto try_again;
+ }
+
+ return DB_OUT_OF_FILE_SPACE;
+}
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Frees a single page of a segment.
+@param[in] seg_inode segment inode
+@param[in,out] space tablespace
+@param[in] offset page number
+@param[in,out] mtr mini-transaction
+@param[in] ahi Drop adaptive hash index
+@return error code */
+static
+dberr_t
+fseg_free_page_low(
+ fseg_inode_t* seg_inode,
+ buf_block_t* iblock,
+ fil_space_t* space,
+ page_no_t offset,
+ mtr_t* mtr
+#ifdef BTR_CUR_HASH_ADAPT
+ ,bool ahi=false
+#endif /* BTR_CUR_HASH_ADAPT */
+ )
+{
+ ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + seg_inode, 4));
+ ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+ ut_ad(iblock->page.frame == page_align(seg_inode));
+ ut_d(space->modify_check(*mtr));
+
+#ifdef BTR_CUR_HASH_ADAPT
+ if (ahi) {
+ btr_search_drop_page_hash_when_freed(
+ page_id_t(space->id, offset));
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ const uint32_t extent_size = FSP_EXTENT_SIZE;
+ ut_ad(ut_is_2pow(extent_size));
+ buf_block_t* xdes;
+ dberr_t err;
+ xdes_t* descr = xdes_get_descriptor(space, offset, mtr, &err, &xdes);
+
+ if (!descr) {
+ return err;
+ }
+ if (UNIV_UNLIKELY(xdes_is_free(descr, offset & (extent_size - 1)))) {
+corrupted:
+ space->set_corrupted();
+ return DB_CORRUPTION;
+ }
+
+ if (xdes_get_state(descr) != XDES_FSEG) {
+ /* The page is in the fragment pages of the segment */
+ for (ulint i = 0;; i++) {
+ if (fseg_get_nth_frag_page_no(seg_inode, i)
+ != offset) {
+ continue;
+ }
+
+ compile_time_assert(FIL_NULL == 0xffffffff);
+ mtr->memset(iblock, uint16_t(seg_inode
+ - iblock->page.frame)
+ + FSEG_FRAG_ARR
+ + i * FSEG_FRAG_SLOT_SIZE, 4, 0xff);
+ break;
+ }
+
+ return fsp_free_page(space, offset, mtr);
+ }
+
+ /* If we get here, the page is in some extent of the segment */
+
+ if (UNIV_UNLIKELY(memcmp(descr + XDES_ID, seg_inode + FSEG_ID, 8))) {
+ goto corrupted;
+ }
+
+ byte* p_not_full = seg_inode + FSEG_NOT_FULL_N_USED;
+ uint32_t not_full_n_used = mach_read_from_4(p_not_full);
+ const uint16_t xoffset= uint16_t(descr - xdes->page.frame
+ + XDES_FLST_NODE);
+ const uint16_t ioffset= uint16_t(seg_inode - iblock->page.frame);
+
+ if (xdes_is_full(descr)) {
+ /* The fragment is full: move it to another list */
+ err = flst_remove(iblock,
+ static_cast<uint16_t>(FSEG_FULL + ioffset),
+ xdes, xoffset, mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ return err;
+ }
+ err = flst_add_last(iblock, static_cast<uint16_t>(FSEG_NOT_FULL
+ + ioffset),
+ xdes, xoffset, mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ return err;
+ }
+ not_full_n_used += extent_size - 1;
+ } else {
+ if (!not_full_n_used) {
+ goto corrupted;
+ }
+ not_full_n_used--;
+ }
+
+ mtr->write<4>(*iblock, p_not_full, not_full_n_used);
+ xdes_set_free<true>(*xdes, descr, offset & (extent_size - 1), mtr);
+
+ if (!xdes_get_n_used(descr)) {
+ err = flst_remove(iblock, static_cast<uint16_t>(FSEG_NOT_FULL
+ + ioffset),
+ xdes, xoffset, mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ return err;
+ }
+ err = fsp_free_extent(space, offset, mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ return err;
+ }
+ }
+
+ mtr->free(*space, static_cast<uint32_t>(offset));
+ return DB_SUCCESS;
+}
+
+/** Free a page in a file segment.
+@param[in,out] seg_header file segment header
+@param[in,out] space tablespace
+@param[in] offset page number
+@param[in,out] mtr mini-transaction
+@param[in] have_latch whether space->x_lock() was already called
+@return error code */
+dberr_t fseg_free_page(fseg_header_t *seg_header, fil_space_t *space,
+ uint32_t offset, mtr_t *mtr, bool have_latch)
+{
+ buf_block_t *iblock;
+ if (have_latch)
+ ut_ad(space->is_owner());
+ else
+ mtr->x_lock_space(space);
+
+ DBUG_PRINT("fseg_free_page",
+ ("space_id: " ULINTPF ", page_no: %u", space->id, offset));
+
+ dberr_t err;
+ if (fseg_inode_t *seg_inode= fseg_inode_try_get(seg_header,
+ space->id, space->zip_size(),
+ mtr, &iblock, &err))
+ {
+ if (!space->full_crc32())
+ fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr);
+ return fseg_free_page_low(seg_inode, iblock, space, offset, mtr);
+ }
+
+ return err;
+}
+
+/** Determine whether a page is allocated.
+@param space tablespace
+@param page page number
+@return error code
+@retval DB_SUCCESS if the page is marked as free
+@retval DB_SUCCESS_LOCKED_REC if the page is marked as allocated */
+dberr_t fseg_page_is_allocated(fil_space_t *space, unsigned page)
+{
+ mtr_t mtr;
+ uint32_t dpage= xdes_calc_descriptor_page(space->zip_size(), page);
+ const unsigned zip_size= space->zip_size();
+ dberr_t err= DB_SUCCESS;
+
+ mtr.start();
+ if (!space->is_owner())
+ mtr.x_lock_space(space);
+
+ if (page >= space->free_limit || page >= space->size_in_header);
+ else if (const buf_block_t *b=
+ buf_page_get_gen(page_id_t(space->id, dpage), space->zip_size(),
+ RW_S_LATCH, nullptr, BUF_GET_POSSIBLY_FREED,
+ &mtr, &err))
+ {
+ if (!dpage &&
+ (space->free_limit !=
+ mach_read_from_4(FSP_FREE_LIMIT + FSP_HEADER_OFFSET +
+ b->page.frame) ||
+ space->size_in_header !=
+ mach_read_from_4(FSP_SIZE + FSP_HEADER_OFFSET + b->page.frame)))
+ err= DB_CORRUPTION;
+ else
+ err= xdes_is_free(b->page.frame + XDES_ARR_OFFSET + XDES_SIZE
+ * xdes_calc_descriptor_index(zip_size, page),
+ page & (FSP_EXTENT_SIZE - 1))
+ ? DB_SUCCESS
+ : DB_SUCCESS_LOCKED_REC;
+ }
+
+ mtr.commit();
+ return err;
+}
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Free an extent of a segment to the space free list.
+@param[in,out] seg_inode segment inode
+@param[in,out] space tablespace
+@param[in] page page number in the extent
+@param[in,out] mtr mini-transaction
+@return error code */
+static
+dberr_t
+fseg_free_extent(
+ fseg_inode_t* seg_inode,
+ buf_block_t* iblock,
+ fil_space_t* space,
+ uint32_t page,
+ mtr_t* mtr
+#ifdef BTR_CUR_HASH_ADAPT
+ ,bool ahi=false
+#endif /* BTR_CUR_HASH_ADAPT */
+ )
+{
+ buf_block_t* xdes;
+ dberr_t err;
+ xdes_t* descr = xdes_get_descriptor(space, page, mtr, &err, &xdes);
+
+ if (!descr) {
+ return err;
+ }
+
+ if (UNIV_UNLIKELY(xdes_get_state(descr) != XDES_FSEG
+ || memcmp(descr + XDES_ID, seg_inode + FSEG_ID, 8)
+ || memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N
+ + seg_inode, 4))) {
+ return DB_CORRUPTION;
+ }
+ ut_d(space->modify_check(*mtr));
+ const uint32_t first_page_in_extent = page - (page % FSP_EXTENT_SIZE);
+
+ const uint16_t xoffset= uint16_t(descr - xdes->page.frame
+ + XDES_FLST_NODE);
+ const uint16_t ioffset= uint16_t(seg_inode - iblock->page.frame);
+
+#ifdef BTR_CUR_HASH_ADAPT
+ if (ahi) {
+ for (uint32_t i = 0; i < FSP_EXTENT_SIZE; i++) {
+ if (!xdes_is_free(descr, i)) {
+ /* Drop search system page hash index
+ if the page is found in the pool and
+ is hashed */
+ btr_search_drop_page_hash_when_freed(
+ page_id_t(space->id,
+ first_page_in_extent + i));
+ }
+ }
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ uint16_t lst;
+
+ if (xdes_is_full(descr)) {
+ lst = static_cast<uint16_t>(FSEG_FULL + ioffset);
+remove:
+ err = flst_remove(iblock, lst, xdes, xoffset, mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ return err;
+ }
+ } else if (!xdes_get_n_used(descr)) {
+ lst = static_cast<uint16_t>(FSEG_FREE + ioffset);
+ goto remove;
+ } else {
+ err = flst_remove(
+ iblock, static_cast<uint16_t>(FSEG_NOT_FULL + ioffset),
+ xdes, xoffset, mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ return err;
+ }
+ uint32_t not_full_n_used = mach_read_from_4(
+ FSEG_NOT_FULL_N_USED + seg_inode);
+ uint32_t descr_n_used = xdes_get_n_used(descr);
+ if (not_full_n_used < descr_n_used) {
+ return DB_CORRUPTION;
+ }
+ mtr->write<4>(*iblock, seg_inode + FSEG_NOT_FULL_N_USED,
+ not_full_n_used - descr_n_used);
+ }
+
+ std::vector<uint8_t> going_to_free;
+ static_assert(FSP_EXTENT_SIZE_MIN == 256, "compatibility");
+ static_assert(FSP_EXTENT_SIZE_MAX == 64, "compatibility");
+
+ for (uint32_t i = 0; i < FSP_EXTENT_SIZE; i++) {
+ if (!xdes_is_free(descr, i)) {
+ going_to_free.emplace_back(uint8_t(i));
+ }
+ }
+
+ if (dberr_t err = fsp_free_extent(space, page, mtr)) {
+ return err;
+ }
+
+ for (uint32_t i : going_to_free) {
+ mtr->free(*space, first_page_in_extent + i);
+ buf_page_free(space, first_page_in_extent + i, mtr);
+ }
+
+ return DB_SUCCESS;
+}
+
+/** Frees part of a segment. This function can be used to free
+a segment by repeatedly calling this function in different
+mini-transactions. Doing the freeing in a single mini-transaction
+might result in too big a mini-transaction.
+@param header segment header; NOTE: if the header resides on first
+ page of the frag list of the segment, this pointer
+ becomes obsolete after the last freeing step
+@param mtr mini-transaction
+@param ahi Drop the adaptive hash index
+@return whether the freeing was completed */
+bool
+fseg_free_step(
+ fseg_header_t* header,
+ mtr_t* mtr
+#ifdef BTR_CUR_HASH_ADAPT
+ ,bool ahi
+#endif /* BTR_CUR_HASH_ADAPT */
+ )
+{
+ ulint n;
+ fseg_inode_t* inode;
+
+ const uint32_t space_id = page_get_space_id(page_align(header));
+ const uint32_t header_page = page_get_page_no(page_align(header));
+
+ fil_space_t* space = mtr->x_lock_space(space_id);
+ xdes_t* descr = xdes_get_descriptor(space, header_page, mtr);
+
+ if (!descr) {
+ return true;
+ }
+
+ /* Check that the header resides on a page which has not been
+ freed yet */
+
+ if (UNIV_UNLIKELY(xdes_is_free(descr,
+ header_page & (FSP_EXTENT_SIZE - 1)))) {
+ /* Some corruption was detected: stop the freeing
+ in order to prevent a crash. */
+ return true;
+ }
+ buf_block_t* iblock;
+ const ulint zip_size = space->zip_size();
+ inode = fseg_inode_try_get(header, space_id, zip_size, mtr, &iblock);
+ if (!inode || space->is_stopping()) {
+ return true;
+ }
+
+ if (!space->full_crc32()) {
+ fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr);
+ }
+
+ dberr_t err;
+ descr = fseg_get_first_extent(inode, space, mtr, &err);
+
+ if (descr) {
+ /* Free the extent held by the segment */
+ return fseg_free_extent(inode, iblock, space,
+ xdes_get_offset(descr), mtr
+#ifdef BTR_CUR_HASH_ADAPT
+ , ahi
+#endif /* BTR_CUR_HASH_ADAPT */
+ ) != DB_SUCCESS;
+ }
+
+ if (err != DB_SUCCESS || space->is_stopping()) {
+ return true;
+ }
+
+ /* Free a frag page */
+ n = fseg_find_last_used_frag_page_slot(inode);
+
+ if (n == ULINT_UNDEFINED) {
+ /* Freeing completed: free the segment inode */
+ fsp_free_seg_inode(space, inode, iblock, mtr);
+ return true;
+ }
+
+ page_no_t page_no = fseg_get_nth_frag_page_no(inode, n);
+
+ if (fseg_free_page_low(inode, iblock, space, page_no, mtr
+#ifdef BTR_CUR_HASH_ADAPT
+ , ahi
+#endif /* BTR_CUR_HASH_ADAPT */
+ ) != DB_SUCCESS) {
+ return true;
+ }
+
+ buf_page_free(space, page_no, mtr);
+
+ n = fseg_find_last_used_frag_page_slot(inode);
+
+ if (n == ULINT_UNDEFINED) {
+ /* Freeing completed: free the segment inode */
+ fsp_free_seg_inode(space, inode, iblock, mtr);
+
+ return true;
+ }
+
+ return false;
+}
+
+bool
+fseg_free_step_not_header(
+ fseg_header_t* header,
+ mtr_t* mtr
+#ifdef BTR_CUR_HASH_ADAPT
+ ,bool ahi
+#endif /* BTR_CUR_HASH_ADAPT */
+ )
+{
+ fseg_inode_t* inode;
+
+ const uint32_t space_id = page_get_space_id(page_align(header));
+ ut_ad(mtr->is_named_space(space_id));
+
+ fil_space_t* space = mtr->x_lock_space(space_id);
+ buf_block_t* iblock;
+
+ inode = fseg_inode_try_get(header, space_id, space->zip_size(),
+ mtr, &iblock);
+ if (space->is_stopping()) {
+ return true;
+ }
+
+ if (!inode) {
+ ib::warn() << "Double free of "
+ << page_id_t(space_id,
+ page_get_page_no(page_align(header)));
+ return true;
+ }
+
+ if (!space->full_crc32()) {
+ fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr);
+ }
+
+ dberr_t err;
+ if (xdes_t* descr = fseg_get_first_extent(inode, space, mtr, &err)) {
+ /* Free the extent held by the segment */
+ return fseg_free_extent(inode, iblock, space,
+ xdes_get_offset(descr),
+ mtr
+#ifdef BTR_CUR_HASH_ADAPT
+ , ahi
+#endif /* BTR_CUR_HASH_ADAPT */
+ ) != DB_SUCCESS;
+ } else if (err != DB_SUCCESS) {
+ return true;
+ }
+
+ /* Free a frag page */
+
+ ulint n = fseg_find_last_used_frag_page_slot(inode);
+
+ if (UNIV_UNLIKELY(n == ULINT_UNDEFINED)) {
+ return true;
+ }
+
+ uint32_t page_no = fseg_get_nth_frag_page_no(inode, n);
+
+ if (page_no == page_get_page_no(page_align(header))) {
+ return true;
+ }
+
+ if (fseg_free_page_low(inode, iblock, space, page_no, mtr
+#ifdef BTR_CUR_HASH_ADAPT
+ , ahi
+#endif /* BTR_CUR_HASH_ADAPT */
+ ) != DB_SUCCESS) {
+ return true;
+ }
+ buf_page_free(space, page_no, mtr);
+ return false;
+}
+
+/** Returns the first extent descriptor for a segment.
+We think of the extent lists of the segment catenated in the order
+FSEG_FULL -> FSEG_NOT_FULL -> FSEG_FREE.
+@param[in] inode segment inode
+@param[in] space tablespace
+@param[in,out] mtr mini-transaction
+@return the first extent descriptor
+@retval nullptr if none, or on corruption */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+static
+xdes_t*
+fseg_get_first_extent(
+ fseg_inode_t* inode,
+ const fil_space_t* space,
+ mtr_t* mtr,
+ dberr_t* err)
+{
+ if (UNIV_UNLIKELY(space->id != page_get_space_id(page_align(inode)) ||
+ memcmp(inode + FSEG_MAGIC_N, FSEG_MAGIC_N_BYTES, 4)))
+ {
+ corrupted:
+ *err= DB_CORRUPTION;
+ return nullptr;
+ }
+
+ fil_addr_t first;
+
+ if (flst_get_len(inode + FSEG_FULL))
+ first= flst_get_first(inode + FSEG_FULL);
+ else if (flst_get_len(inode + FSEG_NOT_FULL))
+ first= flst_get_first(inode + FSEG_NOT_FULL);
+ else if (flst_get_len(inode + FSEG_FREE))
+ first= flst_get_first(inode + FSEG_FREE);
+ else
+ {
+ *err= DB_SUCCESS;
+ return nullptr;
+ }
+
+ if (first.page == FIL_NULL)
+ goto corrupted;
+
+ return xdes_lst_get_descriptor(*space, first, mtr, nullptr, err);
+}
+
+#ifdef UNIV_BTR_PRINT
+/*******************************************************************//**
+Writes info of a segment. */
+static void fseg_print_low(const fseg_inode_t *inode)
+{
+ ulint space;
+ ulint n_used;
+ ulint n_frag;
+ ulint n_free;
+ ulint n_not_full;
+ ulint n_full;
+ ulint reserved;
+ ulint used;
+ ulint page_no;
+ ib_id_t seg_id;
+
+ space = page_get_space_id(page_align(inode));
+ page_no = page_get_page_no(page_align(inode));
+
+ reserved = fseg_n_reserved_pages_low(inode, &used);
+
+ seg_id = mach_read_from_8(inode + FSEG_ID);
+ n_used = mach_read_from_4(inode + FSEG_NOT_FULL_N_USED);
+ n_frag = fseg_get_n_frag_pages(inode);
+ n_free = flst_get_len(inode + FSEG_FREE);
+ n_not_full = flst_get_len(inode + FSEG_NOT_FULL);
+ n_full = flst_get_len(inode + FSEG_FULL);
+
+ ib::info() << "SEGMENT id " << seg_id
+ << " space " << space << ";"
+ << " page " << page_no << ";"
+ << " res " << reserved << " used " << used << ";"
+ << " full ext " << n_full << ";"
+ << " fragm pages " << n_frag << ";"
+ << " free extents " << n_free << ";"
+ << " not full extents " << n_not_full << ": pages " << n_used;
+
+ ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4));
+}
+
+/*******************************************************************//**
+Writes info of a segment. */
+void
+fseg_print(
+/*=======*/
+ fseg_header_t* header, /*!< in: segment header */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ const fil_space_t *space=
+ mtr->x_lock_space(page_get_space_id(page_align(header)));
+ buf_block_t *block;
+ if (fseg_inode_t *inode=
+ fseg_inode_try_get(header, space->id, space->zip_size(), mtr, &block))
+ fseg_print_low(inode);
+}
+#endif /* UNIV_BTR_PRINT */
+
+#ifdef UNIV_DEBUG
+std::ostream &fseg_header::to_stream(std::ostream &out) const
+{
+ out << "[fseg_header_t: space="
+ << mach_read_from_4(m_header + FSEG_HDR_SPACE)
+ << ", page=" << mach_read_from_4(m_header + FSEG_HDR_PAGE_NO)
+ << ", offset=" << mach_read_from_2(m_header + FSEG_HDR_OFFSET) << "]";
+ return out;
+}
+#endif /* UNIV_DEBUG */
diff --git a/storage/innobase/fsp/fsp0space.cc b/storage/innobase/fsp/fsp0space.cc
new file mode 100644
index 00000000..c2152b08
--- /dev/null
+++ b/storage/innobase/fsp/fsp0space.cc
@@ -0,0 +1,224 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fsp/fsp0space.cc
+Shared tablespace implementation.
+
+Created 2012-11-16 by Sunny Bains as srv/srv0space.cc
+*******************************************************/
+
+#include "fsp0sysspace.h"
+#include "fsp0fsp.h"
+#include "os0file.h"
+#include "my_sys.h"
+
+/** Check if two tablespaces have common data file names.
+@param other_space Tablespace to check against this.
+@return true if they have the same data filenames and paths */
+bool
+Tablespace::intersection(
+ const Tablespace* other_space)
+{
+ for (files_t::const_iterator it(other_space->begin()),
+ end(other_space->end()); it != end; ++it) {
+
+ if (find(it->m_filename)) {
+
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+/** Frees the memory allocated by the SysTablespace object. */
+void
+Tablespace::shutdown()
+{
+ for (iterator it = begin(); it != end(); ++it) {
+ it->shutdown();
+ }
+
+ m_files.clear();
+ ut_free(m_path);
+ m_path = NULL;
+ m_space_id = UINT32_MAX;
+}
+
+/** Note that the data file was found.
+@param[in,out] file Data file object to set */
+void
+Tablespace::file_found(Datafile& file)
+{
+ /* Note that the file exists and can be opened
+ in the appropriate mode. */
+ file.m_exists = true;
+
+ file.set_open_flags(
+ &file == &m_files.front()
+ ? OS_FILE_OPEN_RETRY : OS_FILE_OPEN);
+}
+
+/** Open or Create the data files if they do not exist.
+@param[in] is_temp whether this is a temporary tablespace
+@return DB_SUCCESS or error code */
+dberr_t
+Tablespace::open_or_create(bool is_temp)
+{
+ fil_space_t* space = NULL;
+ dberr_t err = DB_SUCCESS;
+
+ ut_ad(!m_files.empty());
+
+ for (iterator it = begin(); it != end(); ++it) {
+ if (it->m_exists) {
+ err = it->open_or_create(
+ m_ignore_read_only
+ ? false : srv_read_only_mode);
+ if (err != DB_SUCCESS) {
+ return err;
+ }
+ } else {
+ err = it->open_or_create(
+ m_ignore_read_only
+ ? false : srv_read_only_mode);
+
+ if (err != DB_SUCCESS) {
+ return err;
+ }
+
+ /* Set the correct open flags now that we have
+ successfully created the file. */
+ file_found(*it);
+ }
+
+ /* We can close the handle now and open the tablespace
+ the proper way. */
+ it->close();
+
+ if (it == begin()) {
+ /* First data file. */
+
+ /* Create the tablespace entry for the multi-file
+ tablespace in the tablespace manager. */
+ uint32_t fsp_flags;
+
+ switch (srv_checksum_algorithm) {
+ case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
+ fsp_flags = (FSP_FLAGS_FCRC32_MASK_MARKER
+ | FSP_FLAGS_FCRC32_PAGE_SSIZE());
+ break;
+ default:
+ fsp_flags = FSP_FLAGS_PAGE_SSIZE();
+ }
+
+ mysql_mutex_lock(&fil_system.mutex);
+ space = fil_space_t::create(
+ m_space_id, fsp_flags,
+ is_temp
+ ? FIL_TYPE_TEMPORARY : FIL_TYPE_TABLESPACE,
+ NULL);
+ if (!space) {
+ mysql_mutex_unlock(&fil_system.mutex);
+ return DB_ERROR;
+ }
+ } else {
+ mysql_mutex_lock(&fil_system.mutex);
+ }
+ space->add(it->m_filepath, OS_FILE_CLOSED, it->m_size,
+ false, true);
+ mysql_mutex_unlock(&fil_system.mutex);
+ }
+
+ return(err);
+}
+
+/** Find a filename in the list of Datafiles for a tablespace
+@return true if the filename exists in the data files */
+bool
+Tablespace::find(const char* filename) const
+{
+ for (const_iterator it = begin(); it != end(); ++it) {
+
+ if (innobase_strcasecmp(filename, it->m_filename) == 0) {
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+/** Delete all the data files. */
+void
+Tablespace::delete_files()
+{
+ for (iterator it = begin(); it != end(); ++it) {
+
+ it->close();
+
+ bool file_pre_exists;
+ bool success = os_file_delete_if_exists(
+ innodb_data_file_key, it->m_filepath, &file_pre_exists);
+
+ if (success && file_pre_exists) {
+ ib::info() << "Removed temporary tablespace data"
+ " file: \"" << it->m_filepath << "\"";
+ }
+ }
+}
+
+/** Use the ADD DATAFILE path to create a Datafile object and add it to the
+front of m_files.
+Parse the datafile path into a path and a filename with extension 'ibd'.
+This datafile_path provided may or may not be an absolute path, but it
+must end with the extension .ibd and have a basename of at least 1 byte.
+
+Set tablespace m_path member and add a Datafile with the filename.
+@param[in] datafile_path full path of the tablespace file. */
+dberr_t Tablespace::add_datafile(const char *filepath)
+{
+ /* The path provided ends in ".ibd". This was assured by
+ validate_create_tablespace_info() */
+ ut_d(const char* dot = strrchr(filepath, '.'));
+ ut_ad(dot != NULL && 0 == strcmp(dot, DOT_IBD));
+
+ /* If the path is an absolute path, separate it onto m_path and a
+ basename. For relative paths, make the whole thing a basename so that
+ it can be appended to the datadir. */
+ bool is_abs_path = is_absolute_path(filepath);
+ size_t dirlen = (is_abs_path ? dirname_length(filepath) : 0);
+ const char* basename = filepath + dirlen;
+
+ /* If the pathname contains a directory separator, fill the
+ m_path member which is the default directory for files in this
+ tablespace. Leave it null otherwise. */
+ if (dirlen > 0) {
+ set_path(filepath, dirlen);
+ }
+
+ /* Now add a new Datafile and set the filepath
+ using the m_path created above. */
+ m_files.push_back(Datafile(m_flags, FIL_IBD_FILE_INITIAL_SIZE, 0));
+ m_files.back().make_filepath(m_path, {basename, strlen(basename) - 4},
+ IBD);
+
+ return(DB_SUCCESS);
+}
diff --git a/storage/innobase/fsp/fsp0sysspace.cc b/storage/innobase/fsp/fsp0sysspace.cc
new file mode 100644
index 00000000..e4a43e48
--- /dev/null
+++ b/storage/innobase/fsp/fsp0sysspace.cc
@@ -0,0 +1,1019 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fsp/fsp0space.cc
+Multi file, shared, system tablespace implementation.
+
+Created 2012-11-16 by Sunny Bains as srv/srv0space.cc
+Refactored 2013-7-26 by Kevin Lewis
+*******************************************************/
+
+#include "fsp0sysspace.h"
+#include "srv0start.h"
+#include "trx0sys.h"
+#include "dict0load.h"
+#include "mem0mem.h"
+#include "os0file.h"
+#include "row0mysql.h"
+#include "buf0dblwr.h"
+
+/** The server header file is included to access opt_initialize global variable.
+If server passes the option for create/open DB to SE, we should remove such
+direct reference to server header and global variable */
+#include "mysqld.h"
+
+/** The control info of the system tablespace. */
+SysTablespace srv_sys_space;
+
+/** The control info of a temporary table shared tablespace. */
+SysTablespace srv_tmp_space;
+
+/** If the last data file is auto-extended, we add this many pages to it
+at a time. We have to make this public because it is a config variable. */
+uint sys_tablespace_auto_extend_increment;
+
+/** Convert a numeric string that optionally ends in G or M or K,
+ to a number containing megabytes.
+@param[in] str String with a quantity in bytes
+@param[out] megs The number in megabytes
+@return next character in string */
+char*
+SysTablespace::parse_units(
+ char* ptr,
+ ulint* megs)
+{
+ char* endp;
+
+ *megs = strtoul(ptr, &endp, 10);
+
+ ptr = endp;
+
+ switch (*ptr) {
+ case 'G': case 'g':
+ *megs *= 1024;
+ /* fall through */
+ case 'M': case 'm':
+ ++ptr;
+ break;
+ case 'K': case 'k':
+ *megs /= 1024;
+ ++ptr;
+ break;
+ default:
+ *megs /= 1024 * 1024;
+ break;
+ }
+
+ return(ptr);
+}
+
+/** Parse the input params and populate member variables.
+@param[in] filepath path to data files
+@param[in] supports_raw true if the tablespace supports raw devices
+@return true on success parse */
+bool
+SysTablespace::parse_params(
+ const char* filepath_spec,
+ bool supports_raw)
+{
+ char* filepath;
+ ulint size;
+ char* input_str;
+ ulint n_files = 0;
+
+ ut_ad(m_last_file_size_max == 0);
+ ut_ad(!m_auto_extend_last_file);
+
+ char* new_str = mem_strdup(filepath_spec);
+ char* str = new_str;
+
+ input_str = str;
+
+ /*---------------------- PASS 1 ---------------------------*/
+ /* First calculate the number of data files and check syntax:
+ filepath:size[K |M | G];filepath:size[K |M | G]... .
+ Note that a Windows path may contain a drive name and a ':'. */
+ while (*str != '\0') {
+ filepath = str;
+
+ while ((*str != ':' && *str != '\0')
+ || (*str == ':'
+ && (*(str + 1) == '\\' || *(str + 1) == '/'
+ || *(str + 1) == ':'))) {
+ str++;
+ }
+
+ if (*str == '\0') {
+ ut_free(new_str);
+
+ ib::error()
+ << "syntax error in file path or size"
+ " specified is less than 1 megabyte";
+ return(false);
+ }
+
+ str++;
+
+ str = parse_units(str, &size);
+
+ if (0 == strncmp(str, ":autoextend",
+ (sizeof ":autoextend") - 1)) {
+
+ str += (sizeof ":autoextend") - 1;
+
+ if (0 == strncmp(str, ":max:",
+ (sizeof ":max:") - 1)) {
+
+ str += (sizeof ":max:") - 1;
+
+ str = parse_units(str, &size);
+ }
+
+ if (*str != '\0') {
+ ut_free(new_str);
+ ib::error()
+ << "syntax error in file path or"
+ << " size specified is less than"
+ << " 1 megabyte";
+ return(false);
+ }
+ }
+
+ if (::strlen(str) >= 6
+ && *str == 'n'
+ && *(str + 1) == 'e'
+ && *(str + 2) == 'w') {
+
+ if (!supports_raw) {
+ ib::error()
+ << "Tablespace doesn't support raw"
+ " devices";
+ ut_free(new_str);
+ return(false);
+ }
+
+ str += 3;
+ }
+
+ if (*str == 'r' && *(str + 1) == 'a' && *(str + 2) == 'w') {
+ str += 3;
+
+ if (!supports_raw) {
+ ib::error()
+ << "Tablespace doesn't support raw"
+ " devices";
+ ut_free(new_str);
+ return(false);
+ }
+ }
+
+ if (size == 0) {
+
+ ut_free(new_str);
+
+ ib::error()
+ << "syntax error in file path or size"
+ " specified is less than 1 megabyte";
+
+ return(false);
+ }
+
+ ++n_files;
+
+ if (*str == ';') {
+ str++;
+ } else if (*str != '\0') {
+ ut_free(new_str);
+
+ ib::error()
+ << "syntax error in file path or size"
+ " specified is less than 1 megabyte";
+ return(false);
+ }
+ }
+
+ if (n_files == 0) {
+
+ /* filepath_spec must contain at least one data file
+ definition */
+
+ ut_free(new_str);
+
+ ib::error()
+ << "syntax error in file path or size specified"
+ " is less than 1 megabyte";
+
+ return(false);
+ }
+
+ /*---------------------- PASS 2 ---------------------------*/
+ /* Then store the actual values to our arrays */
+ str = input_str;
+ ulint order = 0;
+
+ while (*str != '\0') {
+ filepath = str;
+
+ /* Note that we must step over the ':' in a Windows filepath;
+ a Windows path normally looks like C:\ibdata\ibdata1:1G, but
+ a Windows raw partition may have a specification like
+ \\.\C::1Gnewraw or \\.\PHYSICALDRIVE2:1Gnewraw */
+
+ while ((*str != ':' && *str != '\0')
+ || (*str == ':'
+ && (*(str + 1) == '\\' || *(str + 1) == '/'
+ || *(str + 1) == ':'))) {
+ str++;
+ }
+
+ if (*str == ':') {
+ /* Make filepath a null-terminated string */
+ *str = '\0';
+ str++;
+ }
+
+ str = parse_units(str, &size);
+
+ if (0 == strncmp(str, ":autoextend",
+ (sizeof ":autoextend") - 1)) {
+
+ m_auto_extend_last_file = true;
+
+ str += (sizeof ":autoextend") - 1;
+
+ if (0 == strncmp(str, ":max:",
+ (sizeof ":max:") - 1)) {
+
+ str += (sizeof ":max:") - 1;
+
+ str = parse_units(str, &m_last_file_size_max);
+ }
+
+ if (*str != '\0') {
+ ut_free(new_str);
+ ib::error() << "syntax error in file path or"
+ " size specified is less than 1"
+ " megabyte";
+ return(false);
+ }
+ }
+
+ m_files.push_back(Datafile(flags(), uint32_t(size), order));
+ m_files.back().make_filepath(path(),
+ {filepath, strlen(filepath)},
+ NO_EXT);
+
+ if (::strlen(str) >= 6
+ && *str == 'n'
+ && *(str + 1) == 'e'
+ && *(str + 2) == 'w') {
+
+ ut_a(supports_raw);
+
+ str += 3;
+
+ /* Initialize new raw device only during initialize */
+ /* JAN: TODO: MySQL 5.7 used opt_initialize */
+ m_files.back().m_type =
+ opt_bootstrap ? SRV_NEW_RAW : SRV_OLD_RAW;
+ }
+
+ if (*str == 'r' && *(str + 1) == 'a' && *(str + 2) == 'w') {
+
+ ut_a(supports_raw);
+
+ str += 3;
+
+ /* Initialize new raw device only during initialize */
+ if (m_files.back().m_type == SRV_NOT_RAW) {
+ /* JAN: TODO: MySQL 5.7 used opt_initialize */
+ m_files.back().m_type =
+ opt_bootstrap ? SRV_NEW_RAW : SRV_OLD_RAW;
+ }
+ }
+
+ if (*str == ';') {
+ ++str;
+ }
+ order++;
+ }
+
+ ut_ad(n_files == ulint(m_files.size()));
+
+ ut_free(new_str);
+
+ return(true);
+}
+
+/** Frees the memory allocated by the parse method. */
+void
+SysTablespace::shutdown()
+{
+ Tablespace::shutdown();
+
+ m_auto_extend_last_file = 0;
+ m_last_file_size_max = 0;
+ m_created_new_raw = 0;
+ m_is_tablespace_full = false;
+ m_sanity_checks_done = false;
+}
+
+/** Verify the size of the physical file.
+@param[in] file data file object
+@return DB_SUCCESS if OK else error code. */
+dberr_t
+SysTablespace::check_size(
+ Datafile& file)
+{
+ os_offset_t size = os_file_get_size(file.m_handle);
+ ut_a(size != (os_offset_t) -1);
+
+ /* Under some error conditions like disk full scenarios
+ or file size reaching filesystem limit the data file
+ could contain an incomplete extent at the end. When we
+ extend a data file and if some failure happens, then
+ also the data file could contain an incomplete extent.
+ So we need to round the size downward to a megabyte.*/
+
+ const uint32_t rounded_size_pages = static_cast<uint32_t>(
+ size >> srv_page_size_shift);
+
+ /* If last file */
+ if (&file == &m_files.back() && m_auto_extend_last_file) {
+
+ if (file.m_size > rounded_size_pages
+ || (m_last_file_size_max > 0
+ && m_last_file_size_max < rounded_size_pages)) {
+ ib::error() << "The Auto-extending data file '"
+ << file.filepath()
+ << "' is of a different size "
+ << rounded_size_pages
+ << " pages than specified"
+ " by innodb_data_file_path";
+ return(DB_ERROR);
+ }
+
+ file.m_size = rounded_size_pages;
+ }
+
+ if (rounded_size_pages != file.m_size) {
+ ib::error() << "The data file '"
+ << file.filepath() << "' is of a different size "
+ << rounded_size_pages << " pages"
+ " than the " << file.m_size << " pages specified by"
+ " innodb_data_file_path";
+ return(DB_ERROR);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/** Set the size of the file.
+@param[in] file data file object
+@return DB_SUCCESS or error code */
+dberr_t
+SysTablespace::set_size(
+ Datafile& file)
+{
+ ut_ad(!srv_read_only_mode || m_ignore_read_only);
+ const ib::bytes_iec b{uint64_t{file.m_size} << srv_page_size_shift};
+
+ /* We created the data file and now write it full of zeros */
+ ib::info() << "Setting file '" << file.filepath() << "' size to " << b
+ << ". Physically writing the file full; Please wait ...";
+
+ bool success = os_file_set_size(
+ file.m_filepath, file.m_handle,
+ static_cast<os_offset_t>(file.m_size) << srv_page_size_shift);
+
+ if (success) {
+ ib::info() << "File '" << file.filepath() << "' size is now "
+ << b
+ << ".";
+ } else {
+ ib::error() << "Could not set the file size of '"
+ << file.filepath() << "'. Probably out of disk space";
+
+ return(DB_ERROR);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/** Create a data file.
+@param[in] file data file object
+@return DB_SUCCESS or error code */
+dberr_t
+SysTablespace::create_file(
+ Datafile& file)
+{
+ dberr_t err = DB_SUCCESS;
+
+ ut_a(!file.m_exists);
+ ut_ad(!srv_read_only_mode || m_ignore_read_only);
+
+ switch (file.m_type) {
+ case SRV_NEW_RAW:
+
+ /* The partition is opened, not created; then it is
+ written over */
+ m_created_new_raw = true;
+
+ /* Fall through. */
+
+ case SRV_OLD_RAW:
+
+ srv_start_raw_disk_in_use = TRUE;
+
+ /* Fall through. */
+
+ case SRV_NOT_RAW:
+ err = file.open_or_create(
+ !m_ignore_read_only && srv_read_only_mode);
+ break;
+ }
+
+ if (err != DB_SUCCESS) {
+ return err;
+ }
+
+ switch (file.m_type) {
+ case SRV_OLD_RAW:
+ break;
+ case SRV_NOT_RAW:
+#ifndef _WIN32
+ if (!space_id() && my_disable_locking
+ && os_file_lock(file.m_handle, file.m_filepath)) {
+ err = DB_ERROR;
+ break;
+ }
+#endif
+ /* fall through */
+ case SRV_NEW_RAW:
+ err = set_size(file);
+ }
+
+ return(err);
+}
+
+/** Open a data file.
+@param[in] file data file object
+@return DB_SUCCESS or error code */
+dberr_t
+SysTablespace::open_file(
+ Datafile& file)
+{
+ dberr_t err = DB_SUCCESS;
+
+ ut_a(file.m_exists);
+
+ switch (file.m_type) {
+ case SRV_NEW_RAW:
+ /* The partition is opened, not created; then it is
+ written over */
+ m_created_new_raw = true;
+
+ /* Fall through */
+
+ case SRV_OLD_RAW:
+ srv_start_raw_disk_in_use = TRUE;
+
+ if (srv_read_only_mode && !m_ignore_read_only) {
+ ib::error() << "Can't open a raw device '"
+ << file.m_filepath << "' when"
+ " --innodb-read-only is set";
+
+ return(DB_ERROR);
+ }
+
+ /* Fall through */
+
+ case SRV_NOT_RAW:
+ err = file.open_or_create(
+ !m_ignore_read_only && srv_read_only_mode);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ break;
+ }
+
+ switch (file.m_type) {
+ case SRV_NEW_RAW:
+ /* Set file size for new raw device. */
+ err = set_size(file);
+ break;
+
+ case SRV_NOT_RAW:
+#ifndef _WIN32
+ if (!space_id() && (m_ignore_read_only || !srv_read_only_mode)
+ && my_disable_locking
+ && os_file_lock(file.m_handle, file.m_filepath)) {
+ err = DB_ERROR;
+ break;
+ }
+#endif
+ /* Check file size for existing file. */
+ err = check_size(file);
+ break;
+
+ case SRV_OLD_RAW:
+ err = DB_SUCCESS;
+ break;
+
+ }
+
+ if (err != DB_SUCCESS) {
+ file.close();
+ }
+
+ return(err);
+}
+
+/** Check the tablespace header for this tablespace.
+@return DB_SUCCESS or error code */
+inline dberr_t SysTablespace::read_lsn_and_check_flags()
+{
+ dberr_t err;
+
+ files_t::iterator it = m_files.begin();
+
+ ut_a(it->m_exists);
+
+ if (it->m_handle == OS_FILE_CLOSED) {
+
+ err = it->open_or_create(
+ m_ignore_read_only ? false : srv_read_only_mode);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ }
+
+ err = it->read_first_page(
+ m_ignore_read_only ? false : srv_read_only_mode);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ ut_a(it->order() == 0);
+
+ if (srv_operation <= SRV_OPERATION_EXPORT_RESTORED) {
+ buf_dblwr.init_or_load_pages(it->handle(), it->filepath());
+ }
+
+ /* Check the contents of the first page of the
+ first datafile. */
+ for (int retry = 0; retry < 2; ++retry) {
+
+ err = it->validate_first_page();
+
+ if (err != DB_SUCCESS
+ && (retry == 1
+ || recv_sys.dblwr.restore_first_page(
+ it->m_space_id, it->m_filepath,
+ it->handle()))) {
+
+ it->close();
+
+ return(err);
+ }
+ }
+
+ /* Make sure the tablespace space ID matches the
+ space ID on the first page of the first datafile. */
+ if (space_id() != it->m_space_id) {
+
+ ib::error()
+ << "The data file '" << it->filepath()
+ << "' has the wrong space ID. It should be "
+ << space_id() << ", but " << it->m_space_id
+ << " was found";
+
+ it->close();
+
+ return(err);
+ }
+
+ if (srv_operation == SRV_OPERATION_NORMAL) {
+ /* Prepare for possible upgrade from 0-sized ib_logfile0. */
+ ut_ad(!log_sys.next_checkpoint_lsn);
+ log_sys.next_checkpoint_lsn = mach_read_from_8(
+ it->m_first_page + 26/*FIL_PAGE_FILE_FLUSH_LSN*/);
+ }
+
+ it->close();
+
+ return(DB_SUCCESS);
+}
+
+/** Check if a file can be opened in the correct mode.
+@param[in] file data file object
+@param[out] reason exact reason if file_status check failed.
+@return DB_SUCCESS or error code. */
+dberr_t
+SysTablespace::check_file_status(
+ const Datafile& file,
+ file_status_t& reason)
+{
+ os_file_stat_t stat;
+
+ memset(&stat, 0x0, sizeof(stat));
+
+ dberr_t err = os_file_get_status(
+ file.m_filepath, &stat, true,
+ m_ignore_read_only ? false : srv_read_only_mode);
+
+ reason = FILE_STATUS_VOID;
+ /* File exists but we can't read the rw-permission settings. */
+ switch (err) {
+ case DB_FAIL:
+ ib::error() << "os_file_get_status() failed on '"
+ << file.filepath()
+ << "'. Can't determine file permissions";
+ err = DB_ERROR;
+ reason = FILE_STATUS_RW_PERMISSION_ERROR;
+ break;
+
+ case DB_SUCCESS:
+ /* Note: stat.rw_perm is only valid for "regular" files */
+
+ if (stat.type == OS_FILE_TYPE_FILE) {
+ if (!stat.rw_perm) {
+ ib::error() << "The data file"
+ << " '" << file.filepath()
+ << ((!srv_read_only_mode
+ || m_ignore_read_only)
+ ? "' must be writable"
+ : "' must be readable");
+
+ err = DB_ERROR;
+ reason = FILE_STATUS_READ_WRITE_ERROR;
+ }
+
+ } else {
+ /* Not a regular file, bail out. */
+ ib::error() << "The data file '" << file.filepath()
+ << "' is not a regular file.";
+
+ err = DB_ERROR;
+ reason = FILE_STATUS_NOT_REGULAR_FILE_ERROR;
+ }
+ break;
+
+ case DB_NOT_FOUND:
+ break;
+
+ default:
+ ut_ad(0);
+ }
+
+ return(err);
+}
+
+/** Note that the data file was not found.
+@param[in] file data file object
+@param[out] create_new_db true if a new instance to be created
+@return DB_SUCESS or error code */
+dberr_t
+SysTablespace::file_not_found(
+ Datafile& file,
+ bool* create_new_db)
+{
+ file.m_exists = false;
+
+ if (m_ignore_read_only) {
+ } else if (srv_read_only_mode) {
+ ib::error() << "Can't create file '" << file.filepath()
+ << "' when --innodb-read-only is set";
+ return(DB_ERROR);
+ } else if (srv_force_recovery && space_id() == TRX_SYS_SPACE) {
+ ib::error() << "Can't create file '" << file.filepath()
+ << "' when --innodb-force-recovery is set";
+ return DB_ERROR;
+ }
+
+ if (&file == &m_files.front()) {
+
+ /* First data file. */
+ ut_a(!*create_new_db);
+ *create_new_db = TRUE;
+
+ if (space_id() == TRX_SYS_SPACE) {
+ ib::info() << "The first data file '"
+ << file.filepath() << "' did not exist."
+ " A new tablespace will be created!";
+ }
+
+ } else {
+ ib::info() << "Need to create a new data file '"
+ << file.filepath() << "'.";
+ }
+
+ /* Set the file create mode. */
+ switch (file.m_type) {
+ case SRV_NOT_RAW:
+ file.set_open_flags(OS_FILE_CREATE);
+ break;
+
+ case SRV_NEW_RAW:
+ case SRV_OLD_RAW:
+ file.set_open_flags(OS_FILE_OPEN_RAW);
+ break;
+ }
+
+ return(DB_SUCCESS);
+}
+
+/** Note that the data file was found.
+@param[in,out] file data file object
+@return true if a new instance to be created */
+bool
+SysTablespace::file_found(
+ Datafile& file)
+{
+ /* Note that the file exists and can be opened
+ in the appropriate mode. */
+ file.m_exists = true;
+
+ /* Set the file open mode */
+ switch (file.m_type) {
+ case SRV_NOT_RAW:
+ file.set_open_flags(
+ &file == &m_files.front()
+ ? OS_FILE_OPEN_RETRY : OS_FILE_OPEN);
+ break;
+
+ case SRV_NEW_RAW:
+ case SRV_OLD_RAW:
+ file.set_open_flags(OS_FILE_OPEN_RAW);
+ break;
+ }
+
+ /* Need to create the system tablespace for new raw device. */
+ return(file.m_type == SRV_NEW_RAW);
+}
+
+/** Check the data file specification.
+@param[out] create_new_db true if a new database is to be created
+@param[in] min_expected_size Minimum expected tablespace size in bytes
+@return DB_SUCCESS if all OK else error code */
+dberr_t
+SysTablespace::check_file_spec(
+ bool* create_new_db,
+ ulint min_expected_size)
+{
+ *create_new_db = FALSE;
+
+ if (m_files.size() >= 1000) {
+ ib::error() << "There must be < 1000 data files "
+ " but " << m_files.size() << " have been"
+ " defined.";
+
+ return(DB_ERROR);
+ }
+
+ if (!m_auto_extend_last_file
+ && get_sum_of_sizes()
+ < (min_expected_size >> srv_page_size_shift)) {
+ ib::error() << "Tablespace size must be at least "
+ << (min_expected_size >> 20) << " MB";
+ return(DB_ERROR);
+ }
+
+ dberr_t err = DB_SUCCESS;
+
+ ut_a(!m_files.empty());
+
+ /* If there is more than one data file and the last data file
+ doesn't exist, that is OK. We allow adding of new data files. */
+
+ files_t::iterator begin = m_files.begin();
+ files_t::iterator end = m_files.end();
+
+ for (files_t::iterator it = begin; it != end; ++it) {
+
+ file_status_t reason_if_failed;
+ err = check_file_status(*it, reason_if_failed);
+
+ if (err == DB_NOT_FOUND) {
+
+ err = file_not_found(*it, create_new_db);
+
+ if (err != DB_SUCCESS) {
+ break;
+ }
+
+ } else if (err != DB_SUCCESS) {
+ if (reason_if_failed == FILE_STATUS_READ_WRITE_ERROR) {
+ ib::error() << "The data file '"
+ << it->filepath()
+ << ((!srv_read_only_mode
+ || m_ignore_read_only)
+ ? "' must be writable"
+ : "' must be readable");
+ }
+
+ ut_a(err != DB_FAIL);
+ break;
+
+ } else if (*create_new_db) {
+ ib::error() << "The data file '"
+ << begin->filepath()
+ << "' was not found but"
+ " one of the other data files '"
+ << it->filepath() << "' exists.";
+
+ err = DB_ERROR;
+ break;
+
+ } else {
+ *create_new_db = file_found(*it);
+ }
+ }
+
+ return(err);
+}
+
+/** Open or create the data files
+@param[in] is_temp whether this is a temporary tablespace
+@param[in] create_new_db whether we are creating a new database
+@param[out] sum_new_sizes sum of sizes of the new files added
+@return DB_SUCCESS or error code */
+dberr_t
+SysTablespace::open_or_create(
+ bool is_temp,
+ bool create_new_db,
+ ulint* sum_new_sizes)
+{
+ dberr_t err = DB_SUCCESS;
+ fil_space_t* space = NULL;
+
+ ut_ad(!m_files.empty());
+
+ if (sum_new_sizes) {
+ *sum_new_sizes = 0;
+ }
+
+ files_t::iterator begin = m_files.begin();
+ files_t::iterator end = m_files.end();
+
+ ut_ad(begin->order() == 0);
+
+ for (files_t::iterator it = begin; it != end; ++it) {
+
+ if (it->m_exists) {
+ err = open_file(*it);
+
+ /* For new raw device increment new size. */
+ if (sum_new_sizes && it->m_type == SRV_NEW_RAW) {
+
+ *sum_new_sizes += it->m_size;
+ }
+
+ } else {
+ err = create_file(*it);
+
+ if (sum_new_sizes) {
+ *sum_new_sizes += it->m_size;
+ }
+
+ /* Set the correct open flags now that we have
+ successfully created the file. */
+ if (err == DB_SUCCESS) {
+ /* We ignore new_db OUT parameter here
+ as the information is known at this stage */
+ file_found(*it);
+ }
+ }
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ }
+
+ if (!create_new_db && space_id() == TRX_SYS_SPACE) {
+ /* Validate the header page in the first datafile. */
+ err = read_lsn_and_check_flags();
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ }
+
+ /* Close the curent handles, add space and file info to the
+ fil_system cache and the Data Dictionary, and re-open them
+ in file_system cache so that they stay open until shutdown. */
+ mysql_mutex_lock(&fil_system.mutex);
+ ulint node_counter = 0;
+ for (files_t::iterator it = begin; it != end; ++it) {
+ it->close();
+ it->m_exists = true;
+
+ if (it != begin) {
+ } else if (is_temp) {
+ ut_ad(space_id() == SRV_TMP_SPACE_ID);
+ space = fil_space_t::create(
+ SRV_TMP_SPACE_ID, flags(),
+ FIL_TYPE_TEMPORARY, NULL);
+ ut_ad(space == fil_system.temp_space);
+ if (!space) {
+ err = DB_ERROR;
+ break;
+ }
+ ut_ad(!space->is_compressed());
+ ut_ad(space->full_crc32());
+ } else {
+ ut_ad(space_id() == TRX_SYS_SPACE);
+ space = fil_space_t::create(
+ TRX_SYS_SPACE, it->flags(),
+ FIL_TYPE_TABLESPACE, NULL);
+ ut_ad(space == fil_system.sys_space);
+ if (!space) {
+ err = DB_ERROR;
+ break;
+ }
+ }
+
+ uint32_t max_size = (++node_counter == m_files.size()
+ ? (m_last_file_size_max == 0
+ ? UINT32_MAX
+ : uint32_t(m_last_file_size_max))
+ : it->m_size);
+
+ space->add(it->m_filepath, OS_FILE_CLOSED, it->m_size,
+ it->m_type != SRV_NOT_RAW, true, max_size);
+ }
+
+ mysql_mutex_unlock(&fil_system.mutex);
+ return(err);
+}
+
+/** Normalize the file size, convert from megabytes to number of pages. */
+void
+SysTablespace::normalize_size()
+{
+ files_t::iterator end = m_files.end();
+
+ for (files_t::iterator it = m_files.begin(); it != end; ++it) {
+
+ it->m_size <<= (20U - srv_page_size_shift);
+ }
+
+ m_last_file_size_max <<= (20U - srv_page_size_shift);
+}
+
+
+/**
+@return next increment size */
+uint32_t SysTablespace::get_increment() const
+{
+ if (m_last_file_size_max == 0)
+ return get_autoextend_increment();
+
+ if (!is_valid_size())
+ {
+ ib::error() << "The last data file has a size of " << last_file_size()
+ << " but the max size allowed is "
+ << m_last_file_size_max;
+ }
+
+ return std::min(uint32_t(m_last_file_size_max) - last_file_size(),
+ get_autoextend_increment());
+}
+
+
+/**
+@return true if configured to use raw devices */
+bool
+SysTablespace::has_raw_device()
+{
+ files_t::iterator end = m_files.end();
+
+ for (files_t::iterator it = m_files.begin(); it != end; ++it) {
+
+ if (it->is_raw_device()) {
+ return(true);
+ }
+ }
+
+ return(false);
+}