diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:07:14 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:07:14 +0000 |
commit | a175314c3e5827eb193872241446f2f8f5c9d33c (patch) | |
tree | cd3d60ca99ae00829c52a6ca79150a5b6e62528b /storage/innobase/fsp | |
parent | Initial commit. (diff) | |
download | mariadb-10.5-a175314c3e5827eb193872241446f2f8f5c9d33c.tar.xz mariadb-10.5-a175314c3e5827eb193872241446f2f8f5c9d33c.zip |
Adding upstream version 1:10.5.12.upstream/1%10.5.12upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'storage/innobase/fsp')
-rw-r--r-- | storage/innobase/fsp/fsp0file.cc | 1043 | ||||
-rw-r--r-- | storage/innobase/fsp/fsp0fsp.cc | 2890 | ||||
-rw-r--r-- | storage/innobase/fsp/fsp0space.cc | 230 | ||||
-rw-r--r-- | storage/innobase/fsp/fsp0sysspace.cc | 994 |
4 files changed, 5157 insertions, 0 deletions
diff --git a/storage/innobase/fsp/fsp0file.cc b/storage/innobase/fsp/fsp0file.cc new file mode 100644 index 00000000..57164113 --- /dev/null +++ b/storage/innobase/fsp/fsp0file.cc @@ -0,0 +1,1043 @@ +/***************************************************************************** + +Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file fsp/fsp0file.cc +Tablespace data file implementation + +Created 2013-7-26 by Kevin Lewis +*******************************************************/ + +#include "fil0fil.h" +#include "fsp0types.h" +#include "os0file.h" +#include "page0page.h" +#include "srv0start.h" + +/** Initialize the name, size and order of this datafile +@param[in] name tablespace name, will be copied +@param[in] flags tablespace flags */ +void +Datafile::init( + const char* name, + ulint flags) +{ + ut_ad(m_name == NULL); + ut_ad(name != NULL); + + m_name = mem_strdup(name); + m_flags = flags; +} + +/** Release the resources. */ +void +Datafile::shutdown() +{ + close(); + + ut_free(m_name); + m_name = NULL; + free_filepath(); + free_first_page(); +} + +/** Create/open a data file. +@param[in] read_only_mode if true, then readonly mode checks are enforced. +@return DB_SUCCESS or error code */ +dberr_t +Datafile::open_or_create(bool read_only_mode) +{ + bool success; + ut_a(m_filepath != NULL); + ut_ad(m_handle == OS_FILE_CLOSED); + + m_handle = os_file_create( + innodb_data_file_key, m_filepath, m_open_flags, + OS_FILE_NORMAL, OS_DATA_FILE, read_only_mode, &success); + + if (!success) { + m_last_os_error = os_file_get_last_error(true); + ib::error() << "Cannot open datafile '" << m_filepath << "'"; + return(DB_CANNOT_OPEN_FILE); + } + + return(DB_SUCCESS); +} + +/** Open a data file in read-only mode to check if it exists so that it +can be validated. +@param[in] strict whether to issue error messages +@return DB_SUCCESS or error code */ +dberr_t +Datafile::open_read_only(bool strict) +{ + bool success = false; + ut_ad(m_handle == OS_FILE_CLOSED); + + /* This function can be called for file objects that do not need + to be opened, which is the case when the m_filepath is NULL */ + if (m_filepath == NULL) { + return(DB_ERROR); + } + + set_open_flags(OS_FILE_OPEN); + m_handle = os_file_create_simple_no_error_handling( + innodb_data_file_key, m_filepath, m_open_flags, + OS_FILE_READ_ONLY, true, &success); + + if (success) { + m_exists = true; + init_file_info(); + + return(DB_SUCCESS); + } + + if (strict) { + m_last_os_error = os_file_get_last_error(true); + ib::error() << "Cannot open datafile for read-only: '" + << m_filepath << "' OS error: " << m_last_os_error; + } + + return(DB_CANNOT_OPEN_FILE); +} + +/** Open a data file in read-write mode during start-up so that +doublewrite pages can be restored and then it can be validated.* +@param[in] read_only_mode if true, then readonly mode checks are enforced. +@return DB_SUCCESS or error code */ +dberr_t +Datafile::open_read_write(bool read_only_mode) +{ + bool success = false; + ut_ad(m_handle == OS_FILE_CLOSED); + + /* This function can be called for file objects that do not need + to be opened, which is the case when the m_filepath is NULL */ + if (m_filepath == NULL) { + return(DB_ERROR); + } + + set_open_flags(OS_FILE_OPEN); + m_handle = os_file_create_simple_no_error_handling( + innodb_data_file_key, m_filepath, m_open_flags, + OS_FILE_READ_WRITE, read_only_mode, &success); + + if (!success) { + m_last_os_error = os_file_get_last_error(true); + ib::error() << "Cannot open datafile for read-write: '" + << m_filepath << "'"; + return(DB_CANNOT_OPEN_FILE); + } + + m_exists = true; + + init_file_info(); + + return(DB_SUCCESS); +} + +/** Initialize OS specific file info. */ +void +Datafile::init_file_info() +{ +#ifdef _WIN32 + GetFileInformationByHandle((os_file_t)m_handle, &m_file_info); +#else + fstat(m_handle, &m_file_info); +#endif /* WIN32 */ +} + +/** Close a data file. +@return DB_SUCCESS or error code */ +dberr_t +Datafile::close() +{ + if (m_handle != OS_FILE_CLOSED) { + ibool success = os_file_close(m_handle); + ut_a(success); + + m_handle = OS_FILE_CLOSED; + } + + return(DB_SUCCESS); +} + +/** Make a full filepath from a directory path and a filename. +Prepend the dirpath to filename using the extension given. +If dirpath is NULL, prepend the default datadir to filepath. +Store the result in m_filepath. +@param[in] dirpath directory path +@param[in] filename filename or filepath +@param[in] ext filename extension */ +void +Datafile::make_filepath( + const char* dirpath, + const char* filename, + ib_extention ext) +{ + ut_ad(dirpath != NULL || filename != NULL); + + free_filepath(); + + m_filepath = fil_make_filepath(dirpath, filename, ext, false); + + ut_ad(m_filepath != NULL); + + set_filename(); +} + +/** Set the filepath by duplicating the filepath sent in. This is the +name of the file with its extension and absolute or relative path. +@param[in] filepath filepath to set */ +void +Datafile::set_filepath(const char* filepath) +{ + free_filepath(); + m_filepath = static_cast<char*>(ut_malloc_nokey(strlen(filepath) + 1)); + ::strcpy(m_filepath, filepath); + set_filename(); +} + +/** Free the filepath buffer. */ +void +Datafile::free_filepath() +{ + if (m_filepath != NULL) { + ut_free(m_filepath); + m_filepath = NULL; + m_filename = NULL; + } +} + +/** Do a quick test if the filepath provided looks the same as this filepath +byte by byte. If they are two different looking paths to the same file, +same_as() will be used to show that after the files are opened. +@param[in] other filepath to compare with +@retval true if it is the same filename by byte comparison +@retval false if it looks different */ +bool +Datafile::same_filepath_as( + const char* other) const +{ + return(0 == strcmp(m_filepath, other)); +} + +/** Test if another opened datafile is the same file as this object. +@param[in] other Datafile to compare with +@return true if it is the same file, else false */ +bool +Datafile::same_as( + const Datafile& other) const +{ +#ifdef _WIN32 + return(m_file_info.dwVolumeSerialNumber + == other.m_file_info.dwVolumeSerialNumber + && m_file_info.nFileIndexHigh + == other.m_file_info.nFileIndexHigh + && m_file_info.nFileIndexLow + == other.m_file_info.nFileIndexLow); +#else + return(m_file_info.st_ino == other.m_file_info.st_ino + && m_file_info.st_dev == other.m_file_info.st_dev); +#endif /* WIN32 */ +} + +/** Allocate and set the datafile or tablespace name in m_name. +If a name is provided, use it; else extract a file-per-table +tablespace name from m_filepath. The value of m_name +will be freed in the destructor. +@param[in] name tablespace name if known, NULL if not */ +void +Datafile::set_name(const char* name) +{ + ut_free(m_name); + + if (name != NULL) { + m_name = mem_strdup(name); + } else { + m_name = fil_path_to_space_name(m_filepath); + } +} + +/** Reads a few significant fields from the first page of the first +datafile. The Datafile must already be open. +@param[in] read_only_mode If true, then readonly mode checks are enforced. +@return DB_SUCCESS or DB_IO_ERROR if page cannot be read */ +dberr_t +Datafile::read_first_page(bool read_only_mode) +{ + if (m_handle == OS_FILE_CLOSED) { + + dberr_t err = open_or_create(read_only_mode); + + if (err != DB_SUCCESS) { + return(err); + } + } + + /* Align the memory for a possible read from a raw device */ + + m_first_page = static_cast<byte*>( + aligned_malloc(UNIV_PAGE_SIZE_MAX, srv_page_size)); + + dberr_t err = DB_ERROR; + size_t page_size = UNIV_PAGE_SIZE_MAX; + + /* Don't want unnecessary complaints about partial reads. */ + + while (page_size >= UNIV_PAGE_SIZE_MIN) { + + ulint n_read = 0; + + err = os_file_read_no_error_handling( + IORequestReadPartial, m_handle, m_first_page, 0, + page_size, &n_read); + + if (err == DB_IO_ERROR && n_read >= UNIV_PAGE_SIZE_MIN) { + + page_size >>= 1; + + } else if (err == DB_SUCCESS) { + + ut_a(n_read == page_size); + + break; + + } else if (srv_operation == SRV_OPERATION_BACKUP) { + break; + } else { + + ib::error() + << "Cannot read first page of '" + << m_filepath << "' " + << err; + break; + } + } + + if (err != DB_SUCCESS) { + return(err); + } + + if (m_order == 0) { + if (memcmp_aligned<2>(FIL_PAGE_SPACE_ID + m_first_page, + FSP_HEADER_OFFSET + FSP_SPACE_ID + + m_first_page, 4)) { + ib::error() + << "Inconsistent tablespace ID in " + << m_filepath; + return DB_CORRUPTION; + } + + m_space_id = mach_read_from_4(FIL_PAGE_SPACE_ID + + m_first_page); + m_flags = fsp_header_get_flags(m_first_page); + if (!fil_space_t::is_valid_flags(m_flags, m_space_id)) { + ulint cflags = fsp_flags_convert_from_101(m_flags); + if (cflags == ULINT_UNDEFINED) { + ib::error() + << "Invalid flags " << ib::hex(m_flags) + << " in " << m_filepath; + return(DB_CORRUPTION); + } else { + m_flags = cflags; + } + } + } + + const size_t physical_size = fil_space_t::physical_size(m_flags); + + if (physical_size > page_size) { + ib::error() << "File " << m_filepath + << " should be longer than " + << page_size << " bytes"; + return(DB_CORRUPTION); + } + + return(err); +} + +/** Free the first page from memory when it is no longer needed. */ +void Datafile::free_first_page() +{ + aligned_free(m_first_page); + m_first_page= nullptr; +} + +/** Validates the datafile and checks that it conforms with the expected +space ID and flags. The file should exist and be successfully opened +in order for this function to validate it. +@param[in] space_id The expected tablespace ID. +@param[in] flags The expected tablespace flags. +@retval DB_SUCCESS if tablespace is valid, DB_ERROR if not. +m_is_valid is also set true on success, else false. */ +dberr_t +Datafile::validate_to_dd(ulint space_id, ulint flags) +{ + dberr_t err; + + if (!is_open()) { + return DB_ERROR; + } + + /* Validate this single-table-tablespace with the data dictionary, + but do not compare the DATA_DIR flag, in case the tablespace was + remotely located. */ + err = validate_first_page(0); + if (err != DB_SUCCESS) { + return(err); + } + + flags &= ~FSP_FLAGS_MEM_MASK; + + /* Make sure the datafile we found matched the space ID. + If the datafile is a file-per-table tablespace then also match + the row format and zip page size. */ + if (m_space_id == space_id + && (fil_space_t::is_flags_equal(flags, m_flags) + || fil_space_t::is_flags_equal(m_flags, flags))) { + /* Datafile matches the tablespace expected. */ + return(DB_SUCCESS); + } + + /* else do not use this tablespace. */ + m_is_valid = false; + + ib::error() << "Refusing to load '" << m_filepath << "' (id=" + << m_space_id << ", flags=" << ib::hex(m_flags) + << "); dictionary contains id=" + << space_id << ", flags=" << ib::hex(flags); + + return(DB_ERROR); +} + +/** Validates this datafile for the purpose of recovery. The file should +exist and be successfully opened. We initially open it in read-only mode +because we just want to read the SpaceID. However, if the first page is +corrupt and needs to be restored from the doublewrite buffer, we will +reopen it in write mode and ry to restore that page. +@retval DB_SUCCESS if tablespace is valid, DB_ERROR if not. +m_is_valid is also set true on success, else false. */ +dberr_t +Datafile::validate_for_recovery() +{ + dberr_t err; + + ut_ad(is_open()); + ut_ad(!srv_read_only_mode); + + err = validate_first_page(0); + + switch (err) { + case DB_SUCCESS: + case DB_TABLESPACE_EXISTS: + break; + + default: + /* Re-open the file in read-write mode Attempt to restore + page 0 from doublewrite and read the space ID from a survey + of the first few pages. */ + close(); + err = open_read_write(srv_read_only_mode); + if (err != DB_SUCCESS) { + return(err); + } + + err = find_space_id(); + if (err != DB_SUCCESS || m_space_id == 0) { + ib::error() << "Datafile '" << m_filepath << "' is" + " corrupted. Cannot determine the space ID from" + " the first 64 pages."; + return(err); + } + + if (restore_from_doublewrite()) { + return(DB_CORRUPTION); + } + + /* Free the previously read first page and then re-validate. */ + free_first_page(); + err = validate_first_page(0); + } + + if (err == DB_SUCCESS) { + set_name(NULL); + } + + return(err); +} + +/** Check the consistency of the first page of a datafile when the +tablespace is opened. This occurs before the fil_space_t is created +so the Space ID found here must not already be open. +m_is_valid is set true on success, else false. +@param[out] flush_lsn contents of FIL_PAGE_FILE_FLUSH_LSN +@retval DB_SUCCESS on if the datafile is valid +@retval DB_CORRUPTION if the datafile is not readable +@retval DB_TABLESPACE_EXISTS if there is a duplicate space_id */ +dberr_t +Datafile::validate_first_page(lsn_t* flush_lsn) +{ + char* prev_name; + char* prev_filepath; + const char* error_txt = NULL; + + m_is_valid = true; + + if (m_first_page == NULL + && read_first_page(srv_read_only_mode) != DB_SUCCESS) { + + error_txt = "Cannot read first page"; + } else { + ut_ad(m_first_page); + + if (flush_lsn != NULL) { + + *flush_lsn = mach_read_from_8( + m_first_page + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION); + } + } + + if (error_txt != NULL) { +err_exit: + ib::info() << error_txt << " in datafile: " << m_filepath + << ", Space ID:" << m_space_id << ", Flags: " + << m_flags; + m_is_valid = false; + free_first_page(); + return(DB_CORRUPTION); + } + + /* Check if the whole page is blank. */ + if (!m_space_id && !m_flags) { + const byte* b = m_first_page; + ulint nonzero_bytes = srv_page_size; + + while (*b == '\0' && --nonzero_bytes != 0) { + + b++; + } + + if (nonzero_bytes == 0) { + error_txt = "Header page consists of zero bytes"; + goto err_exit; + } + } + + if (!fil_space_t::is_valid_flags(m_flags, m_space_id)) { + /* Tablespace flags must be valid. */ + error_txt = "Tablespace flags are invalid"; + goto err_exit; + } + + ulint logical_size = fil_space_t::logical_size(m_flags); + + if (srv_page_size != logical_size) { + /* Logical size must be innodb_page_size. */ + ib::error() + << "Data file '" << m_filepath << "' uses page size " + << logical_size << ", but the innodb_page_size" + " start-up parameter is " + << srv_page_size; + free_first_page(); + return(DB_ERROR); + } + + if (page_get_page_no(m_first_page) != 0) { + /* First page must be number 0 */ + error_txt = "Header page contains inconsistent data"; + goto err_exit; + } + + if (m_space_id >= SRV_SPACE_ID_UPPER_BOUND) { + error_txt = "A bad Space ID was found"; + goto err_exit; + } + + if (buf_page_is_corrupted(false, m_first_page, m_flags)) { + /* Look for checksum and other corruptions. */ + error_txt = "Checksum mismatch"; + goto err_exit; + } + + if (fil_space_read_name_and_filepath( + m_space_id, &prev_name, &prev_filepath)) { + + if (0 == strcmp(m_filepath, prev_filepath)) { + ut_free(prev_name); + ut_free(prev_filepath); + return(DB_SUCCESS); + } + + /* Make sure the space_id has not already been opened. */ + ib::error() << "Attempted to open a previously opened" + " tablespace. Previous tablespace " << prev_name + << " at filepath: " << prev_filepath + << " uses space ID: " << m_space_id + << ". Cannot open filepath: " << m_filepath + << " which uses the same space ID."; + + ut_free(prev_name); + ut_free(prev_filepath); + + m_is_valid = false; + + free_first_page(); + + return(is_predefined_tablespace(m_space_id) + ? DB_CORRUPTION + : DB_TABLESPACE_EXISTS); + } + + return(DB_SUCCESS); +} + +/** Determine the space id of the given file descriptor by reading a few +pages from the beginning of the .ibd file. +@return DB_SUCCESS if space id was successfully identified, else DB_ERROR. */ +dberr_t +Datafile::find_space_id() +{ + os_offset_t file_size; + + ut_ad(m_handle != OS_FILE_CLOSED); + + file_size = os_file_get_size(m_handle); + + if (file_size == (os_offset_t) -1) { + ib::error() << "Could not get file size of datafile '" + << m_filepath << "'"; + return(DB_CORRUPTION); + } + + /* Assuming a page size, read the space_id from each page and store it + in a map. Find out which space_id is agreed on by majority of the + pages. Choose that space_id. */ + for (ulint page_size = UNIV_ZIP_SIZE_MIN; + page_size <= UNIV_PAGE_SIZE_MAX; + page_size <<= 1) { + /* map[space_id] = count of pages */ + typedef std::map< + ulint, + ulint, + std::less<ulint>, + ut_allocator<std::pair<const ulint, ulint> > > + Pages; + + Pages verify; + ulint page_count = 64; + ulint valid_pages = 0; + + /* Adjust the number of pages to analyze based on file size */ + while ((page_count * page_size) > file_size) { + --page_count; + } + + ib::info() + << "Page size:" << page_size + << ". Pages to analyze:" << page_count; + + byte* page = static_cast<byte*>( + aligned_malloc(page_size, page_size)); + + ulint fsp_flags; + /* provide dummy value if the first os_file_read() fails */ + switch (srv_checksum_algorithm) { + case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32: + case SRV_CHECKSUM_ALGORITHM_FULL_CRC32: + fsp_flags = 1U << FSP_FLAGS_FCRC32_POS_MARKER + | FSP_FLAGS_FCRC32_PAGE_SSIZE() + | innodb_compression_algorithm + << FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO; + break; + default: + fsp_flags = 0; + } + + for (ulint j = 0; j < page_count; ++j) { + if (os_file_read(IORequestRead, m_handle, page, + j * page_size, page_size)) { + ib::info() + << "READ FAIL: page_no:" << j; + continue; + } + + if (j == 0) { + fsp_flags = mach_read_from_4( + page + FSP_HEADER_OFFSET + FSP_SPACE_FLAGS); + } + + bool noncompressed_ok = false; + + /* For noncompressed pages, the page size must be + equal to srv_page_size. */ + if (page_size == srv_page_size + && !fil_space_t::zip_size(fsp_flags)) { + noncompressed_ok = !buf_page_is_corrupted( + false, page, fsp_flags); + } + + bool compressed_ok = false; + + if (srv_page_size <= UNIV_PAGE_SIZE_DEF + && page_size == fil_space_t::zip_size(fsp_flags)) { + compressed_ok = !buf_page_is_corrupted( + false, page, fsp_flags); + } + + if (noncompressed_ok || compressed_ok) { + + ulint space_id = mach_read_from_4(page + + FIL_PAGE_SPACE_ID); + + if (space_id > 0) { + + ib::info() + << "VALID: space:" + << space_id << " page_no:" << j + << " page_size:" << page_size; + + ++valid_pages; + + ++verify[space_id]; + } + } + } + + aligned_free(page); + + ib::info() + << "Page size: " << page_size + << ". Possible space_id count:" << verify.size(); + + const ulint pages_corrupted = 3; + + for (ulint missed = 0; missed <= pages_corrupted; ++missed) { + + for (Pages::const_iterator it = verify.begin(); + it != verify.end(); + ++it) { + + ib::info() << "space_id:" << it->first + << ", Number of pages matched: " + << it->second << "/" << valid_pages + << " (" << page_size << ")"; + + if (it->second == (valid_pages - missed)) { + ib::info() << "Chosen space:" + << it->first; + + m_space_id = it->first; + return(DB_SUCCESS); + } + } + + } + } + + return(DB_CORRUPTION); +} + + +/** Restore the first page of the tablespace from +the double write buffer. +@return whether the operation failed */ +bool +Datafile::restore_from_doublewrite() +{ + if (srv_operation != SRV_OPERATION_NORMAL) { + return true; + } + + /* Find if double write buffer contains page_no of given space id. */ + const page_id_t page_id(m_space_id, 0); + const byte* page = recv_sys.dblwr.find_page(page_id); + + if (!page) { + /* If the first page of the given user tablespace is not there + in the doublewrite buffer, then the recovery is going to fail + now. Hence this is treated as an error. */ + + ib::error() + << "Corrupted page " << page_id + << " of datafile '" << m_filepath + << "' could not be found in the doublewrite buffer."; + + return(true); + } + + ulint flags = mach_read_from_4( + FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page); + + if (!fil_space_t::is_valid_flags(flags, m_space_id)) { + flags = fsp_flags_convert_from_101(flags); + /* recv_dblwr_t::validate_page() inside find_page() + checked this already. */ + ut_ad(flags != ULINT_UNDEFINED); + /* The flags on the page should be converted later. */ + } + + ulint physical_size = fil_space_t::physical_size(flags); + + ut_a(page_get_page_no(page) == page_id.page_no()); + + ib::info() << "Restoring page " << page_id + << " of datafile '" << m_filepath + << "' from the doublewrite buffer. Writing " + << physical_size << " bytes into file '" + << m_filepath << "'"; + + return(os_file_write( + IORequestWrite, + m_filepath, m_handle, page, 0, physical_size) + != DB_SUCCESS); +} + +/** Create a link filename based on the contents of m_name, +open that file, and read the contents into m_filepath. +@retval DB_SUCCESS if remote linked tablespace file is opened and read. +@retval DB_CANNOT_OPEN_FILE if the link file does not exist. */ +dberr_t +RemoteDatafile::open_link_file() +{ + if (m_link_filepath == NULL) { + m_link_filepath = fil_make_filepath(NULL, name(), ISL, false); + } + + m_filepath = read_link_file(m_link_filepath); + + return(m_filepath == NULL ? DB_CANNOT_OPEN_FILE : DB_SUCCESS); +} + +/** Opens a handle to the file linked to in an InnoDB Symbolic Link file +in read-only mode so that it can be validated. +@param[in] strict whether to issue error messages +@return DB_SUCCESS if remote linked tablespace file is found and opened. */ +dberr_t +RemoteDatafile::open_read_only(bool strict) +{ + if (m_filepath == NULL && open_link_file() == DB_CANNOT_OPEN_FILE) { + return(DB_ERROR); + } + + dberr_t err = Datafile::open_read_only(strict); + + if (err != DB_SUCCESS && strict) { + /* The following call prints an error message */ + os_file_get_last_error(true); + ib::error() << "A link file was found named '" + << m_link_filepath << "' but the linked tablespace '" + << m_filepath << "' could not be opened read-only."; + } + + return(err); +} + +/** Opens a handle to the file linked to in an InnoDB Symbolic Link file +in read-write mode so that it can be restored from doublewrite and validated. +@param[in] read_only_mode If true, then readonly mode checks are enforced. +@return DB_SUCCESS if remote linked tablespace file is found and opened. */ +dberr_t +RemoteDatafile::open_read_write(bool read_only_mode) +{ + if (m_filepath == NULL && open_link_file() == DB_CANNOT_OPEN_FILE) { + return(DB_ERROR); + } + + dberr_t err = Datafile::open_read_write(read_only_mode); + + if (err != DB_SUCCESS) { + /* The following call prints an error message */ + m_last_os_error = os_file_get_last_error(true); + ib::error() << "A link file was found named '" + << m_link_filepath << "' but the linked data file '" + << m_filepath << "' could not be opened for writing."; + } + + return(err); +} + +/** Release the resources. */ +void +RemoteDatafile::shutdown() +{ + Datafile::shutdown(); + + if (m_link_filepath != 0) { + ut_free(m_link_filepath); + m_link_filepath = 0; + } +} + +/** Creates a new InnoDB Symbolic Link (ISL) file. It is always created +under the 'datadir' of MySQL. The datadir is the directory of a +running mysqld program. We can refer to it by simply using the path ".". +@param[in] name tablespace name +@param[in] filepath remote filepath of tablespace datafile +@return DB_SUCCESS or error code */ +dberr_t +RemoteDatafile::create_link_file( + const char* name, + const char* filepath) +{ + bool success; + dberr_t err = DB_SUCCESS; + char* link_filepath = NULL; + char* prev_filepath = NULL; + + ut_ad(!srv_read_only_mode); + ut_ad(0 == strcmp(&filepath[strlen(filepath) - 4], DOT_IBD)); + + link_filepath = fil_make_filepath(NULL, name, ISL, false); + + if (link_filepath == NULL) { + return(DB_ERROR); + } + + prev_filepath = read_link_file(link_filepath); + if (prev_filepath) { + /* Truncate (starting with MySQL 5.6, probably no + longer since MariaDB Server 10.2.19) used to call this + with an existing link file which contains the same filepath. */ + bool same = !strcmp(prev_filepath, filepath); + ut_free(prev_filepath); + if (same) { + ut_free(link_filepath); + return(DB_SUCCESS); + } + } + + /** Check if the file already exists. */ + FILE* file = NULL; + bool exists; + os_file_type_t ftype; + + success = os_file_status(link_filepath, &exists, &ftype); + ulint error = 0; + + if (success && !exists) { + + file = fopen(link_filepath, "w"); + if (file == NULL) { + /* This call will print its own error message */ + error = os_file_get_last_error(true); + } + } else { + error = OS_FILE_ALREADY_EXISTS; + } + + if (error != 0) { + + ib::error() << "Cannot create file " << link_filepath << "."; + + if (error == OS_FILE_ALREADY_EXISTS) { + ib::error() << "The link file: " << link_filepath + << " already exists."; + err = DB_TABLESPACE_EXISTS; + + } else if (error == OS_FILE_DISK_FULL) { + err = DB_OUT_OF_FILE_SPACE; + + } else { + err = DB_ERROR; + } + + /* file is not open, no need to close it. */ + ut_free(link_filepath); + return(err); + } + + ulint rbytes = fwrite(filepath, 1, strlen(filepath), file); + + if (rbytes != strlen(filepath)) { + error = os_file_get_last_error(true); + ib::error() << + "Cannot write link file: " + << link_filepath << " filepath: " << filepath; + err = DB_ERROR; + } + + /* Close the file, we only need it at startup */ + fclose(file); + + ut_free(link_filepath); + + return(err); +} + +/** Delete an InnoDB Symbolic Link (ISL) file. */ +void +RemoteDatafile::delete_link_file(void) +{ + ut_ad(m_link_filepath != NULL); + + if (m_link_filepath != NULL) { + os_file_delete_if_exists(innodb_data_file_key, + m_link_filepath, NULL); + } +} + +/** Delete an InnoDB Symbolic Link (ISL) file by name. +@param[in] name tablespace name */ +void +RemoteDatafile::delete_link_file( + const char* name) +{ + char* link_filepath = fil_make_filepath(NULL, name, ISL, false); + + if (link_filepath != NULL) { + os_file_delete_if_exists( + innodb_data_file_key, link_filepath, NULL); + + ut_free(link_filepath); + } +} + +/** Read an InnoDB Symbolic Link (ISL) file by name. +It is always created under the datadir of MySQL. +For file-per-table tablespaces, the isl file is expected to be +in a 'database' directory and called 'tablename.isl'. +The caller must free the memory returned if it is not null. +@param[in] link_filepath filepath of the ISL file +@return Filepath of the IBD file read from the ISL file */ +char* +RemoteDatafile::read_link_file( + const char* link_filepath) +{ + FILE* file = fopen(link_filepath, "r+b" STR_O_CLOEXEC); + if (file == NULL) { + return(NULL); + } + + char* filepath = static_cast<char*>(ut_malloc_nokey(OS_FILE_MAX_PATH)); + + os_file_read_string(file, filepath, OS_FILE_MAX_PATH); + fclose(file); + + if (filepath[0] != '\0') { + /* Trim whitespace from end of filepath */ + ulint last_ch = strlen(filepath) - 1; + while (last_ch > 4 && filepath[last_ch] <= 0x20) { + filepath[last_ch--] = 0x00; + } + os_normalize_path(filepath); + } + + return(filepath); +} diff --git a/storage/innobase/fsp/fsp0fsp.cc b/storage/innobase/fsp/fsp0fsp.cc new file mode 100644 index 00000000..3d5a7edd --- /dev/null +++ b/storage/innobase/fsp/fsp0fsp.cc @@ -0,0 +1,2890 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file fsp/fsp0fsp.cc +File space management + +Created 11/29/1995 Heikki Tuuri +***********************************************************************/ + +#include "fsp0fsp.h" +#include "buf0buf.h" +#include "fil0fil.h" +#include "fil0crypt.h" +#include "mtr0log.h" +#include "ut0byte.h" +#include "page0page.h" +#include "fut0fut.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "ibuf0ibuf.h" +#include "btr0btr.h" +#include "btr0sea.h" +#include "dict0boot.h" +#include "log0log.h" +#include "dict0mem.h" +#include "fsp0types.h" + +// JAN: MySQL 5.7 Encryption +// #include <my_aes.h> + +typedef uint32_t page_no_t; + +/** Return an extent to the free list of a space. +@param[in,out] space tablespace +@param[in] offset page number in the extent +@param[in,out] mtr mini-transaction */ +MY_ATTRIBUTE((nonnull)) +static +void +fsp_free_extent( + fil_space_t* space, + page_no_t offset, + mtr_t* mtr); + +/** Returns the first extent descriptor for a segment. +We think of the extent lists of the segment catenated in the order +FSEG_FULL -> FSEG_NOT_FULL -> FSEG_FREE. +@param[in] inode segment inode +@param[in] space tablespace +@param[in,out] mtr mini-transaction +@return the first extent descriptor, or NULL if none */ +MY_ATTRIBUTE((nonnull, warn_unused_result)) +static +xdes_t* +fseg_get_first_extent( + fseg_inode_t* inode, + const fil_space_t* space, + mtr_t* mtr); + +/** Put new extents to the free list if there are free extents above the free +limit. If an extent happens to contain an extent descriptor page, the extent +is put to the FSP_FREE_FRAG list with the page marked as used. +@param[in] init_space true if this is a single-table tablespace +and we are only initializing the first extent and the first bitmap pages; +then we will not allocate more extents +@param[in,out] space tablespace +@param[in,out] header tablespace header +@param[in,out] mtr mini-transaction */ +static ATTRIBUTE_COLD +void +fsp_fill_free_list( + bool init_space, + fil_space_t* space, + buf_block_t* header, + mtr_t* mtr); + +/** Allocates a single free page from a segment. +This function implements the intelligent allocation strategy which tries to +minimize file space fragmentation. +@param[in,out] space tablespace +@param[in,out] seg_inode segment inode +@param[in,out] iblock segment inode page +@param[in] hint hint of which page would be desirable +@param[in] direction if the new page is needed because of +an index page split, and records are inserted there in order, into which +direction they go alphabetically: FSP_DOWN, FSP_UP, FSP_NO_DIR +@param[in,out] mtr mini-transaction +@param[in,out] init_mtr mtr or another mini-transaction in +which the page should be initialized. +@retval NULL if no page could be allocated */ +static +buf_block_t* +fseg_alloc_free_page_low( + fil_space_t* space, + fseg_inode_t* seg_inode, + buf_block_t* iblock, + uint32_t hint, + byte direction, +#ifdef UNIV_DEBUG + bool has_done_reservation, + /*!< whether the space has already been reserved */ +#endif /* UNIV_DEBUG */ + mtr_t* mtr, + mtr_t* init_mtr) + MY_ATTRIBUTE((warn_unused_result)); + +/** Get the tablespace header block, SX-latched +@param[in] space tablespace +@param[in,out] mtr mini-transaction +@return pointer to the space header, page x-locked */ +inline buf_block_t *fsp_get_header(const fil_space_t *space, mtr_t *mtr) +{ + buf_block_t *block= buf_page_get(page_id_t(space->id, 0), space->zip_size(), + RW_SX_LATCH, mtr); + buf_block_dbg_add_level(block, SYNC_FSP_PAGE); + ut_ad(space->id == mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_ID + + block->frame)); + return block; +} + +/** Set the XDES_FREE_BIT of a page. +@tparam free desired value of XDES_FREE_BIT +@param[in] block extent descriptor block +@param[in,out] descr extent descriptor +@param[in] offset page offset within the extent +@param[in,out] mtr mini-transaction */ +template<bool free> +inline void xdes_set_free(const buf_block_t &block, xdes_t *descr, + ulint offset, mtr_t *mtr) +{ + ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_SX_FIX | + MTR_MEMO_PAGE_X_FIX)); + ut_ad(offset < FSP_EXTENT_SIZE); + ut_ad(page_align(descr) == block.frame); + compile_time_assert(XDES_BITS_PER_PAGE == 2); + compile_time_assert(XDES_FREE_BIT == 0); + compile_time_assert(XDES_CLEAN_BIT == 1); + + ulint index= XDES_BITS_PER_PAGE * offset; + byte *b= &descr[XDES_BITMAP + (index >> 3)]; + /* xdes_init() should have set all XDES_CLEAN_BIT. */ + ut_ad(!(~*b & 0xaa)); + /* Clear or set XDES_FREE_BIT. */ + byte val= free + ? static_cast<byte>(*b | 1 << (index & 7)) + : static_cast<byte>(*b & ~(1 << (index & 7))); + mtr->write<1>(block, b, val); +} + +/** +Find a free page. +@param descr extent descriptor +@param hint page offset to start searching from (towards larger pages) +@return free page offset +@retval FIL_NULL if no page is free */ +inline uint32_t xdes_find_free(const xdes_t *descr, uint32_t hint= 0) +{ + const uint32_t extent_size= FSP_EXTENT_SIZE; + ut_ad(hint < extent_size); + for (uint32_t i= hint; i < extent_size; i++) + if (xdes_is_free(descr, i)) + return i; + for (uint32_t i= 0; i < hint; i++) + if (xdes_is_free(descr, i)) + return i; + return FIL_NULL; +} + +/** +Determine the number of used pages in a descriptor. +@param descr file descriptor +@return number of pages used */ +inline uint32_t xdes_get_n_used(const xdes_t *descr) +{ + uint32_t count= 0; + + for (uint32_t i= FSP_EXTENT_SIZE; i--; ) + if (!xdes_is_free(descr, i)) + count++; + + return count; +} + +/** +Determine whether a file extent is full. +@param descr file descriptor +@return whether all pages have been allocated */ +inline bool xdes_is_full(const xdes_t *descr) +{ + return FSP_EXTENT_SIZE == xdes_get_n_used(descr); +} + +/** Set the state of an extent descriptor. +@param[in] block extent descriptor block +@param[in,out] descr extent descriptor +@param[in] state the state +@param[in,out] mtr mini-transaction */ +inline void xdes_set_state(const buf_block_t &block, xdes_t *descr, + byte state, mtr_t *mtr) +{ + ut_ad(descr && mtr); + ut_ad(state >= XDES_FREE); + ut_ad(state <= XDES_FSEG); + ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_SX_FIX | + MTR_MEMO_PAGE_X_FIX)); + ut_ad(page_align(descr) == block.frame); + ut_ad(mach_read_from_4(descr + XDES_STATE) <= XDES_FSEG); + mtr->write<1>(block, XDES_STATE + 3 + descr, state); +} + +/**********************************************************************//** +Gets the state of an xdes. +@return state */ +UNIV_INLINE +ulint +xdes_get_state( +/*===========*/ + const xdes_t* descr) /*!< in: descriptor */ +{ + ulint state; + + ut_ad(descr); + state = mach_read_from_4(descr + XDES_STATE); + ut_ad(state - 1 < XDES_FSEG); + return(state); +} + +/**********************************************************************//** +Inits an extent descriptor to the free and clean state. */ +inline void xdes_init(const buf_block_t &block, xdes_t *descr, mtr_t *mtr) +{ + ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_SX_FIX | + MTR_MEMO_PAGE_X_FIX)); + mtr->memset(&block, uint16_t(descr - block.frame) + XDES_BITMAP, + XDES_SIZE - XDES_BITMAP, 0xff); + xdes_set_state(block, descr, XDES_FREE, mtr); +} + +/** Mark a page used in an extent descriptor. +@param[in,out] seg_inode segment inode +@param[in,out] iblock segment inode page +@param[in] page page number +@param[in,out] descr extent descriptor +@param[in,out] xdes extent descriptor page +@param[in,out] mtr mini-transaction */ +static MY_ATTRIBUTE((nonnull)) +void +fseg_mark_page_used(fseg_inode_t *seg_inode, buf_block_t *iblock, + ulint page, xdes_t *descr, buf_block_t *xdes, mtr_t *mtr) +{ + ut_ad(fil_page_get_type(iblock->frame) == FIL_PAGE_INODE); + ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE)); + ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE); + ut_ad(!memcmp(seg_inode + FSEG_ID, descr + XDES_ID, 4)); + + const uint16_t xoffset= uint16_t(descr - xdes->frame + XDES_FLST_NODE); + const uint16_t ioffset= uint16_t(seg_inode - iblock->frame); + + if (!xdes_get_n_used(descr)) + { + /* We move the extent from the free list to the NOT_FULL list */ + flst_remove(iblock, uint16_t(FSEG_FREE + ioffset), xdes, xoffset, mtr); + flst_add_last(iblock, uint16_t(FSEG_NOT_FULL + ioffset), + xdes, xoffset, mtr); + } + + ut_ad(xdes_is_free(descr, page % FSP_EXTENT_SIZE)); + + /* We mark the page as used */ + xdes_set_free<false>(*xdes, descr, page % FSP_EXTENT_SIZE, mtr); + + byte* p_not_full= seg_inode + FSEG_NOT_FULL_N_USED; + const uint32_t not_full_n_used= mach_read_from_4(p_not_full) + 1; + mtr->write<4>(*iblock, p_not_full, not_full_n_used); + if (xdes_is_full(descr)) + { + /* We move the extent from the NOT_FULL list to the FULL list */ + flst_remove(iblock, uint16_t(FSEG_NOT_FULL + ioffset), xdes, xoffset, mtr); + flst_add_last(iblock, uint16_t(FSEG_FULL + ioffset), xdes, xoffset, mtr); + mtr->write<4>(*iblock, seg_inode + FSEG_NOT_FULL_N_USED, + not_full_n_used - FSP_EXTENT_SIZE); + } +} + +/** Get pointer to a the extent descriptor of a page. +@param[in,out] sp_header tablespace header page, x-latched +@param[in] space tablespace +@param[in] offset page offset +@param[out] desc_block descriptor block +@param[in,out] mtr mini-transaction +@param[in] init_space whether the tablespace is being initialized +@return pointer to the extent descriptor, NULL if the page does not +exist in the space or if the offset exceeds free limit */ +UNIV_INLINE MY_ATTRIBUTE((warn_unused_result)) +xdes_t* +xdes_get_descriptor_with_space_hdr( + buf_block_t* header, + const fil_space_t* space, + page_no_t offset, + buf_block_t** desc_block, + mtr_t* mtr, + bool init_space = false) +{ + ut_ad(mtr->memo_contains(*space)); + ut_ad(mtr->memo_contains_flagged(header, MTR_MEMO_PAGE_SX_FIX + | MTR_MEMO_PAGE_X_FIX)); + /* Read free limit and space size */ + uint32_t limit = mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT + + header->frame); + uint32_t size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + + header->frame); + ut_ad(limit == space->free_limit + || (space->free_limit == 0 + && (init_space + || space->purpose == FIL_TYPE_TEMPORARY + || (srv_startup_is_before_trx_rollback_phase + && (space->id == TRX_SYS_SPACE + || srv_is_undo_tablespace(space->id)))))); + ut_ad(size == space->size_in_header); + + if ((offset >= size) || (offset >= limit)) { + return(NULL); + } + + const unsigned zip_size = space->zip_size(); + + uint32_t descr_page_no = xdes_calc_descriptor_page(zip_size, offset); + + buf_block_t* block = header; + + if (descr_page_no) { + block = buf_page_get( + page_id_t(space->id, descr_page_no), zip_size, + RW_SX_LATCH, mtr); + + buf_block_dbg_add_level(block, SYNC_FSP_PAGE); + } + + if (desc_block != NULL) { + *desc_block = block; + } + + return XDES_ARR_OFFSET + XDES_SIZE + * xdes_calc_descriptor_index(zip_size, offset) + + block->frame; +} + +/** Get the extent descriptor of a page. +The page where the extent descriptor resides is x-locked. If the page +offset is equal to the free limit of the space, we will add new +extents from above the free limit to the space free list, if not free +limit == space size. This adding is necessary to make the descriptor +defined, as they are uninitialized above the free limit. +@param[in] space tablespace +@param[in] offset page offset; if equal to the free limit, we +try to add new extents to the space free list +@param[out] xdes extent descriptor page +@param[in,out] mtr mini-transaction +@return the extent descriptor */ +static xdes_t* xdes_get_descriptor(const fil_space_t *space, page_no_t offset, + buf_block_t **xdes, mtr_t *mtr) +{ + buf_block_t *block= buf_page_get(page_id_t(space->id, 0), space->zip_size(), + RW_SX_LATCH, mtr); + buf_block_dbg_add_level(block, SYNC_FSP_PAGE); + return xdes_get_descriptor_with_space_hdr(block, space, offset, xdes, mtr); +} + +/** Get the extent descriptor of a page. +The page where the extent descriptor resides is x-locked. If the page +offset is equal to the free limit of the space, we will add new +extents from above the free limit to the space free list, if not free +limit == space size. This adding is necessary to make the descriptor +defined, as they are uninitialized above the free limit. +@param[in] space tablespace +@param[in] page descriptor page offset +@param[in] offset page offset +@param[in,out] mtr mini-transaction +@return the extent descriptor +@retval NULL if the descriptor is not available */ +MY_ATTRIBUTE((warn_unused_result)) +static +const xdes_t* +xdes_get_descriptor_const( + const fil_space_t* space, + page_no_t page, + page_no_t offset, + mtr_t* mtr) +{ + ut_ad(mtr->memo_contains(space->latch, MTR_MEMO_SX_LOCK)); + ut_ad(offset < space->free_limit); + ut_ad(offset < space->size_in_header); + + const ulint zip_size = space->zip_size(); + + if (buf_block_t* block = buf_page_get_gen(page_id_t(space->id, page), + zip_size, RW_S_LATCH, + nullptr, + BUF_GET_POSSIBLY_FREED, + __FILE__, __LINE__, mtr)) { + buf_block_dbg_add_level(block, SYNC_FSP_PAGE); + + if (block->page.status == buf_page_t::FREED) { + return nullptr; + } + + ut_ad(page != 0 || space->free_limit == mach_read_from_4( + FSP_FREE_LIMIT + FSP_HEADER_OFFSET + + block->frame)); + ut_ad(page != 0 || space->size_in_header == mach_read_from_4( + FSP_SIZE + FSP_HEADER_OFFSET + + block->frame)); + + return(block->frame + XDES_ARR_OFFSET + XDES_SIZE + * xdes_calc_descriptor_index(zip_size, offset)); + } + + return(NULL); +} + +/** Get a pointer to the extent descriptor. The page where the +extent descriptor resides is x-locked. +@param[in] space tablespace +@param[in] lst_node file address of the list node + contained in the descriptor +@param[out] block extent descriptor block +@param[in,out] mtr mini-transaction +@return pointer to the extent descriptor */ +MY_ATTRIBUTE((nonnull, warn_unused_result)) +UNIV_INLINE +xdes_t* +xdes_lst_get_descriptor( + const fil_space_t* space, + fil_addr_t lst_node, + buf_block_t** block, + mtr_t* mtr) +{ + ut_ad(mtr->memo_contains(*space)); + return fut_get_ptr(space->id, space->zip_size(), + lst_node, RW_SX_LATCH, mtr, block) + - XDES_FLST_NODE; +} + +/********************************************************************//** +Returns page offset of the first page in extent described by a descriptor. +@return offset of the first page in extent */ +static uint32_t xdes_get_offset(const xdes_t *descr) +{ + ut_ad(descr); + return page_get_page_no(page_align(descr)) + + uint32_t(((page_offset(descr) - XDES_ARR_OFFSET) / XDES_SIZE) * + FSP_EXTENT_SIZE); +} + +/** Initialize a file page whose prior contents should be ignored. +@param[in,out] block buffer pool block */ +void fsp_apply_init_file_page(buf_block_t *block) +{ + memset_aligned<UNIV_PAGE_SIZE_MIN>(block->frame, 0, srv_page_size); + const page_id_t id(block->page.id()); + + mach_write_to_4(block->frame + FIL_PAGE_OFFSET, id.page_no()); + if (log_sys.is_physical()) + memset_aligned<8>(block->frame + FIL_PAGE_PREV, 0xff, 8); + mach_write_to_4(block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, id.space()); + if (page_zip_des_t* page_zip= buf_block_get_page_zip(block)) + { + memset_aligned<UNIV_ZIP_SIZE_MIN>(page_zip->data, 0, + page_zip_get_size(page_zip)); + static_assert(FIL_PAGE_OFFSET == 4, "compatibility"); + memcpy_aligned<4>(page_zip->data + FIL_PAGE_OFFSET, + block->frame + FIL_PAGE_OFFSET, 4); + if (log_sys.is_physical()) + memset_aligned<8>(page_zip->data + FIL_PAGE_PREV, 0xff, 8); + static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2, + "not perfect alignment"); + memcpy_aligned<2>(page_zip->data + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, + block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 4); + } +} + +#ifdef UNIV_DEBUG +/** Assert that the mini-transaction is compatible with +updating an allocation bitmap page. +@param[in] mtr mini-transaction */ +void fil_space_t::modify_check(const mtr_t& mtr) const +{ + switch (mtr.get_log_mode()) { + case MTR_LOG_NONE: + /* These modes are only allowed within a non-bitmap page + when there is a higher-level redo log record written. */ + ut_ad(purpose == FIL_TYPE_TABLESPACE + || purpose == FIL_TYPE_TEMPORARY); + break; + case MTR_LOG_NO_REDO: + ut_ad(purpose == FIL_TYPE_TEMPORARY + || purpose == FIL_TYPE_IMPORT); + return; + case MTR_LOG_ALL: + /* We may only write redo log for a persistent + tablespace. */ + ut_ad(purpose == FIL_TYPE_TABLESPACE); + ut_ad(mtr.is_named_space(id)); + return; + } + + ut_ad("invalid log mode" == 0); +} +#endif + +/**********************************************************************//** +Writes the space id and flags to a tablespace header. The flags contain +row type, physical/compressed page size, and logical/uncompressed page +size of the tablespace. */ +void +fsp_header_init_fields( +/*===================*/ + page_t* page, /*!< in/out: first page in the space */ + ulint space_id, /*!< in: space id */ + ulint flags) /*!< in: tablespace flags (FSP_SPACE_FLAGS) */ +{ + flags &= ~FSP_FLAGS_MEM_MASK; + ut_a(fil_space_t::is_valid_flags(flags, space_id)); + + mach_write_to_4(FSP_HEADER_OFFSET + FSP_SPACE_ID + page, + space_id); + mach_write_to_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page, + flags); +} + +/** Initialize a tablespace header. +@param[in,out] space tablespace +@param[in] size current size in blocks +@param[in,out] mtr mini-transaction */ +void fsp_header_init(fil_space_t* space, uint32_t size, mtr_t* mtr) +{ + const page_id_t page_id(space->id, 0); + const ulint zip_size = space->zip_size(); + + buf_block_t *free_block = buf_LRU_get_free_block(false); + + mtr_x_lock_space(space, mtr); + + buf_block_t* block = buf_page_create(space, 0, zip_size, mtr, + free_block); + buf_block_dbg_add_level(block, SYNC_FSP_PAGE); + + if (UNIV_UNLIKELY(block != free_block)) { + buf_pool.free_block(free_block); + } + + space->size_in_header = size; + space->free_len = 0; + space->free_limit = 0; + + /* The prior contents of the file page should be ignored */ + + fsp_init_file_page(space, block, mtr); + + mtr->write<2>(*block, block->frame + FIL_PAGE_TYPE, + FIL_PAGE_TYPE_FSP_HDR); + + mtr->write<4,mtr_t::MAYBE_NOP>(*block, FSP_HEADER_OFFSET + FSP_SPACE_ID + + block->frame, space->id); + ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_NOT_USED + + block->frame)); + /* recv_sys_t::parse() expects to find a WRITE record that + covers all 4 bytes. Therefore, we must specify mtr_t::FORCED + in order to avoid optimizing away any unchanged most + significant bytes of FSP_SIZE. */ + mtr->write<4,mtr_t::FORCED>(*block, FSP_HEADER_OFFSET + FSP_SIZE + + block->frame, size); + ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT + + block->frame)); + if (auto f = space->flags & ~FSP_FLAGS_MEM_MASK) { + mtr->write<4,mtr_t::FORCED>(*block, + FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + + block->frame, f); + } + ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_FRAG_N_USED + + block->frame)); + + flst_init(block, FSP_HEADER_OFFSET + FSP_FREE, mtr); + flst_init(block, FSP_HEADER_OFFSET + FSP_FREE_FRAG, mtr); + flst_init(block, FSP_HEADER_OFFSET + FSP_FULL_FRAG, mtr); + flst_init(block, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL, mtr); + flst_init(block, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE, mtr); + + mtr->write<8>(*block, FSP_HEADER_OFFSET + FSP_SEG_ID + block->frame, + 1U); + + fsp_fill_free_list(!is_system_tablespace(space->id), + space, block, mtr); + + /* Write encryption metadata to page 0 if tablespace is + encrypted or encryption is disabled by table option. */ + if (space->crypt_data && + (space->crypt_data->should_encrypt() || + space->crypt_data->not_encrypted())) { + space->crypt_data->write_page0(block, mtr); + } +} + +/** Try to extend a single-table tablespace so that a page would fit in the +data file. +@param[in,out] space tablespace +@param[in] page_no page number +@param[in,out] header tablespace header +@param[in,out] mtr mini-transaction +@return true if success */ +static ATTRIBUTE_COLD __attribute__((warn_unused_result)) +bool +fsp_try_extend_data_file_with_pages( + fil_space_t* space, + uint32_t page_no, + buf_block_t* header, + mtr_t* mtr) +{ + bool success; + ulint size; + + ut_a(!is_system_tablespace(space->id)); + ut_d(space->modify_check(*mtr)); + + size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + header->frame); + ut_ad(size == space->size_in_header); + + ut_a(page_no >= size); + + success = fil_space_extend(space, page_no + 1); + /* The size may be less than we wanted if we ran out of disk space. */ + /* recv_sys_t::parse() expects to find a WRITE record that + covers all 4 bytes. Therefore, we must specify mtr_t::FORCED + in order to avoid optimizing away any unchanged most + significant bytes of FSP_SIZE. */ + mtr->write<4,mtr_t::FORCED>(*header, FSP_HEADER_OFFSET + FSP_SIZE + + header->frame, space->size); + space->size_in_header = space->size; + + return(success); +} + +/** Calculate the number of physical pages in an extent for this file. +@param[in] physical_size page_size of the datafile +@return number of pages in an extent for this file */ +inline uint32_t fsp_get_extent_size_in_pages(ulint physical_size) +{ + return uint32_t((FSP_EXTENT_SIZE << srv_page_size_shift) / physical_size); +} + + +/** Calculate the number of pages to extend a datafile. +We extend single-table tablespaces first one extent at a time, +but 4 at a time for bigger tablespaces. It is not enough to extend always +by one extent, because we need to add at least one extent to FSP_FREE. +A single extent descriptor page will track many extents. And the extent +that uses its extent descriptor page is put onto the FSP_FREE_FRAG list. +Extents that do not use their extent descriptor page are added to FSP_FREE. +The physical page size is used to determine how many extents are tracked +on one extent descriptor page. See xdes_calc_descriptor_page(). +@param[in] physical_size page size in data file +@param[in] size current number of pages in the datafile +@return number of pages to extend the file. */ +static uint32_t fsp_get_pages_to_extend_ibd(unsigned physical_size, + uint32_t size) +{ + uint32_t extent_size = fsp_get_extent_size_in_pages(physical_size); + /* The threshold is set at 32MiB except when the physical page + size is small enough that it must be done sooner. */ + uint32_t threshold = std::min(32 * extent_size, physical_size); + + if (size >= threshold) { + /* Below in fsp_fill_free_list() we assume + that we add at most FSP_FREE_ADD extents at + a time */ + extent_size *= FSP_FREE_ADD; + } + + return extent_size; +} + +/** Try to extend the last data file of a tablespace if it is auto-extending. +@param[in,out] space tablespace +@param[in,out] header tablespace header +@param[in,out] mtr mini-transaction +@return number of pages added +@retval 0 if the tablespace was not extended */ +ATTRIBUTE_COLD __attribute__((nonnull)) +static +ulint +fsp_try_extend_data_file(fil_space_t *space, buf_block_t *header, mtr_t *mtr) +{ + const char* OUT_OF_SPACE_MSG = + "ran out of space. Please add another file or use" + " 'autoextend' for the last file in setting"; + + ut_d(space->modify_check(*mtr)); + + if (space->id == TRX_SYS_SPACE + && !srv_sys_space.can_auto_extend_last_file()) { + + /* We print the error message only once to avoid + spamming the error log. Note that we don't need + to reset the flag to false as dealing with this + error requires server restart. */ + if (!srv_sys_space.get_tablespace_full_status()) { + ib::error() << "The InnoDB system tablespace " + << OUT_OF_SPACE_MSG + << " innodb_data_file_path."; + srv_sys_space.set_tablespace_full_status(true); + } + return(0); + } else if (space->id == SRV_TMP_SPACE_ID + && !srv_tmp_space.can_auto_extend_last_file()) { + + /* We print the error message only once to avoid + spamming the error log. Note that we don't need + to reset the flag to false as dealing with this + error requires server restart. */ + if (!srv_tmp_space.get_tablespace_full_status()) { + ib::error() << "The InnoDB temporary tablespace " + << OUT_OF_SPACE_MSG + << " innodb_temp_data_file_path."; + srv_tmp_space.set_tablespace_full_status(true); + } + return(0); + } + + uint32_t size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + + header->frame); + ut_ad(size == space->size_in_header); + uint32_t size_increase; + + const unsigned ps = space->physical_size(); + + switch (space->id) { + case TRX_SYS_SPACE: + size_increase = srv_sys_space.get_increment(); + break; + case SRV_TMP_SPACE_ID: + size_increase = srv_tmp_space.get_increment(); + break; + default: + uint32_t extent_pages = fsp_get_extent_size_in_pages(ps); + if (size < extent_pages) { + /* Let us first extend the file to extent_size */ + if (!fsp_try_extend_data_file_with_pages( + space, extent_pages - 1, header, mtr)) { + return(0); + } + + size = extent_pages; + } + + size_increase = fsp_get_pages_to_extend_ibd(ps, size); + } + + if (size_increase == 0) { + return(0); + } + + if (!fil_space_extend(space, size + size_increase)) { + return(0); + } + + /* We ignore any fragments of a full megabyte when storing the size + to the space header */ + + space->size_in_header = ut_2pow_round(space->size, (1024 * 1024) / ps); + + /* recv_sys_t::parse() expects to find a WRITE record that + covers all 4 bytes. Therefore, we must specify mtr_t::FORCED + in order to avoid optimizing away any unchanged most + significant bytes of FSP_SIZE. */ + mtr->write<4,mtr_t::FORCED>(*header, FSP_HEADER_OFFSET + FSP_SIZE + + header->frame, space->size_in_header); + + return(size_increase); +} + +/** Reset the page type. +Data files created before MySQL 5.1.48 may contain garbage in FIL_PAGE_TYPE. +In MySQL 3.23.53, only undo log pages and index pages were tagged. +Any other pages were written with uninitialized bytes in FIL_PAGE_TYPE. +@param[in] block block with invalid FIL_PAGE_TYPE +@param[in] type expected page type +@param[in,out] mtr mini-transaction */ +ATTRIBUTE_COLD +void fil_block_reset_type(const buf_block_t& block, ulint type, mtr_t* mtr) +{ + ib::info() + << "Resetting invalid page " << block.page.id() << " type " + << fil_page_get_type(block.frame) << " to " << type << "."; + mtr->write<2>(block, block.frame + FIL_PAGE_TYPE, type); +} + +/** Put new extents to the free list if there are free extents above the free +limit. If an extent happens to contain an extent descriptor page, the extent +is put to the FSP_FREE_FRAG list with the page marked as used. +@param[in] init_space true if this is a single-table tablespace +and we are only initializing the first extent and the first bitmap pages; +then we will not allocate more extents +@param[in,out] space tablespace +@param[in,out] header tablespace header +@param[in,out] mtr mini-transaction */ +static +void +fsp_fill_free_list( + bool init_space, + fil_space_t* space, + buf_block_t* header, + mtr_t* mtr) +{ + ut_d(space->modify_check(*mtr)); + + /* Check if we can fill free list from above the free list limit */ + uint32_t size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + + header->frame); + uint32_t limit = mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT + + header->frame); + + ut_ad(size == space->size_in_header); + ut_ad(limit == space->free_limit); + + const ulint zip_size = space->zip_size(); + + if (size < limit + FSP_EXTENT_SIZE * FSP_FREE_ADD) { + bool skip_resize = init_space; + switch (space->id) { + case TRX_SYS_SPACE: + skip_resize = !srv_sys_space.can_auto_extend_last_file(); + break; + case SRV_TMP_SPACE_ID: + skip_resize = !srv_tmp_space.can_auto_extend_last_file(); + break; + } + + if (!skip_resize) { + fsp_try_extend_data_file(space, header, mtr); + size = space->size_in_header; + } + } + + uint32_t count = 0; + + for (uint32_t i = limit, extent_size = FSP_EXTENT_SIZE, + physical_size = space->physical_size(); + (init_space && i < 1) + || (i + extent_size <= size && count < FSP_FREE_ADD); + i += extent_size) { + const bool init_xdes = !ut_2pow_remainder(i, physical_size); + + space->free_limit = i + extent_size; + mtr->write<4>(*header, FSP_HEADER_OFFSET + FSP_FREE_LIMIT + + header->frame, i + extent_size); + + if (init_xdes) { + + buf_block_t* block; + + /* We are going to initialize a new descriptor page + and a new ibuf bitmap page: the prior contents of the + pages should be ignored. */ + + if (i > 0) { + buf_block_t *f= buf_LRU_get_free_block(false); + block= buf_page_create( + space, static_cast<uint32_t>(i), + zip_size, mtr, f); + buf_block_dbg_add_level(block, SYNC_FSP_PAGE); + if (UNIV_UNLIKELY(block != f)) { + buf_pool.free_block(f); + } + fsp_init_file_page(space, block, mtr); + mtr->write<2>(*block, + FIL_PAGE_TYPE + block->frame, + FIL_PAGE_TYPE_XDES); + } + + if (space->purpose != FIL_TYPE_TEMPORARY) { + buf_block_t *f= buf_LRU_get_free_block(false); + block = buf_page_create( + space, + static_cast<uint32_t>( + i + FSP_IBUF_BITMAP_OFFSET), + zip_size, mtr, f); + buf_block_dbg_add_level(block, SYNC_FSP_PAGE); + if (UNIV_UNLIKELY(block != f)) { + buf_pool.free_block(f); + } + fsp_init_file_page(space, block, mtr); + mtr->write<2>(*block, + block->frame + FIL_PAGE_TYPE, + FIL_PAGE_IBUF_BITMAP); + } + } + + buf_block_t* xdes; + xdes_t* descr = xdes_get_descriptor_with_space_hdr( + header, space, i, &xdes, mtr, init_space); + if (xdes != header && !space->full_crc32()) { + fil_block_check_type(*xdes, FIL_PAGE_TYPE_XDES, mtr); + } + xdes_init(*xdes, descr, mtr); + const uint16_t xoffset= static_cast<uint16_t>( + descr - xdes->frame + XDES_FLST_NODE); + + if (UNIV_UNLIKELY(init_xdes)) { + + /* The first page in the extent is a descriptor page + and the second is an ibuf bitmap page: mark them + used */ + + xdes_set_free<false>(*xdes, descr, 0, mtr); + xdes_set_free<false>(*xdes, descr, + FSP_IBUF_BITMAP_OFFSET, mtr); + xdes_set_state(*xdes, descr, XDES_FREE_FRAG, mtr); + + flst_add_last(header, + FSP_HEADER_OFFSET + FSP_FREE_FRAG, + xdes, xoffset, mtr); + byte* n_used = FSP_HEADER_OFFSET + FSP_FRAG_N_USED + + header->frame; + mtr->write<4>(*header, n_used, + 2U + mach_read_from_4(n_used)); + } else { + flst_add_last(header, FSP_HEADER_OFFSET + FSP_FREE, + xdes, xoffset, mtr); + count++; + } + } + + space->free_len += count; +} + +/** Allocates a new free extent. +@param[in,out] space tablespace +@param[in] hint hint of which extent would be desirable: any +page offset in the extent goes; the hint must not be > FSP_FREE_LIMIT +@param[out] xdes extent descriptor page +@param[in,out] mtr mini-transaction +@return extent descriptor, NULL if cannot be allocated */ +static +xdes_t* +fsp_alloc_free_extent( + fil_space_t* space, + uint32_t hint, + buf_block_t** xdes, + mtr_t* mtr) +{ + fil_addr_t first; + xdes_t* descr; + buf_block_t* desc_block = NULL; + + buf_block_t* header = fsp_get_header(space, mtr); + + descr = xdes_get_descriptor_with_space_hdr( + header, space, hint, &desc_block, mtr); + + if (desc_block != header && !space->full_crc32()) { + fil_block_check_type(*desc_block, FIL_PAGE_TYPE_XDES, mtr); + } + + if (descr && (xdes_get_state(descr) == XDES_FREE)) { + /* Ok, we can take this extent */ + } else { + /* Take the first extent in the free list */ + first = flst_get_first(FSP_HEADER_OFFSET + FSP_FREE + + header->frame); + + if (first.page == FIL_NULL) { + fsp_fill_free_list(false, space, header, mtr); + + first = flst_get_first(FSP_HEADER_OFFSET + FSP_FREE + + header->frame); + if (first.page == FIL_NULL) { + return nullptr; /* No free extents left */ + } + } + + descr = xdes_lst_get_descriptor(space, first, &desc_block, + mtr); + } + + flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE, desc_block, + static_cast<uint16_t>( + descr - desc_block->frame + XDES_FLST_NODE), mtr); + space->free_len--; + *xdes = desc_block; + + return(descr); +} + +/** Allocate a single free page. +@param[in,out] header tablespace header +@param[in,out] xdes extent descriptor page +@param[in,out] descr extent descriptor +@param[in] bit slot to allocate in the extent +@param[in,out] mtr mini-transaction */ +static void +fsp_alloc_from_free_frag(buf_block_t *header, buf_block_t *xdes, xdes_t *descr, + ulint bit, mtr_t *mtr) +{ + ut_ad(xdes_get_state(descr) == XDES_FREE_FRAG); + ut_a(xdes_is_free(descr, bit)); + xdes_set_free<false>(*xdes, descr, bit, mtr); + + /* Update the FRAG_N_USED field */ + byte* n_used_p = FSP_HEADER_OFFSET + FSP_FRAG_N_USED + header->frame; + + uint32_t n_used = mach_read_from_4(n_used_p) + 1; + + if (xdes_is_full(descr)) { + /* The fragment is full: move it to another list */ + const uint16_t xoffset= static_cast<uint16_t>( + descr - xdes->frame + XDES_FLST_NODE); + flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG, + xdes, xoffset, mtr); + xdes_set_state(*xdes, descr, XDES_FULL_FRAG, mtr); + + flst_add_last(header, FSP_HEADER_OFFSET + FSP_FULL_FRAG, + xdes, xoffset, mtr); + n_used -= FSP_EXTENT_SIZE; + } + + mtr->write<4>(*header, n_used_p, n_used); +} + +/** Gets a buffer block for an allocated page. +@param[in,out] space tablespace +@param[in] offset page number of the allocated page +@param[in,out] mtr mini-transaction +@return block, initialized */ +static +buf_block_t* +fsp_page_create(fil_space_t *space, page_no_t offset, mtr_t *mtr) +{ + buf_block_t *free_block= buf_LRU_get_free_block(false); + buf_block_t *block= buf_page_create(space, static_cast<uint32_t>(offset), + space->zip_size(), mtr, free_block); + if (UNIV_UNLIKELY(block != free_block)) + buf_pool.free_block(free_block); + fsp_init_file_page(space, block, mtr); + return block; +} + +/** Allocates a single free page from a space. +The page is marked as used. +@param[in,out] space tablespace +@param[in] hint hint of which page would be desirable +@param[in,out] mtr mini-transaction +@param[in,out] init_mtr mini-transaction in which the page should be +initialized (may be the same as mtr) +@retval NULL if no page could be allocated */ +static MY_ATTRIBUTE((warn_unused_result, nonnull)) +buf_block_t* +fsp_alloc_free_page( + fil_space_t* space, + uint32_t hint, + mtr_t* mtr, + mtr_t* init_mtr) +{ + fil_addr_t first; + xdes_t* descr; + const ulint space_id = space->id; + + ut_d(space->modify_check(*mtr)); + buf_block_t* block = fsp_get_header(space, mtr); + buf_block_t *xdes; + + /* Get the hinted descriptor */ + descr = xdes_get_descriptor_with_space_hdr(block, space, hint, &xdes, + mtr); + + if (descr && (xdes_get_state(descr) == XDES_FREE_FRAG)) { + /* Ok, we can take this extent */ + } else { + /* Else take the first extent in free_frag list */ + first = flst_get_first(FSP_HEADER_OFFSET + FSP_FREE_FRAG + + block->frame); + + if (first.page == FIL_NULL) { + /* There are no partially full fragments: allocate + a free extent and add it to the FREE_FRAG list. NOTE + that the allocation may have as a side-effect that an + extent containing a descriptor page is added to the + FREE_FRAG list. But we will allocate our page from the + the free extent anyway. */ + + descr = fsp_alloc_free_extent(space, hint, &xdes, mtr); + + if (descr == NULL) { + /* No free space left */ + + return(NULL); + } + + xdes_set_state(*xdes, descr, XDES_FREE_FRAG, mtr); + flst_add_last(block, FSP_HEADER_OFFSET + FSP_FREE_FRAG, + xdes, static_cast<uint16_t>( + descr - xdes->frame + + XDES_FLST_NODE), mtr); + } else { + descr = xdes_lst_get_descriptor(space, first, &xdes, + mtr); + } + + /* Reset the hint */ + hint = 0; + } + + /* Now we have in descr an extent with at least one free page. Look + for a free page in the extent. */ + + uint32_t free = xdes_find_free(descr, hint % FSP_EXTENT_SIZE); + if (free == FIL_NULL) { + + ut_print_buf(stderr, ((byte*) descr) - 500, 1000); + putc('\n', stderr); + + ut_error; + } + + uint32_t page_no = xdes_get_offset(descr) + free; + + uint32_t space_size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + + block->frame); + ut_ad(space_size == space->size_in_header + || (space_id == TRX_SYS_SPACE + && srv_startup_is_before_trx_rollback_phase)); + + if (space_size <= page_no) { + /* It must be that we are extending a single-table tablespace + whose size is still < 64 pages */ + + ut_a(!is_system_tablespace(space_id)); + if (page_no >= FSP_EXTENT_SIZE) { + ib::error() << "Trying to extend a single-table" + " tablespace " << space->name << " , by single" + " page(s) though the space size " << space_size + << ". Page no " << page_no << "."; + return(NULL); + } + + if (!fsp_try_extend_data_file_with_pages(space, page_no, + block, mtr)) { + /* No disk space left */ + return(NULL); + } + } + + fsp_alloc_from_free_frag(block, xdes, descr, free, mtr); + return fsp_page_create(space, page_no, init_mtr); +} + +/** Frees a single page of a space. +The page is marked as free and clean. +@param[in,out] space tablespace +@param[in] offset page number +@param[in,out] mtr mini-transaction */ +static void fsp_free_page(fil_space_t* space, page_no_t offset, mtr_t* mtr) +{ + xdes_t* descr; + ulint state; + ulint frag_n_used; + + ut_ad(mtr); + ut_d(space->modify_check(*mtr)); + + /* fprintf(stderr, "Freeing page %lu in space %lu\n", page, space); */ + + buf_block_t* header = fsp_get_header(space, mtr); + buf_block_t* xdes= 0; + + descr = xdes_get_descriptor_with_space_hdr(header, space, offset, + &xdes, mtr); + + state = xdes_get_state(descr); + + if (UNIV_UNLIKELY(state != XDES_FREE_FRAG + && state != XDES_FULL_FRAG)) { + ib::error() << "File space extent descriptor of page " + << page_id_t(space->id, offset) + << " has state " << state; + /* Crash in debug version, so that we get a core dump + of this corruption. */ + ut_ad(0); + + if (state == XDES_FREE) { + /* We put here some fault tolerance: if the page + is already free, return without doing anything! */ + + return; + } + + ut_error; + } + + if (xdes_is_free(descr, offset % FSP_EXTENT_SIZE)) { + ib::error() << "File space extent descriptor of page " + << page_id_t(space->id, offset) + << " says it is free."; + /* Crash in debug version, so that we get a core dump + of this corruption. */ + ut_ad(0); + + /* We put here some fault tolerance: if the page + is already free, return without doing anything! */ + + return; + } + + mtr->free(*space, static_cast<uint32_t>(offset)); + + const ulint bit = offset % FSP_EXTENT_SIZE; + + xdes_set_free<true>(*xdes, descr, bit, mtr); + + frag_n_used = mach_read_from_4(FSP_HEADER_OFFSET + FSP_FRAG_N_USED + + header->frame); + + const uint16_t xoffset= static_cast<uint16_t>(descr - xdes->frame + + XDES_FLST_NODE); + + if (state == XDES_FULL_FRAG) { + /* The fragment was full: move it to another list */ + flst_remove(header, FSP_HEADER_OFFSET + FSP_FULL_FRAG, + xdes, xoffset, mtr); + xdes_set_state(*xdes, descr, XDES_FREE_FRAG, mtr); + flst_add_last(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG, + xdes, xoffset, mtr); + mtr->write<4>(*header, FSP_HEADER_OFFSET + FSP_FRAG_N_USED + + header->frame, + frag_n_used + FSP_EXTENT_SIZE - 1); + } else { + ut_a(frag_n_used > 0); + mtr->write<4>(*header, FSP_HEADER_OFFSET + FSP_FRAG_N_USED + + header->frame, frag_n_used - 1); + } + + if (!xdes_get_n_used(descr)) { + /* The extent has become free: move it to another list */ + flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG, + xdes, xoffset, mtr); + fsp_free_extent(space, offset, mtr); + } +} + +/** Return an extent to the free list of a space. +@param[in,out] space tablespace +@param[in] offset page number in the extent +@param[in,out] mtr mini-transaction */ +static void fsp_free_extent(fil_space_t* space, page_no_t offset, mtr_t* mtr) +{ + ut_ad(mtr->memo_contains(*space)); + + buf_block_t *block= fsp_get_header(space, mtr); + buf_block_t *xdes= 0; + + xdes_t* descr= xdes_get_descriptor_with_space_hdr(block, space, offset, + &xdes, mtr); + ut_a(xdes_get_state(descr) != XDES_FREE); + + xdes_init(*xdes, descr, mtr); + + flst_add_last(block, FSP_HEADER_OFFSET + FSP_FREE, + xdes, static_cast<uint16_t>(descr - xdes->frame + + XDES_FLST_NODE), mtr); + space->free_len++; +} + +/** @return Number of segment inodes which fit on a single page */ +inline ulint FSP_SEG_INODES_PER_PAGE(ulint physical_size) +{ + return (physical_size - FSEG_ARR_OFFSET - 10) / FSEG_INODE_SIZE; +} + +/** Returns the nth inode slot on an inode page. +@param[in] page segment inode page +@param[in] i inode index on page +@return segment inode */ +#define fsp_seg_inode_page_get_nth_inode(page, i) \ + FSEG_ARR_OFFSET + FSEG_INODE_SIZE * i + page + +/** Looks for a used segment inode on a segment inode page. +@param[in] page segment inode page +@param[in] physical_size page size +@return segment inode index, or ULINT_UNDEFINED if not found */ +static +ulint +fsp_seg_inode_page_find_used(const page_t* page, ulint physical_size) +{ + for (ulint i = 0; i < FSP_SEG_INODES_PER_PAGE(physical_size); i++) { + if (!mach_read_from_8( + FSEG_ID + + fsp_seg_inode_page_get_nth_inode(page, i))) { + continue; + } + /* This is used */ + ut_ad(FSEG_MAGIC_N_VALUE == mach_read_from_4( + FSEG_MAGIC_N + + fsp_seg_inode_page_get_nth_inode(page, i))); + return i; + } + + return(ULINT_UNDEFINED); +} + +/** Looks for an unused segment inode on a segment inode page. +@param[in] page segment inode page +@param[in] i search forward starting from this index +@param[in] physical_size page size +@return segment inode index, or ULINT_UNDEFINED if not found */ +static +ulint +fsp_seg_inode_page_find_free(const page_t* page, ulint i, ulint physical_size) +{ + for (; i < FSP_SEG_INODES_PER_PAGE(physical_size); i++) { + if (!mach_read_from_8( + FSEG_ID + + fsp_seg_inode_page_get_nth_inode(page, i))) { + /* This is unused */ + return i; + } + + ut_ad(FSEG_MAGIC_N_VALUE == mach_read_from_4( + FSEG_MAGIC_N + + fsp_seg_inode_page_get_nth_inode(page, i))); + } + + return ULINT_UNDEFINED; +} + +/** Allocate a file segment inode page. +@param[in,out] space tablespace +@param[in,out] header tablespace header +@param[in,out] mtr mini-transaction +@return whether the allocation succeeded */ +MY_ATTRIBUTE((nonnull, warn_unused_result)) +static +bool +fsp_alloc_seg_inode_page(fil_space_t *space, buf_block_t *header, mtr_t *mtr) +{ + ut_ad(header->page.id().space() == space->id); + buf_block_t *block= fsp_alloc_free_page(space, 0, mtr, mtr); + + if (!block) + return false; + + buf_block_dbg_add_level(block, SYNC_FSP_PAGE); + ut_ad(rw_lock_get_x_lock_count(&block->lock) == 1); + + mtr->write<2>(*block, block->frame + FIL_PAGE_TYPE, FIL_PAGE_INODE); + +#ifdef UNIV_DEBUG + const byte *inode= FSEG_ID + FSEG_ARR_OFFSET + block->frame; + for (ulint i= FSP_SEG_INODES_PER_PAGE(space->physical_size()); i--; + inode += FSEG_INODE_SIZE) + ut_ad(!mach_read_from_8(inode)); +#endif + + flst_add_last(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE, + block, FSEG_INODE_PAGE_NODE, mtr); + return true; +} + +/** Allocate a file segment inode. +@param[in,out] space tablespace +@param[in,out] header tablespace header +@param[out] iblock segment inode page +@param[in,out] mtr mini-transaction +@return segment inode +@retval NULL if not enough space */ +MY_ATTRIBUTE((nonnull, warn_unused_result)) +static fseg_inode_t* +fsp_alloc_seg_inode(fil_space_t *space, buf_block_t *header, + buf_block_t **iblock, mtr_t *mtr) +{ + buf_block_t* block; + fseg_inode_t* inode; + + /* Allocate a new segment inode page if needed. */ + if (!flst_get_len(FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE + + header->frame) + && !fsp_alloc_seg_inode_page(space, header, mtr)) { + return(NULL); + } + const page_id_t page_id( + space->id, + flst_get_first(FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE + + header->frame).page); + + block = buf_page_get(page_id, space->zip_size(), RW_SX_LATCH, mtr); + buf_block_dbg_add_level(block, SYNC_FSP_PAGE); + if (!space->full_crc32()) { + fil_block_check_type(*block, FIL_PAGE_INODE, mtr); + } + + const ulint physical_size = space->physical_size(); + + ulint n = fsp_seg_inode_page_find_free(block->frame, 0, physical_size); + + ut_a(n < FSP_SEG_INODES_PER_PAGE(physical_size)); + + inode = fsp_seg_inode_page_get_nth_inode(block->frame, n); + + if (ULINT_UNDEFINED == fsp_seg_inode_page_find_free(block->frame, + n + 1, + physical_size)) { + /* There are no other unused headers left on the page: move it + to another list */ + flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE, + block, FSEG_INODE_PAGE_NODE, mtr); + flst_add_last(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL, + block, FSEG_INODE_PAGE_NODE, mtr); + } + + ut_ad(!mach_read_from_8(inode + FSEG_ID) + || mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE); + *iblock = block; + return(inode); +} + +/** Frees a file segment inode. +@param[in,out] space tablespace +@param[in,out] inode segment inode +@param[in,out] iblock segment inode page +@param[in,out] mtr mini-transaction */ +static void fsp_free_seg_inode( + fil_space_t* space, + fseg_inode_t* inode, + buf_block_t* iblock, + mtr_t* mtr) +{ + ut_d(space->modify_check(*mtr)); + + buf_block_t* header = fsp_get_header(space, mtr); + + ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE); + + const ulint physical_size = space->physical_size(); + + if (ULINT_UNDEFINED + == fsp_seg_inode_page_find_free(iblock->frame, 0, physical_size)) { + /* Move the page to another list */ + flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL, + iblock, FSEG_INODE_PAGE_NODE, mtr); + flst_add_last(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE, + iblock, FSEG_INODE_PAGE_NODE, mtr); + } + + mtr->memset(iblock, page_offset(inode) + FSEG_ID, FSEG_INODE_SIZE, 0); + + if (ULINT_UNDEFINED + == fsp_seg_inode_page_find_used(iblock->frame, physical_size)) { + /* There are no other used headers left on the page: free it */ + flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE, + iblock, FSEG_INODE_PAGE_NODE, mtr); + fsp_free_page(space, iblock->page.id().page_no(), mtr); + } +} + +/** Returns the file segment inode, page x-latched. +@param[in] header segment header +@param[in] space space id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] mtr mini-transaction +@param[out] block inode block, or NULL to ignore +@return segment inode, page x-latched; NULL if the inode is free */ +static +fseg_inode_t* +fseg_inode_try_get( + const fseg_header_t* header, + ulint space, + ulint zip_size, + mtr_t* mtr, + buf_block_t** block) +{ + fil_addr_t inode_addr; + fseg_inode_t* inode; + + inode_addr.page = mach_read_from_4(header + FSEG_HDR_PAGE_NO); + inode_addr.boffset = mach_read_from_2(header + FSEG_HDR_OFFSET); + ut_ad(space == mach_read_from_4(header + FSEG_HDR_SPACE)); + + inode = fut_get_ptr(space, zip_size, inode_addr, RW_SX_LATCH, mtr, + block); + + if (UNIV_UNLIKELY(!mach_read_from_8(inode + FSEG_ID))) { + + inode = NULL; + } else { + ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) + == FSEG_MAGIC_N_VALUE); + } + + return(inode); +} + +/** Returns the file segment inode, page x-latched. +@param[in] header segment header +@param[in] space space id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] mtr mini-transaction +@param[out] block inode block +@return segment inode, page x-latched */ +static +fseg_inode_t* +fseg_inode_get( + const fseg_header_t* header, + ulint space, + ulint zip_size, + mtr_t* mtr, + buf_block_t** block = NULL) +{ + fseg_inode_t* inode + = fseg_inode_try_get(header, space, zip_size, mtr, block); + ut_a(inode); + return(inode); +} + +/** Get the page number from the nth fragment page slot. +@param inode file segment findex +@param n slot index +@return page number +@retval FIL_NULL if not in use */ +static uint32_t fseg_get_nth_frag_page_no(const fseg_inode_t *inode, ulint n) +{ + ut_ad(inode); + ut_ad(n < FSEG_FRAG_ARR_N_SLOTS); + ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE); + return(mach_read_from_4(inode + FSEG_FRAG_ARR + + n * FSEG_FRAG_SLOT_SIZE)); +} + +/** Set the page number in the nth fragment page slot. +@param[in,out] inode segment inode +@param[in,out] iblock segment inode page +@param[in] n slot index +@param[in] page_no page number to set +@param[in,out] mtr mini-transaction */ +inline void fseg_set_nth_frag_page_no(fseg_inode_t *inode, buf_block_t *iblock, + ulint n, ulint page_no, mtr_t *mtr) +{ + ut_ad(n < FSEG_FRAG_ARR_N_SLOTS); + ut_ad(mtr->memo_contains_flagged(iblock, MTR_MEMO_PAGE_SX_FIX)); + ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE); + + mtr->write<4>(*iblock, inode + FSEG_FRAG_ARR + n * FSEG_FRAG_SLOT_SIZE, + page_no); +} + +/**********************************************************************//** +Finds a fragment page slot which is free. +@return slot index; ULINT_UNDEFINED if none found */ +static +ulint +fseg_find_free_frag_page_slot( +/*==========================*/ + fseg_inode_t* inode) /*!< in: segment inode */ +{ + ulint i; + ulint page_no; + + for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) { + page_no = fseg_get_nth_frag_page_no(inode, i); + + if (page_no == FIL_NULL) { + + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +/**********************************************************************//** +Finds a fragment page slot which is used and last in the array. +@return slot index; ULINT_UNDEFINED if none found */ +static +ulint +fseg_find_last_used_frag_page_slot( +/*===============================*/ + fseg_inode_t* inode) /*!< in: segment inode */ +{ + ulint i; + ulint page_no; + + for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) { + page_no = fseg_get_nth_frag_page_no( + inode, FSEG_FRAG_ARR_N_SLOTS - i - 1); + + if (page_no != FIL_NULL) { + + return(FSEG_FRAG_ARR_N_SLOTS - i - 1); + } + } + + return(ULINT_UNDEFINED); +} + +/** Calculate reserved fragment page slots. +@param inode file segment index +@return number of fragment pages */ +static ulint fseg_get_n_frag_pages(const fseg_inode_t *inode) +{ + ulint i; + ulint count = 0; + + for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) { + if (FIL_NULL != fseg_get_nth_frag_page_no(inode, i)) { + count++; + } + } + + return(count); +} + +/** Create a new segment. +@param space tablespace +@param byte_offset byte offset of the created segment header +@param mtr mini-transaction +@param has_done_reservation whether fsp_reserve_free_extents() was invoked +@param block block where segment header is placed, + or NULL to allocate an additional page for that +@return the block where the segment header is placed, x-latched +@retval NULL if could not create segment because of lack of space */ +buf_block_t* +fseg_create(fil_space_t *space, ulint byte_offset, mtr_t *mtr, + bool has_done_reservation, buf_block_t *block) +{ + fseg_inode_t* inode; + ib_id_t seg_id; + uint32_t n_reserved; + + DBUG_ENTER("fseg_create"); + + ut_ad(mtr); + ut_ad(byte_offset >= FIL_PAGE_DATA); + ut_ad(byte_offset + FSEG_HEADER_SIZE + <= srv_page_size - FIL_PAGE_DATA_END); + + mtr_x_lock_space(space, mtr); + ut_d(space->modify_check(*mtr)); + + if (block) { + ut_ad(block->page.id().space() == space->id); + + if (!space->full_crc32()) { + fil_block_check_type(*block, block->page.id() + == page_id_t(TRX_SYS_SPACE, + TRX_SYS_PAGE_NO) + ? FIL_PAGE_TYPE_TRX_SYS + : FIL_PAGE_TYPE_SYS, + mtr); + } + } + + if (!has_done_reservation + && !fsp_reserve_free_extents(&n_reserved, space, 2, + FSP_NORMAL, mtr)) { + DBUG_RETURN(NULL); + } + + buf_block_t* header = fsp_get_header(space, mtr); + buf_block_t* iblock; + + inode = fsp_alloc_seg_inode(space, header, &iblock, mtr); + + if (inode == NULL) { + goto funct_exit; + } + + /* Read the next segment id from space header and increment the + value in space header */ + + seg_id = mach_read_from_8(FSP_HEADER_OFFSET + FSP_SEG_ID + + header->frame); + + mtr->write<8>(*header, FSP_HEADER_OFFSET + FSP_SEG_ID + header->frame, + seg_id + 1); + mtr->write<8>(*iblock, inode + FSEG_ID, seg_id); + ut_ad(!mach_read_from_4(inode + FSEG_NOT_FULL_N_USED)); + + flst_init(*iblock, inode + FSEG_FREE, mtr); + flst_init(*iblock, inode + FSEG_NOT_FULL, mtr); + flst_init(*iblock, inode + FSEG_FULL, mtr); + + mtr->write<4>(*iblock, inode + FSEG_MAGIC_N, FSEG_MAGIC_N_VALUE); + compile_time_assert(FSEG_FRAG_SLOT_SIZE == 4); + compile_time_assert(FIL_NULL == 0xffffffff); + mtr->memset(iblock, uint16_t(inode - iblock->frame) + FSEG_FRAG_ARR, + FSEG_FRAG_SLOT_SIZE * FSEG_FRAG_ARR_N_SLOTS, 0xff); + + if (!block) { + block = fseg_alloc_free_page_low(space, + inode, iblock, 0, FSP_UP, +#ifdef UNIV_DEBUG + has_done_reservation, +#endif /* UNIV_DEBUG */ + mtr, mtr); + + /* The allocation cannot fail if we have already reserved a + space for the page. */ + ut_ad(!has_done_reservation || block != NULL); + + if (block == NULL) { + fsp_free_seg_inode(space, inode, iblock, mtr); + goto funct_exit; + } + + ut_ad(rw_lock_get_x_lock_count(&block->lock) == 1); + ut_ad(!fil_page_get_type(block->frame)); + mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->frame, + FIL_PAGE_TYPE_SYS); + } + + mtr->write<2>(*block, byte_offset + FSEG_HDR_OFFSET + + block->frame, page_offset(inode)); + + mtr->write<4>(*block, byte_offset + FSEG_HDR_PAGE_NO + + block->frame, iblock->page.id().page_no()); + + mtr->write<4,mtr_t::MAYBE_NOP>(*block, byte_offset + FSEG_HDR_SPACE + + block->frame, space->id); + +funct_exit: + if (!has_done_reservation) { + space->release_free_extents(n_reserved); + } + + DBUG_RETURN(block); +} + +/**********************************************************************//** +Calculates the number of pages reserved by a segment, and how many pages are +currently used. +@return number of reserved pages */ +static +ulint +fseg_n_reserved_pages_low( +/*======================*/ + const fseg_inode_t* inode, /*!< in: segment inode */ + ulint* used) /*!< out: number of pages used (not + more than reserved) */ +{ + *used = mach_read_from_4(inode + FSEG_NOT_FULL_N_USED) + + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FULL) + + fseg_get_n_frag_pages(inode); + + return fseg_get_n_frag_pages(inode) + + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FREE) + + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_NOT_FULL) + + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FULL); +} + +/** Calculate the number of pages reserved by a segment, +and how many pages are currently used. +@param[in] block buffer block containing the file segment header +@param[in] header file segment header +@param[out] used number of pages that are used (not more than reserved) +@param[in,out] mtr mini-transaction +@return number of reserved pages */ +ulint fseg_n_reserved_pages(const buf_block_t &block, + const fseg_header_t *header, ulint *used, + mtr_t *mtr) +{ + ut_ad(page_align(header) == block.frame); + return fseg_n_reserved_pages_low(fseg_inode_get(header, + block.page.id().space(), + block.zip_size(), mtr), + used); +} + +/** Tries to fill the free list of a segment with consecutive free extents. +This happens if the segment is big enough to allow extents in the free list, +the free list is empty, and the extents can be allocated consecutively from +the hint onward. +@param[in,out] inode segment inode +@param[in,out] iblock segment inode page +@param[in] space tablespace +@param[in] hint hint which extent would be good as the first extent +@param[in,out] mtr mini-transaction */ +static +void +fseg_fill_free_list( + fseg_inode_t* inode, + buf_block_t* iblock, + fil_space_t* space, + uint32_t hint, + mtr_t* mtr) +{ + xdes_t* descr; + ulint i; + ib_id_t seg_id; + ulint reserved; + ulint used; + + ut_ad(inode && mtr); + ut_ad(!((page_offset(inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE)); + ut_d(space->modify_check(*mtr)); + + reserved = fseg_n_reserved_pages_low(inode, &used); + + if (reserved < FSEG_FREE_LIST_LIMIT * FSP_EXTENT_SIZE) { + + /* The segment is too small to allow extents in free list */ + + return; + } + + if (flst_get_len(inode + FSEG_FREE) > 0) { + /* Free list is not empty */ + + return; + } + + for (i = 0; i < FSEG_FREE_LIST_MAX_LEN; i++) { + buf_block_t* xdes; + descr = xdes_get_descriptor(space, hint, &xdes, mtr); + + if (!descr || (XDES_FREE != xdes_get_state(descr))) { + /* We cannot allocate the desired extent: stop */ + return; + } + + descr = fsp_alloc_free_extent(space, hint, &xdes, mtr); + + xdes_set_state(*xdes, descr, XDES_FSEG, mtr); + + seg_id = mach_read_from_8(inode + FSEG_ID); + ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) + == FSEG_MAGIC_N_VALUE); + mtr->write<8>(*xdes, descr + XDES_ID, seg_id); + + flst_add_last(iblock, + static_cast<uint16_t>(inode - iblock->frame + + FSEG_FREE), xdes, + static_cast<uint16_t>(descr - xdes->frame + + XDES_FLST_NODE), mtr); + hint += FSP_EXTENT_SIZE; + } +} + +/** Allocates a free extent for the segment: looks first in the free list of +the segment, then tries to allocate from the space free list. +NOTE that the extent returned still resides in the segment free list, it is +not yet taken off it! +@param[in,out] inode segment inode +@param[in,out] iblock segment inode page +@param[out] xdes extent descriptor page +@param[in,out] space tablespace +@param[in,out] mtr mini-transaction +@retval NULL if no page could be allocated */ +static +xdes_t* +fseg_alloc_free_extent( + fseg_inode_t* inode, + buf_block_t* iblock, + buf_block_t** xdes, + fil_space_t* space, + mtr_t* mtr) +{ + xdes_t* descr; + ib_id_t seg_id; + fil_addr_t first; + + ut_ad(!((page_offset(inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE)); + ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE); + ut_d(space->modify_check(*mtr)); + + if (flst_get_len(inode + FSEG_FREE) > 0) { + /* Segment free list is not empty, allocate from it */ + + first = flst_get_first(inode + FSEG_FREE); + + descr = xdes_lst_get_descriptor(space, first, xdes, mtr); + } else { + /* Segment free list was empty, allocate from space */ + descr = fsp_alloc_free_extent(space, 0, xdes, mtr); + + if (descr == NULL) { + + return(NULL); + } + + seg_id = mach_read_from_8(inode + FSEG_ID); + + xdes_set_state(**xdes, descr, XDES_FSEG, mtr); + mtr->write<8,mtr_t::MAYBE_NOP>(**xdes, descr + XDES_ID, + seg_id); + flst_add_last(iblock, + static_cast<uint16_t>(inode - iblock->frame + + FSEG_FREE), *xdes, + static_cast<uint16_t>(descr - (*xdes)->frame + + XDES_FLST_NODE), mtr); + + /* Try to fill the segment free list */ + fseg_fill_free_list(inode, iblock, space, + xdes_get_offset(descr) + FSP_EXTENT_SIZE, + mtr); + } + + return(descr); +} + +/** Allocates a single free page from a segment. +This function implements the intelligent allocation strategy which tries to +minimize file space fragmentation. +@param[in,out] space tablespace +@param[in,out] seg_inode segment inode +@param[in,out] iblock segment inode page +@param[in] hint hint of which page would be desirable +@param[in] direction if the new page is needed because of +an index page split, and records are inserted there in order, into which +direction they go alphabetically: FSP_DOWN, FSP_UP, FSP_NO_DIR +@param[in,out] mtr mini-transaction +@param[in,out] init_mtr mtr or another mini-transaction in +which the page should be initialized. +@retval NULL if no page could be allocated */ +static +buf_block_t* +fseg_alloc_free_page_low( + fil_space_t* space, + fseg_inode_t* seg_inode, + buf_block_t* iblock, + uint32_t hint, + byte direction, +#ifdef UNIV_DEBUG + bool has_done_reservation, + /*!< whether the space has already been reserved */ +#endif /* UNIV_DEBUG */ + mtr_t* mtr, + mtr_t* init_mtr) +{ + ib_id_t seg_id; + ulint used; + ulint reserved; + xdes_t* descr; /*!< extent of the hinted page */ + uint32_t ret_page; /*!< the allocated page offset, FIL_NULL + if could not be allocated */ + xdes_t* ret_descr; /*!< the extent of the allocated page */ + buf_block_t* xdes; + ulint n; + const ulint space_id = space->id; + + ut_ad((direction >= FSP_UP) && (direction <= FSP_NO_DIR)); + ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N) + == FSEG_MAGIC_N_VALUE); + ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE)); + seg_id = mach_read_from_8(seg_inode + FSEG_ID); + + ut_ad(seg_id); + ut_d(space->modify_check(*mtr)); + ut_ad(fil_page_get_type(page_align(seg_inode)) == FIL_PAGE_INODE); + + reserved = fseg_n_reserved_pages_low(seg_inode, &used); + + buf_block_t* header = fsp_get_header(space, mtr); + + descr = xdes_get_descriptor_with_space_hdr(header, space, hint, + &xdes, mtr); + if (descr == NULL) { + /* Hint outside space or too high above free limit: reset + hint */ + /* The file space header page is always allocated. */ + hint = 0; + descr = xdes_get_descriptor(space, hint, &xdes, mtr); + } + + /* In the big if-else below we look for ret_page and ret_descr */ + /*-------------------------------------------------------------*/ + if ((xdes_get_state(descr) == XDES_FSEG) + && mach_read_from_8(descr + XDES_ID) == seg_id + && xdes_is_free(descr, hint % FSP_EXTENT_SIZE)) { +take_hinted_page: + /* 1. We can take the hinted page + =================================*/ + ret_descr = descr; + ret_page = hint; + /* Skip the check for extending the tablespace. If the + page hint were not within the size of the tablespace, + we would have got (descr == NULL) above and reset the hint. */ + goto got_hinted_page; + /*-----------------------------------------------------------*/ + } else if (xdes_get_state(descr) == XDES_FREE + && reserved - used < reserved / FSEG_FILLFACTOR + && used >= FSEG_FRAG_LIMIT) { + + /* 2. We allocate the free extent from space and can take + ========================================================= + the hinted page + ===============*/ + ret_descr = fsp_alloc_free_extent(space, hint, &xdes, mtr); + + ut_a(ret_descr == descr); + + xdes_set_state(*xdes, ret_descr, XDES_FSEG, mtr); + mtr->write<8,mtr_t::MAYBE_NOP>(*xdes, ret_descr + XDES_ID, + seg_id); + flst_add_last(iblock, + static_cast<uint16_t>(seg_inode - iblock->frame + + FSEG_FREE), xdes, + static_cast<uint16_t>(ret_descr - xdes->frame + + XDES_FLST_NODE), mtr); + + /* Try to fill the segment free list */ + fseg_fill_free_list(seg_inode, iblock, space, + hint + FSP_EXTENT_SIZE, mtr); + goto take_hinted_page; + /*-----------------------------------------------------------*/ + } else if ((direction != FSP_NO_DIR) + && ((reserved - used) < reserved / FSEG_FILLFACTOR) + && (used >= FSEG_FRAG_LIMIT) + && !!(ret_descr = fseg_alloc_free_extent(seg_inode, iblock, + &xdes, space, + mtr))) { + /* 3. We take any free extent (which was already assigned above + =============================================================== + in the if-condition to ret_descr) and take the lowest or + ======================================================== + highest page in it, depending on the direction + ==============================================*/ + ret_page = xdes_get_offset(ret_descr); + + if (direction == FSP_DOWN) { + ret_page += FSP_EXTENT_SIZE - 1; + } + ut_ad(!has_done_reservation || ret_page != FIL_NULL); + /*-----------------------------------------------------------*/ + } else if ((xdes_get_state(descr) == XDES_FSEG) + && mach_read_from_8(descr + XDES_ID) == seg_id + && (!xdes_is_full(descr))) { + + /* 4. We can take the page from the same extent as the + ====================================================== + hinted page (and the extent already belongs to the + ================================================== + segment) + ========*/ + ret_descr = descr; + ret_page = xdes_find_free(ret_descr, hint % FSP_EXTENT_SIZE); + if (ret_page == FIL_NULL) { + ut_ad(!has_done_reservation); + } else { + ret_page += xdes_get_offset(ret_descr); + } + /*-----------------------------------------------------------*/ + } else if (reserved - used > 0) { + /* 5. We take any unused page from the segment + ==============================================*/ + fil_addr_t first; + + if (flst_get_len(seg_inode + FSEG_NOT_FULL) > 0) { + first = flst_get_first(seg_inode + FSEG_NOT_FULL); + } else if (flst_get_len(seg_inode + FSEG_FREE) > 0) { + first = flst_get_first(seg_inode + FSEG_FREE); + } else { + ut_ad(!has_done_reservation); + return(NULL); + } + + ret_descr = xdes_lst_get_descriptor(space, first, &xdes, mtr); + ret_page = xdes_find_free(ret_descr); + if (ret_page == FIL_NULL) { + ut_ad(!has_done_reservation); + } else { + ret_page += xdes_get_offset(ret_descr); + } + /*-----------------------------------------------------------*/ + } else if (used < FSEG_FRAG_LIMIT) { + /* 6. We allocate an individual page from the space + ===================================================*/ + buf_block_t* block = fsp_alloc_free_page( + space, hint, mtr, init_mtr); + + ut_ad(!has_done_reservation || block); + + if (block) { + /* Put the page in the fragment page array of the + segment */ + n = fseg_find_free_frag_page_slot(seg_inode); + ut_a(n != ULINT_UNDEFINED); + + fseg_set_nth_frag_page_no( + seg_inode, iblock, n, + block->page.id().page_no(), mtr); + } + + /* fsp_alloc_free_page() invoked fsp_init_file_page() + already. */ + return(block); + /*-----------------------------------------------------------*/ + } else { + /* 7. We allocate a new extent and take its first page + ======================================================*/ + ret_descr = fseg_alloc_free_extent(seg_inode, iblock, &xdes, + space, mtr); + + if (ret_descr == NULL) { + ret_page = FIL_NULL; + ut_ad(!has_done_reservation); + } else { + ret_page = xdes_get_offset(ret_descr); + ut_ad(!has_done_reservation || ret_page != FIL_NULL); + } + } + + if (ret_page == FIL_NULL) { + /* Page could not be allocated */ + + ut_ad(!has_done_reservation); + return(NULL); + } + + if (space->size <= ret_page && !is_system_tablespace(space_id)) { + /* It must be that we are extending a single-table + tablespace whose size is still < 64 pages */ + + if (ret_page >= FSP_EXTENT_SIZE) { + ib::error() << "Error (2): trying to extend" + " a single-table tablespace " << space_id + << " by single page(s) though the" + << " space size " << space->size + << ". Page no " << ret_page << "."; + ut_ad(!has_done_reservation); + return(NULL); + } + + if (!fsp_try_extend_data_file_with_pages( + space, ret_page, header, mtr)) { + /* No disk space left */ + ut_ad(!has_done_reservation); + return(NULL); + } + } + +got_hinted_page: + /* ret_descr == NULL if the block was allocated from free_frag + (XDES_FREE_FRAG) */ + if (ret_descr != NULL) { + /* At this point we know the extent and the page offset. + The extent is still in the appropriate list (FSEG_NOT_FULL + or FSEG_FREE), and the page is not yet marked as used. */ + + ut_d(buf_block_t* xxdes); + ut_ad(xdes_get_descriptor(space, ret_page, &xxdes, mtr) + == ret_descr); + ut_ad(xdes == xxdes); + ut_ad(xdes_is_free(ret_descr, ret_page % FSP_EXTENT_SIZE)); + + fseg_mark_page_used(seg_inode, iblock, ret_page, ret_descr, + xdes, mtr); + } + + return fsp_page_create(space, ret_page, init_mtr); +} + +/**********************************************************************//** +Allocates a single free page from a segment. This function implements +the intelligent allocation strategy which tries to minimize file space +fragmentation. +@retval NULL if no page could be allocated */ +buf_block_t* +fseg_alloc_free_page_general( +/*=========================*/ + fseg_header_t* seg_header,/*!< in/out: segment header */ + uint32_t hint, /*!< in: hint of which page would be + desirable */ + byte direction,/*!< in: if the new page is needed because + of an index page split, and records are + inserted there in order, into which + direction they go alphabetically: FSP_DOWN, + FSP_UP, FSP_NO_DIR */ + bool has_done_reservation, /*!< in: true if the caller has + already done the reservation for the page + with fsp_reserve_free_extents, then there + is no need to do the check for this individual + page */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + mtr_t* init_mtr)/*!< in/out: mtr or another mini-transaction + in which the page should be initialized. */ +{ + fseg_inode_t* inode; + ulint space_id; + fil_space_t* space; + buf_block_t* iblock; + buf_block_t* block; + uint32_t n_reserved; + + space_id = page_get_space_id(page_align(seg_header)); + space = mtr_x_lock_space(space_id, mtr); + inode = fseg_inode_get(seg_header, space_id, space->zip_size(), + mtr, &iblock); + if (!space->full_crc32()) { + fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr); + } + + if (!has_done_reservation + && !fsp_reserve_free_extents(&n_reserved, space, 2, + FSP_NORMAL, mtr)) { + return(NULL); + } + + block = fseg_alloc_free_page_low(space, + inode, iblock, hint, direction, +#ifdef UNIV_DEBUG + has_done_reservation, +#endif /* UNIV_DEBUG */ + mtr, init_mtr); + + /* The allocation cannot fail if we have already reserved a + space for the page. */ + ut_ad(!has_done_reservation || block != NULL); + + if (!has_done_reservation) { + space->release_free_extents(n_reserved); + } + + return(block); +} + +/** Check that we have at least n_pages frag pages free in the first extent +of a single-table tablespace, and they are also physically initialized to +the data file. That is we have already extended the data file so that those +pages are inside the data file. If not, this function extends the tablespace +with pages. +@param[in,out] space tablespace +@param[in,out] header tablespace header, x-latched +@param[in] size tablespace size in pages, less than FSP_EXTENT_SIZE +@param[in,out] mtr mini-transaction +@param[in] n_pages number of pages to reserve +@return true if there were at least n_pages free pages, or we were able +to extend */ +static +bool +fsp_reserve_free_pages( + fil_space_t* space, + buf_block_t* header, + ulint size, + mtr_t* mtr, + uint32_t n_pages) +{ + xdes_t* descr; + + ut_a(!is_system_tablespace(space->id)); + ut_a(size < FSP_EXTENT_SIZE); + + buf_block_t* xdes; + descr = xdes_get_descriptor_with_space_hdr(header, space, 0, &xdes, + mtr); + uint32_t n_used = xdes_get_n_used(descr); + + ut_a(n_used <= size); + + return(size >= n_used + n_pages + || fsp_try_extend_data_file_with_pages( + space, n_used + n_pages - 1, header, mtr)); +} + +/** Reserves free pages from a tablespace. All mini-transactions which may +use several pages from the tablespace should call this function beforehand +and reserve enough free extents so that they certainly will be able +to do their operation, like a B-tree page split, fully. Reservations +must be released with function fil_space_t::release_free_extents()! + +The alloc_type below has the following meaning: FSP_NORMAL means an +operation which will probably result in more space usage, like an +insert in a B-tree; FSP_UNDO means allocation to undo logs: if we are +deleting rows, then this allocation will in the long run result in +less space usage (after a purge); FSP_CLEANING means allocation done +in a physical record delete (like in a purge) or other cleaning operation +which will result in less space usage in the long run. We prefer the latter +two types of allocation: when space is scarce, FSP_NORMAL allocations +will not succeed, but the latter two allocations will succeed, if possible. +The purpose is to avoid dead end where the database is full but the +user cannot free any space because these freeing operations temporarily +reserve some space. + +Single-table tablespaces whose size is < FSP_EXTENT_SIZE pages are a special +case. In this function we would liberally reserve several extents for +every page split or merge in a B-tree. But we do not want to waste disk space +if the table only occupies < FSP_EXTENT_SIZE pages. That is why we apply +different rules in that special case, just ensuring that there are n_pages +free pages available. + +@param[out] n_reserved number of extents actually reserved; if we + return true and the tablespace size is < + FSP_EXTENT_SIZE pages, then this can be 0, + otherwise it is n_ext +@param[in,out] space tablespace +@param[in] n_ext number of extents to reserve +@param[in] alloc_type page reservation type (FSP_BLOB, etc) +@param[in,out] mtr the mini transaction +@param[in] n_pages for small tablespaces (tablespace size is + less than FSP_EXTENT_SIZE), number of free + pages to reserve. +@return true if we were able to make the reservation */ +bool +fsp_reserve_free_extents( + uint32_t* n_reserved, + fil_space_t* space, + uint32_t n_ext, + fsp_reserve_t alloc_type, + mtr_t* mtr, + uint32_t n_pages) +{ + ulint reserve; + + ut_ad(mtr); + *n_reserved = n_ext; + + const uint32_t extent_size = FSP_EXTENT_SIZE; + + mtr_x_lock_space(space, mtr); + const unsigned physical_size = space->physical_size(); + + buf_block_t* header = fsp_get_header(space, mtr); +try_again: + uint32_t size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + + header->frame); + ut_ad(size == space->size_in_header); + + if (size < extent_size && n_pages < extent_size / 2) { + /* Use different rules for small single-table tablespaces */ + *n_reserved = 0; + return(fsp_reserve_free_pages(space, header, size, + mtr, n_pages)); + } + + uint32_t n_free_list_ext = flst_get_len(FSP_HEADER_OFFSET + FSP_FREE + + header->frame); + ut_ad(space->free_len == n_free_list_ext); + + uint32_t free_limit = mach_read_from_4(FSP_HEADER_OFFSET + + FSP_FREE_LIMIT + + header->frame); + ut_ad(space->free_limit == free_limit); + + /* Below we play safe when counting free extents above the free limit: + some of them will contain extent descriptor pages, and therefore + will not be free extents */ + + uint32_t n_free_up; + + if (size >= free_limit) { + n_free_up = (size - free_limit) / extent_size; + if (n_free_up) { + n_free_up--; + n_free_up -= n_free_up / (physical_size / extent_size); + } + } else { + ut_ad(alloc_type == FSP_BLOB); + n_free_up = 0; + } + + uint32_t n_free = n_free_list_ext + n_free_up; + + switch (alloc_type) { + case FSP_NORMAL: + /* We reserve 1 extent + 0.5 % of the space size to undo logs + and 1 extent + 0.5 % to cleaning operations; NOTE: this source + code is duplicated in the function below! */ + + reserve = 2 + ((size / extent_size) * 2) / 200; + + if (n_free <= reserve + n_ext) { + + goto try_to_extend; + } + break; + case FSP_UNDO: + /* We reserve 0.5 % of the space size to cleaning operations */ + + reserve = 1 + ((size / extent_size) * 1) / 200; + + if (n_free <= reserve + n_ext) { + + goto try_to_extend; + } + break; + case FSP_CLEANING: + case FSP_BLOB: + reserve = 0; + break; + default: + ut_error; + } + + if (space->reserve_free_extents(n_free, n_ext)) { + return(true); + } +try_to_extend: + if (fsp_try_extend_data_file(space, header, mtr)) { + goto try_again; + } + + return(false); +} + +/** Frees a single page of a segment. +@param[in] seg_inode segment inode +@param[in,out] space tablespace +@param[in] offset page number +@param[in,out] mtr mini-transaction */ +static +void +fseg_free_page_low( + fseg_inode_t* seg_inode, + buf_block_t* iblock, + fil_space_t* space, + page_no_t offset, + mtr_t* mtr) +{ + ib_id_t descr_id; + ib_id_t seg_id; + + ut_ad(seg_inode != NULL); + ut_ad(mtr != NULL); + ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N) + == FSEG_MAGIC_N_VALUE); + ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE)); + ut_ad(iblock->frame == page_align(seg_inode)); + ut_d(space->modify_check(*mtr)); + + const uint32_t extent_size = FSP_EXTENT_SIZE; + ut_ad(ut_is_2pow(extent_size)); + buf_block_t* xdes; + xdes_t* descr = xdes_get_descriptor(space, offset, &xdes, mtr); + + if (xdes_is_free(descr, offset & (extent_size - 1))) { + ib::fatal() << "InnoDB is trying to free page " + << page_id_t(space->id, offset) + << " though it is already marked as free in the" + " tablespace! The tablespace free space info is" + " corrupt. You may need to dump your tables and" + " recreate the whole database!" + << FORCE_RECOVERY_MSG; + } + + if (xdes_get_state(descr) != XDES_FSEG) { + /* The page is in the fragment pages of the segment */ + for (ulint i = 0;; i++) { + if (fseg_get_nth_frag_page_no(seg_inode, i) + != offset) { + continue; + } + + compile_time_assert(FIL_NULL == 0xffffffff); + mtr->memset(iblock, uint16_t(seg_inode - iblock->frame) + + FSEG_FRAG_ARR + + i * FSEG_FRAG_SLOT_SIZE, 4, 0xff); + break; + } + + fsp_free_page(space, offset, mtr); + return; + } + + /* If we get here, the page is in some extent of the segment */ + + descr_id = mach_read_from_8(descr + XDES_ID); + seg_id = mach_read_from_8(seg_inode + FSEG_ID); + + if (UNIV_UNLIKELY(descr_id != seg_id)) { + fputs("InnoDB: Dump of the tablespace extent descriptor: ", + stderr); + ut_print_buf(stderr, descr, 40); + fputs("\nInnoDB: Dump of the segment inode: ", stderr); + ut_print_buf(stderr, seg_inode, 40); + putc('\n', stderr); + + ib::fatal() << "InnoDB is trying to free page " + << page_id_t(space->id, offset) + << ", which does not belong to segment " << descr_id + << " but belongs to segment " << seg_id << "." + << FORCE_RECOVERY_MSG; + } + + byte* p_not_full = seg_inode + FSEG_NOT_FULL_N_USED; + uint32_t not_full_n_used = mach_read_from_4(p_not_full); + const uint16_t xoffset= uint16_t(descr - xdes->frame + XDES_FLST_NODE); + const uint16_t ioffset= uint16_t(seg_inode - iblock->frame); + + if (xdes_is_full(descr)) { + /* The fragment is full: move it to another list */ + flst_remove(iblock, static_cast<uint16_t>(FSEG_FULL + ioffset), + xdes, xoffset, mtr); + flst_add_last(iblock, static_cast<uint16_t>(FSEG_NOT_FULL + + ioffset), + xdes, xoffset, mtr); + not_full_n_used += extent_size - 1; + } else { + ut_a(not_full_n_used > 0); + not_full_n_used--; + } + + mtr->write<4>(*iblock, p_not_full, not_full_n_used); + + const ulint bit = offset & (extent_size - 1); + + xdes_set_free<true>(*xdes, descr, bit, mtr); + + if (!xdes_get_n_used(descr)) { + /* The extent has become free: free it to space */ + flst_remove(iblock, static_cast<uint16_t>(FSEG_NOT_FULL + + ioffset), + xdes, xoffset, mtr); + fsp_free_extent(space, offset, mtr); + } + + mtr->free(*space, static_cast<uint32_t>(offset)); +} + +/** Free a page in a file segment. +@param[in,out] seg_header file segment header +@param[in,out] space tablespace +@param[in] offset page number +@param[in,out] mtr mini-transaction */ +void +fseg_free_page( + fseg_header_t* seg_header, + fil_space_t* space, + uint32_t offset, + mtr_t* mtr) +{ + DBUG_ENTER("fseg_free_page"); + fseg_inode_t* seg_inode; + buf_block_t* iblock; + mtr_x_lock_space(space, mtr); + + DBUG_LOG("fseg_free_page", "space_id: " << space->id + << ", page_no: " << offset); + + seg_inode = fseg_inode_get(seg_header, space->id, space->zip_size(), + mtr, + &iblock); + if (!space->full_crc32()) { + fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr); + } + + fseg_free_page_low(seg_inode, iblock, space, offset, mtr); + + DBUG_VOID_RETURN; +} + +/** Determine whether a page is free. +@param[in,out] space tablespace +@param[in] page page number +@return whether the page is marked as free */ +bool +fseg_page_is_free(fil_space_t* space, unsigned page) +{ + bool is_free; + mtr_t mtr; + page_no_t dpage = xdes_calc_descriptor_page(space->zip_size(), + page); + + mtr.start(); + mtr_sx_lock_space(space, &mtr); + + if (page >= space->free_limit || page >= space->size_in_header) { + is_free = true; + } else if (const xdes_t* descr = xdes_get_descriptor_const( + space, dpage, page, &mtr)) { + is_free = xdes_is_free(descr, page % FSP_EXTENT_SIZE); + } else { + is_free = true; + } + mtr.commit(); + + return(is_free); +} + +/** Free an extent of a segment to the space free list. +@param[in,out] seg_inode segment inode +@param[in,out] space tablespace +@param[in] page page number in the extent +@param[in,out] mtr mini-transaction */ +MY_ATTRIBUTE((nonnull)) +static +void +fseg_free_extent( + fseg_inode_t* seg_inode, + buf_block_t* iblock, + fil_space_t* space, + uint32_t page, + mtr_t* mtr) +{ + + ut_ad(mtr != NULL); + + buf_block_t* xdes; + xdes_t* descr = xdes_get_descriptor(space, page, &xdes, mtr); + + ut_a(xdes_get_state(descr) == XDES_FSEG); + ut_a(!memcmp(descr + XDES_ID, seg_inode + FSEG_ID, 8)); + ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N) + == FSEG_MAGIC_N_VALUE); + ut_d(space->modify_check(*mtr)); + const uint32_t first_page_in_extent = page - (page % FSP_EXTENT_SIZE); + + const uint16_t xoffset= uint16_t(descr - xdes->frame + XDES_FLST_NODE); + const uint16_t ioffset= uint16_t(seg_inode - iblock->frame); + + if (xdes_is_full(descr)) { + flst_remove(iblock, static_cast<uint16_t>(FSEG_FULL + ioffset), + xdes, xoffset, mtr); + } else if (!xdes_get_n_used(descr)) { + flst_remove(iblock, static_cast<uint16_t>(FSEG_FREE + ioffset), + xdes, xoffset, mtr); + } else { + flst_remove(iblock, static_cast<uint16_t>(FSEG_NOT_FULL + + ioffset), + xdes, xoffset, mtr); + uint32_t not_full_n_used = mach_read_from_4( + FSEG_NOT_FULL_N_USED + seg_inode); + uint32_t descr_n_used = xdes_get_n_used(descr); + ut_a(not_full_n_used >= descr_n_used); + mtr->write<4>(*iblock, seg_inode + FSEG_NOT_FULL_N_USED, + not_full_n_used - descr_n_used); + } + + fsp_free_extent(space, page, mtr); + + for (uint32_t i = 0; i < FSP_EXTENT_SIZE; i++) { + if (!xdes_is_free(descr, i)) { + buf_page_free(space, first_page_in_extent + i, mtr, + __FILE__, __LINE__); + } + } +} + +/**********************************************************************//** +Frees part of a segment. This function can be used to free a segment by +repeatedly calling this function in different mini-transactions. Doing +the freeing in a single mini-transaction might result in too big a +mini-transaction. +@return whether the freeing was completed */ +bool +fseg_free_step( + fseg_header_t* header, /*!< in, own: segment header; NOTE: if the header + resides on the first page of the frag list + of the segment, this pointer becomes obsolete + after the last freeing step */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint n; + fseg_inode_t* inode; + + DBUG_ENTER("fseg_free_step"); + + const uint32_t space_id = page_get_space_id(page_align(header)); + const uint32_t header_page = page_get_page_no(page_align(header)); + + fil_space_t* space = mtr_x_lock_space(space_id, mtr); + buf_block_t* xdes; + xdes_t* descr = xdes_get_descriptor(space, header_page, &xdes, mtr); + + /* Check that the header resides on a page which has not been + freed yet */ + + ut_a(!xdes_is_free(descr, header_page % FSP_EXTENT_SIZE)); + buf_block_t* iblock; + const ulint zip_size = space->zip_size(); + inode = fseg_inode_try_get(header, space_id, zip_size, mtr, &iblock); + + if (inode == NULL) { + ib::info() << "Double free of inode from " + << page_id_t(space_id, header_page); + DBUG_RETURN(true); + } + + if (!space->full_crc32()) { + fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr); + } + descr = fseg_get_first_extent(inode, space, mtr); + + if (descr != NULL) { + /* Free the extent held by the segment */ + fseg_free_extent(inode, iblock, space, xdes_get_offset(descr), + mtr); + DBUG_RETURN(false); + } + + /* Free a frag page */ + n = fseg_find_last_used_frag_page_slot(inode); + + if (n == ULINT_UNDEFINED) { + /* Freeing completed: free the segment inode */ + fsp_free_seg_inode(space, inode, iblock, mtr); + + DBUG_RETURN(true); + } + + page_no_t page_no = fseg_get_nth_frag_page_no(inode, n); + + fseg_free_page_low(inode, iblock, space, page_no, mtr); + + buf_page_free(space, page_no, mtr, __FILE__, __LINE__); + + n = fseg_find_last_used_frag_page_slot(inode); + + if (n == ULINT_UNDEFINED) { + /* Freeing completed: free the segment inode */ + fsp_free_seg_inode(space, inode, iblock, mtr); + + DBUG_RETURN(true); + } + + DBUG_RETURN(false); +} + +/**********************************************************************//** +Frees part of a segment. Differs from fseg_free_step because this function +leaves the header page unfreed. +@return whether the freeing was completed, except for the header page */ +bool +fseg_free_step_not_header( + fseg_header_t* header, /*!< in: segment header which must reside on + the first fragment page of the segment */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint n; + xdes_t* descr; + fseg_inode_t* inode; + + const uint32_t space_id = page_get_space_id(page_align(header)); + ut_ad(mtr->is_named_space(space_id)); + + fil_space_t* space = mtr_x_lock_space(space_id, mtr); + buf_block_t* iblock; + + inode = fseg_inode_get(header, space_id, space->zip_size(), mtr, + &iblock); + if (!space->full_crc32()) { + fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr); + } + + descr = fseg_get_first_extent(inode, space, mtr); + + if (descr != NULL) { + /* Free the extent held by the segment */ + fseg_free_extent(inode, iblock, space, xdes_get_offset(descr), + mtr); + return false; + } + + /* Free a frag page */ + + n = fseg_find_last_used_frag_page_slot(inode); + + ut_a(n != ULINT_UNDEFINED); + + uint32_t page_no = fseg_get_nth_frag_page_no(inode, n); + + if (page_no == page_get_page_no(page_align(header))) { + return true; + } + + fseg_free_page_low(inode, iblock, space, page_no, mtr); + buf_page_free(space, page_no, mtr, __FILE__, __LINE__); + return false; +} + +/** Returns the first extent descriptor for a segment. +We think of the extent lists of the segment catenated in the order +FSEG_FULL -> FSEG_NOT_FULL -> FSEG_FREE. +@param[in] inode segment inode +@param[in] space tablespace +@param[in,out] mtr mini-transaction +@return the first extent descriptor, or NULL if none */ +MY_ATTRIBUTE((nonnull, warn_unused_result)) +static +xdes_t* +fseg_get_first_extent( + fseg_inode_t* inode, + const fil_space_t* space, + mtr_t* mtr) +{ + fil_addr_t first; + + ut_ad(space->id == page_get_space_id(page_align(inode))); + ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE); + + if (flst_get_len(inode + FSEG_FULL) > 0) { + first = flst_get_first(inode + FSEG_FULL); + } else if (flst_get_len(inode + FSEG_NOT_FULL) > 0) { + first = flst_get_first(inode + FSEG_NOT_FULL); + } else if (flst_get_len(inode + FSEG_FREE) > 0) { + first = flst_get_first(inode + FSEG_FREE); + } else { + return(NULL); + } + + DBUG_ASSERT(first.page != FIL_NULL); + + buf_block_t *xdes; + + return(first.page == FIL_NULL ? NULL + : xdes_lst_get_descriptor(space, first, &xdes, mtr)); +} + +#ifdef UNIV_BTR_PRINT +/*******************************************************************//** +Writes info of a segment. */ +static void fseg_print_low(const fseg_inode_t *inode) +{ + ulint space; + ulint n_used; + ulint n_frag; + ulint n_free; + ulint n_not_full; + ulint n_full; + ulint reserved; + ulint used; + ulint page_no; + ib_id_t seg_id; + + space = page_get_space_id(page_align(inode)); + page_no = page_get_page_no(page_align(inode)); + + reserved = fseg_n_reserved_pages_low(inode, &used); + + seg_id = mach_read_from_8(inode + FSEG_ID); + n_used = mach_read_from_4(inode + FSEG_NOT_FULL_N_USED); + n_frag = fseg_get_n_frag_pages(inode); + n_free = flst_get_len(inode + FSEG_FREE); + n_not_full = flst_get_len(inode + FSEG_NOT_FULL); + n_full = flst_get_len(inode + FSEG_FULL); + + ib::info() << "SEGMENT id " << seg_id + << " space " << space << ";" + << " page " << page_no << ";" + << " res " << reserved << " used " << used << ";" + << " full ext " << n_full << ";" + << " fragm pages " << n_frag << ";" + << " free extents " << n_free << ";" + << " not full extents " << n_not_full << ": pages " << n_used; + + ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE); +} + +/*******************************************************************//** +Writes info of a segment. */ +void +fseg_print( +/*=======*/ + fseg_header_t* header, /*!< in: segment header */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + fseg_inode_t* inode; + ulint space_id; + + space_id = page_get_space_id(page_align(header)); + const fil_space_t* space = mtr_x_lock_space(space_id, mtr); + + inode = fseg_inode_get(header, space_id, space->zip_size(), mtr); + + fseg_print_low(inode); +} +#endif /* UNIV_BTR_PRINT */ + +#ifdef UNIV_DEBUG +std::ostream &fseg_header::to_stream(std::ostream &out) const +{ + out << "[fseg_header_t: space=" + << mach_read_from_4(m_header + FSEG_HDR_SPACE) + << ", page=" << mach_read_from_4(m_header + FSEG_HDR_PAGE_NO) + << ", offset=" << mach_read_from_2(m_header + FSEG_HDR_OFFSET) << "]"; + return out; +} +#endif /* UNIV_DEBUG */ diff --git a/storage/innobase/fsp/fsp0space.cc b/storage/innobase/fsp/fsp0space.cc new file mode 100644 index 00000000..b0a80efe --- /dev/null +++ b/storage/innobase/fsp/fsp0space.cc @@ -0,0 +1,230 @@ +/***************************************************************************** + +Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file fsp/fsp0space.cc +Shared tablespace implementation. + +Created 2012-11-16 by Sunny Bains as srv/srv0space.cc +*******************************************************/ + +#include "fsp0sysspace.h" +#include "fsp0fsp.h" +#include "os0file.h" +#include "my_sys.h" + +/** Check if two tablespaces have common data file names. +@param other_space Tablespace to check against this. +@return true if they have the same data filenames and paths */ +bool +Tablespace::intersection( + const Tablespace* other_space) +{ + for (files_t::const_iterator it(other_space->begin()), + end(other_space->end()); it != end; ++it) { + + if (find(it->m_filename)) { + + return(true); + } + } + + return(false); +} + +/** Frees the memory allocated by the SysTablespace object. */ +void +Tablespace::shutdown() +{ + for (iterator it = begin(); it != end(); ++it) { + it->shutdown(); + } + + m_files.clear(); + ut_free(m_path); + m_path = NULL; + m_space_id = ULINT_UNDEFINED; +} + +/** Note that the data file was found. +@param[in,out] file Data file object to set */ +void +Tablespace::file_found(Datafile& file) +{ + /* Note that the file exists and can be opened + in the appropriate mode. */ + file.m_exists = true; + + file.set_open_flags( + &file == &m_files.front() + ? OS_FILE_OPEN_RETRY : OS_FILE_OPEN); +} + +/** Open or Create the data files if they do not exist. +@param[in] is_temp whether this is a temporary tablespace +@return DB_SUCCESS or error code */ +dberr_t +Tablespace::open_or_create(bool is_temp) +{ + fil_space_t* space = NULL; + dberr_t err = DB_SUCCESS; + + ut_ad(!m_files.empty()); + + for (iterator it = begin(); it != end(); ++it) { + + if (it->m_exists) { + err = it->open_or_create( + m_ignore_read_only + ? false : srv_read_only_mode); + } else { + err = it->open_or_create( + m_ignore_read_only + ? false : srv_read_only_mode); + + /* Set the correct open flags now that we have + successfully created the file. */ + if (err == DB_SUCCESS) { + file_found(*it); + } + } + + if (err != DB_SUCCESS) { + break; + } + + /* We can close the handle now and open the tablespace + the proper way. */ + it->close(); + + if (it == begin()) { + /* First data file. */ + + /* Create the tablespace entry for the multi-file + tablespace in the tablespace manager. */ + ulint fsp_flags = 0; + + switch (srv_checksum_algorithm) { + case SRV_CHECKSUM_ALGORITHM_FULL_CRC32: + case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32: + fsp_flags = (FSP_FLAGS_FCRC32_MASK_MARKER + | FSP_FLAGS_FCRC32_PAGE_SSIZE()); + break; + default: + fsp_flags = FSP_FLAGS_PAGE_SSIZE(); + } + + space = fil_space_t::create( + m_name, m_space_id, fsp_flags, + is_temp + ? FIL_TYPE_TEMPORARY : FIL_TYPE_TABLESPACE, + NULL); + if (!space) { + return DB_ERROR; + } + } + + ut_a(fil_validate()); + + space->add(it->m_filepath, OS_FILE_CLOSED, it->m_size, + false, true); + } + + return(err); +} + +/** Find a filename in the list of Datafiles for a tablespace +@return true if the filename exists in the data files */ +bool +Tablespace::find(const char* filename) const +{ + for (const_iterator it = begin(); it != end(); ++it) { + + if (innobase_strcasecmp(filename, it->m_filename) == 0) { + return(true); + } + } + + return(false); +} + +/** Delete all the data files. */ +void +Tablespace::delete_files() +{ + for (iterator it = begin(); it != end(); ++it) { + + it->close(); + + bool file_pre_exists; + bool success = os_file_delete_if_exists( + innodb_data_file_key, it->m_filepath, &file_pre_exists); + + if (success && file_pre_exists) { + ib::info() << "Removed temporary tablespace data" + " file: \"" << it->m_name << "\""; + } + } +} + +/** Use the ADD DATAFILE path to create a Datafile object and add it to the +front of m_files. +Parse the datafile path into a path and a filename with extension 'ibd'. +This datafile_path provided may or may not be an absolute path, but it +must end with the extension .ibd and have a basename of at least 1 byte. + +Set tablespace m_path member and add a Datafile with the filename. +@param[in] datafile_path full path of the tablespace file. */ +dberr_t +Tablespace::add_datafile( + const char* datafile_added) +{ + /* The path provided ends in ".ibd". This was assured by + validate_create_tablespace_info() */ + ut_d(const char* dot = strrchr(datafile_added, '.')); + ut_ad(dot != NULL && 0 == strcmp(dot, DOT_IBD)); + + char* filepath = mem_strdup(datafile_added); + os_normalize_path(filepath); + + /* If the path is an absolute path, separate it onto m_path and a + basename. For relative paths, make the whole thing a basename so that + it can be appended to the datadir. */ + bool is_abs_path = is_absolute_path(filepath); + size_t dirlen = (is_abs_path ? dirname_length(filepath) : 0); + const char* basename = filepath + dirlen; + + /* If the pathname contains a directory separator, fill the + m_path member which is the default directory for files in this + tablespace. Leave it null otherwise. */ + if (dirlen > 0) { + set_path(filepath, dirlen); + } + + /* Now add a new Datafile and set the filepath + using the m_path created above. */ + m_files.push_back(Datafile(m_name, m_flags, + FIL_IBD_FILE_INITIAL_SIZE, 0)); + Datafile* datafile = &m_files.back(); + datafile->make_filepath(m_path, basename, IBD); + + ut_free(filepath); + + return(DB_SUCCESS); +} diff --git a/storage/innobase/fsp/fsp0sysspace.cc b/storage/innobase/fsp/fsp0sysspace.cc new file mode 100644 index 00000000..a2c9e1bc --- /dev/null +++ b/storage/innobase/fsp/fsp0sysspace.cc @@ -0,0 +1,994 @@ +/***************************************************************************** + +Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file fsp/fsp0space.cc +Multi file, shared, system tablespace implementation. + +Created 2012-11-16 by Sunny Bains as srv/srv0space.cc +Refactored 2013-7-26 by Kevin Lewis +*******************************************************/ + +#include "fsp0sysspace.h" +#include "srv0start.h" +#include "trx0sys.h" +#include "dict0load.h" +#include "mem0mem.h" +#include "os0file.h" +#include "row0mysql.h" +#include "buf0dblwr.h" + +/** The server header file is included to access opt_initialize global variable. +If server passes the option for create/open DB to SE, we should remove such +direct reference to server header and global variable */ +#include "mysqld.h" + +/** The control info of the system tablespace. */ +SysTablespace srv_sys_space; + +/** The control info of a temporary table shared tablespace. */ +SysTablespace srv_tmp_space; + +/** If the last data file is auto-extended, we add this many pages to it +at a time. We have to make this public because it is a config variable. */ +uint sys_tablespace_auto_extend_increment; + +/** Convert a numeric string that optionally ends in G or M or K, + to a number containing megabytes. +@param[in] str String with a quantity in bytes +@param[out] megs The number in megabytes +@return next character in string */ +char* +SysTablespace::parse_units( + char* ptr, + ulint* megs) +{ + char* endp; + + *megs = strtoul(ptr, &endp, 10); + + ptr = endp; + + switch (*ptr) { + case 'G': case 'g': + *megs *= 1024; + /* fall through */ + case 'M': case 'm': + ++ptr; + break; + case 'K': case 'k': + *megs /= 1024; + ++ptr; + break; + default: + *megs /= 1024 * 1024; + break; + } + + return(ptr); +} + +/** Parse the input params and populate member variables. +@param[in] filepath path to data files +@param[in] supports_raw true if the tablespace supports raw devices +@return true on success parse */ +bool +SysTablespace::parse_params( + const char* filepath_spec, + bool supports_raw) +{ + char* filepath; + ulint size; + char* input_str; + ulint n_files = 0; + + ut_ad(m_last_file_size_max == 0); + ut_ad(!m_auto_extend_last_file); + + char* new_str = mem_strdup(filepath_spec); + char* str = new_str; + + input_str = str; + + /*---------------------- PASS 1 ---------------------------*/ + /* First calculate the number of data files and check syntax: + filepath:size[K |M | G];filepath:size[K |M | G]... . + Note that a Windows path may contain a drive name and a ':'. */ + while (*str != '\0') { + filepath = str; + + while ((*str != ':' && *str != '\0') + || (*str == ':' + && (*(str + 1) == '\\' || *(str + 1) == '/' + || *(str + 1) == ':'))) { + str++; + } + + if (*str == '\0') { + ut_free(new_str); + + ib::error() + << "syntax error in file path or size" + " specified is less than 1 megabyte"; + return(false); + } + + str++; + + str = parse_units(str, &size); + + if (0 == strncmp(str, ":autoextend", + (sizeof ":autoextend") - 1)) { + + str += (sizeof ":autoextend") - 1; + + if (0 == strncmp(str, ":max:", + (sizeof ":max:") - 1)) { + + str += (sizeof ":max:") - 1; + + str = parse_units(str, &size); + } + + if (*str != '\0') { + ut_free(new_str); + ib::error() + << "syntax error in file path or" + << " size specified is less than" + << " 1 megabyte"; + return(false); + } + } + + if (::strlen(str) >= 6 + && *str == 'n' + && *(str + 1) == 'e' + && *(str + 2) == 'w') { + + if (!supports_raw) { + ib::error() + << "Tablespace doesn't support raw" + " devices"; + ut_free(new_str); + return(false); + } + + str += 3; + } + + if (*str == 'r' && *(str + 1) == 'a' && *(str + 2) == 'w') { + str += 3; + + if (!supports_raw) { + ib::error() + << "Tablespace doesn't support raw" + " devices"; + ut_free(new_str); + return(false); + } + } + + if (size == 0) { + + ut_free(new_str); + + ib::error() + << "syntax error in file path or size" + " specified is less than 1 megabyte"; + + return(false); + } + + ++n_files; + + if (*str == ';') { + str++; + } else if (*str != '\0') { + ut_free(new_str); + + ib::error() + << "syntax error in file path or size" + " specified is less than 1 megabyte"; + return(false); + } + } + + if (n_files == 0) { + + /* filepath_spec must contain at least one data file + definition */ + + ut_free(new_str); + + ib::error() + << "syntax error in file path or size specified" + " is less than 1 megabyte"; + + return(false); + } + + /*---------------------- PASS 2 ---------------------------*/ + /* Then store the actual values to our arrays */ + str = input_str; + ulint order = 0; + + while (*str != '\0') { + filepath = str; + + /* Note that we must step over the ':' in a Windows filepath; + a Windows path normally looks like C:\ibdata\ibdata1:1G, but + a Windows raw partition may have a specification like + \\.\C::1Gnewraw or \\.\PHYSICALDRIVE2:1Gnewraw */ + + while ((*str != ':' && *str != '\0') + || (*str == ':' + && (*(str + 1) == '\\' || *(str + 1) == '/' + || *(str + 1) == ':'))) { + str++; + } + + if (*str == ':') { + /* Make filepath a null-terminated string */ + *str = '\0'; + str++; + } + + str = parse_units(str, &size); + + if (0 == strncmp(str, ":autoextend", + (sizeof ":autoextend") - 1)) { + + m_auto_extend_last_file = true; + + str += (sizeof ":autoextend") - 1; + + if (0 == strncmp(str, ":max:", + (sizeof ":max:") - 1)) { + + str += (sizeof ":max:") - 1; + + str = parse_units(str, &m_last_file_size_max); + } + + if (*str != '\0') { + ut_free(new_str); + ib::error() << "syntax error in file path or" + " size specified is less than 1" + " megabyte"; + return(false); + } + } + + m_files.push_back(Datafile(filepath, flags(), uint32_t(size), + order)); + Datafile* datafile = &m_files.back(); + datafile->make_filepath(path(), filepath, NO_EXT); + + if (::strlen(str) >= 6 + && *str == 'n' + && *(str + 1) == 'e' + && *(str + 2) == 'w') { + + ut_a(supports_raw); + + str += 3; + + /* Initialize new raw device only during initialize */ + /* JAN: TODO: MySQL 5.7 used opt_initialize */ + m_files.back().m_type = + opt_bootstrap ? SRV_NEW_RAW : SRV_OLD_RAW; + } + + if (*str == 'r' && *(str + 1) == 'a' && *(str + 2) == 'w') { + + ut_a(supports_raw); + + str += 3; + + /* Initialize new raw device only during initialize */ + if (m_files.back().m_type == SRV_NOT_RAW) { + /* JAN: TODO: MySQL 5.7 used opt_initialize */ + m_files.back().m_type = + opt_bootstrap ? SRV_NEW_RAW : SRV_OLD_RAW; + } + } + + if (*str == ';') { + ++str; + } + order++; + } + + ut_ad(n_files == ulint(m_files.size())); + + ut_free(new_str); + + return(true); +} + +/** Frees the memory allocated by the parse method. */ +void +SysTablespace::shutdown() +{ + Tablespace::shutdown(); + + m_auto_extend_last_file = 0; + m_last_file_size_max = 0; + m_created_new_raw = 0; + m_is_tablespace_full = false; + m_sanity_checks_done = false; +} + +/** Verify the size of the physical file. +@param[in] file data file object +@return DB_SUCCESS if OK else error code. */ +dberr_t +SysTablespace::check_size( + Datafile& file) +{ + os_offset_t size = os_file_get_size(file.m_handle); + ut_a(size != (os_offset_t) -1); + + /* Under some error conditions like disk full scenarios + or file size reaching filesystem limit the data file + could contain an incomplete extent at the end. When we + extend a data file and if some failure happens, then + also the data file could contain an incomplete extent. + So we need to round the size downward to a megabyte.*/ + + const uint32_t rounded_size_pages = static_cast<uint32_t>( + size >> srv_page_size_shift); + + /* If last file */ + if (&file == &m_files.back() && m_auto_extend_last_file) { + + if (file.m_size > rounded_size_pages + || (m_last_file_size_max > 0 + && m_last_file_size_max < rounded_size_pages)) { + ib::error() << "The Auto-extending " << name() + << " data file '" << file.filepath() << "' is" + " of a different size " << rounded_size_pages + << " pages than specified" + " in the .cnf file: initial " << file.m_size + << " pages, max " << m_last_file_size_max + << " (relevant if non-zero) pages!"; + return(DB_ERROR); + } + + file.m_size = rounded_size_pages; + } + + if (rounded_size_pages != file.m_size) { + ib::error() << "The " << name() << " data file '" + << file.filepath() << "' is of a different size " + << rounded_size_pages << " pages" + " than the " << file.m_size << " pages specified in" + " the .cnf file!"; + return(DB_ERROR); + } + + return(DB_SUCCESS); +} + +/** Set the size of the file. +@param[in] file data file object +@return DB_SUCCESS or error code */ +dberr_t +SysTablespace::set_size( + Datafile& file) +{ + ut_ad(!srv_read_only_mode || m_ignore_read_only); + + /* We created the data file and now write it full of zeros */ + ib::info() << "Setting file '" << file.filepath() << "' size to " + << (file.m_size >> (20U - srv_page_size_shift)) << " MB." + " Physically writing the file full; Please wait ..."; + + bool success = os_file_set_size( + file.m_filepath, file.m_handle, + static_cast<os_offset_t>(file.m_size) << srv_page_size_shift); + + if (success) { + ib::info() << "File '" << file.filepath() << "' size is now " + << (file.m_size >> (20U - srv_page_size_shift)) + << " MB."; + } else { + ib::error() << "Could not set the file size of '" + << file.filepath() << "'. Probably out of disk space"; + + return(DB_ERROR); + } + + return(DB_SUCCESS); +} + +/** Create a data file. +@param[in] file data file object +@return DB_SUCCESS or error code */ +dberr_t +SysTablespace::create_file( + Datafile& file) +{ + dberr_t err = DB_SUCCESS; + + ut_a(!file.m_exists); + ut_ad(!srv_read_only_mode || m_ignore_read_only); + + switch (file.m_type) { + case SRV_NEW_RAW: + + /* The partition is opened, not created; then it is + written over */ + m_created_new_raw = true; + + /* Fall through. */ + + case SRV_OLD_RAW: + + srv_start_raw_disk_in_use = TRUE; + + /* Fall through. */ + + case SRV_NOT_RAW: + err = file.open_or_create( + m_ignore_read_only ? false : srv_read_only_mode); + break; + } + + + if (err == DB_SUCCESS && file.m_type != SRV_OLD_RAW) { + err = set_size(file); + } + + return(err); +} + +/** Open a data file. +@param[in] file data file object +@return DB_SUCCESS or error code */ +dberr_t +SysTablespace::open_file( + Datafile& file) +{ + dberr_t err = DB_SUCCESS; + + ut_a(file.m_exists); + + switch (file.m_type) { + case SRV_NEW_RAW: + /* The partition is opened, not created; then it is + written over */ + m_created_new_raw = true; + + /* Fall through */ + + case SRV_OLD_RAW: + srv_start_raw_disk_in_use = TRUE; + + if (srv_read_only_mode && !m_ignore_read_only) { + ib::error() << "Can't open a raw device '" + << file.m_filepath << "' when" + " --innodb-read-only is set"; + + return(DB_ERROR); + } + + /* Fall through */ + + case SRV_NOT_RAW: + err = file.open_or_create( + m_ignore_read_only ? false : srv_read_only_mode); + + if (err != DB_SUCCESS) { + return(err); + } + break; + } + + switch (file.m_type) { + case SRV_NEW_RAW: + /* Set file size for new raw device. */ + err = set_size(file); + break; + + case SRV_NOT_RAW: + /* Check file size for existing file. */ + err = check_size(file); + break; + + case SRV_OLD_RAW: + err = DB_SUCCESS; + break; + + } + + if (err != DB_SUCCESS) { + file.close(); + } + + return(err); +} + +/** Check the tablespace header for this tablespace. +@param[out] flushed_lsn the value of FIL_PAGE_FILE_FLUSH_LSN +@return DB_SUCCESS or error code */ +dberr_t +SysTablespace::read_lsn_and_check_flags(lsn_t* flushed_lsn) +{ + dberr_t err; + + /* Only relevant for the system tablespace. */ + ut_ad(space_id() == TRX_SYS_SPACE); + + files_t::iterator it = m_files.begin(); + + ut_a(it->m_exists); + + if (it->m_handle == OS_FILE_CLOSED) { + + err = it->open_or_create( + m_ignore_read_only ? false : srv_read_only_mode); + + if (err != DB_SUCCESS) { + return(err); + } + } + + err = it->read_first_page( + m_ignore_read_only ? false : srv_read_only_mode); + + if (err != DB_SUCCESS) { + return(err); + } + + ut_a(it->order() == 0); + + if (srv_operation == SRV_OPERATION_NORMAL) { + buf_dblwr.init_or_load_pages(it->handle(), it->filepath()); + } + + /* Check the contents of the first page of the + first datafile. */ + for (int retry = 0; retry < 2; ++retry) { + + err = it->validate_first_page(flushed_lsn); + + if (err != DB_SUCCESS + && (retry == 1 + || it->restore_from_doublewrite())) { + + it->close(); + + return(err); + } + } + + /* Make sure the tablespace space ID matches the + space ID on the first page of the first datafile. */ + if (space_id() != it->m_space_id) { + + ib::error() + << "The " << name() << " data file '" << it->name() + << "' has the wrong space ID. It should be " + << space_id() << ", but " << it->m_space_id + << " was found"; + + it->close(); + + return(err); + } + + it->close(); + + return(DB_SUCCESS); +} + +/** Check if a file can be opened in the correct mode. +@param[in] file data file object +@param[out] reason exact reason if file_status check failed. +@return DB_SUCCESS or error code. */ +dberr_t +SysTablespace::check_file_status( + const Datafile& file, + file_status_t& reason) +{ + os_file_stat_t stat; + + memset(&stat, 0x0, sizeof(stat)); + + dberr_t err = os_file_get_status( + file.m_filepath, &stat, true, + m_ignore_read_only ? false : srv_read_only_mode); + + reason = FILE_STATUS_VOID; + /* File exists but we can't read the rw-permission settings. */ + switch (err) { + case DB_FAIL: + ib::error() << "os_file_get_status() failed on '" + << file.filepath() + << "'. Can't determine file permissions"; + err = DB_ERROR; + reason = FILE_STATUS_RW_PERMISSION_ERROR; + break; + + case DB_SUCCESS: + + /* Note: stat.rw_perm is only valid for "regular" files */ + + if (stat.type == OS_FILE_TYPE_FILE) { + + if (!stat.rw_perm) { + const char *p = (!srv_read_only_mode + || m_ignore_read_only) + ? "writable" + : "readable"; + + ib::error() << "The " << name() << " data file" + << " '" << file.name() << "' must be " + << p; + + err = DB_ERROR; + reason = FILE_STATUS_READ_WRITE_ERROR; + } + + } else { + /* Not a regular file, bail out. */ + ib::error() << "The " << name() << " data file '" + << file.name() << "' is not a regular" + " InnoDB data file."; + + err = DB_ERROR; + reason = FILE_STATUS_NOT_REGULAR_FILE_ERROR; + } + break; + + case DB_NOT_FOUND: + break; + + default: + ut_ad(0); + } + + return(err); +} + +/** Note that the data file was not found. +@param[in] file data file object +@param[out] create_new_db true if a new instance to be created +@return DB_SUCESS or error code */ +dberr_t +SysTablespace::file_not_found( + Datafile& file, + bool* create_new_db) +{ + file.m_exists = false; + + if (srv_read_only_mode && !m_ignore_read_only) { + ib::error() << "Can't create file '" << file.filepath() + << "' when --innodb-read-only is set"; + + return(DB_ERROR); + + } else if (&file == &m_files.front()) { + + /* First data file. */ + ut_a(!*create_new_db); + *create_new_db = TRUE; + + if (space_id() == TRX_SYS_SPACE) { + ib::info() << "The first " << name() << " data file '" + << file.name() << "' did not exist." + " A new tablespace will be created!"; + } + + } else { + ib::info() << "Need to create a new " << name() + << " data file '" << file.name() << "'."; + } + + /* Set the file create mode. */ + switch (file.m_type) { + case SRV_NOT_RAW: + file.set_open_flags(OS_FILE_CREATE); + break; + + case SRV_NEW_RAW: + case SRV_OLD_RAW: + file.set_open_flags(OS_FILE_OPEN_RAW); + break; + } + + return(DB_SUCCESS); +} + +/** Note that the data file was found. +@param[in,out] file data file object +@return true if a new instance to be created */ +bool +SysTablespace::file_found( + Datafile& file) +{ + /* Note that the file exists and can be opened + in the appropriate mode. */ + file.m_exists = true; + + /* Set the file open mode */ + switch (file.m_type) { + case SRV_NOT_RAW: + file.set_open_flags( + &file == &m_files.front() + ? OS_FILE_OPEN_RETRY : OS_FILE_OPEN); + break; + + case SRV_NEW_RAW: + case SRV_OLD_RAW: + file.set_open_flags(OS_FILE_OPEN_RAW); + break; + } + + /* Need to create the system tablespace for new raw device. */ + return(file.m_type == SRV_NEW_RAW); +} + +/** Check the data file specification. +@param[out] create_new_db true if a new database is to be created +@param[in] min_expected_size Minimum expected tablespace size in bytes +@return DB_SUCCESS if all OK else error code */ +dberr_t +SysTablespace::check_file_spec( + bool* create_new_db, + ulint min_expected_size) +{ + *create_new_db = FALSE; + + if (m_files.size() >= 1000) { + ib::error() << "There must be < 1000 data files in " + << name() << " but " << m_files.size() << " have been" + " defined."; + + return(DB_ERROR); + } + + if (!m_auto_extend_last_file + && get_sum_of_sizes() + < (min_expected_size >> srv_page_size_shift)) { + ib::error() << "Tablespace size must be at least " + << (min_expected_size >> 20) << " MB"; + return(DB_ERROR); + } + + dberr_t err = DB_SUCCESS; + + ut_a(!m_files.empty()); + + /* If there is more than one data file and the last data file + doesn't exist, that is OK. We allow adding of new data files. */ + + files_t::iterator begin = m_files.begin(); + files_t::iterator end = m_files.end(); + + for (files_t::iterator it = begin; it != end; ++it) { + + file_status_t reason_if_failed; + err = check_file_status(*it, reason_if_failed); + + if (err == DB_NOT_FOUND) { + + err = file_not_found(*it, create_new_db); + + if (err != DB_SUCCESS) { + break; + } + + } else if (err != DB_SUCCESS) { + if (reason_if_failed == FILE_STATUS_READ_WRITE_ERROR) { + const char* p = (!srv_read_only_mode + || m_ignore_read_only) + ? "writable" : "readable"; + ib::error() << "The " << name() << " data file" + << " '" << it->name() << "' must be " + << p; + } + + ut_a(err != DB_FAIL); + break; + + } else if (*create_new_db) { + ib::error() << "The " << name() << " data file '" + << begin->m_name << "' was not found but" + " one of the other data files '" << it->m_name + << "' exists."; + + err = DB_ERROR; + break; + + } else { + *create_new_db = file_found(*it); + } + } + + return(err); +} + +/** Open or create the data files +@param[in] is_temp whether this is a temporary tablespace +@param[in] create_new_db whether we are creating a new database +@param[out] sum_new_sizes sum of sizes of the new files added +@param[out] flush_lsn FIL_PAGE_FILE_FLUSH_LSN of first file +@return DB_SUCCESS or error code */ +dberr_t +SysTablespace::open_or_create( + bool is_temp, + bool create_new_db, + ulint* sum_new_sizes, + lsn_t* flush_lsn) +{ + dberr_t err = DB_SUCCESS; + fil_space_t* space = NULL; + + ut_ad(!m_files.empty()); + + if (sum_new_sizes) { + *sum_new_sizes = 0; + } + + files_t::iterator begin = m_files.begin(); + files_t::iterator end = m_files.end(); + + ut_ad(begin->order() == 0); + + for (files_t::iterator it = begin; it != end; ++it) { + + if (it->m_exists) { + err = open_file(*it); + + /* For new raw device increment new size. */ + if (sum_new_sizes && it->m_type == SRV_NEW_RAW) { + + *sum_new_sizes += it->m_size; + } + + } else { + err = create_file(*it); + + if (sum_new_sizes) { + *sum_new_sizes += it->m_size; + } + + /* Set the correct open flags now that we have + successfully created the file. */ + if (err == DB_SUCCESS) { + /* We ignore new_db OUT parameter here + as the information is known at this stage */ + file_found(*it); + } + } + + if (err != DB_SUCCESS) { + return(err); + } + + } + + if (!create_new_db && flush_lsn) { + /* Validate the header page in the first datafile + and read LSNs fom the others. */ + err = read_lsn_and_check_flags(flush_lsn); + if (err != DB_SUCCESS) { + return(err); + } + } + + /* Close the curent handles, add space and file info to the + fil_system cache and the Data Dictionary, and re-open them + in file_system cache so that they stay open until shutdown. */ + ulint node_counter = 0; + for (files_t::iterator it = begin; it != end; ++it) { + it->close(); + it->m_exists = true; + + if (it != begin) { + } else if (is_temp) { + ut_ad(space_id() == SRV_TMP_SPACE_ID); + space = fil_space_t::create( + name(), SRV_TMP_SPACE_ID, flags(), + FIL_TYPE_TEMPORARY, NULL); + ut_ad(space == fil_system.temp_space); + if (!space) { + return DB_ERROR; + } + ut_ad(!space->is_compressed()); + ut_ad(space->full_crc32()); + } else { + ut_ad(space_id() == TRX_SYS_SPACE); + space = fil_space_t::create( + name(), TRX_SYS_SPACE, it->flags(), + FIL_TYPE_TABLESPACE, NULL); + ut_ad(space == fil_system.sys_space); + if (!space) { + return DB_ERROR; + } + } + + ut_a(fil_validate()); + + uint32_t max_size = (++node_counter == m_files.size() + ? (m_last_file_size_max == 0 + ? UINT32_MAX + : uint32_t(m_last_file_size_max)) + : it->m_size); + + space->add(it->m_filepath, OS_FILE_CLOSED, it->m_size, + it->m_type != SRV_NOT_RAW, true, max_size); + } + + return(err); +} + +/** Normalize the file size, convert from megabytes to number of pages. */ +void +SysTablespace::normalize_size() +{ + files_t::iterator end = m_files.end(); + + for (files_t::iterator it = m_files.begin(); it != end; ++it) { + + it->m_size <<= (20U - srv_page_size_shift); + } + + m_last_file_size_max <<= (20U - srv_page_size_shift); +} + + +/** +@return next increment size */ +uint32_t SysTablespace::get_increment() const +{ + if (m_last_file_size_max == 0) + return get_autoextend_increment(); + + if (!is_valid_size()) + { + ib::error() << "The last data file in " << name() + << " has a size of " << last_file_size() + << " but the max size allowed is " + << m_last_file_size_max; + } + + return std::min(uint32_t(m_last_file_size_max) - last_file_size(), + get_autoextend_increment()); +} + + +/** +@return true if configured to use raw devices */ +bool +SysTablespace::has_raw_device() +{ + files_t::iterator end = m_files.end(); + + for (files_t::iterator it = m_files.begin(); it != end; ++it) { + + if (it->is_raw_device()) { + return(true); + } + } + + return(false); +} |