diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:00:34 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:00:34 +0000 |
commit | 3f619478f796eddbba6e39502fe941b285dd97b1 (patch) | |
tree | e2c7b5777f728320e5b5542b6213fd3591ba51e2 /storage/innobase/os | |
parent | Initial commit. (diff) | |
download | mariadb-upstream.tar.xz mariadb-upstream.zip |
Adding upstream version 1:10.11.6.upstream/1%10.11.6upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'storage/innobase/os')
-rw-r--r-- | storage/innobase/os/os0file.cc | 4270 |
1 files changed, 4270 insertions, 0 deletions
diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc new file mode 100644 index 00000000..5e674806 --- /dev/null +++ b/storage/innobase/os/os0file.cc @@ -0,0 +1,4270 @@ +/*********************************************************************** + +Copyright (c) 1995, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2009, Percona Inc. +Copyright (c) 2013, 2022, MariaDB Corporation. + +Portions of this file contain modifications contributed and copyrighted +by Percona Inc.. Those modifications are +gratefully acknowledged and are described briefly in the InnoDB +documentation. The contributions by Percona Inc. are incorporated with +their permission, and subject to the conditions contained in the file +COPYING.Percona. + +This program is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +***********************************************************************/ + +/**************************************************//** +@file os/os0file.cc +The interface to the operating system file i/o primitives + +Created 10/21/1995 Heikki Tuuri +*******************************************************/ + +#include "os0file.h" +#include "sql_const.h" +#include "log.h" + +#ifdef __linux__ +# include <sys/types.h> +# include <sys/stat.h> +# include <sys/sysmacros.h> +#endif + +#include "srv0mon.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "fil0fil.h" +#include "fsp0fsp.h" +#ifdef HAVE_LINUX_UNISTD_H +#include "unistd.h" +#endif +#include "buf0dblwr.h" + +#include <tpool_structs.h> + +#ifdef LINUX_NATIVE_AIO +#include <libaio.h> +#endif /* LINUX_NATIVE_AIO */ + +#ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE +# include <fcntl.h> +# include <linux/falloc.h> +#endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */ + +#ifdef _WIN32 +#include <winioctl.h> +#endif + +// my_test_if_atomic_write() , my_win_secattr() +#include <my_sys.h> + +#include <thread> +#include <chrono> + +/* Per-IO operation environment*/ +class io_slots +{ +private: + tpool::cache<tpool::aiocb> m_cache; + tpool::task_group m_group; + int m_max_aio; +public: + io_slots(int max_submitted_io, int max_callback_concurrency) : + m_cache(max_submitted_io), m_group(max_callback_concurrency, false), + m_max_aio(max_submitted_io) + { + } + /* Get cached AIO control block */ + tpool::aiocb* acquire() + { + return m_cache.get(); + } + /* Release AIO control block back to cache */ + void release(tpool::aiocb* aiocb) + { + m_cache.put(aiocb); + } + + bool contains(tpool::aiocb* aiocb) + { + return m_cache.contains(aiocb); + } + + /* Wait for completions of all AIO operations */ + void 
wait(mysql_mutex_t &m) + { + m_cache.wait(m); + } + + void wait() + { + m_cache.wait(); + } + + size_t pending_io_count() + { + return m_cache.pos(); + } + + tpool::task_group* get_task_group() + { + return &m_group; + } + + ~io_slots() + { + wait(); + } + + mysql_mutex_t& mutex() + { + return m_cache.mutex(); + } + + void resize(int max_submitted_io, int max_callback_concurrency) + { + m_cache.resize(max_submitted_io); + m_group.set_max_tasks(max_callback_concurrency); + m_max_aio = max_submitted_io; + } + + tpool::task_group& task_group() + { + return m_group; + } +}; + +static io_slots *read_slots; +static io_slots *write_slots; + +/** Number of retries for partial I/O's */ +constexpr ulint NUM_RETRIES_ON_PARTIAL_IO = 10; + +/* This specifies the file permissions InnoDB uses when it creates files in +Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to +my_umask */ + +#ifndef _WIN32 +/** Umask for creating files */ +static ulint os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP; +#else +/** Umask for creating files */ +static ulint os_innodb_umask = 0; +#endif /* _WIN32 */ + +Atomic_counter<ulint> os_n_file_reads; +static ulint os_bytes_read_since_printout; +Atomic_counter<size_t> os_n_file_writes; +Atomic_counter<size_t> os_n_fsyncs; +static ulint os_n_file_reads_old; +static ulint os_n_file_writes_old; +static ulint os_n_fsyncs_old; + +static time_t os_last_printout; +bool os_has_said_disk_full; + +/** Default Zip compression level */ +extern uint page_zip_level; + +#ifdef UNIV_PFS_IO +/* Keys to register InnoDB I/O with performance schema */ +mysql_pfs_key_t innodb_data_file_key; +mysql_pfs_key_t innodb_temp_file_key; +#endif + +/** Handle errors for file operations. 
+@param[in] name name of a file or NULL +@param[in] operation operation +@param[in] should_abort whether to abort on an unknown error +@param[in] on_error_silent whether to suppress reports of non-fatal errors +@return true if we should retry the operation */ +static +bool +os_file_handle_error_cond_exit( + const char* name, + const char* operation, + bool should_abort, + bool on_error_silent); + +/** Does error handling when a file operation fails. +@param[in] name name of a file or NULL +@param[in] operation operation name that failed +@return true if we should retry the operation */ +static +bool +os_file_handle_error( + const char* name, + const char* operation) +{ + /* Exit in case of unknown error */ + return(os_file_handle_error_cond_exit(name, operation, true, false)); +} + +/** Does error handling when a file operation fails. +@param[in] name name of a file or NULL +@param[in] operation operation name that failed +@param[in] on_error_silent if true then don't print any message to the log. +@return true if we should retry the operation */ +static +bool +os_file_handle_error_no_exit( + const char* name, + const char* operation, + bool on_error_silent) +{ + /* Don't exit in case of unknown error */ + return(os_file_handle_error_cond_exit( + name, operation, false, on_error_silent)); +} + +/** Handle RENAME error. +@param name old name of the file +@param new_name new name of the file */ +static void os_file_handle_rename_error(const char* name, const char* new_name) +{ + if (os_file_get_last_error(true) != OS_FILE_DISK_FULL) { + ib::error() << "Cannot rename file '" << name << "' to '" + << new_name << "'"; + } else if (!os_has_said_disk_full) { + os_has_said_disk_full = true; + /* Disk full error is reported irrespective of the + on_error_silent setting. */ + ib::error() << "Full disk prevents renaming file '" + << name << "' to '" << new_name << "'"; + } +} + + +#ifdef _WIN32 + +/** + Wrapper around Windows DeviceIoControl() function. 
+ + Works synchronously, also in case for handle opened + for async access (i.e with FILE_FLAG_OVERLAPPED). + + Accepts the same parameters as DeviceIoControl(),except + last parameter (OVERLAPPED). +*/ +static +BOOL +os_win32_device_io_control( + HANDLE handle, + DWORD code, + LPVOID inbuf, + DWORD inbuf_size, + LPVOID outbuf, + DWORD outbuf_size, + LPDWORD bytes_returned +) +{ + OVERLAPPED overlapped = { 0 }; + overlapped.hEvent = tpool::win_get_syncio_event(); + BOOL result = DeviceIoControl(handle, code, inbuf, inbuf_size, outbuf, + outbuf_size, NULL, &overlapped); + + if (result || (GetLastError() == ERROR_IO_PENDING)) { + /* Wait for async io to complete */ + result = GetOverlappedResult(handle, &overlapped, bytes_returned, TRUE); + } + + return result; +} + +#endif + + + +/** Helper class for doing synchronous file IO. Currently, the objective +is to hide the OS specific code, so that the higher level functions aren't +peppered with #ifdef. Makes the code flow difficult to follow. */ +class SyncFileIO +{ +public: + /** Constructor + @param[in] fh File handle + @param[in,out] buf Buffer to read/write + @param[in] n Number of bytes to read/write + @param[in] offset Offset where to read or write */ + SyncFileIO(os_file_t fh, void *buf, ulint n, os_offset_t offset) : + m_fh(fh), m_buf(buf), m_n(static_cast<ssize_t>(n)), m_offset(offset) + { ut_ad(m_n > 0); } + + /** Do the read/write + @param[in] request The IO context and type + @return the number of bytes read/written or negative value on error */ + ssize_t execute(const IORequest &request); + + /** Move the read/write offset up to where the partial IO succeeded. 
+ @param[in] n_bytes The number of bytes to advance */ + void advance(ssize_t n_bytes) + { + m_offset+= n_bytes; + ut_ad(m_n >= n_bytes); + m_n-= n_bytes; + m_buf= reinterpret_cast<uchar*>(m_buf) + n_bytes; + } + +private: + /** Open file handle */ + const os_file_t m_fh; + /** Buffer to read/write */ + void *m_buf; + /** Number of bytes to read/write */ + ssize_t m_n; + /** Offset from where to read/write */ + os_offset_t m_offset; +}; + +#ifndef _WIN32 /* On Microsoft Windows, mandatory locking is used */ +/** Obtain an exclusive lock on a file. +@param fd file descriptor +@param name file name +@return 0 on success */ +int os_file_lock(int fd, const char *name) +{ + struct flock lk; + + lk.l_type = F_WRLCK; + lk.l_whence = SEEK_SET; + lk.l_start = lk.l_len = 0; + + if (fcntl(fd, F_SETLK, &lk) == -1) { + + ib::error() + << "Unable to lock " << name + << " error: " << errno; + + if (errno == EAGAIN || errno == EACCES) { + + ib::info() + << "Check that you do not already have" + " another mariadbd process using the" + " same InnoDB data or log files."; + } + + return(-1); + } + + return(0); +} +#endif /* !_WIN32 */ + + +/** Create a temporary file. This function is like tmpfile(3), but +the temporary file is created in the in the mysql server configuration +parameter (--tmpdir). +@return temporary file handle, or NULL on error */ +FILE* +os_file_create_tmpfile() +{ + FILE* file = NULL; + File fd = mysql_tmpfile("ib"); + + if (fd >= 0) { + file = my_fdopen(fd, 0, O_RDWR|O_TRUNC|O_CREAT|FILE_BINARY, + MYF(MY_WME)); + if (!file) { + my_close(fd, MYF(MY_WME)); + } + } + + if (file == NULL) { + + ib::error() + << "Unable to create temporary file; errno: " + << errno; + } + + return(file); +} + +/** Rewind file to its start, read at most size - 1 bytes from it to str, and +NUL-terminate str. All errors are silently ignored. This function is +mostly meant to be used with temporary files. 
+@param[in,out] file File to read from +@param[in,out] str Buffer where to read +@param[in] size Size of buffer */ +void +os_file_read_string( + FILE* file, + char* str, + ulint size) +{ + if (size != 0) { + rewind(file); + + size_t flen = fread(str, 1, size - 1, file); + + str[flen] = '\0'; + } +} + +/** This function reduces a null-terminated full remote path name into +the path that is sent by MySQL for DATA DIRECTORY clause. It replaces +the 'databasename/tablename.ibd' found at the end of the path with just +'tablename'. + +Since the result is always smaller than the path sent in, no new memory +is allocated. The caller should allocate memory for the path sent in. +This function manipulates that path in place. + +If the path format is not as expected, just return. The result is used +to inform a SHOW CREATE TABLE command. +@param[in,out] data_dir_path Full path/data_dir_path */ +void +os_file_make_data_dir_path( + char* data_dir_path) +{ + /* Replace the period before the extension with a null byte. */ + char* ptr = strrchr(data_dir_path, '.'); + + if (ptr == NULL) { + return; + } + + ptr[0] = '\0'; + + /* The tablename starts after the last slash. */ + ptr = strrchr(data_dir_path, '/'); + + + if (ptr == NULL) { + return; + } + + ptr[0] = '\0'; + + char* tablename = ptr + 1; + + /* The databasename starts after the next to last slash. */ + ptr = strrchr(data_dir_path, '/'); +#ifdef _WIN32 + if (char *aptr = strrchr(data_dir_path, '\\')) { + if (aptr > ptr) { + ptr = aptr; + } + } +#endif + + if (ptr == NULL) { + return; + } + + ulint tablename_len = strlen(tablename); + + memmove(++ptr, tablename, tablename_len); + + ptr[tablename_len] = '\0'; +} + +/** Check if the path refers to the root of a drive using a pointer +to the last directory separator that the caller has fixed. 
+@param[in] path path name +@param[in] path last directory separator in the path +@return true if this path is a drive root, false if not */ +UNIV_INLINE +bool +os_file_is_root( + const char* path, + const char* last_slash) +{ + return( +#ifdef _WIN32 + (last_slash == path + 2 && path[1] == ':') || +#endif /* _WIN32 */ + last_slash == path); +} + +/** Return the parent directory component of a null-terminated path. +Return a new buffer containing the string up to, but not including, +the final component of the path. +The path returned will not contain a trailing separator. +Do not return a root path, return NULL instead. +The final component trimmed off may be a filename or a directory name. +If the final component is the only component of the path, return NULL. +It is the caller's responsibility to free the returned string after it +is no longer needed. +@param[in] path Path name +@return own: parent directory of the path */ +static +char* +os_file_get_parent_dir( + const char* path) +{ + /* Find the offset of the last slash */ + const char* last_slash = strrchr(path, '/'); + +#ifdef _WIN32 + if (const char *last = strrchr(path, '\\')) { + if (last > last_slash) { + last_slash = last; + } + } +#endif + + if (!last_slash) { + /* No slash in the path, return NULL */ + return(NULL); + } + + /* Ok, there is a slash. Is there anything after it? */ + const bool has_trailing_slash = last_slash[1] == '\0'; + + /* Reduce repetitive slashes. */ + while (last_slash > path + && (IF_WIN(last_slash[-1] == '\\' ||,) last_slash[-1] == '/')) { + last_slash--; + } + + /* Check for the root of a drive. */ + if (os_file_is_root(path, last_slash)) { + return(NULL); + } + + /* If a trailing slash prevented the first strrchr() from trimming + the last component of the path, trim that component now. */ + if (has_trailing_slash) { + /* Back up to the previous slash. 
*/ + last_slash--; + while (last_slash > path + && (IF_WIN(last_slash[0] != '\\' &&,) + last_slash[0] != '/')) { + last_slash--; + } + + /* Reduce repetitive slashes. */ + while (last_slash > path + && (IF_WIN(last_slash[-1] == '\\' ||,) + last_slash[-1] == '/')) { + last_slash--; + } + } + + /* Check for the root of a drive. */ + if (os_file_is_root(path, last_slash)) { + return(NULL); + } + + if (last_slash - path < 0) { + /* Sanity check, it prevents gcc from trying to handle this case which + * results in warnings for some optimized builds */ + return (NULL); + } + + /* Non-trivial directory component */ + + return(mem_strdupl(path, ulint(last_slash - path))); +} +#ifdef UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR + +/* Test the function os_file_get_parent_dir. */ +void +test_os_file_get_parent_dir( + const char* child_dir, + const char* expected_dir) +{ + char* child = mem_strdup(child_dir); + char* expected = expected_dir == NULL ? NULL + : mem_strdup(expected_dir); + + char* parent = os_file_get_parent_dir(child); + + bool unexpected = (expected == NULL + ? (parent != NULL) + : (0 != strcmp(parent, expected))); + if (unexpected) { + ib::fatal() << "os_file_get_parent_dir('" << child + << "') returned '" << parent + << "', instead of '" << expected << "'."; + } + ut_free(parent); + ut_free(child); + ut_free(expected); +} + +/* Test the function os_file_get_parent_dir. 
*/ +void +unit_test_os_file_get_parent_dir() +{ + test_os_file_get_parent_dir("/usr/lib/a", "/usr/lib"); + test_os_file_get_parent_dir("/usr/", NULL); + test_os_file_get_parent_dir("//usr//", NULL); + test_os_file_get_parent_dir("usr", NULL); + test_os_file_get_parent_dir("usr//", NULL); + test_os_file_get_parent_dir("/", NULL); + test_os_file_get_parent_dir("//", NULL); + test_os_file_get_parent_dir(".", NULL); + test_os_file_get_parent_dir("..", NULL); +# ifdef _WIN32 + test_os_file_get_parent_dir("D:", NULL); + test_os_file_get_parent_dir("D:/", NULL); + test_os_file_get_parent_dir("D:\\", NULL); + test_os_file_get_parent_dir("D:/data", NULL); + test_os_file_get_parent_dir("D:/data/", NULL); + test_os_file_get_parent_dir("D:\\data\\", NULL); + test_os_file_get_parent_dir("D:///data/////", NULL); + test_os_file_get_parent_dir("D:\\\\\\data\\\\\\\\", NULL); + test_os_file_get_parent_dir("D:/data//a", "D:/data"); + test_os_file_get_parent_dir("D:\\data\\\\a", "D:\\data"); + test_os_file_get_parent_dir("D:///data//a///b/", "D:///data//a"); + test_os_file_get_parent_dir("D:\\\\\\data\\\\a\\\\\\b\\", "D:\\\\\\data\\\\a"); +#endif /* _WIN32 */ +} +#endif /* UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR */ + + +/** Creates all missing subdirectories along the given path. +@param[in] path Path name +@return DB_SUCCESS if OK, otherwise error code. */ +dberr_t +os_file_create_subdirs_if_needed( + const char* path) +{ + if (srv_read_only_mode) { + + ib::error() + << "read only mode set. 
Can't create " + << "subdirectories '" << path << "'"; + + return(DB_READ_ONLY); + + } + + char* subdir = os_file_get_parent_dir(path); + + if (subdir == NULL) { + /* subdir is root or cwd, nothing to do */ + return(DB_SUCCESS); + } + + /* Test if subdir exists */ + os_file_type_t type; + bool subdir_exists; + bool success = os_file_status(subdir, &subdir_exists, &type); + + if (success && !subdir_exists) { + + /* Subdir does not exist, create it */ + dberr_t err = os_file_create_subdirs_if_needed(subdir); + + if (err != DB_SUCCESS) { + + ut_free(subdir); + + return(err); + } + + success = os_file_create_directory(subdir, false); + } + + ut_free(subdir); + + return(success ? DB_SUCCESS : DB_ERROR); +} + + + +/** Do the read/write +@param[in] request The IO context and type +@return the number of bytes read/written or negative value on error */ +ssize_t +SyncFileIO::execute(const IORequest& request) +{ + ssize_t n_bytes; + + if (request.is_read()) { +#ifdef _WIN32 + n_bytes = tpool::pread(m_fh, m_buf, m_n, m_offset); +#else + n_bytes = pread(m_fh, m_buf, m_n, m_offset); +#endif + } else { + ut_ad(request.is_write()); +#ifdef _WIN32 + n_bytes = tpool::pwrite(m_fh, m_buf, m_n, m_offset); +#else + n_bytes = pwrite(m_fh, m_buf, m_n, m_offset); +#endif + } + + return(n_bytes); +} + +#ifndef _WIN32 +/** Free storage space associated with a section of the file. 
+@param[in] fh Open file handle +@param[in] off Starting offset (SEEK_SET) +@param[in] len Size of the hole +@return DB_SUCCESS or error code */ +static +dberr_t +os_file_punch_hole_posix( + os_file_t fh, + os_offset_t off, + os_offset_t len) +{ + +#ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE + const int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; + + int ret = fallocate(fh, mode, off, len); + + if (ret == 0) { + return(DB_SUCCESS); + } + + if (errno == ENOTSUP) { + return(DB_IO_NO_PUNCH_HOLE); + } + + ib::warn() + << "fallocate(" + <<", FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, " + << off << ", " << len << ") returned errno: " + << errno; + + return(DB_IO_ERROR); + +#elif defined __sun__ + + // Use F_FREESP + +#endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */ + + return(DB_IO_NO_PUNCH_HOLE); +} + +/** Retrieves the last error number if an error occurs in a file io function. +The number should be retrieved before any other OS calls (because they may +overwrite the error number). If the number is not known to this program, +the OS error number + 100 is returned. 
+@param[in] report_all_errors true if we want an error message + printed of all errors +@param[in] on_error_silent true then don't print any diagnostic + to the log +@return error number, or OS error number + 100 */ +ulint os_file_get_last_error(bool report_all_errors, bool on_error_silent) +{ + int err = errno; + + if (err == 0) { + return(0); + } + + if (report_all_errors + || (err != ENOSPC && err != EEXIST && err != ENOENT + && !on_error_silent)) { + + ib::error() + << "Operating system error number " + << err + << " in a file operation."; + + if (err == EACCES) { + + ib::error() + << "The error means mariadbd does not have" + " the access rights to the directory."; + + } else { + if (strerror(err) != NULL) { + + ib::error() + << "Error number " << err << " means '" + << strerror(err) << "'"; + } + + ib::info() << OPERATING_SYSTEM_ERROR_MSG; + } + } + + switch (err) { + case ENOSPC: + return(OS_FILE_DISK_FULL); + case ENOENT: + return(OS_FILE_NOT_FOUND); + case EEXIST: + return(OS_FILE_ALREADY_EXISTS); + case EXDEV: + case ENOTDIR: + case EISDIR: + case EPERM: + return(OS_FILE_PATH_ERROR); + case EAGAIN: + if (srv_use_native_aio) { + return(OS_FILE_AIO_RESOURCES_RESERVED); + } + break; + case EINTR: + if (srv_use_native_aio) { + return(OS_FILE_AIO_INTERRUPTED); + } + break; + case EACCES: + return(OS_FILE_ACCESS_VIOLATION); + } + return(OS_FILE_ERROR_MAX + err); +} + +/** Wrapper to fsync() or fdatasync() that retries the call on some errors. +Returns the value 0 if successful; otherwise the value -1 is returned and +the global variable errno is set to indicate the error. 
+@param[in] file open file handle +@return 0 if success, -1 otherwise */ +static int os_file_sync_posix(os_file_t file) +{ +#if !defined(HAVE_FDATASYNC) || HAVE_DECL_FDATASYNC == 0 + auto func= fsync; + auto func_name= "fsync()"; +#else + auto func= fdatasync; + auto func_name= "fdatasync()"; +#endif + + ulint failures= 0; + + for (;;) + { + ++os_n_fsyncs; + + int ret= func(file); + + if (ret == 0) + return ret; + + switch (errno) + { + case ENOLCK: + ++failures; + ut_a(failures < 1000); + + if (!(failures % 100)) + ib::warn() << func_name << ": No locks available; retrying"; + + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + break; + + case EINTR: + ++failures; + ut_a(failures < 2000); + break; + + default: + ib::fatal() << func_name << " returned " << errno; + } + } +} + +/** Check the existence and type of the given file. +@param[in] path path name of file +@param[out] exists true if the file exists +@param[out] type Type of the file, if it exists +@return true if call succeeded */ +static +bool +os_file_status_posix( + const char* path, + bool* exists, + os_file_type_t* type) +{ + struct stat statinfo; + + int ret = stat(path, &statinfo); + + *exists = !ret; + + if (!ret) { + /* file exists, everything OK */ + MSAN_STAT_WORKAROUND(&statinfo); + } else if (errno == ENOENT || errno == ENOTDIR || errno == ENAMETOOLONG) { + /* file does not exist */ + return(true); + + } else { + /* file exists, but stat call failed */ + os_file_handle_error_no_exit(path, "stat", false); + return(false); + } + + if (S_ISDIR(statinfo.st_mode)) { + *type = OS_FILE_TYPE_DIR; + + } else if (S_ISLNK(statinfo.st_mode)) { + *type = OS_FILE_TYPE_LINK; + + } else if (S_ISREG(statinfo.st_mode)) { + *type = OS_FILE_TYPE_FILE; + } else { + *type = OS_FILE_TYPE_UNKNOWN; + } + + return(true); +} + +/** NOTE! Use the corresponding macro os_file_flush(), not directly this +function! +Flushes the write buffers of a given file to the disk. 
+@param[in] file handle to a file +@return true if success */ +bool +os_file_flush_func( + os_file_t file) +{ + int ret; + + ret = os_file_sync_posix(file); + + if (ret == 0) { + return(true); + } + + /* Since Linux returns EINVAL if the 'file' is actually a raw device, + we choose to ignore that error if we are using raw disks */ + + if (srv_start_raw_disk_in_use && errno == EINVAL) { + + return(true); + } + + ib::error() << "The OS said file flush did not succeed"; + + os_file_handle_error(NULL, "flush"); + + /* It is a fatal error if a file flush does not succeed, because then + the database can get corrupt on disk */ + ut_error; + + return(false); +} + +/** NOTE! Use the corresponding macro os_file_create_simple(), not directly +this function! +A simple function to open or create a file. +@param[in] name name of the file or path as a null-terminated + string +@param[in] create_mode create mode +@param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE +@param[in] read_only if true, read only checks are enforced +@param[out] success true if succeed, false if error +@return handle to the file, not defined if error, error number + can be retrieved with os_file_get_last_error */ +pfs_os_file_t +os_file_create_simple_func( + const char* name, + ulint create_mode, + ulint access_type, + bool read_only, + bool* success) +{ + pfs_os_file_t file; + + *success = false; + + int create_flag; + const char* mode_str = NULL; + + ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); + ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); + + if (create_mode == OS_FILE_OPEN) { + mode_str = "OPEN"; + + if (access_type == OS_FILE_READ_ONLY) { + + create_flag = O_RDONLY; + + } else if (read_only) { + + create_flag = O_RDONLY; + + } else { + create_flag = O_RDWR; + } + + } else if (read_only) { + + mode_str = "OPEN"; + create_flag = O_RDONLY; + + } else if (create_mode == OS_FILE_CREATE) { + + mode_str = "CREATE"; + create_flag = O_RDWR | O_CREAT | O_EXCL; + + } else if (create_mode == 
OS_FILE_CREATE_PATH) { + + mode_str = "CREATE PATH"; + /* Create subdirs along the path if needed. */ + + *success = os_file_create_subdirs_if_needed(name); + + if (!*success) { + + ib::error() + << "Unable to create subdirectories '" + << name << "'"; + + return(OS_FILE_CLOSED); + } + + create_flag = O_RDWR | O_CREAT | O_EXCL; + create_mode = OS_FILE_CREATE; + } else { + + ib::error() + << "Unknown file create mode (" + << create_mode + << " for file '" << name << "'"; + + return(OS_FILE_CLOSED); + } + + bool retry; + + do { + file = open(name, create_flag | O_CLOEXEC, os_innodb_umask); + + if (file == -1) { + *success = false; + retry = os_file_handle_error( + name, + create_mode == OS_FILE_OPEN + ? "open" : "create"); + } else { + *success = true; + retry = false; + } + + } while (retry); + + /* This function is always called for data files, we should disable + OS caching (O_DIRECT) here as we do in os_file_create_func(), so + we open the same file in the same mode, see man page of open(2). */ + if (!srv_read_only_mode && *success) { + switch (srv_file_flush_method) { + case SRV_O_DSYNC: + case SRV_O_DIRECT: + case SRV_O_DIRECT_NO_FSYNC: + os_file_set_nocache(file, name, mode_str); + break; + default: + break; + } + } + +#ifndef _WIN32 + if (!read_only + && *success + && access_type == OS_FILE_READ_WRITE + && !my_disable_locking + && os_file_lock(file, name)) { + + *success = false; + close(file); + file = -1; + } +#endif /* !_WIN32 */ + + return(file); +} + +/** This function attempts to create a directory named pathname. The new +directory gets default permissions. On Unix the permissions are +(0770 & ~umask). If the directory exists already, nothing is done and +the call succeeds, unless the fail_if_exists arguments is true. +If another error occurs, such as a permission error, this does not crash, +but reports the error and returns false. 
+@param[in] pathname directory name as null-terminated string +@param[in] fail_if_exists if true, pre-existing directory is treated as + an error. +@return true if call succeeds, false on error */ +bool +os_file_create_directory( + const char* pathname, + bool fail_if_exists) +{ + int rcode; + + rcode = mkdir(pathname, 0770); + + if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) { + /* failure */ + os_file_handle_error_no_exit(pathname, "mkdir", false); + + return(false); + } + + return(true); +} + +/** NOTE! Use the corresponding macro os_file_create(), not directly +this function! +Opens an existing file or creates a new. +@param[in] name name of the file or path as a null-terminated + string +@param[in] create_mode create mode +@param[in] purpose OS_FILE_AIO, if asynchronous, non-buffered I/O + is desired, OS_FILE_NORMAL, if any normal file; + NOTE that it also depends on type, os_aio_.. + and srv_.. variables whether we really use async + I/O or unbuffered I/O: look in the function + source code for the exact rules +@param[in] type OS_DATA_FILE or OS_LOG_FILE +@param[in] read_only true, if read only checks should be enforcedm +@param[in] success true if succeeded +@return handle to the file, not defined if error, error number + can be retrieved with os_file_get_last_error */ +pfs_os_file_t +os_file_create_func( + const char* name, + ulint create_mode, + ulint purpose, + ulint type, + bool read_only, + bool* success) +{ + bool on_error_no_exit; + bool on_error_silent; + + *success = false; + + DBUG_EXECUTE_IF( + "ib_create_table_fail_disk_full", + *success = false; + errno = ENOSPC; + return(OS_FILE_CLOSED); + ); + + int create_flag; + const char* mode_str = NULL; + + on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT + ? true : false; + on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT + ? 
true : false; + + create_mode &= ulint(~(OS_FILE_ON_ERROR_NO_EXIT + | OS_FILE_ON_ERROR_SILENT)); + + if (create_mode == OS_FILE_OPEN + || create_mode == OS_FILE_OPEN_RAW + || create_mode == OS_FILE_OPEN_RETRY) { + + mode_str = "OPEN"; + + create_flag = read_only ? O_RDONLY : O_RDWR; + + } else if (read_only) { + + mode_str = "OPEN"; + + create_flag = O_RDONLY; + + } else if (create_mode == OS_FILE_CREATE) { + + mode_str = "CREATE"; + create_flag = O_RDWR | O_CREAT | O_EXCL; + + } else if (create_mode == OS_FILE_OVERWRITE) { + + mode_str = "OVERWRITE"; + create_flag = O_RDWR | O_CREAT | O_TRUNC; + + } else { + ib::error() + << "Unknown file create mode (" << create_mode << ")" + << " for file '" << name << "'"; + + return(OS_FILE_CLOSED); + } + + ut_a(type == OS_LOG_FILE + || type == OS_DATA_FILE + || type == OS_DATA_FILE_NO_O_DIRECT); + + ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL); + + /* We let O_DSYNC only affect log files */ + + if (!read_only + && type == OS_LOG_FILE + && srv_file_flush_method == SRV_O_DSYNC) { +#ifdef O_DSYNC + create_flag |= O_DSYNC; +#else + create_flag |= O_SYNC; +#endif + } + + os_file_t file; + bool retry; + + do { + file = open(name, create_flag | O_CLOEXEC, os_innodb_umask); + + if (file == -1) { + const char* operation; + + operation = (create_mode == OS_FILE_CREATE + && !read_only) ? 
"create" : "open"; + + *success = false; + + if (on_error_no_exit) { + retry = os_file_handle_error_no_exit( + name, operation, on_error_silent); + } else { + retry = os_file_handle_error(name, operation); + } + } else { + *success = true; + retry = false; + } + + } while (retry); + + if (!*success) { + return file; + } + +#if (defined __sun__ && defined DIRECTIO_ON) || defined O_DIRECT + if (type == OS_DATA_FILE) { + switch (srv_file_flush_method) { + case SRV_O_DSYNC: + case SRV_O_DIRECT: + case SRV_O_DIRECT_NO_FSYNC: +# ifdef __linux__ +use_o_direct: +# endif + os_file_set_nocache(file, name, mode_str); + break; + default: + break; + } + } +# ifdef __linux__ + else if (type == OS_LOG_FILE && !log_sys.is_opened()) { + struct stat st; + char b[20 + sizeof "/sys/dev/block/" ":" + "/../queue/physical_block_size"]; + int f; + if (fstat(file, &st)) { + goto skip_o_direct; + } + MSAN_STAT_WORKAROUND(&st); + if (snprintf(b, sizeof b, + "/sys/dev/block/%u:%u/queue/physical_block_size", + major(st.st_dev), minor(st.st_dev)) + >= static_cast<int>(sizeof b)) { + goto skip_o_direct; + } + if ((f = open(b, O_RDONLY)) == -1) { + if (snprintf(b, sizeof b, + "/sys/dev/block/%u:%u/../queue/" + "physical_block_size", + major(st.st_dev), minor(st.st_dev)) + >= static_cast<int>(sizeof b)) { + goto skip_o_direct; + } + f = open(b, O_RDONLY); + } + if (f != -1) { + ssize_t l = read(f, b, sizeof b); + unsigned long s = 0; + + if (l > 0 && static_cast<size_t>(l) < sizeof b + && b[l - 1] == '\n') { + char* end = b; + s = strtoul(b, &end, 10); + if (b == end || *end != '\n') { + s = 0; + } + } + close(f); + if (s > 4096 || s < 64 || !ut_is_2pow(s)) { + goto skip_o_direct; + } + log_sys.log_maybe_unbuffered= true; + log_sys.set_block_size(uint32_t(s)); + if (!log_sys.log_buffered && !(st.st_size & (s - 1))) { + goto use_o_direct; + } + } else { +skip_o_direct: + log_sys.log_maybe_unbuffered= false; + log_sys.log_buffered= true; + log_sys.set_block_size(512); + } + } +# endif +#endif + 
+#ifndef _WIN32 + if (!read_only + && create_mode != OS_FILE_OPEN_RAW + && !my_disable_locking + && os_file_lock(file, name)) { + + if (create_mode == OS_FILE_OPEN_RETRY) { + ib::info() + << "Retrying to lock the first data file"; + + for (int i = 0; i < 100; i++) { + std::this_thread::sleep_for( + std::chrono::seconds(1)); + + if (!os_file_lock(file, name)) { + *success = true; + return(file); + } + } + + ib::info() + << "Unable to open the first data file"; + } + + *success = false; + close(file); + file = -1; + } +#endif /* !_WIN32 */ + + return(file); +} + +/** NOTE! Use the corresponding macro +os_file_create_simple_no_error_handling(), not directly this function! +A simple function to open or create a file. +@param[in] name name of the file or path as a null-terminated + string +@param[in] create_mode create mode +@param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or + OS_FILE_READ_ALLOW_DELETE; the last option + is used by a backup program reading the file +@param[in] read_only if true read only mode checks are enforced +@param[out] success true if succeeded +@return own: handle to the file, not defined if error, error number + can be retrieved with os_file_get_last_error */ +pfs_os_file_t +os_file_create_simple_no_error_handling_func( + const char* name, + ulint create_mode, + ulint access_type, + bool read_only, + bool* success) +{ + os_file_t file; + int create_flag; + + ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); + ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); + + *success = false; + + if (create_mode == OS_FILE_OPEN) { + + if (access_type == OS_FILE_READ_ONLY) { + + create_flag = O_RDONLY; + + } else if (read_only) { + + create_flag = O_RDONLY; + + } else { + + ut_a(access_type == OS_FILE_READ_WRITE + || access_type == OS_FILE_READ_ALLOW_DELETE); + + create_flag = O_RDWR; + } + + } else if (read_only) { + + create_flag = O_RDONLY; + + } else if (create_mode == OS_FILE_CREATE) { + + create_flag = O_RDWR | O_CREAT | O_EXCL; + + } else { 
+ + ib::error() + << "Unknown file create mode " + << create_mode << " for file '" << name << "'"; + + return(OS_FILE_CLOSED); + } + + file = open(name, create_flag | O_CLOEXEC, os_innodb_umask); + + *success = (file != -1); + +#ifndef _WIN32 + if (!read_only + && *success + && access_type == OS_FILE_READ_WRITE + && !my_disable_locking + && os_file_lock(file, name)) { + + *success = false; + close(file); + file = -1; + + } +#endif /* !_WIN32 */ + + return(file); +} + +/** Deletes a file if it exists. The file has to be closed before calling this. +@param[in] name file path as a null-terminated string +@param[out] exist indicate if file pre-exist +@return true if success */ +bool +os_file_delete_if_exists_func( + const char* name, + bool* exist) +{ + if (exist != NULL) { + *exist = true; + } + + int ret; + + ret = unlink(name); + + if (ret != 0 && errno == ENOENT) { + if (exist != NULL) { + *exist = false; + } + } else if (ret != 0 && errno != ENOENT) { + os_file_handle_error_no_exit(name, "delete", false); + + return(false); + } + + return(true); +} + +/** Deletes a file. The file has to be closed before calling this. +@param[in] name file path as a null-terminated string +@return true if success */ +bool +os_file_delete_func( + const char* name) +{ + int ret; + + ret = unlink(name); + + if (ret != 0) { + os_file_handle_error_no_exit(name, "delete", FALSE); + + return(false); + } + + return(true); +} + +/** NOTE! Use the corresponding macro os_file_rename(), not directly this +function! +Renames a file (can also move it to another directory). It is safest that the +file is closed before calling this function. +@param[in] oldpath old file path as a null-terminated string +@param[in] newpath new file path +@return true if success */ +bool +os_file_rename_func( + const char* oldpath, + const char* newpath) +{ +#ifdef UNIV_DEBUG + os_file_type_t type; + bool exists; + + /* New path must not exist. 
*/ + ut_ad(os_file_status(newpath, &exists, &type)); + ut_ad(!exists); + + /* Old path must exist. */ + ut_ad(os_file_status(oldpath, &exists, &type)); + ut_ad(exists); +#endif /* UNIV_DEBUG */ + + int ret; + + ret = rename(oldpath, newpath); + + if (ret != 0) { + os_file_handle_rename_error(oldpath, newpath); + + return(false); + } + + return(true); +} + +/** NOTE! Use the corresponding macro os_file_close(), not directly this +function! +Closes a file handle. In case of error, error number can be retrieved with +os_file_get_last_error. +@param[in] file Handle to close +@return true if success */ +bool os_file_close_func(os_file_t file) +{ + int ret= close(file); + + if (!ret) + return true; + + os_file_handle_error(NULL, "close"); + return false; +} + +/** Gets a file size. +@param[in] file handle to an open file +@return file size, or (os_offset_t) -1 on failure */ +os_offset_t +os_file_get_size(os_file_t file) +{ + struct stat statbuf; + if (fstat(file, &statbuf)) return os_offset_t(-1); + MSAN_STAT_WORKAROUND(&statbuf); + return statbuf.st_size; +} + +/** Gets a file size. 
+@param[in] filename Full path to the filename to check +@return file size if OK, else set m_total_size to ~0 and m_alloc_size to + errno */ +os_file_size_t +os_file_get_size( + const char* filename) +{ + struct stat s; + os_file_size_t file_size; + + int ret = stat(filename, &s); + + if (ret == 0) { + MSAN_STAT_WORKAROUND(&s); + file_size.m_total_size = s.st_size; + /* st_blocks is in 512 byte sized blocks */ + file_size.m_alloc_size = s.st_blocks * 512; + } else { + file_size.m_total_size = ~0U; + file_size.m_alloc_size = (os_offset_t) errno; + } + + return(file_size); +} + +/** This function returns information about the specified file +@param[in] path pathname of the file +@param[out] stat_info information of a file in a directory +@param[in,out] statinfo information of a file in a directory +@param[in] check_rw_perm for testing whether the file can be opened + in RW mode +@param[in] read_only if true read only mode checks are enforced +@return DB_SUCCESS if all OK */ +static +dberr_t +os_file_get_status_posix( + const char* path, + os_file_stat_t* stat_info, + struct stat* statinfo, + bool check_rw_perm, + bool read_only) +{ + int ret = stat(path, statinfo); + + if (ret && (errno == ENOENT || errno == ENOTDIR + || errno == ENAMETOOLONG)) { + /* file does not exist */ + + return(DB_NOT_FOUND); + + } else if (ret) { + /* file exists, but stat call failed */ + + os_file_handle_error_no_exit(path, "stat", false); + + return(DB_FAIL); + } + + MSAN_STAT_WORKAROUND(statinfo); + + switch (statinfo->st_mode & S_IFMT) { + case S_IFDIR: + stat_info->type = OS_FILE_TYPE_DIR; + break; + case S_IFLNK: + stat_info->type = OS_FILE_TYPE_LINK; + break; + case S_IFBLK: + /* Handle block device as regular file. */ + case S_IFCHR: + /* Handle character device as regular file. 
*/ + case S_IFREG: + stat_info->type = OS_FILE_TYPE_FILE; + break; + default: + stat_info->type = OS_FILE_TYPE_UNKNOWN; + } + + stat_info->size = statinfo->st_size; + stat_info->block_size = statinfo->st_blksize; + stat_info->alloc_size = statinfo->st_blocks * 512; + + if (check_rw_perm + && (stat_info->type == OS_FILE_TYPE_FILE + || stat_info->type == OS_FILE_TYPE_BLOCK)) { + + stat_info->rw_perm = !access(path, read_only + ? R_OK : R_OK | W_OK); + } + + return(DB_SUCCESS); +} + +/** Truncates a file to a specified size in bytes. +Do nothing if the size to preserve is greater or equal to the current +size of the file. +@param[in] pathname file path +@param[in] file file to be truncated +@param[in] size size to preserve in bytes +@return true if success */ +static +bool +os_file_truncate_posix( + const char* pathname, + os_file_t file, + os_offset_t size) +{ + int res = ftruncate(file, size); + + if (res == -1) { + + bool retry; + + retry = os_file_handle_error_no_exit( + pathname, "truncate", false); + + if (retry) { + ib::warn() + << "Truncate failed for '" + << pathname << "'"; + } + } + + return(res == 0); +} + +/** Truncates a file at its current position. +@return true if success */ +bool +os_file_set_eof( + FILE* file) /*!< in: file to be truncated */ +{ + return(!ftruncate(fileno(file), ftell(file))); +} + +#else /* !_WIN32 */ + +#include <WinIoCtl.h> + + + +/** Free storage space associated with a section of the file. +@param[in] fh Open file handle +@param[in] off Starting offset (SEEK_SET) +@param[in] len Size of the hole +@return 0 on success or errno */ +static +dberr_t +os_file_punch_hole_win32( + os_file_t fh, + os_offset_t off, + os_offset_t len) +{ + FILE_ZERO_DATA_INFORMATION punch; + + punch.FileOffset.QuadPart = off; + punch.BeyondFinalZero.QuadPart = off + len; + + /* If lpOverlapped is NULL, lpBytesReturned cannot be NULL, + therefore we pass a dummy parameter. 
*/ + DWORD temp; + BOOL success = os_win32_device_io_control( + fh, FSCTL_SET_ZERO_DATA, &punch, sizeof(punch), + NULL, 0, &temp); + + return(success ? DB_SUCCESS: DB_IO_NO_PUNCH_HOLE); +} + +/** Check the existence and type of the given file. +@param[in] path path name of file +@param[out] exists true if the file exists +@param[out] type Type of the file, if it exists +@return true if call succeeded */ +static +bool +os_file_status_win32( + const char* path, + bool* exists, + os_file_type_t* type) +{ + int ret; + struct _stat64 statinfo; + + ret = _stat64(path, &statinfo); + + *exists = !ret; + + if (!ret) { + /* file exists, everything OK */ + + } else if (errno == ENOENT || errno == ENOTDIR || errno == ENAMETOOLONG) { + /* file does not exist */ + return(true); + + } else { + /* file exists, but stat call failed */ + os_file_handle_error_no_exit(path, "stat", false); + return(false); + } + + if (_S_IFDIR & statinfo.st_mode) { + *type = OS_FILE_TYPE_DIR; + + } else if (_S_IFREG & statinfo.st_mode) { + *type = OS_FILE_TYPE_FILE; + + } else { + *type = OS_FILE_TYPE_UNKNOWN; + } + + return(true); +} + +/* Dynamically load NtFlushBuffersFileEx, used in os_file_flush_func */ +#include <winternl.h> +typedef NTSTATUS(WINAPI* pNtFlushBuffersFileEx)( + HANDLE FileHandle, ULONG Flags, PVOID Parameters, ULONG ParametersSize, + PIO_STATUS_BLOCK IoStatusBlock); + +static pNtFlushBuffersFileEx my_NtFlushBuffersFileEx + = (pNtFlushBuffersFileEx)GetProcAddress(GetModuleHandle("ntdll"), + "NtFlushBuffersFileEx"); + +/** NOTE! Use the corresponding macro os_file_flush(), not directly this +function! +Flushes the write buffers of a given file to the disk. 
+@param[in] file handle to a file +@return true if success */ +bool os_file_flush_func(os_file_t file) +{ + ++os_n_fsyncs; + static bool disable_datasync; + + if (my_NtFlushBuffersFileEx && !disable_datasync) + { + IO_STATUS_BLOCK iosb{}; + NTSTATUS status= my_NtFlushBuffersFileEx( + file, FLUSH_FLAGS_FILE_DATA_SYNC_ONLY, nullptr, 0, &iosb); + if (!status) + return true; + /* + NtFlushBuffersFileEx(FLUSH_FLAGS_FILE_DATA_SYNC_ONLY) might fail + unless on Win10+, and maybe non-NTFS. Switch to using FlushFileBuffers(). + */ + disable_datasync= true; + } + + if (FlushFileBuffers(file)) + return true; + + /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is + actually a raw device, we choose to ignore that error if we are using + raw disks */ + if (srv_start_raw_disk_in_use && GetLastError() == ERROR_INVALID_FUNCTION) + return true; + + os_file_handle_error(nullptr, "flush"); + + /* It is a fatal error if a file flush does not succeed, because then + the database can get corrupt on disk */ + ut_error; + + return false; +} + +/** Retrieves the last error number if an error occurs in a file io function. +The number should be retrieved before any other OS calls (because they may +overwrite the error number). If the number is not known to this program, +then OS error number + OS_FILE_ERROR_MAX is returned. 
+@param[in] report_all_errors true if we want an error message +printed of all errors +@param[in] on_error_silent true then don't print any diagnostic + to the log +@return error number, or OS error number + OS_FILE_ERROR_MAX */ +ulint os_file_get_last_error(bool report_all_errors, bool on_error_silent) + +{ + ulint err = (ulint) GetLastError(); + + if (err == ERROR_SUCCESS) { + return(0); + } + + if (report_all_errors + || (!on_error_silent + && err != ERROR_DISK_FULL + && err != ERROR_FILE_NOT_FOUND + && err != ERROR_FILE_EXISTS)) { + + ib::error() + << "Operating system error number " << err + << " in a file operation."; + + switch (err) { + case ERROR_PATH_NOT_FOUND: + break; + case ERROR_ACCESS_DENIED: + ib::error() + << "The error means mariadbd does not have" + " the access rights to" + " the directory. It may also be" + " you have created a subdirectory" + " of the same name as a data file."; + break; + case ERROR_SHARING_VIOLATION: + case ERROR_LOCK_VIOLATION: + ib::error() + << "The error means that another program" + " is using InnoDB's files." + " This might be a backup or antivirus" + " software or another instance" + " of MariaDB." + " Please close it to get rid of this error."; + break; + case ERROR_WORKING_SET_QUOTA: + case ERROR_NO_SYSTEM_RESOURCES: + ib::error() + << "The error means that there are no" + " sufficient system resources or quota to" + " complete the operation."; + break; + case ERROR_OPERATION_ABORTED: + ib::error() + << "The error means that the I/O" + " operation has been aborted" + " because of either a thread exit" + " or an application request." 
+ " Retry attempt is made."; + break; + default: + ib::info() << OPERATING_SYSTEM_ERROR_MSG; + } + } + + if (err == ERROR_FILE_NOT_FOUND) { + return(OS_FILE_NOT_FOUND); + } else if (err == ERROR_DISK_FULL) { + return(OS_FILE_DISK_FULL); + } else if (err == ERROR_FILE_EXISTS) { + return(OS_FILE_ALREADY_EXISTS); + } else if (err == ERROR_SHARING_VIOLATION + || err == ERROR_LOCK_VIOLATION) { + return(OS_FILE_SHARING_VIOLATION); + } else if (err == ERROR_WORKING_SET_QUOTA + || err == ERROR_NO_SYSTEM_RESOURCES) { + return(OS_FILE_INSUFFICIENT_RESOURCE); + } else if (err == ERROR_OPERATION_ABORTED) { + return(OS_FILE_OPERATION_ABORTED); + } else if (err == ERROR_ACCESS_DENIED) { + return(OS_FILE_ACCESS_VIOLATION); + } + + return(OS_FILE_ERROR_MAX + err); +} + + +/** NOTE! Use the corresponding macro os_file_create_simple(), not directly +this function! +A simple function to open or create a file. +@param[in] name name of the file or path as a null-terminated + string +@param[in] create_mode create mode +@param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE +@param[in] read_only if true read only mode checks are enforced +@param[out] success true if succeed, false if error +@return handle to the file, not defined if error, error number + can be retrieved with os_file_get_last_error */ +pfs_os_file_t +os_file_create_simple_func( + const char* name, + ulint create_mode, + ulint access_type, + bool read_only, + bool* success) +{ + os_file_t file; + + *success = false; + + DWORD access; + DWORD create_flag; + DWORD attributes = 0; + + ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); + ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); + ut_ad(srv_operation == SRV_OPERATION_NORMAL); + + if (create_mode == OS_FILE_OPEN) { + + create_flag = OPEN_EXISTING; + + } else if (read_only) { + + create_flag = OPEN_EXISTING; + + } else if (create_mode == OS_FILE_CREATE) { + + create_flag = CREATE_NEW; + + } else if (create_mode == OS_FILE_CREATE_PATH) { + + /* Create subdirs along 
the path if needed. */ + *success = os_file_create_subdirs_if_needed(name); + + if (!*success) { + + ib::error() + << "Unable to create subdirectories '" + << name << "'"; + + return(OS_FILE_CLOSED); + } + + create_flag = CREATE_NEW; + create_mode = OS_FILE_CREATE; + + } else { + + ib::error() + << "Unknown file create mode (" + << create_mode << ") for file '" + << name << "'"; + + return(OS_FILE_CLOSED); + } + + if (access_type == OS_FILE_READ_ONLY) { + + access = GENERIC_READ; + + } else if (read_only) { + + ib::info() + << "Read only mode set. Unable to" + " open file '" << name << "' in RW mode, " + << "trying RO mode"; + + access = GENERIC_READ; + + } else if (access_type == OS_FILE_READ_WRITE) { + + access = GENERIC_READ | GENERIC_WRITE; + + } else { + + ib::error() + << "Unknown file access type (" << access_type << ") " + "for file '" << name << "'"; + + return(OS_FILE_CLOSED); + } + + bool retry; + + do { + /* Use default security attributes and no template file. */ + + file = CreateFile( + (LPCTSTR) name, access, + FILE_SHARE_READ | FILE_SHARE_DELETE, + my_win_file_secattr(), create_flag, attributes, NULL); + + if (file == INVALID_HANDLE_VALUE) { + + *success = false; + + retry = os_file_handle_error( + name, create_mode == OS_FILE_OPEN ? + "open" : "create"); + + } else { + + retry = false; + + *success = true; + } + + } while (retry); + + return(file); +} + +/** This function attempts to create a directory named pathname. The new +directory gets default permissions. On Unix the permissions are +(0770 & ~umask). If the directory exists already, nothing is done and +the call succeeds, unless the fail_if_exists arguments is true. +If another error occurs, such as a permission error, this does not crash, +but reports the error and returns false. +@param[in] pathname directory name as null-terminated string +@param[in] fail_if_exists if true, pre-existing directory is treated + as an error. 
@return true if call succeeds, false on error */
bool
os_file_create_directory(
	const char*	pathname,
	bool		fail_if_exists)
{
	BOOL	rcode;

	rcode = CreateDirectory((LPCTSTR) pathname, NULL);
	/* Failure means: the call failed, and it was not the tolerated
	"already exists" case. */
	if (!(rcode != 0
	      || (GetLastError() == ERROR_ALREADY_EXISTS
		  && !fail_if_exists))) {

		os_file_handle_error_no_exit(
			pathname, "CreateDirectory", false);

		return(false);
	}

	return(true);
}

/** Get disk sector size for a file.
@param[in]	file	open file handle
@return the PhysicalBytesPerSectorForPerformance value reported for the
file if it is a power of two between 64 and 4096; otherwise 4096, which is
also returned when the query itself fails */
static size_t get_sector_size(HANDLE file)
{
  FILE_STORAGE_INFO fsi;
  ULONG s= 4096;
  if (GetFileInformationByHandleEx(file, FileStorageInfo, &fsi, sizeof fsi))
  {
    s= fsi.PhysicalBytesPerSectorForPerformance;
    if (s > 4096 || s < 64 || !ut_is_2pow(s))
      return 4096;
  }
  return s;
}

/** NOTE! Use the corresponding macro os_file_create(), not directly
this function!
Opens an existing file or creates a new.
@param[in]	name		name of the file or path as a null-terminated
				string
@param[in]	create_mode	create mode, optionally combined with
				OS_FILE_ON_ERROR_NO_EXIT and
				OS_FILE_ON_ERROR_SILENT
@param[in]	purpose		OS_FILE_AIO, if asynchronous, non-buffered I/O
				is desired, OS_FILE_NORMAL, if any normal file;
				NOTE that it also depends on type, os_aio_..
				and srv_.. variables whether we really use async
				I/O or unbuffered I/O: look in the function
				source code for the exact rules
@param[in]	type		OS_DATA_FILE or OS_LOG_FILE
@param[in]	read_only	if true, the file is opened with GENERIC_READ
				only and an existing file is required for any
				create mode other than OS_FILE_OPEN_RAW
@param[out]	success		true if succeeded
@return handle to the file, not defined if error, error number
	can be retrieved with os_file_get_last_error */
pfs_os_file_t
os_file_create_func(
	const char*	name,
	ulint		create_mode,
	ulint		purpose,
	ulint		type,
	bool		read_only,
	bool*		success)
{
	os_file_t	file;
	bool		retry;
	bool		on_error_no_exit;
	bool		on_error_silent;

	*success = false;

	DBUG_EXECUTE_IF(
		"ib_create_table_fail_disk_full",
		*success = false;
		SetLastError(ERROR_DISK_FULL);
		return(OS_FILE_CLOSED);
	);

	DWORD		create_flag;
	DWORD		share_mode = read_only
		? FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE
		: FILE_SHARE_READ | FILE_SHARE_DELETE;

	on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
		? true : false;

	on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
		? true : false;

	/* Strip the error-handling flags so that the comparisons below
	see only the create mode proper. */
	create_mode &= ~(OS_FILE_ON_ERROR_NO_EXIT | OS_FILE_ON_ERROR_SILENT);

	if (create_mode == OS_FILE_OPEN_RAW) {

		ut_a(!read_only);

		/* On Windows Physical devices require admin privileges and
		have to have the write-share mode set. See the remarks
		section for the CreateFile() function documentation in MSDN. */

		share_mode |= FILE_SHARE_WRITE;

		create_flag = OPEN_EXISTING;

	} else if (create_mode == OS_FILE_OPEN
		   || create_mode == OS_FILE_OPEN_RETRY) {

		create_flag = OPEN_EXISTING;

	} else if (read_only) {

		create_flag = OPEN_EXISTING;

	} else if (create_mode == OS_FILE_CREATE) {

		create_flag = CREATE_NEW;

	} else if (create_mode == OS_FILE_OVERWRITE) {

		create_flag = CREATE_ALWAYS;

	} else {
		ib::error()
			<< "Unknown file create mode (" << create_mode << ") "
			<< " for file '" << name << "'";

		return(OS_FILE_CLOSED);
	}

	DWORD attributes = (purpose == OS_FILE_AIO && srv_use_native_aio)
		? FILE_FLAG_OVERLAPPED : 0;

	if (type == OS_LOG_FILE) {
		/* Unbuffered log I/O is requested only while the log is not
		yet open and buffering has not been asked for. */
		if (!log_sys.is_opened() && !log_sys.log_buffered) {
			attributes|= FILE_FLAG_NO_BUFFERING;
		}
		if (srv_file_flush_method == SRV_O_DSYNC)
			attributes|= FILE_FLAG_WRITE_THROUGH;
	}
	else if (type == OS_DATA_FILE)
	{
		/* Data files are opened unbuffered unless one of the
		flush methods listed below is configured. */
		switch (srv_file_flush_method)
		{
		case SRV_FSYNC:
		case SRV_LITTLESYNC:
		case SRV_NOSYNC:
			break;
		default:
			attributes|= FILE_FLAG_NO_BUFFERING;
		}
	}

	DWORD	access = GENERIC_READ;

	if (!read_only) {
		access |= GENERIC_WRITE;
	}

	for (;;) {
		const  char *operation;

		/* Use default security attributes and no template file. */
		file = CreateFile(
			name, access, share_mode, my_win_file_secattr(),
			create_flag, attributes, NULL);

		*success = file != INVALID_HANDLE_VALUE;

		if (*success && type == OS_LOG_FILE) {
			uint32_t s = uint32_t(get_sector_size(file));
			log_sys.set_block_size(s);
			if (attributes & FILE_FLAG_NO_BUFFERING) {
				/* The file size is not a multiple of the
				sector size: close the handle and reopen
				the file without FILE_FLAG_NO_BUFFERING.
				OPEN_ALWAYS is used for the reopen, so a
				file that was just created by this call
				does not fail the second attempt. */
				if (os_file_get_size(file) % s) {
					attributes &= ~FILE_FLAG_NO_BUFFERING;
					create_flag = OPEN_ALWAYS;
					CloseHandle(file);
					continue;
				}
				log_sys.log_buffered = false;
			}
		}

		if (*success) {
			break;
		}

		operation = (create_mode == OS_FILE_CREATE && !read_only) ?
			"create" : "open";

		if (on_error_no_exit) {
			retry = os_file_handle_error_no_exit(
				name, operation, on_error_silent);
		}
		else {
			retry = os_file_handle_error(name, operation);
		}

		if (!retry) {
			break;
		}
	}

	/* Handles opened for overlapped I/O are registered with the
	thread pool, when there is one. */
	if (*success && (attributes & FILE_FLAG_OVERLAPPED) && srv_thread_pool) {
		srv_thread_pool->bind(file);
	}
	return(file);
}

/** NOTE! Use the corresponding macro os_file_create_simple_no_error_handling(),
not directly this function!
A simple function to open or create a file.
@param[in]	name		name of the file or path as a null-terminated
				string
@param[in]	create_mode	create mode
@param[in]	access_type	OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
				OS_FILE_READ_ALLOW_DELETE; the last option is
				used by a backup program reading the file
@param[in]	read_only	if true, an existing file is opened with
				GENERIC_READ regardless of create_mode and
				of an OS_FILE_READ_WRITE access_type
@param[out]	success		true if succeeded
@return own: handle to the file, not defined if error, error number
	can be retrieved with os_file_get_last_error */

pfs_os_file_t
os_file_create_simple_no_error_handling_func(
	const char*	name,
	ulint		create_mode,
	ulint		access_type,
	bool		read_only,
	bool*		success)
{
	os_file_t	file;

	*success = false;

	DWORD		access;
	DWORD		create_flag;
	DWORD		attributes	= 0;
	DWORD		share_mode = read_only
		? FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE
		: FILE_SHARE_READ | FILE_SHARE_DELETE;

	ut_a(name);

	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));

	if (create_mode == OS_FILE_OPEN) {

		create_flag = OPEN_EXISTING;

	} else if (read_only) {

		create_flag = OPEN_EXISTING;

	} else if (create_mode == OS_FILE_CREATE) {

		create_flag = CREATE_NEW;

	} else {

		ib::error()
			<< "Unknown file create mode (" << create_mode << ") "
			<< " for file '" << name << "'";

		return(OS_FILE_CLOSED);
	}

	if (access_type == OS_FILE_READ_ONLY) {

		access = GENERIC_READ;

	} else if (read_only) {

		access = GENERIC_READ;

	} else if (access_type == OS_FILE_READ_WRITE) {

		access = GENERIC_READ | GENERIC_WRITE;

	} else if (access_type == OS_FILE_READ_ALLOW_DELETE) {

		ut_a(!read_only);

		access = GENERIC_READ;

		/*!< A backup program has to give mysqld the maximum
		freedom to do what it likes with the file */

		share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE
			| FILE_SHARE_READ;

	} else {

		ib::error()
			<< "Unknown file access type (" << access_type << ") "
			<< "for file '" << name << "'";

		return(OS_FILE_CLOSED);
	}

	/* A single attempt: a failure is only reported through
	*success and the returned handle. */
	file = CreateFile((LPCTSTR) name,
			  access,
			  share_mode,
			  my_win_file_secattr(),
			  create_flag,
			  attributes,
			  NULL);		// No template file

	*success = (file != INVALID_HANDLE_VALUE);

	return(file);
}

/** Deletes a file if it exists. The file has to be closed before calling this.
The deletion is retried once per second; after more than 2000 failed
attempts the function gives up.
@param[in]	name		file path as a null-terminated string
@param[out]	exist		indicate if file pre-exist; may be NULL
@return true if success
NOTE(review): ERROR_ACCESS_DENIED also returns true, with *exist left at
true, although nothing was deleted - confirm that callers expect this. */
bool
os_file_delete_if_exists_func(
	const char*	name,
	bool*		exist)
{
	ulint	count = 0;

	if (exist != NULL) {
		*exist = true;
	}

	for (;;) {
		/* In Windows, deleting an .ibd file may fail if
		the file is being accessed by an external program,
		such as a backup tool. */

		bool	ret = DeleteFile((LPCTSTR) name);

		if (ret) {
			return(true);
		}

		switch (GetLastError()) {
		case ERROR_FILE_NOT_FOUND:
		case ERROR_PATH_NOT_FOUND:
			/* the file does not exist, this not an error */
			if (exist != NULL) {
				*exist = false;
			}
			/* fall through */
		case ERROR_ACCESS_DENIED:
			return(true);
		}

		++count;

		/* After 100 failed attempts, report every tenth one. */
		if (count > 100 && 0 == (count % 10)) {

			/* Print error information */
			os_file_get_last_error(true);

			ib::warn() << "Delete of file '" << name << "' failed.";
		}

		std::this_thread::sleep_for(std::chrono::seconds(1));

		if (count > 2000) {

			return(false);
		}
	}
}

/** Deletes a file. The file has to be closed before calling this.
The deletion is retried once per second; after more than 2000 failed
attempts the function gives up.
@param[in]	name		File path as NUL terminated string
@return true if success; false if the file does not exist or could not
be deleted within the retry limit */
bool
os_file_delete_func(
	const char*	name)
{
	ulint	count = 0;

	for (;;) {
		/* In Windows, deleting an .ibd file may fail if
		the file is being accessed by an external program,
		such as a backup tool. */

		BOOL	ret = DeleteFile((LPCTSTR) name);

		if (ret) {
			return(true);
		}

		if (GetLastError() == ERROR_FILE_NOT_FOUND) {
			/* If the file does not exist, we classify this as
			a 'mild' error and return */

			return(false);
		}

		++count;

		/* After 100 failed attempts, report every tenth one. */
		if (count > 100 && 0 == (count % 10)) {

			/* print error information */
			os_file_get_last_error(true);

			ib::warn()
				<< "Cannot delete file '" << name << "'. Is "
				<< "another program accessing it?";
		}

		std::this_thread::sleep_for(std::chrono::seconds(1));

		if (count > 2000) {

			return(false);
		}
	}

	/* Not reached: every path out of the loop above returns. */
	ut_error;
	return(false);
}

/** NOTE! Use the corresponding macro os_file_rename(), not directly this
function!
Renames a file (can also move it to another directory). It is safest that the
file is closed before calling this function.
+@param[in] oldpath old file path as a null-terminated string +@param[in] newpath new file path +@return true if success */ +bool +os_file_rename_func( + const char* oldpath, + const char* newpath) +{ +#ifdef UNIV_DEBUG + os_file_type_t type; + bool exists; + + /* New path must not exist. */ + ut_ad(os_file_status(newpath, &exists, &type)); + ut_ad(!exists); + + /* Old path must exist. */ + ut_ad(os_file_status(oldpath, &exists, &type)); + ut_ad(exists); +#endif /* UNIV_DEBUG */ + + if (MoveFileEx(oldpath, newpath, MOVEFILE_REPLACE_EXISTING)) { + return(true); + } + + os_file_handle_rename_error(oldpath, newpath); + return(false); +} + +/** NOTE! Use the corresponding macro os_file_close(), not directly +this function! +Closes a file handle. In case of error, error number can be retrieved with +os_file_get_last_error. +@param[in,own] file Handle to a file +@return true if success */ +bool os_file_close_func(os_file_t file) +{ + ut_ad(file); + if (!CloseHandle(file)) + { + os_file_handle_error(NULL, "close"); + return false; + } + + if(srv_thread_pool) + srv_thread_pool->unbind(file); + return true; +} + +/** Gets a file size. +@param[in] file Handle to a file +@return file size, or (os_offset_t) -1 on failure */ +os_offset_t os_file_get_size(os_file_t file) +{ + LARGE_INTEGER li; + if (GetFileSizeEx(file, &li)) + return li.QuadPart; + return ((os_offset_t) -1); +} + +/** Gets a file size. 
+@param[in] filename Full path to the filename to check +@return file size if OK, else set m_total_size to ~0 and m_alloc_size to + errno */ +os_file_size_t +os_file_get_size( + const char* filename) +{ + struct __stat64 s; + os_file_size_t file_size; + + int ret = _stat64(filename, &s); + + if (ret == 0) { + + file_size.m_total_size = s.st_size; + + DWORD low_size; + DWORD high_size; + + low_size = GetCompressedFileSize(filename, &high_size); + + if (low_size != INVALID_FILE_SIZE) { + + file_size.m_alloc_size = high_size; + file_size.m_alloc_size <<= 32; + file_size.m_alloc_size |= low_size; + + } else { + ib::error() + << "GetCompressedFileSize(" + << filename << ", ..) failed."; + + file_size.m_alloc_size = (os_offset_t) -1; + } + } else { + file_size.m_total_size = ~0; + file_size.m_alloc_size = (os_offset_t) ret; + } + + return(file_size); +} + +/** This function returns information about the specified file +@param[in] path pathname of the file +@param[out] stat_info information of a file in a directory +@param[in,out] statinfo information of a file in a directory +@param[in] check_rw_perm for testing whether the file can be opened + in RW mode +@param[in] read_only true if the file is opened in read-only mode +@return DB_SUCCESS if all OK */ +static +dberr_t +os_file_get_status_win32( + const char* path, + os_file_stat_t* stat_info, + struct _stat64* statinfo, + bool check_rw_perm, + bool read_only) +{ + int ret = _stat64(path, statinfo); + + if (ret && (errno == ENOENT || errno == ENOTDIR + || errno == ENAMETOOLONG)) { + /* file does not exist */ + + return(DB_NOT_FOUND); + + } else if (ret) { + /* file exists, but stat call failed */ + + os_file_handle_error_no_exit(path, "STAT", false); + + return(DB_FAIL); + + } else if (_S_IFDIR & statinfo->st_mode) { + + stat_info->type = OS_FILE_TYPE_DIR; + + } else if (_S_IFREG & statinfo->st_mode) { + + DWORD access = GENERIC_READ; + + if (!read_only) { + access |= GENERIC_WRITE; + } + + stat_info->type = 
OS_FILE_TYPE_FILE; + + /* Check if we can open it in read-only mode. */ + + if (check_rw_perm) { + HANDLE fh; + + fh = CreateFile( + (LPCTSTR) path, // File to open + access, + FILE_SHARE_READ | FILE_SHARE_WRITE + | FILE_SHARE_DELETE, // Full sharing + my_win_file_secattr(), + OPEN_EXISTING, // Existing file only + FILE_ATTRIBUTE_NORMAL, // Normal file + NULL); // No attr. template + + if (fh == INVALID_HANDLE_VALUE) { + stat_info->rw_perm = false; + } else { + stat_info->rw_perm = true; + CloseHandle(fh); + } + } + } else { + stat_info->type = OS_FILE_TYPE_UNKNOWN; + } + + return(DB_SUCCESS); +} + +/** +Sets a sparse flag on Windows file. +@param[in] file file handle +@return true on success, false on error +*/ +#include <versionhelpers.h> +bool os_file_set_sparse_win32(os_file_t file, bool is_sparse) +{ + if (!is_sparse && !IsWindows8OrGreater()) { + /* Cannot unset sparse flag on older Windows. + Until Windows8 it is documented to produce unpredictable results, + if there are unallocated ranges in file.*/ + return false; + } + DWORD temp; + FILE_SET_SPARSE_BUFFER sparse_buffer; + sparse_buffer.SetSparse = is_sparse; + return os_win32_device_io_control(file, + FSCTL_SET_SPARSE, &sparse_buffer, sizeof(sparse_buffer), 0, 0,&temp); +} + + +/** +Change file size on Windows. + +If file is extended, the bytes between old and new EOF +are zeros. + +If file is sparse, "virtual" block is added at the end of +allocated area. + +If file is normal, file system allocates storage. 

@param[in]	pathname	file path, used only for error reporting
@param[in]	file		file handle
@param[in]	size		size to preserve in bytes
@return true if success */
bool
os_file_change_size_win32(
	const char*	pathname,
	os_file_t	file,
	os_offset_t	size)
{
	LARGE_INTEGER	length;

	length.QuadPart = size;

	/* Move the file pointer to the requested size first;
	SetEndOfFile() below is called only if that succeeded. */
	BOOL	success = SetFilePointerEx(file, length, NULL, FILE_BEGIN);

	if (!success) {
		os_file_handle_error_no_exit(
			pathname, "SetFilePointerEx", false);
	} else {
		success = SetEndOfFile(file);
		if (!success) {
			os_file_handle_error_no_exit(
				pathname, "SetEndOfFile", false);
		}
	}
	return(success);
}

/** Truncates a file at its current position.
@param[in]	file	Handle to be truncated
@return true if success */
bool
os_file_set_eof(
	FILE*		file)
{
	HANDLE	h = (HANDLE) _get_osfhandle(fileno(file));

	return(SetEndOfFile(h));
}

#endif /* !_WIN32*/

/** Does a synchronous read or write depending upon the type specified
In case of partial reads/writes the function tries
NUM_RETRIES_ON_PARTIAL_IO times to read/write the complete data.
@param[in]	in_type		IO flags
@param[in]	file		handle to an open file
@param[in,out]	buf		buffer to read into or to write from
@param[in]	n		number of bytes to transfer, starting from
				offset
@param[in]	offset		file offset from the start of the file
@param[out]	err		result of maybe_punch_hole() when all n bytes
				were transferred, DB_IO_ERROR otherwise
@return n if the whole request was transferred; otherwise the number of
bytes transferred before a hard error or before the retries ran out */
static MY_ATTRIBUTE((warn_unused_result))
ssize_t
os_file_io(
	const IORequest&in_type,
	os_file_t	file,
	void*		buf,
	ulint		n,
	os_offset_t	offset,
	dberr_t*	err)
{
	ssize_t		original_n = ssize_t(n);
	IORequest	type = in_type;
	ssize_t		bytes_returned = 0;

	SyncFileIO	sync_file_io(file, buf, n, offset);

	for (ulint i = 0; i < NUM_RETRIES_ON_PARTIAL_IO; ++i) {

		ssize_t	n_bytes = sync_file_io.execute(type);

		/* Check for a hard error. Not much we can do now. */
		if (n_bytes < 0) {

			break;

		} else if (n_bytes + bytes_returned == ssize_t(n)) {

			/* This attempt completed the request. */
			bytes_returned += n_bytes;

			*err = type.maybe_punch_hole(offset, n);

			return(original_n);
		}

		/* Handle partial read/write. */

		ut_ad(ulint(n_bytes + bytes_returned) < n);

		bytes_returned += n_bytes;

		/* Partial transfers are not reported for requests of
		type READ_MAYBE_PARTIAL. */
		if (type.type != IORequest::READ_MAYBE_PARTIAL) {
			sql_print_warning("InnoDB: %zu bytes should have been"
					  " %s at %llu from %s,"
					  " but got only %zd."
					  " Retrying.",
					  n, type.is_read()
					  ? "read" : "written", offset,
					  type.node
					  ? type.node->name
					  : "(unknown file)", bytes_returned);
		}

		/* Advance the offset and buffer by n_bytes */
		sync_file_io.advance(n_bytes);
	}

	/* Reached after a hard error or when the retries ran out. */
	*err = DB_IO_ERROR;

	if (type.type != IORequest::READ_MAYBE_PARTIAL) {
		ib::warn()
			<< "Retry attempts for "
			<< (type.is_read() ? "reading" : "writing")
			<< " partial data failed.";
	}

	return(bytes_returned);
}

/** Does a synchronous write operation in Posix.
Counts the write in os_n_file_writes and, when the monitor is enabled,
in MONITOR_OS_PENDING_WRITES for the duration of the call.
@param[in]	type		IO context
@param[in]	file		handle to an open file
@param[in]	buf		buffer from which to write
@param[in]	n		number of bytes to write, starting from offset
@param[in]	offset		file offset from the start where to write
@param[out]	err		DB_SUCCESS or error code
@return number of bytes written, as returned by os_file_io(); compare
with n, or check *err, to detect a failure */
static MY_ATTRIBUTE((warn_unused_result))
ssize_t
os_file_pwrite(
	const IORequest&	type,
	os_file_t		file,
	const byte*		buf,
	ulint			n,
	os_offset_t		offset,
	dberr_t*		err)
{
	ut_ad(type.is_write());

	++os_n_file_writes;

	const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_WRITES);
	MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
	ssize_t	n_bytes = os_file_io(type, file, const_cast<byte*>(buf),
				     n, offset, err);
	MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_WRITES, monitor);

	return(n_bytes);
}

/** NOTE! Use the corresponding macro os_file_write(), not directly this
function!
Requests a synchronous write operation.
+@param[in] type IO flags +@param[in] file handle to an open file +@param[out] buf buffer from which to write +@param[in] offset file offset from the start where to read +@param[in] n number of bytes to read, starting from offset +@return error code +@retval DB_SUCCESS if the operation succeeded */ +dberr_t +os_file_write_func( + const IORequest& type, + const char* name, + os_file_t file, + const void* buf, + os_offset_t offset, + ulint n) +{ + dberr_t err; + + ut_ad(n > 0); + + ssize_t n_bytes = os_file_pwrite(type, file, (byte*)buf, n, offset, &err); + + if ((ulint) n_bytes != n && !os_has_said_disk_full) { + + ib::error() + << "Write to file " << name << " failed at offset " + << offset << ", " << n + << " bytes should have been written," + " only " << n_bytes << " were written." + " Operating system error number " << IF_WIN(GetLastError(),errno) << "." + " Check that your OS and file system" + " support files of this size." + " Check also that the disk is not full" + " or a disk quota exceeded."; +#ifndef _WIN32 + if (strerror(errno) != NULL) { + + ib::error() + << "Error number " << errno + << " means '" << strerror(errno) << "'"; + } + + ib::info() << OPERATING_SYSTEM_ERROR_MSG; +#endif + os_has_said_disk_full = true; + } + + return(err); +} + +/** Does a synchronous read operation in Posix. 
+@param[in] type IO flags +@param[in] file handle to an open file +@param[out] buf buffer where to read +@param[in] offset file offset from the start where to read +@param[in] n number of bytes to read, starting from offset +@param[out] err DB_SUCCESS or error code +@return number of bytes read, -1 if error */ +static MY_ATTRIBUTE((warn_unused_result)) +ssize_t +os_file_pread( + const IORequest& type, + os_file_t file, + void* buf, + ulint n, + os_offset_t offset, + dberr_t* err) +{ + ut_ad(type.is_read()); + + ++os_n_file_reads; + + const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_READS); + MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor); + ssize_t n_bytes = os_file_io(type, file, buf, n, offset, err); + MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor); + + return(n_bytes); +} + +/** Requests a synchronous positioned read operation. +@return DB_SUCCESS if request was successful, false if fail +@param[in] type IO flags +@param[in] file handle to an open file +@param[out] buf buffer where to read +@param[in] offset file offset from the start where to read +@param[in] n number of bytes to read, starting from offset +@param[out] o number of bytes actually read +@return DB_SUCCESS or error code */ +dberr_t +os_file_read_func( + const IORequest& type, + os_file_t file, + void* buf, + os_offset_t offset, + ulint n, + ulint* o) +{ + ut_ad(!type.node || type.node->handle == file); + ut_ad(n); + + os_bytes_read_since_printout+= n; + + dberr_t err; + ssize_t n_bytes= os_file_pread(type, file, buf, n, offset, &err); + + if (o) + *o= ulint(n_bytes); + + if (ulint(n_bytes) == n || err != DB_SUCCESS) + return err; + + os_file_handle_error_cond_exit(type.node ? type.node->name : nullptr, "read", + false, false); + sql_print_error("InnoDB: Tried to read %zu bytes at offset %llu" + " of file %s, but was only able to read %zd", + n, offset, type.node ? type.node->name : "(unknown)", + n_bytes); + + return err ? 
err : DB_IO_ERROR; +} + +/** Handle errors for file operations. +@param[in] name name of a file or NULL +@param[in] operation operation +@param[in] should_abort whether to abort on an unknown error +@param[in] on_error_silent whether to suppress reports of non-fatal errors +@return true if we should retry the operation */ +static MY_ATTRIBUTE((warn_unused_result)) +bool +os_file_handle_error_cond_exit( + const char* name, + const char* operation, + bool should_abort, + bool on_error_silent) +{ + ulint err; + + err = os_file_get_last_error(false, on_error_silent); + + switch (err) { + case OS_FILE_DISK_FULL: + /* We only print a warning about disk full once */ + + if (os_has_said_disk_full) { + + return(false); + } + + /* Disk full error is reported irrespective of the + on_error_silent setting. */ + + if (name) { + + ib::error() + << "Encountered a problem with file '" + << name << "'"; + } + + ib::error() + << "Disk is full. Try to clean the disk to free space."; + + os_has_said_disk_full = true; + + return(false); + + case OS_FILE_AIO_RESOURCES_RESERVED: + case OS_FILE_AIO_INTERRUPTED: + + return(true); + + case OS_FILE_PATH_ERROR: + case OS_FILE_ALREADY_EXISTS: + case OS_FILE_ACCESS_VIOLATION: + return(false); + + case OS_FILE_NOT_FOUND: + if (!on_error_silent) { + sql_print_error("InnoDB: File %s was not found", name); + } + return false; + + case OS_FILE_SHARING_VIOLATION: + + std::this_thread::sleep_for(std::chrono::seconds(10)); + return(true); + + case OS_FILE_OPERATION_ABORTED: + case OS_FILE_INSUFFICIENT_RESOURCE: + + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + return(true); + + default: + + /* If it is an operation that can crash on error then it + is better to ignore on_error_silent and print an error message + to the log. */ + + if (should_abort || !on_error_silent) { + ib::error() << "File " + << (name != NULL ? name : "(unknown)") + << ": '" << operation << "'" + " returned OS error " << err << "." + << (should_abort + ? 
" Cannot continue operation" : ""); + } + + if (should_abort) { + abort(); + } + } + + return(false); +} + +#ifndef _WIN32 +/** Tries to disable OS caching on an opened file descriptor. +@param[in] fd file descriptor to alter +@param[in] file_name file name, used in the diagnostic message +@param[in] name "open" or "create"; used in the diagnostic + message */ +void +os_file_set_nocache( + int fd MY_ATTRIBUTE((unused)), + const char* file_name MY_ATTRIBUTE((unused)), + const char* operation_name MY_ATTRIBUTE((unused))) +{ + /* some versions of Solaris may not have DIRECTIO_ON */ +#if defined(__sun__) && defined(DIRECTIO_ON) + if (directio(fd, DIRECTIO_ON) == -1) { + int errno_save = errno; + + ib::error() + << "Failed to set DIRECTIO_ON on file " + << file_name << "; " << operation_name << ": " + << strerror(errno_save) << "," + " continuing anyway."; + } +#elif defined(O_DIRECT) + if (fcntl(fd, F_SETFL, O_DIRECT) == -1) { + int errno_save = errno; + static bool warning_message_printed = false; + if (errno_save == EINVAL) { + if (!warning_message_printed) { + warning_message_printed = true; + ib::info() + << "Setting O_DIRECT on file " + << file_name << " failed"; + } + } else { + ib::warn() + << "Failed to set O_DIRECT on file " + << file_name << "; " << operation_name + << " : " << strerror(errno_save) + << ", continuing anyway."; + } + } +#endif /* defined(__sun__) && defined(DIRECTIO_ON) */ +} + +#endif /* _WIN32 */ + +/** Check if the file system supports sparse files. +@param fh file handle +@return true if the file system supports sparse files */ +static bool os_is_sparse_file_supported(os_file_t fh) +{ +#ifdef _WIN32 + FILE_ATTRIBUTE_TAG_INFO info; + if (GetFileInformationByHandleEx(fh, FileAttributeTagInfo, + &info, (DWORD)sizeof(info))) { + if (info.FileAttributes != INVALID_FILE_ATTRIBUTES) { + return (info.FileAttributes & FILE_ATTRIBUTE_SPARSE_FILE) != 0; + } + } + return false; +#else + /* We don't know the FS block size, use the sector size. 
The FS + will do the magic. */ + return DB_SUCCESS == os_file_punch_hole_posix(fh, 0, srv_page_size); +#endif /* _WIN32 */ +} + +/** Extend a file. + +On Windows, extending a file allocates blocks for the file, +unless the file is sparse. + +On Unix, we will extend the file with ftruncate(), if +file needs to be sparse. Otherwise posix_fallocate() is used +when available, and if not, binary zeroes are added to the end +of file. + +@param[in] name file name +@param[in] file file handle +@param[in] size desired file size +@param[in] sparse whether to create a sparse file (no preallocating) +@return whether the operation succeeded */ +bool +os_file_set_size( + const char* name, + os_file_t file, + os_offset_t size, + bool is_sparse) +{ + ut_ad(!(size & 4095)); + +#ifdef _WIN32 + /* On Windows, changing file size works well and as expected for both + sparse and normal files. + + However, 10.2 up until 10.2.9 made every file sparse in innodb, + causing NTFS fragmentation issues(MDEV-13941). We try to undo + the damage, and unsparse the file.*/ + + if (!is_sparse && os_is_sparse_file_supported(file)) { + if (!os_file_set_sparse_win32(file, false)) + /* Unsparsing file failed. 
Fallback to writing binary + zeros, to avoid even higher fragmentation.*/ + goto fallback; + } + + return os_file_change_size_win32(name, file, size); + +fallback: +#else + struct stat statbuf; + + if (is_sparse) { + bool success = !ftruncate(file, size); + if (!success) { + ib::error() << "ftruncate of file " << name << " to " + << size << " bytes failed with error " + << errno; + } + return(success); + } + +# ifdef HAVE_POSIX_FALLOCATE + int err; + do { + if (fstat(file, &statbuf)) { + err = errno; + } else { + MSAN_STAT_WORKAROUND(&statbuf); + os_offset_t current_size = statbuf.st_size; + if (current_size >= size) { + return true; + } + current_size &= ~4095ULL; + err = posix_fallocate(file, current_size, + size - current_size); + } + } while (err == EINTR + && srv_shutdown_state <= SRV_SHUTDOWN_INITIATED); + + switch (err) { + case 0: + return true; + default: + ib::error() << "preallocating " + << size << " bytes for file " << name + << " failed with error " << err; + /* fall through */ + case EINTR: + errno = err; + return false; + case EINVAL: + case EOPNOTSUPP: + /* fall back to the code below */ + break; + } +# endif /* HAVE_POSIX_ALLOCATE */ +#endif /* _WIN32*/ + +#ifdef _WIN32 + os_offset_t current_size = os_file_get_size(file); + FILE_STORAGE_INFO info; + if (GetFileInformationByHandleEx(file, FileStorageInfo, &info, + sizeof info)) { + if (info.LogicalBytesPerSector) { + current_size &= ~os_offset_t(info.LogicalBytesPerSector + - 1); + } + } +#else + if (fstat(file, &statbuf)) { + return false; + } + os_offset_t current_size = statbuf.st_size & ~4095ULL; +#endif + if (current_size >= size) { + return true; + } + + /* Write up to 1 megabyte at a time. 
*/ + ulint buf_size = ut_min(ulint(64), + ulint(size >> srv_page_size_shift)) + << srv_page_size_shift; + + /* Align the buffer for possible raw i/o */ + byte* buf = static_cast<byte*>(aligned_malloc(buf_size, + srv_page_size)); + /* Write buffer full of zeros */ + memset(buf, 0, buf_size); + + while (current_size < size + && srv_shutdown_state <= SRV_SHUTDOWN_INITIATED) { + ulint n_bytes; + + if (size - current_size < (os_offset_t) buf_size) { + n_bytes = (ulint) (size - current_size); + } else { + n_bytes = buf_size; + } + + if (os_file_write(IORequestWrite, name, + file, buf, current_size, n_bytes) != + DB_SUCCESS) { + break; + } + + current_size += n_bytes; + } + + aligned_free(buf); + + return(current_size >= size && os_file_flush(file)); +} + +/** Truncate a file to a specified size in bytes. +@param[in] pathname file path +@param[in] file file to be truncated +@param[in] size size preserved in bytes +@param[in] allow_shrink whether to allow the file to become smaller +@return true if success */ +bool +os_file_truncate( + const char* pathname, + os_file_t file, + os_offset_t size, + bool allow_shrink) +{ + if (!allow_shrink) { + /* Do nothing if the size preserved is larger than or + equal to the current size of file */ + os_offset_t size_bytes = os_file_get_size(file); + + if (size >= size_bytes) { + return(true); + } + } + +#ifdef _WIN32 + return(os_file_change_size_win32(pathname, file, size)); +#else /* _WIN32 */ + return(os_file_truncate_posix(pathname, file, size)); +#endif /* _WIN32 */ +} + +/** Check the existence and type of the given file. 
+@param[in] path path name of file +@param[out] exists true if the file exists +@param[out] type Type of the file, if it exists +@return true if call succeeded */ +bool +os_file_status( + const char* path, + bool* exists, + os_file_type_t* type) +{ +#ifdef _WIN32 + return(os_file_status_win32(path, exists, type)); +#else + return(os_file_status_posix(path, exists, type)); +#endif /* _WIN32 */ +} + +/** Free storage space associated with a section of the file. +@param[in] fh Open file handle +@param[in] off Starting offset (SEEK_SET) +@param[in] len Size of the hole +@return DB_SUCCESS or error code */ +dberr_t +os_file_punch_hole( + os_file_t fh, + os_offset_t off, + os_offset_t len) +{ +#ifdef _WIN32 + return os_file_punch_hole_win32(fh, off, len); +#else + return os_file_punch_hole_posix(fh, off, len); +#endif /* _WIN32 */ +} + +/** Free storage space associated with a section of the file. +@param off byte offset from the start (SEEK_SET) +@param len size of the hole in bytes +@return DB_SUCCESS or error code */ +dberr_t IORequest::punch_hole(os_offset_t off, ulint len) const +{ + ulint trim_len = bpage ? bpage->physical_size() - len : 0; + + if (trim_len == 0) { + return(DB_SUCCESS); + } + + off += len; + + /* Check does file system support punching holes for this + tablespace. */ + if (!node->punch_hole) { + return DB_IO_NO_PUNCH_HOLE; + } + + dberr_t err = os_file_punch_hole(node->handle, off, trim_len); + + switch (err) { + case DB_SUCCESS: + srv_stats.page_compressed_trim_op.inc(); + return err; + case DB_IO_NO_PUNCH_HOLE: + node->punch_hole = false; + err = DB_SUCCESS; + /* fall through */ + default: + return err; + } +} + +/* + Get file system block size, by path. + + This is expensive on Windows, and not very useful in general, + (only shown in some I_S table), so we keep that out of usual + stat. 
+*/ +size_t os_file_get_fs_block_size(const char *path) +{ +#ifdef _WIN32 + char volname[MAX_PATH]; + if (!GetVolumePathName(path, volname, MAX_PATH)) + return 0; + DWORD sectorsPerCluster; + DWORD bytesPerSector; + DWORD numberOfFreeClusters; + DWORD totalNumberOfClusters; + + if (GetDiskFreeSpace(volname, §orsPerCluster, &bytesPerSector, + &numberOfFreeClusters, &totalNumberOfClusters)) + return ((size_t) bytesPerSector) * sectorsPerCluster; +#else + os_file_stat_t info; + if (os_file_get_status(path, &info, false, false) == DB_SUCCESS) + return info.block_size; +#endif + return 0; +} + +/** This function returns information about the specified file +@param[in] path pathname of the file +@param[out] stat_info information of a file in a directory +@param[in] check_rw_perm for testing whether the file can be opened + in RW mode +@param[in] read_only true if file is opened in read-only mode +@return DB_SUCCESS if all OK */ +dberr_t +os_file_get_status( + const char* path, + os_file_stat_t* stat_info, + bool check_rw_perm, + bool read_only) +{ + dberr_t ret; + +#ifdef _WIN32 + struct _stat64 info; + + ret = os_file_get_status_win32( + path, stat_info, &info, check_rw_perm, read_only); + +#else + struct stat info; + + ret = os_file_get_status_posix( + path, stat_info, &info, check_rw_perm, read_only); + +#endif /* _WIN32 */ + + if (ret == DB_SUCCESS) { + stat_info->ctime = info.st_ctime; + stat_info->atime = info.st_atime; + stat_info->mtime = info.st_mtime; + stat_info->size = info.st_size; + } + + return(ret); +} + +static void fake_io_callback(void *c) +{ + tpool::aiocb *cb= static_cast<tpool::aiocb*>(c); + ut_ad(read_slots->contains(cb)); + static_cast<const IORequest*>(static_cast<const void*>(cb->m_userdata))-> + fake_read_complete(cb->m_offset); + read_slots->release(cb); +} + +static void read_io_callback(void *c) +{ + tpool::aiocb *cb= static_cast<tpool::aiocb*>(c); + ut_ad(cb->m_opcode == tpool::aio_opcode::AIO_PREAD); + ut_ad(read_slots->contains(cb)); + 
const IORequest &request= *static_cast<const IORequest*> + (static_cast<const void*>(cb->m_userdata)); + request.read_complete(cb->m_err); + read_slots->release(cb); +} + +static void write_io_callback(void *c) +{ + tpool::aiocb *cb= static_cast<tpool::aiocb*>(c); + ut_ad(cb->m_opcode == tpool::aio_opcode::AIO_PWRITE); + ut_ad(write_slots->contains(cb)); + const IORequest &request= *static_cast<const IORequest*> + (static_cast<const void*>(cb->m_userdata)); + + if (UNIV_UNLIKELY(cb->m_err != 0)) + ib::info () << "IO Error: " << cb->m_err + << "during write of " + << cb->m_len << " bytes, for file " + << request.node->name << "(" << cb->m_fh << "), returned " + << cb->m_ret_len; + + request.write_complete(cb->m_err); + write_slots->release(cb); +} + +#ifdef LINUX_NATIVE_AIO +/** Checks if the system supports native linux aio. On some kernel +versions where native aio is supported it won't work on tmpfs. In such +cases we can't use native aio. + +@return: true if supported, false otherwise. */ +static bool is_linux_native_aio_supported() +{ + File fd; + io_context_t io_ctx; + std::string log_file_path = get_log_file_path(); + + memset(&io_ctx, 0, sizeof(io_ctx)); + if (io_setup(1, &io_ctx)) { + + /* The platform does not support native aio. */ + + return(false); + + } + else if (!srv_read_only_mode) { + + /* Now check if tmpdir supports native aio ops. 
*/ + fd = mysql_tmpfile("ib"); + + if (fd < 0) { + ib::warn() + << "Unable to create temp file to check" + " native AIO support."; + + int ret = io_destroy(io_ctx); + ut_a(ret != -EINVAL); + ut_ad(ret != -EFAULT); + + return(false); + } + } + else { + fd = my_open(log_file_path.c_str(), O_RDONLY | O_CLOEXEC, + MYF(0)); + + if (fd == -1) { + + ib::warn() << "Unable to open \"" << log_file_path + << "\" to check native" + << " AIO read support."; + + int ret = io_destroy(io_ctx); + ut_a(ret != EINVAL); + ut_ad(ret != EFAULT); + + return(false); + } + } + + struct io_event io_event; + + memset(&io_event, 0x0, sizeof(io_event)); + + byte* ptr = static_cast<byte*>(aligned_malloc(srv_page_size, + srv_page_size)); + + struct iocb iocb; + + /* Suppress valgrind warning. */ + memset(ptr, 0, srv_page_size); + memset(&iocb, 0x0, sizeof(iocb)); + + struct iocb* p_iocb = &iocb; + + if (!srv_read_only_mode) { + + io_prep_pwrite(p_iocb, fd, ptr, srv_page_size, 0); + + } + else { + ut_a(srv_page_size >= 512); + io_prep_pread(p_iocb, fd, ptr, 512, 0); + } + + int err = io_submit(io_ctx, 1, &p_iocb); + + if (err >= 1) { + /* Now collect the submitted IO request. */ + err = io_getevents(io_ctx, 1, 1, &io_event, NULL); + } + + aligned_free(ptr); + my_close(fd, MYF(MY_WME)); + + switch (err) { + case 1: + { + int ret = io_destroy(io_ctx); + ut_a(ret != -EINVAL); + ut_ad(ret != -EFAULT); + + return(true); + } + + case -EINVAL: + case -ENOSYS: + ib::warn() + << "Linux Native AIO not supported. You can either" + " move " + << (srv_read_only_mode ? log_file_path : "tmpdir") + << " to a file system that supports native" + " AIO or you can set innodb_use_native_aio to" + " FALSE to avoid this message."; + + /* fall through. */ + default: + ib::warn() + << "Linux Native AIO check on " + << (srv_read_only_mode ? 
log_file_path : "tmpdir") + << "returned error[" << -err << "]"; + } + + int ret = io_destroy(io_ctx); + ut_a(ret != -EINVAL); + ut_ad(ret != -EFAULT); + + return(false); +} +#endif + +int os_aio_init() +{ + int max_write_events= int(srv_n_write_io_threads * + OS_AIO_N_PENDING_IOS_PER_THREAD); + int max_read_events= int(srv_n_read_io_threads * + OS_AIO_N_PENDING_IOS_PER_THREAD); + int max_events= max_read_events + max_write_events; + int ret; +#if LINUX_NATIVE_AIO + if (srv_use_native_aio && !is_linux_native_aio_supported()) + goto disable; +#endif + + ret= srv_thread_pool->configure_aio(srv_use_native_aio, max_events); + +#ifdef LINUX_NATIVE_AIO + if (ret) + { + ut_ad(srv_use_native_aio); +disable: + ib::warn() << "Linux Native AIO disabled."; + srv_use_native_aio= false; + ret= srv_thread_pool->configure_aio(false, max_events); + } +#endif + +#ifdef HAVE_URING + if (ret) + { + ut_ad(srv_use_native_aio); + ib::warn() + << "liburing disabled: falling back to innodb_use_native_aio=OFF"; + srv_use_native_aio= false; + ret= srv_thread_pool->configure_aio(false, max_events); + } +#endif + + if (!ret) + { + read_slots= new io_slots(max_read_events, srv_n_read_io_threads); + write_slots= new io_slots(max_write_events, srv_n_write_io_threads); + } + return ret; +} + + +/** +Change reader or writer thread parameter on a running server. +This includes resizing the io slots, as we calculate +number of outstanding IOs based on the these variables. + +It is trickier with when Linux AIO is involved (io_context +needs to be recreated to account for different number of +max_events). With Linux AIO, depending on fs-max-aio number +and user and system wide max-aio limitation, this can fail. + +Otherwise, we just resize the slots, and allow for +more concurrent threads via thread_group setting. 
+ +@param[in] n_reader_threads - max number of concurrently + executing read callbacks +@param[in] n_writer_thread - max number of cuncurrently + executing write callbacks +@return 0 for success, !=0 for error. +*/ +int os_aio_resize(ulint n_reader_threads, ulint n_writer_threads) +{ + /* Lock the slots, and wait until all current IOs finish.*/ + auto &lk_read= read_slots->mutex(), &lk_write= write_slots->mutex(); + mysql_mutex_lock(&lk_read); + mysql_mutex_lock(&lk_write); + + read_slots->wait(lk_read); + write_slots->wait(lk_write); + + /* Now, all IOs have finished and no new ones can start, due to locks. */ + int max_read_events= int(n_reader_threads * OS_AIO_N_PENDING_IOS_PER_THREAD); + int max_write_events= int(n_writer_threads * OS_AIO_N_PENDING_IOS_PER_THREAD); + int events= max_read_events + max_write_events; + + /** Do the Linux AIO dance (this will try to create a new + io context with changed max_events ,etc*/ + + int ret= srv_thread_pool->reconfigure_aio(srv_use_native_aio, events); + + if (ret) + { + /** Do the best effort. We can't change the parallel io number, + but we still can adjust the number of concurrent completion handlers.*/ + read_slots->task_group().set_max_tasks(static_cast<int>(n_reader_threads)); + write_slots->task_group().set_max_tasks(static_cast<int>(n_writer_threads)); + } + else + { + /* Allocation succeeded, resize the slots*/ + read_slots->resize(max_read_events, static_cast<int>(n_reader_threads)); + write_slots->resize(max_write_events, static_cast<int>(n_writer_threads)); + } + + mysql_mutex_unlock(&lk_read); + mysql_mutex_unlock(&lk_write); + return ret; +} + +void os_aio_free() +{ + srv_thread_pool->disable_aio(); + delete read_slots; + delete write_slots; + read_slots= nullptr; + write_slots= nullptr; +} + +/** Wait until there are no pending asynchronous writes. 
*/ +static void os_aio_wait_until_no_pending_writes_low(bool declare) +{ + const bool notify_wait= declare && write_slots->pending_io_count(); + + if (notify_wait) + tpool::tpool_wait_begin(); + + write_slots->wait(); + + if (notify_wait) + tpool::tpool_wait_end(); +} + +/** Wait until there are no pending asynchronous writes. +@param declare whether the wait will be declared in tpool */ +void os_aio_wait_until_no_pending_writes(bool declare) +{ + os_aio_wait_until_no_pending_writes_low(declare); + buf_dblwr.wait_flush_buffered_writes(); +} + +/** @return number of pending reads */ +size_t os_aio_pending_reads() +{ + mysql_mutex_lock(&read_slots->mutex()); + size_t pending= read_slots->pending_io_count(); + mysql_mutex_unlock(&read_slots->mutex()); + return pending; +} + +/** @return approximate number of pending reads */ +size_t os_aio_pending_reads_approx() +{ + return read_slots->pending_io_count(); +} + +/** @return number of pending writes */ +size_t os_aio_pending_writes() +{ + mysql_mutex_lock(&write_slots->mutex()); + size_t pending= write_slots->pending_io_count(); + mysql_mutex_unlock(&write_slots->mutex()); + return pending; +} + +/** Wait until all pending asynchronous reads have completed. +@param declare whether the wait will be declared in tpool */ +void os_aio_wait_until_no_pending_reads(bool declare) +{ + const bool notify_wait= declare && read_slots->pending_io_count(); + + if (notify_wait) + tpool::tpool_wait_begin(); + + read_slots->wait(); + + if (notify_wait) + tpool::tpool_wait_end(); +} + +/** Submit a fake read request during crash recovery. 
+@param type fake read request +@param offset additional context */ +void os_fake_read(const IORequest &type, os_offset_t offset) +{ + tpool::aiocb *cb= read_slots->acquire(); + + cb->m_group= read_slots->get_task_group(); + cb->m_fh= type.node->handle.m_file; + cb->m_buffer= nullptr; + cb->m_len= 0; + cb->m_offset= offset; + cb->m_opcode= tpool::aio_opcode::AIO_PREAD; + new (cb->m_userdata) IORequest{type}; + cb->m_internal_task.m_func= fake_io_callback; + cb->m_internal_task.m_arg= cb; + cb->m_internal_task.m_group= cb->m_group; + + srv_thread_pool->submit_task(&cb->m_internal_task); +} + + +/** Request a read or write. +@param type I/O request +@param buf buffer +@param offset file offset +@param n number of bytes +@retval DB_SUCCESS if request was queued successfully +@retval DB_IO_ERROR on I/O error */ +dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, size_t n) +{ + ut_ad(n > 0); + ut_ad(!(n & 511)); /* payload of page_compressed tables */ + ut_ad((offset % UNIV_ZIP_SIZE_MIN) == 0); + ut_ad((reinterpret_cast<size_t>(buf) % UNIV_ZIP_SIZE_MIN) == 0); + ut_ad(type.is_read() || type.is_write()); + ut_ad(type.node); + ut_ad(type.node->is_open()); + +#ifdef WIN_ASYNC_IO + ut_ad((n & 0xFFFFFFFFUL) == n); +#endif /* WIN_ASYNC_IO */ + +#ifdef UNIV_PFS_IO + PSI_file_locker_state state; + PSI_file_locker* locker= nullptr; + register_pfs_file_io_begin(&state, locker, type.node->handle, n, + type.is_write() + ? PSI_FILE_WRITE : PSI_FILE_READ, + __FILE__, __LINE__); +#endif /* UNIV_PFS_IO */ + dberr_t err = DB_SUCCESS; + + if (!type.is_async()) { + err = type.is_read() + ? 
os_file_read_func(type, type.node->handle, + buf, offset, n, nullptr) + : os_file_write_func(type, type.node->name, + type.node->handle, + buf, offset, n); +func_exit: +#ifdef UNIV_PFS_IO + register_pfs_file_io_end(locker, n); +#endif /* UNIV_PFS_IO */ + return err; + } + + io_slots* slots; + tpool::callback_func callback; + tpool::aio_opcode opcode; + + if (type.is_read()) { + ++os_n_file_reads; + slots = read_slots; + callback = read_io_callback; + opcode = tpool::aio_opcode::AIO_PREAD; + } else { + ++os_n_file_writes; + slots = write_slots; + callback = write_io_callback; + opcode = tpool::aio_opcode::AIO_PWRITE; + } + + compile_time_assert(sizeof(IORequest) <= tpool::MAX_AIO_USERDATA_LEN); + tpool::aiocb* cb = slots->acquire(); + + cb->m_buffer = buf; + cb->m_callback = callback; + cb->m_group = slots->get_task_group(); + cb->m_fh = type.node->handle.m_file; + cb->m_len = (int)n; + cb->m_offset = offset; + cb->m_opcode = opcode; + new (cb->m_userdata) IORequest{type}; + + if (srv_thread_pool->submit_io(cb)) { + slots->release(cb); + os_file_handle_error(type.node->name, type.is_read() + ? "aio read" : "aio write"); + err = DB_IO_ERROR; + type.node->space->release(); + } + + goto func_exit; +} + +/** Prints info of the aio arrays. 
+@param[in,out] file file where to print */ +void +os_aio_print(FILE* file) +{ + time_t current_time; + double time_elapsed; + + current_time = time(NULL); + time_elapsed = 0.001 + difftime(current_time, os_last_printout); + + fprintf(file, + "Pending flushes (fsync): " ULINTPF "\n" + ULINTPF " OS file reads, %zu OS file writes, %zu OS fsyncs\n", + ulint{fil_n_pending_tablespace_flushes}, + ulint{os_n_file_reads}, + static_cast<size_t>(os_n_file_writes), + static_cast<size_t>(os_n_fsyncs)); + + const ulint n_reads = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_READS)); + const ulint n_writes = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_WRITES)); + + if (n_reads != 0 || n_writes != 0) { + fprintf(file, + ULINTPF " pending reads, " ULINTPF " pending writes\n", + n_reads, n_writes); + } + + ulint avg_bytes_read = (os_n_file_reads == os_n_file_reads_old) + ? 0 + : os_bytes_read_since_printout + / (os_n_file_reads - os_n_file_reads_old); + + fprintf(file, + "%.2f reads/s, " ULINTPF " avg bytes/read," + " %.2f writes/s, %.2f fsyncs/s\n", + static_cast<double>(os_n_file_reads - os_n_file_reads_old) + / time_elapsed, + avg_bytes_read, + static_cast<double>(os_n_file_writes - os_n_file_writes_old) + / time_elapsed, + static_cast<double>(os_n_fsyncs - os_n_fsyncs_old) + / time_elapsed); + + os_n_file_reads_old = os_n_file_reads; + os_n_file_writes_old = os_n_file_writes; + os_n_fsyncs_old = os_n_fsyncs; + os_bytes_read_since_printout = 0; + + os_last_printout = current_time; +} + +/** Refreshes the statistics used to print per-second averages. */ +void +os_aio_refresh_stats() +{ + os_n_fsyncs_old = os_n_fsyncs; + + os_bytes_read_since_printout = 0; + + os_n_file_reads_old = os_n_file_reads; + + os_n_file_writes_old = os_n_file_writes; + + os_n_fsyncs_old = os_n_fsyncs; + + os_bytes_read_since_printout = 0; + + os_last_printout = time(NULL); +} + + +/** +Set the file create umask +@param[in] umask The umask to use for file creation. 
*/ +void +os_file_set_umask(ulint umask) +{ + os_innodb_umask = umask; +} + +#ifdef _WIN32 + +/* Checks whether physical drive is on SSD.*/ +static bool is_drive_on_ssd(DWORD nr) +{ + char physical_drive_path[32]; + snprintf(physical_drive_path, sizeof(physical_drive_path), + "\\\\.\\PhysicalDrive%lu", nr); + + HANDLE h= CreateFile(physical_drive_path, 0, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + nullptr, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, nullptr); + if (h == INVALID_HANDLE_VALUE) + return false; + + DEVICE_SEEK_PENALTY_DESCRIPTOR seek_penalty; + STORAGE_PROPERTY_QUERY storage_query{}; + storage_query.PropertyId= StorageDeviceSeekPenaltyProperty; + storage_query.QueryType= PropertyStandardQuery; + + bool on_ssd= false; + DWORD bytes_written; + if (DeviceIoControl(h, IOCTL_STORAGE_QUERY_PROPERTY, &storage_query, + sizeof storage_query, &seek_penalty, sizeof seek_penalty, + &bytes_written, nullptr)) + { + on_ssd= !seek_penalty.IncursSeekPenalty; + } + else + { + on_ssd= false; + } + CloseHandle(h); + return on_ssd; +} + +/* + Checks whether volume is on SSD, by checking all physical drives + in that volume. 
+*/ +static bool is_volume_on_ssd(const char *volume_mount_point) +{ + char volume_name[MAX_PATH]; + + if (!GetVolumeNameForVolumeMountPoint(volume_mount_point, volume_name, + array_elements(volume_name))) + { + /* This can fail, e.g if file is on network share */ + return false; + } + + /* Chomp last backslash, this is needed to open volume.*/ + size_t length= strlen(volume_name); + if (length && volume_name[length - 1] == '\\') + volume_name[length - 1]= 0; + + /* Open volume handle */ + HANDLE volume_handle= CreateFile( + volume_name, 0, FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + nullptr, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, nullptr); + + if (volume_handle == INVALID_HANDLE_VALUE) + return false; + + /* + Enumerate all volume extends, check whether all of them are on SSD + */ + + /* Anticipate common case where there is only one extent.*/ + VOLUME_DISK_EXTENTS single_extent; + + /* But also have a place to manage allocated data.*/ + std::unique_ptr<BYTE[]> lifetime; + + DWORD bytes_written; + VOLUME_DISK_EXTENTS *extents= nullptr; + if (DeviceIoControl(volume_handle, IOCTL_VOLUME_GET_VOLUME_DISK_EXTENTS, + nullptr, 0, &single_extent, sizeof(single_extent), + &bytes_written, nullptr)) + { + /* Worked on the first try. 
Use the preallocated buffer.*/ + extents= &single_extent; + } + else + { + VOLUME_DISK_EXTENTS *last_query= &single_extent; + while (GetLastError() == ERROR_MORE_DATA) + { + DWORD extentCount= last_query->NumberOfDiskExtents; + DWORD allocatedSize= + FIELD_OFFSET(VOLUME_DISK_EXTENTS, Extents[extentCount]); + lifetime.reset(new BYTE[allocatedSize]); + last_query= (VOLUME_DISK_EXTENTS *) lifetime.get(); + if (DeviceIoControl(volume_handle, IOCTL_VOLUME_GET_VOLUME_DISK_EXTENTS, + nullptr, 0, last_query, allocatedSize, + &bytes_written, nullptr)) + { + extents= last_query; + break; + } + } + } + CloseHandle(volume_handle); + if (!extents) + return false; + + for (DWORD i= 0; i < extents->NumberOfDiskExtents; i++) + if (!is_drive_on_ssd(extents->Extents[i].DiskNumber)) + return false; + + return true; +} + +#include <unordered_map> +static bool is_path_on_ssd(char *file_path) +{ + /* Preset result, in case something fails, e.g we're on network drive.*/ + char volume_path[MAX_PATH]; + if (!GetVolumePathName(file_path, volume_path, array_elements(volume_path))) + return false; + return is_volume_on_ssd(volume_path); +} + +static bool is_file_on_ssd(HANDLE handle, char *file_path) +{ + ULONGLONG volume_serial_number; + FILE_ID_INFO info; + if(!GetFileInformationByHandleEx(handle, FileIdInfo, &info, sizeof(info))) + return false; + volume_serial_number= info.VolumeSerialNumber; + + static std::unordered_map<ULONGLONG, bool> cache; + static SRWLOCK lock= SRWLOCK_INIT; + bool found; + bool result; + AcquireSRWLockShared(&lock); + auto e= cache.find(volume_serial_number); + if ((found= e != cache.end())) + result= e->second; + ReleaseSRWLockShared(&lock); + if (!found) + { + result= is_path_on_ssd(file_path); + /* Update cache */ + AcquireSRWLockExclusive(&lock); + cache[volume_serial_number]= result; + ReleaseSRWLockExclusive(&lock); + } + return result; +} + +#endif + +void fil_node_t::find_metadata(os_file_t file +#ifndef _WIN32 + , bool create, struct stat *statbuf +#endif 
+ ) +{ + if (!is_open()) + { + handle= file; + ut_ad(is_open()); + } + + if (!space->is_compressed()) + punch_hole= 0; + else if (my_test_if_thinly_provisioned(file)) + punch_hole= 2; + else + punch_hole= IF_WIN(, !create ||) os_is_sparse_file_supported(file); + +#ifdef _WIN32 + on_ssd= is_file_on_ssd(file, name); + FILE_STORAGE_INFO info; + if (GetFileInformationByHandleEx(file, FileStorageInfo, &info, sizeof info)) + block_size= info.PhysicalBytesPerSectorForAtomicity; + else + block_size= 512; +#else + struct stat sbuf; + if (!statbuf && !fstat(file, &sbuf)) + { + MSAN_STAT_WORKAROUND(&sbuf); + statbuf= &sbuf; + } + if (statbuf) + block_size= statbuf->st_blksize; +# ifdef __linux__ + on_ssd= statbuf && fil_system.is_ssd(statbuf->st_dev); +# endif +#endif + + if (space->purpose != FIL_TYPE_TABLESPACE) + { + /* For temporary tablespace or during IMPORT TABLESPACE, we + disable neighbour flushing and do not care about atomicity. */ + on_ssd= true; + atomic_write= true; + } + else + /* On Windows, all single sector writes are atomic, as per + WriteFile() documentation on MSDN. */ + atomic_write= srv_use_atomic_writes && + IF_WIN(srv_page_size == block_size, + my_test_if_atomic_write(file, space->physical_size())); +} + +/** Read the first page of a data file. 
+@return whether the page was found valid */ +bool fil_node_t::read_page0() +{ + mysql_mutex_assert_owner(&fil_system.mutex); + const unsigned psize= space->physical_size(); +#ifndef _WIN32 + struct stat statbuf; + if (fstat(handle, &statbuf)) + return false; + MSAN_STAT_WORKAROUND(&statbuf); + os_offset_t size_bytes= statbuf.st_size; +#else + os_offset_t size_bytes= os_file_get_size(handle); + ut_a(size_bytes != (os_offset_t) -1); +#endif + const uint32_t min_size= FIL_IBD_FILE_INITIAL_SIZE * psize; + + if (size_bytes < min_size) + { + ib::error() << "The size of the file " << name + << " is only " << size_bytes + << " bytes, should be at least " << min_size; + return false; + } + + if (!deferred) + { + page_t *page= static_cast<byte*>(aligned_malloc(psize, psize)); + if (os_file_read(IORequestRead, handle, page, 0, psize, nullptr) + != DB_SUCCESS) + { + sql_print_error("InnoDB: Unable to read first page of file %s", name); +corrupted: + aligned_free(page); + return false; + } + + const ulint space_id= memcmp_aligned<2> + (FIL_PAGE_SPACE_ID + page, + FSP_HEADER_OFFSET + FSP_SPACE_ID + page, 4) + ? 
ULINT_UNDEFINED + : mach_read_from_4(FIL_PAGE_SPACE_ID + page); + uint32_t flags= fsp_header_get_flags(page); + const uint32_t size= fsp_header_get_field(page, FSP_SIZE); + const uint32_t free_limit= fsp_header_get_field(page, FSP_FREE_LIMIT); + const uint32_t free_len= flst_get_len(FSP_HEADER_OFFSET + FSP_FREE + page); + if (!fil_space_t::is_valid_flags(flags, space->id)) + { + uint32_t cflags= fsp_flags_convert_from_101(flags); + if (cflags == UINT32_MAX) + { +invalid: + ib::error() << "Expected tablespace flags " + << ib::hex(space->flags) + << " but found " << ib::hex(flags) + << " in the file " << name; + goto corrupted; + } + + uint32_t cf= cflags & ~FSP_FLAGS_MEM_MASK; + uint32_t sf= space->flags & ~FSP_FLAGS_MEM_MASK; + + if (!fil_space_t::is_flags_equal(cf, sf) && + !fil_space_t::is_flags_equal(sf, cf)) + goto invalid; + flags= cflags; + } + + ut_ad(!(flags & FSP_FLAGS_MEM_MASK)); + + /* Try to read crypt_data from page 0 if it is not yet read. */ + if (!space->crypt_data) + space->crypt_data= fil_space_read_crypt_data( + fil_space_t::zip_size(flags), page); + aligned_free(page); + + if (UNIV_UNLIKELY(space_id != space->id)) + { + ib::error() << "Expected tablespace id " << space->id + << " but found " << space_id + << " in the file " << name; + return false; + } + + space->flags= (space->flags & FSP_FLAGS_MEM_MASK) | flags; + ut_ad(space->free_limit == 0 || space->free_limit == free_limit); + ut_ad(space->free_len == 0 || space->free_len == free_len); + space->size_in_header= size; + space->free_limit= free_limit; + space->free_len= free_len; + } + + IF_WIN(find_metadata(), find_metadata(handle, false, &statbuf)); + /* Truncate the size to a multiple of extent size. */ + ulint mask= psize * FSP_EXTENT_SIZE - 1; + + if (size_bytes <= mask); + /* .ibd files start smaller than an + extent size. Do not truncate valid data. 
*/ + else + size_bytes&= ~os_offset_t(mask); + + this->size= uint32_t(size_bytes / psize); + space->set_sizes(this->size); + return true; +} |