diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:07:14 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:07:14 +0000 |
commit | a175314c3e5827eb193872241446f2f8f5c9d33c (patch) | |
tree | cd3d60ca99ae00829c52a6ca79150a5b6e62528b /storage/innobase/os | |
parent | Initial commit. (diff) | |
download | mariadb-10.5-upstream.tar.xz mariadb-10.5-upstream.zip |
Adding upstream version 1:10.5.12.upstream/1%10.5.12upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'storage/innobase/os')
-rw-r--r-- | storage/innobase/os/os0event.cc | 515 | ||||
-rw-r--r-- | storage/innobase/os/os0file.cc | 4349 | ||||
-rw-r--r-- | storage/innobase/os/os0thread.cc | 131 |
3 files changed, 4995 insertions, 0 deletions
diff --git a/storage/innobase/os/os0event.cc b/storage/innobase/os/os0event.cc new file mode 100644 index 00000000..f18633cc --- /dev/null +++ b/storage/innobase/os/os0event.cc @@ -0,0 +1,515 @@ +/***************************************************************************** + +Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file os/os0event.cc +The interface to the operating system condition variables. + +Created 2012-09-23 Sunny Bains +*******************************************************/ + +#include "os0event.h" +#include "ut0mutex.h" +#include <my_sys.h> + +#ifdef _WIN32 +#include <windows.h> +#include <synchapi.h> +/** Native condition variable. */ +typedef CONDITION_VARIABLE os_cond_t; +#else +/** Native condition variable */ +typedef pthread_cond_t os_cond_t; +#endif /* _WIN32 */ + +/** InnoDB condition variable. */ +struct os_event { + os_event() UNIV_NOTHROW; + + ~os_event() UNIV_NOTHROW; + + /** + Destroys a condition variable */ + void destroy() UNIV_NOTHROW + { +#ifndef _WIN32 + int ret = pthread_cond_destroy(&cond_var); + ut_a(ret == 0); +#endif /* !_WIN32 */ + + mutex.destroy(); + } + + /** Set the event */ + void set() UNIV_NOTHROW + { + mutex.enter(); + + if (!m_set) { + broadcast(); + } + + mutex.exit(); + } + + int64_t reset() UNIV_NOTHROW + { + mutex.enter(); + + if (m_set) { + m_set = false; + } + + int64_t ret = signal_count; + + mutex.exit(); + + return(ret); + } + + /** + Waits for an event object until it is in the signaled state. + + Typically, if the event has been signalled after the os_event_reset() + we'll return immediately because event->m_set == true. + There are, however, situations (e.g.: sync_array code) where we may + lose this information. For example: + + thread A calls os_event_reset() + thread B calls os_event_set() [event->m_set == true] + thread C calls os_event_reset() [event->m_set == false] + thread A calls os_event_wait() [infinite wait!] + thread C calls os_event_wait() [infinite wait!] + + Where such a scenario is possible, to avoid infinite wait, the + value returned by reset() should be passed in as + reset_sig_count. */ + void wait_low(int64_t reset_sig_count) UNIV_NOTHROW; + + /** + Waits for an event object until it is in the signaled state or + a timeout is exceeded. + @param time_in_usec - timeout in microseconds, + or OS_SYNC_INFINITE_TIME + @param reset_sig_count- zero or the value returned by + previous call of os_event_reset(). + @return 0 if success, OS_SYNC_TIME_EXCEEDED if timeout was exceeded */ + ulint wait_time_low( + ulint time_in_usec, + int64_t reset_sig_count) UNIV_NOTHROW; + + /** @return true if the event is in the signalled state. */ + bool is_set() const UNIV_NOTHROW + { + mutex.enter(); + bool is_set = m_set; + mutex.exit(); + return is_set; + } + +private: + /** + Initialize a condition variable */ + void init() UNIV_NOTHROW + { + + mutex.init(); + +#ifdef _WIN32 + InitializeConditionVariable(&cond_var); +#else + { + int ret; + + ret = pthread_cond_init(&cond_var, NULL); + ut_a(ret == 0); + } +#endif /* _WIN32 */ + } + + /** + Wait on condition variable */ + void wait() UNIV_NOTHROW + { +#ifdef _WIN32 + if (!SleepConditionVariableCS(&cond_var, mutex, INFINITE)) { + ut_error; + } +#else + { + int ret; + + ret = pthread_cond_wait(&cond_var, mutex); + ut_a(ret == 0); + } +#endif /* _WIN32 */ + } + + /** + Wakes all threads waiting for condition variable */ + void broadcast() UNIV_NOTHROW + { + m_set = true; + ++signal_count; + +#ifdef _WIN32 + WakeAllConditionVariable(&cond_var); +#else + { + int ret; + + ret = pthread_cond_broadcast(&cond_var); + ut_a(ret == 0); + } +#endif /* _WIN32 */ + } + + /** + Wakes one thread waiting for condition variable */ + void signal() UNIV_NOTHROW + { +#ifdef _WIN32 + WakeConditionVariable(&cond_var); +#else + { + int ret; + + ret = pthread_cond_signal(&cond_var); + ut_a(ret == 0); + } +#endif /* _WIN32 */ + } + + /** + Do a timed wait on condition variable. + @param abstime - timeout + @param time_in_ms - timeout in milliseconds. + @return true if timed out, false otherwise */ + bool timed_wait( +#ifndef _WIN32 + const timespec* abstime +#else + DWORD time_in_ms +#endif /* !_WIN32 */ + ); + +private: + bool m_set; /*!< this is true when the + event is in the signaled + state, i.e., a thread does + not stop if it tries to wait + for this event */ + int64_t signal_count; /*!< this is incremented + each time the event becomes + signaled */ + mutable OSMutex mutex; /*!< this mutex protects + the next fields */ + + + os_cond_t cond_var; /*!< condition variable is + used in waiting for the event */ + +protected: + // Disable copying + os_event(const os_event&); + os_event& operator=(const os_event&); +}; + +/** +Do a timed wait on condition variable. +@param abstime - absolute time to wait +@param time_in_ms - timeout in milliseconds +@return true if timed out */ +bool +os_event::timed_wait( +#ifndef _WIN32 + const timespec* abstime +#else + DWORD time_in_ms +#endif /* !_WIN32 */ +) +{ +#ifdef _WIN32 + BOOL ret; + + ret = SleepConditionVariableCS(&cond_var, mutex, time_in_ms); + + if (!ret) { + DWORD err = GetLastError(); + + /* FQDN=msdn.microsoft.com + @see http://$FQDN/en-us/library/ms686301%28VS.85%29.aspx, + + "Condition variables are subject to spurious wakeups + (those not associated with an explicit wake) and stolen wakeups + (another thread manages to run before the woken thread)." + Check for both types of timeouts. + Conditions are checked by the caller.*/ + if (err == WAIT_TIMEOUT || err == ERROR_TIMEOUT) { + return(true); + } + } + + ut_a(ret); + + return(false); +#else + int ret; + + ret = pthread_cond_timedwait(&cond_var, mutex, abstime); + + switch (ret) { + case 0: + case ETIMEDOUT: + /* We play it safe by checking for EINTR even though + according to the POSIX documentation it can't return EINTR. */ + case EINTR: + break; + + default: + ib::error() << "pthread_cond_timedwait() returned: " << ret + << ": abstime={" << abstime->tv_sec << "," + << abstime->tv_nsec << "}"; + ut_error; + } + + return(ret == ETIMEDOUT); +#endif /* _WIN32 */ +} + +/** +Waits for an event object until it is in the signaled state. + +Typically, if the event has been signalled after the os_event_reset() +we'll return immediately because event->m_set == true. +There are, however, situations (e.g.: sync_array code) where we may +lose this information. For example: + +thread A calls os_event_reset() +thread B calls os_event_set() [event->m_set == true] +thread C calls os_event_reset() [event->m_set == false] +thread A calls os_event_wait() [infinite wait!] +thread C calls os_event_wait() [infinite wait!] + +Where such a scenario is possible, to avoid infinite wait, the +value returned by reset() should be passed in as +reset_sig_count. */ +void +os_event::wait_low( + int64_t reset_sig_count) UNIV_NOTHROW +{ + mutex.enter(); + + if (!reset_sig_count) { + reset_sig_count = signal_count; + } + + while (!m_set && signal_count == reset_sig_count) { + + wait(); + + /* Spurious wakeups may occur: we have to check if the + event really has been signaled after we came here to wait. */ + } + + mutex.exit(); +} + +/** +Waits for an event object until it is in the signaled state or +a timeout is exceeded. +@param time_in_usec - timeout in microseconds, or OS_SYNC_INFINITE_TIME +@param reset_sig_count - zero or the value returned by previous call + of os_event_reset(). +@return 0 if success, OS_SYNC_TIME_EXCEEDED if timeout was exceeded */ +ulint +os_event::wait_time_low( + ulint time_in_usec, + int64_t reset_sig_count) UNIV_NOTHROW +{ + bool timed_out = false; + +#ifdef _WIN32 + DWORD time_in_ms; + + if (time_in_usec != OS_SYNC_INFINITE_TIME) { + time_in_ms = DWORD(time_in_usec / 1000); + } else { + time_in_ms = INFINITE; + } +#else + struct timespec abstime; + + if (time_in_usec != OS_SYNC_INFINITE_TIME) { + ulonglong usec = ulonglong(time_in_usec) + my_hrtime().val; + abstime.tv_sec = static_cast<time_t>(usec / 1000000); + abstime.tv_nsec = static_cast<uint>((usec % 1000000) * 1000); + } else { + abstime.tv_nsec = 999999999; + abstime.tv_sec = (time_t) ULINT_MAX; + } + + ut_a(abstime.tv_nsec <= 999999999); + +#endif /* _WIN32 */ + + mutex.enter(); + + if (!reset_sig_count) { + reset_sig_count = signal_count; + } + + do { + if (m_set || signal_count != reset_sig_count) { + + break; + } + +#ifndef _WIN32 + timed_out = timed_wait(&abstime); +#else + timed_out = timed_wait(time_in_ms); +#endif /* !_WIN32 */ + + } while (!timed_out); + + mutex.exit(); + + return(timed_out ? OS_SYNC_TIME_EXCEEDED : 0); +} + +/** Constructor */ +os_event::os_event() UNIV_NOTHROW +{ + init(); + + m_set = false; + + /* We return this value in os_event_reset(), + which can then be be used to pass to the + os_event_wait_low(). The value of zero is + reserved in os_event_wait_low() for the case + when the caller does not want to pass any + signal_count value. To distinguish between + the two cases we initialize signal_count + to 1 here. */ + + signal_count = 1; +} + +/** Destructor */ +os_event::~os_event() UNIV_NOTHROW +{ + destroy(); +} + +/** +Creates an event semaphore, i.e., a semaphore which may just have two +states: signaled and nonsignaled. The created event is manual reset: it +must be reset explicitly by calling sync_os_reset_event. +@return the event handle */ +os_event_t os_event_create(const char*) +{ + return(UT_NEW_NOKEY(os_event())); +} + +/** +Check if the event is set. +@return true if set */ +bool +os_event_is_set( +/*============*/ + const os_event_t event) /*!< in: event to test */ +{ + return(event->is_set()); +} + +/** +Sets an event semaphore to the signaled state: lets waiting threads +proceed. */ +void +os_event_set( +/*=========*/ + os_event_t event) /*!< in/out: event to set */ +{ + event->set(); +} + +/** +Resets an event semaphore to the nonsignaled state. Waiting threads will +stop to wait for the event. +The return value should be passed to os_even_wait_low() if it is desired +that this thread should not wait in case of an intervening call to +os_event_set() between this os_event_reset() and the +os_event_wait_low() call. See comments for os_event_wait_low(). +@return current signal_count. */ +int64_t +os_event_reset( +/*===========*/ + os_event_t event) /*!< in/out: event to reset */ +{ + return(event->reset()); +} + +/** +Waits for an event object until it is in the signaled state or +a timeout is exceeded. +@return 0 if success, OS_SYNC_TIME_EXCEEDED if timeout was exceeded */ +ulint +os_event_wait_time_low( +/*===================*/ + os_event_t event, /*!< in/out: event to wait */ + ulint time_in_usec, /*!< in: timeout in + microseconds, or + OS_SYNC_INFINITE_TIME */ + int64_t reset_sig_count) /*!< in: zero or the value + returned by previous call of + os_event_reset(). */ +{ + return(event->wait_time_low(time_in_usec, reset_sig_count)); +} + +/** +Waits for an event object until it is in the signaled state. + +Where such a scenario is possible, to avoid infinite wait, the +value returned by os_event_reset() should be passed in as +reset_sig_count. */ +void +os_event_wait_low( +/*==============*/ + os_event_t event, /*!< in: event to wait */ + int64_t reset_sig_count) /*!< in: zero or the value + returned by previous call of + os_event_reset(). */ +{ + event->wait_low(reset_sig_count); +} + +/** +Frees an event object. */ +void +os_event_destroy( +/*=============*/ + os_event_t& event) /*!< in/own: event to free */ + +{ + UT_DELETE(event); + event = NULL; +} diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc new file mode 100644 index 00000000..7a6829e7 --- /dev/null +++ b/storage/innobase/os/os0file.cc @@ -0,0 +1,4349 @@ +/*********************************************************************** + +Copyright (c) 1995, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2009, Percona Inc. +Copyright (c) 2013, 2021, MariaDB Corporation. + +Portions of this file contain modifications contributed and copyrighted +by Percona Inc.. Those modifications are +gratefully acknowledged and are described briefly in the InnoDB +documentation. The contributions by Percona Inc. are incorporated with +their permission, and subject to the conditions contained in the file +COPYING.Percona. + +This program is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +***********************************************************************/ + +/**************************************************//** +@file os/os0file.cc +The interface to the operating system file i/o primitives + +Created 10/21/1995 Heikki Tuuri +*******************************************************/ + +#ifndef UNIV_INNOCHECKSUM +#include "os0file.h" +#include "sql_const.h" + +#ifdef UNIV_LINUX +# include <sys/types.h> +# include <sys/stat.h> +#endif + +#include "srv0srv.h" +#include "srv0start.h" +#include "fil0fil.h" +#include "fsp0fsp.h" +#ifdef HAVE_LINUX_UNISTD_H +#include "unistd.h" +#endif +#include "os0event.h" +#include "os0thread.h" + +#include <vector> +#include <tpool_structs.h> + +#ifdef LINUX_NATIVE_AIO +#include <libaio.h> +#endif /* LINUX_NATIVE_AIO */ + +#ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE +# include <fcntl.h> +# include <linux/falloc.h> +#endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */ + +#if defined(UNIV_LINUX) && defined(HAVE_SYS_IOCTL_H) +# include <sys/ioctl.h> +# ifndef DFS_IOCTL_ATOMIC_WRITE_SET +# define DFS_IOCTL_ATOMIC_WRITE_SET _IOW(0x95, 2, uint) +# endif +#endif + +#ifdef _WIN32 +#include <winioctl.h> +#else +// my_test_if_atomic_write() +#include <my_sys.h> +#endif + +#include "buf0dblwr.h" + +#include <thread> +#include <chrono> + +/* Per-IO operation environment*/ +class io_slots +{ +private: + tpool::cache<tpool::aiocb> m_cache; + tpool::task_group m_group; + int m_max_aio; +public: + io_slots(int max_submitted_io, int max_callback_concurrency) : + m_cache(max_submitted_io), + m_group(max_callback_concurrency), + m_max_aio(max_submitted_io) + { + } + /* Get cached AIO control block */ + tpool::aiocb* acquire() + { + return m_cache.get(); + } + /* Release AIO control block back to cache */ + void release(tpool::aiocb* aiocb) + { + m_cache.put(aiocb); + } + + bool contains(tpool::aiocb* aiocb) + { + return m_cache.contains(aiocb); + } + + /* Wait for completions of all AIO operations */ + void wait() + { + m_cache.wait(); + } + + size_t pending_io_count() + { + return (size_t)m_max_aio - m_cache.size(); + } + + tpool::task_group* get_task_group() + { + return &m_group; + } + + ~io_slots() + { + wait(); + } +}; + +static io_slots *read_slots; +static io_slots *write_slots; + +/** Number of retries for partial I/O's */ +constexpr ulint NUM_RETRIES_ON_PARTIAL_IO = 10; + +/* This specifies the file permissions InnoDB uses when it creates files in +Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to +my_umask */ + +#ifndef _WIN32 +/** Umask for creating files */ +static ulint os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP; +#else +/** Umask for creating files */ +static ulint os_innodb_umask = 0; +#endif /* _WIN32 */ + + +#ifdef WITH_INNODB_DISALLOW_WRITES +#define WAIT_ALLOW_WRITES() os_event_wait(srv_allow_writes_event) +#else +#define WAIT_ALLOW_WRITES() do { } while (0) +#endif /* WITH_INNODB_DISALLOW_WRITES */ + + +Atomic_counter<ulint> os_n_file_reads; +static ulint os_bytes_read_since_printout; +ulint os_n_file_writes; +ulint os_n_fsyncs; +static ulint os_n_file_reads_old; +static ulint os_n_file_writes_old; +static ulint os_n_fsyncs_old; + +static time_t os_last_printout; +bool os_has_said_disk_full; + +/** Default Zip compression level */ +extern uint page_zip_level; + +#ifdef UNIV_PFS_IO +/* Keys to register InnoDB I/O with performance schema */ +mysql_pfs_key_t innodb_data_file_key; +mysql_pfs_key_t innodb_log_file_key; +mysql_pfs_key_t innodb_temp_file_key; +#endif + +/** Handle errors for file operations. +@param[in] name name of a file or NULL +@param[in] operation operation +@param[in] should_abort whether to abort on an unknown error +@param[in] on_error_silent whether to suppress reports of non-fatal errors +@return true if we should retry the operation */ +static MY_ATTRIBUTE((warn_unused_result)) +bool +os_file_handle_error_cond_exit( + const char* name, + const char* operation, + bool should_abort, + bool on_error_silent); + +/** Does error handling when a file operation fails. +@param[in] name name of a file or NULL +@param[in] operation operation name that failed +@return true if we should retry the operation */ +static +bool +os_file_handle_error( + const char* name, + const char* operation) +{ + /* Exit in case of unknown error */ + return(os_file_handle_error_cond_exit(name, operation, true, false)); +} + +/** Does error handling when a file operation fails. +@param[in] name name of a file or NULL +@param[in] operation operation name that failed +@param[in] on_error_silent if true then don't print any message to the log. +@return true if we should retry the operation */ +static +bool +os_file_handle_error_no_exit( + const char* name, + const char* operation, + bool on_error_silent) +{ + /* Don't exit in case of unknown error */ + return(os_file_handle_error_cond_exit( + name, operation, false, on_error_silent)); +} + +/** Handle RENAME error. +@param name old name of the file +@param new_name new name of the file */ +static void os_file_handle_rename_error(const char* name, const char* new_name) +{ + if (os_file_get_last_error(true) != OS_FILE_DISK_FULL) { + ib::error() << "Cannot rename file '" << name << "' to '" + << new_name << "'"; + } else if (!os_has_said_disk_full) { + os_has_said_disk_full = true; + /* Disk full error is reported irrespective of the + on_error_silent setting. */ + ib::error() << "Full disk prevents renaming file '" + << name << "' to '" << new_name << "'"; + } +} + + +#ifdef _WIN32 + +/** + Wrapper around Windows DeviceIoControl() function. + + Works synchronously, also in case for handle opened + for async access (i.e with FILE_FLAG_OVERLAPPED). + + Accepts the same parameters as DeviceIoControl(),except + last parameter (OVERLAPPED). +*/ +static +BOOL +os_win32_device_io_control( + HANDLE handle, + DWORD code, + LPVOID inbuf, + DWORD inbuf_size, + LPVOID outbuf, + DWORD outbuf_size, + LPDWORD bytes_returned +) +{ + OVERLAPPED overlapped = { 0 }; + overlapped.hEvent = tpool::win_get_syncio_event(); + BOOL result = DeviceIoControl(handle, code, inbuf, inbuf_size, outbuf, + outbuf_size, NULL, &overlapped); + + if (result || (GetLastError() == ERROR_IO_PENDING)) { + /* Wait for async io to complete */ + result = GetOverlappedResult(handle, &overlapped, bytes_returned, TRUE); + } + + return result; +} + +#endif + + + +/** Helper class for doing synchronous file IO. Currently, the objective +is to hide the OS specific code, so that the higher level functions aren't +peppered with #ifdef. Makes the code flow difficult to follow. */ +class SyncFileIO +{ +public: + /** Constructor + @param[in] fh File handle + @param[in,out] buf Buffer to read/write + @param[in] n Number of bytes to read/write + @param[in] offset Offset where to read or write */ + SyncFileIO(os_file_t fh, void *buf, ulint n, os_offset_t offset) : + m_fh(fh), m_buf(buf), m_n(static_cast<ssize_t>(n)), m_offset(offset) + { ut_ad(m_n > 0); } + + /** Do the read/write + @param[in] request The IO context and type + @return the number of bytes read/written or negative value on error */ + ssize_t execute(const IORequest &request); + + /** Move the read/write offset up to where the partial IO succeeded. + @param[in] n_bytes The number of bytes to advance */ + void advance(ssize_t n_bytes) + { + m_offset+= n_bytes; + ut_ad(m_n >= n_bytes); + m_n-= n_bytes; + m_buf= reinterpret_cast<uchar*>(m_buf) + n_bytes; + } + +private: + /** Open file handle */ + const os_file_t m_fh; + /** Buffer to read/write */ + void *m_buf; + /** Number of bytes to read/write */ + ssize_t m_n; + /** Offset from where to read/write */ + os_offset_t m_offset; +}; + +#undef USE_FILE_LOCK +#ifndef _WIN32 +/* On Windows, mandatory locking is used */ +# define USE_FILE_LOCK +#endif +#ifdef USE_FILE_LOCK +/** Obtain an exclusive lock on a file. +@param[in] fd file descriptor +@param[in] name file name +@return 0 on success */ +static +int +os_file_lock( + int fd, + const char* name) +{ + if (my_disable_locking) { + return 0; + } + + struct flock lk; + + lk.l_type = F_WRLCK; + lk.l_whence = SEEK_SET; + lk.l_start = lk.l_len = 0; + + if (fcntl(fd, F_SETLK, &lk) == -1) { + + ib::error() + << "Unable to lock " << name + << " error: " << errno; + + if (errno == EAGAIN || errno == EACCES) { + + ib::info() + << "Check that you do not already have" + " another mysqld process using the" + " same InnoDB data or log files."; + } + + return(-1); + } + + return(0); +} +#endif /* USE_FILE_LOCK */ + + +/** Create a temporary file. This function is like tmpfile(3), but +the temporary file is created in the in the mysql server configuration +parameter (--tmpdir). +@return temporary file handle, or NULL on error */ +FILE* +os_file_create_tmpfile() +{ + FILE* file = NULL; + WAIT_ALLOW_WRITES(); + File fd = mysql_tmpfile("ib"); + + if (fd >= 0) { + file = my_fdopen(fd, 0, O_RDWR|O_TRUNC|O_CREAT|FILE_BINARY, + MYF(MY_WME)); + if (!file) { + my_close(fd, MYF(MY_WME)); + } + } + + if (file == NULL) { + + ib::error() + << "Unable to create temporary file; errno: " + << errno; + } + + return(file); +} + +/** Rewind file to its start, read at most size - 1 bytes from it to str, and +NUL-terminate str. All errors are silently ignored. This function is +mostly meant to be used with temporary files. +@param[in,out] file File to read from +@param[in,out] str Buffer where to read +@param[in] size Size of buffer */ +void +os_file_read_string( + FILE* file, + char* str, + ulint size) +{ + if (size != 0) { + rewind(file); + + size_t flen = fread(str, 1, size - 1, file); + + str[flen] = '\0'; + } +} + +/** This function returns a new path name after replacing the basename +in an old path with a new basename. The old_path is a full path +name including the extension. The tablename is in the normal +form "databasename/tablename". The new base name is found after +the forward slash. Both input strings are null terminated. + +This function allocates memory to be returned. It is the callers +responsibility to free the return value after it is no longer needed. + +@param[in] old_path Pathname +@param[in] tablename Contains new base name +@return own: new full pathname */ +char* +os_file_make_new_pathname( + const char* old_path, + const char* tablename) +{ + ulint dir_len; + char* last_slash; + char* base_name; + char* new_path; + ulint new_path_len; + + /* Split the tablename into its database and table name components. + They are separated by a '/'. */ + last_slash = strrchr((char*) tablename, '/'); + base_name = last_slash ? last_slash + 1 : (char*) tablename; + + /* Find the offset of the last slash. We will strip off the + old basename.ibd which starts after that slash. */ + last_slash = strrchr((char*) old_path, OS_PATH_SEPARATOR); + dir_len = last_slash ? ulint(last_slash - old_path) : strlen(old_path); + + /* allocate a new path and move the old directory path to it. */ + new_path_len = dir_len + strlen(base_name) + sizeof "/.ibd"; + new_path = static_cast<char*>(ut_malloc_nokey(new_path_len)); + memcpy(new_path, old_path, dir_len); + + snprintf(new_path + dir_len, new_path_len - dir_len, + "%c%s.ibd", OS_PATH_SEPARATOR, base_name); + + return(new_path); +} + +/** This function reduces a null-terminated full remote path name into +the path that is sent by MySQL for DATA DIRECTORY clause. It replaces +the 'databasename/tablename.ibd' found at the end of the path with just +'tablename'. + +Since the result is always smaller than the path sent in, no new memory +is allocated. The caller should allocate memory for the path sent in. +This function manipulates that path in place. + +If the path format is not as expected, just return. The result is used +to inform a SHOW CREATE TABLE command. +@param[in,out] data_dir_path Full path/data_dir_path */ +void +os_file_make_data_dir_path( + char* data_dir_path) +{ + /* Replace the period before the extension with a null byte. */ + char* ptr = strrchr((char*) data_dir_path, '.'); + + if (ptr == NULL) { + return; + } + + ptr[0] = '\0'; + + /* The tablename starts after the last slash. */ + ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR); + + if (ptr == NULL) { + return; + } + + ptr[0] = '\0'; + + char* tablename = ptr + 1; + + /* The databasename starts after the next to last slash. */ + ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR); + + if (ptr == NULL) { + return; + } + + ulint tablename_len = strlen(tablename); + + memmove(++ptr, tablename, tablename_len); + + ptr[tablename_len] = '\0'; +} + +/** Check if the path refers to the root of a drive using a pointer +to the last directory separator that the caller has fixed. +@param[in] path path name +@param[in] path last directory separator in the path +@return true if this path is a drive root, false if not */ +UNIV_INLINE +bool +os_file_is_root( + const char* path, + const char* last_slash) +{ + return( +#ifdef _WIN32 + (last_slash == path + 2 && path[1] == ':') || +#endif /* _WIN32 */ + last_slash == path); +} + +/** Return the parent directory component of a null-terminated path. +Return a new buffer containing the string up to, but not including, +the final component of the path. +The path returned will not contain a trailing separator. +Do not return a root path, return NULL instead. +The final component trimmed off may be a filename or a directory name. +If the final component is the only component of the path, return NULL. +It is the caller's responsibility to free the returned string after it +is no longer needed. +@param[in] path Path name +@return own: parent directory of the path */ +static +char* +os_file_get_parent_dir( + const char* path) +{ + bool has_trailing_slash = false; + + /* Find the offset of the last slash */ + const char* last_slash = strrchr(path, OS_PATH_SEPARATOR); + + if (!last_slash) { + /* No slash in the path, return NULL */ + return(NULL); + } + + /* Ok, there is a slash. Is there anything after it? */ + if (static_cast<size_t>(last_slash - path + 1) == strlen(path)) { + has_trailing_slash = true; + } + + /* Reduce repetative slashes. */ + while (last_slash > path + && last_slash[-1] == OS_PATH_SEPARATOR) { + last_slash--; + } + + /* Check for the root of a drive. */ + if (os_file_is_root(path, last_slash)) { + return(NULL); + } + + /* If a trailing slash prevented the first strrchr() from trimming + the last component of the path, trim that component now. */ + if (has_trailing_slash) { + /* Back up to the previous slash. */ + last_slash--; + while (last_slash > path + && last_slash[0] != OS_PATH_SEPARATOR) { + last_slash--; + } + + /* Reduce repetative slashes. */ + while (last_slash > path + && last_slash[-1] == OS_PATH_SEPARATOR) { + last_slash--; + } + } + + /* Check for the root of a drive. */ + if (os_file_is_root(path, last_slash)) { + return(NULL); + } + + if (last_slash - path < 0) { + /* Sanity check, it prevents gcc from trying to handle this case which + * results in warnings for some optimized builds */ + return (NULL); + } + + /* Non-trivial directory component */ + + return(mem_strdupl(path, ulint(last_slash - path))); +} +#ifdef UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR + +/* Test the function os_file_get_parent_dir. */ +void +test_os_file_get_parent_dir( + const char* child_dir, + const char* expected_dir) +{ + char* child = mem_strdup(child_dir); + char* expected = expected_dir == NULL ? NULL + : mem_strdup(expected_dir); + + /* os_file_get_parent_dir() assumes that separators are + converted to OS_PATH_SEPARATOR. */ + os_normalize_path(child); + os_normalize_path(expected); + + char* parent = os_file_get_parent_dir(child); + + bool unexpected = (expected == NULL + ? (parent != NULL) + : (0 != strcmp(parent, expected))); + if (unexpected) { + ib::fatal() << "os_file_get_parent_dir('" << child + << "') returned '" << parent + << "', instead of '" << expected << "'."; + } + ut_free(parent); + ut_free(child); + ut_free(expected); +} + +/* Test the function os_file_get_parent_dir. */ +void +unit_test_os_file_get_parent_dir() +{ + test_os_file_get_parent_dir("/usr/lib/a", "/usr/lib"); + test_os_file_get_parent_dir("/usr/", NULL); + test_os_file_get_parent_dir("//usr//", NULL); + test_os_file_get_parent_dir("usr", NULL); + test_os_file_get_parent_dir("usr//", NULL); + test_os_file_get_parent_dir("/", NULL); + test_os_file_get_parent_dir("//", NULL); + test_os_file_get_parent_dir(".", NULL); + test_os_file_get_parent_dir("..", NULL); +# ifdef _WIN32 + test_os_file_get_parent_dir("D:", NULL); + test_os_file_get_parent_dir("D:/", NULL); + test_os_file_get_parent_dir("D:\\", NULL); + test_os_file_get_parent_dir("D:/data", NULL); + test_os_file_get_parent_dir("D:/data/", NULL); + test_os_file_get_parent_dir("D:\\data\\", NULL); + test_os_file_get_parent_dir("D:///data/////", NULL); + test_os_file_get_parent_dir("D:\\\\\\data\\\\\\\\", NULL); + test_os_file_get_parent_dir("D:/data//a", "D:/data"); + test_os_file_get_parent_dir("D:\\data\\\\a", "D:\\data"); + test_os_file_get_parent_dir("D:///data//a///b/", "D:///data//a"); + test_os_file_get_parent_dir("D:\\\\\\data\\\\a\\\\\\b\\", "D:\\\\\\data\\\\a"); +#endif /* _WIN32 */ +} +#endif /* UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR */ + + +/** Creates all missing subdirectories along the given path. +@param[in] path Path name +@return DB_SUCCESS if OK, otherwise error code. */ +dberr_t +os_file_create_subdirs_if_needed( + const char* path) +{ + if (srv_read_only_mode) { + + ib::error() + << "read only mode set. Can't create " + << "subdirectories '" << path << "'"; + + return(DB_READ_ONLY); + + } + + char* subdir = os_file_get_parent_dir(path); + + if (subdir == NULL) { + /* subdir is root or cwd, nothing to do */ + return(DB_SUCCESS); + } + + /* Test if subdir exists */ + os_file_type_t type; + bool subdir_exists; + bool success = os_file_status(subdir, &subdir_exists, &type); + + if (success && !subdir_exists) { + + /* Subdir does not exist, create it */ + dberr_t err = os_file_create_subdirs_if_needed(subdir); + + if (err != DB_SUCCESS) { + + ut_free(subdir); + + return(err); + } + + success = os_file_create_directory(subdir, false); + } + + ut_free(subdir); + + return(success ? DB_SUCCESS : DB_ERROR); +} + + + +/** Do the read/write +@param[in] request The IO context and type +@return the number of bytes read/written or negative value on error */ +ssize_t +SyncFileIO::execute(const IORequest& request) +{ + ssize_t n_bytes; + + if (request.is_read()) { +#ifdef _WIN32 + n_bytes = tpool::pread(m_fh, m_buf, m_n, m_offset); +#else + n_bytes = pread(m_fh, m_buf, m_n, m_offset); +#endif + } else { + ut_ad(request.is_write()); +#ifdef _WIN32 + n_bytes = tpool::pwrite(m_fh, m_buf, m_n, m_offset); +#else + n_bytes = pwrite(m_fh, m_buf, m_n, m_offset); +#endif + } + + return(n_bytes); +} + +#ifndef _WIN32 +/** Free storage space associated with a section of the file. +@param[in] fh Open file handle +@param[in] off Starting offset (SEEK_SET) +@param[in] len Size of the hole +@return DB_SUCCESS or error code */ +static +dberr_t +os_file_punch_hole_posix( + os_file_t fh, + os_offset_t off, + os_offset_t len) +{ + +#ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE + const int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; + + int ret = fallocate(fh, mode, off, len); + + if (ret == 0) { + return(DB_SUCCESS); + } + + if (errno == ENOTSUP) { + return(DB_IO_NO_PUNCH_HOLE); + } + + ib::warn() + << "fallocate(" + <<", FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, " + << off << ", " << len << ") returned errno: " + << errno; + + return(DB_IO_ERROR); + +#elif defined(UNIV_SOLARIS) + + // Use F_FREESP + +#endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */ + + return(DB_IO_NO_PUNCH_HOLE); +} + + + +/** Retrieves the last error number if an error occurs in a file io function. +The number should be retrieved before any other OS calls (because they may +overwrite the error number). If the number is not known to this program, +the OS error number + 100 is returned. +@param[in] report_all_errors true if we want an error message + printed of all errors +@param[in] on_error_silent true then don't print any diagnostic + to the log +@return error number, or OS error number + 100 */ +static +ulint +os_file_get_last_error_low( + bool report_all_errors, + bool on_error_silent) +{ + int err = errno; + + if (err == 0) { + return(0); + } + + if (report_all_errors + || (err != ENOSPC && err != EEXIST && !on_error_silent)) { + + ib::error() + << "Operating system error number " + << err + << " in a file operation."; + + if (err == ENOENT) { + + ib::error() + << "The error means the system" + " cannot find the path specified."; + + if (srv_is_being_started) { + + ib::error() + << "If you are installing InnoDB," + " remember that you must create" + " directories yourself, InnoDB" + " does not create them."; + } + } else if (err == EACCES) { + + ib::error() + << "The error means mysqld does not have" + " the access rights to the directory."; + + } else { + if (strerror(err) != NULL) { + + ib::error() + << "Error number " << err << " means '" + << strerror(err) << "'"; + } + + ib::info() << OPERATING_SYSTEM_ERROR_MSG; + } + } + + switch (err) { + case ENOSPC: + return(OS_FILE_DISK_FULL); + case ENOENT: + return(OS_FILE_NOT_FOUND); + case EEXIST: + return(OS_FILE_ALREADY_EXISTS); + case EXDEV: + case ENOTDIR: + case EISDIR: + return(OS_FILE_PATH_ERROR); + case EAGAIN: + if (srv_use_native_aio) { + return(OS_FILE_AIO_RESOURCES_RESERVED); + } + break; + case EINTR: + if (srv_use_native_aio) { + return(OS_FILE_AIO_INTERRUPTED); + } + break; + case EACCES: + return(OS_FILE_ACCESS_VIOLATION); + } + return(OS_FILE_ERROR_MAX + err); +} + +/** Wrapper to fsync() or fdatasync() that retries the call on some errors. +Returns the value 0 if successful; otherwise the value -1 is returned and +the global variable errno is set to indicate the error. +@param[in] file open file handle +@return 0 if success, -1 otherwise */ +static int os_file_sync_posix(os_file_t file) +{ +#if !defined(HAVE_FDATASYNC) || HAVE_DECL_FDATASYNC == 0 + auto func= fsync; + auto func_name= "fsync()"; +#else + auto func= fdatasync; + auto func_name= "fdatasync()"; +#endif + + ulint failures= 0; + + for (;;) + { + ++os_n_fsyncs; + + int ret= func(file); + + if (ret == 0) + return ret; + + switch (errno) + { + case ENOLCK: + ++failures; + ut_a(failures < 1000); + + if (!(failures % 100)) + ib::warn() << func_name << ": No locks available; retrying"; + + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + break; + + case EINTR: + ++failures; + ut_a(failures < 2000); + break; + + default: + ib::fatal() << func_name << " returned " << errno; + } + } +} + +/** Check the existence and type of the given file. +@param[in] path path name of file +@param[out] exists true if the file exists +@param[out] type Type of the file, if it exists +@return true if call succeeded */ +static +bool +os_file_status_posix( + const char* path, + bool* exists, + os_file_type_t* type) +{ + struct stat statinfo; + + int ret = stat(path, &statinfo); + + *exists = !ret; + + if (!ret) { + /* file exists, everything OK */ + + } else if (errno == ENOENT || errno == ENOTDIR || errno == ENAMETOOLONG) { + /* file does not exist */ + return(true); + + } else { + /* file exists, but stat call failed */ + os_file_handle_error_no_exit(path, "stat", false); + return(false); + } + + if (S_ISDIR(statinfo.st_mode)) { + *type = OS_FILE_TYPE_DIR; + + } else if (S_ISLNK(statinfo.st_mode)) { + *type = OS_FILE_TYPE_LINK; + + } else if (S_ISREG(statinfo.st_mode)) { + *type = OS_FILE_TYPE_FILE; + } else { + *type = OS_FILE_TYPE_UNKNOWN; + } + + return(true); +} + +/** NOTE! Use the corresponding macro os_file_flush(), not directly this +function! +Flushes the write buffers of a given file to the disk. +@param[in] file handle to a file +@return true if success */ +bool +os_file_flush_func( + os_file_t file) +{ + int ret; + + WAIT_ALLOW_WRITES(); + ret = os_file_sync_posix(file); + + if (ret == 0) { + return(true); + } + + /* Since Linux returns EINVAL if the 'file' is actually a raw device, + we choose to ignore that error if we are using raw disks */ + + if (srv_start_raw_disk_in_use && errno == EINVAL) { + + return(true); + } + + ib::error() << "The OS said file flush did not succeed"; + + os_file_handle_error(NULL, "flush"); + + /* It is a fatal error if a file flush does not succeed, because then + the database can get corrupt on disk */ + ut_error; + + return(false); +} + +/** NOTE! Use the corresponding macro os_file_create_simple(), not directly +this function! +A simple function to open or create a file. +@param[in] name name of the file or path as a null-terminated + string +@param[in] create_mode create mode +@param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE +@param[in] read_only if true, read only checks are enforced +@param[out] success true if succeed, false if error +@return handle to the file, not defined if error, error number + can be retrieved with os_file_get_last_error */ +pfs_os_file_t +os_file_create_simple_func( + const char* name, + ulint create_mode, + ulint access_type, + bool read_only, + bool* success) +{ + pfs_os_file_t file; + + *success = false; + + int create_flag; + const char* mode_str = NULL; + + if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) { + WAIT_ALLOW_WRITES(); + } + + ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); + ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); + + if (create_mode == OS_FILE_OPEN) { + mode_str = "OPEN"; + + if (access_type == OS_FILE_READ_ONLY) { + + create_flag = O_RDONLY; + + } else if (read_only) { + + create_flag = O_RDONLY; + + } else { + create_flag = O_RDWR; + } + + } else if (read_only) { + + mode_str = "OPEN"; + create_flag = O_RDONLY; + + } else if (create_mode == OS_FILE_CREATE) { + + mode_str = "CREATE"; + create_flag = O_RDWR | O_CREAT | O_EXCL; + + } else if (create_mode == OS_FILE_CREATE_PATH) { + + mode_str = "CREATE PATH"; + /* Create subdirs along the path if needed. */ + + *success = os_file_create_subdirs_if_needed(name); + + if (!*success) { + + ib::error() + << "Unable to create subdirectories '" + << name << "'"; + + return(OS_FILE_CLOSED); + } + + create_flag = O_RDWR | O_CREAT | O_EXCL; + create_mode = OS_FILE_CREATE; + } else { + + ib::error() + << "Unknown file create mode (" + << create_mode + << " for file '" << name << "'"; + + return(OS_FILE_CLOSED); + } + + bool retry; + + do { + file = open(name, create_flag | O_CLOEXEC, os_innodb_umask); + + if (file == -1) { + *success = false; + retry = os_file_handle_error( + name, + create_mode == OS_FILE_OPEN + ? "open" : "create"); + } else { + *success = true; + retry = false; + } + + } while (retry); + + /* This function is always called for data files, we should disable + OS caching (O_DIRECT) here as we do in os_file_create_func(), so + we open the same file in the same mode, see man page of open(2). */ + if (!srv_read_only_mode + && *success + && (srv_file_flush_method == SRV_O_DIRECT + || srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)) { + + os_file_set_nocache(file, name, mode_str); + } + +#ifdef USE_FILE_LOCK + if (!read_only + && *success + && (access_type == OS_FILE_READ_WRITE) + && os_file_lock(file, name)) { + + *success = false; + close(file); + file = -1; + } +#endif /* USE_FILE_LOCK */ + + return(file); +} + +/** This function attempts to create a directory named pathname. The new +directory gets default permissions. On Unix the permissions are +(0770 & ~umask). If the directory exists already, nothing is done and +the call succeeds, unless the fail_if_exists arguments is true. +If another error occurs, such as a permission error, this does not crash, +but reports the error and returns false. +@param[in] pathname directory name as null-terminated string +@param[in] fail_if_exists if true, pre-existing directory is treated as + an error. +@return true if call succeeds, false on error */ +bool +os_file_create_directory( + const char* pathname, + bool fail_if_exists) +{ + int rcode; + + WAIT_ALLOW_WRITES(); + rcode = mkdir(pathname, 0770); + + if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) { + /* failure */ + os_file_handle_error_no_exit(pathname, "mkdir", false); + + return(false); + } + + return(true); +} + +/** NOTE! Use the corresponding macro os_file_create(), not directly +this function! +Opens an existing file or creates a new. +@param[in] name name of the file or path as a null-terminated + string +@param[in] create_mode create mode +@param[in] purpose OS_FILE_AIO, if asynchronous, non-buffered I/O + is desired, OS_FILE_NORMAL, if any normal file; + NOTE that it also depends on type, os_aio_.. + and srv_.. variables whether we really use async + I/O or unbuffered I/O: look in the function + source code for the exact rules +@param[in] type OS_DATA_FILE or OS_LOG_FILE +@param[in] read_only true, if read only checks should be enforcedm +@param[in] success true if succeeded +@return handle to the file, not defined if error, error number + can be retrieved with os_file_get_last_error */ +pfs_os_file_t +os_file_create_func( + const char* name, + ulint create_mode, + ulint purpose, + ulint type, + bool read_only, + bool* success) +{ + bool on_error_no_exit; + bool on_error_silent; + + *success = false; + + DBUG_EXECUTE_IF( + "ib_create_table_fail_disk_full", + *success = false; + errno = ENOSPC; + return(OS_FILE_CLOSED); + ); + + int create_flag; + const char* mode_str = NULL; + + on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT + ? true : false; + on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT + ? true : false; + + create_mode &= ulint(~(OS_FILE_ON_ERROR_NO_EXIT + | OS_FILE_ON_ERROR_SILENT)); + + if (create_mode == OS_FILE_OPEN + || create_mode == OS_FILE_OPEN_RAW + || create_mode == OS_FILE_OPEN_RETRY) { + + mode_str = "OPEN"; + + create_flag = read_only ? O_RDONLY : O_RDWR; + + } else if (read_only) { + + mode_str = "OPEN"; + + create_flag = O_RDONLY; + + } else if (create_mode == OS_FILE_CREATE) { + + mode_str = "CREATE"; + create_flag = O_RDWR | O_CREAT | O_EXCL; + + } else if (create_mode == OS_FILE_OVERWRITE) { + + mode_str = "OVERWRITE"; + create_flag = O_RDWR | O_CREAT | O_TRUNC; + + } else { + ib::error() + << "Unknown file create mode (" << create_mode << ")" + << " for file '" << name << "'"; + + return(OS_FILE_CLOSED); + } + + ut_a(type == OS_LOG_FILE + || type == OS_DATA_FILE + || type == OS_DATA_FILE_NO_O_DIRECT); + + ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL); + + /* We let O_DSYNC only affect log files */ + + if (!read_only + && type == OS_LOG_FILE + && srv_file_flush_method == SRV_O_DSYNC) { +#ifdef O_DSYNC + create_flag |= O_DSYNC; +#else + create_flag |= O_SYNC; +#endif + } + + os_file_t file; + bool retry; + + do { + file = open(name, create_flag | O_CLOEXEC, os_innodb_umask); + + if (file == -1) { + const char* operation; + + operation = (create_mode == OS_FILE_CREATE + && !read_only) ? "create" : "open"; + + *success = false; + + if (on_error_no_exit) { + retry = os_file_handle_error_no_exit( + name, operation, on_error_silent); + } else { + retry = os_file_handle_error(name, operation); + } + } else { + *success = true; + retry = false; + } + + } while (retry); + + /* We disable OS caching (O_DIRECT) only on data files */ + if (!read_only + && *success + && type != OS_LOG_FILE + && type != OS_DATA_FILE_NO_O_DIRECT + && (srv_file_flush_method == SRV_O_DIRECT + || srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)) { + + os_file_set_nocache(file, name, mode_str); + } + +#ifdef USE_FILE_LOCK + if (!read_only + && *success + && create_mode != OS_FILE_OPEN_RAW + && os_file_lock(file, name)) { + + if (create_mode == OS_FILE_OPEN_RETRY) { + + ib::info() + << "Retrying to lock the first data file"; + + for (int i = 0; i < 100; i++) { + os_thread_sleep(1000000); + + if (!os_file_lock(file, name)) { + *success = true; + return(file); + } + } + + ib::info() + << "Unable to open the first data file"; + } + + *success = false; + close(file); + file = -1; + } +#endif /* USE_FILE_LOCK */ + + return(file); +} + +/** NOTE! Use the corresponding macro +os_file_create_simple_no_error_handling(), not directly this function! +A simple function to open or create a file. +@param[in] name name of the file or path as a null-terminated + string +@param[in] create_mode create mode +@param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or + OS_FILE_READ_ALLOW_DELETE; the last option + is used by a backup program reading the file +@param[in] read_only if true read only mode checks are enforced +@param[out] success true if succeeded +@return own: handle to the file, not defined if error, error number + can be retrieved with os_file_get_last_error */ +pfs_os_file_t +os_file_create_simple_no_error_handling_func( + const char* name, + ulint create_mode, + ulint access_type, + bool read_only, + bool* success) +{ + os_file_t file; + int create_flag; + + if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) { + WAIT_ALLOW_WRITES(); + } + + ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); + ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); + + *success = false; + + if (create_mode == OS_FILE_OPEN) { + + if (access_type == OS_FILE_READ_ONLY) { + + create_flag = O_RDONLY; + + } else if (read_only) { + + create_flag = O_RDONLY; + + } else { + + ut_a(access_type == OS_FILE_READ_WRITE + || access_type == OS_FILE_READ_ALLOW_DELETE); + + create_flag = O_RDWR; + } + + } else if (read_only) { + + create_flag = O_RDONLY; + + } else if (create_mode == OS_FILE_CREATE) { + + create_flag = O_RDWR | O_CREAT | O_EXCL; + + } else { + + ib::error() + << "Unknown file create mode " + << create_mode << " for file '" << name << "'"; + + return(OS_FILE_CLOSED); + } + + file = open(name, create_flag | O_CLOEXEC, os_innodb_umask); + + *success = (file != -1); + +#ifdef USE_FILE_LOCK + if (!read_only + && *success + && access_type == OS_FILE_READ_WRITE + && os_file_lock(file, name)) { + + *success = false; + close(file); + file = -1; + + } +#endif /* USE_FILE_LOCK */ + + return(file); +} + +/** Deletes a file if it exists. The file has to be closed before calling this. +@param[in] name file path as a null-terminated string +@param[out] exist indicate if file pre-exist +@return true if success */ +bool +os_file_delete_if_exists_func( + const char* name, + bool* exist) +{ + if (exist != NULL) { + *exist = true; + } + + int ret; + WAIT_ALLOW_WRITES(); + + ret = unlink(name); + + if (ret != 0 && errno == ENOENT) { + if (exist != NULL) { + *exist = false; + } + } else if (ret != 0 && errno != ENOENT) { + os_file_handle_error_no_exit(name, "delete", false); + + return(false); + } + + return(true); +} + +/** Deletes a file. The file has to be closed before calling this. +@param[in] name file path as a null-terminated string +@return true if success */ +bool +os_file_delete_func( + const char* name) +{ + int ret; + WAIT_ALLOW_WRITES(); + + ret = unlink(name); + + if (ret != 0) { + os_file_handle_error_no_exit(name, "delete", FALSE); + + return(false); + } + + return(true); +} + +/** NOTE! Use the corresponding macro os_file_rename(), not directly this +function! +Renames a file (can also move it to another directory). It is safest that the +file is closed before calling this function. +@param[in] oldpath old file path as a null-terminated string +@param[in] newpath new file path +@return true if success */ +bool +os_file_rename_func( + const char* oldpath, + const char* newpath) +{ +#ifdef UNIV_DEBUG + os_file_type_t type; + bool exists; + + /* New path must not exist. */ + ut_ad(os_file_status(newpath, &exists, &type)); + ut_ad(!exists); + + /* Old path must exist. */ + ut_ad(os_file_status(oldpath, &exists, &type)); + ut_ad(exists); +#endif /* UNIV_DEBUG */ + + int ret; + WAIT_ALLOW_WRITES(); + + ret = rename(oldpath, newpath); + + if (ret != 0) { + os_file_handle_rename_error(oldpath, newpath); + + return(false); + } + + return(true); +} + +/** NOTE! Use the corresponding macro os_file_close(), not directly this +function! +Closes a file handle. In case of error, error number can be retrieved with +os_file_get_last_error. +@param[in] file Handle to close +@return true if success */ +bool os_file_close_func(os_file_t file) +{ + int ret= close(file); + + if (!ret) + return true; + + os_file_handle_error(NULL, "close"); + return false; +} + +/** Gets a file size. +@param[in] file handle to an open file +@return file size, or (os_offset_t) -1 on failure */ +os_offset_t +os_file_get_size(os_file_t file) +{ + struct stat statbuf; + return fstat(file, &statbuf) ? os_offset_t(-1) : statbuf.st_size; +} + +/** Gets a file size. +@param[in] filename Full path to the filename to check +@return file size if OK, else set m_total_size to ~0 and m_alloc_size to + errno */ +os_file_size_t +os_file_get_size( + const char* filename) +{ + struct stat s; + os_file_size_t file_size; + + int ret = stat(filename, &s); + + if (ret == 0) { + file_size.m_total_size = s.st_size; + /* st_blocks is in 512 byte sized blocks */ + file_size.m_alloc_size = s.st_blocks * 512; + } else { + file_size.m_total_size = ~0U; + file_size.m_alloc_size = (os_offset_t) errno; + } + + return(file_size); +} + +/** This function returns information about the specified file +@param[in] path pathname of the file +@param[out] stat_info information of a file in a directory +@param[in,out] statinfo information of a file in a directory +@param[in] check_rw_perm for testing whether the file can be opened + in RW mode +@param[in] read_only if true read only mode checks are enforced +@return DB_SUCCESS if all OK */ +static +dberr_t +os_file_get_status_posix( + const char* path, + os_file_stat_t* stat_info, + struct stat* statinfo, + bool check_rw_perm, + bool read_only) +{ + int ret = stat(path, statinfo); + + if (ret && (errno == ENOENT || errno == ENOTDIR + || errno == ENAMETOOLONG)) { + /* file does not exist */ + + return(DB_NOT_FOUND); + + } else if (ret) { + /* file exists, but stat call failed */ + + os_file_handle_error_no_exit(path, "stat", false); + + return(DB_FAIL); + } + + switch (statinfo->st_mode & S_IFMT) { + case S_IFDIR: + stat_info->type = OS_FILE_TYPE_DIR; + break; + case S_IFLNK: + stat_info->type = OS_FILE_TYPE_LINK; + break; + case S_IFBLK: + /* Handle block device as regular file. */ + case S_IFCHR: + /* Handle character device as regular file. */ + case S_IFREG: + stat_info->type = OS_FILE_TYPE_FILE; + break; + default: + stat_info->type = OS_FILE_TYPE_UNKNOWN; + } + + stat_info->size = statinfo->st_size; + stat_info->block_size = statinfo->st_blksize; + stat_info->alloc_size = statinfo->st_blocks * 512; + + if (check_rw_perm + && (stat_info->type == OS_FILE_TYPE_FILE + || stat_info->type == OS_FILE_TYPE_BLOCK)) { + + stat_info->rw_perm = !access(path, read_only + ? R_OK : R_OK | W_OK); + } + + return(DB_SUCCESS); +} + +/** Truncates a file to a specified size in bytes. +Do nothing if the size to preserve is greater or equal to the current +size of the file. +@param[in] pathname file path +@param[in] file file to be truncated +@param[in] size size to preserve in bytes +@return true if success */ +static +bool +os_file_truncate_posix( + const char* pathname, + os_file_t file, + os_offset_t size) +{ + int res = ftruncate(file, size); + + if (res == -1) { + + bool retry; + + retry = os_file_handle_error_no_exit( + pathname, "truncate", false); + + if (retry) { + ib::warn() + << "Truncate failed for '" + << pathname << "'"; + } + } + + return(res == 0); +} + +/** Truncates a file at its current position. +@return true if success */ +bool +os_file_set_eof( + FILE* file) /*!< in: file to be truncated */ +{ + WAIT_ALLOW_WRITES(); + return(!ftruncate(fileno(file), ftell(file))); +} + +#else /* !_WIN32 */ + +#include <WinIoCtl.h> + + + +/** Free storage space associated with a section of the file. +@param[in] fh Open file handle +@param[in] off Starting offset (SEEK_SET) +@param[in] len Size of the hole +@return 0 on success or errno */ +static +dberr_t +os_file_punch_hole_win32( + os_file_t fh, + os_offset_t off, + os_offset_t len) +{ + FILE_ZERO_DATA_INFORMATION punch; + + punch.FileOffset.QuadPart = off; + punch.BeyondFinalZero.QuadPart = off + len; + + /* If lpOverlapped is NULL, lpBytesReturned cannot be NULL, + therefore we pass a dummy parameter. */ + DWORD temp; + BOOL success = os_win32_device_io_control( + fh, FSCTL_SET_ZERO_DATA, &punch, sizeof(punch), + NULL, 0, &temp); + + return(success ? DB_SUCCESS: DB_IO_NO_PUNCH_HOLE); +} + +/** Check the existence and type of the given file. +@param[in] path path name of file +@param[out] exists true if the file exists +@param[out] type Type of the file, if it exists +@return true if call succeeded */ +static +bool +os_file_status_win32( + const char* path, + bool* exists, + os_file_type_t* type) +{ + int ret; + struct _stat64 statinfo; + + ret = _stat64(path, &statinfo); + + *exists = !ret; + + if (!ret) { + /* file exists, everything OK */ + + } else if (errno == ENOENT || errno == ENOTDIR || errno == ENAMETOOLONG) { + /* file does not exist */ + return(true); + + } else { + /* file exists, but stat call failed */ + os_file_handle_error_no_exit(path, "stat", false); + return(false); + } + + if (_S_IFDIR & statinfo.st_mode) { + *type = OS_FILE_TYPE_DIR; + + } else if (_S_IFREG & statinfo.st_mode) { + *type = OS_FILE_TYPE_FILE; + + } else { + *type = OS_FILE_TYPE_UNKNOWN; + } + + return(true); +} + +/* Dynamically load NtFlushBuffersFileEx, used in os_file_flush_func */ +#include <winternl.h> +typedef NTSTATUS(WINAPI* pNtFlushBuffersFileEx)( + HANDLE FileHandle, ULONG Flags, PVOID Parameters, ULONG ParametersSize, + PIO_STATUS_BLOCK IoStatusBlock); + +static pNtFlushBuffersFileEx my_NtFlushBuffersFileEx + = (pNtFlushBuffersFileEx)GetProcAddress(GetModuleHandle("ntdll"), + "NtFlushBuffersFileEx"); + +/** NOTE! Use the corresponding macro os_file_flush(), not directly this +function! +Flushes the write buffers of a given file to the disk. +@param[in] file handle to a file +@return true if success */ +bool os_file_flush_func(os_file_t file) +{ + ++os_n_fsyncs; + static bool disable_datasync; + + if (my_NtFlushBuffersFileEx && !disable_datasync) + { + IO_STATUS_BLOCK iosb{}; + NTSTATUS status= my_NtFlushBuffersFileEx( + file, FLUSH_FLAGS_FILE_DATA_SYNC_ONLY, nullptr, 0, &iosb); + if (!status) + return true; + /* + NtFlushBuffersFileEx(FLUSH_FLAGS_FILE_DATA_SYNC_ONLY) might fail + unless on Win10+, and maybe non-NTFS. Switch to using FlushFileBuffers(). + */ + disable_datasync= true; + } + + if (FlushFileBuffers(file)) + return true; + + /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is + actually a raw device, we choose to ignore that error if we are using + raw disks */ + if (srv_start_raw_disk_in_use && GetLastError() == ERROR_INVALID_FUNCTION) + return true; + + os_file_handle_error(nullptr, "flush"); + + /* It is a fatal error if a file flush does not succeed, because then + the database can get corrupt on disk */ + ut_error; + + return false; +} + +/** Retrieves the last error number if an error occurs in a file io function. +The number should be retrieved before any other OS calls (because they may +overwrite the error number). If the number is not known to this program, +then OS error number + OS_FILE_ERROR_MAX is returned. +@param[in] report_all_errors true if we want an error message printed + of all errors +@param[in] on_error_silent true then don't print any diagnostic + to the log +@return error number, or OS error number + OS_FILE_ERROR_MAX */ +static +ulint +os_file_get_last_error_low( + bool report_all_errors, + bool on_error_silent) +{ + ulint err = (ulint) GetLastError(); + + if (err == ERROR_SUCCESS) { + return(0); + } + + if (report_all_errors + || (!on_error_silent + && err != ERROR_DISK_FULL + && err != ERROR_FILE_EXISTS)) { + + ib::error() + << "Operating system error number " << err + << " in a file operation."; + + if (err == ERROR_PATH_NOT_FOUND) { + ib::error() + << "The error means the system" + " cannot find the path specified."; + + if (srv_is_being_started) { + ib::error() + << "If you are installing InnoDB," + " remember that you must create" + " directories yourself, InnoDB" + " does not create them."; + } + + } else if (err == ERROR_ACCESS_DENIED) { + + ib::error() + << "The error means mysqld does not have" + " the access rights to" + " the directory. It may also be" + " you have created a subdirectory" + " of the same name as a data file."; + + } else if (err == ERROR_SHARING_VIOLATION + || err == ERROR_LOCK_VIOLATION) { + + ib::error() + << "The error means that another program" + " is using InnoDB's files." + " This might be a backup or antivirus" + " software or another instance" + " of MySQL." + " Please close it to get rid of this error."; + + } else if (err == ERROR_WORKING_SET_QUOTA + || err == ERROR_NO_SYSTEM_RESOURCES) { + + ib::error() + << "The error means that there are no" + " sufficient system resources or quota to" + " complete the operation."; + + } else if (err == ERROR_OPERATION_ABORTED) { + + ib::error() + << "The error means that the I/O" + " operation has been aborted" + " because of either a thread exit" + " or an application request." + " Retry attempt is made."; + } else { + + ib::info() << OPERATING_SYSTEM_ERROR_MSG; + } + } + + if (err == ERROR_FILE_NOT_FOUND) { + return(OS_FILE_NOT_FOUND); + } else if (err == ERROR_DISK_FULL) { + return(OS_FILE_DISK_FULL); + } else if (err == ERROR_FILE_EXISTS) { + return(OS_FILE_ALREADY_EXISTS); + } else if (err == ERROR_SHARING_VIOLATION + || err == ERROR_LOCK_VIOLATION) { + return(OS_FILE_SHARING_VIOLATION); + } else if (err == ERROR_WORKING_SET_QUOTA + || err == ERROR_NO_SYSTEM_RESOURCES) { + return(OS_FILE_INSUFFICIENT_RESOURCE); + } else if (err == ERROR_OPERATION_ABORTED) { + return(OS_FILE_OPERATION_ABORTED); + } else if (err == ERROR_ACCESS_DENIED) { + return(OS_FILE_ACCESS_VIOLATION); + } + + return(OS_FILE_ERROR_MAX + err); +} + + +/** NOTE! Use the corresponding macro os_file_create_simple(), not directly +this function! +A simple function to open or create a file. +@param[in] name name of the file or path as a null-terminated + string +@param[in] create_mode create mode +@param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE +@param[in] read_only if true read only mode checks are enforced +@param[out] success true if succeed, false if error +@return handle to the file, not defined if error, error number + can be retrieved with os_file_get_last_error */ +pfs_os_file_t +os_file_create_simple_func( + const char* name, + ulint create_mode, + ulint access_type, + bool read_only, + bool* success) +{ + os_file_t file; + + *success = false; + + DWORD access; + DWORD create_flag; + DWORD attributes = 0; + + ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); + ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); + ut_ad(srv_operation == SRV_OPERATION_NORMAL); + + if (create_mode == OS_FILE_OPEN) { + + create_flag = OPEN_EXISTING; + + } else if (read_only) { + + create_flag = OPEN_EXISTING; + + } else if (create_mode == OS_FILE_CREATE) { + + create_flag = CREATE_NEW; + + } else if (create_mode == OS_FILE_CREATE_PATH) { + + /* Create subdirs along the path if needed. */ + *success = os_file_create_subdirs_if_needed(name); + + if (!*success) { + + ib::error() + << "Unable to create subdirectories '" + << name << "'"; + + return(OS_FILE_CLOSED); + } + + create_flag = CREATE_NEW; + create_mode = OS_FILE_CREATE; + + } else { + + ib::error() + << "Unknown file create mode (" + << create_mode << ") for file '" + << name << "'"; + + return(OS_FILE_CLOSED); + } + + if (access_type == OS_FILE_READ_ONLY) { + + access = GENERIC_READ; + + } else if (read_only) { + + ib::info() + << "Read only mode set. Unable to" + " open file '" << name << "' in RW mode, " + << "trying RO mode"; + + access = GENERIC_READ; + + } else if (access_type == OS_FILE_READ_WRITE) { + + access = GENERIC_READ | GENERIC_WRITE; + + } else { + + ib::error() + << "Unknown file access type (" << access_type << ") " + "for file '" << name << "'"; + + return(OS_FILE_CLOSED); + } + + bool retry; + + do { + /* Use default security attributes and no template file. */ + + file = CreateFile( + (LPCTSTR) name, access, + FILE_SHARE_READ | FILE_SHARE_DELETE, + NULL, create_flag, attributes, NULL); + + if (file == INVALID_HANDLE_VALUE) { + + *success = false; + + retry = os_file_handle_error( + name, create_mode == OS_FILE_OPEN ? + "open" : "create"); + + } else { + + retry = false; + + *success = true; + } + + } while (retry); + + return(file); +} + +/** This function attempts to create a directory named pathname. The new +directory gets default permissions. On Unix the permissions are +(0770 & ~umask). If the directory exists already, nothing is done and +the call succeeds, unless the fail_if_exists arguments is true. +If another error occurs, such as a permission error, this does not crash, +but reports the error and returns false. +@param[in] pathname directory name as null-terminated string +@param[in] fail_if_exists if true, pre-existing directory is treated + as an error. +@return true if call succeeds, false on error */ +bool +os_file_create_directory( + const char* pathname, + bool fail_if_exists) +{ + BOOL rcode; + + rcode = CreateDirectory((LPCTSTR) pathname, NULL); + if (!(rcode != 0 + || (GetLastError() == ERROR_ALREADY_EXISTS + && !fail_if_exists))) { + + os_file_handle_error_no_exit( + pathname, "CreateDirectory", false); + + return(false); + } + + return(true); +} + +/** Check that IO of specific size is possible for the file +opened with FILE_FLAG_NO_BUFFERING. + +The requirement is that IO is multiple of the disk sector size. + +@param[in] file file handle +@param[in] io_size expected io size +@return true - unbuffered io of requested size is possible, false otherwise. + +@note: this function only works correctly with Windows 8 or later, +(GetFileInformationByHandleEx with FileStorageInfo is only supported there). +It will return true on earlier Windows version. + */ +static bool unbuffered_io_possible(HANDLE file, size_t io_size) +{ + FILE_STORAGE_INFO info; + if (GetFileInformationByHandleEx( + file, FileStorageInfo, &info, sizeof(info))) { + ULONG sector_size = info.LogicalBytesPerSector; + if (sector_size) + return io_size % sector_size == 0; + } + return true; +} + + +/** NOTE! Use the corresponding macro os_file_create(), not directly +this function! +Opens an existing file or creates a new. +@param[in] name name of the file or path as a null-terminated + string +@param[in] create_mode create mode +@param[in] purpose OS_FILE_AIO, if asynchronous, non-buffered I/O + is desired, OS_FILE_NORMAL, if any normal file; + NOTE that it also depends on type, os_aio_.. + and srv_.. variables whether we really use async + I/O or unbuffered I/O: look in the function + source code for the exact rules +@param[in] type OS_DATA_FILE or OS_LOG_FILE +@param[in] success true if succeeded +@return handle to the file, not defined if error, error number + can be retrieved with os_file_get_last_error */ +pfs_os_file_t +os_file_create_func( + const char* name, + ulint create_mode, + ulint purpose, + ulint type, + bool read_only, + bool* success) +{ + os_file_t file; + bool retry; + bool on_error_no_exit; + bool on_error_silent; + + *success = false; + + DBUG_EXECUTE_IF( + "ib_create_table_fail_disk_full", + *success = false; + SetLastError(ERROR_DISK_FULL); + return(OS_FILE_CLOSED); + ); + + DWORD create_flag; + DWORD share_mode = read_only + ? FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE + : FILE_SHARE_READ | FILE_SHARE_DELETE; + + if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) { + WAIT_ALLOW_WRITES(); + } + + on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT + ? true : false; + + on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT + ? true : false; + + create_mode &= ~(OS_FILE_ON_ERROR_NO_EXIT | OS_FILE_ON_ERROR_SILENT); + + if (create_mode == OS_FILE_OPEN_RAW) { + + ut_a(!read_only); + + /* On Windows Physical devices require admin privileges and + have to have the write-share mode set. See the remarks + section for the CreateFile() function documentation in MSDN. */ + + share_mode |= FILE_SHARE_WRITE; + + create_flag = OPEN_EXISTING; + + } else if (create_mode == OS_FILE_OPEN + || create_mode == OS_FILE_OPEN_RETRY) { + + create_flag = OPEN_EXISTING; + + } else if (read_only) { + + create_flag = OPEN_EXISTING; + + } else if (create_mode == OS_FILE_CREATE) { + + create_flag = CREATE_NEW; + + } else if (create_mode == OS_FILE_OVERWRITE) { + + create_flag = CREATE_ALWAYS; + + } else { + ib::error() + << "Unknown file create mode (" << create_mode << ") " + << " for file '" << name << "'"; + + return(OS_FILE_CLOSED); + } + + DWORD attributes = 0; + + if (purpose == OS_FILE_AIO) { + +#ifdef WIN_ASYNC_IO + /* If specified, use asynchronous (overlapped) io and no + buffering of writes in the OS */ + + if (srv_use_native_aio) { + attributes |= FILE_FLAG_OVERLAPPED; + } +#endif /* WIN_ASYNC_IO */ + + } else if (purpose == OS_FILE_NORMAL) { + + /* Use default setting. */ + + } else { + + ib::error() + << "Unknown purpose flag (" << purpose << ") " + << "while opening file '" << name << "'"; + + return(OS_FILE_CLOSED); + } + + if (type == OS_LOG_FILE) { + /* There is not reason to use buffered write to logs.*/ + attributes |= FILE_FLAG_NO_BUFFERING; + } + + switch (srv_file_flush_method) + { + case SRV_O_DSYNC: + if (type == OS_LOG_FILE) { + /* Map O_DSYNC to FILE_WRITE_THROUGH */ + attributes |= FILE_FLAG_WRITE_THROUGH; + } + break; + + case SRV_O_DIRECT_NO_FSYNC: + case SRV_O_DIRECT: + if (type != OS_DATA_FILE) { + break; + } + /* fall through */ + case SRV_ALL_O_DIRECT_FSYNC: + /*Traditional Windows behavior, no buffering for any files.*/ + if (type != OS_DATA_FILE_NO_O_DIRECT) { + attributes |= FILE_FLAG_NO_BUFFERING; + } + break; + + case SRV_FSYNC: + case SRV_LITTLESYNC: + break; + + case SRV_NOSYNC: + /* Let Windows cache manager handle all writes.*/ + attributes &= ~(FILE_FLAG_WRITE_THROUGH | FILE_FLAG_NO_BUFFERING); + break; + + default: + ut_a(false); /* unknown flush mode.*/ + } + + + // TODO: Create a bug, this looks wrong. The flush log + // parameter is dynamic. + if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) { + /* Do not use unbuffered i/o for the log files because + value 2 denotes that we do not flush the log at every + commit, but only once per second */ + attributes &= ~(FILE_FLAG_WRITE_THROUGH | FILE_FLAG_NO_BUFFERING); + } + + + DWORD access = GENERIC_READ; + + if (!read_only) { + access |= GENERIC_WRITE; + } + + for (;;) { + const char *operation; + + /* Use default security attributes and no template file. */ + file = CreateFile( + name, access, share_mode, NULL, + create_flag, attributes, NULL); + + /* If FILE_FLAG_NO_BUFFERING was set, check if this can work at all, + for expected IO sizes. Reopen without the unbuffered flag, if it is won't work*/ + if ((file != INVALID_HANDLE_VALUE) + && (attributes & FILE_FLAG_NO_BUFFERING) + && (type == OS_LOG_FILE) + && !unbuffered_io_possible(file, OS_FILE_LOG_BLOCK_SIZE)) { + ut_a(CloseHandle(file)); + attributes &= ~FILE_FLAG_NO_BUFFERING; + create_flag = OPEN_ALWAYS; + continue; + } + + *success = (file != INVALID_HANDLE_VALUE); + if (*success) { + break; + } + + operation = (create_mode == OS_FILE_CREATE && !read_only) ? + "create" : "open"; + + if (on_error_no_exit) { + retry = os_file_handle_error_no_exit( + name, operation, on_error_silent); + } + else { + retry = os_file_handle_error(name, operation); + } + + if (!retry) { + break; + } + } + + if (*success && (attributes & FILE_FLAG_OVERLAPPED) && srv_thread_pool) { + srv_thread_pool->bind(file); + } + return(file); +} + +/** NOTE! Use the corresponding macro os_file_create_simple_no_error_handling(), +not directly this function! +A simple function to open or create a file. +@param[in] name name of the file or path as a null-terminated + string +@param[in] create_mode create mode +@param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or + OS_FILE_READ_ALLOW_DELETE; the last option is + used by a backup program reading the file +@param[out] success true if succeeded +@return own: handle to the file, not defined if error, error number + can be retrieved with os_file_get_last_error */ +pfs_os_file_t +os_file_create_simple_no_error_handling_func( + const char* name, + ulint create_mode, + ulint access_type, + bool read_only, + bool* success) +{ + os_file_t file; + + *success = false; + + DWORD access; + DWORD create_flag; + DWORD attributes = 0; + DWORD share_mode = read_only + ? FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE + : FILE_SHARE_READ | FILE_SHARE_DELETE; + + ut_a(name); + + ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); + ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); + + if (create_mode == OS_FILE_OPEN) { + + create_flag = OPEN_EXISTING; + + } else if (read_only) { + + create_flag = OPEN_EXISTING; + + } else if (create_mode == OS_FILE_CREATE) { + + create_flag = CREATE_NEW; + + } else { + + ib::error() + << "Unknown file create mode (" << create_mode << ") " + << " for file '" << name << "'"; + + return(OS_FILE_CLOSED); + } + + if (access_type == OS_FILE_READ_ONLY) { + + access = GENERIC_READ; + + } else if (read_only) { + + access = GENERIC_READ; + + } else if (access_type == OS_FILE_READ_WRITE) { + + access = GENERIC_READ | GENERIC_WRITE; + + } else if (access_type == OS_FILE_READ_ALLOW_DELETE) { + + ut_a(!read_only); + + access = GENERIC_READ; + + /*!< A backup program has to give mysqld the maximum + freedom to do what it likes with the file */ + + share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE + | FILE_SHARE_READ; + + } else { + + ib::error() + << "Unknown file access type (" << access_type << ") " + << "for file '" << name << "'"; + + return(OS_FILE_CLOSED); + } + + file = CreateFile((LPCTSTR) name, + access, + share_mode, + NULL, // Security attributes + create_flag, + attributes, + NULL); // No template file + + *success = (file != INVALID_HANDLE_VALUE); + + return(file); +} + +/** Deletes a file if it exists. The file has to be closed before calling this. +@param[in] name file path as a null-terminated string +@param[out] exist indicate if file pre-exist +@return true if success */ +bool +os_file_delete_if_exists_func( + const char* name, + bool* exist) +{ + ulint count = 0; + + if (exist != NULL) { + *exist = true; + } + + for (;;) { + /* In Windows, deleting an .ibd file may fail if + the file is being accessed by an external program, + such as a backup tool. */ + + bool ret = DeleteFile((LPCTSTR) name); + + if (ret) { + return(true); + } + + DWORD lasterr = GetLastError(); + + if (lasterr == ERROR_FILE_NOT_FOUND + || lasterr == ERROR_PATH_NOT_FOUND) { + + /* the file does not exist, this not an error */ + if (exist != NULL) { + *exist = false; + } + + return(true); + } + + ++count; + + if (count > 100 && 0 == (count % 10)) { + + /* Print error information */ + os_file_get_last_error(true); + + ib::warn() << "Delete of file '" << name << "' failed."; + } + + /* Sleep for a second */ + os_thread_sleep(1000000); + + if (count > 2000) { + + return(false); + } + } +} + +/** Deletes a file. The file has to be closed before calling this. +@param[in] name File path as NUL terminated string +@return true if success */ +bool +os_file_delete_func( + const char* name) +{ + ulint count = 0; + + for (;;) { + /* In Windows, deleting an .ibd file may fail if + the file is being accessed by an external program, + such as a backup tool. */ + + BOOL ret = DeleteFile((LPCTSTR) name); + + if (ret) { + return(true); + } + + if (GetLastError() == ERROR_FILE_NOT_FOUND) { + /* If the file does not exist, we classify this as + a 'mild' error and return */ + + return(false); + } + + ++count; + + if (count > 100 && 0 == (count % 10)) { + + /* print error information */ + os_file_get_last_error(true); + + ib::warn() + << "Cannot delete file '" << name << "'. Is " + << "another program accessing it?"; + } + + /* sleep for a second */ + os_thread_sleep(1000000); + + if (count > 2000) { + + return(false); + } + } + + ut_error; + return(false); +} + +/** NOTE! Use the corresponding macro os_file_rename(), not directly this +function! +Renames a file (can also move it to another directory). It is safest that the +file is closed before calling this function. +@param[in] oldpath old file path as a null-terminated string +@param[in] newpath new file path +@return true if success */ +bool +os_file_rename_func( + const char* oldpath, + const char* newpath) +{ +#ifdef UNIV_DEBUG + os_file_type_t type; + bool exists; + + /* New path must not exist. */ + ut_ad(os_file_status(newpath, &exists, &type)); + ut_ad(!exists); + + /* Old path must exist. */ + ut_ad(os_file_status(oldpath, &exists, &type)); + ut_ad(exists); +#endif /* UNIV_DEBUG */ + + if (MoveFile((LPCTSTR) oldpath, (LPCTSTR) newpath)) { + return(true); + } + + os_file_handle_rename_error(oldpath, newpath); + return(false); +} + +/** NOTE! Use the corresponding macro os_file_close(), not directly +this function! +Closes a file handle. In case of error, error number can be retrieved with +os_file_get_last_error. +@param[in,own] file Handle to a file +@return true if success */ +bool os_file_close_func(os_file_t file) +{ + ut_ad(file); + if (!CloseHandle(file)) + { + os_file_handle_error(NULL, "close"); + return false; + } + + if(srv_thread_pool) + srv_thread_pool->unbind(file); + return true; +} + +/** Gets a file size. +@param[in] file Handle to a file +@return file size, or (os_offset_t) -1 on failure */ +os_offset_t +os_file_get_size( + os_file_t file) +{ + DWORD high; + DWORD low = GetFileSize(file, &high); + + if (low == 0xFFFFFFFF && GetLastError() != NO_ERROR) { + return((os_offset_t) -1); + } + + return(os_offset_t(low | (os_offset_t(high) << 32))); +} + +/** Gets a file size. +@param[in] filename Full path to the filename to check +@return file size if OK, else set m_total_size to ~0 and m_alloc_size to + errno */ +os_file_size_t +os_file_get_size( + const char* filename) +{ + struct __stat64 s; + os_file_size_t file_size; + + int ret = _stat64(filename, &s); + + if (ret == 0) { + + file_size.m_total_size = s.st_size; + + DWORD low_size; + DWORD high_size; + + low_size = GetCompressedFileSize(filename, &high_size); + + if (low_size != INVALID_FILE_SIZE) { + + file_size.m_alloc_size = high_size; + file_size.m_alloc_size <<= 32; + file_size.m_alloc_size |= low_size; + + } else { + ib::error() + << "GetCompressedFileSize(" + << filename << ", ..) failed."; + + file_size.m_alloc_size = (os_offset_t) -1; + } + } else { + file_size.m_total_size = ~0; + file_size.m_alloc_size = (os_offset_t) ret; + } + + return(file_size); +} + +/** This function returns information about the specified file +@param[in] path pathname of the file +@param[out] stat_info information of a file in a directory +@param[in,out] statinfo information of a file in a directory +@param[in] check_rw_perm for testing whether the file can be opened + in RW mode +@param[in] read_only true if the file is opened in read-only mode +@return DB_SUCCESS if all OK */ +static +dberr_t +os_file_get_status_win32( + const char* path, + os_file_stat_t* stat_info, + struct _stat64* statinfo, + bool check_rw_perm, + bool read_only) +{ + int ret = _stat64(path, statinfo); + + if (ret && (errno == ENOENT || errno == ENOTDIR + || errno == ENAMETOOLONG)) { + /* file does not exist */ + + return(DB_NOT_FOUND); + + } else if (ret) { + /* file exists, but stat call failed */ + + os_file_handle_error_no_exit(path, "STAT", false); + + return(DB_FAIL); + + } else if (_S_IFDIR & statinfo->st_mode) { + + stat_info->type = OS_FILE_TYPE_DIR; + + } else if (_S_IFREG & statinfo->st_mode) { + + DWORD access = GENERIC_READ; + + if (!read_only) { + access |= GENERIC_WRITE; + } + + stat_info->type = OS_FILE_TYPE_FILE; + + /* Check if we can open it in read-only mode. */ + + if (check_rw_perm) { + HANDLE fh; + + fh = CreateFile( + (LPCTSTR) path, // File to open + access, + FILE_SHARE_READ | FILE_SHARE_WRITE + | FILE_SHARE_DELETE, // Full sharing + NULL, // Default security + OPEN_EXISTING, // Existing file only + FILE_ATTRIBUTE_NORMAL, // Normal file + NULL); // No attr. template + + if (fh == INVALID_HANDLE_VALUE) { + stat_info->rw_perm = false; + } else { + stat_info->rw_perm = true; + CloseHandle(fh); + } + } + stat_info->block_size = 0; + + /* What follows, is calculation of FS block size, which is not important + (it is just shown in I_S innodb tables). The error to calculate it will be ignored.*/ + char volname[MAX_PATH]; + BOOL result = GetVolumePathName(path, volname, MAX_PATH); + static bool warned_once = false; + if (!result) { + if (!warned_once) { + ib::warn() + << "os_file_get_status_win32: " + << "Failed to get the volume path name for: " + << path + << "- OS error number " << GetLastError(); + warned_once = true; + } + return(DB_SUCCESS); + } + + DWORD sectorsPerCluster; + DWORD bytesPerSector; + DWORD numberOfFreeClusters; + DWORD totalNumberOfClusters; + + result = GetDiskFreeSpace( + (LPCSTR) volname, + §orsPerCluster, + &bytesPerSector, + &numberOfFreeClusters, + &totalNumberOfClusters); + + if (!result) { + if (!warned_once) { + ib::warn() + << "GetDiskFreeSpace(" << volname << ",...) " + << "failed " + << "- OS error number " << GetLastError(); + warned_once = true; + } + return(DB_SUCCESS); + } + stat_info->block_size = bytesPerSector * sectorsPerCluster; + } else { + stat_info->type = OS_FILE_TYPE_UNKNOWN; + } + + return(DB_SUCCESS); +} + +/** +Sets a sparse flag on Windows file. +@param[in] file file handle +@return true on success, false on error +*/ +#include <versionhelpers.h> +bool os_file_set_sparse_win32(os_file_t file, bool is_sparse) +{ + if (!is_sparse && !IsWindows8OrGreater()) { + /* Cannot unset sparse flag on older Windows. + Until Windows8 it is documented to produce unpredictable results, + if there are unallocated ranges in file.*/ + return false; + } + DWORD temp; + FILE_SET_SPARSE_BUFFER sparse_buffer; + sparse_buffer.SetSparse = is_sparse; + return os_win32_device_io_control(file, + FSCTL_SET_SPARSE, &sparse_buffer, sizeof(sparse_buffer), 0, 0,&temp); +} + + +/** +Change file size on Windows. + +If file is extended, the bytes between old and new EOF +are zeros. + +If file is sparse, "virtual" block is added at the end of +allocated area. + +If file is normal, file system allocates storage. + +@param[in] pathname file path +@param[in] file file handle +@param[in] size size to preserve in bytes +@return true if success */ +bool +os_file_change_size_win32( + const char* pathname, + os_file_t file, + os_offset_t size) +{ + LARGE_INTEGER length; + + length.QuadPart = size; + + BOOL success = SetFilePointerEx(file, length, NULL, FILE_BEGIN); + + if (!success) { + os_file_handle_error_no_exit( + pathname, "SetFilePointerEx", false); + } else { + success = SetEndOfFile(file); + if (!success) { + os_file_handle_error_no_exit( + pathname, "SetEndOfFile", false); + } + } + return(success); +} + +/** Truncates a file at its current position. +@param[in] file Handle to be truncated +@return true if success */ +bool +os_file_set_eof( + FILE* file) +{ + HANDLE h = (HANDLE) _get_osfhandle(fileno(file)); + + return(SetEndOfFile(h)); +} + +#endif /* !_WIN32*/ + +/** Does a syncronous read or write depending upon the type specified +In case of partial reads/writes the function tries +NUM_RETRIES_ON_PARTIAL_IO times to read/write the complete data. +@param[in] type, IO flags +@param[in] file handle to an open file +@param[out] buf buffer where to read +@param[in] offset file offset from the start where to read +@param[in] n number of bytes to read, starting from offset +@param[out] err DB_SUCCESS or error code +@return number of bytes read/written, -1 if error */ +static MY_ATTRIBUTE((warn_unused_result)) +ssize_t +os_file_io( + const IORequest&in_type, + os_file_t file, + void* buf, + ulint n, + os_offset_t offset, + dberr_t* err) +{ + ssize_t original_n = ssize_t(n); + IORequest type = in_type; + ssize_t bytes_returned = 0; + + SyncFileIO sync_file_io(file, buf, n, offset); + + for (ulint i = 0; i < NUM_RETRIES_ON_PARTIAL_IO; ++i) { + + ssize_t n_bytes = sync_file_io.execute(type); + + /* Check for a hard error. Not much we can do now. */ + if (n_bytes < 0) { + + break; + + } else if (n_bytes + bytes_returned == ssize_t(n)) { + + bytes_returned += n_bytes; + + *err = type.maybe_punch_hole(offset, n); + + return(original_n); + } + + /* Handle partial read/write. */ + + ut_ad(ulint(n_bytes + bytes_returned) < n); + + bytes_returned += n_bytes; + + if (type.type != IORequest::READ_MAYBE_PARTIAL) { + const char* op = type.is_read() + ? "read" : "written"; + + ib::warn() + << n + << " bytes should have been " << op << ". Only " + << bytes_returned + << " bytes " << op << ". Retrying" + << " for the remaining bytes."; + } + + /* Advance the offset and buffer by n_bytes */ + sync_file_io.advance(n_bytes); + } + + *err = DB_IO_ERROR; + + if (type.type != IORequest::READ_MAYBE_PARTIAL) { + ib::warn() + << "Retry attempts for " + << (type.is_read() ? "reading" : "writing") + << " partial data failed."; + } + + return(bytes_returned); +} + +/** Does a synchronous write operation in Posix. +@param[in] type IO context +@param[in] file handle to an open file +@param[out] buf buffer from which to write +@param[in] n number of bytes to read, starting from offset +@param[in] offset file offset from the start where to read +@param[out] err DB_SUCCESS or error code +@return number of bytes written, -1 if error */ +static MY_ATTRIBUTE((warn_unused_result)) +ssize_t +os_file_pwrite( + const IORequest& type, + os_file_t file, + const byte* buf, + ulint n, + os_offset_t offset, + dberr_t* err) +{ + ut_ad(type.is_write()); + + ++os_n_file_writes; + + const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_WRITES); + MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_WRITES, monitor); + ssize_t n_bytes = os_file_io(type, file, const_cast<byte*>(buf), + n, offset, err); + MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_WRITES, monitor); + + return(n_bytes); +} + +/** NOTE! Use the corresponding macro os_file_write(), not directly +Requests a synchronous write operation. +@param[in] type IO flags +@param[in] file handle to an open file +@param[out] buf buffer from which to write +@param[in] offset file offset from the start where to read +@param[in] n number of bytes to read, starting from offset +@return error code +@retval DB_SUCCESS if the operation succeeded */ +dberr_t +os_file_write_func( + const IORequest& type, + const char* name, + os_file_t file, + const void* buf, + os_offset_t offset, + ulint n) +{ + dberr_t err; + + ut_ad(n > 0); + + WAIT_ALLOW_WRITES(); + + ssize_t n_bytes = os_file_pwrite(type, file, (byte*)buf, n, offset, &err); + + if ((ulint) n_bytes != n && !os_has_said_disk_full) { + + ib::error() + << "Write to file " << name << " failed at offset " + << offset << ", " << n + << " bytes should have been written," + " only " << n_bytes << " were written." + " Operating system error number " << IF_WIN(GetLastError(),errno) << "." + " Check that your OS and file system" + " support files of this size." + " Check also that the disk is not full" + " or a disk quota exceeded."; +#ifndef _WIN32 + if (strerror(errno) != NULL) { + + ib::error() + << "Error number " << errno + << " means '" << strerror(errno) << "'"; + } + + ib::info() << OPERATING_SYSTEM_ERROR_MSG; +#endif + os_has_said_disk_full = true; + } + + return(err); +} + +/** Does a synchronous read operation in Posix. +@param[in] type IO flags +@param[in] file handle to an open file +@param[out] buf buffer where to read +@param[in] offset file offset from the start where to read +@param[in] n number of bytes to read, starting from offset +@param[out] err DB_SUCCESS or error code +@return number of bytes read, -1 if error */ +static MY_ATTRIBUTE((warn_unused_result)) +ssize_t +os_file_pread( + const IORequest& type, + os_file_t file, + void* buf, + ulint n, + os_offset_t offset, + dberr_t* err) +{ + ut_ad(type.is_read()); + + ++os_n_file_reads; + + const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_READS); + MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor); + ssize_t n_bytes = os_file_io(type, file, buf, n, offset, err); + MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor); + + return(n_bytes); +} + +/** Requests a synchronous positioned read operation. +@return DB_SUCCESS if request was successful, false if fail +@param[in] type IO flags +@param[in] file handle to an open file +@param[out] buf buffer where to read +@param[in] offset file offset from the start where to read +@param[in] n number of bytes to read, starting from offset +@param[out] o number of bytes actually read +@param[in] exit_on_err if true then exit on error +@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((warn_unused_result)) +dberr_t +os_file_read_page( + const IORequest& type, + os_file_t file, + void* buf, + os_offset_t offset, + ulint n, + ulint* o, + bool exit_on_err) +{ + dberr_t err; + + os_bytes_read_since_printout += n; + + ut_ad(n > 0); + + ssize_t n_bytes = os_file_pread(type, file, buf, n, offset, &err); + + if (o) { + *o = n_bytes; + } + + if (ulint(n_bytes) == n || (err != DB_SUCCESS && !exit_on_err)) { + return err; + } + int os_err = IF_WIN((int)GetLastError(), errno); + + if (!os_file_handle_error_cond_exit( + NULL, "read", exit_on_err, false)) { + ib::fatal() + << "Tried to read " << n << " bytes at offset " + << offset << ", but was only able to read " << n_bytes + << ".Cannot read from file. OS error number " + << os_err << "."; + } else { + ib::error() << "Tried to read " << n << " bytes at offset " + << offset << ", but was only able to read " << n_bytes; + } + if (err == DB_SUCCESS) { + err = DB_IO_ERROR; + } + + return err; +} + +/** Retrieves the last error number if an error occurs in a file io function. +The number should be retrieved before any other OS calls (because they may +overwrite the error number). If the number is not known to this program, +the OS error number + 100 is returned. +@param[in] report_all_errors true if we want an error printed + for all errors +@return error number, or OS error number + 100 */ +ulint +os_file_get_last_error( + bool report_all_errors) +{ + return(os_file_get_last_error_low(report_all_errors, false)); +} + +/** Handle errors for file operations. +@param[in] name name of a file or NULL +@param[in] operation operation +@param[in] should_abort whether to abort on an unknown error +@param[in] on_error_silent whether to suppress reports of non-fatal errors +@return true if we should retry the operation */ +static MY_ATTRIBUTE((warn_unused_result)) +bool +os_file_handle_error_cond_exit( + const char* name, + const char* operation, + bool should_abort, + bool on_error_silent) +{ + ulint err; + + err = os_file_get_last_error_low(false, on_error_silent); + + switch (err) { + case OS_FILE_DISK_FULL: + /* We only print a warning about disk full once */ + + if (os_has_said_disk_full) { + + return(false); + } + + /* Disk full error is reported irrespective of the + on_error_silent setting. */ + + if (name) { + + ib::error() + << "Encountered a problem with file '" + << name << "'"; + } + + ib::error() + << "Disk is full. Try to clean the disk to free space."; + + os_has_said_disk_full = true; + + return(false); + + case OS_FILE_AIO_RESOURCES_RESERVED: + case OS_FILE_AIO_INTERRUPTED: + + return(true); + + case OS_FILE_PATH_ERROR: + case OS_FILE_ALREADY_EXISTS: + case OS_FILE_ACCESS_VIOLATION: + + return(false); + + case OS_FILE_SHARING_VIOLATION: + + os_thread_sleep(10000000); /* 10 sec */ + return(true); + + case OS_FILE_OPERATION_ABORTED: + case OS_FILE_INSUFFICIENT_RESOURCE: + + os_thread_sleep(100000); /* 100 ms */ + return(true); + + default: + + /* If it is an operation that can crash on error then it + is better to ignore on_error_silent and print an error message + to the log. */ + + if (should_abort || !on_error_silent) { + ib::error() << "File " + << (name != NULL ? name : "(unknown)") + << ": '" << operation << "'" + " returned OS error " << err << "." + << (should_abort + ? " Cannot continue operation" : ""); + } + + if (should_abort) { + abort(); + } + } + + return(false); +} + +#ifndef _WIN32 +/** Tries to disable OS caching on an opened file descriptor. +@param[in] fd file descriptor to alter +@param[in] file_name file name, used in the diagnostic message +@param[in] name "open" or "create"; used in the diagnostic + message */ +void +os_file_set_nocache( + int fd MY_ATTRIBUTE((unused)), + const char* file_name MY_ATTRIBUTE((unused)), + const char* operation_name MY_ATTRIBUTE((unused))) +{ + /* some versions of Solaris may not have DIRECTIO_ON */ +#if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) + if (directio(fd, DIRECTIO_ON) == -1) { + int errno_save = errno; + + ib::error() + << "Failed to set DIRECTIO_ON on file " + << file_name << "; " << operation_name << ": " + << strerror(errno_save) << "," + " continuing anyway."; + } +#elif defined(O_DIRECT) + if (fcntl(fd, F_SETFL, O_DIRECT) == -1) { + int errno_save = errno; + static bool warning_message_printed = false; + if (errno_save == EINVAL) { + if (!warning_message_printed) { + warning_message_printed = true; +# ifdef UNIV_LINUX + ib::warn() + << "Failed to set O_DIRECT on file" + << file_name << "; " << operation_name + << ": " << strerror(errno_save) << ", " + "continuing anyway. O_DIRECT is " + "known to result in 'Invalid argument' " + "on Linux on tmpfs, " + "see MySQL Bug#26662."; +# else /* UNIV_LINUX */ + goto short_warning; +# endif /* UNIV_LINUX */ + } + } else { +# ifndef UNIV_LINUX +short_warning: +# endif + ib::warn() + << "Failed to set O_DIRECT on file " + << file_name << "; " << operation_name + << " : " << strerror(errno_save) + << ", continuing anyway."; + } + } +#endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */ +} + +#endif /* _WIN32 */ + +/** Check if the file system supports sparse files. +@param fh file handle +@return true if the file system supports sparse files */ +IF_WIN(static,) bool os_is_sparse_file_supported(os_file_t fh) +{ +#ifdef _WIN32 + FILE_ATTRIBUTE_TAG_INFO info; + if (GetFileInformationByHandleEx(fh, FileAttributeTagInfo, + &info, (DWORD)sizeof(info))) { + if (info.FileAttributes != INVALID_FILE_ATTRIBUTES) { + return (info.FileAttributes & FILE_ATTRIBUTE_SPARSE_FILE) != 0; + } + } + return false; +#else + /* We don't know the FS block size, use the sector size. The FS + will do the magic. */ + return DB_SUCCESS == os_file_punch_hole_posix(fh, 0, srv_page_size); +#endif /* _WIN32 */ +} + +/** Extend a file. + +On Windows, extending a file allocates blocks for the file, +unless the file is sparse. + +On Unix, we will extend the file with ftruncate(), if +file needs to be sparse. Otherwise posix_fallocate() is used +when available, and if not, binary zeroes are added to the end +of file. + +@param[in] name file name +@param[in] file file handle +@param[in] size desired file size +@param[in] sparse whether to create a sparse file (no preallocating) +@return whether the operation succeeded */ +bool +os_file_set_size( + const char* name, + os_file_t file, + os_offset_t size, + bool is_sparse) +{ +#ifdef _WIN32 + /* On Windows, changing file size works well and as expected for both + sparse and normal files. + + However, 10.2 up until 10.2.9 made every file sparse in innodb, + causing NTFS fragmentation issues(MDEV-13941). We try to undo + the damage, and unsparse the file.*/ + + if (!is_sparse && os_is_sparse_file_supported(file)) { + if (!os_file_set_sparse_win32(file, false)) + /* Unsparsing file failed. Fallback to writing binary + zeros, to avoid even higher fragmentation.*/ + goto fallback; + } + + return os_file_change_size_win32(name, file, size); + +fallback: +#else + struct stat statbuf; + + if (is_sparse) { + bool success = !ftruncate(file, size); + if (!success) { + ib::error() << "ftruncate of file " << name << " to " + << size << " bytes failed with error " + << errno; + } + return(success); + } + +# ifdef HAVE_POSIX_FALLOCATE + int err; + do { + if (fstat(file, &statbuf)) { + err = errno; + } else { + os_offset_t current_size = statbuf.st_size; + if (current_size >= size) { + return true; + } + current_size &= ~os_offset_t(statbuf.st_blksize - 1); + err = posix_fallocate(file, current_size, + size - current_size); + } + } while (err == EINTR + && srv_shutdown_state <= SRV_SHUTDOWN_INITIATED); + + switch (err) { + case 0: + return true; + default: + ib::error() << "preallocating " + << size << " bytes for file " << name + << " failed with error " << err; + /* fall through */ + case EINTR: + errno = err; + return false; + case EINVAL: + case EOPNOTSUPP: + /* fall back to the code below */ + break; + } +# endif /* HAVE_POSIX_ALLOCATE */ +#endif /* _WIN32*/ + +#ifdef _WIN32 + os_offset_t current_size = os_file_get_size(file); + FILE_STORAGE_INFO info; + if (GetFileInformationByHandleEx(file, FileStorageInfo, &info, + sizeof info)) { + if (info.LogicalBytesPerSector) { + current_size &= ~os_offset_t(info.LogicalBytesPerSector + - 1); + } + } +#else + if (fstat(file, &statbuf)) { + return false; + } + os_offset_t current_size = statbuf.st_size + & ~os_offset_t(statbuf.st_blksize - 1); +#endif + if (current_size >= size) { + return true; + } + + /* Write up to 1 megabyte at a time. */ + ulint buf_size = ut_min(ulint(64), + ulint(size >> srv_page_size_shift)) + << srv_page_size_shift; + + /* Align the buffer for possible raw i/o */ + byte* buf = static_cast<byte*>(aligned_malloc(buf_size, + srv_page_size)); + /* Write buffer full of zeros */ + memset(buf, 0, buf_size); + + while (current_size < size + && srv_shutdown_state <= SRV_SHUTDOWN_INITIATED) { + ulint n_bytes; + + if (size - current_size < (os_offset_t) buf_size) { + n_bytes = (ulint) (size - current_size); + } else { + n_bytes = buf_size; + } + + if (os_file_write(IORequestWrite, name, + file, buf, current_size, n_bytes) != + DB_SUCCESS) { + break; + } + + current_size += n_bytes; + } + + aligned_free(buf); + + return(current_size >= size && os_file_flush(file)); +} + +/** Truncate a file to a specified size in bytes. +@param[in] pathname file path +@param[in] file file to be truncated +@param[in] size size preserved in bytes +@param[in] allow_shrink whether to allow the file to become smaller +@return true if success */ +bool +os_file_truncate( + const char* pathname, + os_file_t file, + os_offset_t size, + bool allow_shrink) +{ + if (!allow_shrink) { + /* Do nothing if the size preserved is larger than or + equal to the current size of file */ + os_offset_t size_bytes = os_file_get_size(file); + + if (size >= size_bytes) { + return(true); + } + } + +#ifdef _WIN32 + return(os_file_change_size_win32(pathname, file, size)); +#else /* _WIN32 */ + return(os_file_truncate_posix(pathname, file, size)); +#endif /* _WIN32 */ +} + +/** NOTE! Use the corresponding macro os_file_read(), not directly this +function! +Requests a synchronous positioned read operation. +@return DB_SUCCESS if request was successful, DB_IO_ERROR on failure +@param[in] type IO flags +@param[in] file handle to an open file +@param[out] buf buffer where to read +@param[in] offset file offset from the start where to read +@param[in] n number of bytes to read, starting from offset +@return error code +@retval DB_SUCCESS if the operation succeeded */ +dberr_t +os_file_read_func( + const IORequest& type, + os_file_t file, + void* buf, + os_offset_t offset, + ulint n) +{ + return(os_file_read_page(type, file, buf, offset, n, NULL, true)); +} + +/** NOTE! Use the corresponding macro os_file_read_no_error_handling(), +not directly this function! +Requests a synchronous positioned read operation. +@return DB_SUCCESS if request was successful, DB_IO_ERROR on failure +@param[in] type IO flags +@param[in] file handle to an open file +@param[out] buf buffer where to read +@param[in] offset file offset from the start where to read +@param[in] n number of bytes to read, starting from offset +@param[out] o number of bytes actually read +@return DB_SUCCESS or error code */ +dberr_t +os_file_read_no_error_handling_func( + const IORequest& type, + os_file_t file, + void* buf, + os_offset_t offset, + ulint n, + ulint* o) +{ + return(os_file_read_page(type, file, buf, offset, n, o, false)); +} + +/** Check the existence and type of the given file. +@param[in] path path name of file +@param[out] exists true if the file exists +@param[out] type Type of the file, if it exists +@return true if call succeeded */ +bool +os_file_status( + const char* path, + bool* exists, + os_file_type_t* type) +{ +#ifdef _WIN32 + return(os_file_status_win32(path, exists, type)); +#else + return(os_file_status_posix(path, exists, type)); +#endif /* _WIN32 */ +} + +/** Free storage space associated with a section of the file. +@param[in] fh Open file handle +@param[in] off Starting offset (SEEK_SET) +@param[in] len Size of the hole +@return DB_SUCCESS or error code */ +dberr_t +os_file_punch_hole( + os_file_t fh, + os_offset_t off, + os_offset_t len) +{ +#ifdef _WIN32 + return os_file_punch_hole_win32(fh, off, len); +#else + return os_file_punch_hole_posix(fh, off, len); +#endif /* _WIN32 */ +} + +/** Free storage space associated with a section of the file. +@param off byte offset from the start (SEEK_SET) +@param len size of the hole in bytes +@return DB_SUCCESS or error code */ +dberr_t IORequest::punch_hole(os_offset_t off, ulint len) const +{ + ulint trim_len = bpage ? bpage->physical_size() - len : 0; + + if (trim_len == 0) { + return(DB_SUCCESS); + } + + off += len; + + /* Check does file system support punching holes for this + tablespace. */ + if (!node->space->punch_hole) { + return DB_IO_NO_PUNCH_HOLE; + } + + dberr_t err = os_file_punch_hole(node->handle, off, trim_len); + + if (err == DB_SUCCESS) { + srv_stats.page_compressed_trim_op.inc(); + } else { + /* If punch hole is not supported, + set space so that it is not used. */ + if (err == DB_IO_NO_PUNCH_HOLE) { + node->space->punch_hole = false; + err = DB_SUCCESS; + } + } + + return (err); +} + +/** This function returns information about the specified file +@param[in] path pathname of the file +@param[out] stat_info information of a file in a directory +@param[in] check_rw_perm for testing whether the file can be opened + in RW mode +@param[in] read_only true if file is opened in read-only mode +@return DB_SUCCESS if all OK */ +dberr_t +os_file_get_status( + const char* path, + os_file_stat_t* stat_info, + bool check_rw_perm, + bool read_only) +{ + dberr_t ret; + +#ifdef _WIN32 + struct _stat64 info; + + ret = os_file_get_status_win32( + path, stat_info, &info, check_rw_perm, read_only); + +#else + struct stat info; + + ret = os_file_get_status_posix( + path, stat_info, &info, check_rw_perm, read_only); + +#endif /* _WIN32 */ + + if (ret == DB_SUCCESS) { + stat_info->ctime = info.st_ctime; + stat_info->atime = info.st_atime; + stat_info->mtime = info.st_mtime; + stat_info->size = info.st_size; + } + + return(ret); +} + + +extern void fil_aio_callback(const IORequest &request); + +static void io_callback(tpool::aiocb* cb) +{ + ut_a(cb->m_err == DB_SUCCESS); + const IORequest request(*static_cast<const IORequest*> + (static_cast<const void*>(cb->m_userdata))); + /* Return cb back to cache*/ + if (cb->m_opcode == tpool::aio_opcode::AIO_PREAD) + { + ut_ad(read_slots->contains(cb)); + read_slots->release(cb); + } + else + { + ut_ad(write_slots->contains(cb)); + write_slots->release(cb); + } + + fil_aio_callback(request); +} + +#ifdef LINUX_NATIVE_AIO +/** Checks if the system supports native linux aio. On some kernel +versions where native aio is supported it won't work on tmpfs. In such +cases we can't use native aio. + +@return: true if supported, false otherwise. */ +static bool is_linux_native_aio_supported() +{ + File fd; + io_context_t io_ctx; + std::string log_file_path = get_log_file_path(); + + memset(&io_ctx, 0, sizeof(io_ctx)); + if (io_setup(1, &io_ctx)) { + + /* The platform does not support native aio. */ + + return(false); + + } + else if (!srv_read_only_mode) { + + /* Now check if tmpdir supports native aio ops. */ + fd = mysql_tmpfile("ib"); + + if (fd < 0) { + ib::warn() + << "Unable to create temp file to check" + " native AIO support."; + + int ret = io_destroy(io_ctx); + ut_a(ret != -EINVAL); + ut_ad(ret != -EFAULT); + + return(false); + } + } + else { + fd = my_open(log_file_path.c_str(), O_RDONLY | O_CLOEXEC, + MYF(0)); + + if (fd == -1) { + + ib::warn() << "Unable to open \"" << log_file_path + << "\" to check native" + << " AIO read support."; + + int ret = io_destroy(io_ctx); + ut_a(ret != EINVAL); + ut_ad(ret != EFAULT); + + return(false); + } + } + + struct io_event io_event; + + memset(&io_event, 0x0, sizeof(io_event)); + + byte* ptr = static_cast<byte*>(aligned_malloc(srv_page_size, + srv_page_size)); + + struct iocb iocb; + + /* Suppress valgrind warning. */ + memset(ptr, 0, srv_page_size); + memset(&iocb, 0x0, sizeof(iocb)); + + struct iocb* p_iocb = &iocb; + + if (!srv_read_only_mode) { + + io_prep_pwrite(p_iocb, fd, ptr, srv_page_size, 0); + + } + else { + ut_a(srv_page_size >= 512); + io_prep_pread(p_iocb, fd, ptr, 512, 0); + } + + int err = io_submit(io_ctx, 1, &p_iocb); + + if (err >= 1) { + /* Now collect the submitted IO request. */ + err = io_getevents(io_ctx, 1, 1, &io_event, NULL); + } + + aligned_free(ptr); + my_close(fd, MYF(MY_WME)); + + switch (err) { + case 1: + { + int ret = io_destroy(io_ctx); + ut_a(ret != -EINVAL); + ut_ad(ret != -EFAULT); + + return(true); + } + + case -EINVAL: + case -ENOSYS: + ib::warn() + << "Linux Native AIO not supported. You can either" + " move " + << (srv_read_only_mode ? log_file_path : "tmpdir") + << " to a file system that supports native" + " AIO or you can set innodb_use_native_aio to" + " FALSE to avoid this message."; + + /* fall through. */ + default: + ib::warn() + << "Linux Native AIO check on " + << (srv_read_only_mode ? log_file_path : "tmpdir") + << "returned error[" << -err << "]"; + } + + int ret = io_destroy(io_ctx); + ut_a(ret != -EINVAL); + ut_ad(ret != -EFAULT); + + return(false); +} +#endif + +int os_aio_init() +{ + int max_write_events= int(srv_n_write_io_threads * + OS_AIO_N_PENDING_IOS_PER_THREAD); + int max_read_events= int(srv_n_read_io_threads * + OS_AIO_N_PENDING_IOS_PER_THREAD); + int max_events= max_read_events + max_write_events; + int ret; +#if LINUX_NATIVE_AIO + if (srv_use_native_aio && !is_linux_native_aio_supported()) + goto disable; +#endif + + ret= srv_thread_pool->configure_aio(srv_use_native_aio, max_events); + +#ifdef LINUX_NATIVE_AIO + if (ret) + { + ut_ad(srv_use_native_aio); +disable: + ib::warn() << "Linux Native AIO disabled."; + srv_use_native_aio= false; + ret= srv_thread_pool->configure_aio(false, max_events); + } +#endif + + if (!ret) + { + read_slots= new io_slots(max_read_events, srv_n_read_io_threads); + write_slots= new io_slots(max_write_events, srv_n_write_io_threads); + } + return ret; +} + + +void os_aio_free() +{ + srv_thread_pool->disable_aio(); + delete read_slots; + delete write_slots; + read_slots= nullptr; + write_slots= nullptr; +} + +/** Wait until there are no pending asynchronous writes. */ +static void os_aio_wait_until_no_pending_writes_low() +{ + bool notify_wait = write_slots->pending_io_count() > 0; + + if (notify_wait) + tpool::tpool_wait_begin(); + + write_slots->wait(); + + if (notify_wait) + tpool::tpool_wait_end(); +} + +/** Wait until there are no pending asynchronous writes. +Only used on FLUSH TABLES...FOR EXPORT. */ +void os_aio_wait_until_no_pending_writes() +{ + os_aio_wait_until_no_pending_writes_low(); + buf_dblwr.wait_flush_buffered_writes(); +} + +/** Request a read or write. +@param type I/O request +@param buf buffer +@param offset file offset +@param n number of bytes +@retval DB_SUCCESS if request was queued successfully +@retval DB_IO_ERROR on I/O error */ +dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, size_t n) +{ + ut_ad(n > 0); + ut_ad((n % OS_FILE_LOG_BLOCK_SIZE) == 0); + ut_ad((offset % OS_FILE_LOG_BLOCK_SIZE) == 0); + ut_ad(type.is_read() || type.is_write()); + ut_ad(type.node); + ut_ad(type.node->is_open()); + +#ifdef WIN_ASYNC_IO + ut_ad((n & 0xFFFFFFFFUL) == n); +#endif /* WIN_ASYNC_IO */ + +#ifdef UNIV_PFS_IO + PSI_file_locker_state state; + PSI_file_locker* locker= nullptr; + register_pfs_file_io_begin(&state, locker, type.node->handle, n, + type.is_write() + ? PSI_FILE_WRITE : PSI_FILE_READ, + __FILE__, __LINE__); +#endif /* UNIV_PFS_IO */ + dberr_t err = DB_SUCCESS; + + if (!type.is_async()) { + err = type.is_read() + ? os_file_read_func(type, type.node->handle, + buf, offset, n) + : os_file_write_func(type, type.node->name, + type.node->handle, + buf, offset, n); +func_exit: +#ifdef UNIV_PFS_IO + register_pfs_file_io_end(locker, n); +#endif /* UNIV_PFS_IO */ + return err; + } + + if (type.is_read()) { + ++os_n_file_reads; + } else { + ++os_n_file_writes; + } + + compile_time_assert(sizeof(IORequest) <= tpool::MAX_AIO_USERDATA_LEN); + io_slots* slots= type.is_read() ? read_slots : write_slots; + tpool::aiocb* cb = slots->acquire(); + + cb->m_buffer = buf; + cb->m_callback = (tpool::callback_func)io_callback; + cb->m_group = slots->get_task_group(); + cb->m_fh = type.node->handle.m_file; + cb->m_len = (int)n; + cb->m_offset = offset; + cb->m_opcode = type.is_read() ? tpool::aio_opcode::AIO_PREAD : tpool::aio_opcode::AIO_PWRITE; + new (cb->m_userdata) IORequest{type}; + + ut_a(reinterpret_cast<size_t>(cb->m_buffer) % OS_FILE_LOG_BLOCK_SIZE + == 0); + ut_a(cb->m_len % OS_FILE_LOG_BLOCK_SIZE == 0); + ut_a(cb->m_offset % OS_FILE_LOG_BLOCK_SIZE == 0); + + if (srv_thread_pool->submit_io(cb)) { + slots->release(cb); + os_file_handle_error(type.node->name, type.is_read() + ? "aio read" : "aio write"); + err = DB_IO_ERROR; + } + + goto func_exit; +} + +/** Prints info of the aio arrays. +@param[in,out] file file where to print */ +void +os_aio_print(FILE* file) +{ + time_t current_time; + double time_elapsed; + + current_time = time(NULL); + time_elapsed = 0.001 + difftime(current_time, os_last_printout); + + fprintf(file, + "Pending flushes (fsync) log: " ULINTPF + "; buffer pool: " ULINTPF "\n" + ULINTPF " OS file reads, " + ULINTPF " OS file writes, " + ULINTPF " OS fsyncs\n", + log_sys.get_pending_flushes(), + ulint{fil_n_pending_tablespace_flushes}, + ulint{os_n_file_reads}, + os_n_file_writes, + os_n_fsyncs); + + const ulint n_reads = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_READS)); + const ulint n_writes = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_WRITES)); + + if (n_reads != 0 || n_writes != 0) { + fprintf(file, + ULINTPF " pending reads, " ULINTPF " pending writes\n", + n_reads, n_writes); + } + + ulint avg_bytes_read = (os_n_file_reads == os_n_file_reads_old) + ? 0 + : os_bytes_read_since_printout + / (os_n_file_reads - os_n_file_reads_old); + + fprintf(file, + "%.2f reads/s, " ULINTPF " avg bytes/read," + " %.2f writes/s, %.2f fsyncs/s\n", + static_cast<double>(os_n_file_reads - os_n_file_reads_old) + / time_elapsed, + avg_bytes_read, + static_cast<double>(os_n_file_writes - os_n_file_writes_old) + / time_elapsed, + static_cast<double>(os_n_fsyncs - os_n_fsyncs_old) + / time_elapsed); + + os_n_file_reads_old = os_n_file_reads; + os_n_file_writes_old = os_n_file_writes; + os_n_fsyncs_old = os_n_fsyncs; + os_bytes_read_since_printout = 0; + + os_last_printout = current_time; +} + +/** Refreshes the statistics used to print per-second averages. */ +void +os_aio_refresh_stats() +{ + os_n_fsyncs_old = os_n_fsyncs; + + os_bytes_read_since_printout = 0; + + os_n_file_reads_old = os_n_file_reads; + + os_n_file_writes_old = os_n_file_writes; + + os_n_fsyncs_old = os_n_fsyncs; + + os_bytes_read_since_printout = 0; + + os_last_printout = time(NULL); +} + + +/** +Set the file create umask +@param[in] umask The umask to use for file creation. */ +void +os_file_set_umask(ulint umask) +{ + os_innodb_umask = umask; +} + +#ifdef _WIN32 + +/* Checks whether physical drive is on SSD.*/ +static bool is_drive_on_ssd(DWORD nr) +{ + char physical_drive_path[32]; + snprintf(physical_drive_path, sizeof(physical_drive_path), + "\\\\.\\PhysicalDrive%lu", nr); + + HANDLE h= CreateFile(physical_drive_path, 0, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + nullptr, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, nullptr); + if (h == INVALID_HANDLE_VALUE) + return false; + + DEVICE_SEEK_PENALTY_DESCRIPTOR seek_penalty; + STORAGE_PROPERTY_QUERY storage_query{}; + storage_query.PropertyId= StorageDeviceSeekPenaltyProperty; + storage_query.QueryType= PropertyStandardQuery; + + bool on_ssd= false; + DWORD bytes_written; + if (DeviceIoControl(h, IOCTL_STORAGE_QUERY_PROPERTY, &storage_query, + sizeof storage_query, &seek_penalty, sizeof seek_penalty, + &bytes_written, nullptr)) + { + on_ssd= seek_penalty.IncursSeekPenalty; + } + else + { + on_ssd= false; + } + CloseHandle(h); + return on_ssd; +} + +/* + Checks whether volume is on SSD, by checking all physical drives + in that volume. +*/ +static bool is_volume_on_ssd(const char *volume_mount_point) +{ + char volume_name[MAX_PATH]; + + if (!GetVolumeNameForVolumeMountPoint(volume_mount_point, volume_name, + array_elements(volume_name))) + { + /* This can fail, e.g if file is on network share */ + return false; + } + + /* Chomp last backslash, this is needed to open volume.*/ + size_t length= strlen(volume_name); + if (length && volume_name[length - 1] == '\\') + volume_name[length - 1]= 0; + + /* Open volume handle */ + HANDLE volume_handle= CreateFile( + volume_name, 0, FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + nullptr, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, nullptr); + + if (volume_handle == INVALID_HANDLE_VALUE) + return false; + + /* + Enumerate all volume extends, check whether all of them are on SSD + */ + + /* Anticipate common case where there is only one extent.*/ + VOLUME_DISK_EXTENTS single_extent; + + /* But also have a place to manage allocated data.*/ + std::unique_ptr<BYTE[]> lifetime; + + DWORD bytes_written; + VOLUME_DISK_EXTENTS *extents= nullptr; + if (DeviceIoControl(volume_handle, IOCTL_VOLUME_GET_VOLUME_DISK_EXTENTS, + nullptr, 0, &single_extent, sizeof(single_extent), + &bytes_written, nullptr)) + { + /* Worked on the first try. Use the preallocated buffer.*/ + extents= &single_extent; + } + else + { + VOLUME_DISK_EXTENTS *last_query= &single_extent; + while (GetLastError() == ERROR_MORE_DATA) + { + DWORD extentCount= last_query->NumberOfDiskExtents; + DWORD allocatedSize= + FIELD_OFFSET(VOLUME_DISK_EXTENTS, Extents[extentCount]); + lifetime.reset(new BYTE[allocatedSize]); + last_query= (VOLUME_DISK_EXTENTS *) lifetime.get(); + if (DeviceIoControl(volume_handle, IOCTL_VOLUME_GET_VOLUME_DISK_EXTENTS, + nullptr, 0, last_query, allocatedSize, + &bytes_written, nullptr)) + { + extents= last_query; + break; + } + } + } + CloseHandle(volume_handle); + if (!extents) + return false; + + for (DWORD i= 0; i < extents->NumberOfDiskExtents; i++) + if (!is_drive_on_ssd(extents->Extents[i].DiskNumber)) + return false; + + return true; +} + +#include <unordered_map> +static bool is_file_on_ssd(char *file_path) +{ + /* Cache of volume_path => volume_info, protected by rwlock.*/ + static std::unordered_map<std::string, bool> cache; + static SRWLOCK lock= SRWLOCK_INIT; + + /* Preset result, in case something fails, e.g we're on network drive.*/ + char volume_path[MAX_PATH]; + if (!GetVolumePathName(file_path, volume_path, array_elements(volume_path))) + return false; + + /* Try cached volume info first.*/ + std::string volume_path_str(volume_path); + bool found; + bool result; + AcquireSRWLockShared(&lock); + auto e= cache.find(volume_path_str); + if ((found= e != cache.end())) + result= e->second; + ReleaseSRWLockShared(&lock); + + if (found) + return result; + + result= is_volume_on_ssd(volume_path); + + /* Update cache */ + AcquireSRWLockExclusive(&lock); + cache[volume_path_str]= result; + ReleaseSRWLockExclusive(&lock); + return result; +} + +#endif + +/** Determine some file metadata when creating or reading the file. +@param file the file that is being created, or OS_FILE_CLOSED */ +void fil_node_t::find_metadata(os_file_t file +#ifndef _WIN32 + , struct stat* statbuf +#endif + ) +{ + if (file == OS_FILE_CLOSED) { + file = handle; + ut_ad(is_open()); + } + +#ifdef _WIN32 /* FIXME: make this unconditional */ + if (space->punch_hole) { + space->punch_hole = os_is_sparse_file_supported(file); + } +#endif + + /* + For the temporary tablespace and during the + non-redo-logged adjustments in + IMPORT TABLESPACE, we do not care about + the atomicity of writes. + + Atomic writes is supported if the file can be used + with atomic_writes (not log file), O_DIRECT is + used (tested in ha_innodb.cc) and the file is + device and file system that supports atomic writes + for the given block size. + */ + space->atomic_write_supported = space->purpose == FIL_TYPE_TEMPORARY + || space->purpose == FIL_TYPE_IMPORT; +#ifdef _WIN32 + on_ssd = is_file_on_ssd(name); + FILE_STORAGE_INFO info; + if (GetFileInformationByHandleEx( + file, FileStorageInfo, &info, sizeof(info))) { + block_size = info.PhysicalBytesPerSectorForAtomicity; + } else { + block_size = 512; + } +#else + struct stat sbuf; + if (!statbuf && !fstat(file, &sbuf)) { + statbuf = &sbuf; + } + if (statbuf) { + block_size = statbuf->st_blksize; + } + on_ssd = space->atomic_write_supported +# ifdef UNIV_LINUX + || (statbuf && fil_system.is_ssd(statbuf->st_dev)) +# endif + ; +#endif + if (!space->atomic_write_supported) { + space->atomic_write_supported = atomic_write + && srv_use_atomic_writes +#ifndef _WIN32 + && my_test_if_atomic_write(file, + space->physical_size()) +#else + /* On Windows, all single sector writes are atomic, + as per WriteFile() documentation on MSDN. + We also require SSD for atomic writes, eventhough + technically it is not necessary- the reason is that + on hard disks, we still want the benefit from + (non-atomic) neighbor page flushing in the buffer + pool code. */ + && srv_page_size == block_size + && on_ssd +#endif + ; + } +} + +/** Read the first page of a data file. +@return whether the page was found valid */ +bool fil_node_t::read_page0() +{ + ut_ad(mutex_own(&fil_system.mutex)); + const unsigned psize = space->physical_size(); +#ifndef _WIN32 + struct stat statbuf; + if (fstat(handle, &statbuf)) { + return false; + } + os_offset_t size_bytes = statbuf.st_size; +#else + os_offset_t size_bytes = os_file_get_size(handle); + ut_a(size_bytes != (os_offset_t) -1); +#endif + const uint32_t min_size = FIL_IBD_FILE_INITIAL_SIZE * psize; + + if (size_bytes < min_size) { + ib::error() << "The size of the file " << name + << " is only " << size_bytes + << " bytes, should be at least " << min_size; + return false; + } + + page_t *page= static_cast<byte*>(aligned_malloc(psize, psize)); + if (os_file_read(IORequestRead, handle, page, 0, psize) + != DB_SUCCESS) { + ib::error() << "Unable to read first page of file " << name; +corrupted: + aligned_free(page); + return false; + } + + const ulint space_id = memcmp_aligned<2>( + FIL_PAGE_SPACE_ID + page, + FSP_HEADER_OFFSET + FSP_SPACE_ID + page, 4) + ? ULINT_UNDEFINED + : mach_read_from_4(FIL_PAGE_SPACE_ID + page); + ulint flags = fsp_header_get_flags(page); + const uint32_t size = fsp_header_get_field(page, FSP_SIZE); + const uint32_t free_limit = fsp_header_get_field(page, FSP_FREE_LIMIT); + const uint32_t free_len = flst_get_len(FSP_HEADER_OFFSET + FSP_FREE + + page); + if (!fil_space_t::is_valid_flags(flags, space->id)) { + ulint cflags = fsp_flags_convert_from_101(flags); + if (cflags == ULINT_UNDEFINED) { +invalid: + ib::error() + << "Expected tablespace flags " + << ib::hex(space->flags) + << " but found " << ib::hex(flags) + << " in the file " << name; + goto corrupted; + } + + ulint cf = cflags & ~FSP_FLAGS_MEM_MASK; + ulint sf = space->flags & ~FSP_FLAGS_MEM_MASK; + + if (!fil_space_t::is_flags_equal(cf, sf) + && !fil_space_t::is_flags_equal(sf, cf)) { + goto invalid; + } + + flags = cflags; + } + + ut_ad(!(flags & FSP_FLAGS_MEM_MASK)); + + /* Try to read crypt_data from page 0 if it is not yet read. */ + if (!space->crypt_data) { + space->crypt_data = fil_space_read_crypt_data( + fil_space_t::zip_size(flags), page); + } + aligned_free(page); + + if (UNIV_UNLIKELY(space_id != space->id)) { + ib::error() << "Expected tablespace id " << space->id + << " but found " << space_id + << " in the file " << name; + return false; + } + +#ifdef UNIV_LINUX + find_metadata(handle, &statbuf); +#else + find_metadata(); +#endif + /* Truncate the size to a multiple of extent size. */ + ulint mask = psize * FSP_EXTENT_SIZE - 1; + + if (size_bytes <= mask) { + /* .ibd files start smaller than an + extent size. Do not truncate valid data. */ + } else { + size_bytes &= ~os_offset_t(mask); + } + + space->flags = (space->flags & FSP_FLAGS_MEM_MASK) | flags; + + space->punch_hole = space->is_compressed(); + this->size = uint32_t(size_bytes / psize); + space->set_sizes(this->size); + ut_ad(space->free_limit == 0 || space->free_limit == free_limit); + ut_ad(space->free_len == 0 || space->free_len == free_len); + space->size_in_header = size; + space->free_limit = free_limit; + space->free_len = free_len; + return true; +} + +#else +#include "univ.i" +#endif /* !UNIV_INNOCHECKSUM */ + +/** Normalizes a directory path for the current OS: +On Windows, we convert '/' to '\', else we convert '\' to '/'. +@param[in,out] str A null-terminated directory and file path */ +void +os_normalize_path( + char* str) +{ + if (str != NULL) { + for (; *str; str++) { + if (*str == OS_PATH_SEPARATOR_ALT) { + *str = OS_PATH_SEPARATOR; + } + } + } +} diff --git a/storage/innobase/os/os0thread.cc b/storage/innobase/os/os0thread.cc new file mode 100644 index 00000000..f3533acf --- /dev/null +++ b/storage/innobase/os/os0thread.cc @@ -0,0 +1,131 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file os/os0thread.cc +The interface to the operating system thread control primitives + +Created 9/8/1995 Heikki Tuuri +*******************************************************/ + +#include "univ.i" +#include "srv0srv.h" + +#ifdef _WIN32 +bool os_thread_eq(os_thread_id_t a, os_thread_id_t b) { return a == b; } +void os_thread_yield() { SwitchToThread(); } +os_thread_id_t os_thread_get_curr_id() { return GetCurrentThreadId(); } +#endif + +/****************************************************************//** +Creates a new thread of execution. The execution starts from +the function given. +NOTE: We count the number of threads in os_thread_exit(). A created +thread should always use that to exit so thatthe thread count will be +decremented. +We do not return an error code because if there is one, we crash here. */ +os_thread_t os_thread_create(os_thread_func_t func, void *arg) +{ + os_thread_id_t new_thread_id; + +#ifdef _WIN32 + HANDLE handle; + + handle = CreateThread(NULL, /* no security attributes */ + 0, /* default size stack */ + func, + arg, + 0, /* thread runs immediately */ + &new_thread_id); + + if (!handle) { + /* If we cannot start a new thread, life has no meaning. */ + ib::fatal() << "CreateThread returned " << GetLastError(); + } + + CloseHandle(handle); + + return((os_thread_t)new_thread_id); +#else /* _WIN32 else */ + + pthread_attr_t attr; + + int ret = pthread_attr_init(&attr); + if (UNIV_UNLIKELY(ret)) { + fprintf(stderr, + "InnoDB: Error: pthread_attr_init() returned %d\n", + ret); + abort(); + } + + ret = pthread_create(&new_thread_id, &attr, func, arg); + + ut_a(ret == 0); + + pthread_attr_destroy(&attr); + +#endif /* not _WIN32 */ + + return((os_thread_t)new_thread_id); +} + +/** Detach and terminate the current thread. */ +ATTRIBUTE_NORETURN void os_thread_exit() +{ +#ifdef UNIV_DEBUG_THREAD_CREATION + ib::info() << "Thread exits, id " << os_thread_get_curr_id(); +#endif + +#ifdef UNIV_PFS_THREAD + pfs_delete_thread(); +#endif + +#ifdef _WIN32 + ExitThread(0); +#else + pthread_detach(pthread_self()); + pthread_exit(NULL); +#endif +} + +/*****************************************************************//** +The thread sleeps at least the time given in microseconds. */ +void +os_thread_sleep( +/*============*/ + ulint tm) /*!< in: time in microseconds */ +{ +#ifdef _WIN32 + Sleep((DWORD) tm / 1000); +#elif defined(HAVE_NANOSLEEP) + struct timespec t; + + t.tv_sec = tm / 1000000; + t.tv_nsec = (tm % 1000000) * 1000; + + ::nanosleep(&t, NULL); +#else + struct timeval t; + + t.tv_sec = tm / 1000000; + t.tv_usec = tm % 1000000; + + select(0, NULL, NULL, NULL, &t); +#endif /* _WIN32 */ +} |