summaryrefslogtreecommitdiffstats
path: root/storage/innobase/os
diff options
context:
space:
mode:
Diffstat (limited to 'storage/innobase/os')
-rw-r--r--storage/innobase/os/os0event.cc515
-rw-r--r--storage/innobase/os/os0file.cc4349
-rw-r--r--storage/innobase/os/os0thread.cc131
3 files changed, 4995 insertions, 0 deletions
diff --git a/storage/innobase/os/os0event.cc b/storage/innobase/os/os0event.cc
new file mode 100644
index 00000000..f18633cc
--- /dev/null
+++ b/storage/innobase/os/os0event.cc
@@ -0,0 +1,515 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file os/os0event.cc
+The interface to the operating system condition variables.
+
+Created 2012-09-23 Sunny Bains
+*******************************************************/
+
+#include "os0event.h"
+#include "ut0mutex.h"
+#include <my_sys.h>
+
+#ifdef _WIN32
+#include <windows.h>
+#include <synchapi.h>
+/** Native condition variable. */
+typedef CONDITION_VARIABLE os_cond_t;
+#else
+/** Native condition variable */
+typedef pthread_cond_t os_cond_t;
+#endif /* _WIN32 */
+
+/** InnoDB condition variable: a manual-reset event built from a mutex,
+a native condition variable, a "set" flag (m_set) and a counter
+(signal_count). */
+struct os_event {
+ os_event() UNIV_NOTHROW;
+
+ ~os_event() UNIV_NOTHROW;
+
+ /**
+ Destroys the condition variable (nothing to do on Windows) and the mutex */
+ void destroy() UNIV_NOTHROW
+ {
+#ifndef _WIN32
+ int ret = pthread_cond_destroy(&cond_var);
+ ut_a(ret == 0);
+#endif /* !_WIN32 */
+
+ mutex.destroy();
+ }
+
+ /** Set the event: unless it is already set, mark it set, increment
+ signal_count and wake all waiters (see broadcast()) */
+ void set() UNIV_NOTHROW
+ {
+ mutex.enter();
+
+ if (!m_set) {
+ broadcast();
+ }
+
+ mutex.exit();
+ }
+ /** Clear the set flag. @return signal_count as read under the mutex */
+ int64_t reset() UNIV_NOTHROW
+ {
+ mutex.enter();
+
+ if (m_set) {
+ m_set = false;
+ }
+
+ int64_t ret = signal_count;
+
+ mutex.exit();
+
+ return(ret);
+ }
+
+ /**
+ Waits for an event object until it is in the signaled state.
+
+ Typically, if the event has been signalled after the os_event_reset()
+ we'll return immediately because event->m_set == true.
+ There are, however, situations (e.g.: sync_array code) where we may
+ lose this information. For example:
+
+ thread A calls os_event_reset()
+ thread B calls os_event_set() [event->m_set == true]
+ thread C calls os_event_reset() [event->m_set == false]
+ thread A calls os_event_wait() [infinite wait!]
+ thread C calls os_event_wait() [infinite wait!]
+
+ Where such a scenario is possible, to avoid infinite wait, the
+ value returned by reset() should be passed in as
+ reset_sig_count. */
+ void wait_low(int64_t reset_sig_count) UNIV_NOTHROW;
+
+ /**
+ Waits for an event object until it is in the signaled state or
+ a timeout is exceeded.
+ @param time_in_usec - timeout in microseconds,
+ or OS_SYNC_INFINITE_TIME
+ @param reset_sig_count- zero or the value returned by
+ previous call of os_event_reset().
+ @return 0 if success, OS_SYNC_TIME_EXCEEDED if timeout was exceeded */
+ ulint wait_time_low(
+ ulint time_in_usec,
+ int64_t reset_sig_count) UNIV_NOTHROW;
+
+ /** @return true if the event is in the signalled state
+ (m_set is read while holding the mutex). */
+ bool is_set() const UNIV_NOTHROW
+ {
+ mutex.enter();
+ bool is_set = m_set;
+ mutex.exit();
+ return is_set;
+ }
+
+private:
+ /**
+ Initialize the mutex and the condition variable */
+ void init() UNIV_NOTHROW
+ {
+
+ mutex.init();
+
+#ifdef _WIN32
+ InitializeConditionVariable(&cond_var);
+#else
+ {
+ int ret;
+
+ ret = pthread_cond_init(&cond_var, NULL);
+ ut_a(ret == 0);
+ }
+#endif /* _WIN32 */
+ }
+
+ /**
+ Wait on condition variable without a timeout; wait_low() calls
+ this after mutex.enter(). Any failure of the native wait is fatal. */
+ void wait() UNIV_NOTHROW
+ {
+#ifdef _WIN32
+ if (!SleepConditionVariableCS(&cond_var, mutex, INFINITE)) {
+ ut_error;
+ }
+#else
+ {
+ int ret;
+
+ ret = pthread_cond_wait(&cond_var, mutex);
+ ut_a(ret == 0);
+ }
+#endif /* _WIN32 */
+ }
+
+ /**
+ Marks the event as set, increments signal_count and
+ wakes all threads waiting for condition variable */
+ void broadcast() UNIV_NOTHROW
+ {
+ m_set = true;
+ ++signal_count;
+
+#ifdef _WIN32
+ WakeAllConditionVariable(&cond_var);
+#else
+ {
+ int ret;
+
+ ret = pthread_cond_broadcast(&cond_var);
+ ut_a(ret == 0);
+ }
+#endif /* _WIN32 */
+ }
+
+ /**
+ Wakes one thread waiting for condition variable
+ (does not touch m_set or signal_count) */
+ void signal() UNIV_NOTHROW
+ {
+#ifdef _WIN32
+ WakeConditionVariable(&cond_var);
+#else
+ {
+ int ret;
+
+ ret = pthread_cond_signal(&cond_var);
+ ut_a(ret == 0);
+ }
+#endif /* _WIN32 */
+ }
+
+ /**
+ Do a timed wait on condition variable.
+ @param abstime - absolute timeout (non-Windows builds only)
+ @param time_in_ms - timeout in milliseconds (Windows builds only)
+ @return true if timed out, false otherwise */
+ bool timed_wait(
+#ifndef _WIN32
+ const timespec* abstime
+#else
+ DWORD time_in_ms
+#endif /* !_WIN32 */
+ );
+
+private:
+ bool m_set; /*!< this is true when the
+ event is in the signaled
+ state, i.e., a thread does
+ not stop if it tries to wait
+ for this event */
+ int64_t signal_count; /*!< this is incremented
+ each time the event becomes
+ signaled */
+ mutable OSMutex mutex; /*!< this mutex protects
+ m_set and signal_count */
+
+
+ os_cond_t cond_var; /*!< condition variable is
+ used in waiting for the event */
+
+protected:
+ // Disable copying
+ os_event(const os_event&);
+ os_event& operator=(const os_event&);
+};
+
+/**
+Do a timed wait on condition variable. wait_time_low() calls this
+after mutex.enter(). Exactly one of the two parameters exists in a build.
+@param abstime - absolute time to wait until (non-Windows builds)
+@param time_in_ms - timeout in milliseconds (Windows builds)
+@return true if timed out */
+bool
+os_event::timed_wait(
+#ifndef _WIN32
+ const timespec* abstime
+#else
+ DWORD time_in_ms
+#endif /* !_WIN32 */
+)
+{
+#ifdef _WIN32
+ BOOL ret;
+
+ ret = SleepConditionVariableCS(&cond_var, mutex, time_in_ms);
+
+ if (!ret) {
+ DWORD err = GetLastError();
+
+ /* FQDN=msdn.microsoft.com
+ @see http://$FQDN/en-us/library/ms686301%28VS.85%29.aspx,
+
+ "Condition variables are subject to spurious wakeups
+ (those not associated with an explicit wake) and stolen wakeups
+ (another thread manages to run before the woken thread)."
+ Check for both types of timeouts.
+ Conditions are checked by the caller.*/
+ if (err == WAIT_TIMEOUT || err == ERROR_TIMEOUT) {
+ return(true);
+ }
+ }
+
+ ut_a(ret);
+
+ return(false);
+#else
+ int ret;
+
+ ret = pthread_cond_timedwait(&cond_var, mutex, abstime);
+
+ switch (ret) {
+ case 0:
+ case ETIMEDOUT:
+ /* We play it safe by checking for EINTR even though
+ according to the POSIX documentation it can't return EINTR.
+ EINTR is reported to the caller as "not timed out". */
+ case EINTR:
+ break;
+
+ default:
+ ib::error() << "pthread_cond_timedwait() returned: " << ret
+ << ": abstime={" << abstime->tv_sec << ","
+ << abstime->tv_nsec << "}";
+ ut_error;
+ }
+
+ return(ret == ETIMEDOUT);
+#endif /* _WIN32 */
+}
+
+/**
+Blocks until the event is in the signaled state.
+
+Normally a set() issued after the caller's reset() is noticed right away
+through m_set. That information can be lost, though (e.g. in the
+sync_array code):
+
+thread A calls os_event_reset()
+thread B calls os_event_set() [event->m_set == true]
+thread C calls os_event_reset() [event->m_set == false]
+thread A calls os_event_wait() [infinite wait!]
+thread C calls os_event_wait() [infinite wait!]
+
+To survive that interleaving, pass the value returned by reset() as
+reset_sig_count: the wait also ends as soon as signal_count differs
+from it. Zero means "use the signal_count seen on entry". */
+void
+os_event::wait_low(
+	int64_t	reset_sig_count) UNIV_NOTHROW
+{
+	mutex.enter();
+
+	/* No baseline supplied by the caller: take the current count. */
+	const int64_t	baseline = reset_sig_count
+		? reset_sig_count
+		: signal_count;
+
+	for (;;) {
+		if (m_set || signal_count != baseline) {
+			break;
+		}
+
+		/* wait() may wake up spuriously, hence the re-check
+		at the top of the loop. */
+		wait();
+	}
+
+	mutex.exit();
+}
+
+/**
+Waits for an event object until it is in the signaled state or
+a timeout is exceeded.
+@param time_in_usec - timeout in microseconds, or OS_SYNC_INFINITE_TIME
+ to wait without a deadline
+@param reset_sig_count - zero or the value returned by previous call
+ of os_event_reset(); zero means "use the
+ signal_count seen on entry"
+@return 0 if success, OS_SYNC_TIME_EXCEEDED if timeout was exceeded */
+ulint
+os_event::wait_time_low(
+	ulint		time_in_usec,
+	int64_t		reset_sig_count) UNIV_NOTHROW
+{
+	bool		timed_out = false;
+
+#ifdef _WIN32
+	DWORD		time_in_ms;
+
+	if (time_in_usec != OS_SYNC_INFINITE_TIME) {
+		time_in_ms = DWORD(time_in_usec / 1000);
+	} else {
+		time_in_ms = INFINITE;
+	}
+#else
+	/* An "infinite" deadline used to be encoded as
+	tv_sec = (time_t) ULINT_MAX. Assuming ULINT_MAX is the all-ones
+	ulint value, that is -1 wherever time_t is a signed type of the
+	same width, i.e. a deadline in the past, which makes
+	pthread_cond_timedwait() report ETIMEDOUT immediately.
+	Use the untimed wait() for that case instead. */
+	const bool	infinite = time_in_usec == OS_SYNC_INFINITE_TIME;
+	struct timespec	abstime;
+
+	if (!infinite) {
+		ulonglong usec = ulonglong(time_in_usec) + my_hrtime().val;
+		abstime.tv_sec = static_cast<time_t>(usec / 1000000);
+		abstime.tv_nsec = static_cast<uint>((usec % 1000000) * 1000);
+	} else {
+		/* Never passed to timed_wait(); set only so that the
+		structure is fully initialized. */
+		abstime.tv_sec = 0;
+		abstime.tv_nsec = 0;
+	}
+
+	ut_a(abstime.tv_nsec <= 999999999);
+
+#endif /* _WIN32 */
+
+	mutex.enter();
+
+	if (!reset_sig_count) {
+		reset_sig_count = signal_count;
+	}
+
+	do {
+		/* Re-checked after every wakeup: wakeups may be spurious. */
+		if (m_set || signal_count != reset_sig_count) {
+
+			break;
+		}
+
+#ifndef _WIN32
+		if (infinite) {
+			wait();
+		} else {
+			timed_out = timed_wait(&abstime);
+		}
+#else
+		timed_out = timed_wait(time_in_ms);
+#endif /* !_WIN32 */
+
+	} while (!timed_out);
+
+	mutex.exit();
+
+	return(timed_out ? OS_SYNC_TIME_EXCEEDED : 0);
+}
+
+/** Constructor: starts in the non-signaled state. */
+os_event::os_event() UNIV_NOTHROW
+	: m_set(false),
+	/* signal_count is what os_event_reset() returns, and that value
+	can then be passed to os_event_wait_low(). Zero is reserved in
+	os_event_wait_low() for callers that do not want to pass any
+	signal_count value, so counting starts at 1 to keep the two
+	cases distinguishable. */
+	signal_count(1)
+{
+	init();
+}
+
+/** Destructor: releases the condition variable and the mutex
+through destroy(). */
+os_event::~os_event() UNIV_NOTHROW
+{
+ destroy();
+}
+
+/**
+Creates an event semaphore, i.e., a semaphore which may just have two
+states: signaled and nonsignaled. The created event is manual reset: it
+stays signaled until it is reset explicitly with os_event_reset().
+The name argument is accepted but not used.
+@return the event handle */
+os_event_t os_event_create(const char*)
+{
+	os_event_t	event = UT_NEW_NOKEY(os_event());
+
+	return event;
+}
+
+/**
+Check if the event is set.
+@param[in]	event	event to test
+@return true if set */
+bool
+os_event_is_set(const os_event_t event)
+{
+	const bool	signaled = event->is_set();
+
+	return signaled;
+}
+
+/**
+Puts an event semaphore into the signaled state, which lets the
+threads waiting on it proceed.
+@param[in,out]	event	event to set */
+void
+os_event_set(os_event_t event)
+{
+	event->set();
+}
+
+/**
+Resets an event semaphore to the nonsignaled state. Waiting threads will
+stop to wait for the event.
+The return value should be passed to os_event_wait_low() if it is desired
+that this thread should not wait in case of an intervening call to
+os_event_set() between this os_event_reset() and the
+os_event_wait_low() call. See comments for os_event_wait_low().
+@param[in,out]	event	event to reset
+@return current signal_count. */
+int64_t
+os_event_reset(os_event_t event)
+{
+	const int64_t	sig_count = event->reset();
+
+	return sig_count;
+}
+
+/**
+Waits for an event object until it is in the signaled state or
+a timeout is exceeded.
+@param[in,out]	event		event to wait
+@param[in]	time_in_usec	timeout in microseconds, or
+				OS_SYNC_INFINITE_TIME
+@param[in]	reset_sig_count	zero or the value returned by previous
+				call of os_event_reset()
+@return 0 if success, OS_SYNC_TIME_EXCEEDED if timeout was exceeded */
+ulint
+os_event_wait_time_low(
+	os_event_t	event,
+	ulint		time_in_usec,
+	int64_t		reset_sig_count)
+{
+	const ulint	status = event->wait_time_low(
+		time_in_usec, reset_sig_count);
+
+	return status;
+}
+
+/**
+Waits for an event object until it is in the signaled state.
+
+Where an intervening set()/reset() pair could otherwise be missed, pass
+the value returned by os_event_reset() as reset_sig_count to avoid an
+infinite wait.
+@param[in]	event		event to wait
+@param[in]	reset_sig_count	zero or the value returned by previous
+				call of os_event_reset() */
+void
+os_event_wait_low(
+	os_event_t	event,
+	int64_t		reset_sig_count)
+{
+	event->wait_low(reset_sig_count);
+}
+
+/**
+Frees an event object and clears the caller's handle.
+@param[in,out]	event	event to free; set to NULL on return */
+void
+os_event_destroy(os_event_t& event)
+{
+	UT_DELETE(event);
+
+	/* Do not leave a dangling handle behind. */
+	event = NULL;
+}
diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc
new file mode 100644
index 00000000..7a6829e7
--- /dev/null
+++ b/storage/innobase/os/os0file.cc
@@ -0,0 +1,4349 @@
+/***********************************************************************
+
+Copyright (c) 1995, 2019, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2009, Percona Inc.
+Copyright (c) 2013, 2021, MariaDB Corporation.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+***********************************************************************/
+
+/**************************************************//**
+@file os/os0file.cc
+The interface to the operating system file i/o primitives
+
+Created 10/21/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef UNIV_INNOCHECKSUM
+#include "os0file.h"
+#include "sql_const.h"
+
+#ifdef UNIV_LINUX
+# include <sys/types.h>
+# include <sys/stat.h>
+#endif
+
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "fil0fil.h"
+#include "fsp0fsp.h"
+#ifdef HAVE_LINUX_UNISTD_H
+#include "unistd.h"
+#endif
+#include "os0event.h"
+#include "os0thread.h"
+
+#include <vector>
+#include <tpool_structs.h>
+
+#ifdef LINUX_NATIVE_AIO
+#include <libaio.h>
+#endif /* LINUX_NATIVE_AIO */
+
+#ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
+# include <fcntl.h>
+# include <linux/falloc.h>
+#endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
+
+#if defined(UNIV_LINUX) && defined(HAVE_SYS_IOCTL_H)
+# include <sys/ioctl.h>
+# ifndef DFS_IOCTL_ATOMIC_WRITE_SET
+# define DFS_IOCTL_ATOMIC_WRITE_SET _IOW(0x95, 2, uint)
+# endif
+#endif
+
+#ifdef _WIN32
+#include <winioctl.h>
+#else
+// my_test_if_atomic_write()
+#include <my_sys.h>
+#endif
+
+#include "buf0dblwr.h"
+
+#include <thread>
+#include <chrono>
+
+/* Per-IO operation environment: wraps a tpool::cache of tpool::aiocb
+control blocks and a tpool::task_group */
+class io_slots
+{
+private:
+ tpool::cache<tpool::aiocb> m_cache;
+ tpool::task_group m_group;
+ int m_max_aio;
+public:
+ io_slots(int max_submitted_io, int max_callback_concurrency) :
+ m_cache(max_submitted_io),
+ m_group(max_callback_concurrency),
+ m_max_aio(max_submitted_io)
+ {
+ }
+ /* Get cached AIO control block */
+ tpool::aiocb* acquire()
+ {
+ return m_cache.get();
+ }
+ /* Release AIO control block back to cache */
+ void release(tpool::aiocb* aiocb)
+ {
+ m_cache.put(aiocb);
+ }
+ /* Ask the cache whether it contains this control block */
+ bool contains(tpool::aiocb* aiocb)
+ {
+ return m_cache.contains(aiocb);
+ }
+
+ /* Wait for completions of all AIO operations */
+ void wait()
+ {
+ m_cache.wait();
+ }
+ /* m_max_aio minus m_cache.size(); presumably the number of control
+ blocks currently handed out - confirm tpool::cache::size() semantics */
+ size_t pending_io_count()
+ {
+ return (size_t)m_max_aio - m_cache.size();
+ }
+ /* Access the task group owned by this object */
+ tpool::task_group* get_task_group()
+ {
+ return &m_group;
+ }
+ /* Calls wait() before the members are destroyed */
+ ~io_slots()
+ {
+ wait();
+ }
+};
+
+static io_slots *read_slots;
+static io_slots *write_slots;
+
+/** Number of retries for partial I/O's */
+constexpr ulint NUM_RETRIES_ON_PARTIAL_IO = 10;
+
+/* This specifies the file permissions InnoDB uses when it creates files in
+Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
+my_umask */
+
+#ifndef _WIN32
+/** Umask for creating files */
+static ulint os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
+#else
+/** Umask for creating files */
+static ulint os_innodb_umask = 0;
+#endif /* _WIN32 */
+
+
+#ifdef WITH_INNODB_DISALLOW_WRITES
+#define WAIT_ALLOW_WRITES() os_event_wait(srv_allow_writes_event)
+#else
+#define WAIT_ALLOW_WRITES() do { } while (0)
+#endif /* WITH_INNODB_DISALLOW_WRITES */
+
+
+Atomic_counter<ulint> os_n_file_reads;
+static ulint os_bytes_read_since_printout;
+ulint os_n_file_writes;
+ulint os_n_fsyncs;
+static ulint os_n_file_reads_old;
+static ulint os_n_file_writes_old;
+static ulint os_n_fsyncs_old;
+
+static time_t os_last_printout;
+bool os_has_said_disk_full;
+
+/** Default Zip compression level */
+extern uint page_zip_level;
+
+#ifdef UNIV_PFS_IO
+/* Keys to register InnoDB I/O with performance schema */
+mysql_pfs_key_t innodb_data_file_key;
+mysql_pfs_key_t innodb_log_file_key;
+mysql_pfs_key_t innodb_temp_file_key;
+#endif
+
+/** Handle errors for file operations.
+@param[in] name name of a file or NULL
+@param[in] operation operation
+@param[in] should_abort whether to abort on an unknown error
+@param[in] on_error_silent whether to suppress reports of non-fatal errors
+@return true if we should retry the operation */
+static MY_ATTRIBUTE((warn_unused_result))
+bool
+os_file_handle_error_cond_exit(
+ const char* name,
+ const char* operation,
+ bool should_abort,
+ bool on_error_silent);
+
+/** Does error handling when a file operation fails.
+@param[in] name name of a file or NULL
+@param[in] operation operation name that failed
+@return true if we should retry the operation */
+static
+bool
+os_file_handle_error(
+ const char* name,
+ const char* operation)
+{
+ /* Exit in case of unknown error */
+ return(os_file_handle_error_cond_exit(name, operation, true, false));
+}
+
+/** Does error handling when a file operation fails.
+@param[in] name name of a file or NULL
+@param[in] operation operation name that failed
+@param[in] on_error_silent if true then don't print any message to the log.
+@return true if we should retry the operation */
+static
+bool
+os_file_handle_error_no_exit(
+ const char* name,
+ const char* operation,
+ bool on_error_silent)
+{
+ /* Don't exit in case of unknown error */
+ return(os_file_handle_error_cond_exit(
+ name, operation, false, on_error_silent));
+}
+
+/** Report a failed RENAME. A full disk is reported only once, guarded
+by os_has_said_disk_full; every other failure is always logged.
+@param name	old name of the file
+@param new_name	new name of the file */
+static void os_file_handle_rename_error(const char* name, const char* new_name)
+{
+	const bool	disk_full
+		= os_file_get_last_error(true) == OS_FILE_DISK_FULL;
+
+	if (!disk_full) {
+		ib::error() << "Cannot rename file '" << name << "' to '"
+			<< new_name << "'";
+		return;
+	}
+
+	if (os_has_said_disk_full) {
+		/* Already reported once. */
+		return;
+	}
+
+	os_has_said_disk_full = true;
+
+	ib::error() << "Full disk prevents renaming file '"
+		<< name << "' to '" << new_name << "'";
+}
+
+
+#ifdef _WIN32
+
+/**
+ Wrapper around Windows DeviceIoControl() function.
+
+ Works synchronously, also in case for handle opened
+ for async access (i.e with FILE_FLAG_OVERLAPPED).
+
+ Accepts the same parameters as DeviceIoControl(),except
+ last parameter (OVERLAPPED).
+
+ The OVERLAPPED structure is created here, using the event returned
+ by tpool::win_get_syncio_event(). bytes_returned is filled in by
+ GetOverlappedResult(), not by DeviceIoControl() itself.
+ @return the DeviceIoControl() result if the call failed outright,
+ otherwise the result of GetOverlappedResult()
+*/
+static
+BOOL
+os_win32_device_io_control(
+ HANDLE handle,
+ DWORD code,
+ LPVOID inbuf,
+ DWORD inbuf_size,
+ LPVOID outbuf,
+ DWORD outbuf_size,
+ LPDWORD bytes_returned
+)
+{
+ OVERLAPPED overlapped = { 0 };
+ overlapped.hEvent = tpool::win_get_syncio_event();
+ BOOL result = DeviceIoControl(handle, code, inbuf, inbuf_size, outbuf,
+ outbuf_size, NULL, &overlapped);
+
+ if (result || (GetLastError() == ERROR_IO_PENDING)) {
+ /* Wait for async io to complete (last argument TRUE = block) */
+ result = GetOverlappedResult(handle, &overlapped, bytes_returned, TRUE);
+ }
+
+ return result;
+}
+
+#endif
+
+
+
+/** Helper class for doing synchronous file IO. Currently, the objective
+is to hide the OS specific code, so that the higher level functions aren't
+peppered with #ifdef, which makes the code flow difficult to follow. */
+class SyncFileIO
+{
+public:
+ /** Constructor
+ @param[in] fh File handle
+ @param[in,out] buf Buffer to read/write
+ @param[in] n Number of bytes to read/write; must be positive
+ (checked by a debug assertion)
+ @param[in] offset Offset where to read or write */
+ SyncFileIO(os_file_t fh, void *buf, ulint n, os_offset_t offset) :
+ m_fh(fh), m_buf(buf), m_n(static_cast<ssize_t>(n)), m_offset(offset)
+ { ut_ad(m_n > 0); }
+
+ /** Do the read/write
+ @param[in] request The IO context and type
+ @return the number of bytes read/written or negative value on error */
+ ssize_t execute(const IORequest &request);
+
+ /** Move the read/write offset up to where the partial IO succeeded:
+ the offset and the buffer pointer move forward by n_bytes and the
+ remaining byte count shrinks by the same amount.
+ @param[in] n_bytes The number of bytes to advance */
+ void advance(ssize_t n_bytes)
+ {
+ m_offset+= n_bytes;
+ ut_ad(m_n >= n_bytes);
+ m_n-= n_bytes;
+ m_buf= reinterpret_cast<uchar*>(m_buf) + n_bytes;
+ }
+
+private:
+ /** Open file handle */
+ const os_file_t m_fh;
+ /** Buffer to read/write */
+ void *m_buf;
+ /** Number of bytes to read/write */
+ ssize_t m_n;
+ /** Offset from where to read/write */
+ os_offset_t m_offset;
+};
+
+#undef USE_FILE_LOCK
+#ifndef _WIN32
+/* On Windows, mandatory locking is used */
+# define USE_FILE_LOCK
+#endif
+#ifdef USE_FILE_LOCK
+/** Obtain an exclusive lock on a file.
+The lock is a non-blocking fcntl(F_SETLK) write lock with l_start and
+l_len both 0. Nothing is locked if my_disable_locking is set.
+@param[in]	fd	file descriptor
+@param[in]	name	file name, used only in the error message
+@return 0 on success, -1 if the lock could not be obtained */
+static
+int
+os_file_lock(
+	int		fd,
+	const char*	name)
+{
+	if (my_disable_locking) {
+		return 0;
+	}
+
+	struct flock	lk;
+
+	lk.l_type = F_WRLCK;
+	lk.l_whence = SEEK_SET;
+	lk.l_start = lk.l_len = 0;
+
+	if (fcntl(fd, F_SETLK, &lk) == -1) {
+		/* Capture errno right away: the error message is written
+		out before the check below and doing so may change errno. */
+		const int	err = errno;
+
+		ib::error()
+			<< "Unable to lock " << name
+			<< " error: " << err;
+
+		if (err == EAGAIN || err == EACCES) {
+
+			ib::info()
+				<< "Check that you do not already have"
+				" another mysqld process using the"
+				" same InnoDB data or log files.";
+		}
+
+		return(-1);
+	}
+
+	return(0);
+}
+#endif /* USE_FILE_LOCK */
+
+
+/** Create a temporary file. This function is like tmpfile(3), but
+the temporary file is created in the directory given by the mysql server
+configuration parameter (--tmpdir). Failure is logged as an error.
+@return temporary file handle, or NULL on error */
+FILE*
+os_file_create_tmpfile()
+{
+ FILE* file = NULL;
+ WAIT_ALLOW_WRITES();
+ File fd = mysql_tmpfile("ib");
+
+ if (fd >= 0) {
+ file = my_fdopen(fd, 0, O_RDWR|O_TRUNC|O_CREAT|FILE_BINARY,
+ MYF(MY_WME));
+ if (!file) {
+ my_close(fd, MYF(MY_WME));
+ }
+ }
+
+ if (file == NULL) {
+
+ ib::error()
+ << "Unable to create temporary file; errno: "
+ << errno;
+ }
+
+ return(file);
+}
+
+/** Read the beginning of a file into a NUL-terminated string.
+The file is rewound first, then at most size - 1 bytes are read into str
+and a terminating NUL is appended. Nothing happens when size is 0. All
+errors are silently ignored. Mostly meant for temporary files.
+@param[in,out]	file	File to read from
+@param[in,out]	str	Buffer where to read
+@param[in]	size	Size of buffer */
+void
+os_file_read_string(
+	FILE*	file,
+	char*	str,
+	ulint	size)
+{
+	if (size == 0) {
+		/* No room even for the terminator. */
+		return;
+	}
+
+	rewind(file);
+
+	const size_t	n_read = fread(str, 1, size - 1, file);
+
+	str[n_read] = '\0';
+}
+
+/** Build a new path name by replacing the basename of an old path.
+old_path is a full path name including the extension. tablename is in
+the normal form "databasename/tablename"; the new base name is whatever
+follows its last forward slash (or all of it if there is none). Both
+input strings are null terminated.
+
+The result is newly allocated; the caller owns it and must free it
+when it is no longer needed.
+
+@param[in]	old_path	Pathname
+@param[in]	tablename	Contains new base name
+@return own: new full pathname */
+char*
+os_file_make_new_pathname(
+	const char*	old_path,
+	const char*	tablename)
+{
+	/* The table name proper follows the '/' that separates it from
+	the database name. */
+	const char*	name_sep = strrchr(tablename, '/');
+	const char*	base_name = name_sep ? name_sep + 1 : tablename;
+
+	/* Keep only the directory part of the old path, i.e. drop the
+	old basename.ibd that starts after the last separator. */
+	const char*	dir_end = strrchr(old_path, OS_PATH_SEPARATOR);
+	const ulint	dir_len = dir_end
+		? ulint(dir_end - old_path)
+		: strlen(old_path);
+
+	/* sizeof "/.ibd" accounts for the separator, the extension and
+	the terminating NUL. */
+	const ulint	new_path_len
+		= dir_len + strlen(base_name) + sizeof "/.ibd";
+	char*		new_path
+		= static_cast<char*>(ut_malloc_nokey(new_path_len));
+
+	memcpy(new_path, old_path, dir_len);
+
+	snprintf(new_path + dir_len, new_path_len - dir_len,
+		 "%c%s.ibd", OS_PATH_SEPARATOR, base_name);
+
+	return(new_path);
+}
+
+/** This function reduces a null-terminated full remote path name into
+the path that is sent by MySQL for DATA DIRECTORY clause. It replaces
+the 'databasename/tablename.ibd' found at the end of the path with just
+'tablename'.
+
+Since the result is always smaller than the path sent in, no new memory
+is allocated. The caller should allocate memory for the path sent in.
+This function manipulates that path in place.
+
+If the path format is not as expected, just return. The result is used
+to inform a SHOW CREATE TABLE command. Note that on such an early return
+the buffer may already have been shortened: the last period, and then the
+last separator, are overwritten with NUL before the next lookup is made.
+@param[in,out] data_dir_path Full path/data_dir_path */
+void
+os_file_make_data_dir_path(
+ char* data_dir_path)
+{
+ /* Replace the period before the extension with a null byte. */
+ char* ptr = strrchr((char*) data_dir_path, '.');
+
+ if (ptr == NULL) {
+ return;
+ }
+
+ ptr[0] = '\0';
+
+ /* The tablename starts after the last slash. */
+ ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR);
+
+ if (ptr == NULL) {
+ return;
+ }
+
+ ptr[0] = '\0';
+
+ char* tablename = ptr + 1;
+
+ /* The databasename starts after the next to last slash. */
+ ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR);
+
+ if (ptr == NULL) {
+ return;
+ }
+
+ ulint tablename_len = strlen(tablename);
+ /* Source and destination may overlap, hence memmove() */
+ memmove(++ptr, tablename, tablename_len);
+
+ ptr[tablename_len] = '\0';
+}
+
+/** Check if the path refers to the root of a drive using a pointer
+to the last directory separator that the caller has fixed.
+The path counts as a root when that separator is its first character
+or, on Windows, its third character following a ':' (as in "X:\").
+@param[in] path path name
+@param[in] last_slash last directory separator in the path
+@return true if this path is a drive root, false if not */
+UNIV_INLINE
+bool
+os_file_is_root(
+ const char* path,
+ const char* last_slash)
+{
+ return(
+#ifdef _WIN32
+ (last_slash == path + 2 && path[1] == ':') ||
+#endif /* _WIN32 */
+ last_slash == path);
+}
+
+/** Return the parent directory component of a null-terminated path.
+Return a new buffer containing the string up to, but not including,
+the final component of the path.
+The path returned will not contain a trailing separator.
+Do not return a root path, return NULL instead.
+The final component trimmed off may be a filename or a directory name.
+If the final component is the only component of the path, return NULL.
+It is the caller's responsibility to free the returned string after it
+is no longer needed.
+@param[in] path Path name
+@return own: parent directory of the path */
+static
+char*
+os_file_get_parent_dir(
+ const char* path)
+{
+ bool has_trailing_slash = false;
+
+ /* Find the offset of the last slash */
+ const char* last_slash = strrchr(path, OS_PATH_SEPARATOR);
+
+ if (!last_slash) {
+ /* No slash in the path, return NULL */
+ return(NULL);
+ }
+
+ /* Ok, there is a slash. Is there anything after it? */
+ if (static_cast<size_t>(last_slash - path + 1) == strlen(path)) {
+ has_trailing_slash = true;
+ }
+
+ /* Reduce repetitive slashes: move to the first slash of the run. */
+ while (last_slash > path
+ && last_slash[-1] == OS_PATH_SEPARATOR) {
+ last_slash--;
+ }
+
+ /* Check for the root of a drive. */
+ if (os_file_is_root(path, last_slash)) {
+ return(NULL);
+ }
+
+ /* If a trailing slash prevented the first strrchr() from trimming
+ the last component of the path, trim that component now. */
+ if (has_trailing_slash) {
+ /* Back up to the previous slash. */
+ last_slash--;
+ while (last_slash > path
+ && last_slash[0] != OS_PATH_SEPARATOR) {
+ last_slash--;
+ }
+
+ /* Reduce repetitive slashes. */
+ while (last_slash > path
+ && last_slash[-1] == OS_PATH_SEPARATOR) {
+ last_slash--;
+ }
+ }
+
+ /* Check for the root of a drive. This also catches a path whose only
+ component had a trailing slash, where last_slash ended up at path. */
+ if (os_file_is_root(path, last_slash)) {
+ return(NULL);
+ }
+
+ if (last_slash - path < 0) {
+ /* Sanity check, it prevents gcc from trying to handle this case which
+ * results in warnings for some optimized builds */
+ return (NULL);
+ }
+
+ /* Non-trivial directory component: copy everything before last_slash */
+
+ return(mem_strdupl(path, ulint(last_slash - path)));
+}
+#ifdef UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR
+
+/* Test the function os_file_get_parent_dir.
+Both strings are copied and normalized before use; a mismatch between
+the returned and the expected parent is reported with ib::fatal().
+@param[in]	child_dir	path handed to os_file_get_parent_dir()
+@param[in]	expected_dir	expected parent, or NULL if none is expected */
+void
+test_os_file_get_parent_dir(
+	const char*	child_dir,
+	const char*	expected_dir)
+{
+	char*	child = mem_strdup(child_dir);
+	char*	expected = expected_dir == NULL ? NULL
+		: mem_strdup(expected_dir);
+
+	/* os_file_get_parent_dir() assumes that separators are
+	converted to OS_PATH_SEPARATOR. */
+	os_normalize_path(child);
+	if (expected != NULL) {
+		os_normalize_path(expected);
+	}
+
+	char*	parent = os_file_get_parent_dir(child);
+
+	/* strcmp() must only see two real strings: when either side is
+	NULL the outcome is wrong unless both are NULL. */
+	bool	unexpected = (expected == NULL || parent == NULL)
+		? (parent != expected)
+		: (0 != strcmp(parent, expected));
+	if (unexpected) {
+		/* Never stream a NULL char pointer into the message. */
+		ib::fatal() << "os_file_get_parent_dir('" << child
+			<< "') returned '" << (parent ? parent : "NULL")
+			<< "', instead of '"
+			<< (expected ? expected : "NULL") << "'.";
+	}
+	ut_free(parent);
+	ut_free(child);
+	ut_free(expected);
+}
+
+/* Run os_file_get_parent_dir() against a fixed table of paths and the
+parent expected for each of them (NULL where no parent is expected). */
+void
+unit_test_os_file_get_parent_dir()
+{
+	static const struct {
+		const char*	child;
+		const char*	parent;
+	} cases[] = {
+		{"/usr/lib/a", "/usr/lib"},
+		{"/usr/", NULL},
+		{"//usr//", NULL},
+		{"usr", NULL},
+		{"usr//", NULL},
+		{"/", NULL},
+		{"//", NULL},
+		{".", NULL},
+		{"..", NULL},
+# ifdef _WIN32
+		{"D:", NULL},
+		{"D:/", NULL},
+		{"D:\\", NULL},
+		{"D:/data", NULL},
+		{"D:/data/", NULL},
+		{"D:\\data\\", NULL},
+		{"D:///data/////", NULL},
+		{"D:\\\\\\data\\\\\\\\", NULL},
+		{"D:/data//a", "D:/data"},
+		{"D:\\data\\\\a", "D:\\data"},
+		{"D:///data//a///b/", "D:///data//a"},
+		{"D:\\\\\\data\\\\a\\\\\\b\\", "D:\\\\\\data\\\\a"},
+#endif /* _WIN32 */
+	};
+
+	for (size_t i = 0; i < sizeof cases / sizeof cases[0]; i++) {
+		test_os_file_get_parent_dir(cases[i].child, cases[i].parent);
+	}
+}
+#endif /* UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR */
+
+
+/** Creates all missing subdirectories along the given path, working
+recursively from the parent directory of path upwards. Refused in
+read-only mode.
+@param[in]	path	Path name
+@return DB_SUCCESS if OK, otherwise error code. */
+dberr_t
+os_file_create_subdirs_if_needed(
+	const char*	path)
+{
+	if (srv_read_only_mode) {
+
+		ib::error()
+			<< "read only mode set. Can't create "
+			<< "subdirectories '" << path << "'";
+
+		return(DB_READ_ONLY);
+	}
+
+	char*	parent = os_file_get_parent_dir(path);
+
+	if (parent == NULL) {
+		/* The parent is root or cwd: nothing to create. */
+		return(DB_SUCCESS);
+	}
+
+	os_file_type_t	type;
+	bool		exists;
+	bool		ok = os_file_status(parent, &exists, &type);
+	dberr_t		err = DB_SUCCESS;
+
+	if (ok && !exists) {
+		/* Make sure the ancestors exist before the parent itself
+		is created. */
+		err = os_file_create_subdirs_if_needed(parent);
+
+		if (err == DB_SUCCESS) {
+			ok = os_file_create_directory(parent, false);
+		}
+	}
+
+	ut_free(parent);
+
+	if (err != DB_SUCCESS) {
+		return(err);
+	}
+
+	return(ok ? DB_SUCCESS : DB_ERROR);
+}
+
+
+
+/** Issue the positional read or write described by this object.
+Reads go through pread(), writes through pwrite(); on Windows the
+tpool:: equivalents are used.
+@param[in] request The IO context and type
+@return the number of bytes read/written or negative value on error */
+ssize_t
+SyncFileIO::execute(const IORequest& request)
+{
+    if (request.is_read()) {
+#ifdef _WIN32
+        return(tpool::pread(m_fh, m_buf, m_n, m_offset));
+#else
+        return(pread(m_fh, m_buf, m_n, m_offset));
+#endif
+    }
+
+    /* Anything that is not a read must be a write */
+    ut_ad(request.is_write());
+#ifdef _WIN32
+    return(tpool::pwrite(m_fh, m_buf, m_n, m_offset));
+#else
+    return(pwrite(m_fh, m_buf, m_n, m_offset));
+#endif
+}
+
+#ifndef _WIN32
+/** Free storage space associated with a section of the file.
+Uses fallocate(FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE) when the build
+defines HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE; otherwise nothing is done.
+@param[in] fh Open file handle
+@param[in] off Starting offset (SEEK_SET)
+@param[in] len Size of the hole
+@return DB_SUCCESS, DB_IO_NO_PUNCH_HOLE if hole punching is not
+available (errno ENOTSUP or no build support), or DB_IO_ERROR */
+static
+dberr_t
+os_file_punch_hole_posix(
+    os_file_t fh,
+    os_offset_t off,
+    os_offset_t len)
+{
+
+#ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
+    const int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
+
+    if (fallocate(fh, mode, off, len) == 0) {
+        return(DB_SUCCESS);
+    }
+
+    /* Capture errno immediately so that the logging code below
+    cannot overwrite it before it is inspected and reported. */
+    const int err = errno;
+
+    if (err == ENOTSUP) {
+        return(DB_IO_NO_PUNCH_HOLE);
+    }
+
+    /* Include the file handle so the message shows the full call */
+    ib::warn()
+        << "fallocate(" << fh
+        << ", FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, "
+        << off << ", " << len << ") returned errno: "
+        << err;
+
+    return(DB_IO_ERROR);
+
+#elif defined(UNIV_SOLARIS)
+
+    // Use F_FREESP
+
+#endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
+
+    return(DB_IO_NO_PUNCH_HOLE);
+}
+
+
+
+/** Retrieves the last error number if an error occurs in a file io function.
+The number should be retrieved before any other OS calls (because they may
+overwrite the error number). If the number is not known to this program,
+the OS error number + OS_FILE_ERROR_MAX is returned.
+@param[in] report_all_errors true if we want an error message
+ printed of all errors
+@param[in] on_error_silent true then don't print any diagnostic
+ to the log
+@return 0 if errno is 0, a known OS_FILE_ code, or OS error number
+ + OS_FILE_ERROR_MAX */
+static
+ulint
+os_file_get_last_error_low(
+    bool report_all_errors,
+    bool on_error_silent)
+{
+    const int os_err = errno;
+
+    if (os_err == 0) {
+        return(0);
+    }
+
+    /* ENOSPC and EEXIST are only logged when all errors are requested */
+    const bool quiet = on_error_silent
+        || os_err == ENOSPC
+        || os_err == EEXIST;
+
+    if (report_all_errors || !quiet) {
+
+        ib::error()
+            << "Operating system error number "
+            << os_err
+            << " in a file operation.";
+
+        switch (os_err) {
+        case ENOENT:
+            ib::error()
+                << "The error means the system"
+                " cannot find the path specified.";
+
+            if (srv_is_being_started) {
+                ib::error()
+                    << "If you are installing InnoDB,"
+                    " remember that you must create"
+                    " directories yourself, InnoDB"
+                    " does not create them.";
+            }
+            break;
+
+        case EACCES:
+            ib::error()
+                << "The error means mysqld does not have"
+                " the access rights to the directory.";
+            break;
+
+        default:
+            if (const char* text = strerror(os_err)) {
+                ib::error()
+                    << "Error number " << os_err << " means '"
+                    << text << "'";
+            }
+
+            ib::info() << OPERATING_SYSTEM_ERROR_MSG;
+        }
+    }
+
+    /* Translate the errno values this program knows about */
+    if (os_err == ENOSPC) {
+        return(OS_FILE_DISK_FULL);
+    }
+    if (os_err == ENOENT) {
+        return(OS_FILE_NOT_FOUND);
+    }
+    if (os_err == EEXIST) {
+        return(OS_FILE_ALREADY_EXISTS);
+    }
+    if (os_err == EXDEV || os_err == ENOTDIR || os_err == EISDIR) {
+        return(OS_FILE_PATH_ERROR);
+    }
+    /* EAGAIN and EINTR get a dedicated code only with native AIO */
+    if (os_err == EAGAIN && srv_use_native_aio) {
+        return(OS_FILE_AIO_RESOURCES_RESERVED);
+    }
+    if (os_err == EINTR && srv_use_native_aio) {
+        return(OS_FILE_AIO_INTERRUPTED);
+    }
+    if (os_err == EACCES) {
+        return(OS_FILE_ACCESS_VIOLATION);
+    }
+
+    return(OS_FILE_ERROR_MAX + os_err);
+}
+
+/** Wrapper to fsync() or fdatasync() that retries the call on some errors.
+ENOLCK is retried after a 200 ms sleep and EINTR immediately; both are
+bounded by assertions on the shared failure counter. Any other error is
+reported through ib::fatal().
+@param[in] file open file handle
+@return 0 if success */
+static int os_file_sync_posix(os_file_t file)
+{
+#if !defined(HAVE_FDATASYNC) || HAVE_DECL_FDATASYNC == 0
+  auto sync_fn= fsync;
+  auto sync_name= "fsync()";
+#else
+  auto sync_fn= fdatasync;
+  auto sync_name= "fdatasync()";
+#endif
+
+  for (ulint failures= 0;;)
+  {
+    ++os_n_fsyncs;
+
+    if (sync_fn(file) == 0)
+      return 0;
+
+    if (errno == ENOLCK)
+    {
+      ++failures;
+      ut_a(failures < 1000);
+
+      /* Only warn on every 100th failure to limit log noise */
+      if (failures % 100 == 0)
+        ib::warn() << sync_name << ": No locks available; retrying";
+
+      std::this_thread::sleep_for(std::chrono::milliseconds(200));
+    }
+    else if (errno == EINTR)
+    {
+      ++failures;
+      ut_a(failures < 2000);
+    }
+    else
+      ib::fatal() << sync_name << " returned " << errno;
+  }
+}
+
+/** Check the existence and type of the given file using stat().
+@param[in] path path name of file
+@param[out] exists true if the file exists
+@param[out] type Type of the file, if it exists
+@return true if call succeeded */
+static
+bool
+os_file_status_posix(
+    const char* path,
+    bool* exists,
+    os_file_type_t* type)
+{
+    struct stat st;
+
+    if (stat(path, &st) != 0) {
+        *exists = false;
+
+        switch (errno) {
+        case ENOENT:
+        case ENOTDIR:
+        case ENAMETOOLONG:
+            /* The path does not name an existing file */
+            return(true);
+        }
+
+        /* stat() failed for some other reason */
+        os_file_handle_error_no_exit(path, "stat", false);
+        return(false);
+    }
+
+    *exists = true;
+
+    const mode_t mode = st.st_mode;
+
+    if (S_ISDIR(mode)) {
+        *type = OS_FILE_TYPE_DIR;
+    } else if (S_ISLNK(mode)) {
+        *type = OS_FILE_TYPE_LINK;
+    } else if (S_ISREG(mode)) {
+        *type = OS_FILE_TYPE_FILE;
+    } else {
+        *type = OS_FILE_TYPE_UNKNOWN;
+    }
+
+    return(true);
+}
+
+/** NOTE! Use the corresponding macro os_file_flush(), not directly this
+function!
+Flushes the write buffers of a given file to the disk.
+@param[in] file handle to a file
+@return true if success */
+bool
+os_file_flush_func(
+    os_file_t file)
+{
+    WAIT_ALLOW_WRITES();
+
+    const int rc = os_file_sync_posix(file);
+
+    /* Since Linux returns EINVAL if the 'file' is actually a raw device,
+    we choose to ignore that error if we are using raw disks */
+    if (rc == 0 || (srv_start_raw_disk_in_use && errno == EINVAL)) {
+        return(true);
+    }
+
+    ib::error() << "The OS said file flush did not succeed";
+
+    os_file_handle_error(NULL, "flush");
+
+    /* It is a fatal error if a file flush does not succeed, because then
+    the database can get corrupt on disk */
+    ut_error;
+
+    return(false);
+}
+
+/** NOTE! Use the corresponding macro os_file_create_simple(), not directly
+this function!
+A simple function to open or create a file.
+@param[in] name name of the file or path as a null-terminated
+ string
+@param[in] create_mode OS_FILE_OPEN, OS_FILE_CREATE or
+ OS_FILE_CREATE_PATH; any other value is
+ rejected unless read_only is set
+@param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
+@param[in] read_only if true, read only checks are enforced
+@param[out] success true if succeed, false if error
+@return handle to the file, not defined if error, error number
+ can be retrieved with os_file_get_last_error */
+pfs_os_file_t
+os_file_create_simple_func(
+    const char* name,
+    ulint create_mode,
+    ulint access_type,
+    bool read_only,
+    bool* success)
+{
+    pfs_os_file_t file;
+
+    *success = false;
+
+    int create_flag;
+    const char* mode_str = NULL;
+
+    if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) {
+        WAIT_ALLOW_WRITES();
+    }
+
+    ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
+    ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
+
+    if (create_mode == OS_FILE_OPEN) {
+        mode_str = "OPEN";
+
+        if (access_type == OS_FILE_READ_ONLY) {
+
+            create_flag = O_RDONLY;
+
+        } else if (read_only) {
+
+            create_flag = O_RDONLY;
+
+        } else {
+            create_flag = O_RDWR;
+        }
+
+    } else if (read_only) {
+
+        /* In read-only mode nothing is ever created */
+        mode_str = "OPEN";
+        create_flag = O_RDONLY;
+
+    } else if (create_mode == OS_FILE_CREATE) {
+
+        mode_str = "CREATE";
+        create_flag = O_RDWR | O_CREAT | O_EXCL;
+
+    } else if (create_mode == OS_FILE_CREATE_PATH) {
+
+        mode_str = "CREATE PATH";
+        /* Create subdirs along the path if needed. The helper
+        returns a dberr_t, so it must be compared against DB_SUCCESS
+        rather than converted to bool directly. */
+
+        *success = os_file_create_subdirs_if_needed(name)
+            == DB_SUCCESS;
+
+        if (!*success) {
+
+            ib::error()
+                << "Unable to create subdirectories '"
+                << name << "'";
+
+            return(OS_FILE_CLOSED);
+        }
+
+        create_flag = O_RDWR | O_CREAT | O_EXCL;
+        create_mode = OS_FILE_CREATE;
+    } else {
+
+        ib::error()
+            << "Unknown file create mode ("
+            << create_mode
+            << ") for file '" << name << "'";
+
+        return(OS_FILE_CLOSED);
+    }
+
+    bool retry;
+
+    /* Retry open() for as long as the error handler asks for it */
+    do {
+        file = open(name, create_flag | O_CLOEXEC, os_innodb_umask);
+
+        if (file == -1) {
+            *success = false;
+            retry = os_file_handle_error(
+                name,
+                create_mode == OS_FILE_OPEN
+                ? "open" : "create");
+        } else {
+            *success = true;
+            retry = false;
+        }
+
+    } while (retry);
+
+    /* This function is always called for data files, we should disable
+    OS caching (O_DIRECT) here as we do in os_file_create_func(), so
+    we open the same file in the same mode, see man page of open(2). */
+    if (!srv_read_only_mode
+        && *success
+        && (srv_file_flush_method == SRV_O_DIRECT
+            || srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)) {
+
+        os_file_set_nocache(file, name, mode_str);
+    }
+
+#ifdef USE_FILE_LOCK
+    /* A non-zero result from os_file_lock() is treated as failure:
+    the file is closed again and the call reports an error. */
+    if (!read_only
+        && *success
+        && (access_type == OS_FILE_READ_WRITE)
+        && os_file_lock(file, name)) {
+
+        *success = false;
+        close(file);
+        file = -1;
+    }
+#endif /* USE_FILE_LOCK */
+
+    return(file);
+}
+
+/** This function attempts to create a directory named pathname. The new
+directory gets default permissions. On Unix the permissions are
+(0770 & ~umask). If the directory exists already, nothing is done and
+the call succeeds, unless the fail_if_exists arguments is true.
+If another error occurs, such as a permission error, this does not crash,
+but reports the error and returns false.
+@param[in] pathname directory name as null-terminated string
+@param[in] fail_if_exists if true, pre-existing directory is treated as
+ an error.
+@return true if call succeeds, false on error */
+bool
+os_file_create_directory(
+    const char* pathname,
+    bool fail_if_exists)
+{
+    WAIT_ALLOW_WRITES();
+
+    if (mkdir(pathname, 0770) == 0) {
+        return(true);
+    }
+
+    /* An already existing directory is fine unless the caller objects */
+    if (errno == EEXIST && !fail_if_exists) {
+        return(true);
+    }
+
+    os_file_handle_error_no_exit(pathname, "mkdir", false);
+
+    return(false);
+}
+
+/** NOTE! Use the corresponding macro os_file_create(), not directly
+this function!
+Opens an existing file or creates a new one.
+@param[in] name name of the file or path as a null-terminated
+ string
+@param[in] create_mode create mode (OS_FILE_OPEN, OS_FILE_OPEN_RAW,
+ OS_FILE_OPEN_RETRY, OS_FILE_CREATE or
+ OS_FILE_OVERWRITE), optionally combined with
+ OS_FILE_ON_ERROR_NO_EXIT and/or
+ OS_FILE_ON_ERROR_SILENT
+@param[in] purpose OS_FILE_AIO, if asynchronous, non-buffered I/O
+ is desired, OS_FILE_NORMAL, if any normal file;
+ NOTE that it also depends on type, os_aio_..
+ and srv_.. variables whether we really use async
+ I/O or unbuffered I/O: look in the function
+ source code for the exact rules; in this
+ implementation the value is only asserted
+@param[in] type OS_DATA_FILE, OS_DATA_FILE_NO_O_DIRECT or
+ OS_LOG_FILE
+@param[in] read_only true, if read only checks should be enforced
+@param[out] success true if succeeded
+@return handle to the file, not defined if error, error number
+ can be retrieved with os_file_get_last_error */
+pfs_os_file_t
+os_file_create_func(
+ const char* name,
+ ulint create_mode,
+ ulint purpose,
+ ulint type,
+ bool read_only,
+ bool* success)
+{
+ bool on_error_no_exit;
+ bool on_error_silent;
+
+ *success = false;
+
+ DBUG_EXECUTE_IF(
+ "ib_create_table_fail_disk_full",
+ *success = false;
+ errno = ENOSPC;
+ return(OS_FILE_CLOSED);
+ );
+
+ int create_flag;
+ const char* mode_str = NULL;
+
+ on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT /* error-handling flags travel inside create_mode */
+ ? true : false;
+ on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
+ ? true : false;
+
+ create_mode &= ulint(~(OS_FILE_ON_ERROR_NO_EXIT
+ | OS_FILE_ON_ERROR_SILENT)); /* strip the flags so the comparisons below see only the mode */
+
+ if (create_mode == OS_FILE_OPEN
+ || create_mode == OS_FILE_OPEN_RAW
+ || create_mode == OS_FILE_OPEN_RETRY) {
+
+ mode_str = "OPEN";
+
+ create_flag = read_only ? O_RDONLY : O_RDWR;
+
+ } else if (read_only) {
+
+ mode_str = "OPEN";
+
+ create_flag = O_RDONLY;
+
+ } else if (create_mode == OS_FILE_CREATE) {
+
+ mode_str = "CREATE";
+ create_flag = O_RDWR | O_CREAT | O_EXCL;
+
+ } else if (create_mode == OS_FILE_OVERWRITE) {
+
+ mode_str = "OVERWRITE";
+ create_flag = O_RDWR | O_CREAT | O_TRUNC;
+
+ } else {
+ ib::error()
+ << "Unknown file create mode (" << create_mode << ")"
+ << " for file '" << name << "'";
+
+ return(OS_FILE_CLOSED);
+ }
+
+ ut_a(type == OS_LOG_FILE
+ || type == OS_DATA_FILE
+ || type == OS_DATA_FILE_NO_O_DIRECT);
+
+ ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
+
+ /* We let O_DSYNC only affect log files; O_SYNC is used instead
+ when the platform does not define O_DSYNC */
+
+ if (!read_only
+ && type == OS_LOG_FILE
+ && srv_file_flush_method == SRV_O_DSYNC) {
+#ifdef O_DSYNC
+ create_flag |= O_DSYNC;
+#else
+ create_flag |= O_SYNC;
+#endif
+ }
+
+ os_file_t file;
+ bool retry;
+
+ do {
+ file = open(name, create_flag | O_CLOEXEC, os_innodb_umask);
+
+ if (file == -1) {
+ const char* operation;
+
+ operation = (create_mode == OS_FILE_CREATE
+ && !read_only) ? "create" : "open";
+
+ *success = false;
+
+ if (on_error_no_exit) {
+ retry = os_file_handle_error_no_exit(
+ name, operation, on_error_silent);
+ } else {
+ retry = os_file_handle_error(name, operation);
+ }
+ } else {
+ *success = true;
+ retry = false;
+ }
+
+ } while (retry); /* open() is repeated for as long as the error handler returns true */
+
+ /* We disable OS caching (O_DIRECT) only on data files */
+ if (!read_only
+ && *success
+ && type != OS_LOG_FILE
+ && type != OS_DATA_FILE_NO_O_DIRECT
+ && (srv_file_flush_method == SRV_O_DIRECT
+ || srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)) {
+
+ os_file_set_nocache(file, name, mode_str);
+ }
+
+#ifdef USE_FILE_LOCK
+ if (!read_only
+ && *success
+ && create_mode != OS_FILE_OPEN_RAW
+ && os_file_lock(file, name)) { /* a non-zero result is handled as "lock not obtained" */
+
+ if (create_mode == OS_FILE_OPEN_RETRY) {
+
+ ib::info()
+ << "Retrying to lock the first data file";
+
+ for (int i = 0; i < 100; i++) { /* at most 100 further attempts */
+ os_thread_sleep(1000000); /* presumably microseconds, i.e. 1 s per attempt - TODO confirm */
+
+ if (!os_file_lock(file, name)) {
+ *success = true;
+ return(file);
+ }
+ }
+
+ ib::info()
+ << "Unable to open the first data file";
+ }
+
+ *success = false;
+ close(file);
+ file = -1;
+ }
+#endif /* USE_FILE_LOCK */
+
+ return(file);
+}
+
+/** NOTE! Use the corresponding macro
+os_file_create_simple_no_error_handling(), not directly this function!
+A simple function to open or create a file. A failing open() is not
+reported or retried here; the caller only sees *success == false.
+@param[in] name name of the file or path as a null-terminated
+ string
+@param[in] create_mode create mode
+@param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
+ OS_FILE_READ_ALLOW_DELETE; the last option
+ is used by a backup program reading the file
+@param[in] read_only if true read only mode checks are enforced
+@param[out] success true if succeeded
+@return own: handle to the file, not defined if error, error number
+ can be retrieved with os_file_get_last_error */
+pfs_os_file_t
+os_file_create_simple_no_error_handling_func(
+    const char* name,
+    ulint create_mode,
+    ulint access_type,
+    bool read_only,
+    bool* success)
+{
+    if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) {
+        WAIT_ALLOW_WRITES();
+    }
+
+    ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
+    ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
+
+    *success = false;
+
+    int open_flags;
+
+    if (create_mode == OS_FILE_OPEN) {
+
+        if (access_type == OS_FILE_READ_ONLY || read_only) {
+            open_flags = O_RDONLY;
+        } else {
+            ut_a(access_type == OS_FILE_READ_WRITE
+                 || access_type == OS_FILE_READ_ALLOW_DELETE);
+
+            open_flags = O_RDWR;
+        }
+
+    } else if (read_only) {
+
+        /* Read-only mode never creates anything */
+        open_flags = O_RDONLY;
+
+    } else if (create_mode == OS_FILE_CREATE) {
+
+        open_flags = O_RDWR | O_CREAT | O_EXCL;
+
+    } else {
+
+        ib::error()
+            << "Unknown file create mode "
+            << create_mode << " for file '" << name << "'";
+
+        return(OS_FILE_CLOSED);
+    }
+
+    os_file_t fd = open(name, open_flags | O_CLOEXEC, os_innodb_umask);
+
+    *success = (fd != -1);
+
+#ifdef USE_FILE_LOCK
+    /* Only files opened for read-write access are locked; a non-zero
+    result from os_file_lock() turns the call into a failure. */
+    if (*success
+        && !read_only
+        && access_type == OS_FILE_READ_WRITE
+        && os_file_lock(fd, name)) {
+
+        *success = false;
+        close(fd);
+        fd = -1;
+    }
+#endif /* USE_FILE_LOCK */
+
+    return(fd);
+}
+
+/** Deletes a file if it exists. The file has to be closed before calling this.
+A missing file (ENOENT) counts as success.
+@param[in] name file path as a null-terminated string
+@param[out] exist indicate if file pre-exist; may be NULL
+@return true if success */
+bool
+os_file_delete_if_exists_func(
+    const char* name,
+    bool* exist)
+{
+    /* Assume the file is there until unlink() says otherwise */
+    if (exist != NULL) {
+        *exist = true;
+    }
+
+    WAIT_ALLOW_WRITES();
+
+    if (unlink(name) == 0) {
+        return(true);
+    }
+
+    if (errno != ENOENT) {
+        os_file_handle_error_no_exit(name, "delete", false);
+
+        return(false);
+    }
+
+    if (exist != NULL) {
+        *exist = false;
+    }
+
+    return(true);
+}
+
+/** Deletes a file with unlink(). The file has to be closed before calling
+this. Unlike os_file_delete_if_exists_func(), a missing file is an error.
+@param[in] name file path as a null-terminated string
+@return true if success */
+bool
+os_file_delete_func(
+    const char* name)
+{
+    WAIT_ALLOW_WRITES();
+
+    if (unlink(name) == 0) {
+        return(true);
+    }
+
+    os_file_handle_error_no_exit(name, "delete", false);
+
+    return(false);
+}
+
+/** NOTE! Use the corresponding macro os_file_rename(), not directly this
+function!
+Renames a file (can also move it to another directory). It is safest that the
+file is closed before calling this function. Debug builds assert that the
+old path exists and the new path does not.
+@param[in] oldpath old file path as a null-terminated string
+@param[in] newpath new file path
+@return true if success */
+bool
+os_file_rename_func(
+    const char* oldpath,
+    const char* newpath)
+{
+#ifdef UNIV_DEBUG
+    os_file_type_t type;
+    bool exists;
+
+    /* New path must not exist. */
+    ut_ad(os_file_status(newpath, &exists, &type));
+    ut_ad(!exists);
+
+    /* Old path must exist. */
+    ut_ad(os_file_status(oldpath, &exists, &type));
+    ut_ad(exists);
+#endif /* UNIV_DEBUG */
+
+    WAIT_ALLOW_WRITES();
+
+    if (rename(oldpath, newpath) == 0) {
+        return(true);
+    }
+
+    os_file_handle_rename_error(oldpath, newpath);
+
+    return(false);
+}
+
+/** NOTE! Use the corresponding macro os_file_close(), not directly this
+function!
+Closes a file handle. In case of error, error number can be retrieved with
+os_file_get_last_error.
+@param[in] file Handle to close
+@return true if success */
+bool os_file_close_func(os_file_t file)
+{
+  if (close(file) != 0)
+  {
+    os_file_handle_error(NULL, "close");
+    return false;
+  }
+
+  return true;
+}
+
+/** Gets a file size through fstat().
+@param[in] file handle to an open file
+@return file size, or (os_offset_t) -1 on failure */
+os_offset_t
+os_file_get_size(os_file_t file)
+{
+    struct stat st;
+
+    if (fstat(file, &st) != 0) {
+        return os_offset_t(-1);
+    }
+
+    return st.st_size;
+}
+
+/** Gets a file size.
+@param[in] filename Full path to the filename to check
+@return file size if OK, else set m_total_size to ~0 (all bits set, the
+ same value as (os_offset_t) -1) and m_alloc_size to errno */
+os_file_size_t
+os_file_get_size(
+    const char* filename)
+{
+    struct stat s;
+    os_file_size_t file_size;
+
+    int ret = stat(filename, &s);
+
+    if (ret == 0) {
+        file_size.m_total_size = s.st_size;
+        /* st_blocks is in 512 byte sized blocks */
+        file_size.m_alloc_size = s.st_blocks * 512;
+    } else {
+        /* Build the sentinel in the width of os_offset_t: ~0U is
+        only an unsigned int and would leave the upper bits clear. */
+        file_size.m_total_size = ~os_offset_t(0);
+        file_size.m_alloc_size = (os_offset_t) errno;
+    }
+
+    return(file_size);
+}
+
+/** This function returns information about the specified file
+@param[in] path pathname of the file
+@param[out] stat_info type, size, block size, allocated size and
+ (optionally) read/write permission of the file
+@param[in,out] statinfo buffer that receives the raw stat() result
+@param[in] check_rw_perm for testing whether the file can be opened
+ in RW mode
+@param[in] read_only if true read only mode checks are enforced
+@return DB_SUCCESS if all OK, DB_NOT_FOUND if the file does not exist,
+DB_FAIL if stat() failed for another reason */
+static
+dberr_t
+os_file_get_status_posix(
+    const char* path,
+    os_file_stat_t* stat_info,
+    struct stat* statinfo,
+    bool check_rw_perm,
+    bool read_only)
+{
+    if (stat(path, statinfo) != 0) {
+        if (errno == ENOENT || errno == ENOTDIR
+            || errno == ENAMETOOLONG) {
+            /* file does not exist */
+            return(DB_NOT_FOUND);
+        }
+
+        /* stat call failed for some other reason */
+        os_file_handle_error_no_exit(path, "stat", false);
+
+        return(DB_FAIL);
+    }
+
+    const mode_t mode = statinfo->st_mode;
+
+    if (S_ISDIR(mode)) {
+        stat_info->type = OS_FILE_TYPE_DIR;
+    } else if (S_ISLNK(mode)) {
+        stat_info->type = OS_FILE_TYPE_LINK;
+    } else if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISREG(mode)) {
+        /* Block and character devices are handled as regular files */
+        stat_info->type = OS_FILE_TYPE_FILE;
+    } else {
+        stat_info->type = OS_FILE_TYPE_UNKNOWN;
+    }
+
+    stat_info->size = statinfo->st_size;
+    stat_info->block_size = statinfo->st_blksize;
+    /* st_blocks is multiplied by 512 to get a size in bytes */
+    stat_info->alloc_size = statinfo->st_blocks * 512;
+
+    const bool file_like = stat_info->type == OS_FILE_TYPE_FILE
+        || stat_info->type == OS_FILE_TYPE_BLOCK;
+
+    if (check_rw_perm && file_like) {
+        const int wanted = read_only ? R_OK : R_OK | W_OK;
+
+        stat_info->rw_perm = !access(path, wanted);
+    }
+
+    return(DB_SUCCESS);
+}
+
+/** Sets the size of a file with ftruncate().
+NOTE(review): the "do nothing if size is not smaller than the current
+size" rule is not enforced in this function; presumably the caller
+checks it - confirm.
+@param[in] pathname file path
+@param[in] file file to be truncated
+@param[in] size size to preserve in bytes
+@return true if success */
+static
+bool
+os_file_truncate_posix(
+    const char* pathname,
+    os_file_t file,
+    os_offset_t size)
+{
+    const int rc = ftruncate(file, size);
+
+    /* The warning is only printed when the error handler returns true */
+    if (rc == -1
+        && os_file_handle_error_no_exit(pathname, "truncate", false)) {
+
+        ib::warn()
+            << "Truncate failed for '"
+            << pathname << "'";
+    }
+
+    return(rc == 0);
+}
+
+/** Truncates a file at its current position, as reported by ftell().
+@param[in] file file to be truncated
+@return true if success */
+bool
+os_file_set_eof(
+    FILE* file)
+{
+    WAIT_ALLOW_WRITES();
+
+    const int rc = ftruncate(fileno(file), ftell(file));
+
+    return(rc == 0);
+}
+
+#else /* !_WIN32 */
+
+#include <WinIoCtl.h>
+
+
+
+/** Free storage space associated with a section of the file by issuing
+FSCTL_SET_ZERO_DATA for the byte range [off, off + len).
+@param[in] fh Open file handle
+@param[in] off Starting offset (SEEK_SET)
+@param[in] len Size of the hole
+@return DB_SUCCESS, or DB_IO_NO_PUNCH_HOLE if the control call failed */
+static
+dberr_t
+os_file_punch_hole_win32(
+    os_file_t fh,
+    os_offset_t off,
+    os_offset_t len)
+{
+    FILE_ZERO_DATA_INFORMATION range;
+
+    range.FileOffset.QuadPart = off;
+    range.BeyondFinalZero.QuadPart = off + len;
+
+    /* If lpOverlapped is NULL, lpBytesReturned cannot be NULL,
+    therefore we pass a dummy parameter. */
+    DWORD unused;
+
+    if (!os_win32_device_io_control(
+            fh, FSCTL_SET_ZERO_DATA, &range, sizeof(range),
+            NULL, 0, &unused)) {
+        return(DB_IO_NO_PUNCH_HOLE);
+    }
+
+    return(DB_SUCCESS);
+}
+
+/** Check the existence and type of the given file using _stat64().
+@param[in] path path name of file
+@param[out] exists true if the file exists
+@param[out] type Type of the file, if it exists
+@return true if call succeeded */
+static
+bool
+os_file_status_win32(
+    const char* path,
+    bool* exists,
+    os_file_type_t* type)
+{
+    struct _stat64 st;
+
+    if (_stat64(path, &st) != 0) {
+        *exists = false;
+
+        switch (errno) {
+        case ENOENT:
+        case ENOTDIR:
+        case ENAMETOOLONG:
+            /* The path does not name an existing file */
+            return(true);
+        }
+
+        /* The stat call failed for some other reason */
+        os_file_handle_error_no_exit(path, "stat", false);
+        return(false);
+    }
+
+    *exists = true;
+
+    if (st.st_mode & _S_IFDIR) {
+        *type = OS_FILE_TYPE_DIR;
+    } else if (st.st_mode & _S_IFREG) {
+        *type = OS_FILE_TYPE_FILE;
+    } else {
+        *type = OS_FILE_TYPE_UNKNOWN;
+    }
+
+    return(true);
+}
+
+/* NtFlushBuffersFileEx is looked up in ntdll at static initialization time;
+the pointer is null when the symbol cannot be resolved. It is used in
+os_file_flush_func. */
+#include <winternl.h>
+using pNtFlushBuffersFileEx = NTSTATUS (WINAPI*)(
+    HANDLE FileHandle, ULONG Flags, PVOID Parameters, ULONG ParametersSize,
+    PIO_STATUS_BLOCK IoStatusBlock);
+
+static pNtFlushBuffersFileEx my_NtFlushBuffersFileEx
+    = reinterpret_cast<pNtFlushBuffersFileEx>(
+        GetProcAddress(GetModuleHandle("ntdll"),
+                       "NtFlushBuffersFileEx"));
+
+/** NOTE! Use the corresponding macro os_file_flush(), not directly this
+function!
+Flushes the write buffers of a given file to the disk. A data-only flush
+via NtFlushBuffersFileEx is tried first; after its first failure all
+later calls go straight to FlushFileBuffers().
+@param[in] file handle to a file
+@return true if success */
+bool os_file_flush_func(os_file_t file)
+{
+  ++os_n_fsyncs;
+  static bool datasync_unusable;
+
+  if (my_NtFlushBuffersFileEx && !datasync_unusable)
+  {
+    IO_STATUS_BLOCK iosb{};
+
+    if (!my_NtFlushBuffersFileEx(file, FLUSH_FLAGS_FILE_DATA_SYNC_ONLY,
+                                 nullptr, 0, &iosb))
+      return true;
+
+    /*
+      NtFlushBuffersFileEx(FLUSH_FLAGS_FILE_DATA_SYNC_ONLY) might fail
+      unless on Win10+, and maybe non-NTFS. Switch to using FlushFileBuffers().
+    */
+    datasync_unusable= true;
+  }
+
+  /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
+  actually a raw device, we choose to ignore that error if we are using
+  raw disks */
+  if (FlushFileBuffers(file) ||
+      (srv_start_raw_disk_in_use && GetLastError() == ERROR_INVALID_FUNCTION))
+    return true;
+
+  os_file_handle_error(nullptr, "flush");
+
+  /* It is a fatal error if a file flush does not succeed, because then
+  the database can get corrupt on disk */
+  ut_error;
+
+  return false;
+}
+
+/** Retrieves the last error number if an error occurs in a file io function.
+The number should be retrieved before any other OS calls (because they may
+overwrite the error number). If the number is not known to this program,
+then OS error number + OS_FILE_ERROR_MAX is returned.
+@param[in] report_all_errors true if we want an error message printed
+ of all errors
+@param[in] on_error_silent true then don't print any diagnostic
+ to the log
+@return 0 if there is no error, a known OS_FILE_ code, or OS error
+ number + OS_FILE_ERROR_MAX */
+static
+ulint
+os_file_get_last_error_low(
+    bool report_all_errors,
+    bool on_error_silent)
+{
+    const ulint err = (ulint) GetLastError();
+
+    if (err == ERROR_SUCCESS) {
+        return(0);
+    }
+
+    /* Disk-full and file-exists are only logged on explicit request */
+    const bool quiet = on_error_silent
+        || err == ERROR_DISK_FULL
+        || err == ERROR_FILE_EXISTS;
+
+    if (report_all_errors || !quiet) {
+
+        ib::error()
+            << "Operating system error number " << err
+            << " in a file operation.";
+
+        switch (err) {
+        case ERROR_PATH_NOT_FOUND:
+            ib::error()
+                << "The error means the system"
+                " cannot find the path specified.";
+
+            if (srv_is_being_started) {
+                ib::error()
+                    << "If you are installing InnoDB,"
+                    " remember that you must create"
+                    " directories yourself, InnoDB"
+                    " does not create them.";
+            }
+            break;
+
+        case ERROR_ACCESS_DENIED:
+            ib::error()
+                << "The error means mysqld does not have"
+                " the access rights to"
+                " the directory. It may also be"
+                " you have created a subdirectory"
+                " of the same name as a data file.";
+            break;
+
+        case ERROR_SHARING_VIOLATION:
+        case ERROR_LOCK_VIOLATION:
+            ib::error()
+                << "The error means that another program"
+                " is using InnoDB's files."
+                " This might be a backup or antivirus"
+                " software or another instance"
+                " of MySQL."
+                " Please close it to get rid of this error.";
+            break;
+
+        case ERROR_WORKING_SET_QUOTA:
+        case ERROR_NO_SYSTEM_RESOURCES:
+            ib::error()
+                << "The error means that there are no"
+                " sufficient system resources or quota to"
+                " complete the operation.";
+            break;
+
+        case ERROR_OPERATION_ABORTED:
+            ib::error()
+                << "The error means that the I/O"
+                " operation has been aborted"
+                " because of either a thread exit"
+                " or an application request."
+                " Retry attempt is made.";
+            break;
+
+        default:
+            ib::info() << OPERATING_SYSTEM_ERROR_MSG;
+        }
+    }
+
+    /* Translate the Windows error codes this program knows about */
+    switch (err) {
+    case ERROR_FILE_NOT_FOUND:
+        return(OS_FILE_NOT_FOUND);
+    case ERROR_DISK_FULL:
+        return(OS_FILE_DISK_FULL);
+    case ERROR_FILE_EXISTS:
+        return(OS_FILE_ALREADY_EXISTS);
+    case ERROR_SHARING_VIOLATION:
+    case ERROR_LOCK_VIOLATION:
+        return(OS_FILE_SHARING_VIOLATION);
+    case ERROR_WORKING_SET_QUOTA:
+    case ERROR_NO_SYSTEM_RESOURCES:
+        return(OS_FILE_INSUFFICIENT_RESOURCE);
+    case ERROR_OPERATION_ABORTED:
+        return(OS_FILE_OPERATION_ABORTED);
+    case ERROR_ACCESS_DENIED:
+        return(OS_FILE_ACCESS_VIOLATION);
+    }
+
+    return(OS_FILE_ERROR_MAX + err);
+}
+
+
+/** NOTE! Use the corresponding macro os_file_create_simple(), not directly
+this function!
+A simple function to open or create a file.
+@param[in] name name of the file or path as a null-terminated
+ string
+@param[in] create_mode OS_FILE_OPEN, OS_FILE_CREATE or
+ OS_FILE_CREATE_PATH; any other value is
+ rejected unless read_only is set
+@param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
+@param[in] read_only if true read only mode checks are enforced
+@param[out] success true if succeed, false if error
+@return handle to the file, not defined if error, error number
+ can be retrieved with os_file_get_last_error */
+pfs_os_file_t
+os_file_create_simple_func(
+    const char* name,
+    ulint create_mode,
+    ulint access_type,
+    bool read_only,
+    bool* success)
+{
+    os_file_t file;
+
+    *success = false;
+
+    DWORD access;
+    DWORD create_flag;
+    DWORD attributes = 0;
+
+    ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
+    ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
+    ut_ad(srv_operation == SRV_OPERATION_NORMAL);
+
+    if (create_mode == OS_FILE_OPEN) {
+
+        create_flag = OPEN_EXISTING;
+
+    } else if (read_only) {
+
+        /* In read-only mode nothing is ever created */
+        create_flag = OPEN_EXISTING;
+
+    } else if (create_mode == OS_FILE_CREATE) {
+
+        create_flag = CREATE_NEW;
+
+    } else if (create_mode == OS_FILE_CREATE_PATH) {
+
+        /* Create subdirs along the path if needed. The helper
+        returns a dberr_t, so it must be compared against DB_SUCCESS
+        rather than converted to bool directly. */
+        *success = os_file_create_subdirs_if_needed(name)
+            == DB_SUCCESS;
+
+        if (!*success) {
+
+            ib::error()
+                << "Unable to create subdirectories '"
+                << name << "'";
+
+            return(OS_FILE_CLOSED);
+        }
+
+        create_flag = CREATE_NEW;
+        create_mode = OS_FILE_CREATE;
+
+    } else {
+
+        ib::error()
+            << "Unknown file create mode ("
+            << create_mode << ") for file '"
+            << name << "'";
+
+        return(OS_FILE_CLOSED);
+    }
+
+    if (access_type == OS_FILE_READ_ONLY) {
+
+        access = GENERIC_READ;
+
+    } else if (read_only) {
+
+        ib::info()
+            << "Read only mode set. Unable to"
+            " open file '" << name << "' in RW mode, "
+            << "trying RO mode";
+
+        access = GENERIC_READ;
+
+    } else if (access_type == OS_FILE_READ_WRITE) {
+
+        access = GENERIC_READ | GENERIC_WRITE;
+
+    } else {
+
+        ib::error()
+            << "Unknown file access type (" << access_type << ") "
+            "for file '" << name << "'";
+
+        return(OS_FILE_CLOSED);
+    }
+
+    bool retry;
+
+    /* Retry CreateFile() for as long as the error handler asks for it */
+    do {
+        /* Use default security attributes and no template file. */
+
+        file = CreateFile(
+            (LPCTSTR) name, access,
+            FILE_SHARE_READ | FILE_SHARE_DELETE,
+            NULL, create_flag, attributes, NULL);
+
+        if (file == INVALID_HANDLE_VALUE) {
+
+            *success = false;
+
+            retry = os_file_handle_error(
+                name, create_mode == OS_FILE_OPEN ?
+                "open" : "create");
+
+        } else {
+
+            retry = false;
+
+            *success = true;
+        }
+
+    } while (retry);
+
+    return(file);
+}
+
+/** This function attempts to create a directory named pathname. The new
+directory gets default permissions. On Unix the permissions are
+(0770 & ~umask). If the directory exists already, nothing is done and
+the call succeeds, unless the fail_if_exists arguments is true.
+If another error occurs, such as a permission error, this does not crash,
+but reports the error and returns false.
+@param[in] pathname directory name as null-terminated string
+@param[in] fail_if_exists if true, pre-existing directory is treated
+ as an error.
+@return true if call succeeds, false on error */
+bool
+os_file_create_directory(
+    const char* pathname,
+    bool fail_if_exists)
+{
+    if (CreateDirectory((LPCTSTR) pathname, NULL)) {
+        return(true);
+    }
+
+    /* An already existing directory is fine unless the caller objects */
+    if (GetLastError() == ERROR_ALREADY_EXISTS && !fail_if_exists) {
+        return(true);
+    }
+
+    os_file_handle_error_no_exit(
+        pathname, "CreateDirectory", false);
+
+    return(false);
+}
+
+/** Check that IO of specific size is possible for the file
+opened with FILE_FLAG_NO_BUFFERING.
+
+The requirement is that IO is multiple of the disk sector size.
+
+@param[in] file file handle
+@param[in] io_size expected io size
+@return true - unbuffered io of requested size is possible, false otherwise.
+
+@note: this function only works correctly with Windows 8 or later,
+(GetFileInformationByHandleEx with FileStorageInfo is only supported there).
+It will return true on earlier Windows version.
+ */
+static bool unbuffered_io_possible(HANDLE file, size_t io_size)
+{
+    FILE_STORAGE_INFO info;
+
+    /* Without storage information the size cannot be checked: allow it */
+    if (!GetFileInformationByHandleEx(
+            file, FileStorageInfo, &info, sizeof(info))) {
+        return true;
+    }
+
+    const ULONG sector_size = info.LogicalBytesPerSector;
+
+    /* A reported sector size of 0 is treated like missing information */
+    return sector_size == 0 || io_size % sector_size == 0;
+}
+
+
+/** NOTE! Use the corresponding macro os_file_create(), not directly
+this function!
+Opens an existing file or creates a new.
+@param[in] name name of the file or path as a null-terminated
+ string
+@param[in] create_mode create mode, optionally combined with
+ OS_FILE_ON_ERROR_NO_EXIT and/or
+ OS_FILE_ON_ERROR_SILENT
+@param[in] purpose OS_FILE_AIO, if asynchronous, non-buffered I/O
+ is desired, OS_FILE_NORMAL, if any normal file;
+ NOTE that it also depends on type, os_aio_..
+ and srv_.. variables whether we really use async
+ I/O or unbuffered I/O: look in the function
+ source code for the exact rules
+@param[in] type OS_DATA_FILE, OS_DATA_FILE_NO_O_DIRECT or
+ OS_LOG_FILE
+@param[in] read_only if true, the file is opened with GENERIC_READ
+ access only and is never created
+@param[out] success true if succeeded
+@return handle to the file, not defined if error, error number
+ can be retrieved with os_file_get_last_error */
+pfs_os_file_t
+os_file_create_func(
+ const char* name,
+ ulint create_mode,
+ ulint purpose,
+ ulint type,
+ bool read_only,
+ bool* success)
+{
+ os_file_t file;
+ bool retry;
+ bool on_error_no_exit;
+ bool on_error_silent;
+
+ *success = false;
+
+ DBUG_EXECUTE_IF(
+ "ib_create_table_fail_disk_full",
+ *success = false;
+ SetLastError(ERROR_DISK_FULL);
+ return(OS_FILE_CLOSED);
+ );
+
+ DWORD create_flag;
+ DWORD share_mode = read_only
+ ? FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE
+ : FILE_SHARE_READ | FILE_SHARE_DELETE;
+
+ /* NOTE(review): create_mode still carries the OS_FILE_ON_ERROR_*
+ bits at this point (they are masked off further below), so this
+ comparison is also true for OS_FILE_OPEN or OS_FILE_OPEN_RAW
+ combined with one of those bits - confirm that this is intended. */
+ if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) {
+ WAIT_ALLOW_WRITES();
+ }
+
+ on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
+ ? true : false;
+
+ on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
+ ? true : false;
+
+ create_mode &= ~(OS_FILE_ON_ERROR_NO_EXIT | OS_FILE_ON_ERROR_SILENT);
+
+ if (create_mode == OS_FILE_OPEN_RAW) {
+
+ ut_a(!read_only);
+
+ /* On Windows Physical devices require admin privileges and
+ have to have the write-share mode set. See the remarks
+ section for the CreateFile() function documentation in MSDN. */
+
+ share_mode |= FILE_SHARE_WRITE;
+
+ create_flag = OPEN_EXISTING;
+
+ } else if (create_mode == OS_FILE_OPEN
+ || create_mode == OS_FILE_OPEN_RETRY) {
+
+ create_flag = OPEN_EXISTING;
+
+ } else if (read_only) {
+
+ create_flag = OPEN_EXISTING;
+
+ } else if (create_mode == OS_FILE_CREATE) {
+
+ create_flag = CREATE_NEW;
+
+ } else if (create_mode == OS_FILE_OVERWRITE) {
+
+ create_flag = CREATE_ALWAYS;
+
+ } else {
+ ib::error()
+ << "Unknown file create mode (" << create_mode << ") "
+ << " for file '" << name << "'";
+
+ return(OS_FILE_CLOSED);
+ }
+
+ DWORD attributes = 0;
+
+ if (purpose == OS_FILE_AIO) {
+
+#ifdef WIN_ASYNC_IO
+ /* If specified, use asynchronous (overlapped) io and no
+ buffering of writes in the OS */
+
+ if (srv_use_native_aio) {
+ attributes |= FILE_FLAG_OVERLAPPED;
+ }
+#endif /* WIN_ASYNC_IO */
+
+ } else if (purpose == OS_FILE_NORMAL) {
+
+ /* Use default setting. */
+
+ } else {
+
+ ib::error()
+ << "Unknown purpose flag (" << purpose << ") "
+ << "while opening file '" << name << "'";
+
+ return(OS_FILE_CLOSED);
+ }
+
+ if (type == OS_LOG_FILE) {
+ /* There is no reason to use buffered writes for logs. */
+ attributes |= FILE_FLAG_NO_BUFFERING;
+ }
+
+ /* Adjust the buffering and write-through attributes according to
+ the configured flush method. */
+ switch (srv_file_flush_method)
+ {
+ case SRV_O_DSYNC:
+ if (type == OS_LOG_FILE) {
+ /* Map O_DSYNC to FILE_WRITE_THROUGH */
+ attributes |= FILE_FLAG_WRITE_THROUGH;
+ }
+ break;
+
+ case SRV_O_DIRECT_NO_FSYNC:
+ case SRV_O_DIRECT:
+ if (type != OS_DATA_FILE) {
+ break;
+ }
+ /* fall through */
+ case SRV_ALL_O_DIRECT_FSYNC:
+ /*Traditional Windows behavior, no buffering for any files.*/
+ if (type != OS_DATA_FILE_NO_O_DIRECT) {
+ attributes |= FILE_FLAG_NO_BUFFERING;
+ }
+ break;
+
+ case SRV_FSYNC:
+ case SRV_LITTLESYNC:
+ break;
+
+ case SRV_NOSYNC:
+ /* Let Windows cache manager handle all writes.*/
+ attributes &= ~(FILE_FLAG_WRITE_THROUGH | FILE_FLAG_NO_BUFFERING);
+ break;
+
+ default:
+ ut_a(false); /* unknown flush mode.*/
+ }
+
+
+ // TODO: Create a bug, this looks wrong. The flush log
+ // parameter is dynamic.
+ if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
+ /* Do not use unbuffered i/o for the log files because
+ value 2 denotes that we do not flush the log at every
+ commit, but only once per second */
+ attributes &= ~(FILE_FLAG_WRITE_THROUGH | FILE_FLAG_NO_BUFFERING);
+ }
+
+
+ DWORD access = GENERIC_READ;
+
+ if (!read_only) {
+ access |= GENERIC_WRITE;
+ }
+
+ /* Keep calling CreateFile() until it succeeds or the error handler
+ tells us not to retry. */
+ for (;;) {
+ const char *operation;
+
+ /* Use default security attributes and no template file. */
+ file = CreateFile(
+ name, access, share_mode, NULL,
+ create_flag, attributes, NULL);
+
+ /* If FILE_FLAG_NO_BUFFERING was set for a log file, check whether
+ unbuffered I/O can work at all for the expected I/O size. If it cannot,
+ close the handle and reopen the file without the unbuffered flag. */
+ if ((file != INVALID_HANDLE_VALUE)
+ && (attributes & FILE_FLAG_NO_BUFFERING)
+ && (type == OS_LOG_FILE)
+ && !unbuffered_io_possible(file, OS_FILE_LOG_BLOCK_SIZE)) {
+ ut_a(CloseHandle(file));
+ attributes &= ~FILE_FLAG_NO_BUFFERING;
+ /* Presumably OPEN_ALWAYS is used because the call above
+ may already have created the file - TODO confirm. */
+ create_flag = OPEN_ALWAYS;
+ continue;
+ }
+
+ *success = (file != INVALID_HANDLE_VALUE);
+ if (*success) {
+ break;
+ }
+
+ operation = (create_mode == OS_FILE_CREATE && !read_only) ?
+ "create" : "open";
+
+ if (on_error_no_exit) {
+ retry = os_file_handle_error_no_exit(
+ name, operation, on_error_silent);
+ }
+ else {
+ retry = os_file_handle_error(name, operation);
+ }
+
+ if (!retry) {
+ break;
+ }
+ }
+
+ /* A handle opened for overlapped I/O is bound to the thread pool,
+ if there is one. */
+ if (*success && (attributes & FILE_FLAG_OVERLAPPED) && srv_thread_pool) {
+ srv_thread_pool->bind(file);
+ }
+ return(file);
+}
+
+/** NOTE! Use the corresponding macro os_file_create_simple_no_error_handling(),
+not directly this function!
+Opens or creates a file with a single CreateFile() call. A failure of
+that call is neither reported nor retried here.
+@param[in]	name		name of the file or path as a null-terminated
+				string
+@param[in]	create_mode	create mode
+@param[in]	access_type	OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
+				OS_FILE_READ_ALLOW_DELETE; the last option is
+				used by a backup program reading the file
+@param[in]	read_only	if true, only an existing file is opened,
+				and only for reading
+@param[out]	success		true if succeeded
+@return own: handle to the file, not defined if error, error number
+	can be retrieved with os_file_get_last_error */
+pfs_os_file_t
+os_file_create_simple_no_error_handling_func(
+	const char*	name,
+	ulint		create_mode,
+	ulint		access_type,
+	bool		read_only,
+	bool*		success)
+{
+	*success = false;
+
+	ut_a(name);
+
+	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
+	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
+
+	/* Creation disposition: anything opened read-only must exist. */
+	DWORD	create_flag;
+
+	if (create_mode == OS_FILE_OPEN || read_only) {
+
+		create_flag = OPEN_EXISTING;
+
+	} else if (create_mode == OS_FILE_CREATE) {
+
+		create_flag = CREATE_NEW;
+
+	} else {
+
+		ib::error()
+			<< "Unknown file create mode (" << create_mode << ") "
+			<< " for file '" << name << "'";
+
+		return(OS_FILE_CLOSED);
+	}
+
+	/* Desired access and sharing mode. */
+	DWORD	access = GENERIC_READ;
+	DWORD	share_mode = FILE_SHARE_READ | FILE_SHARE_DELETE;
+
+	if (read_only) {
+		share_mode |= FILE_SHARE_WRITE;
+	}
+
+	if (access_type == OS_FILE_READ_ONLY || read_only) {
+
+		/* Plain read access is sufficient. */
+
+	} else if (access_type == OS_FILE_READ_WRITE) {
+
+		access |= GENERIC_WRITE;
+
+	} else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
+
+		ut_a(!read_only);
+
+		/* A backup program has to give mysqld the maximum
+		freedom to do what it likes with the file */
+
+		share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE
+			| FILE_SHARE_READ;
+
+	} else {
+
+		ib::error()
+			<< "Unknown file access type (" << access_type << ") "
+			<< "for file '" << name << "'";
+
+		return(OS_FILE_CLOSED);
+	}
+
+	const DWORD	attributes = 0;
+
+	os_file_t	file = CreateFile(
+		(LPCTSTR) name, access, share_mode,
+		NULL,		// Security attributes
+		create_flag, attributes,
+		NULL);		// No template file
+
+	*success = (file != INVALID_HANDLE_VALUE);
+
+	return(file);
+}
+
+/** Deletes a file if it exists. The file has to be closed before calling this.
+A failing DeleteFile() is retried once per second; the function gives up
+after more than 2000 failed attempts.
+@param[in]	name	file path as a null-terminated string
+@param[out]	exist	indicate if file pre-exist
+@return true if success */
+bool
+os_file_delete_if_exists_func(
+	const char*	name,
+	bool*		exist)
+{
+	if (exist != NULL) {
+		*exist = true;
+	}
+
+	for (ulint n_failures = 1;; ++n_failures) {
+		/* In Windows, deleting an .ibd file may fail if
+		the file is being accessed by an external program,
+		such as a backup tool. */
+
+		if (DeleteFile((LPCTSTR) name)) {
+			return(true);
+		}
+
+		const DWORD	os_err = GetLastError();
+
+		if (os_err == ERROR_FILE_NOT_FOUND
+		    || os_err == ERROR_PATH_NOT_FOUND) {
+
+			/* A file that does not exist is not an error. */
+			if (exist != NULL) {
+				*exist = false;
+			}
+
+			return(true);
+		}
+
+		if (n_failures > 100 && n_failures % 10 == 0) {
+
+			/* Print error information */
+			os_file_get_last_error(true);
+
+			ib::warn() << "Delete of file '" << name << "' failed.";
+		}
+
+		/* Sleep for a second */
+		os_thread_sleep(1000000);
+
+		if (n_failures > 2000) {
+
+			return(false);
+		}
+	}
+}
+
+/** Deletes a file. The file has to be closed before calling this.
+A failing DeleteFile() is retried once per second (the file may be held
+open by an external program, such as a backup tool) until it succeeds,
+the file turns out not to exist, or more than 2000 attempts have failed.
+@param[in]	name		File path as NUL terminated string
+@return true if success; false if the file or its directory does not
+exist, or if the retries were exhausted */
+bool
+os_file_delete_func(
+	const char*	name)
+{
+	ulint	count = 0;
+
+	for (;;) {
+		/* In Windows, deleting an .ibd file may fail if
+		the file is being accessed by an external program,
+		such as a backup tool. */
+
+		if (DeleteFile((LPCTSTR) name)) {
+			return(true);
+		}
+
+		const DWORD	lasterr = GetLastError();
+
+		if (lasterr == ERROR_FILE_NOT_FOUND
+		    || lasterr == ERROR_PATH_NOT_FOUND) {
+			/* If the file (or the directory that should
+			contain it) does not exist, we classify this as
+			a 'mild' error and return. Retrying could never
+			succeed and would only block the caller. */
+
+			return(false);
+		}
+
+		++count;
+
+		if (count > 100 && 0 == (count % 10)) {
+
+			/* print error information */
+			os_file_get_last_error(true);
+
+			ib::warn()
+				<< "Cannot delete file '" << name << "'. Is "
+				<< "another program accessing it?";
+		}
+
+		/* sleep for a second */
+		os_thread_sleep(1000000);
+
+		if (count > 2000) {
+
+			return(false);
+		}
+	}
+}
+
+/** NOTE! Use the corresponding macro os_file_rename(), not directly this
+function!
+Renames a file with MoveFile(), which can also move it to another
+directory. It is safest that the file is closed before calling this function.
+@param[in]	oldpath		old file path as a null-terminated string
+@param[in]	newpath		new file path
+@return true if success */
+bool
+os_file_rename_func(
+	const char*	oldpath,
+	const char*	newpath)
+{
+#ifdef UNIV_DEBUG
+	os_file_type_t	type;
+	bool		exists;
+
+	/* Debug builds verify that the target does not exist yet... */
+	ut_ad(os_file_status(newpath, &exists, &type));
+	ut_ad(!exists);
+
+	/* ...and that the source does. */
+	ut_ad(os_file_status(oldpath, &exists, &type));
+	ut_ad(exists);
+#endif /* UNIV_DEBUG */
+
+	if (!MoveFile((LPCTSTR) oldpath, (LPCTSTR) newpath)) {
+		os_file_handle_rename_error(oldpath, newpath);
+		return(false);
+	}
+
+	return(true);
+}
+
+/** NOTE! Use the corresponding macro os_file_close(), not directly
+this function!
+Closes a file handle and, when a thread pool exists, unbinds the handle
+from it. In case of error, error number can be retrieved with
+os_file_get_last_error.
+@param[in,own]	file		Handle to a file
+@return true if success */
+bool os_file_close_func(os_file_t file)
+{
+	ut_ad(file);
+
+	if (CloseHandle(file)) {
+		if (srv_thread_pool) {
+			srv_thread_pool->unbind(file);
+		}
+
+		return true;
+	}
+
+	os_file_handle_error(NULL, "close");
+
+	return false;
+}
+
+/** Gets the size of an open file.
+@param[in]	file		Handle to a file
+@return file size, or (os_offset_t) -1 on failure */
+os_offset_t
+os_file_get_size(
+	os_file_t	file)
+{
+	DWORD		size_high;
+	const DWORD	size_low = GetFileSize(file, &size_high);
+
+	/* INVALID_FILE_SIZE can also be the low half of a valid size,
+	hence the additional GetLastError() check. */
+	if (size_low == INVALID_FILE_SIZE && GetLastError() != NO_ERROR) {
+		return((os_offset_t) -1);
+	}
+
+	return((os_offset_t(size_high) << 32) | size_low);
+}
+
+/** Gets a file size.
+m_total_size is the size reported by _stat64(); m_alloc_size is the size
+reported by GetCompressedFileSize(), or (os_offset_t) -1 if that call fails.
+@param[in]	filename	Full path to the filename to check
+@return file size if OK, else set m_total_size to ~0 and m_alloc_size to
+	errno */
+os_file_size_t
+os_file_get_size(
+	const char*	filename)
+{
+	struct __stat64	s;
+	os_file_size_t	file_size;
+
+	if (_stat64(filename, &s) != 0) {
+		/* Report errno as documented, not the -1 return value
+		of _stat64(). */
+		file_size.m_total_size = ~0;
+		file_size.m_alloc_size = (os_offset_t) errno;
+
+		return(file_size);
+	}
+
+	file_size.m_total_size = s.st_size;
+
+	DWORD		high_size;
+	const DWORD	low_size = GetCompressedFileSize(filename, &high_size);
+
+	/* INVALID_FILE_SIZE is also a valid low half of a 64-bit size;
+	the call only failed if GetLastError() reports an error too. */
+	if (low_size == INVALID_FILE_SIZE && GetLastError() != NO_ERROR) {
+		ib::error()
+			<< "GetCompressedFileSize("
+			<< filename << ", ..) failed.";
+
+		file_size.m_alloc_size = (os_offset_t) -1;
+	} else {
+		file_size.m_alloc_size = high_size;
+		file_size.m_alloc_size <<= 32;
+		file_size.m_alloc_size |= low_size;
+	}
+
+	return(file_size);
+}
+
+/** This function returns information about the specified file.
+It fills in stat_info->type and, for regular files, stat_info->block_size
+and (only if check_rw_perm is set) stat_info->rw_perm.
+@param[in] path pathname of the file
+@param[out] stat_info information of a file in a directory
+@param[in,out] statinfo buffer that receives the result of _stat64()
+@param[in] check_rw_perm for testing whether the file can be opened
+ in RW mode
+@param[in] read_only true if the file is opened in read-only mode
+@return DB_SUCCESS if all OK, DB_NOT_FOUND if the file does not exist,
+DB_FAIL if _stat64() failed for another reason */
+static
+dberr_t
+os_file_get_status_win32(
+ const char* path,
+ os_file_stat_t* stat_info,
+ struct _stat64* statinfo,
+ bool check_rw_perm,
+ bool read_only)
+{
+ int ret = _stat64(path, statinfo);
+
+ if (ret && (errno == ENOENT || errno == ENOTDIR
+ || errno == ENAMETOOLONG)) {
+ /* file does not exist */
+
+ return(DB_NOT_FOUND);
+
+ } else if (ret) {
+ /* file exists, but stat call failed */
+
+ os_file_handle_error_no_exit(path, "STAT", false);
+
+ return(DB_FAIL);
+
+ } else if (_S_IFDIR & statinfo->st_mode) {
+
+ stat_info->type = OS_FILE_TYPE_DIR;
+
+ } else if (_S_IFREG & statinfo->st_mode) {
+
+ DWORD access = GENERIC_READ;
+
+ if (!read_only) {
+ access |= GENERIC_WRITE;
+ }
+
+ stat_info->type = OS_FILE_TYPE_FILE;
+
+ /* Check whether the file can be opened with the access computed
+ above: read-only if read_only is set, read-write otherwise. */
+
+ if (check_rw_perm) {
+ HANDLE fh;
+
+ fh = CreateFile(
+ (LPCTSTR) path, // File to open
+ access,
+ FILE_SHARE_READ | FILE_SHARE_WRITE
+ | FILE_SHARE_DELETE, // Full sharing
+ NULL, // Default security
+ OPEN_EXISTING, // Existing file only
+ FILE_ATTRIBUTE_NORMAL, // Normal file
+ NULL); // No attr. template
+
+ if (fh == INVALID_HANDLE_VALUE) {
+ stat_info->rw_perm = false;
+ } else {
+ stat_info->rw_perm = true;
+ CloseHandle(fh);
+ }
+ }
+ stat_info->block_size = 0;
+
+ /* What follows, is calculation of FS block size, which is not important
+ (it is just shown in I_S innodb tables). The error to calculate it will be ignored.*/
+ char volname[MAX_PATH];
+ BOOL result = GetVolumePathName(path, volname, MAX_PATH);
+ /* A failure below is logged at most once; the flag is shared by
+ both warnings. */
+ static bool warned_once = false;
+ if (!result) {
+ if (!warned_once) {
+ ib::warn()
+ << "os_file_get_status_win32: "
+ << "Failed to get the volume path name for: "
+ << path
+ << "- OS error number " << GetLastError();
+ warned_once = true;
+ }
+ return(DB_SUCCESS);
+ }
+
+ DWORD sectorsPerCluster;
+ DWORD bytesPerSector;
+ DWORD numberOfFreeClusters;
+ DWORD totalNumberOfClusters;
+
+ result = GetDiskFreeSpace(
+ (LPCSTR) volname,
+ &sectorsPerCluster,
+ &bytesPerSector,
+ &numberOfFreeClusters,
+ &totalNumberOfClusters);
+
+ if (!result) {
+ if (!warned_once) {
+ ib::warn()
+ << "GetDiskFreeSpace(" << volname << ",...) "
+ << "failed "
+ << "- OS error number " << GetLastError();
+ warned_once = true;
+ }
+ return(DB_SUCCESS);
+ }
+ /* The block size reported is the cluster size of the volume. */
+ stat_info->block_size = bytesPerSector * sectorsPerCluster;
+ } else {
+ stat_info->type = OS_FILE_TYPE_UNKNOWN;
+ }
+
+ return(DB_SUCCESS);
+}
+
+#include <versionhelpers.h>
+/**
+Sets or clears the sparse flag on a Windows file.
+@param[in]	file		file handle
+@param[in]	is_sparse	true to set the sparse flag, false to clear it
+@return true on success, false on error
+*/
+bool os_file_set_sparse_win32(os_file_t file, bool is_sparse)
+{
+	/* Cannot unset sparse flag on older Windows.
+	Until Windows8 it is documented to produce unpredictable results,
+	if there are unallocated ranges in file.*/
+	if (!is_sparse && !IsWindows8OrGreater()) {
+		return false;
+	}
+
+	FILE_SET_SPARSE_BUFFER	request;
+	request.SetSparse = is_sparse;
+
+	DWORD	bytes_returned;
+
+	return os_win32_device_io_control(
+		file, FSCTL_SET_SPARSE, &request, sizeof(request),
+		0, 0, &bytes_returned);
+}
+
+
+/**
+Change file size on Windows, by moving the file pointer to the new
+size and setting the end of file there.
+
+If file is extended, the bytes between old and new EOF
+are zeros.
+
+If file is sparse, "virtual" block is added at the end of
+allocated area.
+
+If file is normal, file system allocates storage.
+
+@param[in]	pathname	file path
+@param[in]	file		file handle
+@param[in]	size		size to preserve in bytes
+@return true if success */
+bool
+os_file_change_size_win32(
+	const char*	pathname,
+	os_file_t	file,
+	os_offset_t	size)
+{
+	LARGE_INTEGER	new_eof;
+
+	new_eof.QuadPart = size;
+
+	if (!SetFilePointerEx(file, new_eof, NULL, FILE_BEGIN)) {
+		os_file_handle_error_no_exit(
+			pathname, "SetFilePointerEx", false);
+		return(false);
+	}
+
+	if (!SetEndOfFile(file)) {
+		os_file_handle_error_no_exit(
+			pathname, "SetEndOfFile", false);
+		return(false);
+	}
+
+	return(true);
+}
+
+/** Truncates a file at its current position.
+@param[in]	file		C stream whose underlying file is truncated
+@return true if success */
+bool
+os_file_set_eof(
+	FILE*		file)
+{
+	/* Translate the C stream into the OS handle behind it. */
+	const HANDLE	os_handle = (HANDLE) _get_osfhandle(fileno(file));
+
+	return(SetEndOfFile(os_handle) != 0);
+}
+
+#endif /* !_WIN32*/
+
+/** Does a synchronous read or write depending upon the type specified
+In case of partial reads/writes the function tries
+NUM_RETRIES_ON_PARTIAL_IO times to read/write the complete data.
+@param[in] in_type IO flags
+@param[in] file handle to an open file
+@param[in,out] buf buffer to read into or to write from
+@param[in] n number of bytes to read/write, starting from offset
+@param[in] offset file offset from the start where to read/write
+@param[out] err result of maybe_punch_hole() if all n bytes were
+ transferred, DB_IO_ERROR otherwise
+@return n if all bytes were transferred, otherwise the number of bytes
+transferred before the failure */
+static MY_ATTRIBUTE((warn_unused_result))
+ssize_t
+os_file_io(
+ const IORequest&in_type,
+ os_file_t file,
+ void* buf,
+ ulint n,
+ os_offset_t offset,
+ dberr_t* err)
+{
+ ssize_t original_n = ssize_t(n);
+ IORequest type = in_type;
+ /* Total number of bytes transferred by all attempts so far. */
+ ssize_t bytes_returned = 0;
+
+ SyncFileIO sync_file_io(file, buf, n, offset);
+
+ for (ulint i = 0; i < NUM_RETRIES_ON_PARTIAL_IO; ++i) {
+
+ ssize_t n_bytes = sync_file_io.execute(type);
+
+ /* Check for a hard error. Not much we can do now. */
+ if (n_bytes < 0) {
+
+ break;
+
+ } else if (n_bytes + bytes_returned == ssize_t(n)) {
+
+ /* This attempt completed the request. */
+ bytes_returned += n_bytes;
+
+ *err = type.maybe_punch_hole(offset, n);
+
+ return(original_n);
+ }
+
+ /* Handle partial read/write. */
+
+ ut_ad(ulint(n_bytes + bytes_returned) < n);
+
+ bytes_returned += n_bytes;
+
+ /* Partial transfers are not reported for READ_MAYBE_PARTIAL
+ requests. */
+ if (type.type != IORequest::READ_MAYBE_PARTIAL) {
+ const char* op = type.is_read()
+ ? "read" : "written";
+
+ ib::warn()
+ << n
+ << " bytes should have been " << op << ". Only "
+ << bytes_returned
+ << " bytes " << op << ". Retrying"
+ << " for the remaining bytes.";
+ }
+
+ /* Advance the offset and buffer by n_bytes */
+ sync_file_io.advance(n_bytes);
+ }
+
+ /* Reached after a hard error or when the retries are used up. */
+ *err = DB_IO_ERROR;
+
+ if (type.type != IORequest::READ_MAYBE_PARTIAL) {
+ ib::warn()
+ << "Retry attempts for "
+ << (type.is_read() ? "reading" : "writing")
+ << " partial data failed.";
+ }
+
+ return(bytes_returned);
+}
+
+/** Does a synchronous write operation, counting it in os_n_file_writes
+and in the MONITOR_OS_PENDING_WRITES monitor while it is in progress.
+@param[in]	type		IO context
+@param[in]	file		handle to an open file
+@param[in]	buf		buffer from which to write
+@param[in]	n		number of bytes to write, starting from offset
+@param[in]	offset		file offset from the start where to write
+@param[out]	err		DB_SUCCESS or error code
+@return the value returned by os_file_io() */
+static MY_ATTRIBUTE((warn_unused_result))
+ssize_t
+os_file_pwrite(
+	const IORequest&	type,
+	os_file_t		file,
+	const byte*		buf,
+	ulint			n,
+	os_offset_t		offset,
+	dberr_t*		err)
+{
+	ut_ad(type.is_write());
+
+	++os_n_file_writes;
+
+	const bool	monitor_on = MONITOR_IS_ON(MONITOR_OS_PENDING_WRITES);
+
+	MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_WRITES, monitor_on);
+
+	const ssize_t	n_written = os_file_io(
+		type, file, const_cast<byte*>(buf), n, offset, err);
+
+	MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_WRITES, monitor_on);
+
+	return(n_written);
+}
+
+/** NOTE! Use the corresponding macro os_file_write(), not directly
+this function!
+Requests a synchronous write operation. An incomplete write is reported
+in the error log, unless a disk full condition has been reported before.
+@param[in]	type		IO flags
+@param[in]	name		name of the file, used in the error message
+@param[in]	file		handle to an open file
+@param[in]	buf		buffer from which to write
+@param[in]	offset		file offset from the start where to write
+@param[in]	n		number of bytes to write, starting from offset
+@return error code
+@retval DB_SUCCESS if the operation succeeded */
+dberr_t
+os_file_write_func(
+	const IORequest&	type,
+	const char*		name,
+	os_file_t		file,
+	const void*		buf,
+	os_offset_t		offset,
+	ulint			n)
+{
+	ut_ad(n > 0);
+
+	WAIT_ALLOW_WRITES();
+
+	dberr_t		err;
+	const ssize_t	n_bytes = os_file_pwrite(
+		type, file, static_cast<const byte*>(buf), n, offset, &err);
+
+	if (static_cast<ulint>(n_bytes) == n || os_has_said_disk_full) {
+		/* Either everything was written, or the problem has
+		been reported already. */
+		return(err);
+	}
+
+	ib::error()
+		<< "Write to file " << name << " failed at offset "
+		<< offset << ", " << n
+		<< " bytes should have been written,"
+		" only " << n_bytes << " were written."
+		" Operating system error number " << IF_WIN(GetLastError(),errno) << "."
+		" Check that your OS and file system"
+		" support files of this size."
+		" Check also that the disk is not full"
+		" or a disk quota exceeded.";
+#ifndef _WIN32
+	if (strerror(errno) != NULL) {
+
+		ib::error()
+			<< "Error number " << errno
+			<< " means '" << strerror(errno) << "'";
+	}
+
+	ib::info() << OPERATING_SYSTEM_ERROR_MSG;
+#endif
+	os_has_said_disk_full = true;
+
+	return(err);
+}
+
+/** Does a synchronous read operation, counting it in os_n_file_reads
+and in the MONITOR_OS_PENDING_READS monitor while it is in progress.
+@param[in]	type		IO flags
+@param[in]	file		handle to an open file
+@param[out]	buf		buffer where to read
+@param[in]	n		number of bytes to read, starting from offset
+@param[in]	offset		file offset from the start where to read
+@param[out]	err		DB_SUCCESS or error code
+@return the value returned by os_file_io() */
+static MY_ATTRIBUTE((warn_unused_result))
+ssize_t
+os_file_pread(
+	const IORequest&	type,
+	os_file_t		file,
+	void*			buf,
+	ulint			n,
+	os_offset_t		offset,
+	dberr_t*		err)
+{
+	ut_ad(type.is_read());
+
+	++os_n_file_reads;
+
+	const bool	monitor_on = MONITOR_IS_ON(MONITOR_OS_PENDING_READS);
+
+	MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor_on);
+
+	const ssize_t	n_read = os_file_io(type, file, buf, n, offset, err);
+
+	MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor_on);
+
+	return(n_read);
+}
+
+/** Requests a synchronous positioned read operation.
+@param[in] type IO flags
+@param[in] file handle to an open file
+@param[out] buf buffer where to read
+@param[in] offset file offset from the start where to read
+@param[in] n number of bytes to read, starting from offset
+@param[out] o number of bytes actually read; may be NULL
+@param[in] exit_on_err if true then exit on error
+@return DB_SUCCESS or error code */
+static MY_ATTRIBUTE((warn_unused_result))
+dberr_t
+os_file_read_page(
+ const IORequest& type,
+ os_file_t file,
+ void* buf,
+ os_offset_t offset,
+ ulint n,
+ ulint* o,
+ bool exit_on_err)
+{
+ dberr_t err;
+
+ os_bytes_read_since_printout += n;
+
+ ut_ad(n > 0);
+
+ ssize_t n_bytes = os_file_pread(type, file, buf, n, offset, &err);
+
+ if (o) {
+ *o = n_bytes;
+ }
+
+ /* Return right away if everything was read, or if the read failed
+ and the caller does not want this function to exit on error. */
+ if (ulint(n_bytes) == n || (err != DB_SUCCESS && !exit_on_err)) {
+ return err;
+ }
+ /* Save the OS error number before further calls can overwrite it. */
+ int os_err = IF_WIN((int)GetLastError(), errno);
+
+ /* The error handler returns true if the operation should be
+ retried; otherwise the failure is fatal here. */
+ if (!os_file_handle_error_cond_exit(
+ NULL, "read", exit_on_err, false)) {
+ ib::fatal()
+ << "Tried to read " << n << " bytes at offset "
+ << offset << ", but was only able to read " << n_bytes
+ << ".Cannot read from file. OS error number "
+ << os_err << ".";
+ } else {
+ ib::error() << "Tried to read " << n << " bytes at offset "
+ << offset << ", but was only able to read " << n_bytes;
+ }
+ if (err == DB_SUCCESS) {
+ err = DB_IO_ERROR;
+ }
+
+ return err;
+}
+
+/** Retrieves the last error number if an error occurs in a file io function.
+Call this before any other OS call, because such calls may overwrite the
+error number. If the number is not known to this program,
+the OS error number + 100 is returned.
+@param[in]	report_all_errors	true if we want an error printed
+					for all errors
+@return error number, or OS error number + 100 */
+ulint
+os_file_get_last_error(
+	bool	report_all_errors)
+{
+	/* Same as the low-level function, with its second argument
+	fixed to false. */
+	const ulint	error_number = os_file_get_last_error_low(
+		report_all_errors, false);
+
+	return(error_number);
+}
+
+/** Handle errors for file operations.
+Known errors are mapped to "retry" or "do not retry"; for some of them
+the function sleeps before asking for a retry. Any other error is
+logged and, if should_abort is set, the process is aborted.
+@param[in]	name		name of a file or NULL
+@param[in]	operation	operation
+@param[in]	should_abort	whether to abort on an unknown error
+@param[in]	on_error_silent	whether to suppress reports of non-fatal errors
+@return true if we should retry the operation */
+static MY_ATTRIBUTE((warn_unused_result))
+bool
+os_file_handle_error_cond_exit(
+	const char*	name,
+	const char*	operation,
+	bool		should_abort,
+	bool		on_error_silent)
+{
+	const ulint	err = os_file_get_last_error_low(
+		false, on_error_silent);
+
+	switch (err) {
+	case OS_FILE_DISK_FULL:
+		/* We only print a warning about disk full once.
+		Disk full error is reported irrespective of the
+		on_error_silent setting. */
+
+		if (!os_has_said_disk_full) {
+
+			if (name) {
+
+				ib::error()
+					<< "Encountered a problem with file '"
+					<< name << "'";
+			}
+
+			ib::error()
+				<< "Disk is full. Try to clean the disk to free space.";
+
+			os_has_said_disk_full = true;
+		}
+
+		return(false);
+
+	case OS_FILE_AIO_RESOURCES_RESERVED:
+	case OS_FILE_AIO_INTERRUPTED:
+
+		return(true);
+
+	case OS_FILE_PATH_ERROR:
+	case OS_FILE_ALREADY_EXISTS:
+	case OS_FILE_ACCESS_VIOLATION:
+
+		return(false);
+
+	case OS_FILE_SHARING_VIOLATION:
+
+		os_thread_sleep(10000000);	/* 10 sec */
+		return(true);
+
+	case OS_FILE_OPERATION_ABORTED:
+	case OS_FILE_INSUFFICIENT_RESOURCE:
+
+		os_thread_sleep(100000);	/* 100 ms */
+		return(true);
+
+	default:
+		break;
+	}
+
+	/* Unknown error. If it is an operation that can crash on error
+	then it is better to ignore on_error_silent and print an error
+	message to the log. */
+
+	if (should_abort || !on_error_silent) {
+		ib::error() << "File "
+			<< (name != NULL ? name : "(unknown)")
+			<< ": '" << operation << "'"
+			" returned OS error " << err << "."
+			<< (should_abort
+			    ? " Cannot continue operation" : "");
+	}
+
+	if (should_abort) {
+		abort();
+	}
+
+	return(false);
+}
+
+#ifndef _WIN32
+/** Tries to disable OS caching on an opened file descriptor.
+A failure is only logged as a diagnostic; nothing is returned to the caller.
+@param[in]	fd		file descriptor to alter
+@param[in]	file_name	file name, used in the diagnostic message
+@param[in]	operation_name	"open" or "create"; used in the diagnostic
+				message */
+void
+os_file_set_nocache(
+	int		fd		MY_ATTRIBUTE((unused)),
+	const char*	file_name	MY_ATTRIBUTE((unused)),
+	const char*	operation_name	MY_ATTRIBUTE((unused)))
+{
+	/* some versions of Solaris may not have DIRECTIO_ON */
+#if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
+	if (directio(fd, DIRECTIO_ON) == -1) {
+		int	errno_save = errno;
+
+		ib::error()
+			<< "Failed to set DIRECTIO_ON on file "
+			<< file_name << "; " << operation_name << ": "
+			<< strerror(errno_save) << ","
+			" continuing anyway.";
+	}
+#elif defined(O_DIRECT)
+	if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
+		/* Save errno before logging can overwrite it. */
+		int		errno_save = errno;
+		/* The EINVAL warning is printed only once. */
+		static bool	warning_message_printed = false;
+		if (errno_save == EINVAL) {
+			if (!warning_message_printed) {
+				warning_message_printed = true;
+# ifdef UNIV_LINUX
+				ib::warn()
+					<< "Failed to set O_DIRECT on file "
+					<< file_name << "; " << operation_name
+					<< ": " << strerror(errno_save) << ", "
+					"continuing anyway. O_DIRECT is "
+					"known to result in 'Invalid argument' "
+					"on Linux on tmpfs, "
+					"see MySQL Bug#26662.";
+# else /* UNIV_LINUX */
+				goto short_warning;
+# endif /* UNIV_LINUX */
+			}
+		} else {
+# ifndef UNIV_LINUX
+short_warning:
+# endif
+			ib::warn()
+				<< "Failed to set O_DIRECT on file "
+				<< file_name << "; " << operation_name
+				<< " : " << strerror(errno_save)
+				<< ", continuing anyway.";
+		}
+	}
+#endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */
+}
+
+#endif /* _WIN32 */
+
+/** Check if the file system supports sparse files.
+On Windows this reports whether the file itself has the sparse attribute
+set; elsewhere it tries to punch a hole at the start of the file.
+@param fh	file handle
+@return true if the file system supports sparse files */
+IF_WIN(static,) bool os_is_sparse_file_supported(os_file_t fh)
+{
+#ifdef _WIN32
+	FILE_ATTRIBUTE_TAG_INFO	tag_info;
+
+	if (!GetFileInformationByHandleEx(fh, FileAttributeTagInfo,
+					  &tag_info,
+					  (DWORD)sizeof(tag_info))) {
+		return false;
+	}
+
+	if (tag_info.FileAttributes == INVALID_FILE_ATTRIBUTES) {
+		return false;
+	}
+
+	return (tag_info.FileAttributes & FILE_ATTRIBUTE_SPARSE_FILE) != 0;
+#else
+	/* The file system block size is not known here. Try to punch
+	a hole of srv_page_size bytes at offset 0 and let the file
+	system decide whether it can do that. */
+	return os_file_punch_hole_posix(fh, 0, srv_page_size) == DB_SUCCESS;
+#endif /* _WIN32 */
+}
+
+/** Extend a file.
+
+On Windows, extending a file allocates blocks for the file,
+unless the file is sparse.
+
+On Unix, we will extend the file with ftruncate(), if
+file needs to be sparse. Otherwise posix_fallocate() is used
+when available, and if not, binary zeroes are added to the end
+of file.
+
+@param[in] name file name
+@param[in] file file handle
+@param[in] size desired file size
+@param[in] is_sparse whether to create a sparse file (no preallocating)
+@return whether the operation succeeded */
+bool
+os_file_set_size(
+ const char* name,
+ os_file_t file,
+ os_offset_t size,
+ bool is_sparse)
+{
+#ifdef _WIN32
+ /* On Windows, changing file size works well and as expected for both
+ sparse and normal files.
+
+ However, 10.2 up until 10.2.9 made every file sparse in innodb,
+ causing NTFS fragmentation issues(MDEV-13941). We try to undo
+ the damage, and unsparse the file.*/
+
+ if (!is_sparse && os_is_sparse_file_supported(file)) {
+ if (!os_file_set_sparse_win32(file, false))
+ /* Unsparsing file failed. Fallback to writing binary
+ zeros, to avoid even higher fragmentation.*/
+ goto fallback;
+ }
+
+ return os_file_change_size_win32(name, file, size);
+
+fallback:
+#else
+ struct stat statbuf;
+
+ if (is_sparse) {
+ bool success = !ftruncate(file, size);
+ if (!success) {
+ ib::error() << "ftruncate of file " << name << " to "
+ << size << " bytes failed with error "
+ << errno;
+ }
+ return(success);
+ }
+
+# ifdef HAVE_POSIX_FALLOCATE
+ int err;
+ /* Repeat the allocation while it is interrupted (EINTR), unless
+ the shutdown has progressed beyond SRV_SHUTDOWN_INITIATED. */
+ do {
+ if (fstat(file, &statbuf)) {
+ err = errno;
+ } else {
+ os_offset_t current_size = statbuf.st_size;
+ if (current_size >= size) {
+ return true;
+ }
+ /* Round the start of the allocation down to a multiple of
+ st_blksize. */
+ current_size &= ~os_offset_t(statbuf.st_blksize - 1);
+ err = posix_fallocate(file, current_size,
+ size - current_size);
+ }
+ } while (err == EINTR
+ && srv_shutdown_state <= SRV_SHUTDOWN_INITIATED);
+
+ switch (err) {
+ case 0:
+ return true;
+ default:
+ ib::error() << "preallocating "
+ << size << " bytes for file " << name
+ << " failed with error " << err;
+ /* fall through */
+ case EINTR:
+ errno = err;
+ return false;
+ case EINVAL:
+ case EOPNOTSUPP:
+ /* fall back to the code below */
+ break;
+ }
+# endif /* HAVE_POSIX_FALLOCATE */
+#endif /* _WIN32*/
+
+ /* Fallback: extend the file by writing zero-filled buffers,
+ starting at the current size rounded down to a sector (Windows)
+ or st_blksize (elsewhere) boundary. */
+#ifdef _WIN32
+ os_offset_t current_size = os_file_get_size(file);
+ FILE_STORAGE_INFO info;
+ if (GetFileInformationByHandleEx(file, FileStorageInfo, &info,
+ sizeof info)) {
+ if (info.LogicalBytesPerSector) {
+ current_size &= ~os_offset_t(info.LogicalBytesPerSector
+ - 1);
+ }
+ }
+#else
+ if (fstat(file, &statbuf)) {
+ return false;
+ }
+ os_offset_t current_size = statbuf.st_size
+ & ~os_offset_t(statbuf.st_blksize - 1);
+#endif
+ if (current_size >= size) {
+ return true;
+ }
+
+ /* Write up to 64 pages at a time.
+ NOTE(review): if size is smaller than one page, buf_size becomes 0
+ and the loop below cannot advance; presumably callers always pass
+ at least one page - confirm. */
+ ulint buf_size = ut_min(ulint(64),
+ ulint(size >> srv_page_size_shift))
+ << srv_page_size_shift;
+
+ /* Align the buffer for possible raw i/o */
+ byte* buf = static_cast<byte*>(aligned_malloc(buf_size,
+ srv_page_size));
+ /* Write buffer full of zeros */
+ memset(buf, 0, buf_size);
+
+ while (current_size < size
+ && srv_shutdown_state <= SRV_SHUTDOWN_INITIATED) {
+ ulint n_bytes;
+
+ /* The last chunk may be shorter than the buffer. */
+ if (size - current_size < (os_offset_t) buf_size) {
+ n_bytes = (ulint) (size - current_size);
+ } else {
+ n_bytes = buf_size;
+ }
+
+ if (os_file_write(IORequestWrite, name,
+ file, buf, current_size, n_bytes) !=
+ DB_SUCCESS) {
+ break;
+ }
+
+ current_size += n_bytes;
+ }
+
+ aligned_free(buf);
+
+ /* Success requires that the whole range was written (the loop was
+ not ended early by a write error or by shutdown) and flushed. */
+ return(current_size >= size && os_file_flush(file));
+}
+
+/** Change the size of a file to the given number of bytes.
+When allow_shrink is false and the requested size is not smaller than
+the current file size, the file is left untouched and success is
+reported. In every other case the platform-specific resize routine
+is invoked.
+NOTE(review): with allow_shrink == false the file is still resized when
+size is smaller than the current size; confirm against the callers
+that this is the intended meaning of the flag.
+@param[in] pathname file path
+@param[in] file handle of the file to be resized
+@param[in] size number of bytes to preserve
+@param[in] allow_shrink see the description above
+@return true if success */
+bool
+os_file_truncate(
+  const char* pathname,
+  os_file_t file,
+  os_offset_t size,
+  bool allow_shrink)
+{
+  /* The current size is only queried when allow_shrink is false. */
+  if (!allow_shrink && size >= os_file_get_size(file)) {
+    return true;
+  }
+
+#ifdef _WIN32
+  const bool ok = os_file_change_size_win32(pathname, file, size);
+#else /* _WIN32 */
+  const bool ok = os_file_truncate_posix(pathname, file, size);
+#endif /* _WIN32 */
+
+  return ok;
+}
+
+/** Synchronous positioned read; forwards to os_file_read_page().
+NOTE! Use the corresponding macro os_file_read(), not directly this
+function!
+@param[in] type IO flags
+@param[in] file handle to an open file
+@param[out] buf buffer where to read
+@param[in] offset file offset from the start where to read
+@param[in] n number of bytes to read, starting from offset
+@return error code
+@retval DB_SUCCESS if the operation succeeded */
+dberr_t
+os_file_read_func(
+  const IORequest& type,
+  os_file_t file,
+  void* buf,
+  os_offset_t offset,
+  ulint n)
+{
+  /* No "bytes actually read" output is requested (NULL). The last
+  argument is true here and false in
+  os_file_read_no_error_handling_func(). */
+  const dberr_t err = os_file_read_page(
+    type, file, buf, offset, n, NULL, true);
+
+  return err;
+}
+
+/** Synchronous positioned read that also reports the number of bytes
+read; forwards to os_file_read_page().
+NOTE! Use the corresponding macro os_file_read_no_error_handling(),
+not directly this function!
+@param[in] type IO flags
+@param[in] file handle to an open file
+@param[out] buf buffer where to read
+@param[in] offset file offset from the start where to read
+@param[in] n number of bytes to read, starting from offset
+@param[out] o number of bytes actually read
+@return DB_SUCCESS or error code */
+dberr_t
+os_file_read_no_error_handling_func(
+  const IORequest& type,
+  os_file_t file,
+  void* buf,
+  os_offset_t offset,
+  ulint n,
+  ulint* o)
+{
+  /* The last argument is false here and true in os_file_read_func(). */
+  const dberr_t err = os_file_read_page(
+    type, file, buf, offset, n, o, false);
+
+  return err;
+}
+
+/** Determine whether a path exists and, if it does, what type of file
+it is. Dispatches to the platform-specific implementation.
+@param[in] path path name of file
+@param[out] exists true if the file exists
+@param[out] type Type of the file, if it exists
+@return true if call succeeded */
+bool
+os_file_status(
+  const char* path,
+  bool* exists,
+  os_file_type_t* type)
+{
+#ifdef _WIN32
+  const bool ok = os_file_status_win32(path, exists, type);
+#else
+  const bool ok = os_file_status_posix(path, exists, type);
+#endif /* _WIN32 */
+
+  return ok;
+}
+
+/** Release the storage that backs a byte range of a file.
+Dispatches to the platform-specific implementation.
+@param[in] fh Open file handle
+@param[in] off Starting offset (SEEK_SET)
+@param[in] len Size of the hole
+@return DB_SUCCESS or error code */
+dberr_t
+os_file_punch_hole(
+  os_file_t fh,
+  os_offset_t off,
+  os_offset_t len)
+{
+#ifdef _WIN32
+  const dberr_t err = os_file_punch_hole_win32(fh, off, len);
+#else
+  const dberr_t err = os_file_punch_hole_posix(fh, off, len);
+#endif /* _WIN32 */
+
+  return err;
+}
+
+/** Free the storage behind the unused tail of a page.
+The first len bytes starting at off are kept; the hole covers the rest
+of the page, i.e. bpage->physical_size() - len bytes starting at
+off + len.
+@param off byte offset from the start (SEEK_SET) of the data to keep
+@param len number of bytes at off that are kept
+@return DB_SUCCESS or error code
+@retval DB_IO_NO_PUNCH_HOLE if the tablespace is marked as not
+supporting hole punching */
+dberr_t IORequest::punch_hole(os_offset_t off, ulint len) const
+{
+  /* Without a page descriptor there is nothing to trim. */
+  if (!bpage) {
+    return(DB_SUCCESS);
+  }
+
+  const ulint tail_len = bpage->physical_size() - len;
+
+  if (tail_len == 0) {
+    return(DB_SUCCESS);
+  }
+
+  /* Check does file system support punching holes for this
+  tablespace. */
+  if (!node->space->punch_hole) {
+    return DB_IO_NO_PUNCH_HOLE;
+  }
+
+  const dberr_t err = os_file_punch_hole(node->handle, off + len,
+                                         tail_len);
+
+  if (err == DB_SUCCESS) {
+    srv_stats.page_compressed_trim_op.inc();
+    return err;
+  }
+
+  if (err != DB_IO_NO_PUNCH_HOLE) {
+    return err;
+  }
+
+  /* Punching holes turned out to be unsupported: remember that
+  for the tablespace and do not treat it as a failure. */
+  node->space->punch_hole = false;
+  return DB_SUCCESS;
+}
+
+/** Retrieve information about the file at the given path.
+The platform-specific helper is called first; when it succeeds, the
+change/access/modification times and the size from the stat structure
+are copied into stat_info.
+@param[in] path pathname of the file
+@param[out] stat_info information of a file in a directory
+@param[in] check_rw_perm for testing whether the file can be opened
+ in RW mode
+@param[in] read_only true if file is opened in read-only mode
+@return DB_SUCCESS if all OK */
+dberr_t
+os_file_get_status(
+  const char* path,
+  os_file_stat_t* stat_info,
+  bool check_rw_perm,
+  bool read_only)
+{
+#ifdef _WIN32
+  struct _stat64 info;
+  const dberr_t ret = os_file_get_status_win32(
+    path, stat_info, &info, check_rw_perm, read_only);
+#else
+  struct stat info;
+  const dberr_t ret = os_file_get_status_posix(
+    path, stat_info, &info, check_rw_perm, read_only);
+#endif /* _WIN32 */
+
+  if (ret != DB_SUCCESS) {
+    /* info is not used when the helper failed. */
+    return(ret);
+  }
+
+  stat_info->size = info.st_size;
+  stat_info->mtime = info.st_mtime;
+  stat_info->atime = info.st_atime;
+  stat_info->ctime = info.st_ctime;
+
+  return(ret);
+}
+
+
+extern void fil_aio_callback(const IORequest &request);
+
+/** Callback installed by os_aio() on every control block it submits.
+Copies the IORequest stored in the control block, hands the control
+block back to the slot cache it belongs to (read or write, chosen by
+the opcode), and finally passes the copy to fil_aio_callback().
+@param cb asynchronous I/O control block */
+static void io_callback(tpool::aiocb* cb)
+{
+  ut_a(cb->m_err == DB_SUCCESS);
+
+  /* Copy the request before cb is released; presumably cb may be
+  handed out again after that. */
+  const IORequest request(*static_cast<const IORequest*>
+                          (static_cast<const void*>(cb->m_userdata)));
+
+  /* Return cb back to cache*/
+  const bool is_write = cb->m_opcode != tpool::aio_opcode::AIO_PREAD;
+
+  if (is_write)
+  {
+    ut_ad(write_slots->contains(cb));
+    write_slots->release(cb);
+  }
+  else
+  {
+    ut_ad(read_slots->contains(cb));
+    read_slots->release(cb);
+  }
+
+  fil_aio_callback(request);
+}
+
+#ifdef LINUX_NATIVE_AIO
+/** Checks if the system supports native linux aio. On some kernel
+versions where native aio is supported it won't work on tmpfs. In such
+cases we can't use native aio.
+
+The check sets up an AIO context, then submits and collects a single
+request: a one-page write to a temporary file, or in read-only mode a
+512-byte read from the file returned by get_log_file_path().
+
+io_destroy() results are compared against negative errno values, in
+the same way on every exit path.
+
+@return: true if supported, false otherwise. */
+static bool is_linux_native_aio_supported()
+{
+  File fd;
+  io_context_t io_ctx;
+  std::string log_file_path = get_log_file_path();
+
+  memset(&io_ctx, 0, sizeof(io_ctx));
+  if (io_setup(1, &io_ctx)) {
+
+    /* The platform does not support native aio. */
+
+    return(false);
+
+  }
+  else if (!srv_read_only_mode) {
+
+    /* Now check if tmpdir supports native aio ops. */
+    fd = mysql_tmpfile("ib");
+
+    if (fd < 0) {
+      ib::warn()
+        << "Unable to create temp file to check"
+        " native AIO support.";
+
+      int ret = io_destroy(io_ctx);
+      ut_a(ret != -EINVAL);
+      ut_ad(ret != -EFAULT);
+
+      return(false);
+    }
+  }
+  else {
+    fd = my_open(log_file_path.c_str(), O_RDONLY | O_CLOEXEC,
+                 MYF(0));
+
+    if (fd == -1) {
+
+      ib::warn() << "Unable to open \"" << log_file_path
+        << "\" to check native"
+        << " AIO read support.";
+
+      /* Negative errno, as on the other exit paths. */
+      int ret = io_destroy(io_ctx);
+      ut_a(ret != -EINVAL);
+      ut_ad(ret != -EFAULT);
+
+      return(false);
+    }
+  }
+
+  struct io_event io_event;
+
+  memset(&io_event, 0x0, sizeof(io_event));
+
+  byte* ptr = static_cast<byte*>(aligned_malloc(srv_page_size,
+                                                srv_page_size));
+
+  struct iocb iocb;
+
+  /* Suppress valgrind warning. */
+  memset(ptr, 0, srv_page_size);
+  memset(&iocb, 0x0, sizeof(iocb));
+
+  struct iocb* p_iocb = &iocb;
+
+  if (!srv_read_only_mode) {
+
+    io_prep_pwrite(p_iocb, fd, ptr, srv_page_size, 0);
+
+  }
+  else {
+    ut_a(srv_page_size >= 512);
+    io_prep_pread(p_iocb, fd, ptr, 512, 0);
+  }
+
+  int err = io_submit(io_ctx, 1, &p_iocb);
+
+  if (err >= 1) {
+    /* Now collect the submitted IO request. */
+    err = io_getevents(io_ctx, 1, 1, &io_event, NULL);
+  }
+
+  /* The buffer and the file are no longer needed, whatever
+  the outcome was. */
+  aligned_free(ptr);
+  my_close(fd, MYF(MY_WME));
+
+  switch (err) {
+  case 1:
+    {
+      int ret = io_destroy(io_ctx);
+      ut_a(ret != -EINVAL);
+      ut_ad(ret != -EFAULT);
+
+      return(true);
+    }
+
+  case -EINVAL:
+  case -ENOSYS:
+    ib::warn()
+      << "Linux Native AIO not supported. You can either"
+      " move "
+      << (srv_read_only_mode ? log_file_path : "tmpdir")
+      << " to a file system that supports native"
+      " AIO or you can set innodb_use_native_aio to"
+      " FALSE to avoid this message.";
+
+    /* fall through. */
+  default:
+    ib::warn()
+      << "Linux Native AIO check on "
+      << (srv_read_only_mode ? log_file_path : "tmpdir")
+      << " returned error[" << -err << "]";
+  }
+
+  int ret = io_destroy(io_ctx);
+  ut_a(ret != -EINVAL);
+  ut_ad(ret != -EFAULT);
+
+  return(false);
+}
+#endif
+
+/** Initialize the asynchronous I/O subsystem.
+Configures asynchronous I/O in srv_thread_pool for the combined number
+of read and write events and, on success, creates the read and write
+slot caches.
+When built with LINUX_NATIVE_AIO and native AIO was requested but is
+either not supported or fails to configure, a warning is issued,
+srv_use_native_aio is cleared and the configuration is retried
+without native AIO.
+@return 0 on success, else the nonzero result of configure_aio() */
+int os_aio_init()
+{
+  int max_write_events= int(srv_n_write_io_threads *
+                            OS_AIO_N_PENDING_IOS_PER_THREAD);
+  int max_read_events= int(srv_n_read_io_threads *
+                           OS_AIO_N_PENDING_IOS_PER_THREAD);
+  int max_events= max_read_events + max_write_events;
+  int ret;
+  /* Tested with #ifdef, like every other use of LINUX_NATIVE_AIO
+  in this file. */
+#ifdef LINUX_NATIVE_AIO
+  if (srv_use_native_aio && !is_linux_native_aio_supported())
+    goto disable;
+#endif
+
+  ret= srv_thread_pool->configure_aio(srv_use_native_aio, max_events);
+
+#ifdef LINUX_NATIVE_AIO
+  if (ret)
+  {
+    ut_ad(srv_use_native_aio);
+disable:
+    ib::warn() << "Linux Native AIO disabled.";
+    srv_use_native_aio= false;
+    ret= srv_thread_pool->configure_aio(false, max_events);
+  }
+#endif
+
+  if (!ret)
+  {
+    read_slots= new io_slots(max_read_events, srv_n_read_io_threads);
+    write_slots= new io_slots(max_write_events, srv_n_write_io_threads);
+  }
+  return ret;
+}
+
+
+/** Shut down asynchronous I/O: disable it in srv_thread_pool, then
+destroy the read and write slot caches and reset their pointers. */
+void os_aio_free()
+{
+  srv_thread_pool->disable_aio();
+
+  delete read_slots;
+  read_slots= nullptr;
+
+  delete write_slots;
+  write_slots= nullptr;
+}
+
+/** Wait until there are no pending asynchronous writes.
+The wait is bracketed by tpool_wait_begin()/tpool_wait_end() only if
+writes were pending when the function was entered. */
+static void os_aio_wait_until_no_pending_writes_low()
+{
+  if (write_slots->pending_io_count() > 0)
+  {
+    tpool::tpool_wait_begin();
+    write_slots->wait();
+    tpool::tpool_wait_end();
+    return;
+  }
+
+  write_slots->wait();
+}
+
+/** Wait until there are no pending asynchronous writes, then wait
+for the doublewrite buffer via
+buf_dblwr.wait_flush_buffered_writes().
+Only used on FLUSH TABLES...FOR EXPORT. */
+void os_aio_wait_until_no_pending_writes()
+{
+  /* First the write slots ... */
+  os_aio_wait_until_no_pending_writes_low();
+
+  /* ... then the doublewrite buffer. */
+  buf_dblwr.wait_flush_buffered_writes();
+}
+
+/** Request a read or write.
+A request that is not asynchronous is executed right away with
+os_file_read_func() or os_file_write_func(). An asynchronous request
+is copied into a control block taken from the read or write slot
+cache and submitted to srv_thread_pool; io_callback() is installed as
+its callback.
+@param type I/O request
+@param buf buffer
+@param offset file offset
+@param n number of bytes
+@return result of the synchronous read/write, or for asynchronous
+requests:
+@retval DB_SUCCESS if request was queued successfully
+@retval DB_IO_ERROR on I/O error */
+dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, size_t n)
+{
+ ut_ad(n > 0);
+ ut_ad((n % OS_FILE_LOG_BLOCK_SIZE) == 0);
+ ut_ad((offset % OS_FILE_LOG_BLOCK_SIZE) == 0);
+ ut_ad(type.is_read() || type.is_write());
+ ut_ad(type.node);
+ ut_ad(type.node->is_open());
+
+#ifdef WIN_ASYNC_IO
+ /* The length must fit in 32 bits. */
+ ut_ad((n & 0xFFFFFFFFUL) == n);
+#endif /* WIN_ASYNC_IO */
+
+#ifdef UNIV_PFS_IO
+ PSI_file_locker_state state;
+ PSI_file_locker* locker= nullptr;
+ register_pfs_file_io_begin(&state, locker, type.node->handle, n,
+ type.is_write()
+ ? PSI_FILE_WRITE : PSI_FILE_READ,
+ __FILE__, __LINE__);
+#endif /* UNIV_PFS_IO */
+ dberr_t err = DB_SUCCESS;
+
+ if (!type.is_async()) {
+ /* Synchronous request: perform the I/O in this thread. */
+ err = type.is_read()
+ ? os_file_read_func(type, type.node->handle,
+ buf, offset, n)
+ : os_file_write_func(type, type.node->name,
+ type.node->handle,
+ buf, offset, n);
+ /* Common exit, also reached by the goto at the end of the
+ asynchronous path. */
+func_exit:
+#ifdef UNIV_PFS_IO
+ register_pfs_file_io_end(locker, n);
+#endif /* UNIV_PFS_IO */
+ return err;
+ }
+
+ /* Asynchronous request: count it ... */
+ if (type.is_read()) {
+ ++os_n_file_reads;
+ } else {
+ ++os_n_file_writes;
+ }
+
+ /* ... and fill in a control block from the matching slot cache.
+ The IORequest is copy-constructed into cb->m_userdata, which the
+ assertion below guarantees is large enough. */
+ compile_time_assert(sizeof(IORequest) <= tpool::MAX_AIO_USERDATA_LEN);
+ io_slots* slots= type.is_read() ? read_slots : write_slots;
+ tpool::aiocb* cb = slots->acquire();
+
+ cb->m_buffer = buf;
+ cb->m_callback = (tpool::callback_func)io_callback;
+ cb->m_group = slots->get_task_group();
+ cb->m_fh = type.node->handle.m_file;
+ cb->m_len = (int)n;
+ cb->m_offset = offset;
+ cb->m_opcode = type.is_read() ? tpool::aio_opcode::AIO_PREAD : tpool::aio_opcode::AIO_PWRITE;
+ new (cb->m_userdata) IORequest{type};
+
+ /* Buffer address, length and offset must all be multiples of
+ OS_FILE_LOG_BLOCK_SIZE; checked in release builds as well. */
+ ut_a(reinterpret_cast<size_t>(cb->m_buffer) % OS_FILE_LOG_BLOCK_SIZE
+ == 0);
+ ut_a(cb->m_len % OS_FILE_LOG_BLOCK_SIZE == 0);
+ ut_a(cb->m_offset % OS_FILE_LOG_BLOCK_SIZE == 0);
+
+ if (srv_thread_pool->submit_io(cb)) {
+ /* Submission failed: give the control block back and
+ report the error. */
+ slots->release(cb);
+ os_file_handle_error(type.node->name, type.is_read()
+ ? "aio read" : "aio write");
+ err = DB_IO_ERROR;
+ }
+
+ goto func_exit;
+}
+
+/** Prints I/O statistics: pending flushes, total counts of file
+reads, writes and fsyncs, pending reads/writes (only when nonzero),
+and per-second rates computed against the values saved at the
+previous printout. The saved values are refreshed before returning.
+@param[in,out] file file where to print */
+void
+os_aio_print(FILE* file)
+{
+ time_t current_time;
+ double time_elapsed;
+
+ current_time = time(NULL);
+ /* The added 0.001 keeps the divisor nonzero when no time has
+ elapsed since the last printout. */
+ time_elapsed = 0.001 + difftime(current_time, os_last_printout);
+
+ fprintf(file,
+ "Pending flushes (fsync) log: " ULINTPF
+ "; buffer pool: " ULINTPF "\n"
+ ULINTPF " OS file reads, "
+ ULINTPF " OS file writes, "
+ ULINTPF " OS fsyncs\n",
+ log_sys.get_pending_flushes(),
+ ulint{fil_n_pending_tablespace_flushes},
+ ulint{os_n_file_reads},
+ os_n_file_writes,
+ os_n_fsyncs);
+
+ const ulint n_reads = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_READS));
+ const ulint n_writes = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_WRITES));
+
+ if (n_reads != 0 || n_writes != 0) {
+ fprintf(file,
+ ULINTPF " pending reads, " ULINTPF " pending writes\n",
+ n_reads, n_writes);
+ }
+
+ /* Average read size since the last printout; 0 when there were
+ no reads, which also avoids a division by zero. */
+ ulint avg_bytes_read = (os_n_file_reads == os_n_file_reads_old)
+ ? 0
+ : os_bytes_read_since_printout
+ / (os_n_file_reads - os_n_file_reads_old);
+
+ fprintf(file,
+ "%.2f reads/s, " ULINTPF " avg bytes/read,"
+ " %.2f writes/s, %.2f fsyncs/s\n",
+ static_cast<double>(os_n_file_reads - os_n_file_reads_old)
+ / time_elapsed,
+ avg_bytes_read,
+ static_cast<double>(os_n_file_writes - os_n_file_writes_old)
+ / time_elapsed,
+ static_cast<double>(os_n_fsyncs - os_n_fsyncs_old)
+ / time_elapsed);
+
+ /* Start a new measurement interval. */
+ os_n_file_reads_old = os_n_file_reads;
+ os_n_file_writes_old = os_n_file_writes;
+ os_n_fsyncs_old = os_n_fsyncs;
+ os_bytes_read_since_printout = 0;
+
+ os_last_printout = current_time;
+}
+
+/** Refreshes the statistics used to print per-second averages.
+Saves the current read, write and fsync counters as the "old" values,
+clears the count of bytes read since the last printout and sets the
+time of the last printout to now. Each value is assigned once, in
+the same order as at the end of os_aio_print(). */
+void
+os_aio_refresh_stats()
+{
+  os_n_file_reads_old = os_n_file_reads;
+  os_n_file_writes_old = os_n_file_writes;
+  os_n_fsyncs_old = os_n_fsyncs;
+  os_bytes_read_since_printout = 0;
+
+  os_last_printout = time(NULL);
+}
+
+
+/** Set the file create umask.
+Stores the value in os_innodb_umask.
+@param[in] umask The umask to use for file creation. */
+void os_file_set_umask(ulint umask)
+{
+  os_innodb_umask = umask;
+}
+
+#ifdef _WIN32
+
+/** Checks whether a physical drive is an SSD.
+The drive \\.\PhysicalDrive<nr> is asked for its seek penalty
+property. A device that incurs a seek penalty is a rotating disk, so
+the drive is reported as SSD only when the query succeeds and
+IncursSeekPenalty is false.
+@param nr physical drive number
+@return whether the drive was positively identified as an SSD;
+false also when the drive cannot be opened or queried */
+static bool is_drive_on_ssd(DWORD nr)
+{
+  char physical_drive_path[32];
+  snprintf(physical_drive_path, sizeof(physical_drive_path),
+           "\\\\.\\PhysicalDrive%lu", nr);
+
+  /* No access rights are requested (0); the handle is only used
+  for the property query. */
+  HANDLE h= CreateFile(physical_drive_path, 0,
+                       FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+                       nullptr, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, nullptr);
+  if (h == INVALID_HANDLE_VALUE)
+    return false;
+
+  DEVICE_SEEK_PENALTY_DESCRIPTOR seek_penalty;
+  STORAGE_PROPERTY_QUERY storage_query{};
+  storage_query.PropertyId= StorageDeviceSeekPenaltyProperty;
+  storage_query.QueryType= PropertyStandardQuery;
+
+  DWORD bytes_written;
+  const bool on_ssd=
+    DeviceIoControl(h, IOCTL_STORAGE_QUERY_PROPERTY, &storage_query,
+                    sizeof storage_query, &seek_penalty, sizeof seek_penalty,
+                    &bytes_written, nullptr)
+    && !seek_penalty.IncursSeekPenalty;
+
+  CloseHandle(h);
+  return on_ssd;
+}
+
+/*
+ Checks whether volume is on SSD, by checking all physical drives
+ in that volume.
+
+ Returns true only if the disk extents of the volume could be
+ enumerated and is_drive_on_ssd() returned true for every one of them.
+ Returns false when the volume name cannot be resolved, the volume
+ cannot be opened, or the extents cannot be queried.
+*/
+static bool is_volume_on_ssd(const char *volume_mount_point)
+{
+ char volume_name[MAX_PATH];
+
+ if (!GetVolumeNameForVolumeMountPoint(volume_mount_point, volume_name,
+ array_elements(volume_name)))
+ {
+ /* This can fail, e.g if file is on network share */
+ return false;
+ }
+
+ /* Chomp last backslash, this is needed to open volume.*/
+ size_t length= strlen(volume_name);
+ if (length && volume_name[length - 1] == '\\')
+ volume_name[length - 1]= 0;
+
+ /* Open volume handle */
+ HANDLE volume_handle= CreateFile(
+ volume_name, 0, FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+ nullptr, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, nullptr);
+
+ if (volume_handle == INVALID_HANDLE_VALUE)
+ return false;
+
+ /*
+ Enumerate all volume extents, check whether all of them are on SSD
+ */
+
+ /* Anticipate common case where there is only one extent.*/
+ VOLUME_DISK_EXTENTS single_extent;
+
+ /* But also have a place to manage allocated data.*/
+ std::unique_ptr<BYTE[]> lifetime;
+
+ DWORD bytes_written;
+ /* Stays null unless one of the queries below succeeds. */
+ VOLUME_DISK_EXTENTS *extents= nullptr;
+ if (DeviceIoControl(volume_handle, IOCTL_VOLUME_GET_VOLUME_DISK_EXTENTS,
+ nullptr, 0, &single_extent, sizeof(single_extent),
+ &bytes_written, nullptr))
+ {
+ /* Worked on the first try. Use the preallocated buffer.*/
+ extents= &single_extent;
+ }
+ else
+ {
+ /* Retry with a buffer sized for the extent count reported by the
+ previous query, for as long as the failure is ERROR_MORE_DATA. */
+ VOLUME_DISK_EXTENTS *last_query= &single_extent;
+ while (GetLastError() == ERROR_MORE_DATA)
+ {
+ DWORD extentCount= last_query->NumberOfDiskExtents;
+ DWORD allocatedSize=
+ FIELD_OFFSET(VOLUME_DISK_EXTENTS, Extents[extentCount]);
+ lifetime.reset(new BYTE[allocatedSize]);
+ last_query= (VOLUME_DISK_EXTENTS *) lifetime.get();
+ if (DeviceIoControl(volume_handle, IOCTL_VOLUME_GET_VOLUME_DISK_EXTENTS,
+ nullptr, 0, last_query, allocatedSize,
+ &bytes_written, nullptr))
+ {
+ extents= last_query;
+ break;
+ }
+ }
+ }
+ CloseHandle(volume_handle);
+ if (!extents)
+ return false;
+
+ /* A single drive for which is_drive_on_ssd() is false disqualifies
+ the whole volume. */
+ for (DWORD i= 0; i < extents->NumberOfDiskExtents; i++)
+ if (!is_drive_on_ssd(extents->Extents[i].DiskNumber))
+ return false;
+
+ return true;
+}
+
+#include <unordered_map>
+/** Determine whether a file is on an SSD volume.
+The volume path of the file is looked up in a function-local cache;
+on a miss is_volume_on_ssd() is consulted and its answer is cached.
+@param file_path path of the file
+@return cached or freshly computed result of is_volume_on_ssd(), or
+false if the volume path of the file cannot be determined */
+static bool is_file_on_ssd(char *file_path)
+{
+  /* Cache of volume_path => volume_info, protected by rwlock.*/
+  static std::unordered_map<std::string, bool> cache;
+  static SRWLOCK lock= SRWLOCK_INIT;
+
+  /* Resolving the volume can fail, e.g we're on network drive.*/
+  char volume_path[MAX_PATH];
+  if (!GetVolumePathName(file_path, volume_path, array_elements(volume_path)))
+    return false;
+
+  const std::string key(volume_path);
+
+  /* Try cached volume info first.*/
+  AcquireSRWLockShared(&lock);
+  const auto hit= cache.find(key);
+  const bool cached= hit != cache.end();
+  const bool cached_value= cached && hit->second;
+  ReleaseSRWLockShared(&lock);
+
+  if (cached)
+    return cached_value;
+
+  /* Not cached yet: query the volume without holding the lock. */
+  const bool on_ssd= is_volume_on_ssd(volume_path);
+
+  /* Update cache */
+  AcquireSRWLockExclusive(&lock);
+  cache[key]= on_ssd;
+  ReleaseSRWLockExclusive(&lock);
+
+  return on_ssd;
+}
+
+#endif
+
+/** Determine some file metadata when creating or reading the file.
+Sets block_size and on_ssd of this node and atomic_write_supported of
+its tablespace; on Windows it may also clear space->punch_hole.
+@param file the file that is being created, or OS_FILE_CLOSED
+ to use the handle of this node
+@param statbuf (not on Windows) fstat() result for the file, or a
+ null pointer to have fstat() called here */
+void fil_node_t::find_metadata(os_file_t file
+#ifndef _WIN32
+ , struct stat* statbuf
+#endif
+ )
+{
+ if (file == OS_FILE_CLOSED) {
+ file = handle;
+ ut_ad(is_open());
+ }
+
+#ifdef _WIN32 /* FIXME: make this unconditional */
+ /* Keep punch_hole enabled only if sparse files are supported. */
+ if (space->punch_hole) {
+ space->punch_hole = os_is_sparse_file_supported(file);
+ }
+#endif
+
+ /*
+ For the temporary tablespace and during the
+ non-redo-logged adjustments in
+ IMPORT TABLESPACE, we do not care about
+ the atomicity of writes.
+
+ Atomic writes is supported if the file can be used
+ with atomic_writes (not log file), O_DIRECT is
+ used (tested in ha_innodb.cc) and the file is
+ device and file system that supports atomic writes
+ for the given block size.
+ */
+ space->atomic_write_supported = space->purpose == FIL_TYPE_TEMPORARY
+ || space->purpose == FIL_TYPE_IMPORT;
+#ifdef _WIN32
+ on_ssd = is_file_on_ssd(name);
+ FILE_STORAGE_INFO info;
+ if (GetFileInformationByHandleEx(
+ file, FileStorageInfo, &info, sizeof(info))) {
+ block_size = info.PhysicalBytesPerSectorForAtomicity;
+ } else {
+ /* Fall back to 512 bytes when the query fails. */
+ block_size = 512;
+ }
+#else
+ struct stat sbuf;
+ /* Call fstat() only if the caller did not supply the result. */
+ if (!statbuf && !fstat(file, &sbuf)) {
+ statbuf = &sbuf;
+ }
+ /* block_size is left unchanged when no stat data is available. */
+ if (statbuf) {
+ block_size = statbuf->st_blksize;
+ }
+ on_ssd = space->atomic_write_supported
+# ifdef UNIV_LINUX
+ || (statbuf && fil_system.is_ssd(statbuf->st_dev))
+# endif
+ ;
+#endif
+ /* Tablespaces not covered by the purpose check above need the
+ atomic_write flag of this node, srv_use_atomic_writes and a
+ platform-specific check. */
+ if (!space->atomic_write_supported) {
+ space->atomic_write_supported = atomic_write
+ && srv_use_atomic_writes
+#ifndef _WIN32
+ && my_test_if_atomic_write(file,
+ space->physical_size())
+#else
+ /* On Windows, all single sector writes are atomic,
+ as per WriteFile() documentation on MSDN.
+ We also require SSD for atomic writes, eventhough
+ technically it is not necessary- the reason is that
+ on hard disks, we still want the benefit from
+ (non-atomic) neighbor page flushing in the buffer
+ pool code. */
+ && srv_page_size == block_size
+ && on_ssd
+#endif
+ ;
+ }
+}
+
+/** Read the first page of a data file.
+Checks the file size, reads page 0, validates the tablespace flags
+and the tablespace id found there against the tablespace object, and
+on success updates the size, flags and free-list fields of the
+tablespace from the page. The caller must hold fil_system.mutex.
+@return whether the page was found valid */
+bool fil_node_t::read_page0()
+{
+ ut_ad(mutex_own(&fil_system.mutex));
+ const unsigned psize = space->physical_size();
+#ifndef _WIN32
+ struct stat statbuf;
+ if (fstat(handle, &statbuf)) {
+ return false;
+ }
+ os_offset_t size_bytes = statbuf.st_size;
+#else
+ os_offset_t size_bytes = os_file_get_size(handle);
+ ut_a(size_bytes != (os_offset_t) -1);
+#endif
+ const uint32_t min_size = FIL_IBD_FILE_INITIAL_SIZE * psize;
+
+ if (size_bytes < min_size) {
+ ib::error() << "The size of the file " << name
+ << " is only " << size_bytes
+ << " bytes, should be at least " << min_size;
+ return false;
+ }
+
+ /* Page-sized buffer aligned to the page size. */
+ page_t *page= static_cast<byte*>(aligned_malloc(psize, psize));
+ if (os_file_read(IORequestRead, handle, page, 0, psize)
+ != DB_SUCCESS) {
+ ib::error() << "Unable to read first page of file " << name;
+ /* Shared failure exit: free the page buffer and give up. */
+corrupted:
+ aligned_free(page);
+ return false;
+ }
+
+ /* The space id is stored twice in the page; if the two copies
+ differ, treat the id as undefined. */
+ const ulint space_id = memcmp_aligned<2>(
+ FIL_PAGE_SPACE_ID + page,
+ FSP_HEADER_OFFSET + FSP_SPACE_ID + page, 4)
+ ? ULINT_UNDEFINED
+ : mach_read_from_4(FIL_PAGE_SPACE_ID + page);
+ ulint flags = fsp_header_get_flags(page);
+ const uint32_t size = fsp_header_get_field(page, FSP_SIZE);
+ const uint32_t free_limit = fsp_header_get_field(page, FSP_FREE_LIMIT);
+ const uint32_t free_len = flst_get_len(FSP_HEADER_OFFSET + FSP_FREE
+ + page);
+ if (!fil_space_t::is_valid_flags(flags, space->id)) {
+ /* Flags that are not valid as such may still be convertible
+ with fsp_flags_convert_from_101(). */
+ ulint cflags = fsp_flags_convert_from_101(flags);
+ if (cflags == ULINT_UNDEFINED) {
+invalid:
+ ib::error()
+ << "Expected tablespace flags "
+ << ib::hex(space->flags)
+ << " but found " << ib::hex(flags)
+ << " in the file " << name;
+ goto corrupted;
+ }
+
+ /* Compare the converted flags with the expected ones,
+ ignoring the bits in FSP_FLAGS_MEM_MASK. */
+ ulint cf = cflags & ~FSP_FLAGS_MEM_MASK;
+ ulint sf = space->flags & ~FSP_FLAGS_MEM_MASK;
+
+ if (!fil_space_t::is_flags_equal(cf, sf)
+ && !fil_space_t::is_flags_equal(sf, cf)) {
+ goto invalid;
+ }
+
+ flags = cflags;
+ }
+
+ ut_ad(!(flags & FSP_FLAGS_MEM_MASK));
+
+ /* Try to read crypt_data from page 0 if it is not yet read. */
+ if (!space->crypt_data) {
+ space->crypt_data = fil_space_read_crypt_data(
+ fil_space_t::zip_size(flags), page);
+ }
+ /* The page buffer is not needed beyond this point. */
+ aligned_free(page);
+
+ if (UNIV_UNLIKELY(space_id != space->id)) {
+ ib::error() << "Expected tablespace id " << space->id
+ << " but found " << space_id
+ << " in the file " << name;
+ return false;
+ }
+
+#ifdef UNIV_LINUX
+ /* Pass on the fstat() result obtained above. */
+ find_metadata(handle, &statbuf);
+#else
+ find_metadata();
+#endif
+ /* Truncate the size to a multiple of extent size. */
+ ulint mask = psize * FSP_EXTENT_SIZE - 1;
+
+ if (size_bytes <= mask) {
+ /* .ibd files start smaller than an
+ extent size. Do not truncate valid data. */
+ } else {
+ size_bytes &= ~os_offset_t(mask);
+ }
+
+ /* Keep the FSP_FLAGS_MEM_MASK bits of the tablespace object and
+ take all other flag bits from the page. */
+ space->flags = (space->flags & FSP_FLAGS_MEM_MASK) | flags;
+
+ space->punch_hole = space->is_compressed();
+ this->size = uint32_t(size_bytes / psize);
+ space->set_sizes(this->size);
+ ut_ad(space->free_limit == 0 || space->free_limit == free_limit);
+ ut_ad(space->free_len == 0 || space->free_len == free_len);
+ space->size_in_header = size;
+ space->free_limit = free_limit;
+ space->free_len = free_len;
+ return true;
+}
+
+#else
+#include "univ.i"
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** Normalizes a directory path for the current OS by replacing every
+OS_PATH_SEPARATOR_ALT character with OS_PATH_SEPARATOR, in place.
+On Windows, we convert '/' to '\', else we convert '\' to '/'.
+A null pointer is accepted and ignored.
+@param[in,out] str A null-terminated directory and file path */
+void
+os_normalize_path(
+  char* str)
+{
+  if (str == NULL) {
+    return;
+  }
+
+  while (*str) {
+    if (*str == OS_PATH_SEPARATOR_ALT) {
+      *str = OS_PATH_SEPARATOR;
+    }
+    ++str;
+  }
+}
diff --git a/storage/innobase/os/os0thread.cc b/storage/innobase/os/os0thread.cc
new file mode 100644
index 00000000..f3533acf
--- /dev/null
+++ b/storage/innobase/os/os0thread.cc
@@ -0,0 +1,131 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file os/os0thread.cc
+The interface to the operating system thread control primitives
+
+Created 9/8/1995 Heikki Tuuri
+*******************************************************/
+
+#include "univ.i"
+#include "srv0srv.h"
+
+#ifdef _WIN32
+/** Compare two thread identifiers.
+@return whether a and b compare equal */
+bool os_thread_eq(os_thread_id_t a, os_thread_id_t b)
+{
+  return a == b;
+}
+/** Yield the processor by calling SwitchToThread(). */
+void os_thread_yield()
+{
+  SwitchToThread();
+}
+/** @return the identifier of the calling thread, as reported by
+GetCurrentThreadId() */
+os_thread_id_t os_thread_get_curr_id()
+{
+  return GetCurrentThreadId();
+}
+#endif
+
+/****************************************************************//**
+Creates a new thread of execution. The execution starts from
+the function given.
+NOTE: A created thread should always exit through os_thread_exit().
+We do not return an error code because if thread creation fails,
+we crash here.
+On Windows the thread handle is closed right after creation; only the
+thread identifier is returned.
+@param func function where the new thread starts
+@param arg argument passed to func
+@return identifier of the new thread */
+os_thread_t os_thread_create(os_thread_func_t func, void *arg)
+{
+  os_thread_id_t new_thread_id;
+
+#ifdef _WIN32
+  HANDLE handle = CreateThread(NULL, /* no security attributes */
+                               0, /* default size stack */
+                               func,
+                               arg,
+                               0, /* thread runs immediately */
+                               &new_thread_id);
+
+  if (!handle) {
+    /* If we cannot start a new thread, life has no meaning. */
+    ib::fatal() << "CreateThread returned " << GetLastError();
+  }
+
+  CloseHandle(handle);
+#else /* _WIN32 else */
+  pthread_attr_t attr;
+  int ret = pthread_attr_init(&attr);
+
+  if (UNIV_UNLIKELY(ret)) {
+    fprintf(stderr,
+            "InnoDB: Error: pthread_attr_init() returned %d\n",
+            ret);
+    abort();
+  }
+
+  ret = pthread_create(&new_thread_id, &attr, func, arg);
+  ut_a(ret == 0);
+
+  /* The attribute object is not needed after creation. */
+  pthread_attr_destroy(&attr);
+#endif /* not _WIN32 */
+
+  return((os_thread_t)new_thread_id);
+}
+
+/** Detach and terminate the current thread.
+On Windows the thread is ended with ExitThread(0); elsewhere it is
+detached with pthread_detach() and ended with pthread_exit().
+This function does not return. */
+ATTRIBUTE_NORETURN void os_thread_exit()
+{
+#ifdef UNIV_DEBUG_THREAD_CREATION
+  ib::info() << "Thread exits, id " << os_thread_get_curr_id();
+#endif
+
+#ifdef UNIV_PFS_THREAD
+  pfs_delete_thread();
+#endif
+
+#ifndef _WIN32
+  pthread_detach(pthread_self());
+  pthread_exit(NULL);
+#else
+  ExitThread(0);
+#endif
+}
+
+/*****************************************************************//**
+Puts the calling thread to sleep for the time given in microseconds.
+On Windows the time is converted to whole milliseconds, rounding
+down. The conversion is done before narrowing to DWORD, so that a
+value that does not fit in 32 bits is not truncated first.
+Elsewhere nanosleep() is used when available, otherwise select()
+with a timeout. The return value of the sleep call is not checked. */
+void
+os_thread_sleep(
+/*============*/
+  ulint tm) /*!< in: time in microseconds */
+{
+#ifdef _WIN32
+  /* Divide first, then narrow: (DWORD) tm / 1000 would cast tm
+  before the division. */
+  Sleep(static_cast<DWORD>(tm / 1000));
+#elif defined(HAVE_NANOSLEEP)
+  struct timespec t;
+
+  t.tv_sec = tm / 1000000;
+  t.tv_nsec = (tm % 1000000) * 1000;
+
+  ::nanosleep(&t, NULL);
+#else
+  struct timeval t;
+
+  t.tv_sec = tm / 1000000;
+  t.tv_usec = tm % 1000000;
+
+  select(0, NULL, NULL, NULL, &t);
+#endif /* _WIN32 */
+}