diff options
Diffstat (limited to 'storage/rocksdb/rdb_io_watchdog.cc')
-rw-r--r-- | storage/rocksdb/rdb_io_watchdog.cc | 240 |
1 files changed, 240 insertions, 0 deletions
diff --git a/storage/rocksdb/rdb_io_watchdog.cc b/storage/rocksdb/rdb_io_watchdog.cc new file mode 100644 index 00000000..07834118 --- /dev/null +++ b/storage/rocksdb/rdb_io_watchdog.cc @@ -0,0 +1,240 @@ +/* + Copyright (c) 2017, Facebook, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA */ + +/* This C++ file's header */ +#include "./rdb_io_watchdog.h" + +/* C++ standard header files */ +#include <string> +#include <vector> + +/* Rdb_io_watchdog doesn't work on Windows [yet] */ +#ifdef HAVE_TIMER_DELETE + +namespace myrocks { + +void Rdb_io_watchdog::expire_io_callback(union sigval timer_data) { + DBUG_ASSERT(timer_data.sival_ptr != nullptr); + + // The treatment of any pending signal generated by the deleted timer is + // unspecified. Therefore we still need to handle the rare case where we + // finished the I/O operation right before the timer was deleted and callback + // was in flight. + if (!m_io_in_progress.load()) { + return; + } + + // At this point we know that I/O has been stuck in `write()` for more than + // `m_write_timeout` seconds. We'll log a message and shut down the service. + // NO_LINT_DEBUG + sql_print_error( + "MyRocks has detected a combination of I/O requests which " + "have cumulatively been blocking for more than %u seconds. " + "Shutting the service down.", + m_write_timeout); + + abort(); +} + +void Rdb_io_watchdog::io_check_callback(union sigval timer_data) { + RDB_MUTEX_LOCK_CHECK(m_reset_mutex); + + DBUG_ASSERT(timer_data.sival_ptr != nullptr); + + struct sigevent e; + + e.sigev_notify = SIGEV_THREAD; + e.sigev_notify_function = &Rdb_io_watchdog::expire_io_callback_wrapper; + e.sigev_value.sival_ptr = this; + e.sigev_notify_attributes = nullptr; + + int ret = timer_create(CLOCK_MONOTONIC, &e, &m_io_check_watchdog_timer); + + if (unlikely(ret)) { + // NO_LINT_DEBUG + sql_print_warning("Creating a watchdog I/O timer failed with %d.", errno); + RDB_MUTEX_UNLOCK_CHECK(m_reset_mutex); + return; + } + + struct itimerspec timer_spec; + memset(&timer_spec, 0, sizeof(timer_spec)); + + // One time execution only for the watchdog. No interval. + timer_spec.it_value.tv_sec = m_write_timeout; + + ret = timer_settime(m_io_check_watchdog_timer, 0, &timer_spec, nullptr); + + if (unlikely(ret)) { + // NO_LINT_DEBUG + sql_print_warning("Setting time for a watchdog I/O timer failed with %d.", + errno); + RDB_MUTEX_UNLOCK_CHECK(m_reset_mutex); + return; + } + + m_io_in_progress.store(true); + + // Verify the write access to all directories we care about. + for (const std::string &directory : m_dirs_to_check) { + ret = check_write_access(directory); + + // We'll log a warning and attept to continue to see if the problem happens + // in other cases as well. + if (unlikely(ret != HA_EXIT_SUCCESS)) { + // NO_LINT_DEBUG + sql_print_warning("Unable to verify write access to %s (error code %d).", + directory.c_str(), ret); + } + } + + m_io_in_progress.store(false); + + // Clean up the watchdog timer. + ret = timer_delete(m_io_check_watchdog_timer); + + if (unlikely(ret)) { + // NO_LINT_DEBUG + sql_print_warning("Deleting the watchdog I/O timer failed with %d.", errno); + } + + m_io_check_watchdog_timer = nullptr; + + RDB_MUTEX_UNLOCK_CHECK(m_reset_mutex); +} + +int Rdb_io_watchdog::check_write_access(const std::string &dirname) const { + DBUG_ASSERT(!dirname.empty()); + DBUG_ASSERT(m_buf != nullptr); + + const std::string fname = dirname + FN_DIRSEP + RDB_IO_DUMMY_FILE_NAME; + + // O_DIRECT is a key flag here to make sure that we'll bypass the kernel's + // buffer cache. + int fd = open(fname.c_str(), O_WRONLY | O_DIRECT | O_CREAT | O_SYNC, + S_IRWXU | S_IWUSR); + + if (unlikely(fd == -1)) { + return fd; + } + + int ret = write(fd, m_buf, RDB_IO_WRITE_BUFFER_SIZE); + + if (unlikely(ret != RDB_IO_WRITE_BUFFER_SIZE)) { + return ret; + } + + ret = close(fd); + + if (unlikely(ret)) { + return ret; + } + + ret = unlink(fname.c_str()); + + if (unlikely(ret)) { + return ret; + } + + return HA_EXIT_SUCCESS; +} + +int Rdb_io_watchdog::reset_timeout(const uint32_t write_timeout) { + // This function will be called either from a thread initializing MyRocks + // engine or handling system variable changes. We need to account for the + // possibility of I/O callback executing at the same time. If that happens + // then we'll wait for it to finish. + RDB_MUTEX_LOCK_CHECK(m_reset_mutex); + + struct sigevent e; + + // In all the cases all the active timers needs to be stopped. + int ret = stop_timers(); + + if (unlikely(ret)) { + // NO_LINT_DEBUG + sql_print_warning("Stopping I/O timers failed with %d.", errno); + RDB_MUTEX_UNLOCK_CHECK(m_reset_mutex); + return ret; + } + + m_write_timeout = write_timeout; + m_io_in_progress.store(false); + + // Zero means that the I/O timer will be disabled. Therefore there's nothing + // for us to do here. + if (!write_timeout) { + RDB_MUTEX_UNLOCK_CHECK(m_reset_mutex); + return HA_EXIT_SUCCESS; + } + + free(m_buf); + + ret = posix_memalign(reinterpret_cast<void **>(&m_buf), + RDB_IO_WRITE_BUFFER_SIZE, RDB_IO_WRITE_BUFFER_SIZE); + + if (unlikely(ret)) { + m_buf = nullptr; + RDB_MUTEX_UNLOCK_CHECK(m_reset_mutex); + // NB! The value of errno is not set. + return ret; + } + + DBUG_ASSERT(m_buf != nullptr); + memset(m_buf, 0, RDB_IO_WRITE_BUFFER_SIZE); + + // Common case gets handled here - we'll create a timer with a specific + // interval to check a set of directories for write access. + DBUG_ASSERT(m_dirs_to_check.size() > 0); + + e.sigev_notify = SIGEV_THREAD; + e.sigev_notify_function = &Rdb_io_watchdog::io_check_callback_wrapper; + e.sigev_value.sival_ptr = this; + e.sigev_notify_attributes = nullptr; + + ret = timer_create(CLOCK_MONOTONIC, &e, &m_io_check_timer); + + if (unlikely(ret)) { + // NO_LINT_DEBUG + sql_print_warning("Creating a I/O timer failed with %d.", errno); + RDB_MUTEX_UNLOCK_CHECK(m_reset_mutex); + return ret; + } + + struct itimerspec timer_spec; + memset(&timer_spec, 0, sizeof(timer_spec)); + + // I/O timer will need to execute on a certain interval. + timer_spec.it_value.tv_sec = m_write_timeout; + timer_spec.it_interval.tv_sec = m_write_timeout; + + ret = timer_settime(m_io_check_timer, 0, &timer_spec, nullptr); + + if (unlikely(ret)) { + // NO_LINT_DEBUG + sql_print_warning("Setting time for a watchdog I/O timer failed with %d.", + errno); + } + + RDB_MUTEX_UNLOCK_CHECK(m_reset_mutex); + + return HA_EXIT_SUCCESS; +} + +} // namespace myrocks + +#endif + |