Diffstat (limited to 'storage/innobase/log/log0log.cc')
-rw-r--r-- | storage/innobase/log/log0log.cc | 1358
1 file changed, 1358 insertions, 0 deletions
diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc
new file mode 100644
index 00000000..91999c81
--- /dev/null
+++ b/storage/innobase/log/log0log.cc
@@ -0,0 +1,1358 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2014, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file log/log0log.cc
+Database log
+
+Created 12/9/1995 Heikki Tuuri
+*******************************************************/
+
+#include "univ.i"
+#include <debug_sync.h>
+#include <my_service_manager.h>
+
+#include "log0log.h"
+#include "log0crypt.h"
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "lock0lock.h"
+#include "log0recv.h"
+#include "fil0fil.h"
+#include "dict0stats_bg.h"
+#include "btr0defragment.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "trx0sys.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "srv0mon.h"
+#include "buf0dump.h"
+#include "log0sync.h"
+#include "log.h"
+
+/*
+General philosophy of InnoDB redo-logs:
+
+Every change to the contents of a data page must be done
+through mtr_t, and mtr_t::commit() will write log records
+to the InnoDB redo log. */
+
+alignas(CPU_LEVEL1_DCACHE_LINESIZE)
+static group_commit_lock flush_lock;
+alignas(CPU_LEVEL1_DCACHE_LINESIZE)
+static group_commit_lock write_lock;
+
+/** Redo log system */
+log_t log_sys;
+
+/* Margins for free space in the log buffer after a log entry is catenated */
+#define LOG_BUF_FLUSH_RATIO 2
+#define LOG_BUF_FLUSH_MARGIN ((4 * 4096) /* cf. 
log_t::append_prepare() */ \ + + (4U << srv_page_size_shift)) + +void log_t::set_capacity() +{ +#ifndef SUX_LOCK_GENERIC + ut_ad(log_sys.latch.is_write_locked()); +#endif + /* Margin for the free space in the smallest log, before a new query + step which modifies the database, is started */ + + lsn_t smallest_capacity = srv_log_file_size - log_t::START_OFFSET; + /* Add extra safety */ + smallest_capacity -= smallest_capacity / 10; + + lsn_t margin = smallest_capacity - (48 << srv_page_size_shift); + margin -= margin / 10; /* Add still some extra safety */ + + log_sys.log_capacity = smallest_capacity; + + log_sys.max_modified_age_async = margin - margin / 8; + log_sys.max_checkpoint_age = margin; +} + +#ifdef HAVE_PMEM +void log_t::create_low() +#else +bool log_t::create() +#endif +{ + ut_ad(this == &log_sys); + ut_ad(!is_initialised()); + + /* LSN 0 and 1 are reserved; @see buf_page_t::oldest_modification_ */ + lsn.store(FIRST_LSN, std::memory_order_relaxed); + flushed_to_disk_lsn.store(FIRST_LSN, std::memory_order_relaxed); + write_lsn= FIRST_LSN; + +#ifndef HAVE_PMEM + buf= static_cast<byte*>(ut_malloc_dontdump(buf_size, PSI_INSTRUMENT_ME)); + if (!buf) + { + alloc_fail: + sql_print_error("InnoDB: Cannot allocate memory;" + " too large innodb_log_buffer_size?"); + return false; + } + flush_buf= static_cast<byte*>(ut_malloc_dontdump(buf_size, + PSI_INSTRUMENT_ME)); + if (!flush_buf) + { + ut_free_dodump(buf, buf_size); + buf= nullptr; + goto alloc_fail; + } + + TRASH_ALLOC(buf, buf_size); + TRASH_ALLOC(flush_buf, buf_size); + checkpoint_buf= static_cast<byte*>(aligned_malloc(4096, 4096)); + memset_aligned<4096>(checkpoint_buf, 0, 4096); +#else + ut_ad(!checkpoint_buf); + ut_ad(!buf); + ut_ad(!flush_buf); +#endif + + latch.SRW_LOCK_INIT(log_latch_key); + init_lsn_lock(); + + max_buf_free= buf_size / LOG_BUF_FLUSH_RATIO - LOG_BUF_FLUSH_MARGIN; + set_check_flush_or_checkpoint(); + + last_checkpoint_lsn= FIRST_LSN; + log_capacity= 0; + max_modified_age_async= 0; + max_checkpoint_age= 0; + next_checkpoint_lsn= 0; + checkpoint_pending= false; + + buf_free= 0; + + ut_ad(is_initialised()); +#ifndef HAVE_PMEM + return true; +#endif +} + +dberr_t log_file_t::close() noexcept +{ + ut_a(is_opened()); + + if (!os_file_close_func(m_file)) + return DB_ERROR; + + m_file= OS_FILE_CLOSED; + return DB_SUCCESS; +} + +__attribute__((warn_unused_result)) +dberr_t log_file_t::read(os_offset_t offset, span<byte> buf) noexcept +{ + ut_ad(is_opened()); + return os_file_read(IORequestRead, m_file, buf.data(), offset, buf.size(), + nullptr); +} + +void log_file_t::write(os_offset_t offset, span<const byte> buf) noexcept +{ + ut_ad(is_opened()); + if (dberr_t err= os_file_write_func(IORequestWrite, "ib_logfile0", m_file, + buf.data(), offset, buf.size())) + ib::fatal() << "write(\"ib_logfile0\") returned " << err; +} + +#ifdef HAVE_PMEM +# include <libpmem.h> + +/** Attempt to memory map a file. +@param file log file handle +@param size file size +@return pointer to memory mapping +@retval MAP_FAILED if the memory cannot be mapped */ +static void *log_mmap(os_file_t file, os_offset_t size) +{ + void *ptr= + my_mmap(0, size_t(size), + srv_read_only_mode ? 
PROT_READ : PROT_READ | PROT_WRITE, + MAP_SHARED_VALIDATE | MAP_SYNC, file, 0); +#ifdef __linux__ + if (ptr == MAP_FAILED) + { + struct stat st; + if (!fstat(file, &st)) + { + MSAN_STAT_WORKAROUND(&st); + const auto st_dev= st.st_dev; + if (!stat("/dev/shm", &st)) + { + MSAN_STAT_WORKAROUND(&st); + if (st.st_dev == st_dev) + ptr= my_mmap(0, size_t(size), + srv_read_only_mode ? PROT_READ : PROT_READ | PROT_WRITE, + MAP_SHARED, file, 0); + } + } + } +#endif /* __linux__ */ + return ptr; +} +#endif + +#ifdef HAVE_PMEM +bool log_t::attach(log_file_t file, os_offset_t size) +#else +void log_t::attach_low(log_file_t file, os_offset_t size) +#endif +{ + log= file; + ut_ad(!size || size >= START_OFFSET + SIZE_OF_FILE_CHECKPOINT); + file_size= size; + +#ifdef HAVE_PMEM + ut_ad(!buf); + ut_ad(!flush_buf); + if (size && !(size_t(size) & 4095) && srv_operation != SRV_OPERATION_BACKUP) + { + void *ptr= log_mmap(log.m_file, size); + if (ptr != MAP_FAILED) + { + log.close(); + mprotect(ptr, size_t(size), PROT_READ); + buf= static_cast<byte*>(ptr); +# if defined __linux__ || defined _WIN32 + set_block_size(CPU_LEVEL1_DCACHE_LINESIZE); +# endif + log_maybe_unbuffered= true; + log_buffered= false; + return true; + } + } + buf= static_cast<byte*>(ut_malloc_dontdump(buf_size, PSI_INSTRUMENT_ME)); + if (!buf) + { + alloc_fail: + max_buf_free= 0; + sql_print_error("InnoDB: Cannot allocate memory;" + " too large innodb_log_buffer_size?"); + return false; + } + flush_buf= static_cast<byte*>(ut_malloc_dontdump(buf_size, + PSI_INSTRUMENT_ME)); + if (!flush_buf) + { + ut_free_dodump(buf, buf_size); + buf= nullptr; + goto alloc_fail; + } + + TRASH_ALLOC(buf, buf_size); + TRASH_ALLOC(flush_buf, buf_size); +#endif + +#if defined __linux__ || defined _WIN32 + sql_print_information("InnoDB: %s (block size=%u bytes)", + log_buffered + ? "Buffered log writes" + : "File system buffers for log disabled", + block_size); +#endif + +#ifdef HAVE_PMEM + checkpoint_buf= static_cast<byte*>(aligned_malloc(block_size, block_size)); + memset_aligned<64>(checkpoint_buf, 0, block_size); + return true; +#endif +} + +/** Write a log file header. 
+@param buf log header buffer +@param lsn log sequence number corresponding to log_sys.START_OFFSET +@param encrypted whether the log is encrypted */ +void log_t::header_write(byte *buf, lsn_t lsn, bool encrypted) +{ + mach_write_to_4(my_assume_aligned<4>(buf) + LOG_HEADER_FORMAT, + log_sys.FORMAT_10_8); + mach_write_to_8(my_assume_aligned<8>(buf + LOG_HEADER_START_LSN), lsn); + +#if defined __GNUC__ && __GNUC__ > 7 +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wstringop-truncation" +#endif + strncpy(reinterpret_cast<char*>(buf) + LOG_HEADER_CREATOR, + "MariaDB " PACKAGE_VERSION, + LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR); +#if defined __GNUC__ && __GNUC__ > 7 +# pragma GCC diagnostic pop +#endif + + if (encrypted) + log_crypt_write_header(buf + LOG_HEADER_CREATOR_END); + mach_write_to_4(my_assume_aligned<4>(508 + buf), my_crc32c(0, buf, 508)); +} + +void log_t::create(lsn_t lsn) noexcept +{ +#ifndef SUX_LOCK_GENERIC + ut_ad(latch.is_write_locked()); +#endif + ut_ad(!recv_no_log_write); + ut_ad(is_latest()); + ut_ad(this == &log_sys); + + this->lsn.store(lsn, std::memory_order_relaxed); + this->flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed); + first_lsn= lsn; + write_lsn= lsn; + + last_checkpoint_lsn= 0; + +#ifdef HAVE_PMEM + if (is_pmem()) + { + mprotect(buf, size_t(file_size), PROT_READ | PROT_WRITE); + memset_aligned<4096>(buf, 0, 4096); + buf_free= START_OFFSET; + } + else +#endif + { + buf_free= 0; + memset_aligned<4096>(flush_buf, 0, buf_size); + memset_aligned<4096>(buf, 0, buf_size); + } + + log_sys.header_write(buf, lsn, is_encrypted()); + DBUG_PRINT("ib_log", ("write header " LSN_PF, lsn)); + +#ifdef HAVE_PMEM + if (is_pmem()) + pmem_persist(buf, 512); + else +#endif + { + log.write(0, {buf, 4096}); + memset_aligned<512>(buf, 0, 512); + } +} + +void log_t::close_file() +{ +#ifdef HAVE_PMEM + if (is_pmem()) + { + ut_ad(!is_opened()); + ut_ad(!checkpoint_buf); + if (buf) + { + my_munmap(buf, file_size); + buf= nullptr; + } + return; + } + + ut_free_dodump(buf, buf_size); + buf= nullptr; + ut_free_dodump(flush_buf, buf_size); + flush_buf= nullptr; + aligned_free(checkpoint_buf); + checkpoint_buf= nullptr; +#endif + if (is_opened()) + if (const dberr_t err= log.close()) + ib::fatal() << "closing ib_logfile0 failed: " << err; +} + +/** Acquire all latches that protect the log. */ +static void log_resize_acquire() +{ + if (!log_sys.is_pmem()) + { + while (flush_lock.acquire(log_sys.get_lsn() + 1, nullptr) != + group_commit_lock::ACQUIRED); + while (write_lock.acquire(log_sys.get_lsn() + 1, nullptr) != + group_commit_lock::ACQUIRED); + } + + log_sys.latch.wr_lock(SRW_LOCK_CALL); +} + +/** Release the latches that protect the log. 
*/ +void log_resize_release() +{ + log_sys.latch.wr_unlock(); + + if (!log_sys.is_pmem()) + { + lsn_t lsn1= write_lock.release(write_lock.value()); + lsn_t lsn2= flush_lock.release(flush_lock.value()); + if (lsn1 || lsn2) + log_write_up_to(std::max(lsn1, lsn2), true, nullptr); + } +} + +#if defined __linux__ || defined _WIN32 +/** Try to enable or disable file system caching (update log_buffered) */ +void log_t::set_buffered(bool buffered) +{ + if (!log_maybe_unbuffered || is_pmem() || high_level_read_only) + return; + log_resize_acquire(); + if (!resize_in_progress() && is_opened() && bool(log_buffered) != buffered) + { + os_file_close_func(log.m_file); + log.m_file= OS_FILE_CLOSED; + std::string path{get_log_file_path()}; + log_buffered= buffered; + bool success; + log.m_file= os_file_create_func(path.c_str(), + OS_FILE_OPEN, OS_FILE_NORMAL, OS_LOG_FILE, + false, &success); + ut_a(log.m_file != OS_FILE_CLOSED); + sql_print_information("InnoDB: %s (block size=%u bytes)", + log_buffered + ? "Buffered log writes" + : "File system buffers for log disabled", + block_size); + } + log_resize_release(); +} +#endif + +/** Start resizing the log and release the exclusive latch. +@param size requested new file_size +@return whether the resizing was started successfully */ +log_t::resize_start_status log_t::resize_start(os_offset_t size) noexcept +{ + ut_ad(size >= 4U << 20); + ut_ad(!(size & 4095)); + ut_ad(!srv_read_only_mode); + + log_resize_acquire(); + + resize_start_status status= RESIZE_NO_CHANGE; + lsn_t start_lsn{0}; + + if (resize_in_progress()) + status= RESIZE_IN_PROGRESS; + else if (size != file_size) + { + ut_ad(!resize_in_progress()); + ut_ad(!resize_log.is_opened()); + ut_ad(!resize_buf); + ut_ad(!resize_flush_buf); + std::string path{get_log_file_path("ib_logfile101")}; + bool success; + resize_lsn.store(1, std::memory_order_relaxed); + resize_target= 0; + resize_log.m_file= + os_file_create_func(path.c_str(), + OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT, + OS_FILE_NORMAL, OS_LOG_FILE, false, &success); + if (success) + { + log_resize_release(); + + void *ptr= nullptr, *ptr2= nullptr; + success= os_file_set_size(path.c_str(), resize_log.m_file, size); + if (!success); +#ifdef HAVE_PMEM + else if (is_pmem()) + { + ptr= log_mmap(resize_log.m_file, size); + if (ptr == MAP_FAILED) + goto alloc_fail; + } +#endif + else + { + ptr= ut_malloc_dontdump(buf_size, PSI_INSTRUMENT_ME); + if (ptr) + { + TRASH_ALLOC(ptr, buf_size); + ptr2= ut_malloc_dontdump(buf_size, PSI_INSTRUMENT_ME); + if (ptr2) + TRASH_ALLOC(ptr2, buf_size); + else + { + ut_free_dodump(ptr, buf_size); + ptr= nullptr; + goto alloc_fail; + } + } + else + alloc_fail: + success= false; + } + + log_resize_acquire(); + + if (!success) + { + resize_log.close(); + IF_WIN(DeleteFile(path.c_str()), unlink(path.c_str())); + } + else + { + resize_target= size; + resize_buf= static_cast<byte*>(ptr); + resize_flush_buf= static_cast<byte*>(ptr2); + if (is_pmem()) + { + resize_log.close(); + start_lsn= get_lsn(); + } + else + { + memcpy_aligned<16>(resize_buf, buf, (buf_free + 15) & ~15); + start_lsn= first_lsn + + (~lsn_t{get_block_size() - 1} & (write_lsn - first_lsn)); + } + } + resize_lsn.store(start_lsn, std::memory_order_relaxed); + status= success ? 
RESIZE_STARTED : RESIZE_FAILED; + } + } + + log_resize_release(); + + if (start_lsn) + { + mysql_mutex_lock(&buf_pool.flush_list_mutex); + lsn_t target_lsn= buf_pool.get_oldest_modification(0); + if (start_lsn < target_lsn) + start_lsn= target_lsn + 1; + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + buf_flush_ahead(start_lsn, false); + } + + return status; +} + +/** Abort log resizing. */ +void log_t::resize_abort() noexcept +{ + log_resize_acquire(); + + if (resize_in_progress() > 1) + { + if (!is_pmem()) + { + resize_log.close(); + ut_free_dodump(resize_buf, buf_size); + ut_free_dodump(resize_flush_buf, buf_size); + resize_flush_buf= nullptr; + } +#ifdef HAVE_PMEM + else + { + ut_ad(!resize_log.is_opened()); + ut_ad(!resize_flush_buf); + if (resize_buf) + my_munmap(resize_buf, resize_target); + } +#endif + resize_buf= nullptr; + resize_target= 0; + resize_lsn.store(0, std::memory_order_relaxed); + } + + log_resize_release(); +} + +/** Write an aligned buffer to ib_logfile0. +@param buf buffer to be written +@param len length of data to be written +@param offset log file offset */ +static void log_write_buf(const byte *buf, size_t len, lsn_t offset) +{ + ut_ad(write_lock.is_owner()); + ut_ad(!recv_no_log_write); + ut_d(const size_t block_size_1= log_sys.get_block_size() - 1); + ut_ad(!(offset & block_size_1)); + ut_ad(!(len & block_size_1)); + ut_ad(!(size_t(buf) & block_size_1)); + ut_ad(len); + + if (UNIV_LIKELY(offset + len <= log_sys.file_size)) + { +write: + log_sys.log.write(offset, {buf, len}); + return; + } + + const size_t write_len= size_t(log_sys.file_size - offset); + log_sys.log.write(offset, {buf, write_len}); + len-= write_len; + buf+= write_len; + ut_ad(log_sys.START_OFFSET + len < offset); + offset= log_sys.START_OFFSET; + goto write; +} + +/** Invoke commit_checkpoint_notify_ha() to notify that outstanding +log writes have been completed. */ +void log_flush_notify(lsn_t flush_lsn); + +#if 0 // Currently we overwrite the last log block until it is complete. +/** CRC-32C of pad messages using between 1 and 15 bytes of NUL bytes +in the payload */ +static const unsigned char pad_crc[15][4]= { + {0xA6,0x59,0xC1,0xDB}, {0xF2,0xAF,0x80,0x73}, {0xED,0x02,0xF1,0x90}, + {0x68,0x4E,0xA3,0xF3}, {0x5D,0x1B,0xEA,0x6A}, {0xE0,0x01,0x86,0xB9}, + {0xD1,0x06,0x86,0xF5}, {0xEB,0x20,0x12,0x33}, {0xBA,0x73,0xB2,0xA3}, + {0x5F,0xA2,0x08,0x03}, {0x70,0x03,0xD6,0x9D}, {0xED,0xB3,0x49,0x78}, + {0xFD,0xD6,0xB9,0x9C}, {0x25,0xF8,0xB1,0x2C}, {0xCD,0xAA,0xE7,0x10} +}; + +/** Pad the log with some dummy bytes +@param lsn desired log sequence number +@param pad number of bytes to append to the log +@param begin buffer to write 'pad' bytes to +@param extra buffer for additional pad bytes (up to 15 bytes) +@return additional bytes used in extra[] */ +ATTRIBUTE_NOINLINE +static size_t log_pad(lsn_t lsn, size_t pad, byte *begin, byte *extra) +{ + ut_ad(!(size_t(begin + pad) & (log_sys.get_block_size() - 1))); + byte *b= begin; + const byte seq{log_sys.get_sequence_bit(lsn)}; + /* The caller should never request padding such that the + file would wrap around to the beginning. That is, the sequence + bit must be the same for all records. */ + ut_ad(seq == log_sys.get_sequence_bit(lsn + pad)); + + if (log_sys.is_encrypted()) + { + /* The lengths of our pad messages vary between 15 and 29 bytes + (FILE_CHECKPOINT byte, 1 to 15 NUL bytes, sequence byte, + 4 bytes checksum, 8 NUL bytes nonce). 
*/ + if (pad < 15) + { + extra[0]= FILE_CHECKPOINT | 1; + extra[1]= 0; + extra[2]= seq; + memcpy(extra + 3, pad_crc[0], 4); + memset(extra + 7, 0, 8); + memcpy(b, extra, pad); + memmove(extra, extra + pad, 15 - pad); + return 15 - pad; + } + + /* Pad first with 29-byte messages until the remaining size is + less than 29+15 bytes, and then write 1 or 2 shorter messages. */ + const byte *const end= begin + pad; + for (; b + (29 + 15) < end; b+= 29) + { + b[0]= FILE_CHECKPOINT | 15; + memset(b + 1, 0, 15); + b[16]= seq; + memcpy(b + 17, pad_crc[14], 4); + memset(b + 21, 0, 8); + } + if (b + 29 < end) + { + b[0]= FILE_CHECKPOINT | 1; + b[1]= 0; + b[2]= seq; + memcpy(b + 3, pad_crc[0], 4); + memset(b + 7, 0, 8); + b+= 15; + } + const size_t last_pad(end - b); + ut_ad(last_pad >= 15); + ut_ad(last_pad <= 29); + b[0]= FILE_CHECKPOINT | byte(last_pad - 14); + memset(b + 1, 0, last_pad - 14); + b[last_pad - 13]= seq; + memcpy(b + last_pad - 12, pad_crc[last_pad - 15], 4); + memset(b + last_pad - 8, 0, 8); + } + else + { + /* The lengths of our pad messages vary between 7 and 21 bytes + (FILE_CHECKPOINT byte, 1 to 15 NUL bytes, sequence byte, + 4 bytes checksum). */ + if (pad < 7) + { + extra[0]= FILE_CHECKPOINT | 1; + extra[1]= 0; + extra[2]= seq; + memcpy(extra + 3, pad_crc[0], 4); + memcpy(b, extra, pad); + memmove(extra, extra + pad, 7 - pad); + return 7 - pad; + } + + /* Pad first with 21-byte messages until the remaining size is + less than 21+7 bytes, and then write 1 or 2 shorter messages. */ + const byte *const end= begin + pad; + for (; b + (21 + 7) < end; b+= 21) + { + b[0]= FILE_CHECKPOINT | 15; + memset(b + 1, 0, 15); + b[16]= seq; + memcpy(b + 17, pad_crc[14], 4); + } + if (b + 21 < end) + { + b[0]= FILE_CHECKPOINT | 1; + b[1]= 0; + b[2]= seq; + memcpy(b + 3, pad_crc[0], 4); + b+= 7; + } + const size_t last_pad(end - b); + ut_ad(last_pad >= 7); + ut_ad(last_pad <= 21); + b[0]= FILE_CHECKPOINT | byte(last_pad - 6); + memset(b + 1, 0, last_pad - 6); + b[last_pad - 5]= seq; + memcpy(b + last_pad - 4, pad_crc[last_pad - 7], 4); + } + + return 0; +} +#endif + +#ifdef HAVE_PMEM +/** Persist the log. +@param lsn desired new value of flushed_to_disk_lsn */ +inline void log_t::persist(lsn_t lsn) noexcept +{ + ut_ad(is_pmem()); + ut_ad(!write_lock.is_owner()); + ut_ad(!flush_lock.is_owner()); + + lsn_t old= flushed_to_disk_lsn.load(std::memory_order_relaxed); + + if (old >= lsn) + return; + + const lsn_t resizing{resize_in_progress()}; + if (UNIV_UNLIKELY(resizing)) + latch.rd_lock(SRW_LOCK_CALL); + const size_t start(calc_lsn_offset(old)); + const size_t end(calc_lsn_offset(lsn)); + + if (UNIV_UNLIKELY(end < start)) + { + pmem_persist(log_sys.buf + start, log_sys.file_size - start); + pmem_persist(log_sys.buf + log_sys.START_OFFSET, + end - log_sys.START_OFFSET); + } + else + pmem_persist(log_sys.buf + start, end - start); + + old= flushed_to_disk_lsn.load(std::memory_order_relaxed); + + if (old < lsn) + { + while (!flushed_to_disk_lsn.compare_exchange_weak + (old, lsn, std::memory_order_release, std::memory_order_relaxed)) + if (old >= lsn) + break; + + log_flush_notify(lsn); + DBUG_EXECUTE_IF("crash_after_log_write_upto", DBUG_SUICIDE();); + } + + if (UNIV_UNLIKELY(resizing)) + latch.rd_unlock(); +} +#endif + +/** Write resize_buf to resize_log. 
+@param length the used length of resize_buf */ +ATTRIBUTE_COLD void log_t::resize_write_buf(size_t length) noexcept +{ + const size_t block_size_1= get_block_size() - 1; + ut_ad(!(resize_target & block_size_1)); + ut_ad(!(length & block_size_1)); + ut_ad(length > block_size_1); + ut_ad(length <= resize_target); + const lsn_t resizing{resize_in_progress()}; + ut_ad(resizing <= write_lsn); + lsn_t offset= START_OFFSET + + ((write_lsn - resizing) & ~lsn_t{block_size_1}) % + (resize_target - START_OFFSET); + + if (UNIV_UNLIKELY(offset + length > resize_target)) + { + offset= START_OFFSET; + resize_lsn.store(first_lsn + + (~lsn_t{block_size_1} & (write_lsn - first_lsn)), + std::memory_order_relaxed); + } + + ut_a(os_file_write_func(IORequestWrite, "ib_logfile101", resize_log.m_file, + resize_flush_buf, offset, length) == DB_SUCCESS); +} + +/** Write buf to ib_logfile0. +@tparam release_latch whether to invoke latch.wr_unlock() +@return the current log sequence number */ +template<bool release_latch> inline lsn_t log_t::write_buf() noexcept +{ +#ifndef SUX_LOCK_GENERIC + ut_ad(latch.is_write_locked()); +#endif + ut_ad(!srv_read_only_mode); + ut_ad(!is_pmem()); + + const lsn_t lsn{get_lsn(std::memory_order_relaxed)}; + + if (write_lsn >= lsn) + { + if (release_latch) + latch.wr_unlock(); + ut_ad(write_lsn == lsn); + } + else + { + ut_ad(!recv_no_log_write); + write_lock.set_pending(lsn); + ut_ad(write_lsn >= get_flushed_lsn()); + const size_t block_size_1{get_block_size() - 1}; + lsn_t offset{calc_lsn_offset(write_lsn) & ~lsn_t{block_size_1}}; + + DBUG_PRINT("ib_log", ("write " LSN_PF " to " LSN_PF " at " LSN_PF, + write_lsn, lsn, offset)); + const byte *write_buf{buf}; + size_t length{buf_free}; + ut_ad(length >= (calc_lsn_offset(write_lsn) & block_size_1)); + const size_t new_buf_free{length & block_size_1}; + buf_free= new_buf_free; + ut_ad(new_buf_free == ((lsn - first_lsn) & block_size_1)); + + if (new_buf_free) + { +#if 0 /* TODO: Pad the last log block with dummy records. */ + buf_free= log_pad(lsn, get_block_size() - new_buf_free, + buf + new_buf_free, flush_buf); + ... /* TODO: Update the LSN and adjust other code. */ +#else + /* The rest of the block will be written as garbage. + (We want to avoid memset() while holding mutex.) + This block will be overwritten later, once records beyond + the current LSN are generated. 
*/
+# ifdef HAVE_valgrind
+ MEM_MAKE_DEFINED(buf + length, get_block_size() - new_buf_free);
+ if (UNIV_LIKELY_NULL(resize_flush_buf))
+ MEM_MAKE_DEFINED(resize_buf + length, get_block_size() - new_buf_free);
+# endif
+ buf[length]= 0; /* allow recovery to catch EOF faster */
+ length&= ~block_size_1;
+ memcpy_aligned<16>(flush_buf, buf + length, (new_buf_free + 15) & ~15);
+ if (UNIV_LIKELY_NULL(resize_flush_buf))
+ memcpy_aligned<16>(resize_flush_buf, resize_buf + length,
+ (new_buf_free + 15) & ~15);
+ length+= get_block_size();
+#endif
+ }
+
+ std::swap(buf, flush_buf);
+ std::swap(resize_buf, resize_flush_buf);
+ write_to_log++;
+ if (release_latch)
+ latch.wr_unlock();
+
+ if (UNIV_UNLIKELY(srv_shutdown_state > SRV_SHUTDOWN_INITIATED))
+ {
+ service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
+ "InnoDB log write: " LSN_PF, write_lsn);
+ }
+
+ /* Do the write to the log file */
+ log_write_buf(write_buf, length, offset);
+ if (UNIV_LIKELY_NULL(resize_buf))
+ resize_write_buf(length);
+ write_lsn= lsn;
+ }
+
+ return lsn;
+}
+
+bool log_t::flush(lsn_t lsn) noexcept
+{
+ ut_ad(lsn >= get_flushed_lsn());
+ flush_lock.set_pending(lsn);
+ const bool success{srv_file_flush_method == SRV_O_DSYNC || log.flush()};
+ if (UNIV_LIKELY(success))
+ {
+ flushed_to_disk_lsn.store(lsn, std::memory_order_release);
+ log_flush_notify(lsn);
+ }
+ return success;
+}
+
+/** Ensure that previous log writes are durable.
+@param lsn previously written LSN
+@return new durable lsn target
+@retval 0 if there are no pending callbacks on flush_lock
+ or there is another group commit lead.
+*/
+static lsn_t log_flush(lsn_t lsn)
+{
+ ut_ad(!log_sys.is_pmem());
+ ut_a(log_sys.flush(lsn));
+ DBUG_EXECUTE_IF("crash_after_log_write_upto", DBUG_SUICIDE(););
+ return flush_lock.release(lsn);
+}
+
+static const completion_callback dummy_callback{[](void *) {},nullptr};
+
+/** Ensure that the log has been written to the log file up to a given
+log entry (such as that of a transaction commit). Start a new write, or
+wait and check if an already running write is covering the request.
+@param lsn log sequence number that should be included in the file write
+@param durable whether the write needs to be durable
+@param callback log write completion callback */
+void log_write_up_to(lsn_t lsn, bool durable,
+ const completion_callback *callback)
+{
+ ut_ad(!srv_read_only_mode);
+ ut_ad(lsn != LSN_MAX);
+
+ if (UNIV_UNLIKELY(recv_no_ibuf_operations))
+ {
+ /* A non-final batch of recovery is active; no writes to the log
+ are allowed yet. */
+ ut_a(!callback);
+ return;
+ }
+
+ ut_ad(lsn <= log_sys.get_lsn());
+
+#ifdef HAVE_PMEM
+ if (log_sys.is_pmem())
+ {
+ ut_ad(!callback);
+ if (durable)
+ log_sys.persist(lsn);
+ return;
+ }
+#endif
+
+repeat:
+ if (durable)
+ {
+ if (flush_lock.acquire(lsn, callback) != group_commit_lock::ACQUIRED)
+ return;
+ flush_lock.set_pending(log_sys.get_lsn());
+ }
+
+ lsn_t pending_write_lsn= 0, pending_flush_lsn= 0;
+
+ if (write_lock.acquire(lsn, durable ? nullptr : callback) ==
+ group_commit_lock::ACQUIRED)
+ {
+ log_sys.latch.wr_lock(SRW_LOCK_CALL);
+ pending_write_lsn= write_lock.release(log_sys.write_buf<true>());
+ }
+
+ if (durable)
+ {
+ pending_flush_lsn= log_flush(write_lock.value());
+ }
+
+ if (pending_write_lsn || pending_flush_lsn)
+ {
+ /* There is no new group commit lead; some async waiters could stall. */
+ callback= &dummy_callback;
+ lsn= std::max(pending_write_lsn, pending_flush_lsn);
+ goto repeat;
+ }
+}
+
+/** Write to the log file up to the last log entry. 
+@param durable whether to wait for a durable write to complete */ +void log_buffer_flush_to_disk(bool durable) +{ + ut_ad(!srv_read_only_mode); + log_write_up_to(log_sys.get_lsn(std::memory_order_acquire), durable); +} + +/** Prepare to invoke log_write_and_flush(), before acquiring log_sys.latch. */ +ATTRIBUTE_COLD void log_write_and_flush_prepare() +{ + if (log_sys.is_pmem()) + return; + + while (flush_lock.acquire(log_sys.get_lsn() + 1, nullptr) != + group_commit_lock::ACQUIRED); + while (write_lock.acquire(log_sys.get_lsn() + 1, nullptr) != + group_commit_lock::ACQUIRED); +} + +/** Durably write the log up to log_sys.get_lsn(). */ +ATTRIBUTE_COLD void log_write_and_flush() +{ + ut_ad(!srv_read_only_mode); + if (!log_sys.is_pmem()) + { + const lsn_t lsn{log_sys.write_buf<false>()}; + write_lock.release(lsn); + log_flush(lsn); + } +#ifdef HAVE_PMEM + else + log_sys.persist(log_sys.get_lsn()); +#endif +} + +/******************************************************************** + +Tries to establish a big enough margin of free space in the log buffer, such +that a new log entry can be catenated without an immediate need for a flush. */ +ATTRIBUTE_COLD static void log_flush_margin() +{ + if (log_sys.buf_free > log_sys.max_buf_free) + log_buffer_flush_to_disk(false); +} + +/****************************************************************//** +Tries to establish a big enough margin of free space in the log, such +that a new log entry can be catenated without an immediate need for a +checkpoint. NOTE: this function may only be called if the calling thread +owns no synchronization objects! */ +ATTRIBUTE_COLD static void log_checkpoint_margin() +{ + while (log_sys.check_flush_or_checkpoint()) + { + log_sys.latch.rd_lock(SRW_LOCK_CALL); + ut_ad(!recv_no_log_write); + + if (!log_sys.check_flush_or_checkpoint()) + { +func_exit: + log_sys.latch.rd_unlock(); + return; + } + + const lsn_t lsn= log_sys.get_lsn(); + const lsn_t checkpoint= log_sys.last_checkpoint_lsn; + const lsn_t sync_lsn= checkpoint + log_sys.max_checkpoint_age; + + if (lsn <= sync_lsn) + { +#ifndef DBUG_OFF + skip_checkpoint: +#endif + log_sys.set_check_flush_or_checkpoint(false); + goto func_exit; + } + + DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", goto skip_checkpoint;); + log_sys.latch.rd_unlock(); + + /* We must wait to prevent the tail of the log overwriting the head. */ + buf_flush_wait_flushed(std::min(sync_lsn, checkpoint + (1U << 20))); + /* Sleep to avoid a thundering herd */ + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } +} + +/** +Checks that there is enough free space in the log to start a new query step. +Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this +function may only be called if the calling thread owns no synchronization +objects! */ +ATTRIBUTE_COLD void log_check_margins() +{ + do + { + log_flush_margin(); + log_checkpoint_margin(); + ut_ad(!recv_no_log_write); + } + while (log_sys.check_flush_or_checkpoint()); +} + +/** Wait for a log checkpoint if needed. +NOTE that this function may only be called while not holding +any synchronization objects except dict_sys.latch. */ +void log_free_check() +{ + ut_ad(!lock_sys.is_writer()); + if (log_sys.check_flush_or_checkpoint()) + log_check_margins(); +} + +extern void buf_resize_shutdown(); + +/** Make a checkpoint at the latest lsn on shutdown. 
*/
+ATTRIBUTE_COLD void logs_empty_and_mark_files_at_shutdown()
+{
+ lsn_t lsn;
+ ulint count = 0;
+
+ ib::info() << "Starting shutdown...";
+
+ /* Wait until the master thread and all other operations are idle: our
+ algorithm only works if the server is idle at shutdown */
+ bool do_srv_shutdown = false;
+ if (srv_master_timer) {
+ do_srv_shutdown = srv_fast_shutdown < 2;
+ srv_master_timer.reset();
+ }
+
+ /* Wait for the end of the buffer resize task. */
+ buf_resize_shutdown();
+ dict_stats_shutdown();
+ btr_defragment_shutdown();
+
+ srv_shutdown_state = SRV_SHUTDOWN_CLEANUP;
+
+ if (srv_buffer_pool_dump_at_shutdown &&
+ !srv_read_only_mode && srv_fast_shutdown < 2) {
+ buf_dump_start();
+ }
+ srv_monitor_timer.reset();
+
+ if (do_srv_shutdown) {
+ srv_shutdown(srv_fast_shutdown == 0);
+ }
+
+
+loop:
+ ut_ad(lock_sys.is_initialised() || !srv_was_started);
+ ut_ad(log_sys.is_initialised() || !srv_was_started);
+ ut_ad(fil_system.is_initialised() || !srv_was_started);
+
+#define COUNT_INTERVAL 600U
+#define CHECK_INTERVAL 100000U
+ std::this_thread::sleep_for(std::chrono::microseconds(CHECK_INTERVAL));
+
+ count++;
+
+ /* Check that there are no longer any transactions, except for
+ PREPARED ones. We need this wait even for the 'very fast'
+ shutdown, because the InnoDB layer may have committed or
+ prepared transactions and we don't want to lose them. */
+
+ if (ulint total_trx = srv_was_started && !srv_read_only_mode
+ && srv_force_recovery < SRV_FORCE_NO_TRX_UNDO
+ ? trx_sys.any_active_transactions() : 0) {
+
+ if (srv_print_verbose_log && count > COUNT_INTERVAL) {
+ service_manager_extend_timeout(
+ COUNT_INTERVAL * CHECK_INTERVAL/1000000 * 2,
+ "Waiting for %lu active transactions to finish",
+ (ulong) total_trx);
+ ib::info() << "Waiting for " << total_trx << " active"
+ << " transactions to finish";
+
+ count = 0;
+ }
+
+ goto loop;
+ }
+
+ /* We need these threads to stop early in shutdown. */
+ const char* thread_name = srv_fast_shutdown != 2
+ && trx_rollback_is_active
+ ? "rollback of recovered transactions" : nullptr;
+
+ if (thread_name) {
+ ut_ad(!srv_read_only_mode);
+wait_suspend_loop:
+ service_manager_extend_timeout(
+ COUNT_INTERVAL * CHECK_INTERVAL/1000000 * 2,
+ "Waiting for %s to exit", thread_name);
+ if (srv_print_verbose_log && count > COUNT_INTERVAL) {
+ ib::info() << "Waiting for " << thread_name
+ << " to exit";
+ count = 0;
+ }
+ goto loop;
+ }
+
+ /* Check that the background threads are suspended */
+
+ ut_ad(!srv_any_background_activity());
+ if (srv_n_fil_crypt_threads_started) {
+ fil_crypt_threads_signal(true);
+ thread_name = "fil_crypt_thread";
+ goto wait_suspend_loop;
+ }
+
+ if (buf_page_cleaner_is_active) {
+ thread_name = "page cleaner thread";
+ pthread_cond_signal(&buf_pool.do_flush_list);
+ goto wait_suspend_loop;
+ }
+
+ buf_load_dump_end();
+
+ if (!buf_pool.is_initialised()) {
+ ut_ad(!srv_was_started);
+ } else {
+ buf_flush_buffer_pool();
+ }
+
+ if (srv_fast_shutdown == 2 || !srv_was_started) {
+ if (!srv_read_only_mode && srv_was_started) {
+ sql_print_information(
+ "InnoDB: Executing innodb_fast_shutdown=2."
+ " Next startup will execute crash recovery!");
+
+ /* In this fastest shutdown we do not flush the
+ buffer pool:
+
+ it is essentially a 'crash' of the InnoDB server.
+ Make sure that the log is all flushed to disk, so
+ that we can recover all committed transactions in
+ a crash recovery. 
*/
+ log_buffer_flush_to_disk();
+ }
+
+ srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;
+ return;
+ }
+
+ if (!srv_read_only_mode) {
+ service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
+ "ensuring dirty buffer pool pages are written to log");
+ log_make_checkpoint();
+
+ const auto sizeof_cp = log_sys.is_encrypted()
+ ? SIZE_OF_FILE_CHECKPOINT + 8
+ : SIZE_OF_FILE_CHECKPOINT;
+
+ log_sys.latch.rd_lock(SRW_LOCK_CALL);
+
+ lsn = log_sys.get_lsn();
+
+ const bool lsn_changed = lsn != log_sys.last_checkpoint_lsn
+ && lsn != log_sys.last_checkpoint_lsn + sizeof_cp;
+ ut_ad(lsn >= log_sys.last_checkpoint_lsn);
+
+ log_sys.latch.rd_unlock();
+
+ if (lsn_changed) {
+ goto loop;
+ }
+ } else {
+ lsn = recv_sys.lsn;
+ }
+
+ srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;
+
+ /* Make some checks that the server really is quiet */
+ ut_ad(!srv_any_background_activity());
+
+ service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
+ "Free innodb buffer pool");
+ ut_d(buf_pool.assert_all_freed());
+
+ ut_a(lsn == log_sys.get_lsn()
+ || srv_force_recovery == SRV_FORCE_NO_LOG_REDO);
+
+ if (UNIV_UNLIKELY(lsn < recv_sys.lsn)) {
+ sql_print_error("InnoDB: Shutdown LSN=" LSN_PF
+ " is less than start LSN=" LSN_PF,
+ lsn, recv_sys.lsn);
+ }
+
+ srv_shutdown_lsn = lsn;
+
+ /* Make some checks that the server really is quiet */
+ ut_ad(!srv_any_background_activity());
+
+ ut_a(lsn == log_sys.get_lsn()
+ || srv_force_recovery == SRV_FORCE_NO_LOG_REDO);
+}
+
+/******************************************************//**
+Prints info of the log. */
+void
+log_print(
+/*======*/
+ FILE* file) /*!< in: file where to print */
+{
+ log_sys.latch.rd_lock(SRW_LOCK_CALL);
+
+ const lsn_t lsn= log_sys.get_lsn();
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ const lsn_t pages_flushed = buf_pool.get_oldest_modification(lsn);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ fprintf(file,
+ "Log sequence number " LSN_PF "\n"
+ "Log flushed up to " LSN_PF "\n"
+ "Pages flushed up to " LSN_PF "\n"
+ "Last checkpoint at " LSN_PF "\n",
+ lsn,
+ log_sys.get_flushed_lsn(),
+ pages_flushed,
+ lsn_t{log_sys.last_checkpoint_lsn});
+
+ log_sys.latch.rd_unlock();
+}
+
+/** Shut down the redo log subsystem. */
+void log_t::close()
+{
+ ut_ad(this == &log_sys);
+ if (!is_initialised()) return;
+ close_file();
+
+#ifndef HAVE_PMEM
+ ut_free_dodump(buf, buf_size);
+ buf= nullptr;
+ ut_free_dodump(flush_buf, buf_size);
+ flush_buf= nullptr;
+ aligned_free(checkpoint_buf);
+ checkpoint_buf= nullptr;
+#else
+ ut_ad(!checkpoint_buf);
+ ut_ad(!buf);
+ ut_ad(!flush_buf);
+#endif
+
+ latch.destroy();
+ destroy_lsn_lock();
+
+ recv_sys.close();
+
+ max_buf_free= 0;
+}
+
+std::string get_log_file_path(const char *filename)
+{
+ const size_t size= strlen(srv_log_group_home_dir) + /* path separator */ 1 +
+ strlen(filename) + /* longest suffix */ 3;
+ std::string path;
+ path.reserve(size);
+ path.assign(srv_log_group_home_dir);
+
+ switch (path.back()) {
+#ifdef _WIN32
+ case '\\':
+#endif
+ case '/':
+ break;
+ default:
+ path.push_back('/');
+ }
+ path.append(filename);
+
+ return path;
+}
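
The "General philosophy" comment at the top of the file is the central invariant: data pages are never modified outside a mini-transaction (mtr_t), and mtr_t::commit() is what appends the accumulated records to the redo log and advances the LSN. Below is a minimal self-contained model of that write-ahead pattern; MiniTxn and RedoLog are illustrative stand-ins for this sketch, not the actual InnoDB classes.

// Toy model of the mtr_t write path: page changes accumulate in the
// mini-transaction and reach the shared log buffer only at commit().
#include <cstdint>
#include <cstdio>
#include <vector>

struct RedoLog {
  uint64_t lsn = 2;                  // LSN 0 and 1 are reserved (cf. FIRST_LSN)
  std::vector<uint8_t> buf;          // cf. log_sys.buf
  uint64_t append(const std::vector<uint8_t> &rec) {
    buf.insert(buf.end(), rec.begin(), rec.end());
    return lsn += rec.size();        // LSN just past the end of this record
  }
};

struct MiniTxn {                     // cf. mtr_t
  std::vector<uint8_t> rec;          // redo records for the pages we touched
  void write_page_change(uint8_t b) { rec.push_back(b); }
  uint64_t commit(RedoLog &log) { return log.append(rec); } // cf. mtr_t::commit()
};

int main() {
  RedoLog log;
  MiniTxn mtr;
  mtr.write_page_change(0x42);       // every page change goes through the mtr
  std::printf("commit LSN=%llu\n", (unsigned long long)mtr.commit(log));
}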
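
log_t::set_capacity() above stacks two 10% safety deductions and a 48-page allowance before deriving the checkpoint thresholds. The same arithmetic as a standalone computation, for an assumed 1 GiB log file and 16 KiB pages; the 12 KiB header offset is likewise an assumed value for illustration.

// Mirrors the margin arithmetic of log_t::set_capacity() for one
// concrete, assumed configuration.
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t srv_log_file_size = 1ULL << 30;  // assumed: 1 GiB
  const uint64_t START_OFFSET = 12288;            // assumed header size
  const unsigned srv_page_size_shift = 14;        // assumed: 16 KiB pages

  uint64_t smallest_capacity = srv_log_file_size - START_OFFSET;
  smallest_capacity -= smallest_capacity / 10;    // "Add extra safety": -10%

  uint64_t margin = smallest_capacity - (48ULL << srv_page_size_shift);
  margin -= margin / 10;                          // "still some extra safety"

  std::printf("log_capacity           = %llu\n",
              (unsigned long long)smallest_capacity);
  std::printf("max_modified_age_async = %llu\n",
              (unsigned long long)(margin - margin / 8));
  std::printf("max_checkpoint_age     = %llu\n", (unsigned long long)margin);
}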
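
log_write_buf() treats ib_logfile0 as a circular file: a write that would run past file_size is split in two, and the remainder restarts at START_OFFSET so the header and checkpoint area at the front of the file are never overwritten. Just that split, as a sketch; split_circular_write is a hypothetical helper, and the 12 KiB START_OFFSET is an assumed value.

// Wrap-around arithmetic in the spirit of log_write_buf().
#include <cstdint>
#include <cstdio>
#include <utility>

constexpr uint64_t START_OFFSET = 0x3000; // assumed 12 KiB header region

// Returns {bytes written at 'offset', bytes rewritten at START_OFFSET}.
std::pair<uint64_t, uint64_t>
split_circular_write(uint64_t offset, uint64_t len, uint64_t file_size) {
  if (offset + len <= file_size)
    return {len, 0};                  // common case: no wrap
  const uint64_t first = file_size - offset;
  return {first, len - first};        // tail restarts after the header
}

int main() {
  auto [a, b] = split_circular_write(0x9F000, 0x2000, 0xA0000);
  std::printf("write 0x%llx at tail, 0x%llx at START_OFFSET\n",
              (unsigned long long)a, (unsigned long long)b);
}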
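
log_write_up_to() funnels concurrent commits through write_lock and flush_lock (group_commit_lock from log0sync.h): one caller becomes the group commit lead, performs a single write and flush covering every queued LSN, and a non-zero release() return value makes it loop for requests that arrived in the meantime. Below is a deliberately simplified, blocking leader/follower sketch of that idea; the GroupCommit class is illustrative only, and the real group_commit_lock is more elaborate (it also supports asynchronous completion callbacks).

// Leader/follower group commit: one thread does the I/O for all waiters.
#include <condition_variable>
#include <cstdint>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

class GroupCommit {
  std::mutex m;
  std::condition_variable cv;
  uint64_t requested = 0, completed = 0;
  bool leader_active = false;
public:
  void write_up_to(uint64_t lsn) {
    std::unique_lock<std::mutex> lk(m);
    if (lsn > requested) requested = lsn;
    while (completed < lsn) {
      if (!leader_active) {           // become the group commit lead
        leader_active = true;
        const uint64_t target = requested;
        lk.unlock();
        // ... the lead performs one write()+flush for every queued LSN ...
        lk.lock();
        completed = target;
        leader_active = false;
        cv.notify_all();              // wake the followers
      } else
        cv.wait(lk);                  // follower: the lead works for us
    }
  }
};

int main() {
  GroupCommit gc;
  std::vector<std::thread> t;
  for (int i = 1; i <= 4; i++)
    t.emplace_back([&, i] { gc.write_up_to(uint64_t(i) * 100); });
  for (auto &th : t) th.join();
  std::puts("all commits durable");
}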