1 files changed, 764 insertions, 0 deletions
diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc
new file mode 100644
index 00000000..52e947b7
--- /dev/null
+++ b/storage/innobase/buf/buf0dblwr.cc
@@ -0,0 +1,764 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0dblwr.cc
+Doublwrite buffer module
+
+Created 2011/12/19
+*******************************************************/
+
+#include "buf0dblwr.h"
+#include "buf0buf.h"
+#include "buf0checksum.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#include "sync0sync.h"
+#include "page0zip.h"
+#include "trx0sys.h"
+#include "fil0crypt.h"
+#include "fil0pagecompress.h"
+
+using st_::span;
+
+/** The doublewrite buffer */
+buf_dblwr_t buf_dblwr;
+
+/** @return the TRX_SYS page */
+inline buf_block_t *buf_dblwr_trx_sys_get(mtr_t *mtr)
+{
+  buf_block_t *block= buf_page_get(page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO),
+                                   0, RW_X_LATCH, mtr);
+  buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+  return block;
+}
+
+/** Initialize the doublewrite buffer data structure.
+@param header   doublewrite page header in the TRX_SYS page */
+inline void buf_dblwr_t::init(const byte *header)
+{
+  ut_ad(!active_slot->first_free);
+  ut_ad(!active_slot->reserved);
+  ut_ad(!batch_running);
+
+  mysql_mutex_init(buf_dblwr_mutex_key, &mutex, nullptr);
+  pthread_cond_init(&cond, nullptr);
+  block1= page_id_t(0, mach_read_from_4(header + TRX_SYS_DOUBLEWRITE_BLOCK1));
+  block2= page_id_t(0, mach_read_from_4(header + TRX_SYS_DOUBLEWRITE_BLOCK2));
+
+  const uint32_t buf_size= 2 * block_size();
+  for (int i= 0; i < 2; i++)
+  {
+    slots[i].write_buf= static_cast<byte*>
+      (aligned_malloc(buf_size << srv_page_size_shift, srv_page_size));
+    slots[i].buf_block_arr= static_cast<element*>
+      (ut_zalloc_nokey(buf_size * sizeof(element)));
+  }
+  active_slot= &slots[0];
+}
+
+/** Create or restore the doublewrite buffer in the TRX_SYS page.
+@return whether the operation succeeded */
+bool buf_dblwr_t::create()
+{
+  if (is_initialised())
+    return true;
+
+  mtr_t mtr;
+  const ulint size= block_size();
+
+start_again:
+  mtr.start();
+
+  buf_block_t *trx_sys_block= buf_dblwr_trx_sys_get(&mtr);
+
+  if (mach_read_from_4(TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC +
+                       trx_sys_block->frame) == TRX_SYS_DOUBLEWRITE_MAGIC_N)
+  {
+    /* The doublewrite buffer has already been created: just read in
+    some numbers */
+    init(TRX_SYS_DOUBLEWRITE + trx_sys_block->frame);
+    mtr.commit();
+    return true;
+  }
+
+  if (UT_LIST_GET_FIRST(fil_system.sys_space->chain)->size < 3 * size)
+  {
+too_small:
+    ib::error() << "Cannot create doublewrite buffer: "
+                   "the first file in innodb_data_file_path must be at least "
+                << (3 * (size >> (20U - srv_page_size_shift))) << "M.";
+    mtr.commit();
+    return false;
+  }
+  else
+  {
+    buf_block_t *b= fseg_create(fil_system.sys_space,
+                                TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG,
+                                &mtr, false, trx_sys_block);
+    if (!b)
+      goto too_small;
+    ib::info() << "Doublewrite buffer not found: creating new";
+
+    /* FIXME: After this point, the doublewrite buffer creation
+    is not atomic. The doublewrite buffer should not exist in
+    the InnoDB system tablespace file in the first place.
+    It could be located in separate optional file(s) in a
+    user-specified location. */
+
+    /* fseg_create acquires a second latch on the page,
+    therefore we must declare it: */
+    buf_block_dbg_add_level(b, SYNC_NO_ORDER_CHECK);
+  }
+
+  byte *fseg_header= TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG +
+    trx_sys_block->frame;
+  for (uint32_t prev_page_no= 0, i= 0, extent_size= FSP_EXTENT_SIZE;
+       i < 2 * size + extent_size / 2; i++)
+  {
+    buf_block_t *new_block= fseg_alloc_free_page(fseg_header, prev_page_no + 1,
+                                                 FSP_UP, &mtr);
+    if (!new_block)
+    {
+      ib::error() << "Cannot create doublewrite buffer: "
+                     " you must increase your tablespace size."
+                     " Cannot continue operation.";
+      /* This may essentially corrupt the doublewrite
+      buffer. However, usually the doublewrite buffer
+      is created at database initialization, and it
+      should not matter (just remove all newly created
+      InnoDB files and restart). */
+      mtr.commit();
+      return false;
+    }
+
+    /* We read the allocated pages to the buffer pool; when they are
+    written to disk in a flush, the space id and page number fields
+    are also written to the pages. When we at database startup read
+    pages from the doublewrite buffer, we know that if the space id
+    and page number in them are the same as the page position in the
+    tablespace, then the page has not been written to in
+    doublewrite. */
+
+    ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1);
+    const page_id_t id= new_block->page.id();
+    /* We only do this in the debug build, to ensure that the check in
+    buf_flush_init_for_writing() will see a valid page type. The
+    flushes of new_block are actually unnecessary here.  */
+    ut_d(mtr.write<2>(*new_block, FIL_PAGE_TYPE + new_block->frame,
+                      FIL_PAGE_TYPE_SYS));
+
+    if (i == size / 2)
+    {
+      ut_a(id.page_no() == size);
+      mtr.write<4>(*trx_sys_block,
+                   TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_BLOCK1 +
+                   trx_sys_block->frame, id.page_no());
+      mtr.write<4>(*trx_sys_block,
+                   TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_REPEAT +
+                   TRX_SYS_DOUBLEWRITE_BLOCK1 + trx_sys_block->frame,
+                   id.page_no());
+    }
+    else if (i == size / 2 + size)
+    {
+      ut_a(id.page_no() == 2 * size);
+      mtr.write<4>(*trx_sys_block,
+                   TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_BLOCK2 +
+                   trx_sys_block->frame, id.page_no());
+      mtr.write<4>(*trx_sys_block,
+                   TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_REPEAT +
+                   TRX_SYS_DOUBLEWRITE_BLOCK2 + trx_sys_block->frame,
+                   id.page_no());
+    }
+    else if (i > size / 2)
+      ut_a(id.page_no() == prev_page_no + 1);
+
+    if (((i + 1) & 15) == 0) {
+      /* rw_locks can only be recursively x-locked 2048 times. (on 32
+      bit platforms, (lint) 0 - (X_LOCK_DECR * 2049) is no longer a
+      negative number, and thus lock_word becomes like a shared lock).
+      For 4k page size this loop will lock the fseg header too many
+      times. Since this code is not done while any other threads are
+      active, restart the MTR occasionally. */
+      mtr.commit();
+      mtr.start();
+      trx_sys_block= buf_dblwr_trx_sys_get(&mtr);
+      fseg_header= TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG +
+        trx_sys_block->frame;
+    }
+
+    prev_page_no= id.page_no();
+  }
+
+  mtr.write<4>(*trx_sys_block,
+               TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC +
+               trx_sys_block->frame, TRX_SYS_DOUBLEWRITE_MAGIC_N);
+  mtr.write<4>(*trx_sys_block,
+               TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC +
+               TRX_SYS_DOUBLEWRITE_REPEAT + trx_sys_block->frame,
+               TRX_SYS_DOUBLEWRITE_MAGIC_N);
+
+  mtr.write<4>(*trx_sys_block,
+               TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED +
+               trx_sys_block->frame, TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N);
+  mtr.commit();
+
+  /* Flush the modified pages to disk and make a checkpoint */
+  log_make_checkpoint();
+
+  /* Remove doublewrite pages from LRU */
+  buf_pool_invalidate();
+
+  ib::info() << "Doublewrite buffer created";
+  goto start_again;
+}
+
+/** Initialize the doublewrite buffer memory structure on recovery.
+If we are upgrading from a version before MySQL 4.1, then this
+function performs the necessary update operations to support
+innodb_file_per_table. If we are in a crash recovery, this function
+loads the pages from double write buffer into memory.
+@param file File handle
+@param path Path name of file
+@return DB_SUCCESS or error code */
+dberr_t buf_dblwr_t::init_or_load_pages(pfs_os_file_t file, const char *path)
+{
+  ut_ad(this == &buf_dblwr);
+  const uint32_t size= block_size();
+
+  /* We do the file i/o past the buffer pool */
+  byte *read_buf= static_cast<byte*>(aligned_malloc(srv_page_size,
+                                                    srv_page_size));
+  /* Read the TRX_SYS header to check if we are using the doublewrite buffer */
+  dberr_t err= os_file_read(IORequestRead, file, read_buf,
+                            TRX_SYS_PAGE_NO << srv_page_size_shift,
+                            srv_page_size);
+
+  if (err != DB_SUCCESS)
+  {
+    ib::error() << "Failed to read the system tablespace header page";
+func_exit:
+    aligned_free(read_buf);
+    return err;
+  }
+
+  /* TRX_SYS_PAGE_NO is not encrypted see fil_crypt_rotate_page() */
+  if (mach_read_from_4(TRX_SYS_DOUBLEWRITE_MAGIC + TRX_SYS_DOUBLEWRITE +
+                       read_buf) != TRX_SYS_DOUBLEWRITE_MAGIC_N)
+  {
+    /* There is no doublewrite buffer initialized in the TRX_SYS page.
+    This should normally not be possible; the doublewrite buffer should
+    be initialized when creating the database. */
+    err= DB_SUCCESS;
+    goto func_exit;
+  }
+
+  init(TRX_SYS_DOUBLEWRITE + read_buf);
+
+  const bool upgrade_to_innodb_file_per_table=
+    mach_read_from_4(TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED +
+                     TRX_SYS_DOUBLEWRITE + read_buf) !=
+    TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N;
+
+  auto write_buf= active_slot->write_buf;
+  /* Read the pages from the doublewrite buffer to memory */
+  err= os_file_read(IORequestRead, file, write_buf,
+                    block1.page_no() << srv_page_size_shift,
+                    size << srv_page_size_shift);
+
+  if (err != DB_SUCCESS)
+  {
+    ib::error() << "Failed to read the first double write buffer extent";
+    goto func_exit;
+  }
+
+  err= os_file_read(IORequestRead, file,
+                    write_buf + (size << srv_page_size_shift),
+                    block2.page_no() << srv_page_size_shift,
+                    size << srv_page_size_shift);
+  if (err != DB_SUCCESS)
+  {
+    ib::error() << "Failed to read the second double write buffer extent";
+    goto func_exit;
+  }
+
+  byte *page= write_buf;
+
+  if (UNIV_UNLIKELY(upgrade_to_innodb_file_per_table))
+  {
+    ib::info() << "Resetting space id's in the doublewrite buffer";
+
+    for (ulint i= 0; i < size * 2; i++, page += srv_page_size)
+    {
+      memset(page + FIL_PAGE_SPACE_ID, 0, 4);
+      /* For innodb_checksum_algorithm=innodb, we do not need to
+      calculate new checksums for the pages because the field
+      .._SPACE_ID does not affect them. Write the page back to where
+      we read it from. */
+      const ulint source_page_no= i < size
+        ? block1.page_no() + i
+        : block2.page_no() + i - size;
+      err= os_file_write(IORequestWrite, path, file, page,
+                         source_page_no << srv_page_size_shift, srv_page_size);
+      if (err != DB_SUCCESS)
+      {
+        ib::error() << "Failed to upgrade the double write buffer";
+        goto func_exit;
+      }
+    }
+    os_file_flush(file);
+  }
+  else
+    for (ulint i= 0; i < size * 2; i++, page += srv_page_size)
+      if (mach_read_from_8(my_assume_aligned<8>(page + FIL_PAGE_LSN)))
+        /* Each valid page header must contain a nonzero FIL_PAGE_LSN field. */
+        recv_sys.dblwr.add(page);
+
+  err= DB_SUCCESS;
+  goto func_exit;
+}
+
+/** Process and remove the double write buffer pages for all tablespaces. */
+void buf_dblwr_t::recover()
+{
+  ut_ad(recv_sys.parse_start_lsn);
+  if (!is_initialised())
+    return;
+
+  uint32_t page_no_dblwr= 0;
+  byte *read_buf= static_cast<byte*>(aligned_malloc(3 * srv_page_size,
+                                                    srv_page_size));
+  byte *const buf= read_buf + srv_page_size;
+
+  for (recv_dblwr_t::list::iterator i= recv_sys.dblwr.pages.begin();
+       i != recv_sys.dblwr.pages.end(); ++i, ++page_no_dblwr)
+  {
+    byte *page= *i;
+    const uint32_t page_no= page_get_page_no(page);
+    if (!page_no) /* recovered via Datafile::restore_from_doublewrite() */
+      continue;
+
+    const lsn_t lsn= mach_read_from_8(page + FIL_PAGE_LSN);
+    if (recv_sys.parse_start_lsn > lsn)
+      /* Pages written before the checkpoint are not useful for recovery. */
+      continue;
+    const ulint space_id= page_get_space_id(page);
+    const page_id_t page_id(space_id, page_no);
+
+    if (recv_sys.scanned_lsn < lsn)
+    {
+      ib::info() << "Ignoring a doublewrite copy of page " << page_id
+                 << " with future log sequence number " << lsn;
+      continue;
+    }
+
+    fil_space_t *space= fil_space_t::get(space_id);
+
+    if (!space)
+      /* The tablespace that this page once belonged to does not exist */
+      continue;
+
+    if (UNIV_UNLIKELY(page_no >= space->get_size()))
+    {
+      /* Do not report the warning for undo tablespaces, because they
+      can be truncated in place. */
+      if (!srv_is_undo_tablespace(space_id))
+        ib::warn() << "A copy of page " << page_no
+                   << " in the doublewrite buffer slot " << page_no_dblwr
+                   << " is beyond the end of tablespace " << space->name
+                   << " (" << space->size << " pages)";
+next_page:
+      space->release();
+      continue;
+    }
+
+    const ulint physical_size= space->physical_size();
+    ut_ad(!buf_is_zeroes(span<const byte>(page, physical_size)));
+
+    /* We want to ensure that for partial reads the unread portion of
+    the page is NUL. */
+    memset(read_buf, 0x0, physical_size);
+
+    /* Read in the actual page from the file */
+    fil_io_t fio= space->io(IORequest(IORequest::DBLWR_RECOVER),
+                            os_offset_t{page_no} * physical_size,
+                            physical_size, read_buf);
+
+    if (UNIV_UNLIKELY(fio.err != DB_SUCCESS))
+       ib::warn() << "Double write buffer recovery: " << page_id
+                  << " (tablespace '" << space->name
+                  << "') read failed with error: " << fio.err;
+
+    if (buf_is_zeroes(span<const byte>(read_buf, physical_size)))
+    {
+      /* We will check if the copy in the doublewrite buffer is
+      valid. If not, we will ignore this page (there should be redo
+      log records to initialize it). */
+    }
+    else if (recv_sys.dblwr.validate_page(page_id, read_buf, space, buf))
+      goto next_page;
+    else
+      /* We intentionally skip this message for all-zero pages. */
+      ib::info() << "Trying to recover page " << page_id
+                 << " from the doublewrite buffer.";
+
+    page= recv_sys.dblwr.find_page(page_id, space, buf);
+
+    if (!page)
+      goto next_page;
+
+    /* Write the good page from the doublewrite buffer to the intended
+    position. */
+    space->reacquire();
+    fio= space->io(IORequestWrite,
+                   os_offset_t{page_id.page_no()} * physical_size,
+                   physical_size, page);
+
+    if (fio.err == DB_SUCCESS)
+      ib::info() << "Recovered page " << page_id << " to '" << fio.node->name
+                 << "' from the doublewrite buffer.";
+    goto next_page;
+  }
+
+  recv_sys.dblwr.pages.clear();
+  fil_flush_file_spaces();
+  aligned_free(read_buf);
+}
+
+/** Free the doublewrite buffer. */
+void buf_dblwr_t::close()
+{
+  if (!is_initialised())
+    return;
+
+  /* Free the double write data structures. */
+  ut_ad(!active_slot->reserved);
+  ut_ad(!active_slot->first_free);
+  ut_ad(!batch_running);
+
+  pthread_cond_destroy(&cond);
+  for (int i= 0; i < 2; i++)
+  {
+    aligned_free(slots[i].write_buf);
+    ut_free(slots[i].buf_block_arr);
+  }
+  mysql_mutex_destroy(&mutex);
+
+  memset((void*) this, 0, sizeof *this);
+  active_slot= &slots[0];
+}
+
+/** Update the doublewrite buffer on write completion. */
+void buf_dblwr_t::write_completed()
+{
+  ut_ad(this == &buf_dblwr);
+  ut_ad(srv_use_doublewrite_buf);
+  ut_ad(is_initialised());
+  ut_ad(!srv_read_only_mode);
+
+  mysql_mutex_lock(&mutex);
+
+  ut_ad(batch_running);
+  slot *flush_slot= active_slot == &slots[0] ? &slots[1] : &slots[0];
+  ut_ad(flush_slot->reserved);
+  ut_ad(flush_slot->reserved <= flush_slot->first_free);
+
+  if (!--flush_slot->reserved)
+  {
+    mysql_mutex_unlock(&mutex);
+    /* This will finish the batch. Sync data files to the disk. */
+    fil_flush_file_spaces();
+    mysql_mutex_lock(&mutex);
+
+    /* We can now reuse the doublewrite memory buffer: */
+    flush_slot->first_free= 0;
+    batch_running= false;
+    pthread_cond_broadcast(&cond);
+  }
+
+  mysql_mutex_unlock(&mutex);
+}
+
+#ifdef UNIV_DEBUG
+/** Check the LSN values on the page.
+@param[in] page  page to check
+@param[in] s     tablespace */
+static void buf_dblwr_check_page_lsn(const page_t* page, const fil_space_t& s)
+{
+  /* Ignore page_compressed or encrypted pages */
+  if (s.is_compressed() || buf_page_get_key_version(page, s.flags))
+    return;
+  const byte* lsn_start= FIL_PAGE_LSN + 4 + page;
+  const byte* lsn_end= page + srv_page_size -
+    (s.full_crc32()
+     ? FIL_PAGE_FCRC32_END_LSN
+     : FIL_PAGE_END_LSN_OLD_CHKSUM - 4);
+  static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment");
+  static_assert(FIL_PAGE_LSN % 4 == 0, "alignment");
+  ut_ad(!memcmp_aligned<4>(lsn_start, lsn_end, 4));
+}
+
+static void buf_dblwr_check_page_lsn(const buf_page_t &b, const byte *page)
+{
+  if (fil_space_t *space= fil_space_t::get(b.id().space()))
+  {
+    buf_dblwr_check_page_lsn(page, *space);
+    space->release();
+  }
+}
+
+/** Check the LSN values on the page with which this block is associated. */
+static void buf_dblwr_check_block(const buf_page_t *bpage)
+{
+  ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE);
+  const page_t *page= reinterpret_cast<const buf_block_t*>(bpage)->frame;
+
+  switch (fil_page_get_type(page)) {
+  case FIL_PAGE_INDEX:
+  case FIL_PAGE_TYPE_INSTANT:
+  case FIL_PAGE_RTREE:
+    if (page_is_comp(page))
+    {
+      if (page_simple_validate_new(page))
+        return;
+    }
+    else if (page_simple_validate_old(page))
+      return;
+    /* While it is possible that this is not an index page but just
+    happens to have wrongly set FIL_PAGE_TYPE, such pages should never
+    be modified to without also adjusting the page type during page
+    allocation or buf_flush_init_for_writing() or
+    fil_block_reset_type(). */
+    buf_page_print(page);
+
+    ib::fatal() << "Apparent corruption of an index page " << bpage->id()
+                << " to be written to data file. We intentionally crash"
+                " the server to prevent corrupt data from ending up in"
+                " data files.";
+  }
+}
+#endif /* UNIV_DEBUG */
+
+bool buf_dblwr_t::flush_buffered_writes(const ulint size)
+{
+  mysql_mutex_assert_owner(&mutex);
+  ut_ad(size == block_size());
+
+  for (;;)
+  {
+    if (!active_slot->first_free)
+      return false;
+    if (!batch_running)
+      break;
+    my_cond_wait(&cond, &mutex.m_mutex);
+  }
+
+  ut_ad(active_slot->reserved == active_slot->first_free);
+  ut_ad(!flushing_buffered_writes);
+
+  /* Disallow anyone else to start another batch of flushing. */
+  slot *flush_slot= active_slot;
+  /* Switch the active slot */
+  active_slot= active_slot == &slots[0] ? &slots[1] : &slots[0];
+  ut_a(active_slot->first_free == 0);
+  batch_running= true;
+  const ulint old_first_free= flush_slot->first_free;
+  auto write_buf= flush_slot->write_buf;
+  const bool multi_batch= block1 + static_cast<uint32_t>(size) != block2 &&
+    old_first_free > size;
+  flushing_buffered_writes= 1 + multi_batch;
+  pages_submitted+= old_first_free;
+  /* Now safe to release the mutex. */
+  mysql_mutex_unlock(&mutex);
+#ifdef UNIV_DEBUG
+  for (ulint len2= 0, i= 0; i < old_first_free; len2 += srv_page_size, i++)
+  {
+    buf_page_t *bpage= flush_slot->buf_block_arr[i].request.bpage;
+
+    if (bpage->zip.data)
+      /* No simple validate for ROW_FORMAT=COMPRESSED pages exists. */
+      continue;
+
+    /* Check that the actual page in the buffer pool is not corrupt
+    and the LSN values are sane. */
+    buf_dblwr_check_block(bpage);
+    ut_d(buf_dblwr_check_page_lsn(*bpage, write_buf + len2));
+  }
+#endif /* UNIV_DEBUG */
+  const IORequest request(nullptr, fil_system.sys_space->chain.start,
+                          IORequest::DBLWR_BATCH);
+  ut_a(fil_system.sys_space->acquire());
+  if (multi_batch)
+  {
+    fil_system.sys_space->reacquire();
+    os_aio(request, write_buf,
+           os_offset_t{block1.page_no()} << srv_page_size_shift,
+           size << srv_page_size_shift);
+    os_aio(request, write_buf + (size << srv_page_size_shift),
+           os_offset_t{block2.page_no()} << srv_page_size_shift,
+           (old_first_free - size) << srv_page_size_shift);
+  }
+  else
+    os_aio(request, write_buf,
+           os_offset_t{block1.page_no()} << srv_page_size_shift,
+           old_first_free << srv_page_size_shift);
+  return true;
+}
+
+void buf_dblwr_t::flush_buffered_writes_completed(const IORequest &request)
+{
+  ut_ad(this == &buf_dblwr);
+  ut_ad(srv_use_doublewrite_buf);
+  ut_ad(is_initialised());
+  ut_ad(!srv_read_only_mode);
+  ut_ad(!request.bpage);
+  ut_ad(request.node == fil_system.sys_space->chain.start);
+  ut_ad(request.type == IORequest::DBLWR_BATCH);
+  mysql_mutex_lock(&mutex);
+  ut_ad(batch_running);
+  ut_ad(flushing_buffered_writes);
+  ut_ad(flushing_buffered_writes <= 2);
+  writes_completed++;
+  if (UNIV_UNLIKELY(--flushing_buffered_writes))
+  {
+    mysql_mutex_unlock(&mutex);
+    return;
+  }
+
+  slot *const flush_slot= active_slot == &slots[0] ? &slots[1] : &slots[0];
+  ut_ad(flush_slot->reserved == flush_slot->first_free);
+  /* increment the doublewrite flushed pages counter */
+  pages_written+= flush_slot->first_free;
+  mysql_mutex_unlock(&mutex);
+
+  /* Now flush the doublewrite buffer data to disk */
+  fil_system.sys_space->flush<false>();
+
+  /* The writes have been flushed to disk now and in recovery we will
+  find them in the doublewrite buffer blocks. Next, write the data pages. */
+  for (ulint i= 0, first_free= flush_slot->first_free; i < first_free; i++)
+  {
+    auto e= flush_slot->buf_block_arr[i];
+    buf_page_t* bpage= e.request.bpage;
+    ut_ad(bpage->in_file());
+
+    /* We request frame here to get correct buffer in case of
+    encryption and/or page compression */
+    void *frame= buf_page_get_frame(bpage);
+
+    auto e_size= e.size;
+
+    if (UNIV_LIKELY_NULL(bpage->zip.data))
+    {
+      e_size= bpage->zip_size();
+      ut_ad(e_size);
+    }
+    else
+    {
+      ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE);
+      ut_ad(!bpage->zip_size());
+      ut_d(buf_dblwr_check_page_lsn(*bpage, static_cast<const byte*>(frame)));
+    }
+
+    const lsn_t lsn= mach_read_from_8(my_assume_aligned<8>
+                                      (FIL_PAGE_LSN +
+                                       static_cast<const byte*>(frame)));
+    ut_ad(lsn);
+    ut_ad(lsn >= bpage->oldest_modification());
+    log_write_up_to(lsn, true);
+    e.request.node->space->io(e.request, bpage->physical_offset(), e_size,
+                              frame, bpage);
+  }
+}
+
+/** Flush possible buffered writes to persistent storage.
+It is very important to call this function after a batch of writes has been
+posted, and also when we may have to wait for a page latch!
+Otherwise a deadlock of threads can occur. */
+void buf_dblwr_t::flush_buffered_writes()
+{
+  if (!is_initialised() || !srv_use_doublewrite_buf)
+  {
+    fil_flush_file_spaces();
+    return;
+  }
+
+  ut_ad(!srv_read_only_mode);
+  const ulint size= block_size();
+
+  mysql_mutex_lock(&mutex);
+  if (!flush_buffered_writes(size))
+    mysql_mutex_unlock(&mutex);
+}
+
+/** Schedule a page write. If the doublewrite memory buffer is full,
+flush_buffered_writes() will be invoked to make space.
+@param request    asynchronous write request
+@param size       payload size in bytes */
+void buf_dblwr_t::add_to_batch(const IORequest &request, size_t size)
+{
+  ut_ad(request.is_async());
+  ut_ad(request.is_write());
+  ut_ad(request.bpage);
+  ut_ad(request.bpage->in_file());
+  ut_ad(request.node);
+  ut_ad(request.node->space->id == request.bpage->id().space());
+  ut_ad(request.node->space->referenced());
+  ut_ad(!srv_read_only_mode);
+
+  const ulint buf_size= 2 * block_size();
+
+  mysql_mutex_lock(&mutex);
+
+  for (;;)
+  {
+    ut_ad(active_slot->first_free <= buf_size);
+    if (active_slot->first_free != buf_size)
+      break;
+
+    if (flush_buffered_writes(buf_size / 2))
+      mysql_mutex_lock(&mutex);
+  }
+
+  byte *p= active_slot->write_buf + srv_page_size * active_slot->first_free;
+
+  /* We request frame here to get correct buffer in case of
+  encryption and/or page compression */
+  void *frame= buf_page_get_frame(request.bpage);
+
+  /* "frame" is at least 1024-byte aligned for ROW_FORMAT=COMPRESSED pages,
+  and at least srv_page_size (4096-byte) for everything else. */
+  memcpy_aligned<UNIV_ZIP_SIZE_MIN>(p, frame, size);
+  /* fil_page_compress() for page_compressed guarantees 256-byte alignment */
+  memset_aligned<256>(p + size, 0, srv_page_size - size);
+  /* FIXME: Inform the compiler that "size" and "srv_page_size - size"
+  are integer multiples of 256, so the above can translate into simple
+  SIMD instructions. Currently, we make no such assumptions about the
+  non-pointer parameters that are passed to the _aligned templates. */
+  ut_ad(!request.bpage->zip_size() || request.bpage->zip_size() == size);
+  ut_ad(active_slot->reserved == active_slot->first_free);
+  ut_ad(active_slot->reserved < buf_size);
+  new (active_slot->buf_block_arr + active_slot->first_free++)
+    element{request, size};
+  active_slot->reserved= active_slot->first_free;
+
+  if (active_slot->first_free != buf_size ||
+      !flush_buffered_writes(buf_size / 2))
+    mysql_mutex_unlock(&mutex);
+}