author    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 18:45:59 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 18:45:59 +0000
commit    19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree      42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/crimson/common
parent    Initial commit. (diff)
Adding upstream version 16.2.11+ds.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/crimson/common')
24 files changed, 4139 insertions(+), 0 deletions(-)
diff --git a/src/crimson/common/assert.cc b/src/crimson/common/assert.cc
new file mode 100644
index 000000000..07610c33f
--- /dev/null
+++ b/src/crimson/common/assert.cc
@@ -0,0 +1,81 @@
+#include <cstdarg>
+#include <iostream>
+
+#include <seastar/util/backtrace.hh>
+#include <seastar/core/reactor.hh>
+
+#include "include/ceph_assert.h"
+
+#include "crimson/common/log.h"
+
+namespace ceph {
+  [[gnu::cold]] void __ceph_assert_fail(const ceph::assert_data &ctx)
+  {
+    __ceph_assert_fail(ctx.assertion, ctx.file, ctx.line, ctx.function);
+  }
+
+  [[gnu::cold]] void __ceph_assert_fail(const char* assertion,
+                                        const char* file, int line,
+                                        const char* func)
+  {
+    seastar::logger& logger = crimson::get_logger(0);
+    // note: the seastar logger uses fmt-style "{}" placeholders; a printf
+    // "%s" here would print literally and drop the trailing backtrace arg
+    logger.error("{}:{} : In function '{}', ceph_assert({})\n"
+                 "{}",
+                 file, line, func, assertion,
+                 seastar::current_backtrace());
+    std::cout << std::flush;
+    abort();
+  }
+
+  [[gnu::cold]] void __ceph_assertf_fail(const char *assertion,
+                                         const char *file, int line,
+                                         const char *func, const char* msg,
+                                         ...)
+  {
+    char buf[8096];
+    va_list args;
+    va_start(args, msg);
+    std::vsnprintf(buf, sizeof(buf), msg, args);
+    va_end(args);
+
+    seastar::logger& logger = crimson::get_logger(0);
+    logger.error("{}:{} : In function '{}', ceph_assert({})\n"
+                 "{}\n{}\n",
+                 file, line, func, assertion,
+                 buf,
+                 seastar::current_backtrace());
+    std::cout << std::flush;
+    abort();
+  }
+
+  [[gnu::cold]] void __ceph_abort(const char* file, int line,
+                                  const char* func, const std::string& msg)
+  {
+    seastar::logger& logger = crimson::get_logger(0);
+    logger.error("{}:{} : In function '{}', abort({})\n"
+                 "{}",
+                 file, line, func, msg,
+                 seastar::current_backtrace());
+    std::cout << std::flush;
+    abort();
+  }
+
+  [[gnu::cold]] void __ceph_abortf(const char* file, int line,
+                                   const char* func, const char* fmt,
+                                   ...)
+  {
+    char buf[8096];
+    va_list args;
+    va_start(args, fmt);
+    std::vsnprintf(buf, sizeof(buf), fmt, args);
+    va_end(args);
+
+    seastar::logger& logger = crimson::get_logger(0);
+    logger.error("{}:{} : In function '{}', abort()\n"
+                 "{}\n{}\n",
+                 file, line, func,
+                 buf,
+                 seastar::current_backtrace());
+    std::cout << std::flush;
+    abort();
+  }
+}
diff --git a/src/crimson/common/auth_handler.h b/src/crimson/common/auth_handler.h
new file mode 100644
index 000000000..d4140b6a2
--- /dev/null
+++ b/src/crimson/common/auth_handler.h
@@ -0,0 +1,17 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+class EntityName;
+class AuthCapsInfo;
+
+namespace crimson::common {
+class AuthHandler {
+public:
+  // the peer just got authorized
+  virtual void handle_authentication(const EntityName& name,
+                                     const AuthCapsInfo& caps) = 0;
+  virtual ~AuthHandler() = default;
+};
+}
diff --git a/src/crimson/common/buffer_io.cc b/src/crimson/common/buffer_io.cc
new file mode 100644
index 000000000..86edf7a6f
--- /dev/null
+++ b/src/crimson/common/buffer_io.cc
@@ -0,0 +1,57 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "buffer_io.h"
+
+#include <seastar/core/reactor.hh>
+#include <seastar/core/fstream.hh>
+#include <seastar/core/do_with.hh>
+
+#include "include/buffer.h"
+
+namespace crimson {
+
+seastar::future<> write_file(ceph::buffer::list&& bl,
+                             seastar::sstring fn,
+                             seastar::file_permissions permissions)
+{
+  const auto flags = (seastar::open_flags::wo |
+                      seastar::open_flags::create |
+                      seastar::open_flags::truncate);
+  seastar::file_open_options foo;
+  foo.create_permissions = permissions;
+  return seastar::open_file_dma(fn, flags, foo).then(
+    [bl=std::move(bl)](seastar::file f) {
+    return seastar::make_file_output_stream(f).then(
+      [bl=std::move(bl), f=std::move(f)](seastar::output_stream<char> out) {
+      return seastar::do_with(std::move(out),
+                              std::move(f),
+                              std::move(bl),
+                              [](seastar::output_stream<char>& out,
+                                 seastar::file& f,
+                                 ceph::buffer::list& bl) {
+        return seastar::do_for_each(bl.buffers(), [&out](auto& buf) {
+          return out.write(buf.c_str(), buf.length());
+        }).then([&out] {
+          return out.close();
+        });
+      });
+    });
+  });
+}
+
+seastar::future<seastar::temporary_buffer<char>>
+read_file(const seastar::sstring fn)
+{
+  return seastar::open_file_dma(fn, seastar::open_flags::ro).then(
+    [] (seastar::file f) {
+    return f.size().then([f = std::move(f)](size_t s) {
+      return seastar::do_with(seastar::make_file_input_stream(f),
+                              [s](seastar::input_stream<char>& in) {
+        return in.read_exactly(s);
+      });
+    });
+  });
+}
+
+}
diff --git a/src/crimson/common/buffer_io.h b/src/crimson/common/buffer_io.h
new file mode 100644
index 000000000..c5ece4a6f
--- /dev/null
+++ b/src/crimson/common/buffer_io.h
@@ -0,0 +1,21 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/core/file-types.hh>
+
+#include "include/buffer_fwd.h"
+
+namespace crimson {
+  seastar::future<> write_file(ceph::buffer::list&& bl,
+                               seastar::sstring fn,
+                               seastar::file_permissions= // 0644
+                                 (seastar::file_permissions::user_read |
+                                  seastar::file_permissions::user_write |
+                                  seastar::file_permissions::group_read |
+                                  seastar::file_permissions::others_read));
+  seastar::future<seastar::temporary_buffer<char>>
+  read_file(const seastar::sstring fn);
+}
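Taken together, these two helpers cover the common "write a whole bufferlist, read a whole file" cases. A minimal usage sketch, assuming a stock seastar app_template and an invented temporary path (illustrative only, not part of this patch):

// Illustrative sketch: round-trip a bufferlist through write_file()/read_file().
#include <seastar/core/app-template.hh>
#include "crimson/common/buffer_io.h"
#include "include/buffer.h"

int main(int argc, char** argv) {
  seastar::app_template app;
  return app.run(argc, argv, [] {
    ceph::buffer::list bl;
    bl.append("hello crimson\n");
    // uses the default 0644 create permissions declared in buffer_io.h
    return crimson::write_file(std::move(bl), "/tmp/crimson_demo").then([] {
      return crimson::read_file("/tmp/crimson_demo");
    }).then([](seastar::temporary_buffer<char> buf) {
      // the whole file arrives in one temporary_buffer, sized via f.size()
      return seastar::make_ready_future<>();
    });
  });
}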
diff --git a/src/crimson/common/config_proxy.cc b/src/crimson/common/config_proxy.cc
new file mode 100644
index 000000000..88d4679d5
--- /dev/null
+++ b/src/crimson/common/config_proxy.cc
@@ -0,0 +1,93 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "config_proxy.h"
+
+#include <filesystem>
+
+#include "crimson/common/buffer_io.h"
+
+namespace crimson::common {
+
+ConfigProxy::ConfigProxy(const EntityName& name, std::string_view cluster)
+{
+  if (seastar::this_shard_id() != 0) {
+    return;
+  }
+  // set the initial value on CPU#0
+  values.reset(seastar::make_lw_shared<ConfigValues>());
+  values.get()->name = name;
+  values.get()->cluster = cluster;
+  // and the only copy of md_config_impl<> is allocated on CPU#0
+  local_config.reset(new md_config_t{*values, obs_mgr, true});
+  if (name.is_mds()) {
+    local_config->set_val_default(*values, obs_mgr,
+                                  "keyring", "$mds_data/keyring");
+  } else if (name.is_osd()) {
+    local_config->set_val_default(*values, obs_mgr,
+                                  "keyring", "$osd_data/keyring");
+  }
+}
+
+seastar::future<> ConfigProxy::start()
+{
+  // populate values and config to all other shards
+  if (!values) {
+    return seastar::make_ready_future<>();
+  }
+  return container().invoke_on_others([this](auto& proxy) {
+    return values.copy().then([config=local_config.get(),
+                               &proxy](auto foreign_values) {
+      proxy.values.reset();
+      proxy.values = std::move(foreign_values);
+      proxy.remote_config = config;
+      return seastar::make_ready_future<>();
+    });
+  });
+}
+
+void ConfigProxy::show_config(ceph::Formatter* f) const {
+  get_config().show_config(*values, f);
+}
+
+seastar::future<> ConfigProxy::parse_config_files(const std::string& conf_files)
+{
+  auto conffile_paths =
+    get_config().get_conffile_paths(*values,
+                                    conf_files.empty() ? nullptr : conf_files.c_str(),
+                                    &std::cerr,
+                                    CODE_ENVIRONMENT_DAEMON);
+  return seastar::do_with(std::move(conffile_paths), [this] (auto& paths) {
+    return seastar::repeat([path=paths.begin(), e=paths.end(), this]() mutable {
+      if (path == e) {
+        // tried all conffiles, none of them worked
+        return seastar::make_ready_future<seastar::stop_iteration>(
+          seastar::stop_iteration::yes);
+      }
+      return crimson::read_file(*path++).then([this](auto&& buf) {
+        return do_change([buf=std::move(buf), this](ConfigValues& values) {
+          if (get_config().parse_buffer(values, obs_mgr,
+                                        buf.get(), buf.size(),
+                                        &std::cerr) == 0) {
+            get_config().update_legacy_vals(values);
+          } else {
+            throw std::invalid_argument("parse error");
+          }
+        }).then([] {
+          // this one works!
+          return seastar::make_ready_future<seastar::stop_iteration>(
+            seastar::stop_iteration::yes);
+        });
+      }).handle_exception_type([] (const std::filesystem::filesystem_error&) {
+        return seastar::make_ready_future<seastar::stop_iteration>(
+          seastar::stop_iteration::no);
+      }).handle_exception_type([] (const std::invalid_argument&) {
+        return seastar::make_ready_future<seastar::stop_iteration>(
+          seastar::stop_iteration::no);
+      });
+    });
+  });
+}
+
+ConfigProxy::ShardedConfig ConfigProxy::sharded_conf;
+}
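With the implementation above and the header that follows, the caller-side contract is small: reads hit the shard-local ConfigValues copy, while writes funnel through do_change() on the owner shard. A hedged sketch (the option name is only an example):

// Illustrative sketch: reading and updating an option through the proxy.
#include "crimson/common/config_proxy.h"

seastar::future<> tune() {
  auto& conf = crimson::common::local_conf();
  // read: dereferences this shard's copy of ConfigValues, no cross-shard hop
  auto interval = conf.get_val<int64_t>("osd_heartbeat_interval");
  std::ignore = interval;
  // write: copies the values on the owner shard, applies the change, then
  // republishes the new copy to every other shard and notifies observers
  return conf.set_val("osd_heartbeat_interval", "12");
}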
diff --git a/src/crimson/common/config_proxy.h b/src/crimson/common/config_proxy.h
new file mode 100644
index 000000000..f50a63431
--- /dev/null
+++ b/src/crimson/common/config_proxy.h
@@ -0,0 +1,200 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/reactor.hh>
+#include <seastar/core/sharded.hh>
+#include "common/config.h"
+#include "common/config_obs.h"
+#include "common/config_obs_mgr.h"
+#include "common/errno.h"
+
+namespace ceph {
+class Formatter;
+}
+
+namespace crimson::common {
+
+// a facade for managing config. each shard has its own copy of ConfigProxy.
+//
+// In seastar-osd, there could be multiple instances of @c ConfigValues in a
+// single process, as we are using a variant of read-copy-update machinery to
+// update the settings at runtime.
+class ConfigProxy : public seastar::peering_sharded_service<ConfigProxy>
+{
+  using LocalConfigValues = seastar::lw_shared_ptr<ConfigValues>;
+  seastar::foreign_ptr<LocalConfigValues> values;
+
+  md_config_t* remote_config = nullptr;
+  std::unique_ptr<md_config_t> local_config;
+
+  using ConfigObserver = ceph::md_config_obs_impl<ConfigProxy>;
+  ObserverMgr<ConfigObserver> obs_mgr;
+
+  const md_config_t& get_config() const {
+    return remote_config ? *remote_config : *local_config;
+  }
+  md_config_t& get_config() {
+    return remote_config ? *remote_config : *local_config;
+  }
+
+  // apply changes to all shards
+  // @param func a functor which accepts @c "ConfigValues&"
+  template<typename Func>
+  seastar::future<> do_change(Func&& func) {
+    return container().invoke_on(values.get_owner_shard(),
+                                 [func = std::move(func)](ConfigProxy& owner) {
+      // apply the changes to a copy of the values
+      auto new_values = seastar::make_lw_shared(*owner.values);
+      new_values->changed.clear();
+      func(*new_values);
+
+      // always apply the new settings synchronously on the owner shard, to
+      // avoid races with other do_change() calls in parallel.
+      ObserverMgr<ConfigObserver>::rev_obs_map rev_obs;
+      owner.values.reset(new_values);
+      owner.obs_mgr.for_each_change(owner.values->changed, owner,
+                                    [&rev_obs](ConfigObserver *obs,
+                                               const std::string &key) {
+                                      rev_obs[obs].insert(key);
+                                    }, nullptr);
+      for (auto& [obs, keys] : rev_obs) {
+        obs->handle_conf_change(owner, keys);
+      }
+
+      return seastar::parallel_for_each(boost::irange(1u, seastar::smp::count),
+                                        [&owner, new_values] (auto cpu) {
+        return owner.container().invoke_on(cpu,
+          [foreign_values = seastar::make_foreign(new_values)](ConfigProxy& proxy) mutable {
+            proxy.values.reset();
+            proxy.values = std::move(foreign_values);
+
+            ObserverMgr<ConfigObserver>::rev_obs_map rev_obs;
+            proxy.obs_mgr.for_each_change(proxy.values->changed, proxy,
+                                          [&rev_obs](ConfigObserver *obs,
+                                                     const std::string& key) {
+                                            rev_obs[obs].insert(key);
+                                          }, nullptr);
+            for (auto& obs_keys : rev_obs) {
+              obs_keys.first->handle_conf_change(proxy, obs_keys.second);
+            }
+          });
+      }).finally([new_values] {
+        new_values->changed.clear();
+      });
+    });
+  }
+public:
+  ConfigProxy(const EntityName& name, std::string_view cluster);
+  const ConfigValues* operator->() const noexcept {
+    return values.get();
+  }
+  const ConfigValues get_config_values() {
+    return *values.get();
+  }
+  ConfigValues* operator->() noexcept {
+    return values.get();
+  }
+
+  // required by sharded<>
+  seastar::future<> start();
+  seastar::future<> stop() {
+    return seastar::make_ready_future<>();
+  }
+  void add_observer(ConfigObserver* obs) {
+    obs_mgr.add_observer(obs);
+  }
+  void remove_observer(ConfigObserver* obs) {
+    obs_mgr.remove_observer(obs);
+  }
+  seastar::future<> rm_val(const std::string& key) {
+    return do_change([key, this](ConfigValues& values) {
+      auto ret = get_config().rm_val(values, key);
+      if (ret < 0) {
+        throw std::invalid_argument(cpp_strerror(ret));
+      }
+    });
+  }
+  seastar::future<> set_val(const std::string& key,
+                            const std::string& val) {
+    return do_change([key, val, this](ConfigValues& values) {
+      std::stringstream err;
+      auto ret = get_config().set_val(values, obs_mgr, key, val, &err);
+      if (ret < 0) {
+        throw std::invalid_argument(err.str());
+      }
+    });
+  }
+  int get_val(const std::string &key, std::string *val) const {
+    return get_config().get_val(*values, key, val);
+  }
+  template<typename T>
+  const T get_val(const std::string& key) const {
+    return get_config().template get_val<T>(*values, key);
+  }
+
+  int get_all_sections(std::vector<std::string>& sections) const {
+    return get_config().get_all_sections(sections);
+  }
+
+  int get_val_from_conf_file(const std::vector<std::string>& sections,
+                             const std::string& key, std::string& out,
+                             bool expand_meta) const {
+    return get_config().get_val_from_conf_file(*values, sections, key,
+                                               out, expand_meta);
+  }
+
+  unsigned get_osd_pool_default_min_size(uint8_t size) const {
+    return get_config().get_osd_pool_default_min_size(*values, size);
+  }
+
+  seastar::future<>
+  set_mon_vals(const std::map<std::string,std::string,std::less<>>& kv) {
+    return do_change([kv, this](ConfigValues& values) {
+      get_config().set_mon_vals(nullptr, values, obs_mgr, kv, nullptr);
+    });
+  }
+
+  seastar::future<> inject_args(const std::string& s) {
+    return do_change([s, this](ConfigValues& values) {
+      std::stringstream err;
+      if (get_config().injectargs(values, obs_mgr, s, &err)) {
+        throw std::invalid_argument(err.str());
+      }
+    });
+  }
+  void show_config(ceph::Formatter* f) const;
+
+  seastar::future<> parse_argv(std::vector<const char*>& argv) {
+    // we could pass whatever is unparsed to
seastar, but seastar::app_template + // is used for driving the seastar application, and + // crimson::common::ConfigProxy is not available until seastar engine is up + // and running, so we have to feed the command line args to app_template + // first, then pass them to ConfigProxy. + return do_change([&argv, this](ConfigValues& values) { + get_config().parse_argv(values, + obs_mgr, + argv, + CONF_CMDLINE); + }); + } + + seastar::future<> parse_config_files(const std::string& conf_files); + + using ShardedConfig = seastar::sharded<ConfigProxy>; + +private: + static ShardedConfig sharded_conf; + friend ConfigProxy& local_conf(); + friend ShardedConfig& sharded_conf(); +}; + +inline ConfigProxy& local_conf() { + return ConfigProxy::sharded_conf.local(); +} + +inline ConfigProxy::ShardedConfig& sharded_conf() { + return ConfigProxy::sharded_conf; +} + +} diff --git a/src/crimson/common/errorator.h b/src/crimson/common/errorator.h new file mode 100644 index 000000000..af1e6ea45 --- /dev/null +++ b/src/crimson/common/errorator.h @@ -0,0 +1,1140 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <exception> +#include <system_error> + +#include <seastar/core/future-util.hh> + +#include "include/ceph_assert.h" + +namespace crimson { + +template<typename Iterator, typename AsyncAction> +inline auto do_for_each(Iterator begin, Iterator end, AsyncAction action) { + using futurator = \ + ::seastar::futurize<std::invoke_result_t<AsyncAction, decltype(*begin)>>; + + if (begin == end) { + return futurator::type::errorator_type::template make_ready_future<>(); + } + while (true) { + auto f = futurator::invoke(action, *begin); + ++begin; + if (begin == end) { + return f; + } + if (!f.available() || seastar::need_preempt()) { + return std::move(f)._then( + [ action = std::move(action), + begin = std::move(begin), + end = std::move(end) + ] () mutable { + return ::crimson::do_for_each(std::move(begin), + std::move(end), + std::move(action)); + }); + } + if (f.failed()) { + return f; + } + } +} +template<typename Container, typename AsyncAction> +inline auto do_for_each(Container& c, AsyncAction action) { + return ::crimson::do_for_each(std::begin(c), std::end(c), std::move(action)); +} + +template<typename AsyncAction> +inline auto do_until(AsyncAction action) { + using errorator_t = + typename ::seastar::futurize_t<std::invoke_result_t<AsyncAction>>::errorator_type; + + while (true) { + auto f = ::seastar::futurize_invoke(action); + if (f.failed()) { + return errorator_t::template make_exception_future2<>( + f.get_exception() + ); + } else if (f.available()) { + if (auto done = f.get0()) { + return errorator_t::template make_ready_future<>(); + } + } else { + return std::move(f)._then( + [action = std::move(action)] (auto &&done) mutable { + if (done) { + return errorator_t::template make_ready_future<>(); + } + return ::crimson::do_until( + std::move(action)); + }); + } + } +} + +// define the interface between error types and errorator +template <class ConcreteErrorT> +class error_t { + static constexpr const std::type_info& get_exception_ptr_type_info() { + return ConcreteErrorT::exception_ptr_type_info(); + } + + std::exception_ptr to_exception_ptr() const { + const auto* concrete_error = static_cast<const ConcreteErrorT*>(this); + return concrete_error->to_exception_ptr(); + } + + decltype(auto) static from_exception_ptr(std::exception_ptr ep) { + return ConcreteErrorT::from_exception_ptr(std::move(ep)); + } + + template 
<class... AllowedErrorsT> + friend struct errorator; + + template <class ErrorVisitorT, class FuturatorT> + friend class maybe_handle_error_t; + +public: + template <class Func> + static decltype(auto) handle(Func&& func) { + return ConcreteErrorT::handle(std::forward<Func>(func)); + } +}; + +// unthrowable_wrapper ensures compilation failure when somebody +// would like to `throw make_error<...>)()` instead of returning. +// returning allows for the compile-time verification of future's +// AllowedErrorsV and also avoid the burden of throwing. +template <class ErrorT, ErrorT ErrorV> +struct unthrowable_wrapper : error_t<unthrowable_wrapper<ErrorT, ErrorV>> { + unthrowable_wrapper(const unthrowable_wrapper&) = delete; + [[nodiscard]] static const auto& make() { + static constexpr unthrowable_wrapper instance{}; + return instance; + } + + template<class Func> + static auto handle(Func&& func) { + return [ + func = std::forward<Func>(func) + ] (const unthrowable_wrapper&) mutable -> decltype(auto) { + if constexpr (std::is_invocable_v<Func, ErrorT>) { + return std::invoke(std::forward<Func>(func), ErrorV); + } else { + return std::invoke(std::forward<Func>(func)); + } + }; + } + + struct pass_further { + decltype(auto) operator()(const unthrowable_wrapper& e) { + return e; + } + }; + + struct discard { + decltype(auto) operator()(const unthrowable_wrapper&) { + } + }; + + +private: + // can be used only to initialize the `instance` member + explicit unthrowable_wrapper() = default; + + // implement the errorable interface + struct throwable_carrier{}; + static std::exception_ptr carrier_instance; + + static constexpr const std::type_info& exception_ptr_type_info() { + return typeid(throwable_carrier); + } + auto to_exception_ptr() const { + // error codes don't need to instantiate `std::exception_ptr` each + // time as the code is actually a part of the type itself. + // `std::make_exception_ptr()` on modern enough GCCs is quite cheap + // (see the Gleb Natapov's patch eradicating throw/catch there), + // but using one instance per type boils down the overhead to just + // ref-counting. + return carrier_instance; + } + static const auto& from_exception_ptr(std::exception_ptr) { + return make(); + } + + friend class error_t<unthrowable_wrapper<ErrorT, ErrorV>>; +}; + +template <class ErrorT, ErrorT ErrorV> +std::exception_ptr unthrowable_wrapper<ErrorT, ErrorV>::carrier_instance = \ + std::make_exception_ptr< + unthrowable_wrapper<ErrorT, ErrorV>::throwable_carrier>({}); + + +template <class ErrorT> +struct stateful_error_t : error_t<stateful_error_t<ErrorT>> { + template <class... Args> + explicit stateful_error_t(Args&&... 
args) + : ep(std::make_exception_ptr<ErrorT>(std::forward<Args>(args)...)) { + } + + template<class Func> + static auto handle(Func&& func) { + static_assert(std::is_invocable_v<Func, ErrorT>); + return [ + func = std::forward<Func>(func) + ] (stateful_error_t<ErrorT>&& e) mutable -> decltype(auto) { + try { + std::rethrow_exception(e.ep); + } catch (const ErrorT& obj) { + return std::invoke(std::forward<Func>(func), obj); + } + ceph_abort_msg("exception type mismatch – impossible!"); + }; + } + +private: + std::exception_ptr ep; + + explicit stateful_error_t(std::exception_ptr ep) : ep(std::move(ep)) {} + + static constexpr const std::type_info& exception_ptr_type_info() { + return typeid(ErrorT); + } + auto to_exception_ptr() const { + return ep; + } + static stateful_error_t<ErrorT> from_exception_ptr(std::exception_ptr ep) { + return stateful_error_t<ErrorT>(std::move(ep)); + } + + friend class error_t<stateful_error_t<ErrorT>>; +}; + +namespace _impl { + template <class T> struct always_false : std::false_type {}; +}; + +template <class ErrorVisitorT, class FuturatorT> +class maybe_handle_error_t { + const std::type_info& type_info; + typename FuturatorT::type result; + ErrorVisitorT errfunc; + +public: + maybe_handle_error_t(ErrorVisitorT&& errfunc, std::exception_ptr ep) + : type_info(*ep.__cxa_exception_type()), + result(FuturatorT::make_exception_future(std::move(ep))), + errfunc(std::forward<ErrorVisitorT>(errfunc)) { + } + + template <class ErrorT> + void handle() { + static_assert(std::is_invocable<ErrorVisitorT, ErrorT>::value, + "provided Error Visitor is not exhaustive"); + // In C++ throwing an exception isn't the sole way to signal + // error with it. This approach nicely fits cold, infrequent cases + // but when applied to a hot one, it will likely hurt performance. + // + // Alternative approach is to create `std::exception_ptr` on our + // own and place it in the future via `make_exception_future()`. + // When it comes to handling, the pointer can be interrogated for + // pointee's type with `__cxa_exception_type()` instead of costly + // re-throwing (via `std::rethrow_exception()`) and matching with + // `catch`. The limitation here is lack of support for hierarchies + // of exceptions. The code below checks for exact match only while + // `catch` would allow to match against a base class as well. + // However, this shouldn't be a big issue for `errorator` as Error + // Visitors are already checked for exhaustiveness at compile-time. + // + // NOTE: `__cxa_exception_type()` is an extension of the language. + // It should be available both in GCC and Clang but a fallback + // (based on `std::rethrow_exception()` and `catch`) can be made + // to handle other platforms if necessary. + if (type_info == ErrorT::error_t::get_exception_ptr_type_info()) { + // set `state::invalid` in internals of `seastar::future` to not + // call `report_failed_future()` during `operator=()`. + [[maybe_unused]] auto&& ep = std::move(result).get_exception(); + + using return_t = std::invoke_result_t<ErrorVisitorT, ErrorT>; + if constexpr (std::is_assignable_v<decltype(result), return_t>) { + result = std::invoke(std::forward<ErrorVisitorT>(errfunc), + ErrorT::error_t::from_exception_ptr(std::move(ep))); + } else if constexpr (std::is_same_v<return_t, void>) { + // void denotes explicit discarding + // execute for the sake a side effects. Typically this boils down + // to throwing an exception by the handler. 
+ std::invoke(std::forward<ErrorVisitorT>(errfunc), + ErrorT::error_t::from_exception_ptr(std::move(ep))); + } else { + static_assert(_impl::always_false<return_t>::value, + "return of Error Visitor is not assignable to future"); + // do nothing with `ep`. + } + } + } + + auto get_result() && { + return std::move(result); + } +}; + +template <class FuncHead, class... FuncTail> +static constexpr auto composer(FuncHead&& head, FuncTail&&... tail) { + return [ + head = std::forward<FuncHead>(head), + // perfect forwarding in lambda's closure isn't available in C++17 + // using tuple as workaround; see: https://stackoverflow.com/a/49902823 + tail = std::make_tuple(std::forward<FuncTail>(tail)...) + ] (auto&&... args) mutable -> decltype(auto) { + if constexpr (std::is_invocable_v<FuncHead, decltype(args)...>) { + return std::invoke(std::forward<FuncHead>(head), + std::forward<decltype(args)>(args)...); + } else if constexpr (sizeof...(FuncTail) > 0) { + using next_composer_t = decltype(composer<FuncTail...>); + auto&& next = std::apply<next_composer_t>(composer<FuncTail...>, + std::move(tail)); + return std::invoke(std::move(next), + std::forward<decltype(args)>(args)...); + } else { + static_assert( + std::is_invocable_v<FuncHead, decltype(args)...> || + (sizeof...(FuncTail) > 0), + "composition is not exhaustive"); + } + }; +} + +template <class ValueT> +struct errorated_future_marker{}; + +template <class... AllowedErrors> +struct errorator { + template <class T> + static inline constexpr bool is_error_v = std::is_base_of_v<error_t<T>, T>; + + static_assert((... && is_error_v<AllowedErrors>), + "errorator expects presence of ::is_error in all error types"); + + template <class ErrorT> + struct contains_once { + static constexpr bool value = + (0 + ... + std::is_same_v<ErrorT, AllowedErrors>) == 1; + }; + template <class... Errors> + struct contains_once<errorator<Errors...>> { + static constexpr bool value = (... && contains_once<Errors>::value); + }; + template <class T> + static constexpr bool contains_once_v = contains_once<T>::value; + + static_assert((... && contains_once_v<AllowedErrors>), + "no error type in errorator can be duplicated"); + + struct ready_future_marker{}; + struct exception_future_marker{}; + +private: + // see the comment for `using future = _future` below. + template <class> + class _future {}; + template <class ValueT> + class _future<::crimson::errorated_future_marker<ValueT>> + : private seastar::future<ValueT> { + using base_t = seastar::future<ValueT>; + // we need the friendship for the sake of `get_exception() &&` when + // `safe_then()` is going to return an errorated future as a result of + // chaining. In contrast to `seastar::future`, errorator<T...>::future` + // has this member private. + template <class ErrorVisitor, class Futurator> + friend class maybe_handle_error_t; + + // any `seastar::futurize` specialization must be able to access the base. + // see : `satisfy_with_result_of()` far below. + template <typename> + friend class seastar::futurize; + + template <typename T1, typename T2, typename... More> + friend auto seastar::internal::do_with_impl(T1&& rv1, T2&& rv2, More&&... more); + + template <class, class = std::void_t<>> + struct get_errorator { + // generic template for non-errorated things (plain types and + // vanilla seastar::future as well). 
+ using type = errorator<>; + }; + template <class FutureT> + struct get_errorator<FutureT, + std::void_t<typename FutureT::errorator_type>> { + using type = typename FutureT::errorator_type; + }; + template <class T> + using get_errorator_t = typename get_errorator<T>::type; + + template <class ValueFuncErroratorT, class... ErrorVisitorRetsT> + struct make_errorator { + // NOP. The generic template. + }; + template <class... ValueFuncAllowedErrors, + class ErrorVisitorRetsHeadT, + class... ErrorVisitorRetsTailT> + struct make_errorator<errorator<ValueFuncAllowedErrors...>, + ErrorVisitorRetsHeadT, + ErrorVisitorRetsTailT...> { + private: + using step_errorator = errorator<ValueFuncAllowedErrors...>; + // add ErrorVisitorRetsHeadT only if 1) it's an error type and + // 2) isn't already included in the errorator's error set. + // It's enough to negate contains_once_v as any errorator<...> + // type is already guaranteed to be free of duplications. + using next_errorator = std::conditional_t< + is_error_v<ErrorVisitorRetsHeadT> && + !step_errorator::template contains_once_v<ErrorVisitorRetsHeadT>, + typename step_errorator::template extend<ErrorVisitorRetsHeadT>, + step_errorator>; + + public: + using type = typename make_errorator<next_errorator, + ErrorVisitorRetsTailT...>::type; + }; + // finish the recursion + template <class... ValueFuncAllowedErrors> + struct make_errorator<errorator<ValueFuncAllowedErrors...>> { + using type = ::crimson::errorator<ValueFuncAllowedErrors...>; + }; + template <class... Args> + using make_errorator_t = typename make_errorator<Args...>::type; + + using base_t::base_t; + + template <class Futurator, class Future, class ErrorVisitor> + [[gnu::noinline]] + static auto _safe_then_handle_errors(Future&& future, + ErrorVisitor&& errfunc) { + maybe_handle_error_t<ErrorVisitor, Futurator> maybe_handle_error( + std::forward<ErrorVisitor>(errfunc), + std::move(future).get_exception() + ); + (maybe_handle_error.template handle<AllowedErrors>() , ...); + return std::move(maybe_handle_error).get_result(); + } + + public: + using errorator_type = ::crimson::errorator<AllowedErrors...>; + using promise_type = seastar::promise<ValueT>; + + using base_t::available; + using base_t::failed; + // need this because of the legacy in PG::do_osd_ops(). + using base_t::handle_exception_type; + + [[gnu::always_inline]] + _future(base_t&& base) + : base_t(std::move(base)) { + } + + template <class... A> + [[gnu::always_inline]] + _future(ready_future_marker, A&&... 
a) + : base_t(::seastar::make_ready_future<ValueT>(std::forward<A>(a)...)) { + } + [[gnu::always_inline]] + _future(exception_future_marker, ::seastar::future_state_base&& state) noexcept + : base_t(::seastar::futurize<base_t>::make_exception_future(std::move(state))) { + } + [[gnu::always_inline]] + _future(exception_future_marker, std::exception_ptr&& ep) noexcept + : base_t(::seastar::futurize<base_t>::make_exception_future(std::move(ep))) { + } + + template <template <class...> class ErroratedFuture, + class = std::void_t< + typename ErroratedFuture< + ::crimson::errorated_future_marker<ValueT>>::errorator_type>> + operator ErroratedFuture<errorated_future_marker<ValueT>> () && { + using dest_errorator_t = \ + typename ErroratedFuture< + ::crimson::errorated_future_marker<ValueT>>::errorator_type; + static_assert(dest_errorator_t::template contains_once_v<errorator_type>, + "conversion is possible to more-or-eq errorated future!"); + return static_cast<base_t&&>(*this); + } + + // initialize future as failed without throwing. `make_exception_future()` + // internally uses `std::make_exception_ptr()`. cppreference.com shouldn't + // be misinterpreted when it says: + // + // "This is done as if executing the following code: + // try { + // throw e; + // } catch(...) { + // return std::current_exception(); + // }", + // + // the "as if" is absolutely crucial because modern GCCs employ optimized + // path for it. See: + // * https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=cce8e59224e18858749a2324bce583bcfd160d6c, + // * https://gcc.gnu.org/ml/gcc-patches/2016-08/msg00373.html. + // + // This behavior, combined with `__cxa_exception_type()` for inspecting + // exception's type, allows for throw/catch-free handling of stateless + // exceptions (which is fine for error codes). Stateful jumbos would be + // actually a bit harder as `_M_get()` is private, and thus rethrowing is + // necessary to get to the state inside. However, it's not unthinkable to + // see another extension bringing operator*() to the exception pointer... + // + // TODO: we don't really need to `make_exception_ptr` each time. It still + // allocates memory underneath while can be replaced with single instance + // per type created on start-up. + template <class ErrorT, + class DecayedT = std::decay_t<ErrorT>, + bool IsError = is_error_v<DecayedT>, + class = std::enable_if_t<IsError>> + _future(ErrorT&& e) + : base_t( + seastar::make_exception_future<ValueT>( + errorator_type::make_exception_ptr(e))) { + static_assert(errorator_type::contains_once_v<DecayedT>, + "ErrorT is not enlisted in errorator"); + } + + template <class ValueFuncT, class ErrorVisitorT> + auto safe_then(ValueFuncT&& valfunc, ErrorVisitorT&& errfunc) { + static_assert((... && std::is_invocable_v<ErrorVisitorT, + AllowedErrors>), + "provided Error Visitor is not exhaustive"); + + using value_func_result_t = + typename std::conditional_t<std::is_void_v<ValueT>, + std::invoke_result<ValueFuncT>, + std::invoke_result<ValueFuncT, ValueT>>::type; + // recognize whether there can be any error coming from the Value + // Function. + using value_func_errorator_t = get_errorator_t<value_func_result_t>; + // mutate the Value Function's errorator to harvest errors coming + // from the Error Visitor. Yes, it's perfectly fine to fail error + // handling at one step and delegate even broader set of issues + // to next continuation. 
+ using return_errorator_t = make_errorator_t< + value_func_errorator_t, + std::decay_t<std::invoke_result_t<ErrorVisitorT, AllowedErrors>>...>; + // OK, now we know about all errors next continuation must take + // care about. If Visitor handled everything and the Value Func + // doesn't return any, we'll finish with errorator<>::future + // which is just vanilla seastar::future – that's it, next cont + // finally could use `.then()`! + using futurator_t = \ + typename return_errorator_t::template futurize<value_func_result_t>; + // `seastar::futurize`, used internally by `then_wrapped()`, would + // wrap any non-`seastar::future` type coming from Value Func into + // `seastar::future`. As we really don't want to end with things + // like `seastar::future<errorator::future<...>>`, we need either: + // * convert the errorated future into plain in the lambda below + // and back here or + // * specialize the `seastar::futurize<T>` to get proper kind of + // future directly from `::then_wrapped()`. + // As C++17 doesn't guarantee copy elision when non-same types are + // involved while examination of assemblies from GCC 8.1 confirmed + // extra copying, switch to the second approach has been made. + return this->then_wrapped( + [ valfunc = std::forward<ValueFuncT>(valfunc), + errfunc = std::forward<ErrorVisitorT>(errfunc) + ] (auto&& future) mutable noexcept { + if (__builtin_expect(future.failed(), false)) { + return _safe_then_handle_errors<futurator_t>( + std::move(future), std::forward<ErrorVisitorT>(errfunc)); + } else { + // NOTE: using `seastar::future::get()` here is a bit bloaty + // as the method rechecks availability of future's value and, + // if it's unavailable, does the `::do_wait()` path (yes, it + // targets `seastar::thread`). Actually this is dead code as + // `then_wrapped()` executes the lambda only when the future + // is available (which means: failed or ready). However, GCC + // hasn't optimized it out: + // + // if (__builtin_expect(future.failed(), false)) { + // ea25: 48 83 bd c8 fe ff ff cmpq $0x2,-0x138(%rbp) + // ea2c: 02 + // ea2d: 0f 87 f0 05 00 00 ja f023 <ceph::osd:: + // ... + // /// If get() is called in a \ref seastar::thread context, + // /// then it need not be available; instead, the thread will + // /// be paused until the future becomes available. + // [[gnu::always_inline]] + // std::tuple<T...> get() { + // if (!_state.available()) { + // ea3a: 0f 85 1b 05 00 00 jne ef5b <ceph::osd:: + // } + // ... + // + // I don't perceive this as huge issue. Though, it cannot be + // claimed errorator has 0 overhead on hot path. The perfect + // solution here would be mark the `::get_available_state()` + // as `protected` and use dedicated `get_value()` exactly as + // `::then()` already does. + return futurator_t::invoke(std::forward<ValueFuncT>(valfunc), + std::move(future).get()); + } + }); + } + + /** + * unsafe_thread_get + * + * Only valid within a seastar_thread. Ignores errorator protections + * and throws any contained exceptions. + * + * Should really only be used within test code + * (see test/crimson/gtest_seastar.h). 
+ */ + auto &&unsafe_get() { + return seastar::future<ValueT>::get(); + } + auto unsafe_get0() { + return seastar::future<ValueT>::get0(); + } + + template <class FuncT> + _future finally(FuncT &&func) { + return this->then_wrapped( + [func = std::forward<FuncT>(func)](auto &&result) mutable noexcept { + if constexpr (seastar::is_future<std::invoke_result_t<FuncT>>::value) { + return ::seastar::futurize_invoke(std::forward<FuncT>(func)).then_wrapped( + [result = std::move(result)](auto&& f_res) mutable { + // TODO: f_res.failed() + (void)f_res.discard_result(); + return std::move(result); + }); + } else { + try { + func(); + } catch (...) { + // TODO: rethrow + } + return std::move(result); + } + }); + } + + // taking ErrorFuncOne and ErrorFuncTwo separately from ErrorFuncTail + // to avoid SFINAE + template <class ValueFunc, + class ErrorFuncHead, + class... ErrorFuncTail> + auto safe_then(ValueFunc&& value_func, + ErrorFuncHead&& error_func_head, + ErrorFuncTail&&... error_func_tail) { + static_assert(sizeof...(ErrorFuncTail) > 0); + return safe_then( + std::forward<ValueFunc>(value_func), + composer(std::forward<ErrorFuncHead>(error_func_head), + std::forward<ErrorFuncTail>(error_func_tail)...)); + } + + template <class ValueFunc> + auto safe_then(ValueFunc&& value_func) { + return safe_then(std::forward<ValueFunc>(value_func), + errorator_type::pass_further{}); + } + + template <class Func> + void then(Func&&) = delete; + + template <class ErrorVisitorT> + auto handle_error(ErrorVisitorT&& errfunc) { + static_assert((... && std::is_invocable_v<ErrorVisitorT, + AllowedErrors>), + "provided Error Visitor is not exhaustive"); + using return_errorator_t = make_errorator_t< + errorator<>, + std::decay_t<std::invoke_result_t<ErrorVisitorT, AllowedErrors>>...>; + using futurator_t = \ + typename return_errorator_t::template futurize<::seastar::future<ValueT>>; + return this->then_wrapped( + [ errfunc = std::forward<ErrorVisitorT>(errfunc) + ] (auto&& future) mutable noexcept { + if (__builtin_expect(future.failed(), false)) { + return _safe_then_handle_errors<futurator_t>( + std::move(future), std::forward<ErrorVisitorT>(errfunc)); + } else { + return typename futurator_t::type{ std::move(future) }; + } + }); + } + template <class ErrorFuncHead, + class... ErrorFuncTail> + auto handle_error(ErrorFuncHead&& error_func_head, + ErrorFuncTail&&... error_func_tail) { + static_assert(sizeof...(ErrorFuncTail) > 0); + return this->handle_error( + composer(std::forward<ErrorFuncHead>(error_func_head), + std::forward<ErrorFuncTail>(error_func_tail)...)); + } + + private: + // for ::crimson::do_for_each + template <class Func> + auto _then(Func&& func) { + return base_t::then(std::forward<Func>(func)); + } + template<typename Iterator, typename AsyncAction> + friend inline auto ::crimson::do_for_each(Iterator begin, + Iterator end, + AsyncAction action); + + template<typename AsyncAction> + friend inline auto ::crimson::do_until(AsyncAction action); + + template <typename Result> + friend class ::seastar::future; + + // let seastar::do_with_impl to up-cast us to seastar::future. + template<typename T, typename F> + friend inline auto ::seastar::internal::do_with_impl(T&& rvalue, F&& f); + template<typename T1, typename T2, typename T3_or_F, typename... More> + friend inline auto ::seastar::internal::do_with_impl(T1&& rv1, T2&& rv2, T3_or_F&& rv3, More&&... 
more); + }; + + class Enabler {}; + + template <typename T> + using EnableIf = typename std::enable_if<contains_once_v<std::decay_t<T>>, Enabler>::type; + + template <typename ErrorFunc> + struct all_same_way_t { + ErrorFunc func; + all_same_way_t(ErrorFunc &&error_func) + : func(std::forward<ErrorFunc>(error_func)) {} + + template <typename ErrorT, EnableIf<ErrorT>...> + decltype(auto) operator()(ErrorT&& e) { + using decayed_t = std::decay_t<decltype(e)>; + auto&& handler = + decayed_t::error_t::handle(std::forward<ErrorFunc>(func)); + static_assert(std::is_invocable_v<decltype(handler), ErrorT>); + return std::invoke(std::move(handler), std::forward<ErrorT>(e)); + } + }; + +public: + // HACK: `errorated_future_marker` and `_future` is just a hack to + // specialize `seastar::futurize` for category of class templates: + // `future<...>` from distinct errorators. Such tricks are usually + // performed basing on SFINAE and `std::void_t` to check existence + // of a trait/member (`future<...>::errorator_type` in our case). + // Unfortunately, this technique can't be applied as the `futurize` + // lacks the optional parameter. The problem looks awfully similar + // to following SO item: https://stackoverflow.com/a/38860413. + template <class ValueT=void> + using future = _future<::crimson::errorated_future_marker<ValueT>>; + + // the visitor that forwards handling of all errors to next continuation + struct pass_further { + template <class ErrorT, EnableIf<ErrorT>...> + decltype(auto) operator()(ErrorT&& e) { + static_assert(contains_once_v<std::decay_t<ErrorT>>, + "passing further disallowed ErrorT"); + return std::forward<ErrorT>(e); + } + }; + + struct discard_all { + template <class ErrorT, EnableIf<ErrorT>...> + void operator()(ErrorT&&) { + static_assert(contains_once_v<std::decay_t<ErrorT>>, + "discarding disallowed ErrorT"); + } + }; + + // assert_all{ "TODO" }; + class assert_all { + const char* const msg = nullptr; + public: + template <std::size_t N> + assert_all(const char (&msg)[N]) + : msg(msg) { + } + assert_all() = default; + + template <class ErrorT, EnableIf<ErrorT>...> + void operator()(ErrorT&&) { + static_assert(contains_once_v<std::decay_t<ErrorT>>, + "discarding disallowed ErrorT"); + if (msg) { + ceph_abort_msg(msg); + } else { + ceph_abort(); + } + } + }; + + template <class ErrorFunc> + static decltype(auto) all_same_way(ErrorFunc&& error_func) { + return all_same_way_t<ErrorFunc>{std::forward<ErrorFunc>(error_func)}; + }; + + // get a new errorator by extending current one with new error + template <class... NewAllowedErrorsT> + using extend = errorator<AllowedErrors..., NewAllowedErrorsT...>; + + // get a new errorator by summing and deduplicating error set of + // the errorator `unify<>` is applied on with another errorator + // provided as template parameter. + template <class OtherErroratorT> + struct unify { + // 1st: generic NOP template + }; + template <class OtherAllowedErrorsHead, + class... OtherAllowedErrorsTail> + struct unify<errorator<OtherAllowedErrorsHead, + OtherAllowedErrorsTail...>> { + private: + // 2nd: specialization for errorators with non-empty error set. + // + // split error set of other errorator, passed as template param, + // into head and tail. Mix error set of this errorator with head + // of the other one only if it isn't already present in the set. 
+ using step_errorator = std::conditional_t< + contains_once_v<OtherAllowedErrorsHead> == false, + errorator<AllowedErrors..., OtherAllowedErrorsHead>, + errorator<AllowedErrors...>>; + using rest_errorator = errorator<OtherAllowedErrorsTail...>; + + public: + using type = typename step_errorator::template unify<rest_errorator>::type; + }; + template <class... EmptyPack> + struct unify<errorator<EmptyPack...>> { + // 3rd: recursion finisher + static_assert(sizeof...(EmptyPack) == 0); + using type = errorator<AllowedErrors...>; + }; + + template <typename T=void, typename... A> + static future<T> make_ready_future(A&&... value) { + return future<T>(ready_future_marker(), std::forward<A>(value)...); + } + + template <typename T=void> + static + future<T> make_exception_future2(std::exception_ptr&& ex) noexcept { + return future<T>(exception_future_marker(), std::move(ex)); + } + template <typename T=void> + static + future<T> make_exception_future2(seastar::future_state_base&& state) noexcept { + return future<T>(exception_future_marker(), std::move(state)); + } + template <typename T=void, typename Exception> + static + future<T> make_exception_future2(Exception&& ex) noexcept { + return make_exception_future2<T>(std::make_exception_ptr(std::forward<Exception>(ex))); + } + + static auto now() { + return make_ready_future<>(); + } + +private: + template <class T, class = std::void_t<T>> + class futurize { + using vanilla_futurize = seastar::futurize<T>; + + // explicit specializations for nested type is not allowed unless both + // the member template and the enclosing template are specialized. see + // section temp.expl.spec, N4659 + template <class Stored, int Dummy = 0> + struct stored_to_future { + using type = future<Stored>; + }; + template <int Dummy> + struct stored_to_future <seastar::internal::monostate, Dummy> { + using type = future<>; + }; + + public: + using type = + typename stored_to_future<typename vanilla_futurize::value_type>::type; + + template <class Func, class... Args> + static type invoke(Func&& func, Args&&... args) { + try { + return vanilla_futurize::invoke(std::forward<Func>(func), + std::forward<Args>(args)...); + } catch (...) { + return make_exception_future(std::current_exception()); + } + } + + template <class Func> + static type invoke(Func&& func, seastar::internal::monostate) { + try { + return vanilla_futurize::invoke(std::forward<Func>(func)); + } catch (...) { + return make_exception_future(std::current_exception()); + } + } + + template <typename Arg> + static type make_exception_future(Arg&& arg) { + return vanilla_futurize::make_exception_future(std::forward<Arg>(arg)); + } + }; + template <template <class...> class ErroratedFutureT, + class ValueT> + class futurize<ErroratedFutureT<::crimson::errorated_future_marker<ValueT>>, + std::void_t< + typename ErroratedFutureT< + ::crimson::errorated_future_marker<ValueT>>::errorator_type>> { + public: + using type = ::crimson::errorator<AllowedErrors...>::future<ValueT>; + + template <class Func, class... Args> + static type apply(Func&& func, std::tuple<Args...>&& args) { + try { + return ::seastar::futurize_apply(std::forward<Func>(func), + std::forward<std::tuple<Args...>>(args)); + } catch (...) { + return make_exception_future(std::current_exception()); + } + } + + template <class Func, class... Args> + static type invoke(Func&& func, Args&&... args) { + try { + return ::seastar::futurize_invoke(std::forward<Func>(func), + std::forward<Args>(args)...); + } catch (...) 
{ + return make_exception_future(std::current_exception()); + } + } + + template <class Func> + static type invoke(Func&& func, seastar::internal::monostate) { + try { + return ::seastar::futurize_invoke(std::forward<Func>(func)); + } catch (...) { + return make_exception_future(std::current_exception()); + } + } + + template <typename Arg> + static type make_exception_future(Arg&& arg) { + return ::crimson::errorator<AllowedErrors...>::make_exception_future2<ValueT>(std::forward<Arg>(arg)); + } + }; + + template <class ErrorT> + static std::exception_ptr make_exception_ptr(ErrorT&& e) { + // calling via interface class due to encapsulation and friend relations. + return e.error_t<std::decay_t<ErrorT>>::to_exception_ptr(); + } + + // needed because of: + // * return_errorator_t::template futurize<...> in `safe_then()`, + // * conversion to `std::exception_ptr` in `future::future(ErrorT&&)`. + // the friendship with all errorators is an idea from Kefu to fix build + // issues on GCC 9. This version likely fixes some access violation bug + // we were exploiting before. + template <class...> + friend class errorator; +}; // class errorator, generic template + +// no errors? errorator<>::future is plain seastar::future then! +template <> +class errorator<> { +public: + template <class ValueT> + using future = ::seastar::future<ValueT>; + + template <class T> + using futurize = ::seastar::futurize<T>; + + // get a new errorator by extending current one with new error + template <class... NewAllowedErrors> + using extend = errorator<NewAllowedErrors...>; + + // errorator with empty error set never contains any error + template <class T> + static constexpr bool contains_once_v = false; +}; // class errorator, <> specialization + + +template <class ErroratorOne, + class ErroratorTwo, + class... FurtherErrators> +struct compound_errorator { +private: + // generic template. Empty `FurtherErrators` are handled by + // the specialization below. + static_assert(sizeof...(FurtherErrators) > 0); + using step = + typename compound_errorator<ErroratorOne, ErroratorTwo>::type; + +public: + using type = + typename compound_errorator<step, FurtherErrators...>::type; +}; +template <class ErroratorOne, + class ErroratorTwo> +struct compound_errorator<ErroratorOne, ErroratorTwo> { + // specialization for empty `FurtherErrators` arg pack + using type = + typename ErroratorOne::template unify<ErroratorTwo>::type; +}; +template <class... Args> +using compound_errorator_t = typename compound_errorator<Args...>::type; + +// this is conjunction of two nasty features: C++14's variable template +// and inline global variable of C++17. The latter is crucial to ensure +// the variable will get the same address across all translation units. 
+template <std::errc ErrorV> +inline std::error_code ec = std::make_error_code(ErrorV); + +template <std::errc ErrorV> +using ct_error_code = unthrowable_wrapper<const std::error_code&, ec<ErrorV>>; + +namespace ct_error { + using enoent = ct_error_code<std::errc::no_such_file_or_directory>; + using enodata = ct_error_code<std::errc::no_message_available>; + using invarg = ct_error_code<std::errc::invalid_argument>; + using input_output_error = ct_error_code<std::errc::io_error>; + using object_corrupted = ct_error_code<std::errc::illegal_byte_sequence>; + using permission_denied = ct_error_code<std::errc::permission_denied>; + using operation_not_supported = + ct_error_code<std::errc::operation_not_supported>; + using not_connected = ct_error_code<std::errc::not_connected>; + using timed_out = ct_error_code<std::errc::timed_out>; + using erange = + ct_error_code<std::errc::result_out_of_range>; + using ebadf = + ct_error_code<std::errc::bad_file_descriptor>; + using enospc = + ct_error_code<std::errc::no_space_on_device>; + using value_too_large = ct_error_code<std::errc::value_too_large>; + using eagain = + ct_error_code<std::errc::resource_unavailable_try_again>; + using file_too_large = + ct_error_code<std::errc::file_too_large>; + using address_in_use = ct_error_code<std::errc::address_in_use>; + + struct pass_further_all { + template <class ErrorT> + decltype(auto) operator()(ErrorT&& e) { + return std::forward<ErrorT>(e); + } + }; + + struct discard_all { + template <class ErrorT> + void operator()(ErrorT&&) { + } + }; + + class assert_all { + const char* const msg = nullptr; + public: + template <std::size_t N> + assert_all(const char (&msg)[N]) + : msg(msg) { + } + assert_all() = default; + + template <class ErrorT> + void operator()(ErrorT&&) { + if (msg) { + ceph_abort(msg); + } else { + ceph_abort(); + } + } + }; + + template <class ErrorFunc> + static decltype(auto) all_same_way(ErrorFunc&& error_func) { + return [ + error_func = std::forward<ErrorFunc>(error_func) + ] (auto&& e) mutable -> decltype(auto) { + using decayed_t = std::decay_t<decltype(e)>; + auto&& handler = + decayed_t::error_t::handle(std::forward<ErrorFunc>(error_func)); + return std::invoke(std::move(handler), std::forward<decltype(e)>(e)); + }; + }; +} + +using stateful_errc = stateful_error_t<std::errc>; +using stateful_errint = stateful_error_t<int>; +using stateful_ec = stateful_error_t<std::error_code>; + +} // namespace crimson + + +// open the `seastar` namespace to specialize `futurize`. This is not +// pretty for sure. I just hope it's not worse than e.g. specializing +// `hash` in the `std` namespace. The justification is copy avoidance +// in `future<...>::safe_then()`. See the comments there for details. +namespace seastar { + +// Container is a placeholder for errorator::_future<> template +template <template <class> class Container, + class Value> +struct futurize<Container<::crimson::errorated_future_marker<Value>>> { + using errorator_type = typename Container< + ::crimson::errorated_future_marker<Value>>::errorator_type; + + using type = typename errorator_type::template future<Value>; + using value_type = seastar::internal::future_stored_type_t<Value>; + + template<typename Func, typename... FuncArgs> + [[gnu::always_inline]] + static inline type invoke(Func&& func, FuncArgs&&... args) noexcept { + try { + return func(std::forward<FuncArgs>(args)...); + } catch (...) 
{ + return make_exception_future(std::current_exception()); + } + } + + template <class Func> + [[gnu::always_inline]] + static type invoke(Func&& func, seastar::internal::monostate) noexcept { + try { + return func(); + } catch (...) { + return make_exception_future(std::current_exception()); + } + } + + template <typename Arg> + [[gnu::always_inline]] + static type make_exception_future(Arg&& arg) { + return errorator_type::template make_exception_future2<Value>(std::forward<Arg>(arg)); + } + +private: + template<typename PromiseT, typename Func> + static void satisfy_with_result_of(PromiseT&& pr, Func&& func) { + // this may use the protected variant of `seastar::future::forward_to()` + // because: + // 1. `seastar::future` established a friendship with with all + // specializations of `seastar::futurize`, including this + // one (we're in the `seastar` namespace!) WHILE + // 2. any errorated future declares now the friendship with any + // `seastar::futurize<...>`. + func().forward_to(std::move(pr)); + } + template <typename U> + friend class future; +}; + +template <template <class> class Container, + class Value> +struct continuation_base_from_future<Container<::crimson::errorated_future_marker<Value>>> { + using type = continuation_base<Value>; +}; + +} // namespace seastar diff --git a/src/crimson/common/exception.h b/src/crimson/common/exception.h new file mode 100644 index 000000000..05caf5ebd --- /dev/null +++ b/src/crimson/common/exception.h @@ -0,0 +1,50 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <exception> +#include <seastar/core/future.hh> +#include <seastar/core/future-util.hh> + +#include "crimson/common/log.h" + +namespace crimson::common { + +class system_shutdown_exception final : public std::exception{ +public: + const char* what() const noexcept final { + return "system shutting down"; + } +}; + +class actingset_changed final : public std::exception { +public: + actingset_changed(bool sp) : still_primary(sp) {} + const char* what() const noexcept final { + return "acting set changed"; + } + bool is_primary() const { + return still_primary; + } +private: + const bool still_primary; +}; + +template<typename Func, typename... Args> +inline seastar::future<> handle_system_shutdown(Func&& func, Args&&... args) +{ + return seastar::futurize_invoke(std::forward<Func>(func), + std::forward<Args>(args)...) 
+ .handle_exception([](std::exception_ptr eptr) { + if (*eptr.__cxa_exception_type() == + typeid(crimson::common::system_shutdown_exception)) { + crimson::get_logger(ceph_subsys_osd).debug( + "operation skipped, system shutdown"); + return seastar::now(); + } + std::rethrow_exception(eptr); + }); +} + +} diff --git a/src/crimson/common/fixed_kv_node_layout.h b/src/crimson/common/fixed_kv_node_layout.h new file mode 100644 index 000000000..4c7cc2e76 --- /dev/null +++ b/src/crimson/common/fixed_kv_node_layout.h @@ -0,0 +1,700 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> + +#include "include/byteorder.h" + +#include "crimson/common/layout.h" + +namespace crimson::common { + +template <typename T, bool is_const> +struct maybe_const_t { +}; +template<typename T> +struct maybe_const_t<T, true> { + using type = const T*; +}; +template<typename T> +struct maybe_const_t<T, false> { + using type = T*; +}; + + +/** + * FixedKVNodeLayout + * + * Reusable implementation of a fixed size block mapping + * K -> V with internal representations KINT and VINT. + * + * Uses absl::container_internal::Layout for the actual memory layout. + * + * The primary interface exposed is centered on the iterator + * and related methods. + * + * Also included are helpers for doing splits and merges as for a btree. + */ +template < + size_t CAPACITY, + typename Meta, + typename MetaInt, + typename K, + typename KINT, + typename V, + typename VINT, + bool VALIDATE_INVARIANTS=true> +class FixedKVNodeLayout { + char *buf = nullptr; + + using L = absl::container_internal::Layout<ceph_le32, MetaInt, KINT, VINT>; + static constexpr L layout{1, 1, CAPACITY, CAPACITY}; + +public: + template <bool is_const> + struct iter_t { + friend class FixedKVNodeLayout; + using parent_t = typename maybe_const_t<FixedKVNodeLayout, is_const>::type; + + parent_t node; + uint16_t offset; + + iter_t( + parent_t parent, + uint16_t offset) : node(parent), offset(offset) {} + + iter_t(const iter_t &) = default; + iter_t(iter_t &&) = default; + iter_t &operator=(const iter_t &) = default; + iter_t &operator=(iter_t &&) = default; + + operator iter_t<!is_const>() const { + static_assert(!is_const); + return iter_t<!is_const>(node, offset); + } + + // Work nicely with for loops without requiring a nested type. 
+ iter_t &operator*() { return *this; } + iter_t *operator->() { return this; } + + iter_t operator++(int) { + auto ret = *this; + ++offset; + return ret; + } + + iter_t &operator++() { + ++offset; + return *this; + } + + uint16_t operator-(const iter_t &rhs) const { + assert(rhs.node == node); + return offset - rhs.offset; + } + + iter_t operator+(uint16_t off) const { + return iter_t( + node, + offset + off); + } + iter_t operator-(uint16_t off) const { + return iter_t( + node, + offset - off); + } + + bool operator==(const iter_t &rhs) const { + assert(node == rhs.node); + return rhs.offset == offset; + } + + bool operator!=(const iter_t &rhs) const { + return !(*this == rhs); + } + + K get_key() const { + return K(node->get_key_ptr()[offset]); + } + + K get_next_key_or_max() const { + auto next = *this + 1; + if (next == node->end()) + return std::numeric_limits<K>::max(); + else + return next->get_key(); + } + + void set_val(V val) const { + static_assert(!is_const); + node->get_val_ptr()[offset] = VINT(val); + } + + V get_val() const { + return V(node->get_val_ptr()[offset]); + }; + + bool contains(K addr) const { + return (get_key() <= addr) && (get_next_key_or_max() > addr); + } + + uint16_t get_offset() const { + return offset; + } + + private: + void set_key(K _lb) const { + static_assert(!is_const); + KINT lb; + lb = _lb; + node->get_key_ptr()[offset] = lb; + } + + typename maybe_const_t<char, is_const>::type get_key_ptr() const { + return reinterpret_cast< + typename maybe_const_t<char, is_const>::type>( + node->get_key_ptr() + offset); + } + + typename maybe_const_t<char, is_const>::type get_val_ptr() const { + return reinterpret_cast< + typename maybe_const_t<char, is_const>::type>( + node->get_val_ptr() + offset); + } + }; + using const_iterator = iter_t<true>; + using iterator = iter_t<false>; + + struct delta_t { + enum class op_t : uint8_t { + INSERT, + REMOVE, + UPDATE, + } op; + KINT key; + VINT val; + + void replay(FixedKVNodeLayout &l) { + switch (op) { + case op_t::INSERT: { + l.insert(l.lower_bound(key), key, val); + break; + } + case op_t::REMOVE: { + auto iter = l.find(key); + assert(iter != l.end()); + l.remove(iter); + break; + } + case op_t::UPDATE: { + auto iter = l.find(key); + assert(iter != l.end()); + l.update(iter, val); + break; + } + default: + assert(0 == "Impossible"); + } + } + + bool operator==(const delta_t &rhs) const { + return op == rhs.op && + key == rhs.key && + val == rhs.val; + } + }; + +public: + class delta_buffer_t { + std::vector<delta_t> buffer; + public: + bool empty() const { + return buffer.empty(); + } + void insert( + const K &key, + const V &val) { + KINT k; + k = key; + buffer.push_back( + delta_t{ + delta_t::op_t::INSERT, + k, + VINT(val) + }); + } + void update( + const K &key, + const V &val) { + KINT k; + k = key; + buffer.push_back( + delta_t{ + delta_t::op_t::UPDATE, + k, + VINT(val) + }); + } + void remove(const K &key) { + KINT k; + k = key; + buffer.push_back( + delta_t{ + delta_t::op_t::REMOVE, + k, + VINT() + }); + } + void replay(FixedKVNodeLayout &node) { + for (auto &i: buffer) { + i.replay(node); + } + } + size_t get_bytes() const { + return buffer.size() * sizeof(delta_t); + } + void copy_out(char *out, size_t len) { + assert(len == get_bytes()); + ::memcpy(out, reinterpret_cast<const void *>(buffer.data()), get_bytes()); + buffer.clear(); + } + void copy_in(const char *out, size_t len) { + assert(empty()); + assert(len % sizeof(delta_t) == 0); + buffer = std::vector( + reinterpret_cast<const delta_t*>(out), + 
reinterpret_cast<const delta_t*>(out + len)); + } + bool operator==(const delta_buffer_t &rhs) const { + return buffer == rhs.buffer; + } + }; + + void journal_insert( + const_iterator _iter, + const K &key, + const V &val, + delta_buffer_t *recorder) { + auto iter = iterator(this, _iter.offset); + if (recorder) { + recorder->insert( + key, + val); + } + insert(iter, key, val); + } + + void journal_update( + const_iterator _iter, + const V &val, + delta_buffer_t *recorder) { + auto iter = iterator(this, _iter.offset); + if (recorder) { + recorder->update(iter->get_key(), val); + } + update(iter, val); + } + + void journal_replace( + const_iterator _iter, + const K &key, + const V &val, + delta_buffer_t *recorder) { + auto iter = iterator(this, _iter.offset); + if (recorder) { + recorder->remove(iter->get_key()); + recorder->insert(key, val); + } + replace(iter, key, val); + } + + + void journal_remove( + const_iterator _iter, + delta_buffer_t *recorder) { + auto iter = iterator(this, _iter.offset); + if (recorder) { + recorder->remove(iter->get_key()); + } + remove(iter); + } + + + FixedKVNodeLayout(char *buf) : + buf(buf) {} + + virtual ~FixedKVNodeLayout() = default; + + const_iterator begin() const { + return const_iterator( + this, + 0); + } + + const_iterator end() const { + return const_iterator( + this, + get_size()); + } + + iterator begin() { + return iterator( + this, + 0); + } + + iterator end() { + return iterator( + this, + get_size()); + } + + const_iterator iter_idx(uint16_t off) const { + return const_iterator( + this, + off); + } + + const_iterator find(K l) const { + auto ret = begin(); + for (; ret != end(); ++ret) { + if (ret->get_key() == l) + break; + } + return ret; + } + iterator find(K l) { + const auto &tref = *this; + return iterator(this, tref.find(l).offset); + } + + const_iterator lower_bound(K l) const { + auto ret = begin(); + for (; ret != end(); ++ret) { + if (ret->get_key() >= l) + break; + } + return ret; + } + iterator lower_bound(K l) { + const auto &tref = *this; + return iterator(this, tref.lower_bound(l).offset); + } + + const_iterator upper_bound(K l) const { + auto ret = begin(); + for (; ret != end(); ++ret) { + if (ret->get_key() > l) + break; + } + return ret; + } + iterator upper_bound(K l) { + const auto &tref = *this; + return iterator(this, tref.upper_bound(l).offset); + } + + const_iterator get_split_pivot() const { + return iter_idx(get_size() / 2); + } + + uint16_t get_size() const { + return *layout.template Pointer<0>(buf); + } + + /** + * set_size + * + * Set size representation to match size + */ + void set_size(uint16_t size) { + *layout.template Pointer<0>(buf) = size; + } + + /** + * get_meta/set_meta + * + * Enables stashing a templated type within the layout. 
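+   *
+   * For illustration only, a Meta for a btree node might carry the key
+   * range covered by the node (the names below are hypothetical, but the
+   * interface matches what split_into(), merge_from() and
+   * balance_into_new_nodes() expect of Meta):
+   *
+   *   struct range_meta_t {
+   *     uint64_t begin = 0, end = 0;  // [begin, end) covered by this node
+   *     std::pair<range_meta_t, range_meta_t> split_into(uint64_t pivot) const {
+   *       return {{begin, pivot}, {pivot, end}};
+   *     }
+   *     static range_meta_t merge_from(const range_meta_t &lhs,
+   *                                    const range_meta_t &rhs) {
+   *       return {lhs.begin, rhs.end};
+   *     }
+   *     static std::pair<range_meta_t, range_meta_t> rebalance(
+   *       const range_meta_t &lhs, const range_meta_t &rhs, uint64_t pivot) {
+   *       return {{lhs.begin, pivot}, {pivot, rhs.end}};
+   *     }
+   *   };
+   *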
+ * Cannot be modified after initial write as it is not represented + * in delta_t + */ + Meta get_meta() const { + MetaInt &metaint = *layout.template Pointer<1>(buf); + return Meta(metaint); + } + void set_meta(const Meta &meta) { + *layout.template Pointer<1>(buf) = MetaInt(meta); + } + + constexpr static size_t get_capacity() { + return CAPACITY; + } + + bool operator==(const FixedKVNodeLayout &rhs) const { + if (get_size() != rhs.get_size()) { + return false; + } + + auto iter = begin(); + auto iter2 = rhs.begin(); + while (iter != end()) { + if (iter->get_key() != iter2->get_key() || + iter->get_val() != iter2->get_val()) { + return false; + } + iter++; + iter2++; + } + return true; + } + + /** + * split_into + * + * Takes *this and splits its contents into left and right. + */ + K split_into( + FixedKVNodeLayout &left, + FixedKVNodeLayout &right) const { + auto piviter = get_split_pivot(); + + left.copy_from_foreign(left.begin(), begin(), piviter); + left.set_size(piviter - begin()); + + right.copy_from_foreign(right.begin(), piviter, end()); + right.set_size(end() - piviter); + + auto [lmeta, rmeta] = get_meta().split_into(piviter->get_key()); + left.set_meta(lmeta); + right.set_meta(rmeta); + + return piviter->get_key(); + } + + /** + * merge_from + * + * Takes two nodes and copies their contents into *this. + * + * precondition: left.size() + right.size() < CAPACITY + */ + void merge_from( + const FixedKVNodeLayout &left, + const FixedKVNodeLayout &right) + { + copy_from_foreign( + end(), + left.begin(), + left.end()); + set_size(left.get_size()); + copy_from_foreign( + end(), + right.begin(), + right.end()); + set_size(left.get_size() + right.get_size()); + set_meta(Meta::merge_from(left.get_meta(), right.get_meta())); + } + + /** + * balance_into_new_nodes + * + * Takes the contents of left and right and copies them into + * replacement_left and replacement_right such that in the + * event that the number of elements is odd the extra goes to + * the left side iff prefer_left. + */ + static K balance_into_new_nodes( + const FixedKVNodeLayout &left, + const FixedKVNodeLayout &right, + bool prefer_left, + FixedKVNodeLayout &replacement_left, + FixedKVNodeLayout &replacement_right) + { + auto total = left.get_size() + right.get_size(); + auto pivot_idx = (left.get_size() + right.get_size()) / 2; + if (total % 2 && prefer_left) { + pivot_idx++; + } + auto replacement_pivot = pivot_idx >= left.get_size() ? 
+ right.iter_idx(pivot_idx - left.get_size())->get_key() : + left.iter_idx(pivot_idx)->get_key(); + + if (pivot_idx < left.get_size()) { + replacement_left.copy_from_foreign( + replacement_left.end(), + left.begin(), + left.iter_idx(pivot_idx)); + replacement_left.set_size(pivot_idx); + + replacement_right.copy_from_foreign( + replacement_right.end(), + left.iter_idx(pivot_idx), + left.end()); + + replacement_right.set_size(left.get_size() - pivot_idx); + replacement_right.copy_from_foreign( + replacement_right.end(), + right.begin(), + right.end()); + replacement_right.set_size(total - pivot_idx); + } else { + replacement_left.copy_from_foreign( + replacement_left.end(), + left.begin(), + left.end()); + replacement_left.set_size(left.get_size()); + + replacement_left.copy_from_foreign( + replacement_left.end(), + right.begin(), + right.iter_idx(pivot_idx - left.get_size())); + replacement_left.set_size(pivot_idx); + + replacement_right.copy_from_foreign( + replacement_right.end(), + right.iter_idx(pivot_idx - left.get_size()), + right.end()); + replacement_right.set_size(total - pivot_idx); + } + + auto [lmeta, rmeta] = Meta::rebalance( + left.get_meta(), right.get_meta(), replacement_pivot); + replacement_left.set_meta(lmeta); + replacement_right.set_meta(rmeta); + return replacement_pivot; + } + +private: + void insert( + iterator iter, + const K &key, + const V &val) { + if (VALIDATE_INVARIANTS) { + if (iter != begin()) { + assert((iter - 1)->get_key() < key); + } + if (iter != end()) { + assert(iter->get_key() > key); + } + assert(get_size() < CAPACITY); + } + copy_from_local(iter + 1, iter, end()); + iter->set_key(key); + iter->set_val(val); + set_size(get_size() + 1); + } + + void update( + iterator iter, + V val) { + assert(iter != end()); + iter->set_val(val); + } + + void replace( + iterator iter, + const K &key, + const V &val) { + assert(iter != end()); + if (VALIDATE_INVARIANTS) { + if (iter != begin()) { + assert((iter - 1)->get_key() < key); + } + if ((iter + 1) != end()) { + assert((iter + 1)->get_key() > key); + } + } + iter->set_key(key); + iter->set_val(val); + } + + void remove(iterator iter) { + assert(iter != end()); + copy_from_local(iter, iter + 1, end()); + set_size(get_size() - 1); + } + + /** + * get_key_ptr + * + * Get pointer to start of key array + */ + KINT *get_key_ptr() { + return layout.template Pointer<2>(buf); + } + const KINT *get_key_ptr() const { + return layout.template Pointer<2>(buf); + } + + /** + * get_val_ptr + * + * Get pointer to start of val array + */ + VINT *get_val_ptr() { + return layout.template Pointer<3>(buf); + } + const VINT *get_val_ptr() const { + return layout.template Pointer<3>(buf); + } + + /** + * node_resolve/unresolve_vals + * + * If the representation for values depends in some way on the + * node in which they are located, users may implement + * resolve/unresolve to enable copy_from_foreign to handle that + * transition. + */ + virtual void node_resolve_vals(iterator from, iterator to) const {} + virtual void node_unresolve_vals(iterator from, iterator to) const {} + + /** + * copy_from_foreign + * + * Copies entries from [from_src, to_src) to tgt. + * + * tgt and from_src must be from different nodes. + * from_src and to_src must be from the same node. 
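+   *
+   * E.g. split_into() above fills each replacement node with one half of
+   * *this:
+   *
+   *   left.copy_from_foreign(left.begin(), begin(), piviter);
+   *   right.copy_from_foreign(right.begin(), piviter, end());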
+ */ + static void copy_from_foreign( + iterator tgt, + const_iterator from_src, + const_iterator to_src) { + assert(tgt->node != from_src->node); + assert(to_src->node == from_src->node); + memcpy( + tgt->get_val_ptr(), from_src->get_val_ptr(), + to_src->get_val_ptr() - from_src->get_val_ptr()); + memcpy( + tgt->get_key_ptr(), from_src->get_key_ptr(), + to_src->get_key_ptr() - from_src->get_key_ptr()); + from_src->node->node_resolve_vals(tgt, tgt + (to_src - from_src)); + tgt->node->node_unresolve_vals(tgt, tgt + (to_src - from_src)); + } + + /** + * copy_from_local + * + * Copies entries from [from_src, to_src) to tgt. + * + * tgt, from_src, and to_src must be from the same node. + */ + static void copy_from_local( + iterator tgt, + iterator from_src, + iterator to_src) { + assert(tgt->node == from_src->node); + assert(to_src->node == from_src->node); + memmove( + tgt->get_val_ptr(), from_src->get_val_ptr(), + to_src->get_val_ptr() - from_src->get_val_ptr()); + memmove( + tgt->get_key_ptr(), from_src->get_key_ptr(), + to_src->get_key_ptr() - from_src->get_key_ptr()); + } +}; + +} diff --git a/src/crimson/common/formatter.cc b/src/crimson/common/formatter.cc new file mode 100644 index 000000000..677216224 --- /dev/null +++ b/src/crimson/common/formatter.cc @@ -0,0 +1,64 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "formatter.h" + +#include <fmt/format.h> +#if FMT_VERSION >= 60000 +#include <fmt/chrono.h> +#else +#include <fmt/time.h> +#endif + + +template <> +struct fmt::formatter<seastar::lowres_system_clock::time_point> { + // ignore the format string + template <typename ParseContext> + constexpr auto parse(ParseContext &ctx) { return ctx.begin(); } + + template <typename FormatContext> + auto format(const seastar::lowres_system_clock::time_point& t, + FormatContext& ctx) { + std::time_t tt = std::chrono::duration_cast<std::chrono::seconds>( + t.time_since_epoch()).count(); + auto milliseconds = (t.time_since_epoch() % + std::chrono::seconds(1)).count(); + return fmt::format_to(ctx.out(), "{:%Y-%m-%d %H:%M:%S} {:03d}", + fmt::localtime(tt), milliseconds); + } +}; + +template <> +struct fmt::formatter<ceph::coarse_real_clock::time_point> { + // ignore the format string + template <typename ParseContext> + constexpr auto parse(ParseContext &ctx) { return ctx.begin(); } + + template <typename FormatContext> + auto format(const ceph::coarse_real_clock::time_point& t, + FormatContext& ctx) { + std::time_t tt = std::chrono::duration_cast<std::chrono::seconds>( + t.time_since_epoch()).count(); + auto milliseconds = (t.time_since_epoch() % + std::chrono::seconds(1)).count(); + return fmt::format_to(ctx.out(), "{:%Y-%m-%d %H:%M:%S} {:03d}", + fmt::localtime(tt), milliseconds); + } +}; + +namespace std { + +ostream& operator<<(ostream& out, + const seastar::lowres_system_clock::time_point& t) +{ + return out << fmt::format("{}", t); +} + +ostream& operator<<(ostream& out, + const ceph::coarse_real_clock::time_point& t) +{ + return out << fmt::format("{}", t); +} + +} diff --git a/src/crimson/common/formatter.h b/src/crimson/common/formatter.h new file mode 100644 index 000000000..1775b0954 --- /dev/null +++ b/src/crimson/common/formatter.h @@ -0,0 +1,15 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <seastar/core/lowres_clock.hh> + +#include "common/ceph_time.h" + +namespace std { + +ostream& operator<<(ostream& out, + const 
seastar::lowres_system_clock::time_point& t); +ostream& operator<<(ostream& out, + const ceph::coarse_real_clock::time_point& t); + +} diff --git a/src/crimson/common/gated.h b/src/crimson/common/gated.h new file mode 100644 index 000000000..7d901b6b1 --- /dev/null +++ b/src/crimson/common/gated.h @@ -0,0 +1,51 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/gate.hh> +#include <seastar/core/future.hh> +#include <seastar/core/future-util.hh> + +#include "crimson/common/exception.h" +#include "crimson/common/log.h" +#include "include/ceph_assert.h" + +namespace crimson::common { + +class Gated { + public: + static seastar::logger& gated_logger() { + return crimson::get_logger(ceph_subsys_osd); + } + template <typename Func, typename T> + inline void dispatch_in_background(const char* what, T& who, Func&& func) { + (void) dispatch(what, who, func); + } + template <typename Func, typename T> + inline seastar::future<> dispatch(const char* what, T& who, Func&& func) { + return seastar::with_gate(pending_dispatch, std::forward<Func>(func) + ).handle_exception([what, &who] (std::exception_ptr eptr) { + if (*eptr.__cxa_exception_type() == typeid(system_shutdown_exception)) { + gated_logger().debug( + "{}, {} skipped, system shutdown", who, what); + return; + } + gated_logger().error( + "{} dispatch() {} caught exception: {}", who, what, eptr); + assert(*eptr.__cxa_exception_type() + == typeid(seastar::gate_closed_exception)); + }); + } + + seastar::future<> close() { + return pending_dispatch.close(); + } + bool is_closed() const { + return pending_dispatch.is_closed(); + } + private: + seastar::gate pending_dispatch; +}; + +}// namespace crimson::common diff --git a/src/crimson/common/layout.h b/src/crimson/common/layout.h new file mode 100644 index 000000000..9d54ecd1d --- /dev/null +++ b/src/crimson/common/layout.h @@ -0,0 +1,737 @@ +// Copyright 2018 The Abseil Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// MOTIVATION AND TUTORIAL +// +// If you want to put in a single heap allocation N doubles followed by M ints, +// it's easy if N and M are known at compile time. +// +// struct S { +// double a[N]; +// int b[M]; +// }; +// +// S* p = new S; +// +// But what if N and M are known only in run time? Class template Layout to the +// rescue! It's a portable generalization of the technique known as struct hack. +// +// // This object will tell us everything we need to know about the memory +// // layout of double[N] followed by int[M]. It's structurally identical to +// // size_t[2] that stores N and M. It's very cheap to create. +// const Layout<double, int> layout(N, M); +// +// // Allocate enough memory for both arrays. `AllocSize()` tells us how much +// // memory is needed. We are free to use any allocation function we want as +// // long as it returns aligned memory. 
+// std::unique_ptr<unsigned char[]> p(new unsigned char[layout.AllocSize()]); +// +// // Obtain the pointer to the array of doubles. +// // Equivalent to `reinterpret_cast<double*>(p.get())`. +// // +// // We could have written layout.Pointer<0>(p) instead. If all the types are +// // unique you can use either form, but if some types are repeated you must +// // use the index form. +// double* a = layout.Pointer<double>(p.get()); +// +// // Obtain the pointer to the array of ints. +// // Equivalent to `reinterpret_cast<int*>(p.get() + N * 8)`. +// int* b = layout.Pointer<int>(p); +// +// If we are unable to specify sizes of all fields, we can pass as many sizes as +// we can to `Partial()`. In return, it'll allow us to access the fields whose +// locations and sizes can be computed from the provided information. +// `Partial()` comes in handy when the array sizes are embedded into the +// allocation. +// +// // size_t[1] containing N, size_t[1] containing M, double[N], int[M]. +// using L = Layout<size_t, size_t, double, int>; +// +// unsigned char* Allocate(size_t n, size_t m) { +// const L layout(1, 1, n, m); +// unsigned char* p = new unsigned char[layout.AllocSize()]; +// *layout.Pointer<0>(p) = n; +// *layout.Pointer<1>(p) = m; +// return p; +// } +// +// void Use(unsigned char* p) { +// // First, extract N and M. +// // Specify that the first array has only one element. Using `prefix` we +// // can access the first two arrays but not more. +// constexpr auto prefix = L::Partial(1); +// size_t n = *prefix.Pointer<0>(p); +// size_t m = *prefix.Pointer<1>(p); +// +// // Now we can get pointers to the payload. +// const L layout(1, 1, n, m); +// double* a = layout.Pointer<double>(p); +// int* b = layout.Pointer<int>(p); +// } +// +// The layout we used above combines fixed-size with dynamically-sized fields. +// This is quite common. Layout is optimized for this use case and generates +// optimal code. All computations that can be performed at compile time are +// indeed performed at compile time. +// +// Efficiency tip: The order of fields matters. In `Layout<T1, ..., TN>` try to +// ensure that `alignof(T1) >= ... >= alignof(TN)`. This way you'll have no +// padding in between arrays. +// +// You can manually override the alignment of an array by wrapping the type in +// `Aligned<T, N>`. `Layout<..., Aligned<T, N>, ...>` has exactly the same API +// and behavior as `Layout<..., T, ...>` except that the first element of the +// array of `T` is aligned to `N` (the rest of the elements follow without +// padding). `N` cannot be less than `alignof(T)`. +// +// `AllocSize()` and `Pointer()` are the most basic methods for dealing with +// memory layouts. Check out the reference or code below to discover more. +// +// EXAMPLE +// +// // Immutable move-only string with sizeof equal to sizeof(void*). The +// // string size and the characters are kept in the same heap allocation. +// class CompactString { +// public: +// CompactString(const char* s = "") { +// const size_t size = strlen(s); +// // size_t[1] followed by char[size + 1]. +// const L layout(1, size + 1); +// p_.reset(new unsigned char[layout.AllocSize()]); +// // If running under ASAN, mark the padding bytes, if any, to catch +// // memory errors. +// layout.PoisonPadding(p_.get()); +// // Store the size in the allocation. +// *layout.Pointer<size_t>(p_.get()) = size; +// // Store the characters in the allocation. 
+// memcpy(layout.Pointer<char>(p_.get()), s, size + 1); +// } +// +// size_t size() const { +// // Equivalent to reinterpret_cast<size_t&>(*p). +// return *L::Partial().Pointer<size_t>(p_.get()); +// } +// +// const char* c_str() const { +// // Equivalent to reinterpret_cast<char*>(p.get() + sizeof(size_t)). +// // The argument in Partial(1) specifies that we have size_t[1] in front +// // of the characters. +// return L::Partial(1).Pointer<char>(p_.get()); +// } +// +// private: +// // Our heap allocation contains a size_t followed by an array of chars. +// using L = Layout<size_t, char>; +// std::unique_ptr<unsigned char[]> p_; +// }; +// +// int main() { +// CompactString s = "hello"; +// assert(s.size() == 5); +// assert(strcmp(s.c_str(), "hello") == 0); +// } +// +// DOCUMENTATION +// +// The interface exported by this file consists of: +// - class `Layout<>` and its public members. +// - The public members of class `internal_layout::LayoutImpl<>`. That class +// isn't intended to be used directly, and its name and template parameter +// list are internal implementation details, but the class itself provides +// most of the functionality in this file. See comments on its members for +// detailed documentation. +// +// `Layout<T1,... Tn>::Partial(count1,..., countm)` (where `m` <= `n`) returns a +// `LayoutImpl<>` object. `Layout<T1,..., Tn> layout(count1,..., countn)` +// creates a `Layout` object, which exposes the same functionality by inheriting +// from `LayoutImpl<>`. + +#ifndef ABSL_CONTAINER_INTERNAL_LAYOUT_H_ +#define ABSL_CONTAINER_INTERNAL_LAYOUT_H_ + +#include <assert.h> +#include <stddef.h> +#include <stdint.h> +#include <ostream> +#include <string> +#include <tuple> +#include <type_traits> +#include <typeinfo> +#include <utility> + +#ifdef ADDRESS_SANITIZER +#include <sanitizer/asan_interface.h> +#endif + +// for C++20 std::span +#include <boost/beast/core/span.hpp> +#include <fmt/format.h> + +#if defined(__GXX_RTTI) +#define ABSL_INTERNAL_HAS_CXA_DEMANGLE +#endif + +#ifdef ABSL_INTERNAL_HAS_CXA_DEMANGLE +#include <cxxabi.h> +#endif + +namespace absl { +namespace container_internal { + +// A type wrapper that instructs `Layout` to use the specific alignment for the +// array. `Layout<..., Aligned<T, N>, ...>` has exactly the same API +// and behavior as `Layout<..., T, ...>` except that the first element of the +// array of `T` is aligned to `N` (the rest of the elements follow without +// padding). +// +// Requires: `N >= alignof(T)` and `N` is a power of 2. 
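+//
+// E.g. (illustrative): give the char array word alignment so it can be
+// handed to code that expects it:
+//
+//   using L = Layout<int32_t, Aligned<char, 8>>;
+//   L layout(1, 16);             // int32_t[1], 4 bytes of padding, char[16]
+//   unsigned char* p = new unsigned char[layout.AllocSize()];  // 24 bytes
+//   char* chars = layout.Pointer<char>(p);  // starts at an 8-byte boundary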
+template <class T, size_t N> +struct Aligned; + +namespace internal_layout { + +template <class T> +struct NotAligned {}; + +template <class T, size_t N> +struct NotAligned<const Aligned<T, N>> { + static_assert(sizeof(T) == 0, "Aligned<T, N> cannot be const-qualified"); +}; + +template <size_t> +using IntToSize = size_t; + +template <class> +using TypeToSize = size_t; + +template <class T> +struct Type : NotAligned<T> { + using type = T; +}; + +template <class T, size_t N> +struct Type<Aligned<T, N>> { + using type = T; +}; + +template <class T> +struct SizeOf : NotAligned<T>, std::integral_constant<size_t, sizeof(T)> {}; + +template <class T, size_t N> +struct SizeOf<Aligned<T, N>> : std::integral_constant<size_t, sizeof(T)> {}; + +// Note: workaround for https://gcc.gnu.org/PR88115 +template <class T> +struct AlignOf : NotAligned<T> { + static constexpr size_t value = alignof(T); +}; + +template <class T, size_t N> +struct AlignOf<Aligned<T, N>> { + static_assert(N % alignof(T) == 0, + "Custom alignment can't be lower than the type's alignment"); + static constexpr size_t value = N; +}; + +// Does `Ts...` contain `T`? +template <class T, class... Ts> +using Contains = std::disjunction<std::is_same<T, Ts>...>; + +template <class From, class To> +using CopyConst = + typename std::conditional_t<std::is_const_v<From>, const To, To>; + +// Note: We're not qualifying this with absl:: because it doesn't compile under +// MSVC. +template <class T> +using SliceType = boost::beast::span<T>; + +// This namespace contains no types. It prevents functions defined in it from +// being found by ADL. +namespace adl_barrier { + +template <class Needle, class... Ts> +constexpr size_t Find(Needle, Needle, Ts...) { + static_assert(!Contains<Needle, Ts...>(), "Duplicate element type"); + return 0; +} + +template <class Needle, class T, class... Ts> +constexpr size_t Find(Needle, T, Ts...) { + return adl_barrier::Find(Needle(), Ts()...) + 1; +} + +constexpr bool IsPow2(size_t n) { return !(n & (n - 1)); } + +// Returns `q * m` for the smallest `q` such that `q * m >= n`. +// Requires: `m` is a power of two. It's enforced by IsLegalElementType below. +constexpr size_t Align(size_t n, size_t m) { return (n + m - 1) & ~(m - 1); } + +constexpr size_t Min(size_t a, size_t b) { return b < a ? b : a; } + +constexpr size_t Max(size_t a) { return a; } + +template <class... Ts> +constexpr size_t Max(size_t a, size_t b, Ts... rest) { + return adl_barrier::Max(b < a ? a : b, rest...); +} + +template <class T> +std::string TypeName() { + std::string out; + int status = 0; + char* demangled = nullptr; +#ifdef ABSL_INTERNAL_HAS_CXA_DEMANGLE + demangled = abi::__cxa_demangle(typeid(T).name(), nullptr, nullptr, &status); +#endif + if (status == 0 && demangled != nullptr) { // Demangling succeeded. + out = fmt::format("<{}>", demangled); + free(demangled); + } else { +#if defined(__GXX_RTTI) || defined(_CPPRTTI) + out = fmt::format("<{}>", typeid(T).name()); +#endif + } + return out; +} + +} // namespace adl_barrier + +template <bool C> +using EnableIf = typename std::enable_if_t<C, int>; + +// Can `T` be a template argument of `Layout`? 
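+// References and volatile-qualified types are not; e.g. Layout<int&> or
+// Layout<volatile int> would trip the "Invalid element type" static_assert,
+// while Layout<int, Aligned<char, 8>> is accepted.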
+template <class T> +using IsLegalElementType = std::integral_constant< + bool, !std::is_reference_v<T> && !std::is_volatile_v<T> && + !std::is_reference_v<typename Type<T>::type> && + !std::is_volatile_v<typename Type<T>::type> && + adl_barrier::IsPow2(AlignOf<T>::value)>; + +template <class Elements, class SizeSeq, class OffsetSeq> +class LayoutImpl; + +// Public base class of `Layout` and the result type of `Layout::Partial()`. +// +// `Elements...` contains all template arguments of `Layout` that created this +// instance. +// +// `SizeSeq...` is `[0, NumSizes)` where `NumSizes` is the number of arguments +// passed to `Layout::Partial()` or `Layout::Layout()`. +// +// `OffsetSeq...` is `[0, NumOffsets)` where `NumOffsets` is +// `Min(sizeof...(Elements), NumSizes + 1)` (the number of arrays for which we +// can compute offsets). +template <class... Elements, size_t... SizeSeq, size_t... OffsetSeq> +class LayoutImpl<std::tuple<Elements...>, std::index_sequence<SizeSeq...>, + std::index_sequence<OffsetSeq...>> { + private: + static_assert(sizeof...(Elements) > 0, "At least one field is required"); + static_assert(std::conjunction_v<IsLegalElementType<Elements>...>, + "Invalid element type (see IsLegalElementType)"); + + enum { + NumTypes = sizeof...(Elements), + NumSizes = sizeof...(SizeSeq), + NumOffsets = sizeof...(OffsetSeq), + }; + + // These are guaranteed by `Layout`. + static_assert(NumOffsets == adl_barrier::Min(NumTypes, NumSizes + 1), + "Internal error"); + static_assert(NumTypes > 0, "Internal error"); + + // Returns the index of `T` in `Elements...`. Results in a compilation error + // if `Elements...` doesn't contain exactly one instance of `T`. + template <class T> + static constexpr size_t ElementIndex() { + static_assert(Contains<Type<T>, Type<typename Type<Elements>::type>...>(), + "Type not found"); + return adl_barrier::Find(Type<T>(), + Type<typename Type<Elements>::type>()...); + } + + template <size_t N> + using ElementAlignment = + AlignOf<typename std::tuple_element<N, std::tuple<Elements...>>::type>; + + public: + // Element types of all arrays packed in a tuple. + using ElementTypes = std::tuple<typename Type<Elements>::type...>; + + // Element type of the Nth array. + template <size_t N> + using ElementType = typename std::tuple_element<N, ElementTypes>::type; + + constexpr explicit LayoutImpl(IntToSize<SizeSeq>... sizes) + : size_{sizes...} {} + + // Alignment of the layout, equal to the strictest alignment of all elements. + // All pointers passed to the methods of layout must be aligned to this value. + static constexpr size_t Alignment() { + return adl_barrier::Max(AlignOf<Elements>::value...); + } + + // Offset in bytes of the Nth array. + // + // // int[3], 4 bytes of padding, double[4]. + // Layout<int, double> x(3, 4); + // assert(x.Offset<0>() == 0); // The ints starts from 0. + // assert(x.Offset<1>() == 16); // The doubles starts from 16. + // + // Requires: `N <= NumSizes && N < sizeof...(Ts)`. + template <size_t N, EnableIf<N == 0> = 0> + constexpr size_t Offset() const { + return 0; + } + + template <size_t N, EnableIf<N != 0> = 0> + constexpr size_t Offset() const { + static_assert(N < NumOffsets, "Index out of bounds"); + return adl_barrier::Align( + Offset<N - 1>() + SizeOf<ElementType<N - 1>>() * size_[N - 1], + ElementAlignment<N>::value); + } + + // Offset in bytes of the array with the specified element type. There must + // be exactly one such array and its zero-based index must be at most + // `NumSizes`. 
+ // + // // int[3], 4 bytes of padding, double[4]. + // Layout<int, double> x(3, 4); + // assert(x.Offset<int>() == 0); // The ints starts from 0. + // assert(x.Offset<double>() == 16); // The doubles starts from 16. + template <class T> + constexpr size_t Offset() const { + return Offset<ElementIndex<T>()>(); + } + + // Offsets in bytes of all arrays for which the offsets are known. + constexpr std::array<size_t, NumOffsets> Offsets() const { + return {{Offset<OffsetSeq>()...}}; + } + + // The number of elements in the Nth array. This is the Nth argument of + // `Layout::Partial()` or `Layout::Layout()` (zero-based). + // + // // int[3], 4 bytes of padding, double[4]. + // Layout<int, double> x(3, 4); + // assert(x.Size<0>() == 3); + // assert(x.Size<1>() == 4); + // + // Requires: `N < NumSizes`. + template <size_t N> + constexpr size_t Size() const { + static_assert(N < NumSizes, "Index out of bounds"); + return size_[N]; + } + + // The number of elements in the array with the specified element type. + // There must be exactly one such array and its zero-based index must be + // at most `NumSizes`. + // + // // int[3], 4 bytes of padding, double[4]. + // Layout<int, double> x(3, 4); + // assert(x.Size<int>() == 3); + // assert(x.Size<double>() == 4); + template <class T> + constexpr size_t Size() const { + return Size<ElementIndex<T>()>(); + } + + // The number of elements of all arrays for which they are known. + constexpr std::array<size_t, NumSizes> Sizes() const { + return {{Size<SizeSeq>()...}}; + } + + // Pointer to the beginning of the Nth array. + // + // `Char` must be `[const] [signed|unsigned] char`. + // + // // int[3], 4 bytes of padding, double[4]. + // Layout<int, double> x(3, 4); + // unsigned char* p = new unsigned char[x.AllocSize()]; + // int* ints = x.Pointer<0>(p); + // double* doubles = x.Pointer<1>(p); + // + // Requires: `N <= NumSizes && N < sizeof...(Ts)`. + // Requires: `p` is aligned to `Alignment()`. + template <size_t N, class Char> + CopyConst<Char, ElementType<N>>* Pointer(Char* p) const { + using C = typename std::remove_const<Char>::type; + static_assert( + std::is_same<C, char>() || std::is_same<C, unsigned char>() || + std::is_same<C, signed char>(), + "The argument must be a pointer to [const] [signed|unsigned] char"); + constexpr size_t alignment = Alignment(); + (void)alignment; + assert(reinterpret_cast<uintptr_t>(p) % alignment == 0); + return reinterpret_cast<CopyConst<Char, ElementType<N>>*>(p + Offset<N>()); + } + + // Pointer to the beginning of the array with the specified element type. + // There must be exactly one such array and its zero-based index must be at + // most `NumSizes`. + // + // `Char` must be `[const] [signed|unsigned] char`. + // + // // int[3], 4 bytes of padding, double[4]. + // Layout<int, double> x(3, 4); + // unsigned char* p = new unsigned char[x.AllocSize()]; + // int* ints = x.Pointer<int>(p); + // double* doubles = x.Pointer<double>(p); + // + // Requires: `p` is aligned to `Alignment()`. + template <class T, class Char> + CopyConst<Char, T>* Pointer(Char* p) const { + return Pointer<ElementIndex<T>()>(p); + } + + // Pointers to all arrays for which pointers are known. + // + // `Char` must be `[const] [signed|unsigned] char`. + // + // // int[3], 4 bytes of padding, double[4]. + // Layout<int, double> x(3, 4); + // unsigned char* p = new unsigned char[x.AllocSize()]; + // + // int* ints; + // double* doubles; + // std::tie(ints, doubles) = x.Pointers(p); + // + // Requires: `p` is aligned to `Alignment()`. 
+ // + // Note: We're not using ElementType alias here because it does not compile + // under MSVC. + template <class Char> + std::tuple<CopyConst< + Char, typename std::tuple_element<OffsetSeq, ElementTypes>::type>*...> + Pointers(Char* p) const { + return std::tuple<CopyConst<Char, ElementType<OffsetSeq>>*...>( + Pointer<OffsetSeq>(p)...); + } + + // The Nth array. + // + // `Char` must be `[const] [signed|unsigned] char`. + // + // // int[3], 4 bytes of padding, double[4]. + // Layout<int, double> x(3, 4); + // unsigned char* p = new unsigned char[x.AllocSize()]; + // Span<int> ints = x.Slice<0>(p); + // Span<double> doubles = x.Slice<1>(p); + // + // Requires: `N < NumSizes`. + // Requires: `p` is aligned to `Alignment()`. + template <size_t N, class Char> + SliceType<CopyConst<Char, ElementType<N>>> Slice(Char* p) const { + return SliceType<CopyConst<Char, ElementType<N>>>(Pointer<N>(p), Size<N>()); + } + + // The array with the specified element type. There must be exactly one + // such array and its zero-based index must be less than `NumSizes`. + // + // `Char` must be `[const] [signed|unsigned] char`. + // + // // int[3], 4 bytes of padding, double[4]. + // Layout<int, double> x(3, 4); + // unsigned char* p = new unsigned char[x.AllocSize()]; + // Span<int> ints = x.Slice<int>(p); + // Span<double> doubles = x.Slice<double>(p); + // + // Requires: `p` is aligned to `Alignment()`. + template <class T, class Char> + SliceType<CopyConst<Char, T>> Slice(Char* p) const { + return Slice<ElementIndex<T>()>(p); + } + + // All arrays with known sizes. + // + // `Char` must be `[const] [signed|unsigned] char`. + // + // // int[3], 4 bytes of padding, double[4]. + // Layout<int, double> x(3, 4); + // unsigned char* p = new unsigned char[x.AllocSize()]; + // + // Span<int> ints; + // Span<double> doubles; + // std::tie(ints, doubles) = x.Slices(p); + // + // Requires: `p` is aligned to `Alignment()`. + // + // Note: We're not using ElementType alias here because it does not compile + // under MSVC. + template <class Char> + std::tuple<SliceType<CopyConst< + Char, typename std::tuple_element<SizeSeq, ElementTypes>::type>>...> + Slices(Char* p) const { + // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63875 (fixed + // in 6.1). + (void)p; + return std::tuple<SliceType<CopyConst<Char, ElementType<SizeSeq>>>...>( + Slice<SizeSeq>(p)...); + } + + // The size of the allocation that fits all arrays. + // + // // int[3], 4 bytes of padding, double[4]. + // Layout<int, double> x(3, 4); + // unsigned char* p = new unsigned char[x.AllocSize()]; // 48 bytes + // + // Requires: `NumSizes == sizeof...(Ts)`. + constexpr size_t AllocSize() const { + static_assert(NumTypes == NumSizes, "You must specify sizes of all fields"); + return Offset<NumTypes - 1>() + + SizeOf<ElementType<NumTypes - 1>>() * size_[NumTypes - 1]; + } + + // If built with --config=asan, poisons padding bytes (if any) in the + // allocation. The pointer must point to a memory block at least + // `AllocSize()` bytes in length. + // + // `Char` must be `[const] [signed|unsigned] char`. + // + // Requires: `p` is aligned to `Alignment()`. 
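+  //
+  // E.g. (illustrative):
+  //
+  //   // char[5], 3 bytes of padding, int[3].
+  //   Layout<char, int> x(5, 3);
+  //   unsigned char* p = new unsigned char[x.AllocSize()];
+  //   x.PoisonPadding(p);  // a no-op unless built with ASan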
+  template <class Char, size_t N = NumOffsets - 1, EnableIf<N == 0> = 0>
+  void PoisonPadding(const Char* p) const {
+    Pointer<0>(p);  // verify the requirements on `Char` and `p`
+  }
+
+  template <class Char, size_t N = NumOffsets - 1, EnableIf<N != 0> = 0>
+  void PoisonPadding(const Char* p) const {
+    static_assert(N < NumOffsets, "Index out of bounds");
+    (void)p;
+#ifdef ADDRESS_SANITIZER
+    PoisonPadding<Char, N - 1>(p);
+    // The `if` is an optimization. It doesn't affect the observable behaviour.
+    if (ElementAlignment<N - 1>::value % ElementAlignment<N>::value) {
+      size_t start =
+          Offset<N - 1>() + SizeOf<ElementType<N - 1>>() * size_[N - 1];
+      ASAN_POISON_MEMORY_REGION(p + start, Offset<N>() - start);
+    }
+#endif
+  }
+
+  // Human-readable description of the memory layout. Useful for debugging.
+  // Slow.
+  //
+  //   // char[5], 3 bytes of padding, int[3], 4 bytes of padding, followed
+  //   // by an unknown number of doubles.
+  //   auto x = Layout<char, int, double>::Partial(5, 3);
+  //   assert(x.DebugString() ==
+  //          "@0<char>(1)[5]; @8<int>(4)[3]; @24<double>(8)");
+  //
+  // Each field is in the following format: @offset<type>(sizeof)[size] (<type>
+  // may be missing depending on the target platform). For example,
+  // @8<int>(4)[3] means that at offset 8 we have an array of ints, where each
+  // int is 4 bytes, and we have 3 of those ints. The size of the last field may
+  // be missing (as in the example above). Only fields with known offsets are
+  // described. Type names may differ across platforms: one compiler might
+  // produce "unsigned*" where another produces "unsigned int *".
+  std::string DebugString() const {
+    const auto offsets = Offsets();
+    const size_t sizes[] = {SizeOf<ElementType<OffsetSeq>>()...};
+    const std::string types[] = {
+        adl_barrier::TypeName<ElementType<OffsetSeq>>()...};
+    std::string res = fmt::format("@0{}({})", types[0], sizes[0]);
+    for (size_t i = 0; i != NumOffsets - 1; ++i) {
+      res += fmt::format("[{}]; @{}{}({})",
+                         size_[i], offsets[i + 1], types[i + 1], sizes[i + 1]);
+    }
+    // NumSizes is a constant that may be zero. Some compilers cannot see that
+    // inside the if statement "size_[NumSizes - 1]" must be valid.
+    int last = static_cast<int>(NumSizes) - 1;
+    if (NumTypes == NumSizes && last >= 0) {
+      res += fmt::format("[{}]", size_[last]);
+    }
+    return res;
+  }
+
+ private:
+  // Arguments of `Layout::Partial()` or `Layout::Layout()`.
+  size_t size_[NumSizes > 0 ? NumSizes : 1];
+};
+
+template <size_t NumSizes, class... Ts>
+using LayoutType = LayoutImpl<
+    std::tuple<Ts...>, std::make_index_sequence<NumSizes>,
+    std::make_index_sequence<adl_barrier::Min(sizeof...(Ts), NumSizes + 1)>>;
+
+}  // namespace internal_layout
+
+// Descriptor of arrays of various types and sizes laid out in memory one after
+// another. See the top of the file for documentation.
+//
+// Check out the public API of internal_layout::LayoutImpl above. The type is
+// internal to the library but its methods are public, and they are inherited
+// by `Layout`.
+template <class... Ts>
+class Layout : public internal_layout::LayoutType<sizeof...(Ts), Ts...> {
+ public:
+  static_assert(sizeof...(Ts) > 0, "At least one field is required");
+  static_assert(
+      std::conjunction_v<internal_layout::IsLegalElementType<Ts>...>,
+      "Invalid element type (see IsLegalElementType)");
+
+  // The result type of `Partial()` with `NumSizes` arguments.
+  template <size_t NumSizes>
+  using PartialType = internal_layout::LayoutType<NumSizes, Ts...>;
+
+  // `Layout` knows the element types of the arrays we want to lay out in
+  // memory but not the number of elements in each array.
+  // `Partial(size1, ..., sizeN)` allows us to specify the latter. The
+  // resulting immutable object can be used to obtain pointers to the
+  // individual arrays.
+  //
+  // It's allowed to pass fewer array sizes than the number of arrays. E.g.,
+  // if all you need is the offset of the second array, you only need to
+  // pass one argument -- the number of elements in the first array.
+  //
+  //   // int[3] followed by 4 bytes of padding and an unknown number of
+  //   // doubles.
+  //   auto x = Layout<int, double>::Partial(3);
+  //   // doubles start at byte 16.
+  //   assert(x.Offset<1>() == 16);
+  //
+  // If you know the number of elements in all arrays, you can still call
+  // `Partial()` but it's more convenient to use the constructor of `Layout`.
+  //
+  //   Layout<int, double> x(3, 5);
+  //
+  // Note: The sizes of the arrays must be specified in number of elements,
+  // not in bytes.
+  //
+  // Requires: `sizeof...(Sizes) <= sizeof...(Ts)`.
+  // Requires: all arguments are convertible to `size_t`.
+  template <class... Sizes>
+  static constexpr PartialType<sizeof...(Sizes)> Partial(Sizes&&... sizes) {
+    static_assert(sizeof...(Sizes) <= sizeof...(Ts));
+    return PartialType<sizeof...(Sizes)>(std::forward<Sizes>(sizes)...);
+  }
+
+  // Creates a layout with the sizes of all arrays specified. If you know
+  // only the sizes of the first N arrays (where N can be zero), you can use
+  // `Partial()` defined above. The constructor is essentially equivalent to
+  // calling `Partial()` and passing in all array sizes; the constructor is
+  // provided as a convenient abbreviation.
+  //
+  // Note: The sizes of the arrays must be specified in number of elements,
+  // not in bytes.
+  constexpr explicit Layout(internal_layout::TypeToSize<Ts>... sizes)
+      : internal_layout::LayoutType<sizeof...(Ts), Ts...>(sizes...)
+  {}
+};
+
+}  // namespace container_internal
+}  // namespace absl
+
+#endif  // ABSL_CONTAINER_INTERNAL_LAYOUT_H_
diff --git a/src/crimson/common/log.cc b/src/crimson/common/log.cc
new file mode 100644
index 000000000..cae9f6a7b
--- /dev/null
+++ b/src/crimson/common/log.cc
@@ -0,0 +1,21 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "log.h"
+
+static std::array<seastar::logger, ceph_subsys_get_num()> loggers{
+#define SUBSYS(name, log_level, gather_level) \
+  seastar::logger(#name),
+#define DEFAULT_SUBSYS(log_level, gather_level) \
+  seastar::logger("none"),
+  #include "common/subsys.h"
+#undef SUBSYS
+#undef DEFAULT_SUBSYS
+};
+
+namespace crimson {
+seastar::logger& get_logger(int subsys) {
+  assert(subsys < ceph_subsys_max);
+  return loggers[subsys];
+}
+}
diff --git a/src/crimson/common/log.h b/src/crimson/common/log.h
new file mode 100644
index 000000000..635349098
--- /dev/null
+++ b/src/crimson/common/log.h
@@ -0,0 +1,24 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/util/log.hh>
+#include "common/subsys_types.h"
+
+namespace crimson {
+seastar::logger& get_logger(int subsys);
+static inline seastar::log_level to_log_level(int level) {
+  if (level < 0) {
+    return seastar::log_level::error;
+  } else if (level < 1) {
+    return seastar::log_level::warn;
+  } else if (level < 5) {
+    return seastar::log_level::info;
+  } else if (level <= 20) {
+    return seastar::log_level::debug;
+  } else {
+    return seastar::log_level::trace;
+  }
+}
+}
diff --git a/src/crimson/common/perf_counters_collection.cc b/src/crimson/common/perf_counters_collection.cc
new file mode 100644
index 000000000..af80dbcc2
--- /dev/null
+++ b/src/crimson/common/perf_counters_collection.cc
@@ -0,0 +1,25 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "perf_counters_collection.h"
+
+namespace crimson::common {
+PerfCountersCollection::PerfCountersCollection()
+{
+  perf_collection = std::make_unique<PerfCountersCollectionImpl>();
+}
+PerfCountersCollection::~PerfCountersCollection()
+{
+  perf_collection->clear();
+}
+
+PerfCountersCollectionImpl* PerfCountersCollection::get_perf_collection()
+{
+  return perf_collection.get();
+}
+
+PerfCountersCollection::ShardedPerfCountersCollection PerfCountersCollection::sharded_perf_coll;
+
+}
diff --git a/src/crimson/common/perf_counters_collection.h b/src/crimson/common/perf_counters_collection.h
new file mode 100644
index 000000000..a19630247
--- /dev/null
+++ b/src/crimson/common/perf_counters_collection.h
@@ -0,0 +1,37 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "common/perf_counters.h"
+#include <seastar/core/sharded.hh>
+
+using crimson::common::PerfCountersCollectionImpl;
+namespace crimson::common {
+class PerfCountersCollection: public seastar::sharded<PerfCountersCollection>
+{
+  using ShardedPerfCountersCollection = seastar::sharded<PerfCountersCollection>;
+
+private:
+  std::unique_ptr<PerfCountersCollectionImpl> perf_collection;
+  static ShardedPerfCountersCollection sharded_perf_coll;
+  friend PerfCountersCollection& local_perf_coll();
+  friend ShardedPerfCountersCollection& sharded_perf_coll();
+
+public:
+  PerfCountersCollection();
+  ~PerfCountersCollection();
+  PerfCountersCollectionImpl* get_perf_collection();
+
+};
+
+inline
PerfCountersCollection::ShardedPerfCountersCollection& sharded_perf_coll() {
+  return PerfCountersCollection::sharded_perf_coll;
+}
+
+inline PerfCountersCollection& local_perf_coll() {
+  return PerfCountersCollection::sharded_perf_coll.local();
+}
+
+}
diff --git a/src/crimson/common/shared_lru.h b/src/crimson/common/shared_lru.h
new file mode 100644
index 000000000..4c1da401e
--- /dev/null
+++ b/src/crimson/common/shared_lru.h
@@ -0,0 +1,178 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <memory>
+#include <optional>
+#include <boost/smart_ptr/local_shared_ptr.hpp>
+#include <boost/smart_ptr/weak_ptr.hpp>
+#include "simple_lru.h"
+
+/// SharedLRU does its best to cache objects. It not only tracks the objects
+/// in its LRU cache with strong references, it also tracks objects with
+/// weak_ptr even if the cache does not hold any strong references to them,
+/// so that it can return the objects after they are evicted, as long as
+/// they've ever been cached and have not been destroyed yet.
+template<class K, class V>
+class SharedLRU {
+  using shared_ptr_t = boost::local_shared_ptr<V>;
+  using weak_ptr_t = boost::weak_ptr<V>;
+  using value_type = std::pair<K, shared_ptr_t>;
+
+  // weak_refs is already ordered, and we don't use accessors like
+  // LRUCache::lower_bound(), so an unordered LRUCache would suffice.
+  SimpleLRU<K, shared_ptr_t, false> cache;
+  std::map<K, std::pair<weak_ptr_t, V*>> weak_refs;
+
+  struct Deleter {
+    SharedLRU<K,V>* cache;
+    const K key;
+    void operator()(V* ptr) {
+      cache->_erase_weak(key);
+      delete ptr;
+    }
+  };
+  void _erase_weak(const K& key) {
+    weak_refs.erase(key);
+  }
+public:
+  SharedLRU(size_t max_size = 20)
+    : cache{max_size}
+  {}
+  ~SharedLRU() {
+    cache.clear();
+    // use plain assert() in utility classes to avoid dependencies on logging
+    assert(weak_refs.empty());
+  }
+  /**
+   * Returns a reference to the given key, performing an insertion if such
+   * a key does not already exist
+   */
+  shared_ptr_t operator[](const K& key);
+  /**
+   * Returns true iff there are no live references left to anything that has
+   * ever been in the cache.
+   */
+  bool empty() const {
+    return weak_refs.empty();
+  }
+  size_t size() const {
+    return cache.size();
+  }
+  size_t capacity() const {
+    return cache.capacity();
+  }
+  /**
+   * Inserts a key if not present, or bumps it to the front of the LRU if
+   * it is, and then returns a reference to the cached value. If the key
+   * already existed with a live value, the passed-in value is discarded
+   * (its unique_ptr destroys it).
+   *
+   * @param key The key to insert
+   * @param value The value that goes with the key
+   * @return A reference to the cache's value for the given key
+   */
+  shared_ptr_t insert(const K& key, std::unique_ptr<V> value);
+  // clear all strong references from the LRU.
+  void clear() {
+    cache.clear();
+  }
+  shared_ptr_t find(const K& key);
+  // return the value at the first key that is not less than the given key,
+  // falling back to the greatest key if all keys compare less
+  shared_ptr_t lower_bound(const K& key);
+  // return the first element that is greater than key
+  std::optional<value_type> upper_bound(const K& key);
+
+  void erase(const K& key) {
+    cache.erase(key);
+    _erase_weak(key);
+  }
+};
+
+template<class K, class V>
+typename SharedLRU<K,V>::shared_ptr_t
+SharedLRU<K,V>::insert(const K& key, std::unique_ptr<V> value)
+{
+  shared_ptr_t val;
+  if (auto found = weak_refs.find(key); found != weak_refs.end()) {
+    val = found->second.first.lock();
+  }
+  if (!val) {
+    val.reset(value.release(), Deleter{this, key});
+    weak_refs.emplace(key, std::make_pair(val, val.get()));
+  }
+  cache.insert(key, val);
+  return val;
+}
+
+template<class K, class V>
+typename SharedLRU<K,V>::shared_ptr_t
+SharedLRU<K,V>::operator[](const K& key)
+{
+  if (auto found = cache.find(key); found) {
+    return *found;
+  }
+  shared_ptr_t val;
+  if (auto found = weak_refs.find(key); found != weak_refs.end()) {
+    val = found->second.first.lock();
+  }
+  if (!val) {
+    val.reset(new V{}, Deleter{this, key});
+    weak_refs.emplace(key, std::make_pair(val, val.get()));
+  }
+  cache.insert(key, val);
+  return val;
+}
+
+template<class K, class V>
+typename SharedLRU<K,V>::shared_ptr_t
+SharedLRU<K,V>::find(const K& key)
+{
+  if (auto found = cache.find(key); found) {
+    return *found;
+  }
+  shared_ptr_t val;
+  if (auto found = weak_refs.find(key); found != weak_refs.end()) {
+    val = found->second.first.lock();
+  }
+  if (val) {
+    cache.insert(key, val);
+  }
+  return val;
+}
+
+template<class K, class V>
+typename SharedLRU<K,V>::shared_ptr_t
+SharedLRU<K,V>::lower_bound(const K& key)
+{
+  if (weak_refs.empty()) {
+    return {};
+  }
+  auto found = weak_refs.lower_bound(key);
+  if (found == weak_refs.end()) {
+    --found;
+  }
+  if (auto val = found->second.first.lock(); val) {
+    cache.insert(found->first, val);
+    return val;
+  } else {
+    return {};
+  }
+}
+
+template<class K, class V>
+std::optional<typename SharedLRU<K,V>::value_type>
+SharedLRU<K,V>::upper_bound(const K& key)
+{
+  for (auto found = weak_refs.upper_bound(key);
+       found != weak_refs.end();
+       ++found) {
+    if (auto val = found->second.first.lock(); val) {
+      return std::make_pair(found->first, val);
+    }
+  }
+  return std::nullopt;
+}
diff --git a/src/crimson/common/simple_lru.h b/src/crimson/common/simple_lru.h
new file mode 100644
index 000000000..1419c4885
--- /dev/null
+++ b/src/crimson/common/simple_lru.h
@@ -0,0 +1,141 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <list>
+#include <map>
+#include <optional>
+#include <type_traits>
+#include <unordered_map>
+
+template <class Key, class Value, bool Ordered>
+class SimpleLRU {
+  static_assert(std::is_default_constructible_v<Value>);
+  using list_type = std::list<Key>;
+  template<class K, class V>
+  using map_t = std::conditional_t<Ordered,
+                                   std::map<K, V>,
+                                   std::unordered_map<K, V>>;
+  using map_type = map_t<Key, std::pair<Value, typename list_type::iterator>>;
+  list_type lru;
+  map_type cache;
+  const size_t max_size;
+
+public:
+  SimpleLRU(size_t size = 20)
+    : cache(size),
+      max_size(size)
+  {}
+  size_t size() const {
+    return cache.size();
+  }
+  size_t capacity() const {
+    return max_size;
+  }
+  using insert_return_type = std::pair<Value, bool>;
+  insert_return_type insert(const Key& key, Value value);
+  std::optional<Value> find(const Key& key);
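+  // E.g. (illustrative):
+  //
+  //   SimpleLRU<int, std::string, false> lru{2};
+  //   lru.insert(1, "a");
+  //   lru.insert(2, "b");
+  //   lru.insert(3, "c");   // capacity reached: key 1 is evicted
+  //   assert(!lru.find(1));
+  //   assert(*lru.find(2) == "b");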
+  // Ordered == true only; enforced by a static_assert in the definition
+  std::optional<Value> lower_bound(const Key& key);
+  void erase(const Key& key);
+  void clear();
private:
+  // bump the item to the front of the lru list
+  Value _lru_add(typename map_type::iterator found);
+  // evict the least recently used element (the back of the lru list)
+  void _evict();
+};
+
+template <class Key, class Value, bool Ordered>
+typename SimpleLRU<Key,Value,Ordered>::insert_return_type
+SimpleLRU<Key,Value,Ordered>::insert(const Key& key, Value value)
+{
+  if constexpr(Ordered) {
+    auto found = cache.lower_bound(key);
+    if (found != cache.end() && found->first == key) {
+      // already exists
+      return {found->second.first, true};
+    } else {
+      if (size() >= capacity()) {
+        _evict();
+      }
+      lru.push_front(key);
+      // use lower_bound as hint to save the lookup
+      cache.emplace_hint(found, key, std::make_pair(value, lru.begin()));
+      return {std::move(value), false};
+    }
+  } else {
+    // cache is not ordered
+    auto found = cache.find(key);
+    if (found != cache.end()) {
+      // already exists
+      return {found->second.first, true};
+    } else {
+      if (size() >= capacity()) {
+        _evict();
+      }
+      lru.push_front(key);
+      cache.emplace(key, std::make_pair(value, lru.begin()));
+      return {std::move(value), false};
+    }
+  }
+}
+
+template <class Key, class Value, bool Ordered>
+std::optional<Value> SimpleLRU<Key,Value,Ordered>::find(const Key& key)
+{
+  if (auto found = cache.find(key); found != cache.end()) {
+    return _lru_add(found);
+  } else {
+    return {};
+  }
+}
+
+template <class Key, class Value, bool Ordered>
+std::optional<Value>
+SimpleLRU<Key,Value,Ordered>::lower_bound(const Key& key)
+{
+  static_assert(Ordered, "lower_bound() requires an ordered cache");
+  if (auto found = cache.lower_bound(key); found != cache.end()) {
+    return _lru_add(found);
+  } else {
+    return {};
+  }
+}
+
+template <class Key, class Value, bool Ordered>
+void SimpleLRU<Key,Value,Ordered>::clear()
+{
+  lru.clear();
+  cache.clear();
+}
+
+template <class Key, class Value, bool Ordered>
+void SimpleLRU<Key,Value,Ordered>::erase(const Key& key)
+{
+  if (auto found = cache.find(key); found != cache.end()) {
+    lru.erase(found->second.second);
+    cache.erase(found);
+  }
+}
+
+template <class Key, class Value, bool Ordered>
+Value SimpleLRU<Key,Value,Ordered>::_lru_add(
+  typename SimpleLRU<Key,Value,Ordered>::map_type::iterator found)
+{
+  auto& [value, in_lru] = found->second;
+  if (in_lru != lru.begin()) {
+    // move the item to the front
+    lru.splice(lru.begin(), lru, in_lru);
+  }
+  // the item is now at the front
+  return value;
+}
+
+template <class Key, class Value, bool Ordered>
+void SimpleLRU<Key,Value,Ordered>::_evict()
+{
+  // evict the least recently used element (the back of the lru list)
+  auto last = --lru.end();
+  cache.erase(*last);
+  lru.erase(last);
+}
diff --git a/src/crimson/common/throttle.cc b/src/crimson/common/throttle.cc
new file mode 100644
index 000000000..bd9195181
--- /dev/null
+++ b/src/crimson/common/throttle.cc
@@ -0,0 +1,59 @@
+#include "throttle.h"
+
+namespace crimson::common {
+
+int64_t Throttle::take(int64_t c)
+{
+  if (!max) {
+    return 0;
+  }
+  count += c;
+  return count;
+}
+
+int64_t Throttle::put(int64_t c)
+{
+  if (!max) {
+    return 0;
+  }
+  if (!c) {
+    return count;
+  }
+  on_free_slots.signal();
+  count -= c;
+  return count;
+}
+
+seastar::future<> Throttle::get(size_t c)
+{
+  if (!max) {
+    return seastar::make_ready_future<>();
+  }
+  return on_free_slots.wait([this, c] {
+    return !_should_wait(c);
+  }).then([this, c] {
+    count += c;
+    return seastar::make_ready_future<>();
+  });
+}
+
+void Throttle::reset_max(size_t m) {
+  if (max
diff --git a/src/crimson/common/throttle.cc b/src/crimson/common/throttle.cc
new file mode 100644
index 000000000..bd9195181
--- /dev/null
+++ b/src/crimson/common/throttle.cc
@@ -0,0 +1,59 @@
+#include "throttle.h"
+
+namespace crimson::common {
+
+int64_t Throttle::take(int64_t c)
+{
+  if (!max) {
+    return 0;
+  }
+  // take the slots unconditionally, even if this exceeds max
+  count += c;
+  return count;
+}
+
+int64_t Throttle::put(int64_t c)
+{
+  if (!max) {
+    return 0;
+  }
+  if (!c) {
+    return count;
+  }
+  count -= c;
+  // wake a waiter now that slots have been freed
+  on_free_slots.signal();
+  return count;
+}
+
+seastar::future<> Throttle::get(size_t c)
+{
+  if (!max) {
+    return seastar::make_ready_future<>();
+  }
+  return on_free_slots.wait([this, c] {
+    return !_should_wait(c);
+  }).then([this, c] {
+    count += c;
+    return seastar::make_ready_future<>();
+  });
+}
+
+void Throttle::reset_max(size_t m) {
+  if (max == m) {
+    return;
+  }
+
+  if (m > max) {
+    // the limit was raised: give waiters a chance to proceed
+    on_free_slots.signal();
+  }
+  max = m;
+}
+
+bool Throttle::_should_wait(size_t c) const {
+  if (!max) {
+    return false;
+  }
+  return ((c <= max && count + c > max) || // normally stay under max
+          (c >= max && count > max));      // except for large c
+}
+
+} // namespace crimson::common
diff --git a/src/crimson/common/throttle.h b/src/crimson/common/throttle.h
new file mode 100644
index 000000000..fea471c8d
--- /dev/null
+++ b/src/crimson/common/throttle.h
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/condition-variable.hh>
+// FIXME: included only to pull in the seastar::timer<...>::timer
+// definitions; ideally seastar would provide them without making
+// reactor.hh obligatory everywhere
+#include <seastar/core/reactor.hh>
+
+#include "common/ThrottleInterface.h"
+
+namespace crimson::common {
+
+class Throttle final : public ThrottleInterface {
+  size_t max = 0;  // max == 0 disables the throttle entirely
+  size_t count = 0;
+  // we cannot change the "count" of seastar::semaphore after it is created,
+  // so use condition_variable instead.
+  seastar::condition_variable on_free_slots;
+public:
+  explicit Throttle(size_t m)
+    : max(m)
+  {}
+  int64_t take(int64_t c = 1) override;
+  int64_t put(int64_t c = 1) override;
+  seastar::future<> get(size_t c);
+  size_t get_current() const {
+    return count;
+  }
+  size_t get_max() const {
+    return max;
+  }
+  void reset_max(size_t m);
+private:
+  bool _should_wait(size_t c) const;
+};
+
+} // namespace crimson::common
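A sketch of the intended call pattern for Throttle under seastar (editorial, not part of the patch; the demo() wrapper is invented and must run inside a seastar reactor):

// Illustrative sketch only.
#include <seastar/core/future.hh>
#include "crimson/common/throttle.h"

seastar::future<> demo(crimson::common::Throttle& throttle)
{
  // get() resolves once 10 more slots fit under the configured max
  return throttle.get(10).then([&throttle] {
    // ... perform the throttled work here ...
    throttle.put(10);  // release the slots, waking any waiters
  });
}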
diff --git a/src/crimson/common/tri_mutex.cc b/src/crimson/common/tri_mutex.cc
new file mode 100644
index 000000000..c18aff1a0
--- /dev/null
+++ b/src/crimson/common/tri_mutex.cc
@@ -0,0 +1,225 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tri_mutex.h"
+
+seastar::future<> read_lock::lock()
+{
+  return static_cast<tri_mutex*>(this)->lock_for_read();
+}
+
+void read_lock::unlock()
+{
+  static_cast<tri_mutex*>(this)->unlock_for_read();
+}
+
+seastar::future<> write_lock::lock()
+{
+  // non-greedy: a new writer queues up behind existing waiters
+  return static_cast<tri_mutex*>(this)->lock_for_write(false);
+}
+
+void write_lock::unlock()
+{
+  static_cast<tri_mutex*>(this)->unlock_for_write();
+}
+
+seastar::future<> excl_lock::lock()
+{
+  return static_cast<tri_mutex*>(this)->lock_for_excl();
+}
+
+void excl_lock::unlock()
+{
+  static_cast<tri_mutex*>(this)->unlock_for_excl();
+}
+
+seastar::future<> excl_lock_from_read::lock()
+{
+  // the caller must be the sole remaining reader
+  static_cast<tri_mutex*>(this)->promote_from_read();
+  return seastar::make_ready_future<>();
+}
+
+void excl_lock_from_read::unlock()
+{
+  static_cast<tri_mutex*>(this)->demote_to_read();
+}
+
+seastar::future<> excl_lock_from_write::lock()
+{
+  // the caller must be the sole remaining writer
+  static_cast<tri_mutex*>(this)->promote_from_write();
+  return seastar::make_ready_future<>();
+}
+
+void excl_lock_from_write::unlock()
+{
+  static_cast<tri_mutex*>(this)->demote_to_write();
+}
+
+seastar::future<> excl_lock_from_excl::lock()
+{
+  // already exclusive: nothing to do
+  return seastar::make_ready_future<>();
+}
+
+void excl_lock_from_excl::unlock()
+{
+}
+
+tri_mutex::~tri_mutex()
+{
+  assert(!is_acquired());
+}
+
+seastar::future<> tri_mutex::lock_for_read()
+{
+  if (try_lock_for_read()) {
+    return seastar::make_ready_future<>();
+  }
+  waiters.emplace_back(seastar::promise<>(), type_t::read);
+  return waiters.back().pr.get_future();
+}
+
+bool tri_mutex::try_lock_for_read() noexcept
+{
+  if (!writers && !exclusively_used && waiters.empty()) {
+    ++readers;
+    return true;
+  } else {
+    return false;
+  }
+}
+
+void tri_mutex::unlock_for_read()
+{
+  assert(readers > 0);
+  if (--readers == 0) {
+    wake();
+  }
+}
+
+void tri_mutex::promote_from_read()
+{
+  assert(readers == 1);
+  --readers;
+  exclusively_used = true;
+}
+
+void tri_mutex::demote_to_read()
+{
+  assert(exclusively_used);
+  exclusively_used = false;
+  ++readers;
+}
+
+seastar::future<> tri_mutex::lock_for_write(bool greedy)
+{
+  if (try_lock_for_write(greedy)) {
+    return seastar::make_ready_future<>();
+  }
+  waiters.emplace_back(seastar::promise<>(), type_t::write);
+  return waiters.back().pr.get_future();
+}
+
+bool tri_mutex::try_lock_for_write(bool greedy) noexcept
+{
+  if (!readers && !exclusively_used) {
+    // a greedy writer may overtake queued waiters
+    if (greedy || waiters.empty()) {
+      ++writers;
+      return true;
+    }
+  }
+  return false;
+}
+
+void tri_mutex::unlock_for_write()
+{
+  assert(writers > 0);
+  if (--writers == 0) {
+    wake();
+  }
+}
+
+void tri_mutex::promote_from_write()
+{
+  assert(writers == 1);
+  --writers;
+  exclusively_used = true;
+}
+
+void tri_mutex::demote_to_write()
+{
+  assert(exclusively_used);
+  exclusively_used = false;
+  ++writers;
+}
+
+// for exclusive users
+seastar::future<> tri_mutex::lock_for_excl()
+{
+  if (try_lock_for_excl()) {
+    return seastar::make_ready_future<>();
+  }
+  waiters.emplace_back(seastar::promise<>(), type_t::exclusive);
+  return waiters.back().pr.get_future();
+}
+
+bool tri_mutex::try_lock_for_excl() noexcept
+{
+  if (!readers && !writers && !exclusively_used) {
+    exclusively_used = true;
+    return true;
+  } else {
+    return false;
+  }
+}
+
+void tri_mutex::unlock_for_excl()
+{
+  assert(exclusively_used);
+  exclusively_used = false;
+  wake();
+}
+
+bool tri_mutex::is_acquired() const
+{
+  return readers || writers || exclusively_used;
+}
+
+void tri_mutex::wake()
+{
+  assert(!readers && !writers && !exclusively_used);
+  // wake one contiguous batch of same-type waiters; an exclusive
+  // waiter forms a batch of its own
+  type_t type = type_t::none;
+  while (!waiters.empty()) {
+    auto& waiter = waiters.front();
+    if (type == type_t::exclusive) {
+      break;
+    }
+    if (type == type_t::none) {
+      type = waiter.type;
+    } else if (type != waiter.type) {
+      // to be woken in the next batch
+      break;
+    }
+    switch (type) {
+    case type_t::read:
+      ++readers;
+      break;
+    case type_t::write:
+      ++writers;
+      break;
+    case type_t::exclusive:
+      exclusively_used = true;
+      break;
+    default:
+      assert(0);
+    }
+    waiter.pr.set_value();
+    waiters.pop_front();
+  }
+}
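To make the three-way semantics concrete, a small usage sketch (editorial, not part of the patch; the demo() wrapper is invented and must run inside a seastar reactor). Any number of reads may proceed together, while the exclusive lock waits for all of them to drain:

// Illustrative sketch only.
#include <cassert>
#include <seastar/core/future.hh>
#include "crimson/common/tri_mutex.h"

seastar::future<> demo(tri_mutex& mtx)
{
  return mtx.lock_for_read().then([&mtx] {
    // any number of readers can hold the lock concurrently
    assert(mtx.get_readers() >= 1);
    mtx.unlock_for_read();
    // the exclusive lock waits until no readers or writers remain
    return mtx.lock_for_excl();
  }).then([&mtx] {
    assert(mtx.is_excl_acquired());
    mtx.unlock_for_excl();
  });
}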
diff --git a/src/crimson/common/tri_mutex.h b/src/crimson/common/tri_mutex.h
new file mode 100644
index 000000000..127573b3a
--- /dev/null
+++ b/src/crimson/common/tri_mutex.h
@@ -0,0 +1,156 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/core/circular_buffer.hh>
+
+class read_lock {
+public:
+  seastar::future<> lock();
+  void unlock();
+};
+
+class write_lock {
+public:
+  seastar::future<> lock();
+  void unlock();
+};
+
+class excl_lock {
+public:
+  seastar::future<> lock();
+  void unlock();
+};
+
+// promote from read to excl
+class excl_lock_from_read {
+public:
+  seastar::future<> lock();
+  void unlock();
+};
+
+// promote from write to excl
+class excl_lock_from_write {
+public:
+  seastar::future<> lock();
+  void unlock();
+};
+
+// "promote" from excl to excl: a no-op
+class excl_lock_from_excl {
+public:
+  seastar::future<> lock();
+  void unlock();
+};
+
+/// shared/exclusive mutual exclusion
+///
+/// this lock's notion of "reader" and "writer" is entirely independent of
+/// conventional reader/writer lock semantics: reads can be pipelined with
+/// other reads, and writes can be pipelined with other writes, but a read
+/// cannot be admitted while writes are in progress, nor a write while reads
+/// are in progress. Any read-modify-write operation is therefore exclusive.
+///
+/// tri_mutex is based on seastar::shared_mutex, but instead of two kinds of
+/// waiters, tri_mutex keeps track of three kinds of lock users:
+/// - readers
+/// - writers
+/// - exclusive users
+class tri_mutex : private read_lock,
+                          write_lock,
+                          excl_lock,
+                          excl_lock_from_read,
+                          excl_lock_from_write,
+                          excl_lock_from_excl
+{
+public:
+  tri_mutex() = default;
+  ~tri_mutex();
+
+  read_lock& for_read() {
+    return *this;
+  }
+  write_lock& for_write() {
+    return *this;
+  }
+  excl_lock& for_excl() {
+    return *this;
+  }
+  excl_lock_from_read& excl_from_read() {
+    return *this;
+  }
+  excl_lock_from_write& excl_from_write() {
+    return *this;
+  }
+  excl_lock_from_excl& excl_from_excl() {
+    return *this;
+  }
+
+  // for shared readers
+  seastar::future<> lock_for_read();
+  bool try_lock_for_read() noexcept;
+  void unlock_for_read();
+  void promote_from_read();
+  void demote_to_read();
+  unsigned get_readers() const {
+    return readers;
+  }
+
+  // for shared writers
+  seastar::future<> lock_for_write(bool greedy);
+  bool try_lock_for_write(bool greedy) noexcept;
+  void unlock_for_write();
+  void promote_from_write();
+  void demote_to_write();
+  unsigned get_writers() const {
+    return writers;
+  }
+
+  // for exclusive users
+  seastar::future<> lock_for_excl();
+  bool try_lock_for_excl() noexcept;
+  void unlock_for_excl();
+  bool is_excl_acquired() const {
+    return exclusively_used;
+  }
+
+  bool is_acquired() const;
+
+  /// pass the provided exception to all pending waiters
+  template<typename Exception>
+  void abort(Exception ex) {
+    while (!waiters.empty()) {
+      auto& waiter = waiters.front();
+      waiter.pr.set_exception(std::make_exception_ptr(ex));
+      waiters.pop_front();
+    }
+  }
+
+private:
+  void wake();
+  unsigned readers = 0;
+  unsigned writers = 0;
+  bool exclusively_used = false;
+  enum class type_t : uint8_t {
+    read,
+    write,
+    exclusive,
+    none,
+  };
+  struct waiter_t {
+    waiter_t(seastar::promise<>&& pr, type_t type)
+      : pr(std::move(pr)), type(type)
+    {}
+    seastar::promise<> pr;
+    type_t type;
+  };
+  seastar::circular_buffer<waiter_t> waiters;
+  friend class read_lock;
+  friend class write_lock;
+  friend class excl_lock;
+  friend class excl_lock_from_read;
+  friend class excl_lock_from_write;
+  friend class excl_lock_from_excl;
+};
diff --git a/src/crimson/common/type_helpers.h b/src/crimson/common/type_helpers.h
new file mode 100644
index 000000000..4c606581f
--- /dev/null
+++ b/src/crimson/common/type_helpers.h
@@ -0,0 +1,8 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "boost/intrusive_ptr.hpp"
+
+template<typename T> using Ref = boost::intrusive_ptr<T>;
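Since Ref<T> is just an alias for boost::intrusive_ptr<T>, a pointee must supply the usual intrusive hooks. A minimal sketch (editorial, not part of the patch; the Counted type is invented for this example):

// Illustrative sketch only.
#include <cassert>
#include "crimson/common/type_helpers.h"

struct Counted {
  unsigned nref = 0;
};
// found via argument-dependent lookup by boost::intrusive_ptr
inline void intrusive_ptr_add_ref(Counted* p) { ++p->nref; }
inline void intrusive_ptr_release(Counted* p) {
  if (--p->nref == 0) {
    delete p;
  }
}

int main()
{
  Ref<Counted> a{new Counted};  // nref: 1
  Ref<Counted> b = a;           // nref: 2
  assert(a->nref == 2);
  b.reset();                    // nref: 1; freed once a goes out of scope
}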