Diffstat (limited to 'src/crimson')
-rw-r--r--  src/crimson/CMakeLists.txt  191
-rw-r--r--  src/crimson/admin/CMakeLists.txt  8
-rw-r--r--  src/crimson/admin/admin_socket.cc  521
-rw-r--r--  src/crimson/admin/admin_socket.h  201
-rw-r--r--  src/crimson/admin/osd_admin.cc  191
-rw-r--r--  src/crimson/admin/osd_admin.h  22
-rw-r--r--  src/crimson/admin/pg_commands.cc  159
-rw-r--r--  src/crimson/admin/pg_commands.h  10
-rw-r--r--  src/crimson/auth/AuthClient.h  71
-rw-r--r--  src/crimson/auth/AuthServer.h  41
-rw-r--r--  src/crimson/auth/DummyAuth.h  78
-rw-r--r--  src/crimson/auth/KeyRing.cc  79
-rw-r--r--  src/crimson/auth/KeyRing.h  15
-rw-r--r--  src/crimson/common/assert.cc  81
-rw-r--r--  src/crimson/common/auth_handler.h  17
-rw-r--r--  src/crimson/common/buffer_io.cc  57
-rw-r--r--  src/crimson/common/buffer_io.h  21
-rw-r--r--  src/crimson/common/config_proxy.cc  93
-rw-r--r--  src/crimson/common/config_proxy.h  200
-rw-r--r--  src/crimson/common/errorator.h  1140
-rw-r--r--  src/crimson/common/exception.h  50
-rw-r--r--  src/crimson/common/fixed_kv_node_layout.h  700
-rw-r--r--  src/crimson/common/formatter.cc  64
-rw-r--r--  src/crimson/common/formatter.h  15
-rw-r--r--  src/crimson/common/gated.h  51
-rw-r--r--  src/crimson/common/layout.h  737
-rw-r--r--  src/crimson/common/log.cc  21
-rw-r--r--  src/crimson/common/log.h  24
-rw-r--r--  src/crimson/common/perf_counters_collection.cc  25
-rw-r--r--  src/crimson/common/perf_counters_collection.h  37
-rw-r--r--  src/crimson/common/shared_lru.h  178
-rw-r--r--  src/crimson/common/simple_lru.h  141
-rw-r--r--  src/crimson/common/throttle.cc  59
-rw-r--r--  src/crimson/common/throttle.h  39
-rw-r--r--  src/crimson/common/tri_mutex.cc  225
-rw-r--r--  src/crimson/common/tri_mutex.h  156
-rw-r--r--  src/crimson/common/type_helpers.h  8
-rw-r--r--  src/crimson/mgr/client.cc  166
-rw-r--r--  src/crimson/mgr/client.h  66
-rw-r--r--  src/crimson/mon/MonClient.cc  1111
-rw-r--r--  src/crimson/mon/MonClient.h  183
-rw-r--r--  src/crimson/net/Connection.h  175
-rw-r--r--  src/crimson/net/Dispatcher.h  46
-rw-r--r--  src/crimson/net/Errors.cc  51
-rw-r--r--  src/crimson/net/Errors.h  53
-rw-r--r--  src/crimson/net/Fwd.h  50
-rw-r--r--  src/crimson/net/Interceptor.h  165
-rw-r--r--  src/crimson/net/Messenger.cc  17
-rw-r--r--  src/crimson/net/Messenger.h  154
-rw-r--r--  src/crimson/net/Protocol.cc  323
-rw-r--r--  src/crimson/net/Protocol.h  173
-rw-r--r--  src/crimson/net/ProtocolV1.cc  1014
-rw-r--r--  src/crimson/net/ProtocolV1.h  137
-rw-r--r--  src/crimson/net/ProtocolV2.cc  2139
-rw-r--r--  src/crimson/net/ProtocolV2.h  225
-rw-r--r--  src/crimson/net/Socket.cc  276
-rw-r--r--  src/crimson/net/Socket.h  268
-rw-r--r--  src/crimson/net/SocketConnection.cc  150
-rw-r--r--  src/crimson/net/SocketConnection.h  106
-rw-r--r--  src/crimson/net/SocketMessenger.cc  351
-rw-r--r--  src/crimson/net/SocketMessenger.h  122
-rw-r--r--  src/crimson/net/chained_dispatchers.cc  93
-rw-r--r--  src/crimson/net/chained_dispatchers.h  36
-rw-r--r--  src/crimson/os/CMakeLists.txt  15
-rw-r--r--  src/crimson/os/alienstore/CMakeLists.txt  76
-rw-r--r--  src/crimson/os/alienstore/alien_collection.h  26
-rw-r--r--  src/crimson/os/alienstore/alien_store.cc  575
-rw-r--r--  src/crimson/os/alienstore/alien_store.h  125
-rw-r--r--  src/crimson/os/alienstore/thread_pool.cc  80
-rw-r--r--  src/crimson/os/alienstore/thread_pool.h  132
-rw-r--r--  src/crimson/os/cyanstore/CMakeLists.txt  7
-rw-r--r--  src/crimson/os/cyanstore/cyan_collection.cc  76
-rw-r--r--  src/crimson/os/cyanstore/cyan_collection.h  51
-rw-r--r--  src/crimson/os/cyanstore/cyan_object.cc  89
-rw-r--r--  src/crimson/os/cyanstore/cyan_object.h  45
-rw-r--r--  src/crimson/os/cyanstore/cyan_store.cc  835
-rw-r--r--  src/crimson/os/cyanstore/cyan_store.h  185
-rw-r--r--  src/crimson/os/futurized_collection.h  37
-rw-r--r--  src/crimson/os/futurized_store.cc  22
-rw-r--r--  src/crimson/os/futurized_store.h  167
-rw-r--r--  src/crimson/os/seastore/CMakeLists.txt  37
-rw-r--r--  src/crimson/os/seastore/cache.cc  541
-rw-r--r--  src/crimson/os/seastore/cache.h  516
-rw-r--r--  src/crimson/os/seastore/cached_extent.cc  96
-rw-r--r--  src/crimson/os/seastore/cached_extent.h  659
-rw-r--r--  src/crimson/os/seastore/extentmap_manager.cc  32
-rw-r--r--  src/crimson/os/seastore/extentmap_manager.h  124
-rw-r--r--  src/crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.cc  118
-rw-r--r--  src/crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.h  64
-rw-r--r--  src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node.h  145
-rw-r--r--  src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.cc  373
-rw-r--r--  src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.h  281
-rw-r--r--  src/crimson/os/seastore/journal.cc  756
-rw-r--r--  src/crimson/os/seastore/journal.h  405
-rw-r--r--  src/crimson/os/seastore/lba_manager.cc  17
-rw-r--r--  src/crimson/os/seastore/lba_manager.h  207
-rw-r--r--  src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc  580
-rw-r--r--  src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h  188
-rw-r--r--  src/crimson/os/seastore/lba_manager/btree/btree_range_pin.cc  153
-rw-r--r--  src/crimson/os/seastore/lba_manager/btree/btree_range_pin.h  274
-rw-r--r--  src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h  269
-rw-r--r--  src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.cc  701
-rw-r--r--  src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h  555
-rw-r--r--  src/crimson/os/seastore/onode.cc  44
-rw-r--r--  src/crimson/os/seastore/onode.h  48
-rw-r--r--  src/crimson/os/seastore/onode_manager.h  57
-rw-r--r--  src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.cc  71
-rw-r--r--  src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.h  65
-rw-r--r--  src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.cc  188
-rw-r--r--  src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.h  70
-rw-r--r--  src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.cc  567
-rw-r--r--  src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.h  942
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/fwd.h  93
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node.cc  809
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node.h  476
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h  42
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h  413
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.cc  35
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h  86
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h  156
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.cc  88
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h  126
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h  67
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.cc  39
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h  80
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.cc  76
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h  197
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h  613
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node_layout_replayable.h  75
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h  64
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc  165
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h  180
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc  32
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h  846
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc  318
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h  226
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc  96
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h  366
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h  2186
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h  411
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc  208
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h  341
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/super.cc  26
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/super.h  143
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/tree.cc  235
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/tree.h  119
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/tree_types.h  125
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h  333
-rw-r--r--  src/crimson/os/seastore/root_block.h  109
-rw-r--r--  src/crimson/os/seastore/seastore.cc  532
-rw-r--r--  src/crimson/os/seastore/seastore.h  181
-rw-r--r--  src/crimson/os/seastore/seastore_types.cc  105
-rw-r--r--  src/crimson/os/seastore/seastore_types.h  369
-rw-r--r--  src/crimson/os/seastore/segment_cleaner.cc  340
-rw-r--r--  src/crimson/os/seastore/segment_cleaner.h  691
-rw-r--r--  src/crimson/os/seastore/segment_manager.h  128
-rw-r--r--  src/crimson/os/seastore/segment_manager/block.cc  402
-rw-r--r--  src/crimson/os/seastore/segment_manager/block.h  222
-rw-r--r--  src/crimson/os/seastore/segment_manager/ephemeral.cc  226
-rw-r--r--  src/crimson/os/seastore/segment_manager/ephemeral.h  111
-rw-r--r--  src/crimson/os/seastore/transaction.h  145
-rw-r--r--  src/crimson/os/seastore/transaction_manager.cc  306
-rw-r--r--  src/crimson/os/seastore/transaction_manager.h  296
-rw-r--r--  src/crimson/osd/CMakeLists.txt  57
-rw-r--r--  src/crimson/osd/acked_peers.h  14
-rw-r--r--  src/crimson/osd/backfill_facades.h  73
-rw-r--r--  src/crimson/osd/backfill_state.cc  556
-rw-r--r--  src/crimson/osd/backfill_state.h  382
-rw-r--r--  src/crimson/osd/ec_backend.cc  35
-rw-r--r--  src/crimson/osd/ec_backend.h  38
-rw-r--r--  src/crimson/osd/exceptions.h  46
-rw-r--r--  src/crimson/osd/heartbeat.cc  680
-rw-r--r--  src/crimson/osd/heartbeat.h  455
-rw-r--r--  src/crimson/osd/main.cc  230
-rw-r--r--  src/crimson/osd/objclass.cc  484
-rw-r--r--  src/crimson/osd/object_context.cc  34
-rw-r--r--  src/crimson/osd/object_context.h  189
-rw-r--r--  src/crimson/osd/ops_executer.cc  980
-rw-r--r--  src/crimson/osd/ops_executer.h  283
-rw-r--r--  src/crimson/osd/osd.cc  1364
-rw-r--r--  src/crimson/osd/osd.h  250
-rw-r--r--  src/crimson/osd/osd_connection_priv.h  27
-rw-r--r--  src/crimson/osd/osd_meta.cc  99
-rw-r--r--  src/crimson/osd/osd_meta.h  56
-rw-r--r--  src/crimson/osd/osd_operation.cc  159
-rw-r--r--  src/crimson/osd/osd_operation.h  427
-rw-r--r--  src/crimson/osd/osd_operations/background_recovery.cc  140
-rw-r--r--  src/crimson/osd/osd_operations/background_recovery.h  126
-rw-r--r--  src/crimson/osd/osd_operations/client_request.cc  201
-rw-r--r--  src/crimson/osd/osd_operations/client_request.h  76
-rw-r--r--  src/crimson/osd/osd_operations/compound_peering_request.cc  170
-rw-r--r--  src/crimson/osd/osd_operations/compound_peering_request.h  40
-rw-r--r--  src/crimson/osd/osd_operations/osdop_params.h  27
-rw-r--r--  src/crimson/osd/osd_operations/peering_event.cc  173
-rw-r--r--  src/crimson/osd/osd_operations/peering_event.h  142
-rw-r--r--  src/crimson/osd/osd_operations/pg_advance_map.cc  97
-rw-r--r--  src/crimson/osd/osd_operations/pg_advance_map.h  50
-rw-r--r--  src/crimson/osd/osd_operations/recovery_subrequest.cc  29
-rw-r--r--  src/crimson/osd/osd_operations/recovery_subrequest.h  45
-rw-r--r--  src/crimson/osd/osd_operations/replicated_request.cc  74
-rw-r--r--  src/crimson/osd/osd_operations/replicated_request.h  58
-rw-r--r--  src/crimson/osd/osdmap_gate.cc  71
-rw-r--r--  src/crimson/osd/osdmap_gate.h  68
-rw-r--r--  src/crimson/osd/osdmap_service.h  20
-rw-r--r--  src/crimson/osd/pg.cc  1102
-rw-r--r--  src/crimson/osd/pg.h  704
-rw-r--r--  src/crimson/osd/pg_backend.cc  1171
-rw-r--r--  src/crimson/osd/pg_backend.h  235
-rw-r--r--  src/crimson/osd/pg_map.cc  77
-rw-r--r--  src/crimson/osd/pg_map.h  76
-rw-r--r--  src/crimson/osd/pg_meta.cc  108
-rw-r--r--  src/crimson/osd/pg_meta.h  23
-rw-r--r--  src/crimson/osd/pg_recovery.cc  550
-rw-r--r--  src/crimson/osd/pg_recovery.h  113
-rw-r--r--  src/crimson/osd/pg_recovery_listener.h  39
-rw-r--r--  src/crimson/osd/recovery_backend.cc  298
-rw-r--r--  src/crimson/osd/recovery_backend.h  203
-rw-r--r--  src/crimson/osd/replicated_backend.cc  144
-rw-r--r--  src/crimson/osd/replicated_backend.h  55
-rw-r--r--  src/crimson/osd/replicated_recovery_backend.cc  1076
-rw-r--r--  src/crimson/osd/replicated_recovery_backend.h  151
-rw-r--r--  src/crimson/osd/scheduler/mclock_scheduler.cc  165
-rw-r--r--  src/crimson/osd/scheduler/mclock_scheduler.h  130
-rw-r--r--  src/crimson/osd/scheduler/scheduler.cc  181
-rw-r--r--  src/crimson/osd/scheduler/scheduler.h  82
-rw-r--r--  src/crimson/osd/shard_services.cc  311
-rw-r--r--  src/crimson/osd/shard_services.h  215
-rw-r--r--  src/crimson/osd/state.h  79
-rw-r--r--  src/crimson/osd/watch.cc  169
-rw-r--r--  src/crimson/osd/watch.h  194
-rw-r--r--  src/crimson/tools/CMakeLists.txt  6
-rw-r--r--  src/crimson/tools/store-nbd.cc  621
232 files changed, 55919 insertions, 0 deletions
diff --git a/src/crimson/CMakeLists.txt b/src/crimson/CMakeLists.txt
new file mode 100644
index 000000000..26f729336
--- /dev/null
+++ b/src/crimson/CMakeLists.txt
@@ -0,0 +1,191 @@
+add_library(crimson::cflags INTERFACE IMPORTED)
+set(crimson_cflag_definitions "WITH_SEASTAR=1")
+# disable concepts to address https://github.com/boostorg/asio/issues/312
+if((CMAKE_CXX_COMPILER_ID STREQUAL GNU AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 10) OR
+ (CMAKE_CXX_COMPILER_ID STREQUAL Clang))
+ list(APPEND crimson_cflag_definitions
+ "BOOST_ASIO_DISABLE_CONCEPTS")
+endif()
+set_target_properties(crimson::cflags PROPERTIES
+ INTERFACE_COMPILE_DEFINITIONS "${crimson_cflag_definitions}"
+ INTERFACE_COMPILE_OPTIONS $<$<COMPILE_LANGUAGE:CXX>:-Wno-non-virtual-dtor>
+ INTERFACE_LINK_LIBRARIES Seastar::seastar)
+
+set(crimson_common_srcs
+ common/assert.cc
+ common/buffer_io.cc
+ common/config_proxy.cc
+ common/formatter.cc
+ common/perf_counters_collection.cc
+ common/log.cc
+ common/throttle.cc
+ common/tri_mutex.cc)
+
+# the specialized version of ceph-common, where
+# - the logging is sent to the Seastar backend
+# - and the template parameter of lock_policy is SINGLE
+add_library(crimson-common STATIC
+ ${PROJECT_SOURCE_DIR}/src/common/admin_socket_client.cc
+ ${PROJECT_SOURCE_DIR}/src/common/bit_str.cc
+ ${PROJECT_SOURCE_DIR}/src/common/bloom_filter.cc
+ ${PROJECT_SOURCE_DIR}/src/common/buffer.cc
+ ${PROJECT_SOURCE_DIR}/src/common/buffer_seastar.cc
+ ${PROJECT_SOURCE_DIR}/src/common/ceph_argparse.cc
+ ${PROJECT_SOURCE_DIR}/src/common/ceph_context.cc
+ ${PROJECT_SOURCE_DIR}/src/common/ceph_crypto.cc
+ ${PROJECT_SOURCE_DIR}/src/common/ceph_hash.cc
+ ${PROJECT_SOURCE_DIR}/src/common/ceph_time.cc
+ ${PROJECT_SOURCE_DIR}/src/common/ceph_strings.cc
+ ${PROJECT_SOURCE_DIR}/src/common/ceph_releases.cc
+ ${PROJECT_SOURCE_DIR}/src/common/cmdparse.cc
+ ${PROJECT_SOURCE_DIR}/src/common/common_init.cc
+ ${PROJECT_SOURCE_DIR}/src/common/compat.cc
+ ${PROJECT_SOURCE_DIR}/src/common/code_environment.cc
+ ${PROJECT_SOURCE_DIR}/src/common/config.cc
+ ${PROJECT_SOURCE_DIR}/src/common/config_values.cc
+ ${PROJECT_SOURCE_DIR}/src/common/dout.cc
+ ${PROJECT_SOURCE_DIR}/src/common/entity_name.cc
+ ${PROJECT_SOURCE_DIR}/src/common/environment.cc
+ ${PROJECT_SOURCE_DIR}/src/common/errno.cc
+ ${PROJECT_SOURCE_DIR}/src/common/escape.cc
+ ${PROJECT_SOURCE_DIR}/src/common/hex.cc
+ ${PROJECT_SOURCE_DIR}/src/common/fs_types.cc
+ ${PROJECT_SOURCE_DIR}/src/common/ceph_json.cc
+ ${PROJECT_SOURCE_DIR}/src/common/histogram.cc
+ ${PROJECT_SOURCE_DIR}/src/common/hobject.cc
+ ${PROJECT_SOURCE_DIR}/src/common/hostname.cc
+ ${PROJECT_SOURCE_DIR}/src/common/ipaddr.cc
+ ${PROJECT_SOURCE_DIR}/src/common/mempool.cc
+ ${PROJECT_SOURCE_DIR}/src/common/options.cc
+ ${PROJECT_SOURCE_DIR}/src/common/perf_counters.cc
+ ${PROJECT_SOURCE_DIR}/src/common/perf_histogram.cc
+ ${PROJECT_SOURCE_DIR}/src/common/page.cc
+ ${PROJECT_SOURCE_DIR}/src/common/pick_address.cc
+ ${PROJECT_SOURCE_DIR}/src/common/snap_types.cc
+ ${PROJECT_SOURCE_DIR}/src/common/signal.cc
+ ${PROJECT_SOURCE_DIR}/src/common/str_list.cc
+ ${PROJECT_SOURCE_DIR}/src/common/str_map.cc
+ ${PROJECT_SOURCE_DIR}/src/common/strtol.cc
+ ${PROJECT_SOURCE_DIR}/src/common/reverse.c
+ ${PROJECT_SOURCE_DIR}/src/common/types.cc
+ ${PROJECT_SOURCE_DIR}/src/common/utf8.c
+ ${PROJECT_SOURCE_DIR}/src/common/version.cc
+ ${PROJECT_SOURCE_DIR}/src/common/BackTrace.cc
+ ${PROJECT_SOURCE_DIR}/src/common/ConfUtils.cc
+ ${PROJECT_SOURCE_DIR}/src/common/DecayCounter.cc
+ ${PROJECT_SOURCE_DIR}/src/common/HTMLFormatter.cc
+ ${PROJECT_SOURCE_DIR}/src/common/Formatter.cc
+ ${PROJECT_SOURCE_DIR}/src/common/Graylog.cc
+ ${PROJECT_SOURCE_DIR}/src/common/ostream_temp.cc
+ ${PROJECT_SOURCE_DIR}/src/common/LogEntry.cc
+ ${PROJECT_SOURCE_DIR}/src/common/TextTable.cc
+ ${PROJECT_SOURCE_DIR}/src/common/Thread.cc
+ ${PROJECT_SOURCE_DIR}/src/common/PluginRegistry.cc
+ ${PROJECT_SOURCE_DIR}/src/common/RefCountedObj.cc
+ ${PROJECT_SOURCE_DIR}/src/common/util.cc
+ ${PROJECT_SOURCE_DIR}/src/crush/builder.c
+ ${PROJECT_SOURCE_DIR}/src/crush/mapper.c
+ ${PROJECT_SOURCE_DIR}/src/crush/crush.c
+ ${PROJECT_SOURCE_DIR}/src/crush/hash.c
+ ${PROJECT_SOURCE_DIR}/src/crush/CrushWrapper.cc
+ ${PROJECT_SOURCE_DIR}/src/crush/CrushCompiler.cc
+ ${PROJECT_SOURCE_DIR}/src/crush/CrushTester.cc
+ ${PROJECT_SOURCE_DIR}/src/crush/CrushLocation.cc
+ ${PROJECT_SOURCE_DIR}/src/global/global_context.cc
+ ${PROJECT_SOURCE_DIR}/src/global/pidfile.cc
+ ${PROJECT_SOURCE_DIR}/src/librbd/Features.cc
+ ${PROJECT_SOURCE_DIR}/src/log/Log.cc
+ ${PROJECT_SOURCE_DIR}/src/mgr/ServiceMap.cc
+ ${PROJECT_SOURCE_DIR}/src/mds/inode_backtrace.cc
+ ${PROJECT_SOURCE_DIR}/src/mds/mdstypes.cc
+ ${PROJECT_SOURCE_DIR}/src/mds/cephfs_features.cc
+ ${PROJECT_SOURCE_DIR}/src/mds/FSMap.cc
+ ${PROJECT_SOURCE_DIR}/src/mds/FSMapUser.cc
+ ${PROJECT_SOURCE_DIR}/src/mds/MDSMap.cc
+ ${PROJECT_SOURCE_DIR}/src/msg/msg_types.cc
+ ${PROJECT_SOURCE_DIR}/src/msg/Message.cc
+ ${PROJECT_SOURCE_DIR}/src/mon/PGMap.cc
+ ${PROJECT_SOURCE_DIR}/src/mon/MonCap.cc
+ ${PROJECT_SOURCE_DIR}/src/mon/MonMap.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/osd_types.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/ECMsgTypes.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/HitSet.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/OSDMap.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/PGPeeringEvent.cc
+ ${PROJECT_SOURCE_DIR}/src/xxHash/xxhash.c
+ ${crimson_common_srcs}
+ $<TARGET_OBJECTS:common_mountcephfs_objs>)
+
+target_compile_definitions(crimson-common PRIVATE
+ "CEPH_LIBDIR=\"${CMAKE_INSTALL_FULL_LIBDIR}\""
+ "CEPH_PKGLIBDIR=\"${CEPH_INSTALL_FULL_PKGLIBDIR}\""
+ "CEPH_DATADIR=\"${CEPH_INSTALL_DATADIR}\"")
+
+set(crimson_common_deps
+ Boost::iostreams
+ Boost::random
+ json_spirit)
+
+if(WITH_JAEGER)
+ include_directories(SYSTEM ${CMAKE_BINARY_DIR}/external/include)
+ list(APPEND crimson_common_deps jaeger-base)
+endif()
+
+if(NOT WITH_SYSTEM_BOOST)
+ list(APPEND crimson_common_deps ${ZLIB_LIBRARIES})
+endif()
+
+target_link_libraries(crimson-common
+ PUBLIC
+ crimson::cflags
+ PRIVATE
+ crc32
+ ${crimson_common_deps}
+ OpenSSL::Crypto)
+
+set(crimson_auth_srcs
+ auth/KeyRing.cc
+ ${PROJECT_SOURCE_DIR}/src/auth/AuthClientHandler.cc
+ ${PROJECT_SOURCE_DIR}/src/auth/AuthMethodList.cc
+ ${PROJECT_SOURCE_DIR}/src/auth/AuthRegistry.cc
+ ${PROJECT_SOURCE_DIR}/src/auth/AuthSessionHandler.cc
+ ${PROJECT_SOURCE_DIR}/src/auth/Crypto.cc
+ ${PROJECT_SOURCE_DIR}/src/auth/KeyRing.cc
+ ${PROJECT_SOURCE_DIR}/src/auth/RotatingKeyRing.cc
+ ${PROJECT_SOURCE_DIR}/src/auth/cephx/CephxAuthorizeHandler.cc
+ ${PROJECT_SOURCE_DIR}/src/auth/cephx/CephxClientHandler.cc
+ ${PROJECT_SOURCE_DIR}/src/auth/cephx/CephxProtocol.cc
+ ${PROJECT_SOURCE_DIR}/src/auth/cephx/CephxSessionHandler.cc
+ ${PROJECT_SOURCE_DIR}/src/auth/none/AuthNoneAuthorizeHandler.cc)
+set(crimson_mgr_srcs
+ mgr/client.cc)
+set(crimson_mon_srcs
+ mon/MonClient.cc
+ ${PROJECT_SOURCE_DIR}/src/mon/MonSub.cc)
+set(crimson_net_srcs
+ ${PROJECT_SOURCE_DIR}/src/msg/async/crypto_onwire.cc
+ ${PROJECT_SOURCE_DIR}/src/msg/async/frames_v2.cc
+ net/Errors.cc
+ net/Messenger.cc
+ net/SocketConnection.cc
+ net/SocketMessenger.cc
+ net/Socket.cc
+ net/Protocol.cc
+ net/ProtocolV1.cc
+ net/ProtocolV2.cc
+ net/chained_dispatchers.cc)
+add_library(crimson STATIC
+ ${crimson_auth_srcs}
+ ${crimson_mgr_srcs}
+ ${crimson_mon_srcs}
+ ${crimson_net_srcs})
+target_compile_options(crimson PUBLIC
+ "-ftemplate-backtrace-limit=0")
+target_link_libraries(crimson
+ PUBLIC
+ crimson-common
+ crimson::cflags)
+add_subdirectory(admin)
+add_subdirectory(os)
+add_subdirectory(osd)
+add_subdirectory(tools)
diff --git a/src/crimson/admin/CMakeLists.txt b/src/crimson/admin/CMakeLists.txt
new file mode 100644
index 000000000..aa0771735
--- /dev/null
+++ b/src/crimson/admin/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_library(crimson-admin STATIC
+ admin_socket.cc
+ osd_admin.cc
+ pg_commands.cc)
+
+target_link_libraries(crimson-admin
+ crimson::cflags
+ Boost::MPL)
diff --git a/src/crimson/admin/admin_socket.cc b/src/crimson/admin/admin_socket.cc
new file mode 100644
index 000000000..852185af1
--- /dev/null
+++ b/src/crimson/admin/admin_socket.cc
@@ -0,0 +1,521 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/admin/admin_socket.h"
+
+#include <boost/algorithm/string/join.hpp>
+#include <fmt/format.h>
+#include <seastar/net/api.hh>
+#include <seastar/net/inet_address.hh>
+#include <seastar/core/future-util.hh>
+#include <seastar/core/reactor.hh>
+#include <seastar/core/sleep.hh>
+#include <seastar/core/thread.hh>
+#include <seastar/util/std-compat.hh>
+
+#include "common/version.h"
+#include "messages/MCommand.h"
+#include "messages/MCommandReply.h"
+#include "crimson/common/log.h"
+#include "crimson/net/Socket.h"
+
+using namespace crimson::common;
+
+namespace {
+seastar::logger& logger()
+{
+ return crimson::get_logger(ceph_subsys_osd);
+}
+} // namespace
+
+namespace crimson::admin {
+
+tell_result_t::tell_result_t(int ret, std::string&& err)
+ : ret{ret}, err(std::move(err))
+{}
+
+tell_result_t::tell_result_t(int ret, std::string&& err, ceph::bufferlist&& out)
+ : ret{ret}, err(std::move(err)), out(std::move(out))
+{}
+
+tell_result_t::tell_result_t(std::unique_ptr<Formatter> formatter)
+{
+ formatter->flush(out);
+}
+
+seastar::future<>
+AdminSocket::register_command(std::unique_ptr<AdminSocketHook>&& hook)
+{
+ return seastar::with_lock(servers_tbl_rwlock,
+ [this, hook = std::move(hook)]() mutable {
+ auto prefix = hook->prefix;
+ auto [it, added] = hooks.emplace(prefix, std::move(hook));
+    // was this command prefix already registered?
+ assert(added);
+ if (added) {
+ logger().info("register_command(): {})", it->first);
+ }
+ return seastar::now();
+ });
+}
+
+/*
+ * Note: parse_cmd() is executed with servers_tbl_rwlock held as shared
+ */
+auto AdminSocket::parse_cmd(const std::vector<std::string>& cmd)
+ -> std::variant<parsed_command_t, tell_result_t>
+{
+ // preliminaries:
+ // - create the formatter specified by the cmd parameters
+ // - locate the "op-code" string (the 'prefix' segment)
+ // - prepare for command parameters extraction via cmdmap_t
+ cmdmap_t cmdmap;
+ ceph::bufferlist out;
+
+ try {
+ stringstream errss;
+ // note that cmdmap_from_json() may throw on syntax issues
+ if (!cmdmap_from_json(cmd, &cmdmap, errss)) {
+ logger().error("{}: incoming command error: {}", __func__, errss.str());
+ out.append("error:"s);
+ out.append(errss.str());
+ return tell_result_t{-EINVAL, "invalid json", std::move(out)};
+ }
+ } catch (const std::runtime_error& e) {
+ logger().error("{}: incoming command syntax: {}", __func__, cmd);
+ out.append(string{e.what()});
+ return tell_result_t{-EINVAL, "invalid json", std::move(out)};
+ }
+
+ string format;
+ string prefix;
+ try {
+ cmd_getval(cmdmap, "format", format);
+ cmd_getval(cmdmap, "prefix", prefix);
+ } catch (const bad_cmd_get& e) {
+ logger().error("{}: invalid syntax: {}", __func__, cmd);
+ out.append(string{e.what()});
+ return tell_result_t{-EINVAL, "invalid json", std::move(out)};
+ }
+
+ // match the incoming op-code to one of the registered APIs
+ if (auto found = hooks.find(prefix); found != hooks.end()) {
+ return parsed_command_t{ cmdmap, format, *found->second };
+ } else {
+ return tell_result_t{-EINVAL,
+ fmt::format("unknown command '{}'", prefix),
+ std::move(out)};
+ }
+}
+
+seastar::future<> AdminSocket::finalize_response(
+ seastar::output_stream<char>& out, ceph::bufferlist&& msgs)
+{
+ string outbuf_cont = msgs.to_str();
+ if (outbuf_cont.empty()) {
+ outbuf_cont = " {} ";
+ }
+ uint32_t response_length = htonl(outbuf_cont.length());
+ logger().info("asok response length: {}", outbuf_cont.length());
+
+ return out.write((char*)&response_length, sizeof(uint32_t))
+ .then([&out, outbuf_cont] { return out.write(outbuf_cont.c_str()); });
+}
+
+
+seastar::future<> AdminSocket::handle_command(crimson::net::ConnectionRef conn,
+ boost::intrusive_ptr<MCommand> m)
+{
+ return execute_command(m->cmd, std::move(m->get_data())).then(
+ [conn, tid=m->get_tid()](auto result) {
+ auto [ret, err, out] = std::move(result);
+ auto reply = make_message<MCommandReply>(ret, err);
+ reply->set_tid(tid);
+ reply->set_data(out);
+ return conn->send(reply);
+ });
+}
+
+seastar::future<> AdminSocket::execute_line(std::string cmdline,
+ seastar::output_stream<char>& out)
+{
+ return execute_command({cmdline}, {}).then([&out, this](auto result) {
+ auto [ret, stderr, stdout] = std::move(result);
+ if (ret < 0) {
+ stdout.append(fmt::format("ERROR: {}\n", cpp_strerror(ret)));
+ stdout.append(stderr);
+ }
+ return finalize_response(out, std::move(stdout));
+ });
+}
+
+auto AdminSocket::execute_command(const std::vector<std::string>& cmd,
+ ceph::bufferlist&& buf)
+ -> seastar::future<tell_result_t>
+{
+ return seastar::with_shared(servers_tbl_rwlock,
+ [cmd, buf=std::move(buf), this]() mutable {
+ auto maybe_parsed = parse_cmd(cmd);
+ if (auto parsed = std::get_if<parsed_command_t>(&maybe_parsed); parsed) {
+ stringstream os;
+ string desc{parsed->hook.desc};
+ if (!validate_cmd(nullptr, desc, parsed->params, os)) {
+ logger().error("AdminSocket::execute_command: "
+ "failed to validate '{}': {}", cmd, os.str());
+ ceph::bufferlist out;
+ out.append(os);
+ return seastar::make_ready_future<tell_result_t>(
+ tell_result_t{-EINVAL, "invalid command json", std::move(out)});
+ }
+ return parsed->hook.call(parsed->params, parsed->format, std::move(buf));
+ } else {
+ auto& result = std::get<tell_result_t>(maybe_parsed);
+ return seastar::make_ready_future<tell_result_t>(std::move(result));
+ }
+ });
+}
+
+// an input_stream consumer that reads the buffer into a std::string up to the
+// first '\0', which indicates the end of the command
+struct line_consumer {
+ using tmp_buf = seastar::temporary_buffer<char>;
+ using consumption_result_type =
+ typename seastar::input_stream<char>::consumption_result_type;
+
+ seastar::future<consumption_result_type> operator()(tmp_buf&& buf) {
+ size_t consumed = 0;
+ for (auto c : buf) {
+ consumed++;
+ if (c == '\0') {
+ buf.trim_front(consumed);
+ return seastar::make_ready_future<consumption_result_type>(
+ consumption_result_type::stop_consuming_type(std::move(buf)));
+ } else {
+ line.push_back(c);
+ }
+ }
+ return seastar::make_ready_future<consumption_result_type>(
+ seastar::continue_consuming{});
+ }
+ std::string line;
+};
+
+seastar::future<> AdminSocket::handle_client(seastar::input_stream<char>& in,
+ seastar::output_stream<char>& out)
+{
+ auto consumer = seastar::make_shared<line_consumer>();
+ return in.consume(*consumer).then([consumer, &out, this] {
+ logger().debug("AdminSocket::handle_client: incoming asok string: {}",
+ consumer->line);
+ return execute_line(consumer->line, out);
+ }).then([&out] {
+ return out.flush();
+ }).finally([&out] {
+ return out.close();
+ }).then([&in] {
+ return in.close();
+ }).handle_exception([](auto ep) {
+ logger().debug("exception on {}: {}", __func__, ep);
+ });
+}
+
+seastar::future<> AdminSocket::start(const std::string& path)
+{
+ if (path.empty()) {
+ logger().error(
+ "{}: Admin Socket socket path missing from the configuration", __func__);
+ return seastar::now();
+ }
+
+ logger().debug("{}: asok socket path={}", __func__, path);
+ auto sock_path = seastar::socket_address{ seastar::unix_domain_addr{ path } };
+ try {
+ server_sock = seastar::engine().listen(sock_path);
+ } catch (const std::system_error& e) {
+ logger().error("{}: unable to listen({}): {}", __func__, path, e.what());
+ server_sock.reset();
+ return seastar::make_ready_future<>();
+ }
+ // listen in background
+ task = seastar::do_until(
+ [this] { return stop_gate.is_closed(); },
+ [this] {
+ return seastar::with_gate(stop_gate, [this] {
+ assert(!connected_sock.has_value());
+ return server_sock->accept().then([this](seastar::accept_result acc) {
+ connected_sock = std::move(acc.connection);
+ return seastar::do_with(connected_sock->input(),
+ connected_sock->output(),
+ [this](auto& input, auto& output) mutable {
+ return handle_client(input, output);
+ }).finally([this] {
+ assert(connected_sock.has_value());
+ connected_sock.reset();
+ });
+ }).handle_exception([this](auto ep) {
+ if (!stop_gate.is_closed()) {
+ logger().error("AdminSocket: terminated: {}", ep);
+ }
+ });
+ });
+ }).finally([path] {
+ return seastar::remove_file(path);
+ });
+ return seastar::make_ready_future<>();
+}
+
+seastar::future<> AdminSocket::stop()
+{
+ if (!server_sock) {
+ return seastar::now();
+ }
+ server_sock->abort_accept();
+ if (connected_sock) {
+ connected_sock->shutdown_input();
+ connected_sock->shutdown_output();
+ }
+ return stop_gate.close().then([this] {
+ assert(task.has_value());
+ return task->then([] {
+ logger().info("AdminSocket: stopped");
+ return seastar::now();
+ });
+ });
+}
+
+/////////////////////////////////////////
+// the internal hooks
+/////////////////////////////////////////
+
+class VersionHook final : public AdminSocketHook {
+ public:
+ VersionHook()
+ : AdminSocketHook{"version", "", "get ceph version"}
+ {}
+ seastar::future<tell_result_t> call(const cmdmap_t&,
+ std::string_view format,
+ ceph::bufferlist&&) const final
+ {
+ unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")};
+ f->open_object_section("version");
+ f->dump_string("version", ceph_version_to_str());
+ f->dump_string("release", ceph_release_to_str());
+ f->dump_string("release_type", ceph_release_type());
+ f->close_section();
+ return seastar::make_ready_future<tell_result_t>(std::move(f));
+ }
+};
+
+/**
+ Note that the git_version command is expected to return a 'version' JSON
+ segment.
+*/
+class GitVersionHook final : public AdminSocketHook {
+ public:
+ GitVersionHook()
+ : AdminSocketHook{"git_version", "", "get git sha1"}
+ {}
+ seastar::future<tell_result_t> call(const cmdmap_t&,
+ std::string_view format,
+ ceph::bufferlist&&) const final
+ {
+ unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")};
+ f->open_object_section("version");
+ f->dump_string("git_version", git_version_to_str());
+ f->close_section();
+ return seastar::make_ready_future<tell_result_t>(std::move(f));
+ }
+};
+
+class HelpHook final : public AdminSocketHook {
+ const AdminSocket& m_as;
+
+ public:
+ explicit HelpHook(const AdminSocket& as) :
+ AdminSocketHook{"help", "", "list available commands"},
+ m_as{as}
+ {}
+
+ seastar::future<tell_result_t> call(const cmdmap_t&,
+ std::string_view format,
+ ceph::bufferlist&&) const final
+ {
+ return seastar::with_shared(m_as.servers_tbl_rwlock,
+ [format, this] {
+ unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")};
+ f->open_object_section("help");
+ for (const auto& [prefix, hook] : m_as) {
+ if (!hook->help.empty()) {
+ f->dump_string(prefix.data(), hook->help);
+ }
+ }
+ f->close_section();
+ return seastar::make_ready_future<tell_result_t>(std::move(f));
+ });
+ }
+};
+
+class GetdescsHook final : public AdminSocketHook {
+ const AdminSocket& m_as;
+
+ public:
+ explicit GetdescsHook(const AdminSocket& as) :
+ AdminSocketHook{"get_command_descriptions",
+ "",
+ "list available commands"},
+ m_as{ as } {}
+
+ seastar::future<tell_result_t> call(const cmdmap_t& cmdmap,
+ std::string_view format,
+ ceph::bufferlist&&) const final
+ {
+ return seastar::with_shared(m_as.servers_tbl_rwlock, [format, this] {
+ unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")};
+ int cmdnum = 0;
+ f->open_object_section("command_descriptions");
+ for (const auto& [prefix, hook] : m_as) {
+ auto secname = fmt::format("cmd {:>03}", cmdnum);
+ auto cmd = fmt::format("{} {}", hook->prefix, hook->desc);
+ dump_cmd_and_help_to_json(f.get(), CEPH_FEATURES_ALL, secname,
+ cmd, std::string{hook->help});
+ cmdnum++;
+ }
+ f->close_section();
+ return seastar::make_ready_future<tell_result_t>(std::move(f));
+ });
+ }
+};
+
+class InjectArgsHook final : public AdminSocketHook {
+public:
+ InjectArgsHook()
+ : AdminSocketHook{"injectargs",
+ "name=injected_args,type=CephString,n=N",
+ "inject configuration arguments into running daemon"}
+ {}
+ seastar::future<tell_result_t> call(const cmdmap_t& cmdmap,
+ std::string_view format,
+ ceph::bufferlist&&) const final
+ {
+ std::vector<std::string> argv;
+ if (!cmd_getval(cmdmap, "injected_args", argv)) {
+ return seastar::make_ready_future<tell_result_t>();
+ }
+ const std::string args = boost::algorithm::join(argv, " ");
+ return local_conf().inject_args(args).then([] {
+ return seastar::make_ready_future<tell_result_t>();
+ }).handle_exception_type([] (const std::invalid_argument& e) {
+ return seastar::make_ready_future<tell_result_t>(
+ tell_result_t{-EINVAL, e.what()});
+ });
+ }
+};
+
+/**
+ * listing the configuration values
+ */
+class ConfigShowHook : public AdminSocketHook {
+public:
+ ConfigShowHook() :
+ AdminSocketHook{"config show",
+ "",
+ "dump current config settings"}
+ {}
+ seastar::future<tell_result_t> call(const cmdmap_t&,
+ std::string_view format,
+ ceph::bufferlist&& input) const final
+ {
+ unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")};
+ f->open_object_section("config_show");
+ local_conf().show_config(f.get());
+ f->close_section();
+ return seastar::make_ready_future<tell_result_t>(std::move(f));
+ }
+};
+
+/**
+ * fetching the value of a specific configuration item
+ */
+class ConfigGetHook : public AdminSocketHook {
+public:
+ ConfigGetHook() :
+ AdminSocketHook("config get",
+ "name=var,type=CephString",
+ "config get <field>: get the config value")
+ {}
+ seastar::future<tell_result_t> call(const cmdmap_t& cmdmap,
+ std::string_view format,
+ ceph::bufferlist&& input) const final
+ {
+ std::string var;
+ [[maybe_unused]] bool found = cmd_getval(cmdmap, "var", var);
+ assert(found);
+ std::string conf_val;
+ if (int r = local_conf().get_val(var, &conf_val); r < 0) {
+ return seastar::make_ready_future<tell_result_t>(
+ tell_result_t{r, fmt::format("error getting {}: {}",
+ var, cpp_strerror(r))});
+ }
+ unique_ptr<Formatter> f{Formatter::create(format,
+ "json-pretty",
+ "json-pretty")};
+ f->open_object_section("config_get");
+ f->dump_string(var, conf_val);
+ f->close_section();
+ return seastar::make_ready_future<tell_result_t>(std::move(f));
+ }
+};
+
+/**
+ * setting the value of a specific configuration item (an example:
+ * {"prefix": "config set", "var":"debug_osd", "val": ["30/20"]} )
+ */
+class ConfigSetHook : public AdminSocketHook {
+public:
+ ConfigSetHook()
+ : AdminSocketHook("config set",
+ "name=var,type=CephString "
+ "name=val,type=CephString,n=N",
+ "config set <field> <val> [<val> ...]: set a config variable")
+ {}
+ seastar::future<tell_result_t> call(const cmdmap_t& cmdmap,
+ std::string_view format,
+ ceph::bufferlist&&) const final
+ {
+ std::string var;
+ std::vector<std::string> new_val;
+ cmd_getval(cmdmap, "var", var);
+ cmd_getval(cmdmap, "val", new_val);
+ // val may be multiple words
+ const std::string joined_values = boost::algorithm::join(new_val, " ");
+ return local_conf().set_val(var, joined_values).then([format] {
+ unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")};
+ f->open_object_section("config_set");
+ f->dump_string("success", "");
+ f->close_section();
+ return seastar::make_ready_future<tell_result_t>(std::move(f));
+ }).handle_exception_type([](std::invalid_argument& e) {
+ return seastar::make_ready_future<tell_result_t>(
+ tell_result_t{-EINVAL, e.what()});
+ });
+ }
+};
+
+/// the hooks that are served directly by the admin_socket server
+seastar::future<> AdminSocket::register_admin_commands()
+{
+ return seastar::when_all_succeed(
+ register_command(std::make_unique<VersionHook>()),
+ register_command(std::make_unique<GitVersionHook>()),
+ register_command(std::make_unique<HelpHook>(*this)),
+ register_command(std::make_unique<GetdescsHook>(*this)),
+ register_command(std::make_unique<ConfigGetHook>()),
+ register_command(std::make_unique<ConfigSetHook>()),
+ register_command(std::make_unique<ConfigShowHook>()),
+ register_command(std::make_unique<InjectArgsHook>())
+ ).then_unpack([] {
+ return seastar::now();
+ });
+}
+
+} // namespace crimson::admin
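
A hypothetical usage sketch (not part of this change): the hook interface defined in this file can be exercised by deriving from AdminSocketHook and registering the result with AdminSocket::register_command(). The PingHook name below is invented for illustration and relies only on the interfaces shown above.

// Illustration only: a hypothetical hook built on the interfaces above.
namespace crimson::admin {

class PingHook final : public AdminSocketHook {
 public:
  PingHook() : AdminSocketHook{"ping", "", "reply with pong"} {}
  seastar::future<tell_result_t> call(const cmdmap_t&,
                                      std::string_view,
                                      ceph::bufferlist&&) const final
  {
    ceph::bufferlist out;
    out.append("pong");
    return seastar::make_ready_future<tell_result_t>(
      tell_result_t{0, std::string{}, std::move(out)});
  }
};

// registration, e.g. next to AdminSocket::register_admin_commands():
//   admin_socket->register_command(std::make_unique<PingHook>());

} // namespace crimson::admin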
diff --git a/src/crimson/admin/admin_socket.h b/src/crimson/admin/admin_socket.h
new file mode 100644
index 000000000..a842b62a2
--- /dev/null
+++ b/src/crimson/admin/admin_socket.h
@@ -0,0 +1,201 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+/**
+  The Crimson version of src/common/admin_socket.h
+
+ Note: assumed to be running on a single core.
+*/
+#include <map>
+#include <string>
+#include <string_view>
+
+#include <seastar/core/future.hh>
+#include <seastar/core/gate.hh>
+#include <seastar/core/iostream.hh>
+#include <seastar/core/shared_mutex.hh>
+#include <seastar/core/shared_ptr.hh>
+#include <seastar/net/api.hh>
+
+#include "common/cmdparse.h"
+#include "include/buffer.h"
+#include "crimson/net/Fwd.h"
+
+using namespace std::literals;
+
+class MCommand;
+
+namespace crimson::admin {
+
+class AdminSocket;
+
+struct tell_result_t {
+ int ret = 0;
+ std::string err;
+ ceph::bufferlist out;
+ tell_result_t() = default;
+ tell_result_t(int ret, std::string&& err);
+ tell_result_t(int ret, std::string&& err, ceph::bufferlist&& out);
+ /**
+ * create a \c tell_result_t indicating the successful completion
+ * of command
+ *
+ * \param formatter the content of formatter will be flushed to the
+ * output buffer
+ */
+ tell_result_t(std::unique_ptr<Formatter> formatter);
+};
+
+/**
+ * An abstract class to be inherited by implementations of asock hooks
+ */
+class AdminSocketHook {
+ public:
+ AdminSocketHook(std::string_view prefix,
+ std::string_view desc,
+ std::string_view help) :
+ prefix{prefix}, desc{desc}, help{help}
+ {}
+ /**
+ * handle command defined by cmdmap
+ *
+ * \param cmdmap dictionary holding the named parameters
+ * \param format the expected format of the output
+ * \param input the binary input of the command
+ * \pre \c cmdmap should be validated with \c desc
+ * \retval an instance of \c tell_result_t
+   * \note a negative \c ret should be set to indicate that the hook failed to
+   *       fulfill the command, either because of invalid input or some other
+   *       failure. in that case, a brief reason for the failure should be
+   *       noted in \c err in the returned value
+ */
+ virtual seastar::future<tell_result_t> call(const cmdmap_t& cmdmap,
+ std::string_view format,
+ ceph::bufferlist&& input) const = 0;
+ virtual ~AdminSocketHook() {}
+ const std::string_view prefix;
+ const std::string_view desc;
+ const std::string_view help;
+};
+
+class AdminSocket : public seastar::enable_lw_shared_from_this<AdminSocket> {
+ public:
+ AdminSocket() = default;
+ ~AdminSocket() = default;
+
+ AdminSocket(const AdminSocket&) = delete;
+ AdminSocket& operator=(const AdminSocket&) = delete;
+ AdminSocket(AdminSocket&&) = delete;
+ AdminSocket& operator=(AdminSocket&&) = delete;
+
+ using hook_server_tag = const void*;
+
+ /**
+   * start listening on the admin socket and handle asok commands arriving
+   * over the socket in a background task.
+ */
+ seastar::future<> start(const std::string& path);
+
+ seastar::future<> stop();
+
+ /**
+ * register an admin socket hook
+ *
+ * Commands (APIs) are registered under a command string. Incoming
+ * commands are split by spaces and matched against the longest
+ * registered command. For example, if 'foo' and 'foo bar' are
+ * registered, and an incoming command is 'foo bar baz', it is
+ * matched with 'foo bar', while 'foo fud' will match 'foo'.
+ *
+ * \param hook a hook which includes its identifying command string, the
+ * expected call syntax, and some help text.
+ *
+   * A note regarding the help text: if empty, the command will not be
+   * included in the 'help' output.
+ */
+ seastar::future<> register_command(std::unique_ptr<AdminSocketHook>&& hook);
+
+ /**
+ * Registering the APIs that are served directly by the admin_socket server.
+ */
+ seastar::future<> register_admin_commands();
+ /**
+ * handle a command message by replying an MCommandReply with the same tid
+ *
+ * \param conn connection over which the incoming command message is received
+ * \param m message carrying the command vector and optional input buffer
+ */
+ seastar::future<> handle_command(crimson::net::ConnectionRef conn,
+ boost::intrusive_ptr<MCommand> m);
+
+private:
+ /**
+ * the result of analyzing an incoming command, and locating it in
+ * the registered APIs collection.
+ */
+ struct parsed_command_t {
+ cmdmap_t params;
+ std::string format;
+ const AdminSocketHook& hook;
+ };
+ // and the shorthand:
+ seastar::future<> handle_client(seastar::input_stream<char>& inp,
+ seastar::output_stream<char>& out);
+
+ seastar::future<> execute_line(std::string cmdline,
+ seastar::output_stream<char>& out);
+
+ seastar::future<> finalize_response(seastar::output_stream<char>& out,
+ ceph::bufferlist&& msgs);
+
+ seastar::future<tell_result_t> execute_command(const std::vector<std::string>& cmd,
+ ceph::bufferlist&& buf);
+
+ std::optional<seastar::future<>> task;
+ std::optional<seastar::server_socket> server_sock;
+ std::optional<seastar::connected_socket> connected_sock;
+
+ /**
+ * stopping incoming ASOK requests at shutdown
+ */
+ seastar::gate stop_gate;
+
+ /**
+ * parse the incoming command vector, find a registered hook by looking up by
+ * its prefix, perform sanity checks on the parsed parameters with the hook's
+ * command description
+ *
+   * \param cmd a vector of strings representing the command
+   * \retval on success, a \c parsed_command_t is returned; otherwise, a
+   *         \c tell_result_t with a detailed error message is returned
+ */
+ std::variant<parsed_command_t, tell_result_t>
+ parse_cmd(const std::vector<std::string>& cmd);
+
+ /**
+   * The hooks table is protected by a rw-lock, acquired exclusively
+   * only when registering or removing a hook.
+   * The lock is held in shared mode when executing any hook.
+ */
+ mutable seastar::shared_mutex servers_tbl_rwlock;
+ using hooks_t = std::map<std::string_view, std::unique_ptr<AdminSocketHook>>;
+ hooks_t hooks;
+
+ public:
+ /**
+ * iterator support
+ */
+ hooks_t::const_iterator begin() const {
+ return hooks.cbegin();
+ }
+ hooks_t::const_iterator end() const {
+ return hooks.cend();
+ }
+
+ friend class AdminSocketTest;
+ friend class HelpHook;
+ friend class GetdescsHook;
+};
+
+} // namespace crimson::admin
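
As context (a sketch, not part of this change): handle_client() and finalize_response() in admin_socket.cc define a very simple wire format for the asok socket. The client sends a '\0'-terminated command string (e.g. a JSON object such as {"prefix": "help"}) and reads back a 4-byte network-order length followed by the formatted payload. A minimal blocking client, assuming a POSIX environment and glossing over partial reads and error handling, could look like this:

// Hypothetical asok client sketch; the framing mirrors line_consumer (request
// terminated by '\0') and finalize_response (htonl length, then payload).
#include <arpa/inet.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>
#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

std::string asok_request(const char* path, const std::string& cmd_json)
{
  int fd = ::socket(AF_UNIX, SOCK_STREAM, 0);
  sockaddr_un addr{};
  addr.sun_family = AF_UNIX;
  std::strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1);
  if (::connect(fd, reinterpret_cast<sockaddr*>(&addr), sizeof(addr)) < 0) {
    ::close(fd);
    return {};
  }
  // the command string plus the terminating '\0' expected by line_consumer
  ::write(fd, cmd_json.c_str(), cmd_json.size() + 1);
  uint32_t len_be = 0;
  ::read(fd, &len_be, sizeof(len_be));        // length written with htonl()
  std::vector<char> payload(ntohl(len_be));
  ::read(fd, payload.data(), payload.size()); // the formatted reply
  ::close(fd);
  return {payload.begin(), payload.end()};
}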
diff --git a/src/crimson/admin/osd_admin.cc b/src/crimson/admin/osd_admin.cc
new file mode 100644
index 000000000..ce6b6695d
--- /dev/null
+++ b/src/crimson/admin/osd_admin.cc
@@ -0,0 +1,191 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/admin/osd_admin.h"
+#include <string>
+#include <string_view>
+
+#include <fmt/format.h>
+#include <seastar/core/do_with.hh>
+#include <seastar/core/future.hh>
+#include <seastar/core/thread.hh>
+#include <seastar/core/scollectd_api.hh>
+
+#include "common/config.h"
+#include "crimson/admin/admin_socket.h"
+#include "crimson/common/log.h"
+#include "crimson/osd/exceptions.h"
+#include "crimson/osd/osd.h"
+
+using crimson::osd::OSD;
+using namespace crimson::common;
+
+namespace crimson::admin {
+
+using crimson::common::local_conf;
+
+template <class Hook, class... Args>
+std::unique_ptr<AdminSocketHook> make_asok_hook(Args&&... args)
+{
+ return std::make_unique<Hook>(std::forward<Args>(args)...);
+}
+
+/**
+ * An OSD admin hook: OSD status
+ */
+class OsdStatusHook : public AdminSocketHook {
+public:
+ explicit OsdStatusHook(const crimson::osd::OSD& osd) :
+ AdminSocketHook{"status", "", "OSD status"},
+ osd(osd)
+ {}
+ seastar::future<tell_result_t> call(const cmdmap_t&,
+ std::string_view format,
+ ceph::bufferlist&& input) const final
+ {
+ unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")};
+ f->open_object_section("status");
+ osd.dump_status(f.get());
+ f->close_section();
+ return seastar::make_ready_future<tell_result_t>(std::move(f));
+ }
+private:
+ const crimson::osd::OSD& osd;
+};
+template std::unique_ptr<AdminSocketHook>
+make_asok_hook<OsdStatusHook>(const crimson::osd::OSD& osd);
+
+/**
+ * An OSD admin hook: send beacon
+ */
+class SendBeaconHook : public AdminSocketHook {
+public:
+ explicit SendBeaconHook(crimson::osd::OSD& osd) :
+ AdminSocketHook{"send_beacon",
+ "",
+ "send OSD beacon to mon immediately"},
+ osd(osd)
+ {}
+ seastar::future<tell_result_t> call(const cmdmap_t&,
+ std::string_view format,
+ ceph::bufferlist&& input) const final
+ {
+ return osd.send_beacon().then([] {
+ return seastar::make_ready_future<tell_result_t>();
+ });
+ }
+private:
+ crimson::osd::OSD& osd;
+};
+template std::unique_ptr<AdminSocketHook>
+make_asok_hook<SendBeaconHook>(crimson::osd::OSD& osd);
+
+/**
+ * send the latest pg stats to mgr
+ */
+class FlushPgStatsHook : public AdminSocketHook {
+public:
+ explicit FlushPgStatsHook(crimson::osd::OSD& osd) :
+ AdminSocketHook("flush_pg_stats",
+ "",
+ "flush pg stats"),
+ osd{osd}
+ {}
+ seastar::future<tell_result_t> call(const cmdmap_t&,
+ std::string_view format,
+ ceph::bufferlist&& input) const final
+ {
+ uint64_t seq = osd.send_pg_stats();
+ unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")};
+ f->dump_unsigned("stat_seq", seq);
+ return seastar::make_ready_future<tell_result_t>(std::move(f));
+ }
+
+private:
+ crimson::osd::OSD& osd;
+};
+template std::unique_ptr<AdminSocketHook> make_asok_hook<FlushPgStatsHook>(crimson::osd::OSD& osd);
+
+/// dump the history of PGs' peering state
+class DumpPGStateHistory final: public AdminSocketHook {
+public:
+ explicit DumpPGStateHistory(const crimson::osd::OSD &osd) :
+ AdminSocketHook{"dump_pgstate_history",
+ "",
+ "dump history of PGs' peering state"},
+ osd{osd}
+ {}
+ seastar::future<tell_result_t> call(const cmdmap_t&,
+ std::string_view format,
+ ceph::bufferlist&& input) const final
+ {
+ std::unique_ptr<Formatter> f{Formatter::create(format,
+ "json-pretty",
+ "json-pretty")};
+ f->open_object_section("pgstate_history");
+ osd.dump_pg_state_history(f.get());
+ f->close_section();
+ return seastar::make_ready_future<tell_result_t>(std::move(f));
+ }
+private:
+ const crimson::osd::OSD& osd;
+};
+template std::unique_ptr<AdminSocketHook> make_asok_hook<DumpPGStateHistory>(const crimson::osd::OSD& osd);
+
+/**
+ * A CephContext admin hook: calling assert (if allowed by
+ * 'debug_asok_assert_abort')
+ */
+class AssertAlwaysHook : public AdminSocketHook {
+public:
+ AssertAlwaysHook() :
+ AdminSocketHook{"assert",
+ "",
+ "asserts"}
+ {}
+ seastar::future<tell_result_t> call(const cmdmap_t&,
+ std::string_view format,
+ ceph::bufferlist&& input) const final
+ {
+ if (local_conf().get_val<bool>("debug_asok_assert_abort")) {
+ ceph_assert_always(0);
+ return seastar::make_ready_future<tell_result_t>();
+ } else {
+ return seastar::make_ready_future<tell_result_t>(
+ tell_result_t{-EPERM, "configuration set to disallow asok assert"});
+ }
+ }
+};
+template std::unique_ptr<AdminSocketHook> make_asok_hook<AssertAlwaysHook>();
+
+/**
+* A Seastar admin hook: fetching the values of configured metrics
+*/
+class SeastarMetricsHook : public AdminSocketHook {
+public:
+ SeastarMetricsHook() :
+ AdminSocketHook("perf dump_seastar",
+ "",
+ "dump current configured seastar metrics and their values")
+ {}
+ seastar::future<tell_result_t> call(const cmdmap_t& cmdmap,
+ std::string_view format,
+ ceph::bufferlist&& input) const final
+ {
+ std::unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")};
+ f->open_object_section("perf_dump_seastar");
+ for (const auto& mf : seastar::scollectd::get_value_map()) {
+ for (const auto& m : mf.second) {
+ if (m.second && m.second->is_enabled()) {
+ auto& metric_function = m.second->get_function();
+ f->dump_float(m.second->get_id().full_name(), metric_function().d());
+ }
+ }
+ }
+ f->close_section();
+ return seastar::make_ready_future<tell_result_t>(std::move(f));
+ }
+};
+template std::unique_ptr<AdminSocketHook> make_asok_hook<SeastarMetricsHook>();
+
+} // namespace crimson::admin
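
A hedged sketch (not part of this change) of how these hooks would be installed: the OSD presumably registers them on its AdminSocket instance via make_asok_hook; the register_osd_hooks helper and the asok/osd variable names below are invented for illustration and assume the hook types defined above are visible.

// Illustration only: wiring the OSD admin hooks into an AdminSocket instance,
// using the make_asok_hook<> template and the hook types defined above.
seastar::future<> register_osd_hooks(crimson::admin::AdminSocket& asok,
                                     crimson::osd::OSD& osd)
{
  using namespace crimson::admin;
  return seastar::when_all_succeed(
    asok.register_command(make_asok_hook<OsdStatusHook>(osd)),
    asok.register_command(make_asok_hook<SendBeaconHook>(osd)),
    asok.register_command(make_asok_hook<FlushPgStatsHook>(osd)),
    asok.register_command(make_asok_hook<DumpPGStateHistory>(osd)),
    asok.register_command(make_asok_hook<AssertAlwaysHook>()),
    asok.register_command(make_asok_hook<SeastarMetricsHook>())
  ).then_unpack([] { return seastar::now(); });
}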
diff --git a/src/crimson/admin/osd_admin.h b/src/crimson/admin/osd_admin.h
new file mode 100644
index 000000000..395042ea8
--- /dev/null
+++ b/src/crimson/admin/osd_admin.h
@@ -0,0 +1,22 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+#include <memory>
+
+#include "admin_socket.h"
+
+namespace crimson::admin {
+
+class AssertAlwaysHook;
+class FlushPgStatsHook;
+class OsdStatusHook;
+class SendBeaconHook;
+class DumpPGStateHistory;
+class SeastarMetricsHook;
+
+
+template<class Hook, class... Args>
+std::unique_ptr<AdminSocketHook> make_asok_hook(Args&&... args);
+
+} // namespace crimson::admin
diff --git a/src/crimson/admin/pg_commands.cc b/src/crimson/admin/pg_commands.cc
new file mode 100644
index 000000000..dacfd515d
--- /dev/null
+++ b/src/crimson/admin/pg_commands.cc
@@ -0,0 +1,159 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/admin/pg_commands.h"
+
+#include <memory>
+#include <string>
+#include <string_view>
+
+#include <fmt/format.h>
+#include <seastar/core/future.hh>
+
+#include "crimson/admin/admin_socket.h"
+#include "crimson/osd/osd.h"
+#include "crimson/osd/pg.h"
+
+
+using crimson::osd::OSD;
+using crimson::osd::PG;
+using namespace crimson::common;
+
+
+namespace crimson::admin::pg {
+
+class PGCommand : public AdminSocketHook {
+public:
+ // TODO: const correctness of osd
+ PGCommand(crimson::osd::OSD& osd,
+ std::string_view prefix,
+ std::string_view desc,
+ std::string_view help)
+ : AdminSocketHook{prefix, desc, help}, osd {osd}
+ {}
+ seastar::future<tell_result_t> call(const cmdmap_t& cmdmap,
+ std::string_view format,
+ ceph::bufferlist&& input) const final
+ {
+ // we have "ceph tell <pgid> <cmd>". and it is the ceph cli's responsibility
+ // to add "pgid" to the cmd dict. as rados_pg_command() does not set it for
+ // us. moreover, and "pgid" is not listed in the command description, as user
+ // command format does not follow the convention of "<prefix> [<args>,...]"
+ // so we have to verify it on the server side.
+ std::string pgid_str;
+ pg_t pgid;
+ if (!cmd_getval(cmdmap, "pgid", pgid_str)) {
+ return seastar::make_ready_future<tell_result_t>(
+ tell_result_t{-EINVAL, "no pgid specified"});
+ } else if (!pgid.parse(pgid_str.c_str())) {
+ return seastar::make_ready_future<tell_result_t>(
+ tell_result_t{-EINVAL, fmt::format("couldn't parse pgid '{}'", pgid_str)});
+ }
+ // am i the primary for this pg?
+ const auto osdmap = osd.get_shard_services().get_osdmap();
+ spg_t spg_id;
+ if (!osdmap->get_primary_shard(pgid, &spg_id)) {
+ return seastar::make_ready_future<tell_result_t>(tell_result_t{
+ -ENOENT, fmt::format("pgid '{}' does not exist", pgid_str)});
+ }
+ Ref<PG> pg = osd.get_pg(spg_id);
+ if (!pg) {
+ return seastar::make_ready_future<tell_result_t>(tell_result_t{
+ -ENOENT, fmt::format("i don't have pgid '{}'", spg_id)});
+ }
+ if (!pg->is_primary()) {
+ return seastar::make_ready_future<tell_result_t>(tell_result_t{
+ -EAGAIN, fmt::format("not primary for pgid '{}'", spg_id)});
+ }
+ return this->do_command(pg, cmdmap, format, std::move(input));
+ }
+
+private:
+ virtual seastar::future<tell_result_t>
+ do_command(Ref<PG> pg,
+ const cmdmap_t& cmdmap,
+ std::string_view format,
+ ceph::bufferlist&& input) const = 0;
+
+ OSD& osd;
+};
+
+class QueryCommand final : public PGCommand {
+public:
+ // TODO: const correctness of osd
+ explicit QueryCommand(crimson::osd::OSD& osd) :
+ PGCommand{osd,
+ "query",
+ "",
+ "show details of a specific pg"}
+ {}
+private:
+ seastar::future<tell_result_t>
+ do_command(Ref<PG> pg,
+ const cmdmap_t&,
+ std::string_view format,
+ ceph::bufferlist&& input) const final
+ {
+ std::unique_ptr<Formatter> f{Formatter::create(format,
+ "json-pretty",
+ "json-pretty")};
+ f->open_object_section("pg");
+ pg->dump_primary(f.get());
+ f->close_section();
+ return seastar::make_ready_future<tell_result_t>(std::move(f));
+ }
+};
+
+class MarkUnfoundLostCommand final : public PGCommand {
+public:
+ explicit MarkUnfoundLostCommand(crimson::osd::OSD& osd) :
+ PGCommand{osd,
+ "mark_unfound_lost",
+ "name=pgid,type=CephPgid,req=false"
+ " name=mulcmd,type=CephChoices,strings=revert|delete",
+ "mark all unfound objects in this pg as lost, either"
+ " removing or reverting to a prior version if one is"
+ " available"}
+ {}
+ seastar::future<tell_result_t>
+ do_command(Ref<PG> pg,
+ const cmdmap_t& cmdmap,
+ std::string_view,
+ ceph::bufferlist&&) const final
+ {
+ // what to do with the unfound object specifically.
+ std::string cmd;
+ int op = -1;
+ cmd_getval(cmdmap, "mulcmd", cmd);
+ if (cmd == "revert") {
+ op = pg_log_entry_t::LOST_REVERT;
+ } else if (cmd == "delete") {
+ op = pg_log_entry_t::LOST_DELETE;
+ } else {
+ return seastar::make_ready_future<tell_result_t>(tell_result_t{
+ -EINVAL, "mode must be 'revert' or 'delete'; mark not yet implemented"});
+ }
+ return pg->mark_unfound_lost(op).then([] {
+ // TODO
+ return seastar::make_ready_future<tell_result_t>();
+ });
+ }
+};
+
+} // namespace crimson::admin::pg
+
+namespace crimson::admin {
+
+template <class Hook, class... Args>
+std::unique_ptr<AdminSocketHook> make_asok_hook(Args&&... args)
+{
+ return std::make_unique<Hook>(std::forward<Args>(args)...);
+}
+
+template std::unique_ptr<AdminSocketHook>
+make_asok_hook<crimson::admin::pg::QueryCommand>(crimson::osd::OSD& osd);
+
+template std::unique_ptr<AdminSocketHook>
+make_asok_hook<crimson::admin::pg::MarkUnfoundLostCommand>(crimson::osd::OSD& osd);
+
+} // namespace crimson::admin
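
Similarly (a sketch, not part of this change), the per-PG commands above would be expected to be registered by the OSD through the same make_asok_hook mechanism; the register_pg_commands helper below is hypothetical. A "ceph tell <pgid> query" then reaches QueryCommand with the pgid injected into the command dictionary by the CLI, as the comment in PGCommand::call() explains.

// Illustration only: installing the per-PG asok commands, mirroring the
// explicit instantiations above.
seastar::future<> register_pg_commands(crimson::admin::AdminSocket& asok,
                                       crimson::osd::OSD& osd)
{
  using namespace crimson::admin;
  return seastar::when_all_succeed(
    asok.register_command(make_asok_hook<pg::QueryCommand>(osd)),
    asok.register_command(make_asok_hook<pg::MarkUnfoundLostCommand>(osd))
  ).then_unpack([] { return seastar::now(); });
}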
diff --git a/src/crimson/admin/pg_commands.h b/src/crimson/admin/pg_commands.h
new file mode 100644
index 000000000..873b3c923
--- /dev/null
+++ b/src/crimson/admin/pg_commands.h
@@ -0,0 +1,10 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+namespace crimson::admin::pg {
+
+class QueryCommand;
+class MarkUnfoundLostCommand;
+
+} // namespace crimson::admin::pg
diff --git a/src/crimson/auth/AuthClient.h b/src/crimson/auth/AuthClient.h
new file mode 100644
index 000000000..cd21b3838
--- /dev/null
+++ b/src/crimson/auth/AuthClient.h
@@ -0,0 +1,71 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <tuple>
+#include <vector>
+#include "include/buffer_fwd.h"
+#include "crimson/net/Fwd.h"
+
+class CryptoKey;
+
+namespace crimson::auth {
+
+class error : public std::logic_error {
+public:
+ using std::logic_error::logic_error;
+};
+
+using method_t = uint32_t;
+
+// TODO: revisit interfaces for non-dummy implementations
+class AuthClient {
+public:
+ virtual ~AuthClient() {}
+
+ struct auth_request_t {
+ method_t auth_method;
+ std::vector<uint32_t> preferred_modes;
+ ceph::bufferlist auth_bl;
+ };
+ /// Build an authentication request to begin the handshake
+ ///
+ /// @throw auth::error if unable to build the request
+ virtual auth_request_t get_auth_request(crimson::net::ConnectionRef conn,
+ AuthConnectionMetaRef auth_meta) = 0;
+
+ /// Handle server's request to continue the handshake
+ ///
+ /// @throw auth::error if unable to build the request
+ virtual ceph::bufferlist handle_auth_reply_more(
+ crimson::net::ConnectionRef conn,
+ AuthConnectionMetaRef auth_meta,
+ const ceph::bufferlist& bl) = 0;
+
+ /// Handle server's indication that authentication succeeded
+ ///
+ /// @return 0 if authenticated, a negative number otherwise
+ virtual int handle_auth_done(
+ crimson::net::ConnectionRef conn,
+ AuthConnectionMetaRef auth_meta,
+ uint64_t global_id,
+ uint32_t con_mode,
+ const bufferlist& bl) = 0;
+
+ /// Handle server's indication that the previous auth attempt failed
+ ///
+ /// @return 0 if will try next auth method, a negative number if we have no
+ /// more options
+ virtual int handle_auth_bad_method(
+ crimson::net::ConnectionRef conn,
+ AuthConnectionMetaRef auth_meta,
+ uint32_t old_auth_method,
+ int result,
+ const std::vector<uint32_t>& allowed_methods,
+ const std::vector<uint32_t>& allowed_modes) = 0;
+};
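+
+// For orientation only (a hedged summary of how a messenger typically drives
+// this interface; the numbering is illustrative, not a contract):
+//   1. get_auth_request()        - build the initial auth payload
+//   2. handle_auth_reply_more()  - zero or more continuation round trips
+//   3. handle_auth_done()        - on success, or
+//      handle_auth_bad_method()  - to pick another method and retry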
+
+} // namespace crimson::auth
diff --git a/src/crimson/auth/AuthServer.h b/src/crimson/auth/AuthServer.h
new file mode 100644
index 000000000..d75c8f586
--- /dev/null
+++ b/src/crimson/auth/AuthServer.h
@@ -0,0 +1,41 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cstdint>
+#include <utility>
+#include <vector>
+#include "crimson/net/Fwd.h"
+
+struct AuthAuthorizeHandler;
+
+namespace crimson::auth {
+
+class AuthServer {
+public:
+ virtual ~AuthServer() {}
+
+ // Get authentication methods and connection modes for the given peer type
+ virtual std::pair<std::vector<uint32_t>, std::vector<uint32_t>>
+ get_supported_auth_methods(int peer_type) = 0;
+ // Pick a connection mode for the given peer type and auth method
+ virtual uint32_t pick_con_mode(
+ int peer_type,
+ uint32_t auth_method,
+ const std::vector<uint32_t>& preferred_modes) = 0;
+ // Return an AuthAuthorizeHandler for the given peer type and auth method
+ virtual AuthAuthorizeHandler* get_auth_authorize_handler(
+ int peer_type,
+ int auth_method) = 0;
+ // Handle an authentication request on an incoming connection
+ virtual int handle_auth_request(
+ crimson::net::ConnectionRef conn,
+ AuthConnectionMetaRef auth_meta,
+ bool more, //< true if this is not the first part of the handshake
+ uint32_t auth_method,
+ const bufferlist& bl,
+ bufferlist *reply) = 0;
+};
+
+} // namespace crimson::auth
diff --git a/src/crimson/auth/DummyAuth.h b/src/crimson/auth/DummyAuth.h
new file mode 100644
index 000000000..7c26140a2
--- /dev/null
+++ b/src/crimson/auth/DummyAuth.h
@@ -0,0 +1,78 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "AuthClient.h"
+#include "AuthServer.h"
+
+namespace crimson::auth {
+
+class DummyAuthClientServer : public AuthClient,
+ public AuthServer {
+public:
+ DummyAuthClientServer() {}
+
+ // client
+ std::pair<std::vector<uint32_t>, std::vector<uint32_t>>
+ get_supported_auth_methods(int peer_type) final {
+ return {{CEPH_AUTH_NONE}, {CEPH_AUTH_NONE}};
+ }
+
+ uint32_t pick_con_mode(int peer_type,
+ uint32_t auth_method,
+ const std::vector<uint32_t>& preferred_modes) final {
+ ceph_assert(auth_method == CEPH_AUTH_NONE);
+ ceph_assert(preferred_modes.size() &&
+ preferred_modes[0] == CEPH_CON_MODE_CRC);
+ return CEPH_CON_MODE_CRC;
+ }
+
+ AuthAuthorizeHandler* get_auth_authorize_handler(int peer_type,
+ int auth_method) final {
+ return nullptr;
+ }
+
+ AuthClient::auth_request_t get_auth_request(
+ crimson::net::ConnectionRef conn,
+ AuthConnectionMetaRef auth_meta) override {
+ return {CEPH_AUTH_NONE, {CEPH_CON_MODE_CRC}, {}};
+ }
+
+ ceph::bufferlist handle_auth_reply_more(
+ crimson::net::ConnectionRef conn,
+ AuthConnectionMetaRef auth_meta,
+ const bufferlist& bl) override {
+ ceph_abort();
+ }
+
+ int handle_auth_done(
+ crimson::net::ConnectionRef conn,
+ AuthConnectionMetaRef auth_meta,
+ uint64_t global_id,
+ uint32_t con_mode,
+ const bufferlist& bl) override {
+ return 0;
+ }
+
+ int handle_auth_bad_method(
+ crimson::net::ConnectionRef conn,
+ AuthConnectionMetaRef auth_meta,
+ uint32_t old_auth_method,
+ int result,
+ const std::vector<uint32_t>& allowed_methods,
+ const std::vector<uint32_t>& allowed_modes) override {
+ ceph_abort();
+ }
+
+ // server
+ int handle_auth_request(
+ crimson::net::ConnectionRef conn,
+ AuthConnectionMetaRef auth_meta,
+ bool more,
+ uint32_t auth_method,
+ const bufferlist& bl,
+ bufferlist *reply) override {
+ return 1;
+ }
+};
+
+} // namespace crimson::auth
diff --git a/src/crimson/auth/KeyRing.cc b/src/crimson/auth/KeyRing.cc
new file mode 100644
index 000000000..436e29c1b
--- /dev/null
+++ b/src/crimson/auth/KeyRing.cc
@@ -0,0 +1,79 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "KeyRing.h"
+
+#include <boost/algorithm/string.hpp>
+
+#include <seastar/core/do_with.hh>
+#include <seastar/core/fstream.hh>
+#include <seastar/core/future-util.hh>
+#include <seastar/core/reactor.hh>
+
+#include "common/buffer_seastar.h"
+#include "auth/KeyRing.h"
+#include "include/denc.h"
+#include "crimson/common/buffer_io.h"
+#include "crimson/common/config_proxy.h"
+
+namespace crimson::auth {
+
+seastar::future<KeyRing*> load_from_keyring(KeyRing* keyring)
+{
+ std::vector<std::string> paths;
+ boost::split(paths, crimson::common::local_conf()->keyring,
+ boost::is_any_of(",;"));
+ std::pair<bool, std::string> found;
+ return seastar::map_reduce(paths, [](auto path) {
+ return seastar::engine().file_exists(path).then([path](bool file_exists) {
+ return std::make_pair(file_exists, path);
+ });
+ }, std::move(found), [](auto found, auto file_exists_and_path) {
+ if (!found.first && file_exists_and_path.first) {
+ found = std::move(file_exists_and_path);
+ }
+ return found;
+ }).then([keyring] (auto file_exists_and_path) {
+ const auto& [exists, path] = file_exists_and_path;
+ if (exists) {
+ return read_file(path).then([keyring](auto buf) {
+ bufferlist bl;
+ bl.append(buffer::create(std::move(buf)));
+ auto i = bl.cbegin();
+ keyring->decode(i);
+ return seastar::make_ready_future<KeyRing*>(keyring);
+ });
+ } else {
+ return seastar::make_ready_future<KeyRing*>(keyring);
+ }
+ });
+}
+
+seastar::future<KeyRing*> load_from_keyfile(KeyRing* keyring)
+{
+ auto& path = crimson::common::local_conf()->keyfile;
+ if (!path.empty()) {
+ return read_file(path).then([keyring](auto buf) {
+ EntityAuth ea;
+ ea.key.decode_base64(std::string(buf.begin(),
+ buf.end()));
+ keyring->add(crimson::common::local_conf()->name, ea);
+ return seastar::make_ready_future<KeyRing*>(keyring);
+ });
+ } else {
+ return seastar::make_ready_future<KeyRing*>(keyring);
+ }
+}
+
+seastar::future<KeyRing*> load_from_key(KeyRing* keyring)
+{
+ auto& key = crimson::common::local_conf()->key;
+ if (!key.empty()) {
+ EntityAuth ea;
+ ea.key.decode_base64(key);
+ keyring->add(crimson::common::local_conf()->name, ea);
+ }
+ return seastar::make_ready_future<KeyRing*>(keyring);
+}
+
+} // namespace crimson::auth
diff --git a/src/crimson/auth/KeyRing.h b/src/crimson/auth/KeyRing.h
new file mode 100644
index 000000000..850f1bb79
--- /dev/null
+++ b/src/crimson/auth/KeyRing.h
@@ -0,0 +1,15 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/future.hh>
+
+class KeyRing;
+
+namespace crimson::auth {
+ // see KeyRing::from_ceph_context
+ seastar::future<KeyRing*> load_from_keyring(KeyRing* keyring);
+ seastar::future<KeyRing*> load_from_keyfile(KeyRing* keyring);
+ seastar::future<KeyRing*> load_from_key(KeyRing* keyring);
+}
diff --git a/src/crimson/common/assert.cc b/src/crimson/common/assert.cc
new file mode 100644
index 000000000..07610c33f
--- /dev/null
+++ b/src/crimson/common/assert.cc
@@ -0,0 +1,81 @@
+#include <cstdarg>
+#include <iostream>
+
+#include <seastar/util/backtrace.hh>
+#include <seastar/core/reactor.hh>
+
+#include "include/ceph_assert.h"
+
+#include "crimson/common/log.h"
+
+namespace ceph {
+ [[gnu::cold]] void __ceph_assert_fail(const ceph::assert_data &ctx)
+ {
+ __ceph_assert_fail(ctx.assertion, ctx.file, ctx.line, ctx.function);
+ }
+
+ [[gnu::cold]] void __ceph_assert_fail(const char* assertion,
+ const char* file, int line,
+ const char* func)
+ {
+ seastar::logger& logger = crimson::get_logger(0);
+ logger.error("{}:{} : In function '{}', ceph_assert({})\n"
+ "{}",
+ file, line, func, assertion,
+ seastar::current_backtrace());
+ std::cout << std::flush;
+ abort();
+ }
+ [[gnu::cold]] void __ceph_assertf_fail(const char *assertion,
+ const char *file, int line,
+ const char *func, const char* msg,
+ ...)
+ {
+ char buf[8096];
+ va_list args;
+ va_start(args, msg);
+ std::vsnprintf(buf, sizeof(buf), msg, args);
+ va_end(args);
+
+ seastar::logger& logger = crimson::get_logger(0);
+ logger.error("{}:{} : In function '{}', ceph_assert({})\n"
+ "{}\n{}\n",
+ file, line, func, assertion,
+ buf,
+ seastar::current_backtrace());
+ std::cout << std::flush;
+ abort();
+ }
+
+ [[gnu::cold]] void __ceph_abort(const char* file, int line,
+ const char* func, const std::string& msg)
+ {
+ seastar::logger& logger = crimson::get_logger(0);
+ logger.error("{}:{} : In function '{}', abort({})\n"
+ "{}",
+ file, line, func, msg,
+ seastar::current_backtrace());
+ std::cout << std::flush;
+ abort();
+ }
+
+ [[gnu::cold]] void __ceph_abortf(const char* file, int line,
+ const char* func, const char* fmt,
+ ...)
+ {
+ char buf[8096];
+ va_list args;
+ va_start(args, fmt);
+ std::vsnprintf(buf, sizeof(buf), fmt, args);
+ va_end(args);
+
+ seastar::logger& logger = crimson::get_logger(0);
+ logger.error("{}:{} : In function '{}', abort()\n"
+ "{}\n{}\n",
+ file, line, func,
+ buf,
+ seastar::current_backtrace());
+ std::cout << std::flush;
+ abort();
+ }
+}
diff --git a/src/crimson/common/auth_handler.h b/src/crimson/common/auth_handler.h
new file mode 100644
index 000000000..d4140b6a2
--- /dev/null
+++ b/src/crimson/common/auth_handler.h
@@ -0,0 +1,17 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+class EntityName;
+class AuthCapsInfo;
+
+namespace crimson::common {
+class AuthHandler {
+public:
+ // the peer just got authorized
+ virtual void handle_authentication(const EntityName& name,
+ const AuthCapsInfo& caps) = 0;
+ virtual ~AuthHandler() = default;
+};
+}
diff --git a/src/crimson/common/buffer_io.cc b/src/crimson/common/buffer_io.cc
new file mode 100644
index 000000000..86edf7a6f
--- /dev/null
+++ b/src/crimson/common/buffer_io.cc
@@ -0,0 +1,57 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "buffer_io.h"
+
+#include <seastar/core/reactor.hh>
+#include <seastar/core/fstream.hh>
+#include <seastar/core/do_with.hh>
+
+#include "include/buffer.h"
+
+namespace crimson {
+
+seastar::future<> write_file(ceph::buffer::list&& bl,
+ seastar::sstring fn,
+ seastar::file_permissions permissions)
+{
+ const auto flags = (seastar::open_flags::wo |
+ seastar::open_flags::create |
+ seastar::open_flags::truncate);
+ seastar::file_open_options foo;
+ foo.create_permissions = permissions;
+ return seastar::open_file_dma(fn, flags, foo).then(
+ [bl=std::move(bl)](seastar::file f) {
+ return seastar::make_file_output_stream(f).then(
+ [bl=std::move(bl), f=std::move(f)](seastar::output_stream<char> out) {
+ return seastar::do_with(std::move(out),
+ std::move(f),
+ std::move(bl),
+ [](seastar::output_stream<char>& out,
+ seastar::file& f,
+ ceph::buffer::list& bl) {
+ return seastar::do_for_each(bl.buffers(), [&out](auto& buf) {
+ return out.write(buf.c_str(), buf.length());
+ }).then([&out] {
+ return out.close();
+ });
+ });
+ });
+ });
+}
+
+seastar::future<seastar::temporary_buffer<char>>
+read_file(const seastar::sstring fn)
+{
+ return seastar::open_file_dma(fn, seastar::open_flags::ro).then(
+ [] (seastar::file f) {
+ return f.size().then([f = std::move(f)](size_t s) {
+ return seastar::do_with(seastar::make_file_input_stream(f),
+ [s](seastar::input_stream<char>& in) {
+ return in.read_exactly(s);
+ });
+ });
+ });
+}
+
+}
diff --git a/src/crimson/common/buffer_io.h b/src/crimson/common/buffer_io.h
new file mode 100644
index 000000000..c5ece4a6f
--- /dev/null
+++ b/src/crimson/common/buffer_io.h
@@ -0,0 +1,21 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/core/file-types.hh>
+
+#include "include/buffer_fwd.h"
+
+namespace crimson {
+ seastar::future<> write_file(ceph::buffer::list&& bl,
+ seastar::sstring fn,
+ seastar::file_permissions = // 0644
+ (seastar::file_permissions::user_read |
+ seastar::file_permissions::user_write |
+ seastar::file_permissions::group_read |
+ seastar::file_permissions::others_read));
+ seastar::future<seastar::temporary_buffer<char>>
+ read_file(const seastar::sstring fn);
+}
diff --git a/src/crimson/common/config_proxy.cc b/src/crimson/common/config_proxy.cc
new file mode 100644
index 000000000..88d4679d5
--- /dev/null
+++ b/src/crimson/common/config_proxy.cc
@@ -0,0 +1,93 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "config_proxy.h"
+
+#include <filesystem>
+
+#include "crimson/common/buffer_io.h"
+
+namespace crimson::common {
+
+ConfigProxy::ConfigProxy(const EntityName& name, std::string_view cluster)
+{
+ if (seastar::this_shard_id() != 0) {
+ return;
+ }
+ // set the initial value on CPU#0
+ values.reset(seastar::make_lw_shared<ConfigValues>());
+ values.get()->name = name;
+ values.get()->cluster = cluster;
+ // and the only copy of md_config_impl<> is allocated on CPU#0
+ local_config.reset(new md_config_t{*values, obs_mgr, true});
+ if (name.is_mds()) {
+ local_config->set_val_default(*values, obs_mgr,
+ "keyring", "$mds_data/keyring");
+ } else if (name.is_osd()) {
+ local_config->set_val_default(*values, obs_mgr,
+ "keyring", "$osd_data/keyring");
+ }
+}
+
+seastar::future<> ConfigProxy::start()
+{
+ // propagate values and config to all other shards
+ if (!values) {
+ return seastar::make_ready_future<>();
+ }
+ return container().invoke_on_others([this](auto& proxy) {
+ return values.copy().then([config=local_config.get(),
+ &proxy](auto foreign_values) {
+ proxy.values.reset();
+ proxy.values = std::move(foreign_values);
+ proxy.remote_config = config;
+ return seastar::make_ready_future<>();
+ });
+ });
+}
+
+void ConfigProxy::show_config(ceph::Formatter* f) const {
+ get_config().show_config(*values, f);
+}
+
+seastar::future<> ConfigProxy::parse_config_files(const std::string& conf_files)
+{
+ auto conffile_paths =
+ get_config().get_conffile_paths(*values,
+ conf_files.empty() ? nullptr : conf_files.c_str(),
+ &std::cerr,
+ CODE_ENVIRONMENT_DAEMON);
+ return seastar::do_with(std::move(conffile_paths), [this] (auto& paths) {
+ return seastar::repeat([path=paths.begin(), e=paths.end(), this]() mutable {
+ if (path == e) {
+ // tried all conffiles; none of them worked
+ return seastar::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::yes);
+ }
+ return crimson::read_file(*path++).then([this](auto&& buf) {
+ return do_change([buf=std::move(buf), this](ConfigValues& values) {
+ if (get_config().parse_buffer(values, obs_mgr,
+ buf.get(), buf.size(),
+ &std::cerr) == 0) {
+ get_config().update_legacy_vals(values);
+ } else {
+ throw std::invalid_argument("parse error");
+ }
+ }).then([] {
+ // this one works!
+ return seastar::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::yes);
+ });
+ }).handle_exception_type([] (const std::filesystem::filesystem_error&) {
+ return seastar::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::no);
+ }).handle_exception_type([] (const std::invalid_argument&) {
+ return seastar::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::no);
+ });
+ });
+ });
+}
+
+ConfigProxy::ShardedConfig ConfigProxy::sharded_conf;
+}
diff --git a/src/crimson/common/config_proxy.h b/src/crimson/common/config_proxy.h
new file mode 100644
index 000000000..f50a63431
--- /dev/null
+++ b/src/crimson/common/config_proxy.h
@@ -0,0 +1,200 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/reactor.hh>
+#include <seastar/core/sharded.hh>
+#include "common/config.h"
+#include "common/config_obs.h"
+#include "common/config_obs_mgr.h"
+#include "common/errno.h"
+
+namespace ceph {
+class Formatter;
+}
+
+namespace crimson::common {
+
+// A facade for managing config. Each shard has its own copy of ConfigProxy.
+//
+// In seastar-osd, there can be multiple instances of @c ConfigValues in a
+// single process, as we are using a variant of read-copy-update machinery to
+// update the settings at runtime. (A usage sketch follows the local_conf()
+// helpers at the end of this header.)
+class ConfigProxy : public seastar::peering_sharded_service<ConfigProxy>
+{
+ using LocalConfigValues = seastar::lw_shared_ptr<ConfigValues>;
+ seastar::foreign_ptr<LocalConfigValues> values;
+
+ md_config_t* remote_config = nullptr;
+ std::unique_ptr<md_config_t> local_config;
+
+ using ConfigObserver = ceph::md_config_obs_impl<ConfigProxy>;
+ ObserverMgr<ConfigObserver> obs_mgr;
+
+ const md_config_t& get_config() const {
+ return remote_config ? *remote_config : *local_config;
+ }
+ md_config_t& get_config() {
+ return remote_config ? *remote_config : *local_config;
+ }
+
+ // apply changes to all shards
+ // @param func a functor which accepts @c "ConfigValues&"
+ template<typename Func>
+ seastar::future<> do_change(Func&& func) {
+ return container().invoke_on(values.get_owner_shard(),
+ [func = std::move(func)](ConfigProxy& owner) {
+ // apply the changes to a copy of the values
+ auto new_values = seastar::make_lw_shared(*owner.values);
+ new_values->changed.clear();
+ func(*new_values);
+
+ // always apply the new settings synchronously on the owner shard, to
+ // avoid racing with other do_change() calls running in parallel.
+ ObserverMgr<ConfigObserver>::rev_obs_map rev_obs;
+ owner.values.reset(new_values);
+ owner.obs_mgr.for_each_change(owner.values->changed, owner,
+ [&rev_obs](ConfigObserver *obs,
+ const std::string &key) {
+ rev_obs[obs].insert(key);
+ }, nullptr);
+ for (auto& [obs, keys] : rev_obs) {
+ obs->handle_conf_change(owner, keys);
+ }
+
+ return seastar::parallel_for_each(boost::irange(1u, seastar::smp::count),
+ [&owner, new_values] (auto cpu) {
+ return owner.container().invoke_on(cpu,
+ [foreign_values = seastar::make_foreign(new_values)](ConfigProxy& proxy) mutable {
+ proxy.values.reset();
+ proxy.values = std::move(foreign_values);
+
+ ObserverMgr<ConfigObserver>::rev_obs_map rev_obs;
+ proxy.obs_mgr.for_each_change(proxy.values->changed, proxy,
+ [&rev_obs](ConfigObserver *obs, const std::string& key) {
+ rev_obs[obs].insert(key);
+ }, nullptr);
+ for (auto& obs_keys : rev_obs) {
+ obs_keys.first->handle_conf_change(proxy, obs_keys.second);
+ }
+ });
+ }).finally([new_values] {
+ new_values->changed.clear();
+ });
+ });
+ }
+public:
+ ConfigProxy(const EntityName& name, std::string_view cluster);
+ const ConfigValues* operator->() const noexcept {
+ return values.get();
+ }
+ const ConfigValues get_config_values() {
+ return *values.get();
+ }
+ ConfigValues* operator->() noexcept {
+ return values.get();
+ }
+
+ // required by sharded<>
+ seastar::future<> start();
+ seastar::future<> stop() {
+ return seastar::make_ready_future<>();
+ }
+ void add_observer(ConfigObserver* obs) {
+ obs_mgr.add_observer(obs);
+ }
+ void remove_observer(ConfigObserver* obs) {
+ obs_mgr.remove_observer(obs);
+ }
+ seastar::future<> rm_val(const std::string& key) {
+ return do_change([key, this](ConfigValues& values) {
+ auto ret = get_config().rm_val(values, key);
+ if (ret < 0) {
+ throw std::invalid_argument(cpp_strerror(ret));
+ }
+ });
+ }
+ seastar::future<> set_val(const std::string& key,
+ const std::string& val) {
+ return do_change([key, val, this](ConfigValues& values) {
+ std::stringstream err;
+ auto ret = get_config().set_val(values, obs_mgr, key, val, &err);
+ if (ret < 0) {
+ throw std::invalid_argument(err.str());
+ }
+ });
+ }
+ int get_val(const std::string &key, std::string *val) const {
+ return get_config().get_val(*values, key, val);
+ }
+ template<typename T>
+ const T get_val(const std::string& key) const {
+ return get_config().template get_val<T>(*values, key);
+ }
+
+ int get_all_sections(std::vector<std::string>& sections) const {
+ return get_config().get_all_sections(sections);
+ }
+
+ int get_val_from_conf_file(const std::vector<std::string>& sections,
+ const std::string& key, std::string& out,
+ bool expand_meta) const {
+ return get_config().get_val_from_conf_file(*values, sections, key,
+ out, expand_meta);
+ }
+
+ unsigned get_osd_pool_default_min_size(uint8_t size) const {
+ return get_config().get_osd_pool_default_min_size(*values, size);
+ }
+
+ seastar::future<>
+ set_mon_vals(const std::map<std::string,std::string,std::less<>>& kv) {
+ return do_change([kv, this](ConfigValues& values) {
+ get_config().set_mon_vals(nullptr, values, obs_mgr, kv, nullptr);
+ });
+ }
+
+ seastar::future<> inject_args(const std::string& s) {
+ return do_change([s, this](ConfigValues& values) {
+ std::stringstream err;
+ if (get_config().injectargs(values, obs_mgr, s, &err)) {
+ throw std::invalid_argument(err.str());
+ }
+ });
+ }
+ void show_config(ceph::Formatter* f) const;
+
+ seastar::future<> parse_argv(std::vector<const char*>& argv) {
+ // we could pass whatever is unparsed to seastar, but seastar::app_template
+ // is used for driving the seastar application, and
+ // crimson::common::ConfigProxy is not available until seastar engine is up
+ // and running, so we have to feed the command line args to app_template
+ // first, then pass them to ConfigProxy.
+ return do_change([&argv, this](ConfigValues& values) {
+ get_config().parse_argv(values,
+ obs_mgr,
+ argv,
+ CONF_CMDLINE);
+ });
+ }
+
+ seastar::future<> parse_config_files(const std::string& conf_files);
+
+ using ShardedConfig = seastar::sharded<ConfigProxy>;
+
+private:
+ static ShardedConfig sharded_conf;
+ friend ConfigProxy& local_conf();
+ friend ShardedConfig& sharded_conf();
+};
+
+inline ConfigProxy& local_conf() {
+ return ConfigProxy::sharded_conf.local();
+}
+
+inline ConfigProxy::ShardedConfig& sharded_conf() {
+ return ConfigProxy::sharded_conf;
+}
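+
+// A hedged usage sketch (the option key below is hypothetical, used only for
+// illustration): callers read settings through the per-shard proxy and update
+// them via the futurized setters, which are routed through do_change() so that
+// every shard observes the new ConfigValues before the returned future
+// resolves, e.g.
+//   auto v = local_conf().get_val<uint64_t>("some_option");  // hypothetical key
+//   return local_conf().set_val("some_option", "42");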
+
+}
diff --git a/src/crimson/common/errorator.h b/src/crimson/common/errorator.h
new file mode 100644
index 000000000..af1e6ea45
--- /dev/null
+++ b/src/crimson/common/errorator.h
@@ -0,0 +1,1140 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <exception>
+#include <system_error>
+
+#include <seastar/core/future-util.hh>
+
+#include "include/ceph_assert.h"
+
+namespace crimson {
+
+template<typename Iterator, typename AsyncAction>
+inline auto do_for_each(Iterator begin, Iterator end, AsyncAction action) {
+ using futurator = \
+ ::seastar::futurize<std::invoke_result_t<AsyncAction, decltype(*begin)>>;
+
+ if (begin == end) {
+ return futurator::type::errorator_type::template make_ready_future<>();
+ }
+ while (true) {
+ auto f = futurator::invoke(action, *begin);
+ ++begin;
+ if (begin == end) {
+ return f;
+ }
+ if (!f.available() || seastar::need_preempt()) {
+ return std::move(f)._then(
+ [ action = std::move(action),
+ begin = std::move(begin),
+ end = std::move(end)
+ ] () mutable {
+ return ::crimson::do_for_each(std::move(begin),
+ std::move(end),
+ std::move(action));
+ });
+ }
+ if (f.failed()) {
+ return f;
+ }
+ }
+}
+template<typename Container, typename AsyncAction>
+inline auto do_for_each(Container& c, AsyncAction action) {
+ return ::crimson::do_for_each(std::begin(c), std::end(c), std::move(action));
+}
+
+template<typename AsyncAction>
+inline auto do_until(AsyncAction action) {
+ using errorator_t =
+ typename ::seastar::futurize_t<std::invoke_result_t<AsyncAction>>::errorator_type;
+
+ while (true) {
+ auto f = ::seastar::futurize_invoke(action);
+ if (f.failed()) {
+ return errorator_t::template make_exception_future2<>(
+ f.get_exception()
+ );
+ } else if (f.available()) {
+ if (auto done = f.get0()) {
+ return errorator_t::template make_ready_future<>();
+ }
+ } else {
+ return std::move(f)._then(
+ [action = std::move(action)] (auto &&done) mutable {
+ if (done) {
+ return errorator_t::template make_ready_future<>();
+ }
+ return ::crimson::do_until(
+ std::move(action));
+ });
+ }
+ }
+}
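+
+// A hedged usage sketch (drain_one() and queue are hypothetical): do_until()
+// keeps invoking the action until the future it returns resolves to true,
+// propagating any error through the action's errorator, e.g.
+//   return crimson::do_until([&queue] {
+//     return drain_one(queue).safe_then([&queue] {
+//       return queue.empty();   // true terminates the loop
+//     });
+//   });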
+
+// define the interface between error types and errorator
+template <class ConcreteErrorT>
+class error_t {
+ static constexpr const std::type_info& get_exception_ptr_type_info() {
+ return ConcreteErrorT::exception_ptr_type_info();
+ }
+
+ std::exception_ptr to_exception_ptr() const {
+ const auto* concrete_error = static_cast<const ConcreteErrorT*>(this);
+ return concrete_error->to_exception_ptr();
+ }
+
+ decltype(auto) static from_exception_ptr(std::exception_ptr ep) {
+ return ConcreteErrorT::from_exception_ptr(std::move(ep));
+ }
+
+ template <class... AllowedErrorsT>
+ friend struct errorator;
+
+ template <class ErrorVisitorT, class FuturatorT>
+ friend class maybe_handle_error_t;
+
+public:
+ template <class Func>
+ static decltype(auto) handle(Func&& func) {
+ return ConcreteErrorT::handle(std::forward<Func>(func));
+ }
+};
+
+// unthrowable_wrapper ensures a compilation failure when somebody
+// would like to `throw make_error<...>()` instead of returning.
+// Returning allows for compile-time verification of a future's
+// AllowedErrorsV and also avoids the burden of throwing.
+template <class ErrorT, ErrorT ErrorV>
+struct unthrowable_wrapper : error_t<unthrowable_wrapper<ErrorT, ErrorV>> {
+ unthrowable_wrapper(const unthrowable_wrapper&) = delete;
+ [[nodiscard]] static const auto& make() {
+ static constexpr unthrowable_wrapper instance{};
+ return instance;
+ }
+
+ template<class Func>
+ static auto handle(Func&& func) {
+ return [
+ func = std::forward<Func>(func)
+ ] (const unthrowable_wrapper&) mutable -> decltype(auto) {
+ if constexpr (std::is_invocable_v<Func, ErrorT>) {
+ return std::invoke(std::forward<Func>(func), ErrorV);
+ } else {
+ return std::invoke(std::forward<Func>(func));
+ }
+ };
+ }
+
+ struct pass_further {
+ decltype(auto) operator()(const unthrowable_wrapper& e) {
+ return e;
+ }
+ };
+
+ struct discard {
+ decltype(auto) operator()(const unthrowable_wrapper&) {
+ }
+ };
+
+
+private:
+ // can be used only to initialize the `instance` member
+ explicit unthrowable_wrapper() = default;
+
+ // implement the errorable interface
+ struct throwable_carrier{};
+ static std::exception_ptr carrier_instance;
+
+ static constexpr const std::type_info& exception_ptr_type_info() {
+ return typeid(throwable_carrier);
+ }
+ auto to_exception_ptr() const {
+ // error codes don't need to instantiate `std::exception_ptr` each
+ // time as the code is actually a part of the type itself.
+ // `std::make_exception_ptr()` on modern enough GCCs is quite cheap
+ // (see Gleb Natapov's patch eradicating throw/catch there),
+ // but using one instance per type boils down the overhead to just
+ // ref-counting.
+ return carrier_instance;
+ }
+ static const auto& from_exception_ptr(std::exception_ptr) {
+ return make();
+ }
+
+ friend class error_t<unthrowable_wrapper<ErrorT, ErrorV>>;
+};
+
+template <class ErrorT, ErrorT ErrorV>
+std::exception_ptr unthrowable_wrapper<ErrorT, ErrorV>::carrier_instance = \
+ std::make_exception_ptr<
+ unthrowable_wrapper<ErrorT, ErrorV>::throwable_carrier>({});
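+
+// A hedged illustration (the enum, alias and function below are examples only,
+// not part of this change): an error value is declared once as an
+// unthrowable_wrapper alias and is then returned, never thrown, so the
+// compiler can verify it against the future's allowed error set, e.g.
+//   enum class ertag { busy };
+//   using busy_t = unthrowable_wrapper<ertag, ertag::busy>;
+//   crimson::errorator<busy_t>::future<> poke(bool ready) {
+//     if (!ready) {
+//       return busy_t::make();
+//     }
+//     return crimson::errorator<busy_t>::now();
+//   }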
+
+
+template <class ErrorT>
+struct stateful_error_t : error_t<stateful_error_t<ErrorT>> {
+ template <class... Args>
+ explicit stateful_error_t(Args&&... args)
+ : ep(std::make_exception_ptr<ErrorT>(std::forward<Args>(args)...)) {
+ }
+
+ template<class Func>
+ static auto handle(Func&& func) {
+ static_assert(std::is_invocable_v<Func, ErrorT>);
+ return [
+ func = std::forward<Func>(func)
+ ] (stateful_error_t<ErrorT>&& e) mutable -> decltype(auto) {
+ try {
+ std::rethrow_exception(e.ep);
+ } catch (const ErrorT& obj) {
+ return std::invoke(std::forward<Func>(func), obj);
+ }
+ ceph_abort_msg("exception type mismatch – impossible!");
+ };
+ }
+
+private:
+ std::exception_ptr ep;
+
+ explicit stateful_error_t(std::exception_ptr ep) : ep(std::move(ep)) {}
+
+ static constexpr const std::type_info& exception_ptr_type_info() {
+ return typeid(ErrorT);
+ }
+ auto to_exception_ptr() const {
+ return ep;
+ }
+ static stateful_error_t<ErrorT> from_exception_ptr(std::exception_ptr ep) {
+ return stateful_error_t<ErrorT>(std::move(ep));
+ }
+
+ friend class error_t<stateful_error_t<ErrorT>>;
+};
+
+namespace _impl {
+ template <class T> struct always_false : std::false_type {};
+};
+
+template <class ErrorVisitorT, class FuturatorT>
+class maybe_handle_error_t {
+ const std::type_info& type_info;
+ typename FuturatorT::type result;
+ ErrorVisitorT errfunc;
+
+public:
+ maybe_handle_error_t(ErrorVisitorT&& errfunc, std::exception_ptr ep)
+ : type_info(*ep.__cxa_exception_type()),
+ result(FuturatorT::make_exception_future(std::move(ep))),
+ errfunc(std::forward<ErrorVisitorT>(errfunc)) {
+ }
+
+ template <class ErrorT>
+ void handle() {
+ static_assert(std::is_invocable<ErrorVisitorT, ErrorT>::value,
+ "provided Error Visitor is not exhaustive");
+ // In C++, throwing an exception isn't the sole way to signal an
+ // error. This approach nicely fits cold, infrequent cases, but when
+ // applied to a hot one, it will likely hurt performance.
+ //
+ // Alternative approach is to create `std::exception_ptr` on our
+ // own and place it in the future via `make_exception_future()`.
+ // When it comes to handling, the pointer can be interrogated for
+ // pointee's type with `__cxa_exception_type()` instead of costly
+ // re-throwing (via `std::rethrow_exception()`) and matching with
+ // `catch`. The limitation here is lack of support for hierarchies
+ // of exceptions. The code below checks for exact match only while
+ // `catch` would allow matching against a base class as well.
+ // However, this shouldn't be a big issue for `errorator` as Error
+ // Visitors are already checked for exhaustiveness at compile-time.
+ //
+ // NOTE: `__cxa_exception_type()` is an extension of the language.
+ // It should be available both in GCC and Clang but a fallback
+ // (based on `std::rethrow_exception()` and `catch`) can be made
+ // to handle other platforms if necessary.
+ if (type_info == ErrorT::error_t::get_exception_ptr_type_info()) {
+ // set `state::invalid` in internals of `seastar::future` to not
+ // call `report_failed_future()` during `operator=()`.
+ [[maybe_unused]] auto&& ep = std::move(result).get_exception();
+
+ using return_t = std::invoke_result_t<ErrorVisitorT, ErrorT>;
+ if constexpr (std::is_assignable_v<decltype(result), return_t>) {
+ result = std::invoke(std::forward<ErrorVisitorT>(errfunc),
+ ErrorT::error_t::from_exception_ptr(std::move(ep)));
+ } else if constexpr (std::is_same_v<return_t, void>) {
+ // void denotes explicit discarding;
+ // execute for the sake of side effects. Typically this boils down
+ // to the handler throwing an exception.
+ std::invoke(std::forward<ErrorVisitorT>(errfunc),
+ ErrorT::error_t::from_exception_ptr(std::move(ep)));
+ } else {
+ static_assert(_impl::always_false<return_t>::value,
+ "return of Error Visitor is not assignable to future");
+ // do nothing with `ep`.
+ }
+ }
+ }
+
+ auto get_result() && {
+ return std::move(result);
+ }
+};
+
+template <class FuncHead, class... FuncTail>
+static constexpr auto composer(FuncHead&& head, FuncTail&&... tail) {
+ return [
+ head = std::forward<FuncHead>(head),
+ // perfect forwarding in lambda's closure isn't available in C++17
+ // using tuple as workaround; see: https://stackoverflow.com/a/49902823
+ tail = std::make_tuple(std::forward<FuncTail>(tail)...)
+ ] (auto&&... args) mutable -> decltype(auto) {
+ if constexpr (std::is_invocable_v<FuncHead, decltype(args)...>) {
+ return std::invoke(std::forward<FuncHead>(head),
+ std::forward<decltype(args)>(args)...);
+ } else if constexpr (sizeof...(FuncTail) > 0) {
+ using next_composer_t = decltype(composer<FuncTail...>);
+ auto&& next = std::apply<next_composer_t>(composer<FuncTail...>,
+ std::move(tail));
+ return std::invoke(std::move(next),
+ std::forward<decltype(args)>(args)...);
+ } else {
+ static_assert(
+ std::is_invocable_v<FuncHead, decltype(args)...> ||
+ (sizeof...(FuncTail) > 0),
+ "composition is not exhaustive");
+ }
+ };
+}
+
+template <class ValueT>
+struct errorated_future_marker{};
+
+template <class... AllowedErrors>
+struct errorator {
+ template <class T>
+ static inline constexpr bool is_error_v = std::is_base_of_v<error_t<T>, T>;
+
+ static_assert((... && is_error_v<AllowedErrors>),
+ "errorator expects presence of ::is_error in all error types");
+
+ template <class ErrorT>
+ struct contains_once {
+ static constexpr bool value =
+ (0 + ... + std::is_same_v<ErrorT, AllowedErrors>) == 1;
+ };
+ template <class... Errors>
+ struct contains_once<errorator<Errors...>> {
+ static constexpr bool value = (... && contains_once<Errors>::value);
+ };
+ template <class T>
+ static constexpr bool contains_once_v = contains_once<T>::value;
+
+ static_assert((... && contains_once_v<AllowedErrors>),
+ "no error type in errorator can be duplicated");
+
+ struct ready_future_marker{};
+ struct exception_future_marker{};
+
+private:
+ // see the comment for `using future = _future` below.
+ template <class>
+ class _future {};
+ template <class ValueT>
+ class _future<::crimson::errorated_future_marker<ValueT>>
+ : private seastar::future<ValueT> {
+ using base_t = seastar::future<ValueT>;
+ // we need the friendship for the sake of `get_exception() &&` when
+ // `safe_then()` is going to return an errorated future as a result of
+ // chaining. In contrast to `seastar::future`, `errorator<T...>::future`
+ // has this member private.
+ template <class ErrorVisitor, class Futurator>
+ friend class maybe_handle_error_t;
+
+ // any `seastar::futurize` specialization must be able to access the base.
+ // see: `satisfy_with_result_of()` far below.
+ template <typename>
+ friend class seastar::futurize;
+
+ template <typename T1, typename T2, typename... More>
+ friend auto seastar::internal::do_with_impl(T1&& rv1, T2&& rv2, More&&... more);
+
+ template <class, class = std::void_t<>>
+ struct get_errorator {
+ // generic template for non-errorated things (plain types and
+ // vanilla seastar::future as well).
+ using type = errorator<>;
+ };
+ template <class FutureT>
+ struct get_errorator<FutureT,
+ std::void_t<typename FutureT::errorator_type>> {
+ using type = typename FutureT::errorator_type;
+ };
+ template <class T>
+ using get_errorator_t = typename get_errorator<T>::type;
+
+ template <class ValueFuncErroratorT, class... ErrorVisitorRetsT>
+ struct make_errorator {
+ // NOP. The generic template.
+ };
+ template <class... ValueFuncAllowedErrors,
+ class ErrorVisitorRetsHeadT,
+ class... ErrorVisitorRetsTailT>
+ struct make_errorator<errorator<ValueFuncAllowedErrors...>,
+ ErrorVisitorRetsHeadT,
+ ErrorVisitorRetsTailT...> {
+ private:
+ using step_errorator = errorator<ValueFuncAllowedErrors...>;
+ // add ErrorVisitorRetsHeadT only if 1) it's an error type and
+ // 2) isn't already included in the errorator's error set.
+ // It's enough to negate contains_once_v as any errorator<...>
+ // type is already guaranteed to be free of duplications.
+ using next_errorator = std::conditional_t<
+ is_error_v<ErrorVisitorRetsHeadT> &&
+ !step_errorator::template contains_once_v<ErrorVisitorRetsHeadT>,
+ typename step_errorator::template extend<ErrorVisitorRetsHeadT>,
+ step_errorator>;
+
+ public:
+ using type = typename make_errorator<next_errorator,
+ ErrorVisitorRetsTailT...>::type;
+ };
+ // finish the recursion
+ template <class... ValueFuncAllowedErrors>
+ struct make_errorator<errorator<ValueFuncAllowedErrors...>> {
+ using type = ::crimson::errorator<ValueFuncAllowedErrors...>;
+ };
+ template <class... Args>
+ using make_errorator_t = typename make_errorator<Args...>::type;
+
+ using base_t::base_t;
+
+ template <class Futurator, class Future, class ErrorVisitor>
+ [[gnu::noinline]]
+ static auto _safe_then_handle_errors(Future&& future,
+ ErrorVisitor&& errfunc) {
+ maybe_handle_error_t<ErrorVisitor, Futurator> maybe_handle_error(
+ std::forward<ErrorVisitor>(errfunc),
+ std::move(future).get_exception()
+ );
+ (maybe_handle_error.template handle<AllowedErrors>() , ...);
+ return std::move(maybe_handle_error).get_result();
+ }
+
+ public:
+ using errorator_type = ::crimson::errorator<AllowedErrors...>;
+ using promise_type = seastar::promise<ValueT>;
+
+ using base_t::available;
+ using base_t::failed;
+ // need this because of the legacy in PG::do_osd_ops().
+ using base_t::handle_exception_type;
+
+ [[gnu::always_inline]]
+ _future(base_t&& base)
+ : base_t(std::move(base)) {
+ }
+
+ template <class... A>
+ [[gnu::always_inline]]
+ _future(ready_future_marker, A&&... a)
+ : base_t(::seastar::make_ready_future<ValueT>(std::forward<A>(a)...)) {
+ }
+ [[gnu::always_inline]]
+ _future(exception_future_marker, ::seastar::future_state_base&& state) noexcept
+ : base_t(::seastar::futurize<base_t>::make_exception_future(std::move(state))) {
+ }
+ [[gnu::always_inline]]
+ _future(exception_future_marker, std::exception_ptr&& ep) noexcept
+ : base_t(::seastar::futurize<base_t>::make_exception_future(std::move(ep))) {
+ }
+
+ template <template <class...> class ErroratedFuture,
+ class = std::void_t<
+ typename ErroratedFuture<
+ ::crimson::errorated_future_marker<ValueT>>::errorator_type>>
+ operator ErroratedFuture<errorated_future_marker<ValueT>> () && {
+ using dest_errorator_t = \
+ typename ErroratedFuture<
+ ::crimson::errorated_future_marker<ValueT>>::errorator_type;
+ static_assert(dest_errorator_t::template contains_once_v<errorator_type>,
+ "conversion is possible to more-or-eq errorated future!");
+ return static_cast<base_t&&>(*this);
+ }
+
+ // initialize future as failed without throwing. `make_exception_future()`
+ // internally uses `std::make_exception_ptr()`. cppreference.com shouldn't
+ // be misinterpreted when it says:
+ //
+ // "This is done as if executing the following code:
+ // try {
+ // throw e;
+ // } catch(...) {
+ // return std::current_exception();
+ // }",
+ //
+ // the "as if" is absolutely crucial because modern GCCs employ an optimized
+ // path for it. See:
+ // * https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=cce8e59224e18858749a2324bce583bcfd160d6c,
+ // * https://gcc.gnu.org/ml/gcc-patches/2016-08/msg00373.html.
+ //
+ // This behavior, combined with `__cxa_exception_type()` for inspecting
+ // an exception's type, allows for throw/catch-free handling of stateless
+ // exceptions (which is fine for error codes). Stateful errors would
+ // actually be a bit harder as `_M_get()` is private, and thus rethrowing is
+ // necessary to get to the state inside. However, it's not unthinkable to
+ // see another extension bringing operator*() to the exception pointer...
+ //
+ // TODO: we don't really need to `make_exception_ptr` each time. It still
+ // allocates memory underneath, while it could be replaced with a single
+ // instance per type created at start-up.
+ template <class ErrorT,
+ class DecayedT = std::decay_t<ErrorT>,
+ bool IsError = is_error_v<DecayedT>,
+ class = std::enable_if_t<IsError>>
+ _future(ErrorT&& e)
+ : base_t(
+ seastar::make_exception_future<ValueT>(
+ errorator_type::make_exception_ptr(e))) {
+ static_assert(errorator_type::contains_once_v<DecayedT>,
+ "ErrorT is not enlisted in errorator");
+ }
+
+ template <class ValueFuncT, class ErrorVisitorT>
+ auto safe_then(ValueFuncT&& valfunc, ErrorVisitorT&& errfunc) {
+ static_assert((... && std::is_invocable_v<ErrorVisitorT,
+ AllowedErrors>),
+ "provided Error Visitor is not exhaustive");
+
+ using value_func_result_t =
+ typename std::conditional_t<std::is_void_v<ValueT>,
+ std::invoke_result<ValueFuncT>,
+ std::invoke_result<ValueFuncT, ValueT>>::type;
+ // recognize whether there can be any error coming from the Value
+ // Function.
+ using value_func_errorator_t = get_errorator_t<value_func_result_t>;
+ // mutate the Value Function's errorator to harvest errors coming
+ // from the Error Visitor. Yes, it's perfectly fine to fail error
+ // handling at one step and delegate an even broader set of issues
+ // to the next continuation.
+ using return_errorator_t = make_errorator_t<
+ value_func_errorator_t,
+ std::decay_t<std::invoke_result_t<ErrorVisitorT, AllowedErrors>>...>;
+ // OK, now we know about all the errors the next continuation must
+ // take care of. If the Visitor handled everything and the Value Func
+ // doesn't return any, we'll finish with errorator<>::future
+ // which is just vanilla seastar::future; that's it, the next
+ // continuation can finally use `.then()`!
+ using futurator_t = \
+ typename return_errorator_t::template futurize<value_func_result_t>;
+ // `seastar::futurize`, used internally by `then_wrapped()`, would
+ // wrap any non-`seastar::future` type coming from Value Func into
+ // `seastar::future`. As we really don't want to end with things
+ // like `seastar::future<errorator::future<...>>`, we need either:
+ // * convert the errorated future into a plain one in the lambda
+ // below and back here, or
+ // * specialize the `seastar::futurize<T>` to get proper kind of
+ // future directly from `::then_wrapped()`.
+ // As C++17 doesn't guarantee copy elision when non-same types are
+ // involved, and examination of assemblies from GCC 8.1 confirmed
+ // extra copying, the switch to the second approach has been made.
+ return this->then_wrapped(
+ [ valfunc = std::forward<ValueFuncT>(valfunc),
+ errfunc = std::forward<ErrorVisitorT>(errfunc)
+ ] (auto&& future) mutable noexcept {
+ if (__builtin_expect(future.failed(), false)) {
+ return _safe_then_handle_errors<futurator_t>(
+ std::move(future), std::forward<ErrorVisitorT>(errfunc));
+ } else {
+ // NOTE: using `seastar::future::get()` here is a bit bloaty
+ // as the method rechecks availability of future's value and,
+ // if it's unavailable, does the `::do_wait()` path (yes, it
+ // targets `seastar::thread`). Actually this is dead code as
+ // `then_wrapped()` executes the lambda only when the future
+ // is available (which means: failed or ready). However, GCC
+ // hasn't optimized it out:
+ //
+ // if (__builtin_expect(future.failed(), false)) {
+ // ea25: 48 83 bd c8 fe ff ff cmpq $0x2,-0x138(%rbp)
+ // ea2c: 02
+ // ea2d: 0f 87 f0 05 00 00 ja f023 <ceph::osd::
+ // ...
+ // /// If get() is called in a \ref seastar::thread context,
+ // /// then it need not be available; instead, the thread will
+ // /// be paused until the future becomes available.
+ // [[gnu::always_inline]]
+ // std::tuple<T...> get() {
+ // if (!_state.available()) {
+ // ea3a: 0f 85 1b 05 00 00 jne ef5b <ceph::osd::
+ // }
+ // ...
+ //
+ // I don't perceive this as huge issue. Though, it cannot be
+ // claimed errorator has 0 overhead on the hot path. The perfect
+ // solution here would be to mark `::get_available_state()`
+ // as `protected` and use a dedicated `get_value()` exactly as
+ // `::then()` already does.
+ return futurator_t::invoke(std::forward<ValueFuncT>(valfunc),
+ std::move(future).get());
+ }
+ });
+ }
+
+ /**
+ * unsafe_get / unsafe_get0
+ *
+ * Only valid within a seastar thread. Ignores errorator protections
+ * and throws any contained exceptions.
+ *
+ * Should really only be used within test code
+ * (see test/crimson/gtest_seastar.h).
+ */
+ auto &&unsafe_get() {
+ return seastar::future<ValueT>::get();
+ }
+ auto unsafe_get0() {
+ return seastar::future<ValueT>::get0();
+ }
+
+ template <class FuncT>
+ _future finally(FuncT &&func) {
+ return this->then_wrapped(
+ [func = std::forward<FuncT>(func)](auto &&result) mutable noexcept {
+ if constexpr (seastar::is_future<std::invoke_result_t<FuncT>>::value) {
+ return ::seastar::futurize_invoke(std::forward<FuncT>(func)).then_wrapped(
+ [result = std::move(result)](auto&& f_res) mutable {
+ // TODO: f_res.failed()
+ (void)f_res.discard_result();
+ return std::move(result);
+ });
+ } else {
+ try {
+ func();
+ } catch (...) {
+ // TODO: rethrow
+ }
+ return std::move(result);
+ }
+ });
+ }
+
+ // taking ErrorFuncOne and ErrorFuncTwo separately from ErrorFuncTail
+ // to avoid SFINAE
+ template <class ValueFunc,
+ class ErrorFuncHead,
+ class... ErrorFuncTail>
+ auto safe_then(ValueFunc&& value_func,
+ ErrorFuncHead&& error_func_head,
+ ErrorFuncTail&&... error_func_tail) {
+ static_assert(sizeof...(ErrorFuncTail) > 0);
+ return safe_then(
+ std::forward<ValueFunc>(value_func),
+ composer(std::forward<ErrorFuncHead>(error_func_head),
+ std::forward<ErrorFuncTail>(error_func_tail)...));
+ }
+
+ template <class ValueFunc>
+ auto safe_then(ValueFunc&& value_func) {
+ return safe_then(std::forward<ValueFunc>(value_func),
+ errorator_type::pass_further{});
+ }
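+
+ // A hedged usage sketch (read() and its errorator are hypothetical): the
+ // value continuation is paired with one handler per allowed error, each
+ // usually built via ErrorT::handle() or passed/discarded explicitly, e.g.
+ //   return read(oid).safe_then(
+ //     [] (ceph::bufferlist bl) { /* use the data */ },
+ //     crimson::ct_error::enoent::handle([] { /* treat as empty */ }),
+ //     crimson::ct_error::input_output_error::pass_further{});
+ // errors passed further stay in the resulting errorator's error set and
+ // must be handled by a later safe_then() or handle_error().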
+
+ template <class Func>
+ void then(Func&&) = delete;
+
+ template <class ErrorVisitorT>
+ auto handle_error(ErrorVisitorT&& errfunc) {
+ static_assert((... && std::is_invocable_v<ErrorVisitorT,
+ AllowedErrors>),
+ "provided Error Visitor is not exhaustive");
+ using return_errorator_t = make_errorator_t<
+ errorator<>,
+ std::decay_t<std::invoke_result_t<ErrorVisitorT, AllowedErrors>>...>;
+ using futurator_t = \
+ typename return_errorator_t::template futurize<::seastar::future<ValueT>>;
+ return this->then_wrapped(
+ [ errfunc = std::forward<ErrorVisitorT>(errfunc)
+ ] (auto&& future) mutable noexcept {
+ if (__builtin_expect(future.failed(), false)) {
+ return _safe_then_handle_errors<futurator_t>(
+ std::move(future), std::forward<ErrorVisitorT>(errfunc));
+ } else {
+ return typename futurator_t::type{ std::move(future) };
+ }
+ });
+ }
+ template <class ErrorFuncHead,
+ class... ErrorFuncTail>
+ auto handle_error(ErrorFuncHead&& error_func_head,
+ ErrorFuncTail&&... error_func_tail) {
+ static_assert(sizeof...(ErrorFuncTail) > 0);
+ return this->handle_error(
+ composer(std::forward<ErrorFuncHead>(error_func_head),
+ std::forward<ErrorFuncTail>(error_func_tail)...));
+ }
+
+ private:
+ // for ::crimson::do_for_each
+ template <class Func>
+ auto _then(Func&& func) {
+ return base_t::then(std::forward<Func>(func));
+ }
+ template<typename Iterator, typename AsyncAction>
+ friend inline auto ::crimson::do_for_each(Iterator begin,
+ Iterator end,
+ AsyncAction action);
+
+ template<typename AsyncAction>
+ friend inline auto ::crimson::do_until(AsyncAction action);
+
+ template <typename Result>
+ friend class ::seastar::future;
+
+ // let seastar::do_with_impl up-cast us to seastar::future.
+ template<typename T, typename F>
+ friend inline auto ::seastar::internal::do_with_impl(T&& rvalue, F&& f);
+ template<typename T1, typename T2, typename T3_or_F, typename... More>
+ friend inline auto ::seastar::internal::do_with_impl(T1&& rv1, T2&& rv2, T3_or_F&& rv3, More&&... more);
+ };
+
+ class Enabler {};
+
+ template <typename T>
+ using EnableIf = typename std::enable_if<contains_once_v<std::decay_t<T>>, Enabler>::type;
+
+ template <typename ErrorFunc>
+ struct all_same_way_t {
+ ErrorFunc func;
+ all_same_way_t(ErrorFunc &&error_func)
+ : func(std::forward<ErrorFunc>(error_func)) {}
+
+ template <typename ErrorT, EnableIf<ErrorT>...>
+ decltype(auto) operator()(ErrorT&& e) {
+ using decayed_t = std::decay_t<decltype(e)>;
+ auto&& handler =
+ decayed_t::error_t::handle(std::forward<ErrorFunc>(func));
+ static_assert(std::is_invocable_v<decltype(handler), ErrorT>);
+ return std::invoke(std::move(handler), std::forward<ErrorT>(e));
+ }
+ };
+
+public:
+ // HACK: `errorated_future_marker` and `_future` is just a hack to
+ // specialize `seastar::futurize` for category of class templates:
+ // `future<...>` from distinct errorators. Such tricks are usually
+ // performed based on SFINAE and `std::void_t` to check for the existence
+ // of a trait/member (`future<...>::errorator_type` in our case).
+ // Unfortunately, this technique can't be applied as `futurize`
+ // lacks the optional parameter. The problem looks awfully similar
+ // to the following SO item: https://stackoverflow.com/a/38860413.
+ template <class ValueT=void>
+ using future = _future<::crimson::errorated_future_marker<ValueT>>;
+
+ // the visitor that forwards handling of all errors to next continuation
+ struct pass_further {
+ template <class ErrorT, EnableIf<ErrorT>...>
+ decltype(auto) operator()(ErrorT&& e) {
+ static_assert(contains_once_v<std::decay_t<ErrorT>>,
+ "passing further disallowed ErrorT");
+ return std::forward<ErrorT>(e);
+ }
+ };
+
+ struct discard_all {
+ template <class ErrorT, EnableIf<ErrorT>...>
+ void operator()(ErrorT&&) {
+ static_assert(contains_once_v<std::decay_t<ErrorT>>,
+ "discarding disallowed ErrorT");
+ }
+ };
+
+ // assert_all{ "TODO" };
+ class assert_all {
+ const char* const msg = nullptr;
+ public:
+ template <std::size_t N>
+ assert_all(const char (&msg)[N])
+ : msg(msg) {
+ }
+ assert_all() = default;
+
+ template <class ErrorT, EnableIf<ErrorT>...>
+ void operator()(ErrorT&&) {
+ static_assert(contains_once_v<std::decay_t<ErrorT>>,
+ "discarding disallowed ErrorT");
+ if (msg) {
+ ceph_abort_msg(msg);
+ } else {
+ ceph_abort();
+ }
+ }
+ };
+
+ template <class ErrorFunc>
+ static decltype(auto) all_same_way(ErrorFunc&& error_func) {
+ return all_same_way_t<ErrorFunc>{std::forward<ErrorFunc>(error_func)};
+ };
+
+ // get a new errorator by extending current one with new error
+ template <class... NewAllowedErrorsT>
+ using extend = errorator<AllowedErrors..., NewAllowedErrorsT...>;
+
+ // get a new errorator by summing and deduplicating the error set of
+ // the errorator `unify<>` is applied on with that of another errorator
+ // provided as a template parameter.
+ template <class OtherErroratorT>
+ struct unify {
+ // 1st: generic NOP template
+ };
+ template <class OtherAllowedErrorsHead,
+ class... OtherAllowedErrorsTail>
+ struct unify<errorator<OtherAllowedErrorsHead,
+ OtherAllowedErrorsTail...>> {
+ private:
+ // 2nd: specialization for errorators with non-empty error set.
+ //
+ // split error set of other errorator, passed as template param,
+ // into head and tail. Mix error set of this errorator with head
+ // of the other one only if it isn't already present in the set.
+ using step_errorator = std::conditional_t<
+ contains_once_v<OtherAllowedErrorsHead> == false,
+ errorator<AllowedErrors..., OtherAllowedErrorsHead>,
+ errorator<AllowedErrors...>>;
+ using rest_errorator = errorator<OtherAllowedErrorsTail...>;
+
+ public:
+ using type = typename step_errorator::template unify<rest_errorator>::type;
+ };
+ template <class... EmptyPack>
+ struct unify<errorator<EmptyPack...>> {
+ // 3rd: recursion finisher
+ static_assert(sizeof...(EmptyPack) == 0);
+ using type = errorator<AllowedErrors...>;
+ };
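+
+ // For illustration (a hedged sketch; A, B and C stand for arbitrary error
+ // types): extend<> appends new errors while unify<> merges two error sets
+ // without duplicating entries, e.g.
+ //   using ab_ertr  = errorator<A, B>;
+ //   using abc_ertr = ab_ertr::extend<C>;                    // {A, B, C}
+ //   using merged_t = ab_ertr::unify<errorator<B, C>>::type; // {A, B, C}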
+
+ template <typename T=void, typename... A>
+ static future<T> make_ready_future(A&&... value) {
+ return future<T>(ready_future_marker(), std::forward<A>(value)...);
+ }
+
+ template <typename T=void>
+ static
+ future<T> make_exception_future2(std::exception_ptr&& ex) noexcept {
+ return future<T>(exception_future_marker(), std::move(ex));
+ }
+ template <typename T=void>
+ static
+ future<T> make_exception_future2(seastar::future_state_base&& state) noexcept {
+ return future<T>(exception_future_marker(), std::move(state));
+ }
+ template <typename T=void, typename Exception>
+ static
+ future<T> make_exception_future2(Exception&& ex) noexcept {
+ return make_exception_future2<T>(std::make_exception_ptr(std::forward<Exception>(ex)));
+ }
+
+ static auto now() {
+ return make_ready_future<>();
+ }
+
+private:
+ template <class T, class = std::void_t<T>>
+ class futurize {
+ using vanilla_futurize = seastar::futurize<T>;
+
+ // explicit specializations for nested type is not allowed unless both
+ // the member template and the enclosing template are specialized. see
+ // section temp.expl.spec, N4659
+ template <class Stored, int Dummy = 0>
+ struct stored_to_future {
+ using type = future<Stored>;
+ };
+ template <int Dummy>
+ struct stored_to_future <seastar::internal::monostate, Dummy> {
+ using type = future<>;
+ };
+
+ public:
+ using type =
+ typename stored_to_future<typename vanilla_futurize::value_type>::type;
+
+ template <class Func, class... Args>
+ static type invoke(Func&& func, Args&&... args) {
+ try {
+ return vanilla_futurize::invoke(std::forward<Func>(func),
+ std::forward<Args>(args)...);
+ } catch (...) {
+ return make_exception_future(std::current_exception());
+ }
+ }
+
+ template <class Func>
+ static type invoke(Func&& func, seastar::internal::monostate) {
+ try {
+ return vanilla_futurize::invoke(std::forward<Func>(func));
+ } catch (...) {
+ return make_exception_future(std::current_exception());
+ }
+ }
+
+ template <typename Arg>
+ static type make_exception_future(Arg&& arg) {
+ return vanilla_futurize::make_exception_future(std::forward<Arg>(arg));
+ }
+ };
+ template <template <class...> class ErroratedFutureT,
+ class ValueT>
+ class futurize<ErroratedFutureT<::crimson::errorated_future_marker<ValueT>>,
+ std::void_t<
+ typename ErroratedFutureT<
+ ::crimson::errorated_future_marker<ValueT>>::errorator_type>> {
+ public:
+ using type = ::crimson::errorator<AllowedErrors...>::future<ValueT>;
+
+ template <class Func, class... Args>
+ static type apply(Func&& func, std::tuple<Args...>&& args) {
+ try {
+ return ::seastar::futurize_apply(std::forward<Func>(func),
+ std::forward<std::tuple<Args...>>(args));
+ } catch (...) {
+ return make_exception_future(std::current_exception());
+ }
+ }
+
+ template <class Func, class... Args>
+ static type invoke(Func&& func, Args&&... args) {
+ try {
+ return ::seastar::futurize_invoke(std::forward<Func>(func),
+ std::forward<Args>(args)...);
+ } catch (...) {
+ return make_exception_future(std::current_exception());
+ }
+ }
+
+ template <class Func>
+ static type invoke(Func&& func, seastar::internal::monostate) {
+ try {
+ return ::seastar::futurize_invoke(std::forward<Func>(func));
+ } catch (...) {
+ return make_exception_future(std::current_exception());
+ }
+ }
+
+ template <typename Arg>
+ static type make_exception_future(Arg&& arg) {
+ return ::crimson::errorator<AllowedErrors...>::make_exception_future2<ValueT>(std::forward<Arg>(arg));
+ }
+ };
+
+ template <class ErrorT>
+ static std::exception_ptr make_exception_ptr(ErrorT&& e) {
+ // calling via interface class due to encapsulation and friend relations.
+ return e.error_t<std::decay_t<ErrorT>>::to_exception_ptr();
+ }
+
+ // needed because of:
+ // * return_errorator_t::template futurize<...> in `safe_then()`,
+ // * conversion to `std::exception_ptr` in `future::future(ErrorT&&)`.
+ // the friendship with all errorators is an idea from Kefu to fix build
+ // issues on GCC 9. This version likely fixes some access violation bug
+ // we were exploiting before.
+ template <class...>
+ friend class errorator;
+}; // class errorator, generic template
+
+// no errors? errorator<>::future is plain seastar::future then!
+template <>
+class errorator<> {
+public:
+ template <class ValueT>
+ using future = ::seastar::future<ValueT>;
+
+ template <class T>
+ using futurize = ::seastar::futurize<T>;
+
+ // get a new errorator by extending current one with new error
+ template <class... NewAllowedErrors>
+ using extend = errorator<NewAllowedErrors...>;
+
+ // errorator with empty error set never contains any error
+ template <class T>
+ static constexpr bool contains_once_v = false;
+}; // class errorator, <> specialization
+
+
+template <class ErroratorOne,
+ class ErroratorTwo,
+ class... FurtherErrators>
+struct compound_errorator {
+private:
+ // generic template. Empty `FurtherErrators` are handled by
+ // the specialization below.
+ static_assert(sizeof...(FurtherErrators) > 0);
+ using step =
+ typename compound_errorator<ErroratorOne, ErroratorTwo>::type;
+
+public:
+ using type =
+ typename compound_errorator<step, FurtherErrators...>::type;
+};
+template <class ErroratorOne,
+ class ErroratorTwo>
+struct compound_errorator<ErroratorOne, ErroratorTwo> {
+ // specialization for empty `FurtherErrators` arg pack
+ using type =
+ typename ErroratorOne::template unify<ErroratorTwo>::type;
+};
+template <class... Args>
+using compound_errorator_t = typename compound_errorator<Args...>::type;
+
+// this is a conjunction of two nasty features: C++14's variable templates
+// and C++17's inline global variables. The latter is crucial to ensure
+// the variable gets the same address across all translation units.
+template <std::errc ErrorV>
+inline std::error_code ec = std::make_error_code(ErrorV);
+
+template <std::errc ErrorV>
+using ct_error_code = unthrowable_wrapper<const std::error_code&, ec<ErrorV>>;
+
+namespace ct_error {
+ using enoent = ct_error_code<std::errc::no_such_file_or_directory>;
+ using enodata = ct_error_code<std::errc::no_message_available>;
+ using invarg = ct_error_code<std::errc::invalid_argument>;
+ using input_output_error = ct_error_code<std::errc::io_error>;
+ using object_corrupted = ct_error_code<std::errc::illegal_byte_sequence>;
+ using permission_denied = ct_error_code<std::errc::permission_denied>;
+ using operation_not_supported =
+ ct_error_code<std::errc::operation_not_supported>;
+ using not_connected = ct_error_code<std::errc::not_connected>;
+ using timed_out = ct_error_code<std::errc::timed_out>;
+ using erange =
+ ct_error_code<std::errc::result_out_of_range>;
+ using ebadf =
+ ct_error_code<std::errc::bad_file_descriptor>;
+ using enospc =
+ ct_error_code<std::errc::no_space_on_device>;
+ using value_too_large = ct_error_code<std::errc::value_too_large>;
+ using eagain =
+ ct_error_code<std::errc::resource_unavailable_try_again>;
+ using file_too_large =
+ ct_error_code<std::errc::file_too_large>;
+ using address_in_use = ct_error_code<std::errc::address_in_use>;
+
+ struct pass_further_all {
+ template <class ErrorT>
+ decltype(auto) operator()(ErrorT&& e) {
+ return std::forward<ErrorT>(e);
+ }
+ };
+
+ struct discard_all {
+ template <class ErrorT>
+ void operator()(ErrorT&&) {
+ }
+ };
+
+ class assert_all {
+ const char* const msg = nullptr;
+ public:
+ template <std::size_t N>
+ assert_all(const char (&msg)[N])
+ : msg(msg) {
+ }
+ assert_all() = default;
+
+ template <class ErrorT>
+ void operator()(ErrorT&&) {
+ if (msg) {
+ ceph_abort(msg);
+ } else {
+ ceph_abort();
+ }
+ }
+ };
+
+ template <class ErrorFunc>
+ static decltype(auto) all_same_way(ErrorFunc&& error_func) {
+ return [
+ error_func = std::forward<ErrorFunc>(error_func)
+ ] (auto&& e) mutable -> decltype(auto) {
+ using decayed_t = std::decay_t<decltype(e)>;
+ auto&& handler =
+ decayed_t::error_t::handle(std::forward<ErrorFunc>(error_func));
+ return std::invoke(std::move(handler), std::forward<decltype(e)>(e));
+ };
+ };
+}
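+
+// Illustrative sketch: the helpers above are intended to be passed to the
+// handle_error() combinator defined earlier in this header. `read()` is a
+// hypothetical errorated call:
+//
+//   return read(oid).handle_error(
+//     ct_error::assert_all{"read() hit an unexpected error"});
+//
+//   // or, to treat every allowed error the same way:
+//   return read(oid).handle_error(
+//     ct_error::all_same_way([](auto&& e) { /* log and drop */ }));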
+
+using stateful_errc = stateful_error_t<std::errc>;
+using stateful_errint = stateful_error_t<int>;
+using stateful_ec = stateful_error_t<std::error_code>;
+
+} // namespace crimson
+
+
+// open the `seastar` namespace to specialize `futurize`. This is not
+// pretty for sure. I just hope it's not worse than e.g. specializing
+// `hash` in the `std` namespace. The justification is copy avoidance
+// in `future<...>::safe_then()`. See the comments there for details.
+namespace seastar {
+
+// Container is a placeholder for errorator::_future<> template
+template <template <class> class Container,
+ class Value>
+struct futurize<Container<::crimson::errorated_future_marker<Value>>> {
+ using errorator_type = typename Container<
+ ::crimson::errorated_future_marker<Value>>::errorator_type;
+
+ using type = typename errorator_type::template future<Value>;
+ using value_type = seastar::internal::future_stored_type_t<Value>;
+
+ template<typename Func, typename... FuncArgs>
+ [[gnu::always_inline]]
+ static inline type invoke(Func&& func, FuncArgs&&... args) noexcept {
+ try {
+ return func(std::forward<FuncArgs>(args)...);
+ } catch (...) {
+ return make_exception_future(std::current_exception());
+ }
+ }
+
+ template <class Func>
+ [[gnu::always_inline]]
+ static type invoke(Func&& func, seastar::internal::monostate) noexcept {
+ try {
+ return func();
+ } catch (...) {
+ return make_exception_future(std::current_exception());
+ }
+ }
+
+ template <typename Arg>
+ [[gnu::always_inline]]
+ static type make_exception_future(Arg&& arg) {
+ return errorator_type::template make_exception_future2<Value>(std::forward<Arg>(arg));
+ }
+
+private:
+ template<typename PromiseT, typename Func>
+ static void satisfy_with_result_of(PromiseT&& pr, Func&& func) {
+ // this may use the protected variant of `seastar::future::forward_to()`
+ // because:
+    // 1. `seastar::future` established a friendship with all
+    //    specializations of `seastar::futurize`, including this
+    //    one (we're in the `seastar` namespace!), WHILE
+    // 2. any errorated future now declares friendship with any
+    //    `seastar::futurize<...>`.
+ func().forward_to(std::move(pr));
+ }
+ template <typename U>
+ friend class future;
+};
+
+template <template <class> class Container,
+ class Value>
+struct continuation_base_from_future<Container<::crimson::errorated_future_marker<Value>>> {
+ using type = continuation_base<Value>;
+};
+
+} // namespace seastar
diff --git a/src/crimson/common/exception.h b/src/crimson/common/exception.h
new file mode 100644
index 000000000..05caf5ebd
--- /dev/null
+++ b/src/crimson/common/exception.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <exception>
+#include <seastar/core/future.hh>
+#include <seastar/core/future-util.hh>
+
+#include "crimson/common/log.h"
+
+namespace crimson::common {
+
+class system_shutdown_exception final : public std::exception {
+public:
+ const char* what() const noexcept final {
+ return "system shutting down";
+ }
+};
+
+class actingset_changed final : public std::exception {
+public:
+ actingset_changed(bool sp) : still_primary(sp) {}
+ const char* what() const noexcept final {
+ return "acting set changed";
+ }
+ bool is_primary() const {
+ return still_primary;
+ }
+private:
+ const bool still_primary;
+};
+
+template<typename Func, typename... Args>
+inline seastar::future<> handle_system_shutdown(Func&& func, Args&&... args)
+{
+ return seastar::futurize_invoke(std::forward<Func>(func),
+ std::forward<Args>(args)...)
+ .handle_exception([](std::exception_ptr eptr) {
+ if (*eptr.__cxa_exception_type() ==
+ typeid(crimson::common::system_shutdown_exception)) {
+ crimson::get_logger(ceph_subsys_osd).debug(
+ "operation skipped, system shutdown");
+ return seastar::now();
+ }
+ std::rethrow_exception(eptr);
+ });
+}
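+
+// Illustrative sketch: wrapping a message handler so that a
+// system_shutdown_exception raised mid-operation is logged and swallowed
+// rather than propagated (`do_process` and `m` are hypothetical):
+//
+//   return handle_system_shutdown([this, m] {
+//     return do_process(m);
+//   });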
+
+}
diff --git a/src/crimson/common/fixed_kv_node_layout.h b/src/crimson/common/fixed_kv_node_layout.h
new file mode 100644
index 000000000..4c7cc2e76
--- /dev/null
+++ b/src/crimson/common/fixed_kv_node_layout.h
@@ -0,0 +1,700 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+
+#include "include/byteorder.h"
+
+#include "crimson/common/layout.h"
+
+namespace crimson::common {
+
+template <typename T, bool is_const>
+struct maybe_const_t {
+};
+template<typename T>
+struct maybe_const_t<T, true> {
+ using type = const T*;
+};
+template<typename T>
+struct maybe_const_t<T, false> {
+ using type = T*;
+};
+
+
+/**
+ * FixedKVNodeLayout
+ *
+ * Reusable implementation of a fixed size block mapping
+ * K -> V with internal representations KINT and VINT.
+ *
+ * Uses absl::container_internal::Layout for the actual memory layout.
+ *
+ * The primary interface exposed is centered on the iterator
+ * and related methods.
+ *
+ * Also included are helpers for doing splits and merges as for a btree.
+ */
+template <
+ size_t CAPACITY,
+ typename Meta,
+ typename MetaInt,
+ typename K,
+ typename KINT,
+ typename V,
+ typename VINT,
+ bool VALIDATE_INVARIANTS=true>
+class FixedKVNodeLayout {
+ char *buf = nullptr;
+
+ using L = absl::container_internal::Layout<ceph_le32, MetaInt, KINT, VINT>;
+ static constexpr L layout{1, 1, CAPACITY, CAPACITY};
+
+public:
+ template <bool is_const>
+ struct iter_t {
+ friend class FixedKVNodeLayout;
+ using parent_t = typename maybe_const_t<FixedKVNodeLayout, is_const>::type;
+
+ parent_t node;
+ uint16_t offset;
+
+ iter_t(
+ parent_t parent,
+ uint16_t offset) : node(parent), offset(offset) {}
+
+ iter_t(const iter_t &) = default;
+ iter_t(iter_t &&) = default;
+ iter_t &operator=(const iter_t &) = default;
+ iter_t &operator=(iter_t &&) = default;
+
+ operator iter_t<!is_const>() const {
+ static_assert(!is_const);
+ return iter_t<!is_const>(node, offset);
+ }
+
+ // Work nicely with for loops without requiring a nested type.
+ iter_t &operator*() { return *this; }
+ iter_t *operator->() { return this; }
+
+ iter_t operator++(int) {
+ auto ret = *this;
+ ++offset;
+ return ret;
+ }
+
+ iter_t &operator++() {
+ ++offset;
+ return *this;
+ }
+
+ uint16_t operator-(const iter_t &rhs) const {
+ assert(rhs.node == node);
+ return offset - rhs.offset;
+ }
+
+ iter_t operator+(uint16_t off) const {
+ return iter_t(
+ node,
+ offset + off);
+ }
+ iter_t operator-(uint16_t off) const {
+ return iter_t(
+ node,
+ offset - off);
+ }
+
+ bool operator==(const iter_t &rhs) const {
+ assert(node == rhs.node);
+ return rhs.offset == offset;
+ }
+
+ bool operator!=(const iter_t &rhs) const {
+ return !(*this == rhs);
+ }
+
+ K get_key() const {
+ return K(node->get_key_ptr()[offset]);
+ }
+
+ K get_next_key_or_max() const {
+ auto next = *this + 1;
+ if (next == node->end())
+ return std::numeric_limits<K>::max();
+ else
+ return next->get_key();
+ }
+
+ void set_val(V val) const {
+ static_assert(!is_const);
+ node->get_val_ptr()[offset] = VINT(val);
+ }
+
+ V get_val() const {
+ return V(node->get_val_ptr()[offset]);
+ };
+
+ bool contains(K addr) const {
+ return (get_key() <= addr) && (get_next_key_or_max() > addr);
+ }
+
+ uint16_t get_offset() const {
+ return offset;
+ }
+
+ private:
+ void set_key(K _lb) const {
+ static_assert(!is_const);
+ KINT lb;
+ lb = _lb;
+ node->get_key_ptr()[offset] = lb;
+ }
+
+ typename maybe_const_t<char, is_const>::type get_key_ptr() const {
+ return reinterpret_cast<
+ typename maybe_const_t<char, is_const>::type>(
+ node->get_key_ptr() + offset);
+ }
+
+ typename maybe_const_t<char, is_const>::type get_val_ptr() const {
+ return reinterpret_cast<
+ typename maybe_const_t<char, is_const>::type>(
+ node->get_val_ptr() + offset);
+ }
+ };
+ using const_iterator = iter_t<true>;
+ using iterator = iter_t<false>;
+
+ struct delta_t {
+ enum class op_t : uint8_t {
+ INSERT,
+ REMOVE,
+ UPDATE,
+ } op;
+ KINT key;
+ VINT val;
+
+ void replay(FixedKVNodeLayout &l) {
+ switch (op) {
+ case op_t::INSERT: {
+ l.insert(l.lower_bound(key), key, val);
+ break;
+ }
+ case op_t::REMOVE: {
+ auto iter = l.find(key);
+ assert(iter != l.end());
+ l.remove(iter);
+ break;
+ }
+ case op_t::UPDATE: {
+ auto iter = l.find(key);
+ assert(iter != l.end());
+ l.update(iter, val);
+ break;
+ }
+ default:
+ assert(0 == "Impossible");
+ }
+ }
+
+ bool operator==(const delta_t &rhs) const {
+ return op == rhs.op &&
+ key == rhs.key &&
+ val == rhs.val;
+ }
+ };
+
+public:
+ class delta_buffer_t {
+ std::vector<delta_t> buffer;
+ public:
+ bool empty() const {
+ return buffer.empty();
+ }
+ void insert(
+ const K &key,
+ const V &val) {
+ KINT k;
+ k = key;
+ buffer.push_back(
+ delta_t{
+ delta_t::op_t::INSERT,
+ k,
+ VINT(val)
+ });
+ }
+ void update(
+ const K &key,
+ const V &val) {
+ KINT k;
+ k = key;
+ buffer.push_back(
+ delta_t{
+ delta_t::op_t::UPDATE,
+ k,
+ VINT(val)
+ });
+ }
+ void remove(const K &key) {
+ KINT k;
+ k = key;
+ buffer.push_back(
+ delta_t{
+ delta_t::op_t::REMOVE,
+ k,
+ VINT()
+ });
+ }
+ void replay(FixedKVNodeLayout &node) {
+ for (auto &i: buffer) {
+ i.replay(node);
+ }
+ }
+ size_t get_bytes() const {
+ return buffer.size() * sizeof(delta_t);
+ }
+ void copy_out(char *out, size_t len) {
+ assert(len == get_bytes());
+ ::memcpy(out, reinterpret_cast<const void *>(buffer.data()), get_bytes());
+ buffer.clear();
+ }
+ void copy_in(const char *out, size_t len) {
+ assert(empty());
+ assert(len % sizeof(delta_t) == 0);
+ buffer = std::vector(
+ reinterpret_cast<const delta_t*>(out),
+ reinterpret_cast<const delta_t*>(out + len));
+ }
+ bool operator==(const delta_buffer_t &rhs) const {
+ return buffer == rhs.buffer;
+ }
+ };
+
+ void journal_insert(
+ const_iterator _iter,
+ const K &key,
+ const V &val,
+ delta_buffer_t *recorder) {
+ auto iter = iterator(this, _iter.offset);
+ if (recorder) {
+ recorder->insert(
+ key,
+ val);
+ }
+ insert(iter, key, val);
+ }
+
+ void journal_update(
+ const_iterator _iter,
+ const V &val,
+ delta_buffer_t *recorder) {
+ auto iter = iterator(this, _iter.offset);
+ if (recorder) {
+ recorder->update(iter->get_key(), val);
+ }
+ update(iter, val);
+ }
+
+ void journal_replace(
+ const_iterator _iter,
+ const K &key,
+ const V &val,
+ delta_buffer_t *recorder) {
+ auto iter = iterator(this, _iter.offset);
+ if (recorder) {
+ recorder->remove(iter->get_key());
+ recorder->insert(key, val);
+ }
+ replace(iter, key, val);
+ }
+
+
+ void journal_remove(
+ const_iterator _iter,
+ delta_buffer_t *recorder) {
+ auto iter = iterator(this, _iter.offset);
+ if (recorder) {
+ recorder->remove(iter->get_key());
+ }
+ remove(iter);
+ }
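+
+  // Illustrative delta flow (sketch; everything except the journal_*() and
+  // delta_buffer_t APIs is hypothetical): mutations recorded while applying
+  // them to one node can be serialized and replayed against another copy:
+  //
+  //   delta_buffer_t rec;
+  //   node.journal_insert(node.lower_bound(key), key, val, &rec);
+  //   std::vector<char> bl(rec.get_bytes());
+  //   rec.copy_out(bl.data(), bl.size());
+  //   ...
+  //   delta_buffer_t replay_buf;
+  //   replay_buf.copy_in(bl.data(), bl.size());
+  //   replay_buf.replay(other_node);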
+
+
+ FixedKVNodeLayout(char *buf) :
+ buf(buf) {}
+
+ virtual ~FixedKVNodeLayout() = default;
+
+ const_iterator begin() const {
+ return const_iterator(
+ this,
+ 0);
+ }
+
+ const_iterator end() const {
+ return const_iterator(
+ this,
+ get_size());
+ }
+
+ iterator begin() {
+ return iterator(
+ this,
+ 0);
+ }
+
+ iterator end() {
+ return iterator(
+ this,
+ get_size());
+ }
+
+ const_iterator iter_idx(uint16_t off) const {
+ return const_iterator(
+ this,
+ off);
+ }
+
+ const_iterator find(K l) const {
+ auto ret = begin();
+ for (; ret != end(); ++ret) {
+ if (ret->get_key() == l)
+ break;
+ }
+ return ret;
+ }
+ iterator find(K l) {
+ const auto &tref = *this;
+ return iterator(this, tref.find(l).offset);
+ }
+
+ const_iterator lower_bound(K l) const {
+ auto ret = begin();
+ for (; ret != end(); ++ret) {
+ if (ret->get_key() >= l)
+ break;
+ }
+ return ret;
+ }
+ iterator lower_bound(K l) {
+ const auto &tref = *this;
+ return iterator(this, tref.lower_bound(l).offset);
+ }
+
+ const_iterator upper_bound(K l) const {
+ auto ret = begin();
+ for (; ret != end(); ++ret) {
+ if (ret->get_key() > l)
+ break;
+ }
+ return ret;
+ }
+ iterator upper_bound(K l) {
+ const auto &tref = *this;
+ return iterator(this, tref.upper_bound(l).offset);
+ }
+
+ const_iterator get_split_pivot() const {
+ return iter_idx(get_size() / 2);
+ }
+
+ uint16_t get_size() const {
+ return *layout.template Pointer<0>(buf);
+ }
+
+ /**
+ * set_size
+ *
+ * Set size representation to match size
+ */
+ void set_size(uint16_t size) {
+ *layout.template Pointer<0>(buf) = size;
+ }
+
+ /**
+ * get_meta/set_meta
+ *
+ * Enables stashing a templated type within the layout.
+ * Cannot be modified after initial write as it is not represented
+ * in delta_t
+ */
+ Meta get_meta() const {
+ MetaInt &metaint = *layout.template Pointer<1>(buf);
+ return Meta(metaint);
+ }
+ void set_meta(const Meta &meta) {
+ *layout.template Pointer<1>(buf) = MetaInt(meta);
+ }
+
+ constexpr static size_t get_capacity() {
+ return CAPACITY;
+ }
+
+ bool operator==(const FixedKVNodeLayout &rhs) const {
+ if (get_size() != rhs.get_size()) {
+ return false;
+ }
+
+ auto iter = begin();
+ auto iter2 = rhs.begin();
+ while (iter != end()) {
+ if (iter->get_key() != iter2->get_key() ||
+ iter->get_val() != iter2->get_val()) {
+ return false;
+ }
+ iter++;
+ iter2++;
+ }
+ return true;
+ }
+
+ /**
+ * split_into
+ *
+ * Takes *this and splits its contents into left and right.
+ */
+ K split_into(
+ FixedKVNodeLayout &left,
+ FixedKVNodeLayout &right) const {
+ auto piviter = get_split_pivot();
+
+ left.copy_from_foreign(left.begin(), begin(), piviter);
+ left.set_size(piviter - begin());
+
+ right.copy_from_foreign(right.begin(), piviter, end());
+ right.set_size(end() - piviter);
+
+ auto [lmeta, rmeta] = get_meta().split_into(piviter->get_key());
+ left.set_meta(lmeta);
+ right.set_meta(rmeta);
+
+ return piviter->get_key();
+ }
+
+ /**
+ * merge_from
+ *
+ * Takes two nodes and copies their contents into *this.
+ *
+ * precondition: left.size() + right.size() < CAPACITY
+ */
+ void merge_from(
+ const FixedKVNodeLayout &left,
+ const FixedKVNodeLayout &right)
+ {
+ copy_from_foreign(
+ end(),
+ left.begin(),
+ left.end());
+ set_size(left.get_size());
+ copy_from_foreign(
+ end(),
+ right.begin(),
+ right.end());
+ set_size(left.get_size() + right.get_size());
+ set_meta(Meta::merge_from(left.get_meta(), right.get_meta()));
+ }
+
+ /**
+ * balance_into_new_nodes
+ *
+ * Takes the contents of left and right and copies them into
+ * replacement_left and replacement_right such that in the
+ * event that the number of elements is odd the extra goes to
+ * the left side iff prefer_left.
+ */
+ static K balance_into_new_nodes(
+ const FixedKVNodeLayout &left,
+ const FixedKVNodeLayout &right,
+ bool prefer_left,
+ FixedKVNodeLayout &replacement_left,
+ FixedKVNodeLayout &replacement_right)
+ {
+ auto total = left.get_size() + right.get_size();
+ auto pivot_idx = (left.get_size() + right.get_size()) / 2;
+ if (total % 2 && prefer_left) {
+ pivot_idx++;
+ }
+ auto replacement_pivot = pivot_idx >= left.get_size() ?
+ right.iter_idx(pivot_idx - left.get_size())->get_key() :
+ left.iter_idx(pivot_idx)->get_key();
+
+ if (pivot_idx < left.get_size()) {
+ replacement_left.copy_from_foreign(
+ replacement_left.end(),
+ left.begin(),
+ left.iter_idx(pivot_idx));
+ replacement_left.set_size(pivot_idx);
+
+ replacement_right.copy_from_foreign(
+ replacement_right.end(),
+ left.iter_idx(pivot_idx),
+ left.end());
+
+ replacement_right.set_size(left.get_size() - pivot_idx);
+ replacement_right.copy_from_foreign(
+ replacement_right.end(),
+ right.begin(),
+ right.end());
+ replacement_right.set_size(total - pivot_idx);
+ } else {
+ replacement_left.copy_from_foreign(
+ replacement_left.end(),
+ left.begin(),
+ left.end());
+ replacement_left.set_size(left.get_size());
+
+ replacement_left.copy_from_foreign(
+ replacement_left.end(),
+ right.begin(),
+ right.iter_idx(pivot_idx - left.get_size()));
+ replacement_left.set_size(pivot_idx);
+
+ replacement_right.copy_from_foreign(
+ replacement_right.end(),
+ right.iter_idx(pivot_idx - left.get_size()),
+ right.end());
+ replacement_right.set_size(total - pivot_idx);
+ }
+
+ auto [lmeta, rmeta] = Meta::rebalance(
+ left.get_meta(), right.get_meta(), replacement_pivot);
+ replacement_left.set_meta(lmeta);
+ replacement_right.set_meta(rmeta);
+ return replacement_pivot;
+ }
+
+private:
+ void insert(
+ iterator iter,
+ const K &key,
+ const V &val) {
+ if (VALIDATE_INVARIANTS) {
+ if (iter != begin()) {
+ assert((iter - 1)->get_key() < key);
+ }
+ if (iter != end()) {
+ assert(iter->get_key() > key);
+ }
+ assert(get_size() < CAPACITY);
+ }
+ copy_from_local(iter + 1, iter, end());
+ iter->set_key(key);
+ iter->set_val(val);
+ set_size(get_size() + 1);
+ }
+
+ void update(
+ iterator iter,
+ V val) {
+ assert(iter != end());
+ iter->set_val(val);
+ }
+
+ void replace(
+ iterator iter,
+ const K &key,
+ const V &val) {
+ assert(iter != end());
+ if (VALIDATE_INVARIANTS) {
+ if (iter != begin()) {
+ assert((iter - 1)->get_key() < key);
+ }
+ if ((iter + 1) != end()) {
+ assert((iter + 1)->get_key() > key);
+ }
+ }
+ iter->set_key(key);
+ iter->set_val(val);
+ }
+
+ void remove(iterator iter) {
+ assert(iter != end());
+ copy_from_local(iter, iter + 1, end());
+ set_size(get_size() - 1);
+ }
+
+ /**
+ * get_key_ptr
+ *
+ * Get pointer to start of key array
+ */
+ KINT *get_key_ptr() {
+ return layout.template Pointer<2>(buf);
+ }
+ const KINT *get_key_ptr() const {
+ return layout.template Pointer<2>(buf);
+ }
+
+ /**
+ * get_val_ptr
+ *
+ * Get pointer to start of val array
+ */
+ VINT *get_val_ptr() {
+ return layout.template Pointer<3>(buf);
+ }
+ const VINT *get_val_ptr() const {
+ return layout.template Pointer<3>(buf);
+ }
+
+ /**
+ * node_resolve/unresolve_vals
+ *
+ * If the representation for values depends in some way on the
+ * node in which they are located, users may implement
+ * resolve/unresolve to enable copy_from_foreign to handle that
+ * transition.
+ */
+ virtual void node_resolve_vals(iterator from, iterator to) const {}
+ virtual void node_unresolve_vals(iterator from, iterator to) const {}
+
+ /**
+ * copy_from_foreign
+ *
+ * Copies entries from [from_src, to_src) to tgt.
+ *
+ * tgt and from_src must be from different nodes.
+ * from_src and to_src must be from the same node.
+ */
+ static void copy_from_foreign(
+ iterator tgt,
+ const_iterator from_src,
+ const_iterator to_src) {
+ assert(tgt->node != from_src->node);
+ assert(to_src->node == from_src->node);
+ memcpy(
+ tgt->get_val_ptr(), from_src->get_val_ptr(),
+ to_src->get_val_ptr() - from_src->get_val_ptr());
+ memcpy(
+ tgt->get_key_ptr(), from_src->get_key_ptr(),
+ to_src->get_key_ptr() - from_src->get_key_ptr());
+ from_src->node->node_resolve_vals(tgt, tgt + (to_src - from_src));
+ tgt->node->node_unresolve_vals(tgt, tgt + (to_src - from_src));
+ }
+
+ /**
+ * copy_from_local
+ *
+ * Copies entries from [from_src, to_src) to tgt.
+ *
+ * tgt, from_src, and to_src must be from the same node.
+ */
+ static void copy_from_local(
+ iterator tgt,
+ iterator from_src,
+ iterator to_src) {
+ assert(tgt->node == from_src->node);
+ assert(to_src->node == from_src->node);
+ memmove(
+ tgt->get_val_ptr(), from_src->get_val_ptr(),
+ to_src->get_val_ptr() - from_src->get_val_ptr());
+ memmove(
+ tgt->get_key_ptr(), from_src->get_key_ptr(),
+ to_src->get_key_ptr() - from_src->get_key_ptr());
+ }
+};
+
+}
diff --git a/src/crimson/common/formatter.cc b/src/crimson/common/formatter.cc
new file mode 100644
index 000000000..677216224
--- /dev/null
+++ b/src/crimson/common/formatter.cc
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "formatter.h"
+
+#include <fmt/format.h>
+#if FMT_VERSION >= 60000
+#include <fmt/chrono.h>
+#else
+#include <fmt/time.h>
+#endif
+
+
+template <>
+struct fmt::formatter<seastar::lowres_system_clock::time_point> {
+ // ignore the format string
+ template <typename ParseContext>
+ constexpr auto parse(ParseContext &ctx) { return ctx.begin(); }
+
+ template <typename FormatContext>
+ auto format(const seastar::lowres_system_clock::time_point& t,
+ FormatContext& ctx) {
+ std::time_t tt = std::chrono::duration_cast<std::chrono::seconds>(
+ t.time_since_epoch()).count();
+    auto milliseconds = std::chrono::duration_cast<std::chrono::milliseconds>(
+      t.time_since_epoch() % std::chrono::seconds(1)).count();
+ return fmt::format_to(ctx.out(), "{:%Y-%m-%d %H:%M:%S} {:03d}",
+ fmt::localtime(tt), milliseconds);
+ }
+};
+
+template <>
+struct fmt::formatter<ceph::coarse_real_clock::time_point> {
+ // ignore the format string
+ template <typename ParseContext>
+ constexpr auto parse(ParseContext &ctx) { return ctx.begin(); }
+
+ template <typename FormatContext>
+ auto format(const ceph::coarse_real_clock::time_point& t,
+ FormatContext& ctx) {
+ std::time_t tt = std::chrono::duration_cast<std::chrono::seconds>(
+ t.time_since_epoch()).count();
+    auto milliseconds = std::chrono::duration_cast<std::chrono::milliseconds>(
+      t.time_since_epoch() % std::chrono::seconds(1)).count();
+ return fmt::format_to(ctx.out(), "{:%Y-%m-%d %H:%M:%S} {:03d}",
+ fmt::localtime(tt), milliseconds);
+ }
+};
+
+namespace std {
+
+ostream& operator<<(ostream& out,
+ const seastar::lowres_system_clock::time_point& t)
+{
+ return out << fmt::format("{}", t);
+}
+
+ostream& operator<<(ostream& out,
+ const ceph::coarse_real_clock::time_point& t)
+{
+ return out << fmt::format("{}", t);
+}
+
+}
diff --git a/src/crimson/common/formatter.h b/src/crimson/common/formatter.h
new file mode 100644
index 000000000..1775b0954
--- /dev/null
+++ b/src/crimson/common/formatter.h
@@ -0,0 +1,15 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <seastar/core/lowres_clock.hh>
+
+#include "common/ceph_time.h"
+
+namespace std {
+
+ostream& operator<<(ostream& out,
+ const seastar::lowres_system_clock::time_point& t);
+ostream& operator<<(ostream& out,
+ const ceph::coarse_real_clock::time_point& t);
+
+}
diff --git a/src/crimson/common/gated.h b/src/crimson/common/gated.h
new file mode 100644
index 000000000..7d901b6b1
--- /dev/null
+++ b/src/crimson/common/gated.h
@@ -0,0 +1,51 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/gate.hh>
+#include <seastar/core/future.hh>
+#include <seastar/core/future-util.hh>
+
+#include "crimson/common/exception.h"
+#include "crimson/common/log.h"
+#include "include/ceph_assert.h"
+
+namespace crimson::common {
+
+class Gated {
+ public:
+ static seastar::logger& gated_logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+ template <typename Func, typename T>
+ inline void dispatch_in_background(const char* what, T& who, Func&& func) {
+ (void) dispatch(what, who, func);
+ }
+ template <typename Func, typename T>
+ inline seastar::future<> dispatch(const char* what, T& who, Func&& func) {
+ return seastar::with_gate(pending_dispatch, std::forward<Func>(func)
+ ).handle_exception([what, &who] (std::exception_ptr eptr) {
+ if (*eptr.__cxa_exception_type() == typeid(system_shutdown_exception)) {
+ gated_logger().debug(
+ "{}, {} skipped, system shutdown", who, what);
+ return;
+ }
+ gated_logger().error(
+ "{} dispatch() {} caught exception: {}", who, what, eptr);
+ assert(*eptr.__cxa_exception_type()
+ == typeid(seastar::gate_closed_exception));
+ });
+ }
+
+ seastar::future<> close() {
+ return pending_dispatch.close();
+ }
+ bool is_closed() const {
+ return pending_dispatch.is_closed();
+ }
+ private:
+ seastar::gate pending_dispatch;
+};
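+
+// Illustrative sketch: guard background work behind the gate so shutdown can
+// wait for it to drain (`conn` and `reply` are hypothetical):
+//
+//   Gated gate;
+//   gate.dispatch_in_background("send_reply", conn, [&conn, reply] {
+//     return conn.send(std::move(reply));
+//   });
+//   ...
+//   return gate.close();   // resolves once all dispatched work has finished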
+
+}// namespace crimson::common
diff --git a/src/crimson/common/layout.h b/src/crimson/common/layout.h
new file mode 100644
index 000000000..9d54ecd1d
--- /dev/null
+++ b/src/crimson/common/layout.h
@@ -0,0 +1,737 @@
+// Copyright 2018 The Abseil Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// MOTIVATION AND TUTORIAL
+//
+// If you want to put in a single heap allocation N doubles followed by M ints,
+// it's easy if N and M are known at compile time.
+//
+// struct S {
+// double a[N];
+// int b[M];
+// };
+//
+// S* p = new S;
+//
+// But what if N and M are known only in run time? Class template Layout to the
+// rescue! It's a portable generalization of the technique known as struct hack.
+//
+// // This object will tell us everything we need to know about the memory
+// // layout of double[N] followed by int[M]. It's structurally identical to
+// // size_t[2] that stores N and M. It's very cheap to create.
+// const Layout<double, int> layout(N, M);
+//
+// // Allocate enough memory for both arrays. `AllocSize()` tells us how much
+// // memory is needed. We are free to use any allocation function we want as
+// // long as it returns aligned memory.
+// std::unique_ptr<unsigned char[]> p(new unsigned char[layout.AllocSize()]);
+//
+// // Obtain the pointer to the array of doubles.
+// // Equivalent to `reinterpret_cast<double*>(p.get())`.
+// //
+// // We could have written layout.Pointer<0>(p) instead. If all the types are
+// // unique you can use either form, but if some types are repeated you must
+// // use the index form.
+// double* a = layout.Pointer<double>(p.get());
+//
+// // Obtain the pointer to the array of ints.
+// // Equivalent to `reinterpret_cast<int*>(p.get() + N * 8)`.
+// int* b = layout.Pointer<int>(p);
+//
+// If we are unable to specify sizes of all fields, we can pass as many sizes as
+// we can to `Partial()`. In return, it'll allow us to access the fields whose
+// locations and sizes can be computed from the provided information.
+// `Partial()` comes in handy when the array sizes are embedded into the
+// allocation.
+//
+// // size_t[1] containing N, size_t[1] containing M, double[N], int[M].
+// using L = Layout<size_t, size_t, double, int>;
+//
+// unsigned char* Allocate(size_t n, size_t m) {
+// const L layout(1, 1, n, m);
+// unsigned char* p = new unsigned char[layout.AllocSize()];
+// *layout.Pointer<0>(p) = n;
+// *layout.Pointer<1>(p) = m;
+// return p;
+// }
+//
+// void Use(unsigned char* p) {
+// // First, extract N and M.
+// // Specify that the first array has only one element. Using `prefix` we
+// // can access the first two arrays but not more.
+// constexpr auto prefix = L::Partial(1);
+// size_t n = *prefix.Pointer<0>(p);
+// size_t m = *prefix.Pointer<1>(p);
+//
+// // Now we can get pointers to the payload.
+// const L layout(1, 1, n, m);
+// double* a = layout.Pointer<double>(p);
+// int* b = layout.Pointer<int>(p);
+// }
+//
+// The layout we used above combines fixed-size with dynamically-sized fields.
+// This is quite common. Layout is optimized for this use case and generates
+// optimal code. All computations that can be performed at compile time are
+// indeed performed at compile time.
+//
+// Efficiency tip: The order of fields matters. In `Layout<T1, ..., TN>` try to
+// ensure that `alignof(T1) >= ... >= alignof(TN)`. This way you'll have no
+// padding in between arrays.
+//
+// You can manually override the alignment of an array by wrapping the type in
+// `Aligned<T, N>`. `Layout<..., Aligned<T, N>, ...>` has exactly the same API
+// and behavior as `Layout<..., T, ...>` except that the first element of the
+// array of `T` is aligned to `N` (the rest of the elements follow without
+// padding). `N` cannot be less than `alignof(T)`.
+//
+// `AllocSize()` and `Pointer()` are the most basic methods for dealing with
+// memory layouts. Check out the reference or code below to discover more.
+//
+// EXAMPLE
+//
+// // Immutable move-only string with sizeof equal to sizeof(void*). The
+// // string size and the characters are kept in the same heap allocation.
+// class CompactString {
+// public:
+// CompactString(const char* s = "") {
+// const size_t size = strlen(s);
+// // size_t[1] followed by char[size + 1].
+// const L layout(1, size + 1);
+// p_.reset(new unsigned char[layout.AllocSize()]);
+// // If running under ASAN, mark the padding bytes, if any, to catch
+// // memory errors.
+// layout.PoisonPadding(p_.get());
+// // Store the size in the allocation.
+// *layout.Pointer<size_t>(p_.get()) = size;
+// // Store the characters in the allocation.
+// memcpy(layout.Pointer<char>(p_.get()), s, size + 1);
+// }
+//
+// size_t size() const {
+// // Equivalent to reinterpret_cast<size_t&>(*p).
+// return *L::Partial().Pointer<size_t>(p_.get());
+// }
+//
+// const char* c_str() const {
+// // Equivalent to reinterpret_cast<char*>(p.get() + sizeof(size_t)).
+// // The argument in Partial(1) specifies that we have size_t[1] in front
+// // of the characters.
+// return L::Partial(1).Pointer<char>(p_.get());
+// }
+//
+// private:
+// // Our heap allocation contains a size_t followed by an array of chars.
+// using L = Layout<size_t, char>;
+// std::unique_ptr<unsigned char[]> p_;
+// };
+//
+// int main() {
+// CompactString s = "hello";
+// assert(s.size() == 5);
+// assert(strcmp(s.c_str(), "hello") == 0);
+// }
+//
+// DOCUMENTATION
+//
+// The interface exported by this file consists of:
+// - class `Layout<>` and its public members.
+// - The public members of class `internal_layout::LayoutImpl<>`. That class
+// isn't intended to be used directly, and its name and template parameter
+// list are internal implementation details, but the class itself provides
+// most of the functionality in this file. See comments on its members for
+// detailed documentation.
+//
+// `Layout<T1,... Tn>::Partial(count1,..., countm)` (where `m` <= `n`) returns a
+// `LayoutImpl<>` object. `Layout<T1,..., Tn> layout(count1,..., countn)`
+// creates a `Layout` object, which exposes the same functionality by inheriting
+// from `LayoutImpl<>`.
+
+#ifndef ABSL_CONTAINER_INTERNAL_LAYOUT_H_
+#define ABSL_CONTAINER_INTERNAL_LAYOUT_H_
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <ostream>
+#include <string>
+#include <tuple>
+#include <type_traits>
+#include <typeinfo>
+#include <utility>
+
+#ifdef ADDRESS_SANITIZER
+#include <sanitizer/asan_interface.h>
+#endif
+
+// for C++20 std::span
+#include <boost/beast/core/span.hpp>
+#include <fmt/format.h>
+
+#if defined(__GXX_RTTI)
+#define ABSL_INTERNAL_HAS_CXA_DEMANGLE
+#endif
+
+#ifdef ABSL_INTERNAL_HAS_CXA_DEMANGLE
+#include <cxxabi.h>
+#endif
+
+namespace absl {
+namespace container_internal {
+
+// A type wrapper that instructs `Layout` to use the specific alignment for the
+// array. `Layout<..., Aligned<T, N>, ...>` has exactly the same API
+// and behavior as `Layout<..., T, ...>` except that the first element of the
+// array of `T` is aligned to `N` (the rest of the elements follow without
+// padding).
+//
+// Requires: `N >= alignof(T)` and `N` is a power of 2.
+template <class T, size_t N>
+struct Aligned;
+
+namespace internal_layout {
+
+template <class T>
+struct NotAligned {};
+
+template <class T, size_t N>
+struct NotAligned<const Aligned<T, N>> {
+ static_assert(sizeof(T) == 0, "Aligned<T, N> cannot be const-qualified");
+};
+
+template <size_t>
+using IntToSize = size_t;
+
+template <class>
+using TypeToSize = size_t;
+
+template <class T>
+struct Type : NotAligned<T> {
+ using type = T;
+};
+
+template <class T, size_t N>
+struct Type<Aligned<T, N>> {
+ using type = T;
+};
+
+template <class T>
+struct SizeOf : NotAligned<T>, std::integral_constant<size_t, sizeof(T)> {};
+
+template <class T, size_t N>
+struct SizeOf<Aligned<T, N>> : std::integral_constant<size_t, sizeof(T)> {};
+
+// Note: workaround for https://gcc.gnu.org/PR88115
+template <class T>
+struct AlignOf : NotAligned<T> {
+ static constexpr size_t value = alignof(T);
+};
+
+template <class T, size_t N>
+struct AlignOf<Aligned<T, N>> {
+ static_assert(N % alignof(T) == 0,
+ "Custom alignment can't be lower than the type's alignment");
+ static constexpr size_t value = N;
+};
+
+// Does `Ts...` contain `T`?
+template <class T, class... Ts>
+using Contains = std::disjunction<std::is_same<T, Ts>...>;
+
+template <class From, class To>
+using CopyConst =
+ typename std::conditional_t<std::is_const_v<From>, const To, To>;
+
+// Note: We're not qualifying this with absl:: because it doesn't compile under
+// MSVC.
+template <class T>
+using SliceType = boost::beast::span<T>;
+
+// This namespace contains no types. It prevents functions defined in it from
+// being found by ADL.
+namespace adl_barrier {
+
+template <class Needle, class... Ts>
+constexpr size_t Find(Needle, Needle, Ts...) {
+ static_assert(!Contains<Needle, Ts...>(), "Duplicate element type");
+ return 0;
+}
+
+template <class Needle, class T, class... Ts>
+constexpr size_t Find(Needle, T, Ts...) {
+ return adl_barrier::Find(Needle(), Ts()...) + 1;
+}
+
+constexpr bool IsPow2(size_t n) { return !(n & (n - 1)); }
+
+// Returns `q * m` for the smallest `q` such that `q * m >= n`.
+// Requires: `m` is a power of two. It's enforced by IsLegalElementType below.
+constexpr size_t Align(size_t n, size_t m) { return (n + m - 1) & ~(m - 1); }
+
+constexpr size_t Min(size_t a, size_t b) { return b < a ? b : a; }
+
+constexpr size_t Max(size_t a) { return a; }
+
+template <class... Ts>
+constexpr size_t Max(size_t a, size_t b, Ts... rest) {
+ return adl_barrier::Max(b < a ? a : b, rest...);
+}
+
+template <class T>
+std::string TypeName() {
+ std::string out;
+ int status = 0;
+ char* demangled = nullptr;
+#ifdef ABSL_INTERNAL_HAS_CXA_DEMANGLE
+ demangled = abi::__cxa_demangle(typeid(T).name(), nullptr, nullptr, &status);
+#endif
+ if (status == 0 && demangled != nullptr) { // Demangling succeeded.
+ out = fmt::format("<{}>", demangled);
+ free(demangled);
+ } else {
+#if defined(__GXX_RTTI) || defined(_CPPRTTI)
+ out = fmt::format("<{}>", typeid(T).name());
+#endif
+ }
+ return out;
+}
+
+} // namespace adl_barrier
+
+template <bool C>
+using EnableIf = typename std::enable_if_t<C, int>;
+
+// Can `T` be a template argument of `Layout`?
+template <class T>
+using IsLegalElementType = std::integral_constant<
+ bool, !std::is_reference_v<T> && !std::is_volatile_v<T> &&
+ !std::is_reference_v<typename Type<T>::type> &&
+ !std::is_volatile_v<typename Type<T>::type> &&
+ adl_barrier::IsPow2(AlignOf<T>::value)>;
+
+template <class Elements, class SizeSeq, class OffsetSeq>
+class LayoutImpl;
+
+// Public base class of `Layout` and the result type of `Layout::Partial()`.
+//
+// `Elements...` contains all template arguments of `Layout` that created this
+// instance.
+//
+// `SizeSeq...` is `[0, NumSizes)` where `NumSizes` is the number of arguments
+// passed to `Layout::Partial()` or `Layout::Layout()`.
+//
+// `OffsetSeq...` is `[0, NumOffsets)` where `NumOffsets` is
+// `Min(sizeof...(Elements), NumSizes + 1)` (the number of arrays for which we
+// can compute offsets).
+template <class... Elements, size_t... SizeSeq, size_t... OffsetSeq>
+class LayoutImpl<std::tuple<Elements...>, std::index_sequence<SizeSeq...>,
+ std::index_sequence<OffsetSeq...>> {
+ private:
+ static_assert(sizeof...(Elements) > 0, "At least one field is required");
+ static_assert(std::conjunction_v<IsLegalElementType<Elements>...>,
+ "Invalid element type (see IsLegalElementType)");
+
+ enum {
+ NumTypes = sizeof...(Elements),
+ NumSizes = sizeof...(SizeSeq),
+ NumOffsets = sizeof...(OffsetSeq),
+ };
+
+ // These are guaranteed by `Layout`.
+ static_assert(NumOffsets == adl_barrier::Min(NumTypes, NumSizes + 1),
+ "Internal error");
+ static_assert(NumTypes > 0, "Internal error");
+
+ // Returns the index of `T` in `Elements...`. Results in a compilation error
+ // if `Elements...` doesn't contain exactly one instance of `T`.
+ template <class T>
+ static constexpr size_t ElementIndex() {
+ static_assert(Contains<Type<T>, Type<typename Type<Elements>::type>...>(),
+ "Type not found");
+ return adl_barrier::Find(Type<T>(),
+ Type<typename Type<Elements>::type>()...);
+ }
+
+ template <size_t N>
+ using ElementAlignment =
+ AlignOf<typename std::tuple_element<N, std::tuple<Elements...>>::type>;
+
+ public:
+ // Element types of all arrays packed in a tuple.
+ using ElementTypes = std::tuple<typename Type<Elements>::type...>;
+
+ // Element type of the Nth array.
+ template <size_t N>
+ using ElementType = typename std::tuple_element<N, ElementTypes>::type;
+
+ constexpr explicit LayoutImpl(IntToSize<SizeSeq>... sizes)
+ : size_{sizes...} {}
+
+ // Alignment of the layout, equal to the strictest alignment of all elements.
+ // All pointers passed to the methods of layout must be aligned to this value.
+ static constexpr size_t Alignment() {
+ return adl_barrier::Max(AlignOf<Elements>::value...);
+ }
+
+ // Offset in bytes of the Nth array.
+ //
+ // // int[3], 4 bytes of padding, double[4].
+ // Layout<int, double> x(3, 4);
+ // assert(x.Offset<0>() == 0); // The ints starts from 0.
+ // assert(x.Offset<1>() == 16); // The doubles starts from 16.
+ //
+ // Requires: `N <= NumSizes && N < sizeof...(Ts)`.
+ template <size_t N, EnableIf<N == 0> = 0>
+ constexpr size_t Offset() const {
+ return 0;
+ }
+
+ template <size_t N, EnableIf<N != 0> = 0>
+ constexpr size_t Offset() const {
+ static_assert(N < NumOffsets, "Index out of bounds");
+ return adl_barrier::Align(
+ Offset<N - 1>() + SizeOf<ElementType<N - 1>>() * size_[N - 1],
+ ElementAlignment<N>::value);
+ }
+
+ // Offset in bytes of the array with the specified element type. There must
+ // be exactly one such array and its zero-based index must be at most
+ // `NumSizes`.
+ //
+ // // int[3], 4 bytes of padding, double[4].
+ // Layout<int, double> x(3, 4);
+ // assert(x.Offset<int>() == 0); // The ints starts from 0.
+ // assert(x.Offset<double>() == 16); // The doubles starts from 16.
+ template <class T>
+ constexpr size_t Offset() const {
+ return Offset<ElementIndex<T>()>();
+ }
+
+ // Offsets in bytes of all arrays for which the offsets are known.
+ constexpr std::array<size_t, NumOffsets> Offsets() const {
+ return {{Offset<OffsetSeq>()...}};
+ }
+
+ // The number of elements in the Nth array. This is the Nth argument of
+ // `Layout::Partial()` or `Layout::Layout()` (zero-based).
+ //
+ // // int[3], 4 bytes of padding, double[4].
+ // Layout<int, double> x(3, 4);
+ // assert(x.Size<0>() == 3);
+ // assert(x.Size<1>() == 4);
+ //
+ // Requires: `N < NumSizes`.
+ template <size_t N>
+ constexpr size_t Size() const {
+ static_assert(N < NumSizes, "Index out of bounds");
+ return size_[N];
+ }
+
+ // The number of elements in the array with the specified element type.
+ // There must be exactly one such array and its zero-based index must be
+ // at most `NumSizes`.
+ //
+ // // int[3], 4 bytes of padding, double[4].
+ // Layout<int, double> x(3, 4);
+ // assert(x.Size<int>() == 3);
+ // assert(x.Size<double>() == 4);
+ template <class T>
+ constexpr size_t Size() const {
+ return Size<ElementIndex<T>()>();
+ }
+
+ // The number of elements of all arrays for which they are known.
+ constexpr std::array<size_t, NumSizes> Sizes() const {
+ return {{Size<SizeSeq>()...}};
+ }
+
+ // Pointer to the beginning of the Nth array.
+ //
+ // `Char` must be `[const] [signed|unsigned] char`.
+ //
+ // // int[3], 4 bytes of padding, double[4].
+ // Layout<int, double> x(3, 4);
+ // unsigned char* p = new unsigned char[x.AllocSize()];
+ // int* ints = x.Pointer<0>(p);
+ // double* doubles = x.Pointer<1>(p);
+ //
+ // Requires: `N <= NumSizes && N < sizeof...(Ts)`.
+ // Requires: `p` is aligned to `Alignment()`.
+ template <size_t N, class Char>
+ CopyConst<Char, ElementType<N>>* Pointer(Char* p) const {
+ using C = typename std::remove_const<Char>::type;
+ static_assert(
+ std::is_same<C, char>() || std::is_same<C, unsigned char>() ||
+ std::is_same<C, signed char>(),
+ "The argument must be a pointer to [const] [signed|unsigned] char");
+ constexpr size_t alignment = Alignment();
+ (void)alignment;
+ assert(reinterpret_cast<uintptr_t>(p) % alignment == 0);
+ return reinterpret_cast<CopyConst<Char, ElementType<N>>*>(p + Offset<N>());
+ }
+
+ // Pointer to the beginning of the array with the specified element type.
+ // There must be exactly one such array and its zero-based index must be at
+ // most `NumSizes`.
+ //
+ // `Char` must be `[const] [signed|unsigned] char`.
+ //
+ // // int[3], 4 bytes of padding, double[4].
+ // Layout<int, double> x(3, 4);
+ // unsigned char* p = new unsigned char[x.AllocSize()];
+ // int* ints = x.Pointer<int>(p);
+ // double* doubles = x.Pointer<double>(p);
+ //
+ // Requires: `p` is aligned to `Alignment()`.
+ template <class T, class Char>
+ CopyConst<Char, T>* Pointer(Char* p) const {
+ return Pointer<ElementIndex<T>()>(p);
+ }
+
+ // Pointers to all arrays for which pointers are known.
+ //
+ // `Char` must be `[const] [signed|unsigned] char`.
+ //
+ // // int[3], 4 bytes of padding, double[4].
+ // Layout<int, double> x(3, 4);
+ // unsigned char* p = new unsigned char[x.AllocSize()];
+ //
+ // int* ints;
+ // double* doubles;
+ // std::tie(ints, doubles) = x.Pointers(p);
+ //
+ // Requires: `p` is aligned to `Alignment()`.
+ //
+ // Note: We're not using ElementType alias here because it does not compile
+ // under MSVC.
+ template <class Char>
+ std::tuple<CopyConst<
+ Char, typename std::tuple_element<OffsetSeq, ElementTypes>::type>*...>
+ Pointers(Char* p) const {
+ return std::tuple<CopyConst<Char, ElementType<OffsetSeq>>*...>(
+ Pointer<OffsetSeq>(p)...);
+ }
+
+ // The Nth array.
+ //
+ // `Char` must be `[const] [signed|unsigned] char`.
+ //
+ // // int[3], 4 bytes of padding, double[4].
+ // Layout<int, double> x(3, 4);
+ // unsigned char* p = new unsigned char[x.AllocSize()];
+ // Span<int> ints = x.Slice<0>(p);
+ // Span<double> doubles = x.Slice<1>(p);
+ //
+ // Requires: `N < NumSizes`.
+ // Requires: `p` is aligned to `Alignment()`.
+ template <size_t N, class Char>
+ SliceType<CopyConst<Char, ElementType<N>>> Slice(Char* p) const {
+ return SliceType<CopyConst<Char, ElementType<N>>>(Pointer<N>(p), Size<N>());
+ }
+
+ // The array with the specified element type. There must be exactly one
+ // such array and its zero-based index must be less than `NumSizes`.
+ //
+ // `Char` must be `[const] [signed|unsigned] char`.
+ //
+ // // int[3], 4 bytes of padding, double[4].
+ // Layout<int, double> x(3, 4);
+ // unsigned char* p = new unsigned char[x.AllocSize()];
+ // Span<int> ints = x.Slice<int>(p);
+ // Span<double> doubles = x.Slice<double>(p);
+ //
+ // Requires: `p` is aligned to `Alignment()`.
+ template <class T, class Char>
+ SliceType<CopyConst<Char, T>> Slice(Char* p) const {
+ return Slice<ElementIndex<T>()>(p);
+ }
+
+ // All arrays with known sizes.
+ //
+ // `Char` must be `[const] [signed|unsigned] char`.
+ //
+ // // int[3], 4 bytes of padding, double[4].
+ // Layout<int, double> x(3, 4);
+ // unsigned char* p = new unsigned char[x.AllocSize()];
+ //
+ // Span<int> ints;
+ // Span<double> doubles;
+ // std::tie(ints, doubles) = x.Slices(p);
+ //
+ // Requires: `p` is aligned to `Alignment()`.
+ //
+ // Note: We're not using ElementType alias here because it does not compile
+ // under MSVC.
+ template <class Char>
+ std::tuple<SliceType<CopyConst<
+ Char, typename std::tuple_element<SizeSeq, ElementTypes>::type>>...>
+ Slices(Char* p) const {
+ // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63875 (fixed
+ // in 6.1).
+ (void)p;
+ return std::tuple<SliceType<CopyConst<Char, ElementType<SizeSeq>>>...>(
+ Slice<SizeSeq>(p)...);
+ }
+
+ // The size of the allocation that fits all arrays.
+ //
+ // // int[3], 4 bytes of padding, double[4].
+ // Layout<int, double> x(3, 4);
+ // unsigned char* p = new unsigned char[x.AllocSize()]; // 48 bytes
+ //
+ // Requires: `NumSizes == sizeof...(Ts)`.
+ constexpr size_t AllocSize() const {
+ static_assert(NumTypes == NumSizes, "You must specify sizes of all fields");
+ return Offset<NumTypes - 1>() +
+ SizeOf<ElementType<NumTypes - 1>>() * size_[NumTypes - 1];
+ }
+
+ // If built with --config=asan, poisons padding bytes (if any) in the
+ // allocation. The pointer must point to a memory block at least
+ // `AllocSize()` bytes in length.
+ //
+ // `Char` must be `[const] [signed|unsigned] char`.
+ //
+ // Requires: `p` is aligned to `Alignment()`.
+ template <class Char, size_t N = NumOffsets - 1, EnableIf<N == 0> = 0>
+ void PoisonPadding(const Char* p) const {
+ Pointer<0>(p); // verify the requirements on `Char` and `p`
+ }
+
+ template <class Char, size_t N = NumOffsets - 1, EnableIf<N != 0> = 0>
+ void PoisonPadding(const Char* p) const {
+ static_assert(N < NumOffsets, "Index out of bounds");
+ (void)p;
+#ifdef ADDRESS_SANITIZER
+ PoisonPadding<Char, N - 1>(p);
+ // The `if` is an optimization. It doesn't affect the observable behaviour.
+ if (ElementAlignment<N - 1>::value % ElementAlignment<N>::value) {
+ size_t start =
+ Offset<N - 1>() + SizeOf<ElementType<N - 1>>() * size_[N - 1];
+ ASAN_POISON_MEMORY_REGION(p + start, Offset<N>() - start);
+ }
+#endif
+ }
+
+ // Human-readable description of the memory layout. Useful for debugging.
+ // Slow.
+ //
+ // // char[5], 3 bytes of padding, int[3], 4 bytes of padding, followed
+ // // by an unknown number of doubles.
+ // auto x = Layout<char, int, double>::Partial(5, 3);
+ // assert(x.DebugString() ==
+ // "@0<char>(1)[5]; @8<int>(4)[3]; @24<double>(8)");
+ //
+ // Each field is in the following format: @offset<type>(sizeof)[size] (<type>
+ // may be missing depending on the target platform). For example,
+ // @8<int>(4)[3] means that at offset 8 we have an array of ints, where each
+ // int is 4 bytes, and we have 3 of those ints. The size of the last field may
+ // be missing (as in the example above). Only fields with known offsets are
+ // described. Type names may differ across platforms: one compiler might
+ // produce "unsigned*" where another produces "unsigned int *".
+ std::string DebugString() const {
+ const auto offsets = Offsets();
+ const size_t sizes[] = {SizeOf<ElementType<OffsetSeq>>()...};
+ const std::string types[] = {
+ adl_barrier::TypeName<ElementType<OffsetSeq>>()...};
+ std::string res = fmt::format("@0{}({})", types[0], sizes[0]);
+ for (size_t i = 0; i != NumOffsets - 1; ++i) {
+      res += fmt::format("[{}]; @{}{}({})", size_[i], offsets[i + 1], types[i + 1], sizes[i + 1]);
+ }
+ // NumSizes is a constant that may be zero. Some compilers cannot see that
+ // inside the if statement "size_[NumSizes - 1]" must be valid.
+ int last = static_cast<int>(NumSizes) - 1;
+ if (NumTypes == NumSizes && last >= 0) {
+ res += fmt::format("[{}]", size_[last]);
+ }
+ return res;
+ }
+
+ private:
+ // Arguments of `Layout::Partial()` or `Layout::Layout()`.
+ size_t size_[NumSizes > 0 ? NumSizes : 1];
+};
+
+template <size_t NumSizes, class... Ts>
+using LayoutType = LayoutImpl<
+ std::tuple<Ts...>, std::make_index_sequence<NumSizes>,
+ std::make_index_sequence<adl_barrier::Min(sizeof...(Ts), NumSizes + 1)>>;
+
+} // namespace internal_layout
+
+// Descriptor of arrays of various types and sizes laid out in memory one after
+// another. See the top of the file for documentation.
+//
+// Check out the public API of internal_layout::LayoutImpl above. The type is
+// internal to the library but its methods are public, and they are inherited
+// by `Layout`.
+template <class... Ts>
+class Layout : public internal_layout::LayoutType<sizeof...(Ts), Ts...> {
+ public:
+ static_assert(sizeof...(Ts) > 0, "At least one field is required");
+ static_assert(
+ std::conjunction_v<internal_layout::IsLegalElementType<Ts>...>,
+ "Invalid element type (see IsLegalElementType)");
+
+ // The result type of `Partial()` with `NumSizes` arguments.
+ template <size_t NumSizes>
+ using PartialType = internal_layout::LayoutType<NumSizes, Ts...>;
+
+ // `Layout` knows the element types of the arrays we want to lay out in
+ // memory but not the number of elements in each array.
+ // `Partial(size1, ..., sizeN)` allows us to specify the latter. The
+ // resulting immutable object can be used to obtain pointers to the
+ // individual arrays.
+ //
+ // It's allowed to pass fewer array sizes than the number of arrays. E.g.,
+  // if all you need is the offset of the second array, you only need to
+ // pass one argument -- the number of elements in the first array.
+ //
+ // // int[3] followed by 4 bytes of padding and an unknown number of
+ // // doubles.
+ // auto x = Layout<int, double>::Partial(3);
+ // // doubles start at byte 16.
+ // assert(x.Offset<1>() == 16);
+ //
+ // If you know the number of elements in all arrays, you can still call
+ // `Partial()` but it's more convenient to use the constructor of `Layout`.
+ //
+ // Layout<int, double> x(3, 5);
+ //
+ // Note: The sizes of the arrays must be specified in number of elements,
+ // not in bytes.
+ //
+ // Requires: `sizeof...(Sizes) <= sizeof...(Ts)`.
+ // Requires: all arguments are convertible to `size_t`.
+ template <class... Sizes>
+ static constexpr PartialType<sizeof...(Sizes)> Partial(Sizes&&... sizes) {
+ static_assert(sizeof...(Sizes) <= sizeof...(Ts));
+ return PartialType<sizeof...(Sizes)>(std::forward<Sizes>(sizes)...);
+ }
+
+ // Creates a layout with the sizes of all arrays specified. If you know
+ // only the sizes of the first N arrays (where N can be zero), you can use
+ // `Partial()` defined above. The constructor is essentially equivalent to
+ // calling `Partial()` and passing in all array sizes; the constructor is
+ // provided as a convenient abbreviation.
+ //
+ // Note: The sizes of the arrays must be specified in number of elements,
+ // not in bytes.
+ constexpr explicit Layout(internal_layout::TypeToSize<Ts>... sizes)
+ : internal_layout::LayoutType<sizeof...(Ts), Ts...>(sizes...) {}
+};
+
+} // namespace container_internal
+} // namespace absl
+
+#endif // ABSL_CONTAINER_INTERNAL_LAYOUT_H_
diff --git a/src/crimson/common/log.cc b/src/crimson/common/log.cc
new file mode 100644
index 000000000..cae9f6a7b
--- /dev/null
+++ b/src/crimson/common/log.cc
@@ -0,0 +1,21 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "log.h"
+
+static std::array<seastar::logger, ceph_subsys_get_num()> loggers{
+#define SUBSYS(name, log_level, gather_level) \
+ seastar::logger(#name),
+#define DEFAULT_SUBSYS(log_level, gather_level) \
+ seastar::logger("none"),
+ #include "common/subsys.h"
+#undef SUBSYS
+#undef DEFAULT_SUBSYS
+};
+
+namespace crimson {
+seastar::logger& get_logger(int subsys) {
+ assert(subsys < ceph_subsys_max);
+ return loggers[subsys];
+}
+}
diff --git a/src/crimson/common/log.h b/src/crimson/common/log.h
new file mode 100644
index 000000000..635349098
--- /dev/null
+++ b/src/crimson/common/log.h
@@ -0,0 +1,24 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/util/log.hh>
+#include "common/subsys_types.h"
+
+namespace crimson {
+seastar::logger& get_logger(int subsys);
+static inline seastar::log_level to_log_level(int level) {
+ if (level < 0) {
+ return seastar::log_level::error;
+ } else if (level < 1) {
+ return seastar::log_level::warn;
+ } else if (level < 5) {
+ return seastar::log_level::info;
+ } else if (level <= 20) {
+ return seastar::log_level::debug;
+ } else {
+ return seastar::log_level::trace;
+ }
+}
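+
+// Illustrative sketch: pick a subsystem logger and map a ceph debug level to
+// a seastar one (level 10 falls into the `debug` bucket above):
+//
+//   auto& logger = crimson::get_logger(ceph_subsys_osd);
+//   logger.log(crimson::to_log_level(10), "osd booted");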
+}
diff --git a/src/crimson/common/perf_counters_collection.cc b/src/crimson/common/perf_counters_collection.cc
new file mode 100644
index 000000000..af80dbcc2
--- /dev/null
+++ b/src/crimson/common/perf_counters_collection.cc
@@ -0,0 +1,25 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "perf_counters_collection.h"
+
+namespace crimson::common {
+PerfCountersCollection::PerfCountersCollection()
+{
+ perf_collection = std::make_unique<PerfCountersCollectionImpl>();
+}
+PerfCountersCollection::~PerfCountersCollection()
+{
+ perf_collection->clear();
+}
+
+PerfCountersCollectionImpl* PerfCountersCollection::get_perf_collection()
+{
+ return perf_collection.get();
+}
+
+PerfCountersCollection::ShardedPerfCountersCollection PerfCountersCollection::sharded_perf_coll;
+
+}
+
+
diff --git a/src/crimson/common/perf_counters_collection.h b/src/crimson/common/perf_counters_collection.h
new file mode 100644
index 000000000..a19630247
--- /dev/null
+++ b/src/crimson/common/perf_counters_collection.h
@@ -0,0 +1,37 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "common/perf_counters.h"
+#include <seastar/core/sharded.hh>
+
+using crimson::common::PerfCountersCollectionImpl;
+namespace crimson::common {
+class PerfCountersCollection: public seastar::sharded<PerfCountersCollection>
+{
+ using ShardedPerfCountersCollection = seastar::sharded<PerfCountersCollection>;
+
+private:
+ std::unique_ptr<PerfCountersCollectionImpl> perf_collection;
+ static ShardedPerfCountersCollection sharded_perf_coll;
+ friend PerfCountersCollection& local_perf_coll();
+ friend ShardedPerfCountersCollection& sharded_perf_coll();
+
+public:
+ PerfCountersCollection();
+ ~PerfCountersCollection();
+ PerfCountersCollectionImpl* get_perf_collection();
+
+};
+
+inline PerfCountersCollection::ShardedPerfCountersCollection& sharded_perf_coll() {
+ return PerfCountersCollection::sharded_perf_coll;
+}
+
+inline PerfCountersCollection& local_perf_coll() {
+ return PerfCountersCollection::sharded_perf_coll.local();
+}
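+
+// Illustrative sketch (assumes PerfCountersCollectionImpl::add() from
+// common/perf_counters.h): registering a counter set on the local shard:
+//
+//   PerfCounters* counters = ...;   // built with PerfCountersBuilder
+//   local_perf_coll().get_perf_collection()->add(counters);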
+
+}
+
diff --git a/src/crimson/common/shared_lru.h b/src/crimson/common/shared_lru.h
new file mode 100644
index 000000000..4c1da401e
--- /dev/null
+++ b/src/crimson/common/shared_lru.h
@@ -0,0 +1,178 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <memory>
+#include <optional>
+#include <boost/smart_ptr/local_shared_ptr.hpp>
+#include <boost/smart_ptr/weak_ptr.hpp>
+#include "simple_lru.h"
+
+/// SharedLRU does its best to cache objects. It not only tracks the objects
+/// in its LRU cache with strong references, it also tracks objects with
+/// weak_ptr even after the cache has dropped its strong references to them,
+/// so that it can return the objects after they are evicted, as long as
+/// they've ever been cached and have not been destroyed yet.
+template<class K, class V>
+class SharedLRU {
+ using shared_ptr_t = boost::local_shared_ptr<V>;
+ using weak_ptr_t = boost::weak_ptr<V>;
+ using value_type = std::pair<K, shared_ptr_t>;
+
+ // weak_refs is already ordered, and we don't use accessors like
+ // LRUCache::lower_bound(), so unordered LRUCache would suffice.
+ SimpleLRU<K, shared_ptr_t, false> cache;
+ std::map<K, std::pair<weak_ptr_t, V*>> weak_refs;
+
+ struct Deleter {
+ SharedLRU<K,V>* cache;
+ const K key;
+ void operator()(V* ptr) {
+ cache->_erase_weak(key);
+ delete ptr;
+ }
+ };
+ void _erase_weak(const K& key) {
+ weak_refs.erase(key);
+ }
+public:
+ SharedLRU(size_t max_size = 20)
+ : cache{max_size}
+ {}
+ ~SharedLRU() {
+ cache.clear();
+    // use plain assert() in utility classes to avoid dependencies on logging
+ assert(weak_refs.empty());
+ }
+  /**
+   * Returns a shared pointer to the value for the given key, inserting a
+   * default-constructed value if no such key exists yet
+   */
+ shared_ptr_t operator[](const K& key);
+ /**
+ * Returns true iff there are no live references left to anything that has been
+ * in the cache.
+ */
+ bool empty() const {
+ return weak_refs.empty();
+ }
+ size_t size() const {
+ return cache.size();
+ }
+ size_t capacity() const {
+ return cache.capacity();
+ }
+  /**
+   * Inserts a key if not present, or bumps it to the front of the LRU if
+   * it is, and returns a shared pointer to the cached value. If a live
+   * value for the key already exists, that cached value is returned and the
+   * newly passed-in value is destroyed along with its unique_ptr.
+   *
+   * @param key The key to insert
+   * @param value The value that goes with the key
+   * @return A shared pointer to the cached value for the given key
+   */
+ shared_ptr_t insert(const K& key, std::unique_ptr<V> value);
+  // clear all strong references from the lru.
+ void clear() {
+ cache.clear();
+ }
+ shared_ptr_t find(const K& key);
+ // return the last element that is not greater than key
+ shared_ptr_t lower_bound(const K& key);
+ // return the first element that is greater than key
+ std::optional<value_type> upper_bound(const K& key);
+
+ void erase(const K& key) {
+ cache.erase(key);
+ _erase_weak(key);
+ }
+};
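+
+// Illustrative usage sketch: the cache keeps at most `max_size` strong
+// references, but a value stays reachable through find() for as long as any
+// caller still holds a pointer to it, even after it has been evicted:
+//
+//   SharedLRU<int, std::string> cache{2};
+//   auto a = cache.insert(1, std::make_unique<std::string>("one"));
+//   cache.insert(2, std::make_unique<std::string>("two"));
+//   cache.insert(3, std::make_unique<std::string>("three"));  // evicts key 1
+//   auto b = cache.find(1);  // still non-null: `a` keeps the object alive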
+
+template<class K, class V>
+typename SharedLRU<K,V>::shared_ptr_t
+SharedLRU<K,V>::insert(const K& key, std::unique_ptr<V> value)
+{
+ shared_ptr_t val;
+ if (auto found = weak_refs.find(key); found != weak_refs.end()) {
+ val = found->second.first.lock();
+ }
+ if (!val) {
+ val.reset(value.release(), Deleter{this, key});
+ weak_refs.emplace(key, std::make_pair(val, val.get()));
+ }
+ cache.insert(key, val);
+ return val;
+}
+
+template<class K, class V>
+typename SharedLRU<K,V>::shared_ptr_t
+SharedLRU<K,V>::operator[](const K& key)
+{
+ if (auto found = cache.find(key); found) {
+ return *found;
+ }
+ shared_ptr_t val;
+ if (auto found = weak_refs.find(key); found != weak_refs.end()) {
+ val = found->second.first.lock();
+ }
+ if (!val) {
+ val.reset(new V{}, Deleter{this, key});
+ weak_refs.emplace(key, std::make_pair(val, val.get()));
+ }
+ cache.insert(key, val);
+ return val;
+}
+
+template<class K, class V>
+typename SharedLRU<K,V>::shared_ptr_t
+SharedLRU<K,V>::find(const K& key)
+{
+ if (auto found = cache.find(key); found) {
+ return *found;
+ }
+ shared_ptr_t val;
+ if (auto found = weak_refs.find(key); found != weak_refs.end()) {
+ val = found->second.first.lock();
+ }
+ if (val) {
+ cache.insert(key, val);
+ }
+ return val;
+}
+
+template<class K, class V>
+typename SharedLRU<K,V>::shared_ptr_t
+SharedLRU<K,V>::lower_bound(const K& key)
+{
+ if (weak_refs.empty()) {
+ return {};
+ }
+ auto found = weak_refs.lower_bound(key);
+ if (found == weak_refs.end()) {
+ --found;
+ }
+ if (auto val = found->second.first.lock(); val) {
+ cache.insert(key, val);
+ return val;
+ } else {
+ return {};
+ }
+}
+
+template<class K, class V>
+std::optional<typename SharedLRU<K,V>::value_type>
+SharedLRU<K,V>::upper_bound(const K& key)
+{
+ for (auto found = weak_refs.upper_bound(key);
+ found != weak_refs.end();
+ ++found) {
+ if (auto val = found->second.first.lock(); val) {
+ return std::make_pair(found->first, val);
+ }
+ }
+ return std::nullopt;
+}
diff --git a/src/crimson/common/simple_lru.h b/src/crimson/common/simple_lru.h
new file mode 100644
index 000000000..1419c4885
--- /dev/null
+++ b/src/crimson/common/simple_lru.h
@@ -0,0 +1,141 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <list>
+#include <map>
+#include <optional>
+#include <type_traits>
+#include <unordered_map>
+
+template <class Key, class Value, bool Ordered>
+class SimpleLRU {
+ static_assert(std::is_default_constructible_v<Value>);
+ using list_type = std::list<Key>;
+ template<class K, class V>
+ using map_t = std::conditional_t<Ordered,
+ std::map<K, V>,
+ std::unordered_map<K, V>>;
+ using map_type = map_t<Key, std::pair<Value, typename list_type::iterator>>;
+ list_type lru;
+ map_type cache;
+ const size_t max_size;
+
+public:
+ SimpleLRU(size_t size = 20)
+ : cache(size),
+ max_size(size)
+ {}
+ size_t size() const {
+ return cache.size();
+ }
+ size_t capacity() const {
+ return max_size;
+ }
+ using insert_return_type = std::pair<Value, bool>;
+ insert_return_type insert(const Key& key, Value value);
+ std::optional<Value> find(const Key& key);
+  // lower_bound() is only meaningful when the cache is Ordered
+  std::optional<Value> lower_bound(const Key& key);
+ void erase(const Key& key);
+ void clear();
+private:
+ // bump the item to the front of the lru list
+ Value _lru_add(typename map_type::iterator found);
+  // evict the least recently used element (the tail of the lru list)
+ void _evict();
+};
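+
+// Illustrative usage sketch: a bounded map with LRU eviction, where insert()
+// reports whether the key was already present and find() bumps the entry:
+//
+//   SimpleLRU<std::string, int, true> lru{2};
+//   lru.insert("a", 1);   // -> {1, false}
+//   lru.insert("a", 42);  // -> {1, true}: the existing value wins
+//   lru.insert("b", 2);
+//   lru.insert("c", 3);   // evicts "a", the least recently used entry
+//   assert(!lru.find("a"));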
+
+template <class Key, class Value, bool Ordered>
+typename SimpleLRU<Key,Value,Ordered>::insert_return_type
+SimpleLRU<Key,Value,Ordered>::insert(const Key& key, Value value)
+{
+ if constexpr(Ordered) {
+ auto found = cache.lower_bound(key);
+ if (found != cache.end() && found->first == key) {
+ // already exists
+ return {found->second.first, true};
+ } else {
+ if (size() >= capacity()) {
+ _evict();
+ }
+ lru.push_front(key);
+ // use lower_bound as hint to save the lookup
+ cache.emplace_hint(found, key, std::make_pair(value, lru.begin()));
+ return {std::move(value), false};
+ }
+ } else {
+ // cache is not ordered
+ auto found = cache.find(key);
+ if (found != cache.end()) {
+ // already exists
+ return {found->second.first, true};
+ } else {
+ if (size() >= capacity()) {
+ _evict();
+ }
+ lru.push_front(key);
+ cache.emplace(key, std::make_pair(value, lru.begin()));
+ return {std::move(value), false};
+ }
+ }
+}
+
+template <class Key, class Value, bool Ordered>
+std::optional<Value> SimpleLRU<Key,Value,Ordered>::find(const Key& key)
+{
+ if (auto found = cache.find(key); found != cache.end()){
+ return _lru_add(found);
+ } else {
+ return {};
+ }
+}
+
+template <class Key, class Value, bool Ordered>
+std::optional<Value>
+SimpleLRU<Key,Value,Ordered>::lower_bound(const Key& key)
+{
+ if (auto found = cache.lower_bound(key); found != cache.end()) {
+ return _lru_add(found);
+ } else {
+ return {};
+ }
+}
+
+template <class Key, class Value, bool Ordered>
+void SimpleLRU<Key,Value,Ordered>::clear()
+{
+ lru.clear();
+ cache.clear();
+}
+
+template <class Key, class Value, bool Ordered>
+void SimpleLRU<Key,Value,Ordered>::erase(const Key& key)
+{
+ if (auto found = cache.find(key); found != cache.end()) {
+ lru.erase(found->second.second);
+ cache.erase(found);
+ }
+}
+
+template <class Key, class Value, bool Ordered>
+Value SimpleLRU<Key,Value,Ordered>::_lru_add(
+ typename SimpleLRU<Key,Value,Ordered>::map_type::iterator found)
+{
+ auto& [value, in_lru] = found->second;
+ if (in_lru != lru.begin()){
+ // move item to the front
+ lru.splice(lru.begin(), lru, in_lru);
+ }
+ // the item is already at the front
+ return value;
+}
+
+template <class Key, class Value, bool Ordered>
+void SimpleLRU<Key,Value,Ordered>::_evict()
+{
+  // evict the least recently used element (the tail of the lru list)
+ auto last = --lru.end();
+ cache.erase(*last);
+ lru.erase(last);
+}
diff --git a/src/crimson/common/throttle.cc b/src/crimson/common/throttle.cc
new file mode 100644
index 000000000..bd9195181
--- /dev/null
+++ b/src/crimson/common/throttle.cc
@@ -0,0 +1,59 @@
+#include "throttle.h"
+
+namespace crimson::common {
+
+int64_t Throttle::take(int64_t c)
+{
+ if (!max) {
+ return 0;
+ }
+ count += c;
+ return count;
+}
+
+int64_t Throttle::put(int64_t c)
+{
+ if (!max) {
+ return 0;
+ }
+ if (!c) {
+ return count;
+ }
+ on_free_slots.signal();
+ count -= c;
+ return count;
+}
+
+seastar::future<> Throttle::get(size_t c)
+{
+ if (!max) {
+ return seastar::make_ready_future<>();
+ }
+ return on_free_slots.wait([this, c] {
+ return !_should_wait(c);
+ }).then([this, c] {
+ count += c;
+ return seastar::make_ready_future<>();
+ });
+}
+
+void Throttle::reset_max(size_t m) {
+ if (max == m) {
+ return;
+ }
+
+ if (m > max) {
+ on_free_slots.signal();
+ }
+ max = m;
+}
+
+bool Throttle::_should_wait(size_t c) const {
+ if (!max) {
+ return false;
+ }
+ return ((c <= max && count + c > max) || // normally stay under max
+ (c >= max && count > max)); // except for large c
+}
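+
+// Illustrative example of the condition above, with max = 10:
+//   count = 8,  c = 3  -> 8 + 3 > 10: wait (the normal case, stay under max)
+//   count = 0,  c = 25 -> c >= max but count <= max: a single oversized
+//                         request is admitted rather than blocked forever
+//   count = 12, c = 25 -> c >= max and count > max: wait until puts drain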
+
+} // namespace crimson::common
diff --git a/src/crimson/common/throttle.h b/src/crimson/common/throttle.h
new file mode 100644
index 000000000..fea471c8d
--- /dev/null
+++ b/src/crimson/common/throttle.h
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/condition-variable.hh>
+// pull in the seastar::timer<...>::timer definitions. FIXME(seastar):
+// otherwise reactor.hh is obligatory and would have to be included everywhere.
+#include <seastar/core/reactor.hh>
+
+#include "common/ThrottleInterface.h"
+
+namespace crimson::common {
+
+class Throttle final : public ThrottleInterface {
+ size_t max = 0;
+ size_t count = 0;
+ // we cannot change the "count" of seastar::semaphore after it is created,
+ // so use condition_variable instead.
+ seastar::condition_variable on_free_slots;
+public:
+ explicit Throttle(size_t m)
+ : max(m)
+ {}
+ int64_t take(int64_t c = 1) override;
+ int64_t put(int64_t c = 1) override;
+ seastar::future<> get(size_t c);
+ size_t get_current() const {
+ return count;
+ }
+ size_t get_max() const {
+ return max;
+ }
+ void reset_max(size_t m);
+private:
+ bool _should_wait(size_t c) const;
+};
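+
+// Illustrative usage sketch (the caller and do_io() are hypothetical): acquire
+// slots with get() before doing throttled work, release them with put() after:
+//
+//   crimson::common::Throttle throttle{64};
+//   return throttle.get(cost).then([&throttle, cost] {
+//     return do_io().finally([&throttle, cost] { throttle.put(cost); });
+//   });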
+
+} // namespace crimson::common
diff --git a/src/crimson/common/tri_mutex.cc b/src/crimson/common/tri_mutex.cc
new file mode 100644
index 000000000..c18aff1a0
--- /dev/null
+++ b/src/crimson/common/tri_mutex.cc
@@ -0,0 +1,225 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tri_mutex.h"
+
+seastar::future<> read_lock::lock()
+{
+ return static_cast<tri_mutex*>(this)->lock_for_read();
+}
+
+void read_lock::unlock()
+{
+ static_cast<tri_mutex*>(this)->unlock_for_read();
+}
+
+seastar::future<> write_lock::lock()
+{
+ return static_cast<tri_mutex*>(this)->lock_for_write(false);
+}
+
+void write_lock::unlock()
+{
+ static_cast<tri_mutex*>(this)->unlock_for_write();
+}
+
+seastar::future<> excl_lock::lock()
+{
+ return static_cast<tri_mutex*>(this)->lock_for_excl();
+}
+
+void excl_lock::unlock()
+{
+ static_cast<tri_mutex*>(this)->unlock_for_excl();
+}
+
+seastar::future<> excl_lock_from_read::lock()
+{
+ static_cast<tri_mutex*>(this)->promote_from_read();
+ return seastar::make_ready_future<>();
+}
+
+void excl_lock_from_read::unlock()
+{
+ static_cast<tri_mutex*>(this)->demote_to_read();
+}
+
+seastar::future<> excl_lock_from_write::lock()
+{
+ static_cast<tri_mutex*>(this)->promote_from_write();
+ return seastar::make_ready_future<>();
+}
+
+void excl_lock_from_write::unlock()
+{
+ static_cast<tri_mutex*>(this)->demote_to_write();
+}
+
+seastar::future<> excl_lock_from_excl::lock()
+{
+ return seastar::make_ready_future<>();
+}
+
+void excl_lock_from_excl::unlock()
+{
+}
+
+tri_mutex::~tri_mutex()
+{
+ assert(!is_acquired());
+}
+
+seastar::future<> tri_mutex::lock_for_read()
+{
+ if (try_lock_for_read()) {
+ return seastar::make_ready_future<>();
+ }
+ waiters.emplace_back(seastar::promise<>(), type_t::read);
+ return waiters.back().pr.get_future();
+}
+
+bool tri_mutex::try_lock_for_read() noexcept
+{
+ if (!writers && !exclusively_used && waiters.empty()) {
+ ++readers;
+ return true;
+ } else {
+ return false;
+ }
+}
+
+void tri_mutex::unlock_for_read()
+{
+ assert(readers > 0);
+ if (--readers == 0) {
+ wake();
+ }
+}
+
+void tri_mutex::promote_from_read()
+{
+ assert(readers == 1);
+ --readers;
+ exclusively_used = true;
+}
+
+void tri_mutex::demote_to_read()
+{
+ assert(exclusively_used);
+ exclusively_used = false;
+ ++readers;
+}
+
+seastar::future<> tri_mutex::lock_for_write(bool greedy)
+{
+ if (try_lock_for_write(greedy)) {
+ return seastar::make_ready_future<>();
+ }
+ waiters.emplace_back(seastar::promise<>(), type_t::write);
+ return waiters.back().pr.get_future();
+}
+
+bool tri_mutex::try_lock_for_write(bool greedy) noexcept
+{
+ if (!readers && !exclusively_used) {
+ if (greedy || waiters.empty()) {
+ ++writers;
+ return true;
+ }
+ }
+ return false;
+}
+
+void tri_mutex::unlock_for_write()
+{
+ assert(writers > 0);
+ if (--writers == 0) {
+ wake();
+ }
+}
+
+void tri_mutex::promote_from_write()
+{
+ assert(writers == 1);
+ --writers;
+ exclusively_used = true;
+}
+
+void tri_mutex::demote_to_write()
+{
+ assert(exclusively_used);
+ exclusively_used = false;
+ ++writers;
+}
+
+// for exclusive users
+seastar::future<> tri_mutex::lock_for_excl()
+{
+ if (try_lock_for_excl()) {
+ return seastar::make_ready_future<>();
+ }
+ waiters.emplace_back(seastar::promise<>(), type_t::exclusive);
+ return waiters.back().pr.get_future();
+}
+
+bool tri_mutex::try_lock_for_excl() noexcept
+{
+ if (!readers && !writers && !exclusively_used) {
+ exclusively_used = true;
+ return true;
+ } else {
+ return false;
+ }
+}
+
+void tri_mutex::unlock_for_excl()
+{
+ assert(exclusively_used);
+ exclusively_used = false;
+ wake();
+}
+
+bool tri_mutex::is_acquired() const
+{
+ if (readers) {
+ return true;
+ } else if (writers) {
+ return true;
+ } else if (exclusively_used) {
+ return true;
+ } else {
+ return false;
+ }
+}
+
+void tri_mutex::wake()
+{
+ assert(!readers && !writers && !exclusively_used);
+ type_t type = type_t::none;
+ while (!waiters.empty()) {
+ auto& waiter = waiters.front();
+ if (type == type_t::exclusive) {
+ break;
+    } else if (type == type_t::none) {
+ type = waiter.type;
+ } else if (type != waiter.type) {
+ // to be woken in the next batch
+ break;
+ }
+ switch (type) {
+ case type_t::read:
+ ++readers;
+ break;
+ case type_t::write:
+ ++writers;
+ break;
+ case type_t::exclusive:
+ exclusively_used = true;
+ break;
+ default:
+ assert(0);
+ }
+ waiter.pr.set_value();
+ waiters.pop_front();
+ }
+}
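+
+// Illustrative example of the batching above: with waiters [read, read, write,
+// read], one wake() admits the two leading readers; the write and the trailing
+// read stay queued until the next wake(). An exclusive waiter is always woken
+// alone.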
diff --git a/src/crimson/common/tri_mutex.h b/src/crimson/common/tri_mutex.h
new file mode 100644
index 000000000..127573b3a
--- /dev/null
+++ b/src/crimson/common/tri_mutex.h
@@ -0,0 +1,156 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/core/circular_buffer.hh>
+
+class read_lock {
+public:
+ seastar::future<> lock();
+ void unlock();
+};
+
+class write_lock {
+public:
+ seastar::future<> lock();
+ void unlock();
+};
+
+class excl_lock {
+public:
+ seastar::future<> lock();
+ void unlock();
+};
+
+// promote from read to excl
+class excl_lock_from_read {
+public:
+ seastar::future<> lock();
+ void unlock();
+};
+
+// promote from write to excl
+class excl_lock_from_write {
+public:
+ seastar::future<> lock();
+ void unlock();
+};
+
+// promote from excl to excl
+class excl_lock_from_excl {
+public:
+ seastar::future<> lock();
+ void unlock();
+};
+
+/// shared/exclusive mutual exclusion
+///
+/// the way this lock design uses "reader" and "writer" is entirely independent
+/// of the conventional reader/writer lock semantics. Here, what we mean is
+/// that reads can be pipelined with each other, and writes can be pipelined
+/// with each other, but a read cannot be allowed while writes are in progress,
+/// nor a write while reads are in progress. Any read-modify-write (rmw)
+/// operation is therefore exclusive.
+///
+/// tri_mutex is based on seastar::shared_mutex, but instead of two kinds of
+/// waiters, tri_mutex keeps track of three kinds of lock users:
+/// - readers
+/// - writers
+/// - exclusive users
+class tri_mutex : private read_lock,
+ write_lock,
+ excl_lock,
+ excl_lock_from_read,
+ excl_lock_from_write,
+ excl_lock_from_excl
+{
+public:
+ tri_mutex() = default;
+ ~tri_mutex();
+
+ read_lock& for_read() {
+ return *this;
+ }
+ write_lock& for_write() {
+ return *this;
+ }
+ excl_lock& for_excl() {
+ return *this;
+ }
+ excl_lock_from_read& excl_from_read() {
+ return *this;
+ }
+ excl_lock_from_write& excl_from_write() {
+ return *this;
+ }
+  excl_lock_from_excl& excl_from_excl() {
+ return *this;
+ }
+
+ // for shared readers
+ seastar::future<> lock_for_read();
+ bool try_lock_for_read() noexcept;
+ void unlock_for_read();
+ void promote_from_read();
+ void demote_to_read();
+ unsigned get_readers() const {
+ return readers;
+ }
+
+ // for shared writers
+ seastar::future<> lock_for_write(bool greedy);
+ bool try_lock_for_write(bool greedy) noexcept;
+ void unlock_for_write();
+ void promote_from_write();
+ void demote_to_write();
+ unsigned get_writers() const {
+ return writers;
+ }
+
+ // for exclusive users
+ seastar::future<> lock_for_excl();
+ bool try_lock_for_excl() noexcept;
+ void unlock_for_excl();
+ bool is_excl_acquired() const {
+ return exclusively_used;
+ }
+
+ bool is_acquired() const;
+
+ /// pass the provided exception to any waiting waiters
+ template<typename Exception>
+ void abort(Exception ex) {
+ while (!waiters.empty()) {
+ auto& waiter = waiters.front();
+ waiter.pr.set_exception(std::make_exception_ptr(ex));
+ waiters.pop_front();
+ }
+ }
+
+private:
+ void wake();
+ unsigned readers = 0;
+ unsigned writers = 0;
+ bool exclusively_used = false;
+ enum class type_t : uint8_t {
+ read,
+ write,
+ exclusive,
+ none,
+ };
+ struct waiter_t {
+ waiter_t(seastar::promise<>&& pr, type_t type)
+ : pr(std::move(pr)), type(type)
+ {}
+ seastar::promise<> pr;
+ type_t type;
+ };
+ seastar::circular_buffer<waiter_t> waiters;
+ friend class read_lock;
+ friend class write_lock;
+ friend class excl_lock;
+ friend class excl_lock_from_read;
+ friend class excl_lock_from_write;
+ friend class excl_lock_from_excl;
+};
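+
+// Illustrative usage sketch: reads pipeline with reads, writes with writes,
+// and exclusive users exclude everyone else. Either call the tri_mutex
+// methods directly, or go through the per-mode lock interfaces:
+//
+//   tri_mutex m;
+//   auto& excl = m.for_excl();
+//   return excl.lock().then([&excl] {
+//     // ... exclusive section ...
+//     excl.unlock();
+//   });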
diff --git a/src/crimson/common/type_helpers.h b/src/crimson/common/type_helpers.h
new file mode 100644
index 000000000..4c606581f
--- /dev/null
+++ b/src/crimson/common/type_helpers.h
@@ -0,0 +1,8 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "boost/intrusive_ptr.hpp"
+
+template<typename T> using Ref = boost::intrusive_ptr<T>;
diff --git a/src/crimson/mgr/client.cc b/src/crimson/mgr/client.cc
new file mode 100644
index 000000000..5aa8a88ba
--- /dev/null
+++ b/src/crimson/mgr/client.cc
@@ -0,0 +1,166 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "client.h"
+
+#include <seastar/core/sleep.hh>
+
+#include "crimson/common/log.h"
+#include "crimson/net/Connection.h"
+#include "crimson/net/Messenger.h"
+#include "messages/MMgrConfigure.h"
+#include "messages/MMgrMap.h"
+#include "messages/MMgrOpen.h"
+
+namespace {
+ seastar::logger& logger()
+ {
+ return crimson::get_logger(ceph_subsys_mgrc);
+ }
+}
+
+using crimson::common::local_conf;
+
+namespace crimson::mgr
+{
+
+Client::Client(crimson::net::Messenger& msgr,
+ WithStats& with_stats)
+ : msgr{msgr},
+ with_stats{with_stats},
+ report_timer{[this] {report();}}
+{}
+
+seastar::future<> Client::start()
+{
+ return seastar::now();
+}
+
+seastar::future<> Client::stop()
+{
+ logger().info("{}", __func__);
+ report_timer.cancel();
+ auto fut = gate.close();
+ if (conn) {
+ conn->mark_down();
+ }
+ return fut;
+}
+
+std::optional<seastar::future<>>
+Client::ms_dispatch(crimson::net::ConnectionRef conn, MessageRef m)
+{
+ bool dispatched = true;
+ gate.dispatch_in_background(__func__, *this, [this, conn, &m, &dispatched] {
+ switch(m->get_type()) {
+ case MSG_MGR_MAP:
+ return handle_mgr_map(conn, boost::static_pointer_cast<MMgrMap>(m));
+ case MSG_MGR_CONFIGURE:
+ return handle_mgr_conf(conn, boost::static_pointer_cast<MMgrConfigure>(m));
+ default:
+ dispatched = false;
+ return seastar::now();
+ }
+ });
+ return (dispatched ? std::make_optional(seastar::now()) : std::nullopt);
+}
+
+void Client::ms_handle_connect(crimson::net::ConnectionRef c)
+{
+ gate.dispatch_in_background(__func__, *this, [this, c] {
+ if (conn == c) {
+ // ask for the mgrconfigure message
+ auto m = ceph::make_message<MMgrOpen>();
+ m->daemon_name = local_conf()->name.get_id();
+ return conn->send(std::move(m));
+ } else {
+ return seastar::now();
+ }
+ });
+}
+
+void Client::ms_handle_reset(crimson::net::ConnectionRef c, bool /* is_replace */)
+{
+ gate.dispatch_in_background(__func__, *this, [this, c] {
+ if (conn == c) {
+ report_timer.cancel();
+ return reconnect();
+ } else {
+ return seastar::now();
+ }
+ });
+}
+
+seastar::future<> Client::reconnect()
+{
+ if (conn) {
+ conn->mark_down();
+ conn = {};
+ }
+ if (!mgrmap.get_available()) {
+ logger().warn("No active mgr available yet");
+ return seastar::now();
+ }
+ auto retry_interval = std::chrono::duration<double>(
+ local_conf().get_val<double>("mgr_connect_retry_interval"));
+ auto a_while = std::chrono::duration_cast<seastar::steady_clock_type::duration>(
+ retry_interval);
+ return seastar::sleep(a_while).then([this] {
+ auto peer = mgrmap.get_active_addrs().pick_addr(msgr.get_myaddr().get_type());
+ if (peer == entity_addr_t{}) {
+ // crimson msgr only uses the first bound addr
+ logger().error("mgr.{} does not have an addr compatible with me",
+ mgrmap.get_active_name());
+ return;
+ }
+ conn = msgr.connect(peer, CEPH_ENTITY_TYPE_MGR);
+ });
+}
+
+seastar::future<> Client::handle_mgr_map(crimson::net::ConnectionRef,
+ Ref<MMgrMap> m)
+{
+ mgrmap = m->get_map();
+ if (!conn) {
+ return reconnect();
+ } else if (conn->get_peer_addr() !=
+ mgrmap.get_active_addrs().legacy_addr()) {
+ return reconnect();
+ } else {
+ return seastar::now();
+ }
+}
+
+seastar::future<> Client::handle_mgr_conf(crimson::net::ConnectionRef,
+ Ref<MMgrConfigure> m)
+{
+ logger().info("{} {}", __func__, *m);
+
+ auto report_period = std::chrono::seconds{m->stats_period};
+ if (report_period.count()) {
+ if (report_timer.armed()) {
+ report_timer.rearm(report_timer.get_timeout(), report_period);
+ } else {
+ report_timer.arm_periodic(report_period);
+ }
+ } else {
+ report_timer.cancel();
+ }
+ return seastar::now();
+}
+
+void Client::report()
+{
+ gate.dispatch_in_background(__func__, *this, [this] {
+ assert(conn);
+ auto pg_stats = with_stats.get_stats();
+ return conn->send(std::move(pg_stats));
+ });
+}
+
+void Client::print(std::ostream& out) const
+{
+ out << "mgrc ";
+}
+
+}
diff --git a/src/crimson/mgr/client.h b/src/crimson/mgr/client.h
new file mode 100644
index 000000000..ad7e1fde5
--- /dev/null
+++ b/src/crimson/mgr/client.h
@@ -0,0 +1,66 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/timer.hh>
+
+#include "crimson/common/gated.h"
+#include "crimson/net/Dispatcher.h"
+#include "crimson/net/Fwd.h"
+#include "mon/MgrMap.h"
+
+template<typename Message> using Ref = boost::intrusive_ptr<Message>;
+namespace crimson::net {
+ class Messenger;
+}
+
+class MMgrMap;
+class MMgrConfigure;
+
+namespace crimson::mgr
+{
+
+// implement WithStats if you want to report stats to mgr periodically
+class WithStats {
+public:
+ virtual MessageRef get_stats() const = 0;
+ virtual ~WithStats() {}
+};
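+
+// Illustrative sketch of an implementer (MyStatsProvider is hypothetical); an
+// OSD-side stats reporter might return a ready-to-send message such as
+// MPGStats:
+//
+//   struct MyStatsProvider : public crimson::mgr::WithStats {
+//     MessageRef get_stats() const override {
+//       return ceph::make_message<MPGStats>(/* fsid, epoch */);
+//     }
+//   };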
+
+class Client : public crimson::net::Dispatcher {
+public:
+ Client(crimson::net::Messenger& msgr,
+ WithStats& with_stats);
+ seastar::future<> start();
+ seastar::future<> stop();
+ void report();
+
+private:
+ std::optional<seastar::future<>> ms_dispatch(
+ crimson::net::ConnectionRef conn, Ref<Message> m) override;
+ void ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace) final;
+ void ms_handle_connect(crimson::net::ConnectionRef conn) final;
+ seastar::future<> handle_mgr_map(crimson::net::ConnectionRef conn,
+ Ref<MMgrMap> m);
+ seastar::future<> handle_mgr_conf(crimson::net::ConnectionRef conn,
+ Ref<MMgrConfigure> m);
+ seastar::future<> reconnect();
+
+ void print(std::ostream&) const;
+ friend std::ostream& operator<<(std::ostream& out, const Client& client);
+private:
+ MgrMap mgrmap;
+ crimson::net::Messenger& msgr;
+ WithStats& with_stats;
+ crimson::net::ConnectionRef conn;
+ seastar::timer<seastar::lowres_clock> report_timer;
+ crimson::common::Gated gate;
+};
+
+inline std::ostream& operator<<(std::ostream& out, const Client& client) {
+ client.print(out);
+ return out;
+}
+
+}
diff --git a/src/crimson/mon/MonClient.cc b/src/crimson/mon/MonClient.cc
new file mode 100644
index 000000000..9dfbb103a
--- /dev/null
+++ b/src/crimson/mon/MonClient.cc
@@ -0,0 +1,1111 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "MonClient.h"
+
+#include <random>
+
+#include <seastar/core/future-util.hh>
+#include <seastar/core/lowres_clock.hh>
+#include <seastar/core/shared_future.hh>
+#include <seastar/util/log.hh>
+
+#include "auth/AuthClientHandler.h"
+#include "auth/RotatingKeyRing.h"
+
+#include "common/hostname.h"
+
+#include "crimson/auth/KeyRing.h"
+#include "crimson/common/config_proxy.h"
+#include "crimson/common/log.h"
+#include "crimson/net/Connection.h"
+#include "crimson/net/Errors.h"
+#include "crimson/net/Messenger.h"
+
+#include "messages/MAuth.h"
+#include "messages/MAuthReply.h"
+#include "messages/MConfig.h"
+#include "messages/MLogAck.h"
+#include "messages/MMonCommand.h"
+#include "messages/MMonCommandAck.h"
+#include "messages/MMonGetMap.h"
+#include "messages/MMonGetVersion.h"
+#include "messages/MMonGetVersionReply.h"
+#include "messages/MMonMap.h"
+#include "messages/MMonSubscribe.h"
+#include "messages/MMonSubscribeAck.h"
+
+namespace {
+ seastar::logger& logger()
+ {
+ return crimson::get_logger(ceph_subsys_monc);
+ }
+}
+
+namespace crimson::mon {
+
+using crimson::common::local_conf;
+
+class Connection {
+public:
+ Connection(const AuthRegistry& auth_registry,
+ crimson::net::ConnectionRef conn,
+ KeyRing* keyring);
+ enum class auth_result_t {
+ success = 0,
+ failure,
+ canceled
+ };
+ seastar::future<> handle_auth_reply(Ref<MAuthReply> m);
+ // v1
+ seastar::future<auth_result_t> authenticate_v1(
+ epoch_t epoch,
+ const EntityName& name,
+ uint32_t want_keys);
+ // v2
+ seastar::future<auth_result_t> authenticate_v2();
+ auth::AuthClient::auth_request_t
+ get_auth_request(const EntityName& name,
+ uint32_t want_keys);
+ using secret_t = string;
+ tuple<CryptoKey, secret_t, bufferlist>
+ handle_auth_reply_more(const ceph::buffer::list& bl);
+ int handle_auth_bad_method(uint32_t old_auth_method,
+ int result,
+ const std::vector<uint32_t>& allowed_methods,
+ const std::vector<uint32_t>& allowed_modes);
+
+ // v1 and v2
+ tuple<CryptoKey, secret_t, int>
+ handle_auth_done(uint64_t new_global_id,
+ const ceph::buffer::list& bl);
+ void close();
+ bool is_my_peer(const entity_addr_t& addr) const;
+ AuthAuthorizer* get_authorizer(entity_type_t peer) const;
+ KeyStore& get_keys();
+ seastar::future<> renew_tickets();
+ seastar::future<> renew_rotating_keyring();
+
+ crimson::net::ConnectionRef get_conn();
+
+private:
+ seastar::future<> setup_session(epoch_t epoch,
+ const EntityName& name);
+ std::unique_ptr<AuthClientHandler> create_auth(crimson::auth::method_t,
+ uint64_t global_id,
+ const EntityName& name,
+ uint32_t want_keys);
+ enum class request_t {
+ rotating,
+ general,
+ };
+ seastar::future<std::optional<auth_result_t>> do_auth_single(request_t);
+ seastar::future<auth_result_t> do_auth(request_t);
+
+private:
+ bool closed = false;
+ // v1
+ seastar::shared_promise<Ref<MAuthReply>> reply;
+ // v2
+ using clock_t = seastar::lowres_system_clock;
+ clock_t::time_point auth_start;
+ crimson::auth::method_t auth_method = 0;
+ std::optional<seastar::promise<auth_result_t>> auth_done;
+ // v1 and v2
+ const AuthRegistry& auth_registry;
+ crimson::net::ConnectionRef conn;
+ std::unique_ptr<AuthClientHandler> auth;
+ std::unique_ptr<RotatingKeyRing> rotating_keyring;
+ uint64_t global_id = 0;
+ clock_t::time_point last_rotating_renew_sent;
+};
+
+Connection::Connection(const AuthRegistry& auth_registry,
+ crimson::net::ConnectionRef conn,
+ KeyRing* keyring)
+ : auth_registry{auth_registry},
+ conn{conn},
+ rotating_keyring{
+ std::make_unique<RotatingKeyRing>(nullptr,
+ CEPH_ENTITY_TYPE_OSD,
+ keyring)}
+{}
+
+seastar::future<> Connection::handle_auth_reply(Ref<MAuthReply> m)
+{
+ reply.set_value(m);
+ reply = {};
+ return seastar::now();
+}
+
+seastar::future<> Connection::renew_tickets()
+{
+ if (auth->need_tickets()) {
+ return do_auth(request_t::general).then([](auth_result_t r) {
+ if (r != auth_result_t::success) {
+ throw std::system_error(
+ make_error_code(
+ crimson::net::error::negotiation_failure));
+ }
+ });
+ }
+ return seastar::now();
+}
+
+seastar::future<> Connection::renew_rotating_keyring()
+{
+ auto now = clock_t::now();
+ auto ttl = std::chrono::seconds{
+ static_cast<long>(crimson::common::local_conf()->auth_service_ticket_ttl)};
+ auto cutoff = now - ttl / 4;
+ if (!rotating_keyring->need_new_secrets(utime_t(cutoff))) {
+ return seastar::now();
+ }
+ if (now - last_rotating_renew_sent < std::chrono::seconds{1}) {
+ logger().info("renew_rotating_keyring called too often");
+ return seastar::now();
+ }
+ last_rotating_renew_sent = now;
+ return do_auth(request_t::rotating).then([](auth_result_t r) {
+ if (r != auth_result_t::success) {
+ throw std::system_error(make_error_code(
+ crimson::net::error::negotiation_failure));
+ }
+ });
+}
+
+AuthAuthorizer* Connection::get_authorizer(entity_type_t peer) const
+{
+ if (auth) {
+ return auth->build_authorizer(peer);
+ } else {
+ return nullptr;
+ }
+}
+
+KeyStore& Connection::get_keys() {
+ return *rotating_keyring;
+}
+
+std::unique_ptr<AuthClientHandler>
+Connection::create_auth(crimson::auth::method_t protocol,
+ uint64_t global_id,
+ const EntityName& name,
+ uint32_t want_keys)
+{
+ static crimson::common::CephContext cct;
+ std::unique_ptr<AuthClientHandler> auth;
+ auth.reset(AuthClientHandler::create(&cct,
+ protocol,
+ rotating_keyring.get()));
+ if (!auth) {
+ logger().error("no handler for protocol {}", protocol);
+ throw std::system_error(make_error_code(
+ crimson::net::error::negotiation_failure));
+ }
+ auth->init(name);
+ auth->set_want_keys(want_keys);
+ auth->set_global_id(global_id);
+ return auth;
+}
+
+seastar::future<>
+Connection::setup_session(epoch_t epoch,
+ const EntityName& name)
+{
+ auto m = ceph::make_message<MAuth>();
+ m->protocol = CEPH_AUTH_UNKNOWN;
+ m->monmap_epoch = epoch;
+ __u8 struct_v = 1;
+ encode(struct_v, m->auth_payload);
+ std::vector<crimson::auth::method_t> auth_methods;
+ auth_registry.get_supported_methods(conn->get_peer_type(), &auth_methods);
+ encode(auth_methods, m->auth_payload);
+ encode(name, m->auth_payload);
+ encode(global_id, m->auth_payload);
+ return conn->send(m);
+}
+
+seastar::future<std::optional<Connection::auth_result_t>>
+Connection::do_auth_single(Connection::request_t what)
+{
+ auto m = make_message<MAuth>();
+ m->protocol = auth->get_protocol();
+ auth->prepare_build_request();
+ switch (what) {
+ case request_t::rotating:
+ auth->build_rotating_request(m->auth_payload);
+ break;
+ case request_t::general:
+ if (int ret = auth->build_request(m->auth_payload); ret) {
+ logger().error("missing/bad key for '{}'", local_conf()->name);
+ throw std::system_error(make_error_code(
+ crimson::net::error::negotiation_failure));
+ }
+ break;
+ default:
+ assert(0);
+ }
+ logger().info("sending {}", *m);
+ return conn->send(m).then([this] {
+ logger().info("waiting");
+ return reply.get_shared_future();
+ }).then([this] (Ref<MAuthReply> m) {
+ if (!m) {
+ ceph_assert(closed);
+ logger().info("do_auth: connection closed");
+ return seastar::make_ready_future<std::optional<Connection::auth_result_t>>(
+ std::make_optional(auth_result_t::canceled));
+ }
+ logger().info(
+ "do_auth: mon {} => {} returns {}: {}",
+ conn->get_messenger()->get_myaddr(),
+ conn->get_peer_addr(), *m, m->result);
+ auto p = m->result_bl.cbegin();
+ auto ret = auth->handle_response(m->result, p,
+ nullptr, nullptr);
+ if (ret != 0 && ret != -EAGAIN) {
+ logger().error(
+ "do_auth: got error {} on mon {}",
+ ret,
+ conn->get_peer_addr());
+ }
+ return seastar::make_ready_future<std::optional<Connection::auth_result_t>>(
+ ret == -EAGAIN
+ ? std::nullopt
+ : std::make_optional(ret == 0
+ ? auth_result_t::success
+ : auth_result_t::failure
+ ));
+ });
+}
+
+seastar::future<Connection::auth_result_t>
+Connection::do_auth(Connection::request_t what) {
+ return seastar::repeat_until_value([this, what]() {
+ return do_auth_single(what);
+ });
+}
+
+seastar::future<Connection::auth_result_t>
+Connection::authenticate_v1(epoch_t epoch,
+ const EntityName& name,
+ uint32_t want_keys)
+{
+ return conn->keepalive().then([epoch, name, this] {
+ return setup_session(epoch, name);
+ }).then([this] {
+ return reply.get_shared_future();
+ }).then([name, want_keys, this](Ref<MAuthReply> m) {
+ if (!m) {
+ logger().error("authenticate_v1 canceled on {}", name);
+ return seastar::make_ready_future<auth_result_t>(auth_result_t::canceled);
+ }
+ global_id = m->global_id;
+ auth = create_auth(m->protocol, m->global_id, name, want_keys);
+ switch (auto p = m->result_bl.cbegin();
+ auth->handle_response(m->result, p,
+ nullptr, nullptr)) {
+ case 0:
+ // none
+ return seastar::make_ready_future<auth_result_t>(auth_result_t::success);
+ case -EAGAIN:
+ // cephx
+ return do_auth(request_t::general);
+ default:
+ ceph_assert_always(0);
+ }
+ }).handle_exception([](auto ep) {
+ logger().error("authenticate_v1 failed with {}", ep);
+ return seastar::make_ready_future<auth_result_t>(auth_result_t::canceled);
+ });
+}
+
+seastar::future<Connection::auth_result_t> Connection::authenticate_v2()
+{
+ auth_start = seastar::lowres_system_clock::now();
+ return conn->send(make_message<MMonGetMap>()).then([this] {
+ auth_done.emplace();
+ return auth_done->get_future();
+ });
+}
+
+auth::AuthClient::auth_request_t
+Connection::get_auth_request(const EntityName& entity_name,
+ uint32_t want_keys)
+{
+ // choose method
+ auth_method = [&] {
+ std::vector<crimson::auth::method_t> methods;
+ auth_registry.get_supported_methods(conn->get_peer_type(), &methods);
+ if (methods.empty()) {
+      logger().info("get_auth_request: no methods are supported");
+      throw crimson::auth::error("no methods are supported");
+ }
+ return methods.front();
+ }();
+
+ std::vector<uint32_t> modes;
+ auth_registry.get_supported_modes(conn->get_peer_type(), auth_method,
+ &modes);
+ logger().info("method {} preferred_modes {}", auth_method, modes);
+ if (modes.empty()) {
+    throw crimson::auth::error("no modes are supported");
+ }
+ auth = create_auth(auth_method, global_id, entity_name, want_keys);
+
+ using ceph::encode;
+ bufferlist bl;
+ // initial request includes some boilerplate...
+ encode((char)AUTH_MODE_MON, bl);
+ encode(entity_name, bl);
+ encode(global_id, bl);
+ // and (maybe) some method-specific initial payload
+ auth->build_initial_request(&bl);
+ return {auth_method, modes, bl};
+}
+
+tuple<CryptoKey, Connection::secret_t, bufferlist>
+Connection::handle_auth_reply_more(const ceph::buffer::list& payload)
+{
+ CryptoKey session_key;
+ secret_t connection_secret;
+ bufferlist reply;
+ auto p = payload.cbegin();
+ int r = auth->handle_response(0, p, &session_key, &connection_secret);
+ if (r == -EAGAIN) {
+ auth->prepare_build_request();
+ auth->build_request(reply);
+ logger().info(" responding with {} bytes", reply.length());
+ return {session_key, connection_secret, reply};
+ } else if (r < 0) {
+ logger().error(" handle_response returned {}", r);
+ throw crimson::auth::error("unable to build auth");
+ } else {
+ logger().info("authenticated!");
+ std::terminate();
+ }
+}
+
+tuple<CryptoKey, Connection::secret_t, int>
+Connection::handle_auth_done(uint64_t new_global_id,
+ const ceph::buffer::list& payload)
+{
+ global_id = new_global_id;
+ auth->set_global_id(global_id);
+ auto p = payload.begin();
+ CryptoKey session_key;
+ secret_t connection_secret;
+ int r = auth->handle_response(0, p, &session_key, &connection_secret);
+ conn->set_last_keepalive_ack(auth_start);
+ if (auth_done) {
+ auth_done->set_value(auth_result_t::success);
+ auth_done.reset();
+ }
+ return {session_key, connection_secret, r};
+}
+
+int Connection::handle_auth_bad_method(uint32_t old_auth_method,
+ int result,
+ const std::vector<uint32_t>& allowed_methods,
+ const std::vector<uint32_t>& allowed_modes)
+{
+ logger().info("old_auth_method {} result {} allowed_methods {}",
+ old_auth_method, cpp_strerror(result), allowed_methods);
+ std::vector<uint32_t> auth_supported;
+ auth_registry.get_supported_methods(conn->get_peer_type(), &auth_supported);
+ auto p = std::find(auth_supported.begin(), auth_supported.end(),
+ old_auth_method);
+ assert(p != auth_supported.end());
+ p = std::find_first_of(std::next(p), auth_supported.end(),
+ allowed_methods.begin(), allowed_methods.end());
+ if (p == auth_supported.end()) {
+    logger().error("server allowed_methods {} but I only support {}",
+ allowed_methods, auth_supported);
+ assert(auth_done);
+ auth_done->set_exception(std::system_error(make_error_code(
+ crimson::net::error::negotiation_failure)));
+ return -EACCES;
+ }
+ auth_method = *p;
+ logger().info("will try {} next", auth_method);
+ return 0;
+}
+
+void Connection::close()
+{
+ reply.set_value(Ref<MAuthReply>(nullptr));
+ reply = {};
+ if (auth_done) {
+ auth_done->set_value(auth_result_t::canceled);
+ auth_done.reset();
+ }
+ if (conn && !std::exchange(closed, true)) {
+ conn->mark_down();
+ }
+}
+
+bool Connection::is_my_peer(const entity_addr_t& addr) const
+{
+ ceph_assert(conn);
+ return conn->get_peer_addr() == addr;
+}
+
+crimson::net::ConnectionRef Connection::get_conn() {
+ return conn;
+}
+
+Client::Client(crimson::net::Messenger& messenger,
+ crimson::common::AuthHandler& auth_handler)
+ // currently, crimson is OSD-only
+ : want_keys{CEPH_ENTITY_TYPE_MON |
+ CEPH_ENTITY_TYPE_OSD |
+ CEPH_ENTITY_TYPE_MGR},
+ timer{[this] { tick(); }},
+ msgr{messenger},
+ auth_registry{&cct},
+ auth_handler{auth_handler}
+{}
+
+Client::Client(Client&&) = default;
+Client::~Client() = default;
+
+seastar::future<> Client::start() {
+ entity_name = crimson::common::local_conf()->name;
+ auth_registry.refresh_config();
+ return load_keyring().then([this] {
+ return monmap.build_initial(crimson::common::local_conf(), false);
+ }).then([this] {
+ return authenticate();
+ }).then([this] {
+ auto interval =
+ std::chrono::duration_cast<seastar::lowres_clock::duration>(
+ std::chrono::duration<double>(
+ local_conf().get_val<double>("mon_client_ping_interval")));
+ timer.arm_periodic(interval);
+ });
+}
+
+seastar::future<> Client::load_keyring()
+{
+ if (!auth_registry.is_supported_method(msgr.get_mytype(), CEPH_AUTH_CEPHX)) {
+ return seastar::now();
+ } else {
+ return crimson::auth::load_from_keyring(&keyring).then([](KeyRing* keyring) {
+ return crimson::auth::load_from_keyfile(keyring);
+ }).then([](KeyRing* keyring) {
+ return crimson::auth::load_from_key(keyring);
+ }).then([](KeyRing*) {
+ return seastar::now();
+ });
+ }
+}
+
+void Client::tick()
+{
+ gate.dispatch_in_background(__func__, *this, [this] {
+ if (active_con) {
+ return seastar::when_all_succeed(active_con->get_conn()->keepalive(),
+ active_con->renew_tickets(),
+ active_con->renew_rotating_keyring()).then_unpack([] {});
+ } else {
+ return seastar::now();
+ }
+ });
+}
+
+bool Client::is_hunting() const {
+ return !active_con;
+}
+
+std::optional<seastar::future<>>
+Client::ms_dispatch(crimson::net::ConnectionRef conn, MessageRef m)
+{
+ bool dispatched = true;
+ gate.dispatch_in_background(__func__, *this, [this, conn, &m, &dispatched] {
+ // we only care about these message types
+ switch (m->get_type()) {
+ case CEPH_MSG_MON_MAP:
+ return handle_monmap(conn, boost::static_pointer_cast<MMonMap>(m));
+ case CEPH_MSG_AUTH_REPLY:
+ return handle_auth_reply(
+ conn, boost::static_pointer_cast<MAuthReply>(m));
+ case CEPH_MSG_MON_SUBSCRIBE_ACK:
+ return handle_subscribe_ack(
+ boost::static_pointer_cast<MMonSubscribeAck>(m));
+ case CEPH_MSG_MON_GET_VERSION_REPLY:
+ return handle_get_version_reply(
+ boost::static_pointer_cast<MMonGetVersionReply>(m));
+ case MSG_MON_COMMAND_ACK:
+ return handle_mon_command_ack(
+ boost::static_pointer_cast<MMonCommandAck>(m));
+ case MSG_LOGACK:
+ return handle_log_ack(
+ boost::static_pointer_cast<MLogAck>(m));
+ case MSG_CONFIG:
+ return handle_config(
+ boost::static_pointer_cast<MConfig>(m));
+ default:
+ dispatched = false;
+ return seastar::now();
+ }
+ });
+ return (dispatched ? std::make_optional(seastar::now()) : std::nullopt);
+}
+
+void Client::ms_handle_reset(crimson::net::ConnectionRef conn, bool /* is_replace */)
+{
+ gate.dispatch_in_background(__func__, *this, [this, conn] {
+ auto found = std::find_if(pending_conns.begin(), pending_conns.end(),
+ [peer_addr = conn->get_peer_addr()](auto& mc) {
+ return mc->is_my_peer(peer_addr);
+ });
+ if (found != pending_conns.end()) {
+ logger().warn("pending conn reset by {}", conn->get_peer_addr());
+ (*found)->close();
+ return seastar::now();
+ } else if (active_con && active_con->is_my_peer(conn->get_peer_addr())) {
+ logger().warn("active conn reset {}", conn->get_peer_addr());
+ active_con.reset();
+ return reopen_session(-1).then([this] {
+ send_pendings();
+ return seastar::now();
+ });
+ } else {
+ return seastar::now();
+ }
+ });
+}
+
+std::pair<std::vector<uint32_t>, std::vector<uint32_t>>
+Client::get_supported_auth_methods(int peer_type)
+{
+ std::vector<uint32_t> methods;
+ std::vector<uint32_t> modes;
+ auth_registry.get_supported_methods(peer_type, &methods, &modes);
+ return {methods, modes};
+}
+
+uint32_t Client::pick_con_mode(int peer_type,
+ uint32_t auth_method,
+ const std::vector<uint32_t>& preferred_modes)
+{
+ return auth_registry.pick_mode(peer_type, auth_method, preferred_modes);
+}
+
+AuthAuthorizeHandler* Client::get_auth_authorize_handler(int peer_type,
+ int auth_method)
+{
+ return auth_registry.get_handler(peer_type, auth_method);
+}
+
+
+int Client::handle_auth_request(crimson::net::ConnectionRef con,
+ AuthConnectionMetaRef auth_meta,
+ bool more,
+ uint32_t auth_method,
+ const ceph::bufferlist& payload,
+ ceph::bufferlist *reply)
+{
+ // for some channels prior to nautilus (osd heartbeat), we tolerate the lack of
+ // an authorizer.
+ if (payload.length() == 0) {
+ if (con->get_messenger()->get_require_authorizer()) {
+ return -EACCES;
+ } else {
+ auth_handler.handle_authentication({}, {});
+ return 1;
+ }
+ }
+ auth_meta->auth_mode = payload[0];
+ if (auth_meta->auth_mode < AUTH_MODE_AUTHORIZER ||
+ auth_meta->auth_mode > AUTH_MODE_AUTHORIZER_MAX) {
+ return -EACCES;
+ }
+ AuthAuthorizeHandler* ah = get_auth_authorize_handler(con->get_peer_type(),
+ auth_method);
+ if (!ah) {
+ logger().error("no AuthAuthorizeHandler found for auth method: {}",
+ auth_method);
+ return -EOPNOTSUPP;
+ }
+ auto authorizer_challenge = &auth_meta->authorizer_challenge;
+ if (auth_meta->skip_authorizer_challenge) {
+ logger().info("skipping challenge on {}", con);
+ authorizer_challenge = nullptr;
+ }
+ bool was_challenge = (bool)auth_meta->authorizer_challenge;
+ EntityName name;
+ AuthCapsInfo caps_info;
+ bool is_valid = ah->verify_authorizer(
+ &cct,
+ active_con->get_keys(),
+ payload,
+ auth_meta->get_connection_secret_length(),
+ reply,
+ &name,
+ &active_con->get_conn()->peer_global_id,
+ &caps_info,
+ &auth_meta->session_key,
+ &auth_meta->connection_secret,
+ authorizer_challenge);
+ if (is_valid) {
+ auth_handler.handle_authentication(name, caps_info);
+ return 1;
+ }
+ if (!more && !was_challenge && auth_meta->authorizer_challenge) {
+ logger().info("added challenge on {}", con);
+ return 0;
+ } else {
+ logger().info("bad authorizer on {}", con);
+ return -EACCES;
+ }
+}
+
+auth::AuthClient::auth_request_t
+Client::get_auth_request(crimson::net::ConnectionRef con,
+ AuthConnectionMetaRef auth_meta)
+{
+ logger().info("get_auth_request(con={}, auth_method={})",
+ con, auth_meta->auth_method);
+ // connection to mon?
+ if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
+ auto found = std::find_if(pending_conns.begin(), pending_conns.end(),
+ [peer_addr = con->get_peer_addr()](auto& mc) {
+ return mc->is_my_peer(peer_addr);
+ });
+ if (found == pending_conns.end()) {
+ throw crimson::auth::error{"unknown connection"};
+ }
+ return (*found)->get_auth_request(entity_name, want_keys);
+ } else {
+ // generate authorizer
+ if (!active_con) {
+ logger().error(" but no auth handler is set up");
+ throw crimson::auth::error("no auth available");
+ }
+ auto authorizer = active_con->get_authorizer(con->get_peer_type());
+ if (!authorizer) {
+ logger().error("failed to build_authorizer for type {}",
+ ceph_entity_type_name(con->get_peer_type()));
+ throw crimson::auth::error("unable to build auth");
+ }
+ auth_meta->authorizer.reset(authorizer);
+ auth_meta->auth_method = authorizer->protocol;
+ vector<uint32_t> modes;
+ auth_registry.get_supported_modes(con->get_peer_type(),
+ auth_meta->auth_method,
+ &modes);
+ return {authorizer->protocol, modes, authorizer->bl};
+ }
+}
+
+ceph::bufferlist Client::handle_auth_reply_more(crimson::net::ConnectionRef conn,
+ AuthConnectionMetaRef auth_meta,
+ const bufferlist& bl)
+{
+ if (conn->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
+ auto found = std::find_if(pending_conns.begin(), pending_conns.end(),
+ [peer_addr = conn->get_peer_addr()](auto& mc) {
+ return mc->is_my_peer(peer_addr);
+ });
+ if (found == pending_conns.end()) {
+ throw crimson::auth::error{"unknown connection"};
+ }
+ bufferlist reply;
+ tie(auth_meta->session_key, auth_meta->connection_secret, reply) =
+ (*found)->handle_auth_reply_more(bl);
+ return reply;
+ } else {
+ // authorizer challenges
+ if (!active_con || !auth_meta->authorizer) {
+ logger().error("no authorizer?");
+ throw crimson::auth::error("no auth available");
+ }
+ auth_meta->authorizer->add_challenge(&cct, bl);
+ return auth_meta->authorizer->bl;
+ }
+}
+
+int Client::handle_auth_done(crimson::net::ConnectionRef conn,
+ AuthConnectionMetaRef auth_meta,
+ uint64_t global_id,
+ uint32_t con_mode,
+ const bufferlist& bl)
+{
+ if (conn->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
+ auto found = std::find_if(pending_conns.begin(), pending_conns.end(),
+ [peer_addr = conn->get_peer_addr()](auto& mc) {
+ return mc->is_my_peer(peer_addr);
+ });
+ if (found == pending_conns.end()) {
+ return -ENOENT;
+ }
+ int r = 0;
+ tie(auth_meta->session_key, auth_meta->connection_secret, r) =
+ (*found)->handle_auth_done(global_id, bl);
+ return r;
+ } else {
+ // verify authorizer reply
+ auto p = bl.begin();
+ if (!auth_meta->authorizer->verify_reply(p, &auth_meta->connection_secret)) {
+ logger().error("failed verifying authorizer reply");
+ return -EACCES;
+ }
+ auth_meta->session_key = auth_meta->authorizer->session_key;
+ return 0;
+ }
+}
+
+// Handle server's indication that the previous auth attempt failed
+int Client::handle_auth_bad_method(crimson::net::ConnectionRef conn,
+ AuthConnectionMetaRef auth_meta,
+ uint32_t old_auth_method,
+ int result,
+ const std::vector<uint32_t>& allowed_methods,
+ const std::vector<uint32_t>& allowed_modes)
+{
+ if (conn->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
+ auto found = std::find_if(pending_conns.begin(), pending_conns.end(),
+ [peer_addr = conn->get_peer_addr()](auto& mc) {
+ return mc->is_my_peer(peer_addr);
+ });
+ if (found != pending_conns.end()) {
+ return (*found)->handle_auth_bad_method(
+ old_auth_method, result,
+ allowed_methods, allowed_modes);
+ } else {
+ return -ENOENT;
+ }
+ } else {
+ // huh...
+ logger().info("hmm, they didn't like {} result {}",
+ old_auth_method, cpp_strerror(result));
+ return -EACCES;
+ }
+}
+
+seastar::future<> Client::handle_monmap(crimson::net::ConnectionRef conn,
+ Ref<MMonMap> m)
+{
+ monmap.decode(m->monmapbl);
+ const auto peer_addr = conn->get_peer_addr();
+ auto cur_mon = monmap.get_name(peer_addr);
+ logger().info("got monmap {}, mon.{}, is now rank {}",
+ monmap.epoch, cur_mon, monmap.get_rank(cur_mon));
+ sub.got("monmap", monmap.get_epoch());
+
+ if (monmap.get_addr_name(peer_addr, cur_mon)) {
+ if (active_con) {
+ logger().info("handle_monmap: renewing tickets");
+ return seastar::when_all_succeed(
+ active_con->renew_tickets(),
+ active_con->renew_rotating_keyring()).then_unpack([](){
+ logger().info("handle_mon_map: renewed tickets");
+ });
+ } else {
+ return seastar::now();
+ }
+ } else {
+ logger().warn("mon.{} went away", cur_mon);
+ return reopen_session(-1).then([this] {
+ send_pendings();
+ return seastar::now();
+ });
+ }
+}
+
+seastar::future<> Client::handle_auth_reply(crimson::net::ConnectionRef conn,
+ Ref<MAuthReply> m)
+{
+ logger().info(
+ "handle_auth_reply mon {} => {} returns {}: {}",
+ conn->get_messenger()->get_myaddr(),
+ conn->get_peer_addr(), *m, m->result);
+ auto found = std::find_if(pending_conns.begin(), pending_conns.end(),
+ [peer_addr = conn->get_peer_addr()](auto& mc) {
+ return mc->is_my_peer(peer_addr);
+ });
+ if (found != pending_conns.end()) {
+ return (*found)->handle_auth_reply(m);
+ } else if (active_con) {
+ return active_con->handle_auth_reply(m);
+ } else {
+ logger().error("unknown auth reply from {}", conn->get_peer_addr());
+ return seastar::now();
+ }
+}
+
+seastar::future<> Client::handle_subscribe_ack(Ref<MMonSubscribeAck> m)
+{
+ sub.acked(m->interval);
+ return seastar::now();
+}
+
+Client::get_version_t Client::get_version(const std::string& map)
+{
+ auto m = make_message<MMonGetVersion>();
+ auto tid = ++last_version_req_id;
+ m->handle = tid;
+ m->what = map;
+ auto& req = version_reqs[tid];
+ return send_message(m).then([&req] {
+ return req.get_future();
+ });
+}
+
+seastar::future<>
+Client::handle_get_version_reply(Ref<MMonGetVersionReply> m)
+{
+ if (auto found = version_reqs.find(m->handle);
+ found != version_reqs.end()) {
+ auto& result = found->second;
+ logger().trace("{}: {} returns {}",
+ __func__, m->handle, m->version);
+ result.set_value(std::make_tuple(m->version, m->oldest_version));
+ version_reqs.erase(found);
+ } else {
+ logger().warn("{}: version request with handle {} not found",
+ __func__, m->handle);
+ }
+ return seastar::now();
+}
+
+seastar::future<> Client::handle_mon_command_ack(Ref<MMonCommandAck> m)
+{
+ const auto tid = m->get_tid();
+ if (auto found = mon_commands.find(tid);
+ found != mon_commands.end()) {
+ auto& result = found->second;
+ logger().trace("{} {}", __func__, tid);
+ result.set_value(std::make_tuple(m->r, m->rs, std::move(m->get_data())));
+ mon_commands.erase(found);
+ } else {
+ logger().warn("{} {} not found", __func__, tid);
+ }
+ return seastar::now();
+}
+
+seastar::future<> Client::handle_log_ack(Ref<MLogAck> m)
+{
+ // XXX
+ return seastar::now();
+}
+
+seastar::future<> Client::handle_config(Ref<MConfig> m)
+{
+ return crimson::common::local_conf().set_mon_vals(m->config);
+}
+
+std::vector<unsigned> Client::get_random_mons(unsigned n) const
+{
+ uint16_t min_priority = std::numeric_limits<uint16_t>::max();
+ for (const auto& m : monmap.mon_info) {
+ if (m.second.priority < min_priority) {
+ min_priority = m.second.priority;
+ }
+ }
+ vector<unsigned> ranks;
+ for (auto [name, info] : monmap.mon_info) {
+ if (info.priority == min_priority) {
+ ranks.push_back(monmap.get_rank(name));
+ }
+ }
+ std::random_device rd;
+ std::default_random_engine rng{rd()};
+ std::shuffle(ranks.begin(), ranks.end(), rng);
+ if (n == 0 || n > ranks.size()) {
+ return ranks;
+ } else {
+ return {ranks.begin(), ranks.begin() + n};
+ }
+}
+
+seastar::future<> Client::authenticate()
+{
+ return reopen_session(-1).then([this] {
+ send_pendings();
+ return seastar::now();
+ });
+}
+
+seastar::future<> Client::stop()
+{
+ logger().info("{}", __func__);
+ auto fut = gate.close();
+ timer.cancel();
+ for (auto& pending_con : pending_conns) {
+ pending_con->close();
+ }
+ if (active_con) {
+ active_con->close();
+ }
+ return fut;
+}
+
+seastar::future<> Client::reopen_session(int rank)
+{
+ logger().info("{} to mon.{}", __func__, rank);
+ vector<unsigned> mons;
+ if (rank >= 0) {
+ mons.push_back(rank);
+ } else {
+ const auto parallel =
+ crimson::common::local_conf().get_val<uint64_t>("mon_client_hunt_parallel");
+ mons = get_random_mons(parallel);
+ }
+ pending_conns.reserve(mons.size());
+ return seastar::parallel_for_each(mons, [this](auto rank) {
+ // TODO: connect to multiple addrs
+ auto peer = monmap.get_addrs(rank).pick_addr(msgr.get_myaddr().get_type());
+ if (peer == entity_addr_t{}) {
+ // crimson msgr only uses the first bound addr
+ logger().warn("mon.{} does not have an addr compatible with me", rank);
+ return seastar::now();
+ }
+ logger().info("connecting to mon.{}", rank);
+ return seastar::futurize_invoke(
+ [peer, this] () -> seastar::future<Connection::auth_result_t> {
+ auto conn = msgr.connect(peer, CEPH_ENTITY_TYPE_MON);
+ auto& mc = pending_conns.emplace_back(
+ std::make_unique<Connection>(auth_registry, conn, &keyring));
+ if (conn->get_peer_addr().is_msgr2()) {
+ return mc->authenticate_v2();
+ } else {
+ return mc->authenticate_v1(monmap.get_epoch(), entity_name, want_keys)
+ .handle_exception([conn](auto ep) {
+ conn->mark_down();
+ return seastar::make_exception_future<Connection::auth_result_t>(ep);
+ });
+ }
+ }).then([peer, this](auto result) {
+ if (result == Connection::auth_result_t::success) {
+ _finish_auth(peer);
+ }
+ logger().debug("reopen_session mon connection attempts complete");
+ }).handle_exception([](auto ep) {
+ logger().error("mon connections failed with ep {}", ep);
+ return seastar::make_exception_future(ep);
+ });
+ }).then([this] {
+ if (!active_con) {
+ return seastar::make_exception_future(
+ crimson::common::system_shutdown_exception());
+ }
+ return active_con->renew_rotating_keyring();
+ });
+}
+
+void Client::_finish_auth(const entity_addr_t& peer)
+{
+ if (!is_hunting()) {
+ return;
+ }
+ logger().info("found mon.{}", monmap.get_name(peer));
+
+ auto found = std::find_if(
+ pending_conns.begin(), pending_conns.end(),
+ [peer](auto& conn) {
+ return conn->is_my_peer(peer);
+ });
+ if (found == pending_conns.end()) {
+ // Happens if another connection has won the race
+ ceph_assert(active_con && pending_conns.empty());
+ logger().info("no pending connection for mon.{}, peer {}",
+ monmap.get_name(peer), peer);
+ return;
+ }
+
+ ceph_assert(!active_con && !pending_conns.empty());
+ active_con = std::move(*found);
+ found->reset();
+ for (auto& conn : pending_conns) {
+ if (conn) {
+ conn->close();
+ }
+ }
+ pending_conns.clear();
+}
+
+Client::command_result_t
+Client::run_command(const std::vector<std::string>& cmd,
+ const bufferlist& bl)
+{
+ auto m = make_message<MMonCommand>(monmap.fsid);
+ auto tid = ++last_mon_command_id;
+ m->set_tid(tid);
+ m->cmd = cmd;
+ m->set_data(bl);
+ auto& req = mon_commands[tid];
+ return send_message(m).then([&req] {
+ return req.get_future();
+ });
+}
+
+seastar::future<> Client::send_message(MessageRef m)
+{
+ if (active_con) {
+ if (!pending_messages.empty()) {
+ send_pendings();
+ }
+ return active_con->get_conn()->send(m);
+ }
+ auto& delayed = pending_messages.emplace_back(m);
+ return delayed.pr.get_future();
+}
+
+void Client::send_pendings()
+{
+ if (active_con) {
+ for (auto& m : pending_messages) {
+ (void) active_con->get_conn()->send(m.msg);
+ m.pr.set_value();
+ }
+ pending_messages.clear();
+ }
+}
+
+bool Client::sub_want(const std::string& what, version_t start, unsigned flags)
+{
+ return sub.want(what, start, flags);
+}
+
+void Client::sub_got(const std::string& what, version_t have)
+{
+ sub.got(what, have);
+}
+
+void Client::sub_unwant(const std::string& what)
+{
+ sub.unwant(what);
+}
+
+bool Client::sub_want_increment(const std::string& what,
+ version_t start,
+ unsigned flags)
+{
+ return sub.inc_want(what, start, flags);
+}
+
+seastar::future<> Client::renew_subs()
+{
+ if (!sub.have_new()) {
+ logger().warn("{} - empty", __func__);
+ return seastar::now();
+ }
+ logger().trace("{}", __func__);
+
+ auto m = make_message<MMonSubscribe>();
+ m->what = sub.get_subs();
+ m->hostname = ceph_get_short_hostname();
+ return send_message(m).then([this] {
+ sub.renewed();
+ });
+}
+
+void Client::print(std::ostream& out) const
+{
+ out << "mon." << entity_name;
+}
+
+} // namespace crimson::mon
diff --git a/src/crimson/mon/MonClient.h b/src/crimson/mon/MonClient.h
new file mode 100644
index 000000000..e7d2df863
--- /dev/null
+++ b/src/crimson/mon/MonClient.h
@@ -0,0 +1,183 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <memory>
+
+#include <seastar/core/future.hh>
+#include <seastar/core/gate.hh>
+#include <seastar/core/lowres_clock.hh>
+#include <seastar/core/timer.hh>
+
+#include "auth/AuthRegistry.h"
+#include "auth/KeyRing.h"
+#include "common/ceph_context.h"
+
+#include "crimson/auth/AuthClient.h"
+#include "crimson/auth/AuthServer.h"
+#include "crimson/common/auth_handler.h"
+#include "crimson/common/gated.h"
+#include "crimson/net/Dispatcher.h"
+#include "crimson/net/Fwd.h"
+
+#include "mon/MonMap.h"
+
+#include "mon/MonSub.h"
+
+template<typename Message> using Ref = boost::intrusive_ptr<Message>;
+namespace crimson::net {
+ class Messenger;
+}
+
+struct AuthAuthorizeHandler;
+class MAuthReply;
+struct MMonMap;
+struct MMonSubscribeAck;
+struct MMonGetVersionReply;
+struct MMonCommandAck;
+struct MLogAck;
+struct MConfig;
+
+namespace crimson::mon {
+
+class Connection;
+
+class Client : public crimson::net::Dispatcher,
+ public crimson::auth::AuthClient,
+ public crimson::auth::AuthServer
+{
+ EntityName entity_name;
+ KeyRing keyring;
+ const uint32_t want_keys;
+
+ MonMap monmap;
+ std::unique_ptr<Connection> active_con;
+ std::vector<std::unique_ptr<Connection>> pending_conns;
+ seastar::timer<seastar::lowres_clock> timer;
+
+ crimson::net::Messenger& msgr;
+
+ // commands
+ using get_version_t = seastar::future<std::tuple<version_t, version_t>>;
+
+ ceph_tid_t last_version_req_id = 0;
+ std::map<ceph_tid_t, typename get_version_t::promise_type> version_reqs;
+
+ ceph_tid_t last_mon_command_id = 0;
+ using command_result_t =
+ seastar::future<std::tuple<std::int32_t, std::string, ceph::bufferlist>>;
+ std::map<ceph_tid_t, typename command_result_t::promise_type> mon_commands;
+
+ MonSub sub;
+
+public:
+ Client(crimson::net::Messenger&, crimson::common::AuthHandler&);
+ Client(Client&&);
+ ~Client();
+ seastar::future<> start();
+ seastar::future<> stop();
+
+ const uuid_d& get_fsid() const {
+ return monmap.fsid;
+ }
+ get_version_t get_version(const std::string& map);
+ command_result_t run_command(const std::vector<std::string>& cmd,
+ const bufferlist& bl);
+ seastar::future<> send_message(MessageRef);
+ bool sub_want(const std::string& what, version_t start, unsigned flags);
+ void sub_got(const std::string& what, version_t have);
+ void sub_unwant(const std::string& what);
+ bool sub_want_increment(const std::string& what, version_t start, unsigned flags);
+ seastar::future<> renew_subs();
+
+ void print(std::ostream&) const;
+private:
+ // AuthServer methods
+ std::pair<std::vector<uint32_t>, std::vector<uint32_t>>
+ get_supported_auth_methods(int peer_type) final;
+ uint32_t pick_con_mode(int peer_type,
+ uint32_t auth_method,
+ const std::vector<uint32_t>& preferred_modes) final;
+ AuthAuthorizeHandler* get_auth_authorize_handler(int peer_type,
+ int auth_method) final;
+ int handle_auth_request(crimson::net::ConnectionRef conn,
+ AuthConnectionMetaRef auth_meta,
+ bool more,
+ uint32_t auth_method,
+ const ceph::bufferlist& payload,
+ ceph::bufferlist *reply) final;
+
+ crimson::common::CephContext cct; // for auth_registry
+ AuthRegistry auth_registry;
+ crimson::common::AuthHandler& auth_handler;
+
+ // AuthClient methods
+ crimson::auth::AuthClient::auth_request_t
+ get_auth_request(crimson::net::ConnectionRef conn,
+ AuthConnectionMetaRef auth_meta) final;
+
+ // Handle server's request to continue the handshake
+ ceph::bufferlist handle_auth_reply_more(crimson::net::ConnectionRef conn,
+ AuthConnectionMetaRef auth_meta,
+ const bufferlist& bl) final;
+
+ // Handle server's indication that authentication succeeded
+ int handle_auth_done(crimson::net::ConnectionRef conn,
+ AuthConnectionMetaRef auth_meta,
+ uint64_t global_id,
+ uint32_t con_mode,
+ const bufferlist& bl) final;
+
+ // Handle server's indication that the previous auth attempt failed
+ int handle_auth_bad_method(crimson::net::ConnectionRef conn,
+ AuthConnectionMetaRef auth_meta,
+ uint32_t old_auth_method,
+ int result,
+ const std::vector<uint32_t>& allowed_methods,
+ const std::vector<uint32_t>& allowed_modes) final;
+
+private:
+ void tick();
+
+ std::optional<seastar::future<>> ms_dispatch(crimson::net::ConnectionRef conn,
+ MessageRef m) override;
+ void ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace) override;
+
+ seastar::future<> handle_monmap(crimson::net::ConnectionRef conn,
+ Ref<MMonMap> m);
+ seastar::future<> handle_auth_reply(crimson::net::ConnectionRef conn,
+ Ref<MAuthReply> m);
+ seastar::future<> handle_subscribe_ack(Ref<MMonSubscribeAck> m);
+ seastar::future<> handle_get_version_reply(Ref<MMonGetVersionReply> m);
+ seastar::future<> handle_mon_command_ack(Ref<MMonCommandAck> m);
+ seastar::future<> handle_log_ack(Ref<MLogAck> m);
+ seastar::future<> handle_config(Ref<MConfig> m);
+
+ void send_pendings();
+private:
+ seastar::future<> load_keyring();
+ seastar::future<> authenticate();
+
+ bool is_hunting() const;
+ seastar::future<> reopen_session(int rank);
+ std::vector<unsigned> get_random_mons(unsigned n) const;
+ seastar::future<> _add_conn(unsigned rank, uint64_t global_id);
+ void _finish_auth(const entity_addr_t& peer);
+ crimson::common::Gated gate;
+
+ // messages that are waiting for the active_con to be available
+ struct pending_msg_t {
+ pending_msg_t(MessageRef& m) : msg(m) {}
+ MessageRef msg;
+ seastar::promise<> pr;
+ };
+ std::deque<pending_msg_t> pending_messages;
+};
+
+inline std::ostream& operator<<(std::ostream& out, const Client& client) {
+ client.print(out);
+ return out;
+}
+
+} // namespace crimson::mon
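
Not part of the patch: a sketch of how a caller might consume
Client::run_command() as declared above. The command JSON and the logging
subsystem are illustrative.

#include <string>
#include <vector>

#include "crimson/common/log.h"
#include "crimson/mon/MonClient.h"

seastar::future<> report_osd_stat(crimson::mon::Client& monc)
{
  std::vector<std::string> cmd{R"({"prefix": "osd stat", "format": "json"})"};
  return monc.run_command(cmd, {}).then([](auto&& reply) {
    // run_command() resolves to {return code, status string, output buffer}
    auto [rc, outs, outbl] = std::move(reply);
    if (rc != 0) {
      crimson::get_logger(ceph_subsys_monc).warn("osd stat failed: {}", outs);
    }
    return seastar::now();
  });
}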
diff --git a/src/crimson/net/Connection.h b/src/crimson/net/Connection.h
new file mode 100644
index 000000000..6af12692e
--- /dev/null
+++ b/src/crimson/net/Connection.h
@@ -0,0 +1,175 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <queue>
+#include <seastar/core/future.hh>
+#include <seastar/core/shared_ptr.hh>
+
+#include "Fwd.h"
+
+namespace crimson::net {
+
+#ifdef UNIT_TESTS_BUILT
+class Interceptor;
+#endif
+
+using seq_num_t = uint64_t;
+
+class Connection : public seastar::enable_shared_from_this<Connection> {
+ entity_name_t peer_name = {0, entity_name_t::NEW};
+
+ protected:
+ entity_addr_t peer_addr;
+
+ // which of the peer_addrs we're connecting to (as client)
+ // or should reconnect to (as peer)
+ entity_addr_t target_addr;
+
+ using clock_t = seastar::lowres_system_clock;
+ clock_t::time_point last_keepalive;
+ clock_t::time_point last_keepalive_ack;
+
+ void set_peer_type(entity_type_t peer_type) {
+ // it is not allowed to assign an unknown value when the current
+ // value is known
+ assert(!(peer_type == 0 &&
+ peer_name.type() != 0));
+ // it is not allowed to assign a different known value when the
+ // current value is also known.
+ assert(!(peer_type != 0 &&
+ peer_name.type() != 0 &&
+ peer_type != peer_name.type()));
+ peer_name._type = peer_type;
+ }
+ void set_peer_id(int64_t peer_id) {
+ // it is not allowed to assign an unknown value when the current
+ // value is known
+ assert(!(peer_id == entity_name_t::NEW &&
+ peer_name.num() != entity_name_t::NEW));
+ // it is not allowed to assign a different known value when the
+ // current value is also known.
+ assert(!(peer_id != entity_name_t::NEW &&
+ peer_name.num() != entity_name_t::NEW &&
+ peer_id != peer_name.num()));
+ peer_name._num = peer_id;
+ }
+ void set_peer_name(entity_name_t name) {
+ set_peer_type(name.type());
+ set_peer_id(name.num());
+ }
+
+ public:
+ uint64_t peer_global_id = 0;
+
+ protected:
+ uint64_t features = 0;
+
+ public:
+ void set_features(uint64_t new_features) {
+ features = new_features;
+ }
+ auto get_features() const {
+ return features;
+ }
+ bool has_feature(uint64_t f) const {
+ return features & f;
+ }
+
+ public:
+ Connection() {}
+ virtual ~Connection() {}
+
+#ifdef UNIT_TESTS_BUILT
+ Interceptor *interceptor = nullptr;
+#endif
+
+ virtual Messenger* get_messenger() const = 0;
+ const entity_addr_t& get_peer_addr() const { return peer_addr; }
+ const entity_addrvec_t get_peer_addrs() const {
+ return entity_addrvec_t(peer_addr);
+ }
+ const auto& get_peer_socket_addr() const {
+ return target_addr;
+ }
+ const entity_name_t& get_peer_name() const { return peer_name; }
+ entity_type_t get_peer_type() const { return peer_name.type(); }
+ int64_t get_peer_id() const { return peer_name.num(); }
+
+ bool peer_is_mon() const { return peer_name.is_mon(); }
+ bool peer_is_mgr() const { return peer_name.is_mgr(); }
+ bool peer_is_mds() const { return peer_name.is_mds(); }
+ bool peer_is_osd() const { return peer_name.is_osd(); }
+ bool peer_is_client() const { return peer_name.is_client(); }
+
+ /// true if the handshake has completed and no errors have been encountered
+ virtual bool is_connected() const = 0;
+
+#ifdef UNIT_TESTS_BUILT
+ virtual bool is_closed() const = 0;
+
+ virtual bool is_closed_clean() const = 0;
+
+ virtual bool peer_wins() const = 0;
+#endif
+
+ /// send a message over a connection that has completed its handshake
+ virtual seastar::future<> send(MessageRef msg) = 0;
+
+ /// send a keepalive message over a connection that has completed its
+ /// handshake
+ virtual seastar::future<> keepalive() = 0;
+
+ // close the connection and cancel any pending futures from read/send,
+ // without dispatching any reset event
+ virtual void mark_down() = 0;
+
+ virtual void print(ostream& out) const = 0;
+
+ void set_last_keepalive(clock_t::time_point when) {
+ last_keepalive = when;
+ }
+ void set_last_keepalive_ack(clock_t::time_point when) {
+ last_keepalive_ack = when;
+ }
+ auto get_last_keepalive() const { return last_keepalive; }
+ auto get_last_keepalive_ack() const { return last_keepalive_ack; }
+
+ struct user_private_t {
+ virtual ~user_private_t() = default;
+ };
+private:
+ unique_ptr<user_private_t> user_private;
+public:
+ bool has_user_private() const {
+ return user_private != nullptr;
+ }
+ void set_user_private(unique_ptr<user_private_t> new_user_private) {
+ user_private = std::move(new_user_private);
+ }
+ user_private_t &get_user_private() {
+ ceph_assert(user_private);
+ return *user_private;
+ }
+};
+
+inline ostream& operator<<(ostream& out, const Connection& conn) {
+ out << "[";
+ conn.print(out);
+ out << "]";
+ return out;
+}
+
+} // namespace crimson::net
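
Not part of the patch: Connection::user_private_t is the hook for attaching
per-connection state. A short sketch; the PriorityState payload is
hypothetical.

#include <memory>

#include "crimson/net/Connection.h"

struct PriorityState : crimson::net::Connection::user_private_t {
  unsigned backlog = 0;
};

void bump_backlog(crimson::net::Connection& conn)
{
  if (!conn.has_user_private()) {
    conn.set_user_private(std::make_unique<PriorityState>());
  }
  // get_user_private() asserts the pointer is set, which is guaranteed above
  auto& state = static_cast<PriorityState&>(conn.get_user_private());
  ++state.backlog;
}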
diff --git a/src/crimson/net/Dispatcher.h b/src/crimson/net/Dispatcher.h
new file mode 100644
index 000000000..cc6fd4574
--- /dev/null
+++ b/src/crimson/net/Dispatcher.h
@@ -0,0 +1,46 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "Fwd.h"
+
+class AuthAuthorizer;
+
+namespace crimson::net {
+
+class Dispatcher {
+ public:
+ virtual ~Dispatcher() {}
+
+ // Dispatchers are put into a chain as described by chain-of-responsibility
+ // pattern. If any of the dispatchers claims this message, it returns a valid
+ // future to prevent other dispatchers from processing it, and this is also
+ // used to throttle the connection if it's too busy.
+ virtual std::optional<seastar::future<>> ms_dispatch(ConnectionRef, MessageRef) = 0;
+
+ virtual void ms_handle_accept(ConnectionRef conn) {}
+
+ virtual void ms_handle_connect(ConnectionRef conn) {}
+
+ // a reset event is dispatched when the connection is closed unexpectedly.
+ // is_replace=true means the reset connection is going to be replaced by
+ // another accepting connection with the same peer_addr, which currently only
+ // happens under lossy policy when both sides wish to connect to each other.
+ virtual void ms_handle_reset(ConnectionRef conn, bool is_replace) {}
+
+ virtual void ms_handle_remote_reset(ConnectionRef conn) {}
+};
+
+} // namespace crimson::net
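
Not part of the patch: a minimal dispatcher following the
chain-of-responsibility contract above. Returning std::nullopt passes the
message to the next dispatcher; returning a valid future claims it. The
ping-reply behaviour is illustrative.

#include <optional>

#include "crimson/net/Dispatcher.h"
#include "messages/MPing.h"

class PingDispatcher final : public crimson::net::Dispatcher {
  std::optional<seastar::future<>>
  ms_dispatch(crimson::net::ConnectionRef conn, MessageRef m) override {
    if (m->get_type() != CEPH_MSG_PING) {
      return std::nullopt;   // not ours; pass it down the chain
    }
    // claiming the message: the returned future also throttles the
    // connection until the reply has been queued
    return conn->send(make_message<MPing>());
  }
};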
diff --git a/src/crimson/net/Errors.cc b/src/crimson/net/Errors.cc
new file mode 100644
index 000000000..d07c090db
--- /dev/null
+++ b/src/crimson/net/Errors.cc
@@ -0,0 +1,51 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "Errors.h"
+
+namespace crimson::net {
+
+const std::error_category& net_category()
+{
+ struct category : public std::error_category {
+ const char* name() const noexcept override {
+ return "crimson::net";
+ }
+
+ std::string message(int ev) const override {
+ switch (static_cast<error>(ev)) {
+ case error::success:
+ return "success";
+ case error::bad_connect_banner:
+ return "bad connect banner";
+ case error::bad_peer_address:
+ return "bad peer address";
+ case error::negotiation_failure:
+ return "negotiation failure";
+ case error::read_eof:
+ return "read eof";
+ case error::corrupted_message:
+ return "corrupted message";
+ case error::protocol_aborted:
+ return "protocol aborted";
+ default:
+ return "unknown";
+ }
+ }
+ };
+ static category instance;
+ return instance;
+}
+
+} // namespace crimson::net
diff --git a/src/crimson/net/Errors.h b/src/crimson/net/Errors.h
new file mode 100644
index 000000000..3a17a103a
--- /dev/null
+++ b/src/crimson/net/Errors.h
@@ -0,0 +1,53 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <system_error>
+
+namespace crimson::net {
+
+/// net error codes
+enum class error {
+ success = 0,
+ bad_connect_banner,
+ bad_peer_address,
+ negotiation_failure,
+ read_eof,
+ corrupted_message,
+ protocol_aborted,
+};
+
+/// net error category
+const std::error_category& net_category();
+
+inline std::error_code make_error_code(error e)
+{
+ return {static_cast<int>(e), net_category()};
+}
+
+inline std::error_condition make_error_condition(error e)
+{
+ return {static_cast<int>(e), net_category()};
+}
+
+} // namespace crimson::net
+
+namespace std {
+
+/// enables implicit conversion to std::error_condition
+template <>
+struct is_error_condition_enum<crimson::net::error> : public true_type {};
+
+} // namespace std
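
Not part of the patch: because is_error_condition_enum is specialized above,
a std::error_code can be compared against crimson::net::error directly, which
is how Protocol.cc later in this change filters expected faults. A tiny
sketch:

#include <system_error>

#include "crimson/net/Errors.h"

bool is_negotiation_failure(const std::system_error& e)
{
  // the enum converts to std::error_condition through net_category(),
  // so expected protocol faults can be filtered with a plain comparison
  return e.code() == crimson::net::error::negotiation_failure;
}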
diff --git a/src/crimson/net/Fwd.h b/src/crimson/net/Fwd.h
new file mode 100644
index 000000000..e10120571
--- /dev/null
+++ b/src/crimson/net/Fwd.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <boost/container/small_vector.hpp>
+#include <seastar/core/future.hh>
+#include <seastar/core/future-util.hh>
+#include <seastar/core/shared_ptr.hh>
+#include <seastar/core/sharded.hh>
+
+#include "msg/Connection.h"
+#include "msg/MessageRef.h"
+#include "msg/msg_types.h"
+
+#include "crimson/common/errorator.h"
+
+using auth_proto_t = int;
+
+class AuthConnectionMeta;
+using AuthConnectionMetaRef = seastar::lw_shared_ptr<AuthConnectionMeta>;
+
+namespace crimson::net {
+
+using msgr_tag_t = uint8_t;
+using stop_t = seastar::stop_iteration;
+
+class Connection;
+using ConnectionRef = seastar::shared_ptr<Connection>;
+
+class Dispatcher;
+class ChainedDispatchers;
+constexpr std::size_t NUM_DISPATCHERS = 4u;
+using dispatchers_t = boost::container::small_vector<Dispatcher*, NUM_DISPATCHERS>;
+
+class Messenger;
+using MessengerRef = seastar::shared_ptr<Messenger>;
+
+} // namespace crimson::net
diff --git a/src/crimson/net/Interceptor.h b/src/crimson/net/Interceptor.h
new file mode 100644
index 000000000..dfa2183ec
--- /dev/null
+++ b/src/crimson/net/Interceptor.h
@@ -0,0 +1,165 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <variant>
+#include <seastar/core/sharded.hh>
+#include <seastar/core/sleep.hh>
+
+#include "Fwd.h"
+#include "msg/async/frames_v2.h"
+
+namespace crimson::net {
+
+enum class custom_bp_t : uint8_t {
+ BANNER_WRITE = 0,
+ BANNER_READ,
+ BANNER_PAYLOAD_READ,
+ SOCKET_CONNECTING,
+ SOCKET_ACCEPTED
+};
+inline const char* get_bp_name(custom_bp_t bp) {
+ uint8_t index = static_cast<uint8_t>(bp);
+ static const char *const bp_names[] = {"BANNER_WRITE",
+ "BANNER_READ",
+ "BANNER_PAYLOAD_READ",
+ "SOCKET_CONNECTING",
+ "SOCKET_ACCEPTED"};
+ assert(index < std::size(bp_names));
+ return bp_names[index];
+}
+
+enum class bp_type_t {
+ READ = 0,
+ WRITE
+};
+
+enum class bp_action_t {
+ CONTINUE = 0,
+ FAULT,
+ BLOCK,
+ STALL
+};
+
+inline std::ostream& operator<<(std::ostream& out, const bp_action_t& action) {
+ static const char *const action_names[] = {"CONTINUE",
+ "FAULT",
+ "BLOCK",
+ "STALL"};
+ assert(static_cast<size_t>(action) < std::size(action_names));
+ return out << action_names[static_cast<size_t>(action)];
+}
+
+class socket_blocker {
+ std::optional<seastar::abort_source> p_blocked;
+ std::optional<seastar::abort_source> p_unblocked;
+
+ public:
+ seastar::future<> wait_blocked() {
+ ceph_assert(!p_blocked);
+ if (p_unblocked) {
+ return seastar::make_ready_future<>();
+ } else {
+ p_blocked = seastar::abort_source();
+ return seastar::sleep_abortable(10s, *p_blocked).then([] {
+ throw std::runtime_error(
+ "Timeout (10s) in socket_blocker::wait_blocked()");
+ }).handle_exception_type([] (const seastar::sleep_aborted& e) {
+ // wait done!
+ });
+ }
+ }
+
+ seastar::future<> block() {
+ if (p_blocked) {
+ p_blocked->request_abort();
+ p_blocked = std::nullopt;
+ }
+ ceph_assert(!p_unblocked);
+ p_unblocked = seastar::abort_source();
+ return seastar::sleep_abortable(10s, *p_unblocked).then([] {
+ ceph_abort("Timeout (10s) in socket_blocker::block()");
+ }).handle_exception_type([] (const seastar::sleep_aborted& e) {
+ // wait done!
+ });
+ }
+
+ void unblock() {
+ ceph_assert(!p_blocked);
+ ceph_assert(p_unblocked);
+ p_unblocked->request_abort();
+ p_unblocked = std::nullopt;
+ }
+};
+
+struct tag_bp_t {
+ ceph::msgr::v2::Tag tag;
+ bp_type_t type;
+ bool operator==(const tag_bp_t& x) const {
+ return tag == x.tag && type == x.type;
+ }
+ bool operator!=(const tag_bp_t& x) const { return !operator==(x); }
+ bool operator<(const tag_bp_t& x) const {
+ return std::tie(tag, type) < std::tie(x.tag, x.type);
+ }
+};
+
+struct Breakpoint {
+ using var_t = std::variant<custom_bp_t, tag_bp_t>;
+ var_t bp;
+ Breakpoint(custom_bp_t bp) : bp(bp) { }
+ Breakpoint(ceph::msgr::v2::Tag tag, bp_type_t type)
+ : bp(tag_bp_t{tag, type}) { }
+ bool operator==(const Breakpoint& x) const { return bp == x.bp; }
+ bool operator!=(const Breakpoint& x) const { return !operator==(x); }
+ bool operator==(const custom_bp_t& x) const { return bp == var_t(x); }
+ bool operator!=(const custom_bp_t& x) const { return !operator==(x); }
+ bool operator==(const tag_bp_t& x) const { return bp == var_t(x); }
+ bool operator!=(const tag_bp_t& x) const { return !operator==(x); }
+ bool operator<(const Breakpoint& x) const { return bp < x.bp; }
+};
+
+inline std::ostream& operator<<(std::ostream& out, const Breakpoint& bp) {
+ if (auto custom_bp = std::get_if<custom_bp_t>(&bp.bp)) {
+ return out << get_bp_name(*custom_bp);
+ } else {
+ auto tag_bp = std::get<tag_bp_t>(bp.bp);
+ static const char *const tag_names[] = {"NONE",
+ "HELLO",
+ "AUTH_REQUEST",
+ "AUTH_BAD_METHOD",
+ "AUTH_REPLY_MORE",
+ "AUTH_REQUEST_MORE",
+ "AUTH_DONE",
+ "AUTH_SIGNATURE",
+ "CLIENT_IDENT",
+ "SERVER_IDENT",
+ "IDENT_MISSING_FEATURES",
+ "SESSION_RECONNECT",
+ "SESSION_RESET",
+ "SESSION_RETRY",
+ "SESSION_RETRY_GLOBAL",
+ "SESSION_RECONNECT_OK",
+ "WAIT",
+ "MESSAGE",
+ "KEEPALIVE2",
+ "KEEPALIVE2_ACK",
+ "ACK"};
+ assert(static_cast<size_t>(tag_bp.tag) < std::size(tag_names));
+ return out << tag_names[static_cast<size_t>(tag_bp.tag)]
+ << (tag_bp.type == bp_type_t::WRITE ? "_WRITE" : "_READ");
+ }
+}
+
+struct Interceptor {
+ socket_blocker blocker;
+ virtual ~Interceptor() {}
+ virtual void register_conn(Connection& conn) = 0;
+ virtual void register_conn_ready(Connection& conn) = 0;
+ virtual void register_conn_closed(Connection& conn) = 0;
+ virtual void register_conn_replaced(Connection& conn) = 0;
+ virtual bp_action_t intercept(Connection& conn, Breakpoint bp) = 0;
+};
+
+} // namespace crimson::net
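
Not part of the patch: a sketch of a test-only Interceptor that stalls the
first CLIENT_IDENT write and lets everything else continue; the stalling
policy is illustrative.

#include "crimson/net/Interceptor.h"

namespace crimson::net {

struct StallClientIdent final : Interceptor {
  bool fired = false;

  void register_conn(Connection&) override {}
  void register_conn_ready(Connection&) override {}
  void register_conn_closed(Connection&) override {}
  void register_conn_replaced(Connection&) override {}

  bp_action_t intercept(Connection&, Breakpoint bp) override {
    if (!fired &&
        bp == Breakpoint{ceph::msgr::v2::Tag::CLIENT_IDENT, bp_type_t::WRITE}) {
      fired = true;
      return bp_action_t::STALL;   // the test side can sync on blocker.wait_blocked()
    }
    return bp_action_t::CONTINUE;
  }
};

} // namespace crimson::net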
diff --git a/src/crimson/net/Messenger.cc b/src/crimson/net/Messenger.cc
new file mode 100644
index 000000000..aab476f7a
--- /dev/null
+++ b/src/crimson/net/Messenger.cc
@@ -0,0 +1,17 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Messenger.h"
+#include "SocketMessenger.h"
+
+namespace crimson::net {
+
+MessengerRef
+Messenger::create(const entity_name_t& name,
+ const std::string& lname,
+ const uint64_t nonce)
+{
+ return seastar::make_shared<SocketMessenger>(name, lname, nonce);
+}
+
+} // namespace crimson::net
diff --git a/src/crimson/net/Messenger.h b/src/crimson/net/Messenger.h
new file mode 100644
index 000000000..2b39fbf63
--- /dev/null
+++ b/src/crimson/net/Messenger.h
@@ -0,0 +1,154 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "Fwd.h"
+#include "crimson/common/throttle.h"
+#include "msg/Message.h"
+#include "msg/Policy.h"
+
+class AuthAuthorizer;
+
+namespace crimson::auth {
+class AuthClient;
+class AuthServer;
+}
+
+namespace crimson::net {
+
+#ifdef UNIT_TESTS_BUILT
+class Interceptor;
+#endif
+
+using Throttle = crimson::common::Throttle;
+using SocketPolicy = ceph::net::Policy<Throttle>;
+
+class Messenger {
+ entity_name_t my_name;
+ entity_addrvec_t my_addrs;
+ uint32_t crc_flags = 0;
+ crimson::auth::AuthClient* auth_client = nullptr;
+ crimson::auth::AuthServer* auth_server = nullptr;
+ bool require_authorizer = true;
+
+public:
+ Messenger(const entity_name_t& name)
+ : my_name(name)
+ {}
+ virtual ~Messenger() {}
+
+#ifdef UNIT_TESTS_BUILT
+ Interceptor *interceptor = nullptr;
+#endif
+
+ entity_type_t get_mytype() const { return my_name.type(); }
+ const entity_name_t& get_myname() const { return my_name; }
+ const entity_addrvec_t& get_myaddrs() const { return my_addrs; }
+ entity_addr_t get_myaddr() const { return my_addrs.front(); }
+ virtual seastar::future<> set_myaddrs(const entity_addrvec_t& addrs) {
+ my_addrs = addrs;
+ return seastar::now();
+ }
+
+ using bind_ertr = crimson::errorator<
+ crimson::ct_error::address_in_use // The address (range) is already bound
+ >;
+ /// bind to the given address
+ virtual bind_ertr::future<> bind(const entity_addrvec_t& addr) = 0;
+
+ /// try to bind to the first unused port of given address
+ virtual bind_ertr::future<> try_bind(const entity_addrvec_t& addr,
+ uint32_t min_port, uint32_t max_port) = 0;
+
+ /// start the messenger
+ virtual seastar::future<> start(const dispatchers_t&) = 0;
+
+ /// either return an existing connection to the peer,
+ /// or a new pending connection
+ virtual ConnectionRef
+ connect(const entity_addr_t& peer_addr,
+ const entity_name_t& peer_name) = 0;
+
+ ConnectionRef
+ connect(const entity_addr_t& peer_addr,
+ const entity_type_t& peer_type) {
+ return connect(peer_addr, entity_name_t(peer_type, -1));
+ }
+
+ // wait for messenger shutdown
+ virtual seastar::future<> wait() = 0;
+
+ // stop dispatching events and messages
+ virtual void stop() = 0;
+
+ virtual bool is_started() const = 0;
+
+ // free internal resources before destruction; must be called after the
+ // messenger is stopped, and must also be called if it was bound.
+ virtual seastar::future<> shutdown() = 0;
+
+ uint32_t get_crc_flags() const {
+ return crc_flags;
+ }
+ void set_crc_data() {
+ crc_flags |= MSG_CRC_DATA;
+ }
+ void set_crc_header() {
+ crc_flags |= MSG_CRC_HEADER;
+ }
+
+ crimson::auth::AuthClient* get_auth_client() const { return auth_client; }
+ void set_auth_client(crimson::auth::AuthClient *ac) {
+ auth_client = ac;
+ }
+ crimson::auth::AuthServer* get_auth_server() const { return auth_server; }
+ void set_auth_server(crimson::auth::AuthServer *as) {
+ auth_server = as;
+ }
+
+ virtual void print(ostream& out) const = 0;
+
+ virtual SocketPolicy get_policy(entity_type_t peer_type) const = 0;
+
+ virtual SocketPolicy get_default_policy() const = 0;
+
+ virtual void set_default_policy(const SocketPolicy& p) = 0;
+
+ virtual void set_policy(entity_type_t peer_type, const SocketPolicy& p) = 0;
+
+ virtual void set_policy_throttler(entity_type_t peer_type, Throttle* throttle) = 0;
+
+ // allow unauthenticated connections. This is needed for compatibility with
+ // pre-nautilus OSDs, which do not authenticate the heartbeat sessions.
+ bool get_require_authorizer() const {
+ return require_authorizer;
+ }
+ void set_require_authorizer(bool r) {
+ require_authorizer = r;
+ }
+ static MessengerRef
+ create(const entity_name_t& name,
+ const std::string& lname,
+ const uint64_t nonce);
+};
+
+inline ostream& operator<<(ostream& out, const Messenger& msgr) {
+ out << "[";
+ msgr.print(out);
+ out << "]";
+ return out;
+}
+
+} // namespace crimson::net
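
Not part of the patch: a sketch of wiring up a client-side messenger with the
interface above. The nonce, entity name, and lossy default policy are
illustrative.

#include "crimson/net/Messenger.h"

seastar::future<> start_client_msgr(crimson::net::MessengerRef msgr,
                                    crimson::auth::AuthClient& auth_client,
                                    const crimson::net::dispatchers_t& dispatchers,
                                    entity_addr_t osd_addr)
{
  // e.g. msgr = Messenger::create(entity_name_t::CLIENT(-1), "client-msgr", nonce);
  msgr->set_auth_client(&auth_client);
  msgr->set_default_policy(crimson::net::SocketPolicy::lossy_client(0));
  return msgr->start(dispatchers).then([msgr, osd_addr] {
    // either an existing connection or a new pending one; sends queue up
    // until the handshake completes
    crimson::net::ConnectionRef conn =
      msgr->connect(osd_addr, CEPH_ENTITY_TYPE_OSD);
    return conn->keepalive();
  });
}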
diff --git a/src/crimson/net/Protocol.cc b/src/crimson/net/Protocol.cc
new file mode 100644
index 000000000..50b5c45a3
--- /dev/null
+++ b/src/crimson/net/Protocol.cc
@@ -0,0 +1,323 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Protocol.h"
+
+#include "auth/Auth.h"
+
+#include "crimson/common/log.h"
+#include "crimson/net/Errors.h"
+#include "crimson/net/chained_dispatchers.h"
+#include "crimson/net/Socket.h"
+#include "crimson/net/SocketConnection.h"
+#include "msg/Message.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_ms);
+ }
+}
+
+namespace crimson::net {
+
+Protocol::Protocol(proto_t type,
+ ChainedDispatchers& dispatchers,
+ SocketConnection& conn)
+ : proto_type(type),
+ dispatchers(dispatchers),
+ conn(conn),
+ auth_meta{seastar::make_lw_shared<AuthConnectionMeta>()}
+{}
+
+Protocol::~Protocol()
+{
+ ceph_assert(gate.is_closed());
+ assert(!exit_open);
+}
+
+void Protocol::close(bool dispatch_reset,
+ std::optional<std::function<void()>> f_accept_new)
+{
+ if (closed) {
+ // already closing
+ return;
+ }
+
+ bool is_replace = f_accept_new ? true : false;
+ logger().info("{} closing: reset {}, replace {}", conn,
+ dispatch_reset ? "yes" : "no",
+ is_replace ? "yes" : "no");
+
+ // atomic operations
+ closed = true;
+ trigger_close();
+ if (f_accept_new) {
+ (*f_accept_new)();
+ }
+ if (socket) {
+ socket->shutdown();
+ }
+ set_write_state(write_state_t::drop);
+ assert(!gate.is_closed());
+ auto gate_closed = gate.close();
+
+ if (dispatch_reset) {
+ dispatchers.ms_handle_reset(
+ seastar::static_pointer_cast<SocketConnection>(conn.shared_from_this()),
+ is_replace);
+ }
+
+ // asynchronous operations
+ assert(!close_ready.valid());
+ close_ready = std::move(gate_closed).then([this] {
+ if (socket) {
+ return socket->close();
+ } else {
+ return seastar::now();
+ }
+ }).then([this] {
+ logger().debug("{} closed!", conn);
+ on_closed();
+#ifdef UNIT_TESTS_BUILT
+ is_closed_clean = true;
+ if (conn.interceptor) {
+ conn.interceptor->register_conn_closed(conn);
+ }
+#endif
+ }).handle_exception([conn_ref = conn.shared_from_this(), this] (auto eptr) {
+ logger().error("{} closing: close_ready got unexpected exception {}", conn, eptr);
+ ceph_abort();
+ });
+}
+
+seastar::future<> Protocol::send(MessageRef msg)
+{
+ if (write_state != write_state_t::drop) {
+ conn.out_q.push_back(std::move(msg));
+ write_event();
+ }
+ return seastar::now();
+}
+
+seastar::future<> Protocol::keepalive()
+{
+ if (!need_keepalive) {
+ need_keepalive = true;
+ write_event();
+ }
+ return seastar::now();
+}
+
+void Protocol::notify_keepalive_ack(utime_t _keepalive_ack)
+{
+ logger().trace("{} got keepalive ack {}", conn, _keepalive_ack);
+ keepalive_ack = _keepalive_ack;
+ write_event();
+}
+
+void Protocol::notify_ack()
+{
+ if (!conn.policy.lossy) {
+ ++ack_left;
+ write_event();
+ }
+}
+
+void Protocol::requeue_sent()
+{
+ assert(write_state != write_state_t::open);
+ if (conn.sent.empty()) {
+ return;
+ }
+
+ conn.out_seq -= conn.sent.size();
+ logger().debug("{} requeue {} items, revert out_seq to {}",
+ conn, conn.sent.size(), conn.out_seq);
+ for (MessageRef& msg : conn.sent) {
+ msg->clear_payload();
+ msg->set_seq(0);
+ }
+ conn.out_q.insert(conn.out_q.begin(),
+ std::make_move_iterator(conn.sent.begin()),
+ std::make_move_iterator(conn.sent.end()));
+ conn.sent.clear();
+ write_event();
+}
+
+void Protocol::requeue_up_to(seq_num_t seq)
+{
+ assert(write_state != write_state_t::open);
+ if (conn.sent.empty() && conn.out_q.empty()) {
+ logger().debug("{} nothing to requeue, reset out_seq from {} to seq {}",
+ conn, conn.out_seq, seq);
+ conn.out_seq = seq;
+ return;
+ }
+ logger().debug("{} discarding sent items by seq {} (sent_len={}, out_seq={})",
+ conn, seq, conn.sent.size(), conn.out_seq);
+ while (!conn.sent.empty()) {
+ auto cur_seq = conn.sent.front()->get_seq();
+ if (cur_seq == 0 || cur_seq > seq) {
+ break;
+ } else {
+ conn.sent.pop_front();
+ }
+ }
+ requeue_sent();
+}
+
+void Protocol::reset_write()
+{
+ assert(write_state != write_state_t::open);
+ conn.out_seq = 0;
+ conn.out_q.clear();
+ conn.sent.clear();
+ need_keepalive = false;
+ keepalive_ack = std::nullopt;
+ ack_left = 0;
+}
+
+void Protocol::ack_writes(seq_num_t seq)
+{
+ if (conn.policy.lossy) { // lossy connections don't keep sent messages
+ return;
+ }
+ while (!conn.sent.empty() && conn.sent.front()->get_seq() <= seq) {
+ logger().trace("{} got ack seq {} >= {}, pop {}",
+ conn, seq, conn.sent.front()->get_seq(), conn.sent.front());
+ conn.sent.pop_front();
+ }
+}
+
+seastar::future<stop_t> Protocol::try_exit_sweep() {
+ assert(!is_queued());
+ return socket->flush().then([this] {
+ if (!is_queued()) {
+ // still nothing pending to send after flush,
+ // the dispatching can ONLY stop now
+ ceph_assert(write_dispatching);
+ write_dispatching = false;
+ if (unlikely(exit_open.has_value())) {
+ exit_open->set_value();
+ exit_open = std::nullopt;
+ logger().info("{} write_event: nothing queued at {},"
+ " set exit_open",
+ conn, get_state_name(write_state));
+ }
+ return seastar::make_ready_future<stop_t>(stop_t::yes);
+ } else {
+ // something is pending to send during flushing
+ return seastar::make_ready_future<stop_t>(stop_t::no);
+ }
+ });
+}
+
+seastar::future<> Protocol::do_write_dispatch_sweep()
+{
+ return seastar::repeat([this] {
+ switch (write_state) {
+ case write_state_t::open: {
+ size_t num_msgs = conn.out_q.size();
+ bool still_queued = is_queued();
+ if (unlikely(!still_queued)) {
+ return try_exit_sweep();
+ }
+ conn.pending_q.clear();
+ conn.pending_q.swap(conn.out_q);
+ if (!conn.policy.lossy) {
+ conn.sent.insert(conn.sent.end(),
+ conn.pending_q.begin(),
+ conn.pending_q.end());
+ }
+ auto acked = ack_left;
+ assert(acked == 0 || conn.in_seq > 0);
+ // sweep all pending writes with the concrete Protocol
+ return socket->write(do_sweep_messages(
+ conn.pending_q, num_msgs, need_keepalive, keepalive_ack, acked > 0)
+ ).then([this, prv_keepalive_ack=keepalive_ack, acked] {
+ need_keepalive = false;
+ if (keepalive_ack == prv_keepalive_ack) {
+ keepalive_ack = std::nullopt;
+ }
+ assert(ack_left >= acked);
+ ack_left -= acked;
+ if (!is_queued()) {
+ return try_exit_sweep();
+ } else {
+ // messages were enqueued during socket write
+ return seastar::make_ready_future<stop_t>(stop_t::no);
+ }
+ });
+ }
+ case write_state_t::delay:
+ // delay dispatching writes until open
+ if (exit_open) {
+ exit_open->set_value();
+ exit_open = std::nullopt;
+ logger().info("{} write_event: delay and set exit_open ...", conn);
+ } else {
+ logger().info("{} write_event: delay ...", conn);
+ }
+ return state_changed.get_shared_future()
+ .then([] { return stop_t::no; });
+ case write_state_t::drop:
+ ceph_assert(write_dispatching);
+ write_dispatching = false;
+ if (exit_open) {
+ exit_open->set_value();
+ exit_open = std::nullopt;
+ logger().info("{} write_event: dropped and set exit_open", conn);
+ } else {
+ logger().info("{} write_event: dropped", conn);
+ }
+ return seastar::make_ready_future<stop_t>(stop_t::yes);
+ default:
+ ceph_assert(false);
+ }
+ }).handle_exception_type([this] (const std::system_error& e) {
+ if (e.code() != std::errc::broken_pipe &&
+ e.code() != std::errc::connection_reset &&
+ e.code() != error::negotiation_failure) {
+ logger().error("{} write_event(): unexpected error at {} -- {}",
+ conn, get_state_name(write_state), e);
+ ceph_abort();
+ }
+ socket->shutdown();
+ if (write_state == write_state_t::open) {
+ logger().info("{} write_event(): fault at {}, going to delay -- {}",
+ conn, get_state_name(write_state), e);
+ write_state = write_state_t::delay;
+ } else {
+ logger().info("{} write_event(): fault at {} -- {}",
+ conn, get_state_name(write_state), e);
+ }
+ return do_write_dispatch_sweep();
+ });
+}
+
+void Protocol::write_event()
+{
+ notify_write();
+ if (write_dispatching) {
+ // already dispatching
+ return;
+ }
+ write_dispatching = true;
+ switch (write_state) {
+ case write_state_t::open:
+ [[fallthrough]];
+ case write_state_t::delay:
+ assert(!gate.is_closed());
+ gate.dispatch_in_background("do_write_dispatch_sweep", *this, [this] {
+ return do_write_dispatch_sweep();
+ });
+ return;
+ case write_state_t::drop:
+ write_dispatching = false;
+ return;
+ default:
+ ceph_assert(false);
+ }
+}
+
+} // namespace crimson::net
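
Not part of the patch: do_write_dispatch_sweep() above is built on the
seastar::repeat / stop_iteration idiom, i.e. keep sweeping while work remains
and stop once a flush finds the queue empty. The same loop reduced to
draining a plain queue, as a standalone sketch:

#include <deque>
#include <functional>

#include <seastar/core/future.hh>
#include <seastar/core/future-util.hh>

using stop_t = seastar::stop_iteration;

seastar::future<> drain(std::deque<int>& q,
                        std::function<seastar::future<>(int)> write_one)
{
  return seastar::repeat([&q, write_one] {
    if (q.empty()) {
      // nothing left after the previous write completed: stop dispatching
      return seastar::make_ready_future<stop_t>(stop_t::yes);
    }
    int item = q.front();
    q.pop_front();
    return write_one(item).then([] {
      // more items may have been queued while the write was in flight
      return stop_t::no;
    });
  });
}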
diff --git a/src/crimson/net/Protocol.h b/src/crimson/net/Protocol.h
new file mode 100644
index 000000000..dc4e4f2af
--- /dev/null
+++ b/src/crimson/net/Protocol.h
@@ -0,0 +1,173 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/gate.hh>
+#include <seastar/core/shared_future.hh>
+
+#include "crimson/common/gated.h"
+#include "crimson/common/log.h"
+#include "Fwd.h"
+#include "SocketConnection.h"
+
+namespace crimson::net {
+
+class Protocol {
+ public:
+ enum class proto_t {
+ none,
+ v1,
+ v2
+ };
+
+ Protocol(Protocol&&) = delete;
+ virtual ~Protocol();
+
+ virtual bool is_connected() const = 0;
+
+#ifdef UNIT_TESTS_BUILT
+ bool is_closed_clean = false;
+ bool is_closed() const { return closed; }
+#endif
+
+ // Reentrant closing
+ void close(bool dispatch_reset, std::optional<std::function<void()>> f_accept_new=std::nullopt);
+ seastar::future<> close_clean(bool dispatch_reset) {
+ close(dispatch_reset);
+ // this can happen if close_clean() is called inside Dispatcher::ms_handle_reset(),
+ // which would otherwise result in a deadlock
+ assert(close_ready.valid());
+ return close_ready.get_future();
+ }
+
+ virtual void start_connect(const entity_addr_t& peer_addr,
+ const entity_name_t& peer_name) = 0;
+
+ virtual void start_accept(SocketRef&& socket,
+ const entity_addr_t& peer_addr) = 0;
+
+ virtual void print(std::ostream&) const = 0;
+ protected:
+ Protocol(proto_t type,
+ ChainedDispatchers& dispatchers,
+ SocketConnection& conn);
+
+ virtual void trigger_close() = 0;
+
+ virtual ceph::bufferlist do_sweep_messages(
+ const std::deque<MessageRef>& msgs,
+ size_t num_msgs,
+ bool require_keepalive,
+ std::optional<utime_t> keepalive_ack,
+ bool require_ack) = 0;
+
+ virtual void notify_write() {};
+
+ virtual void on_closed() {}
+
+ public:
+ const proto_t proto_type;
+ SocketRef socket;
+
+ protected:
+ ChainedDispatchers& dispatchers;
+ SocketConnection &conn;
+
+ AuthConnectionMetaRef auth_meta;
+
+ private:
+ bool closed = false;
+ // becomes valid only after closed == true
+ seastar::shared_future<> close_ready;
+
+// the write state-machine
+ public:
+ seastar::future<> send(MessageRef msg);
+ seastar::future<> keepalive();
+
+// TODO: encapsulate a SessionedSender class
+ protected:
+ // write_state is changed atomically along with the protocol state, indicating
+ // the write behavior of the corresponding state.
+ enum class write_state_t : uint8_t {
+ none,
+ delay,
+ open,
+ drop
+ };
+
+ static const char* get_state_name(write_state_t state) {
+ uint8_t index = static_cast<uint8_t>(state);
+ static const char *const state_names[] = {"none",
+ "delay",
+ "open",
+ "drop"};
+ assert(index < std::size(state_names));
+ return state_names[index];
+ }
+
+ void set_write_state(const write_state_t& state) {
+ if (write_state == write_state_t::open &&
+ state != write_state_t::open &&
+ write_dispatching) {
+ exit_open = seastar::shared_promise<>();
+ }
+ write_state = state;
+ state_changed.set_value();
+ state_changed = seastar::shared_promise<>();
+ }
+
+ seastar::future<> wait_write_exit() {
+ if (exit_open) {
+ return exit_open->get_shared_future();
+ }
+ return seastar::now();
+ }
+
+ void notify_keepalive_ack(utime_t keepalive_ack);
+
+ void notify_ack();
+
+ void requeue_up_to(seq_num_t seq);
+
+ void requeue_sent();
+
+ void reset_write();
+
+ bool is_queued() const {
+ return (!conn.out_q.empty() ||
+ ack_left > 0 ||
+ need_keepalive ||
+ keepalive_ack.has_value());
+ }
+
+ void ack_writes(seq_num_t seq);
+ crimson::common::Gated gate;
+
+ private:
+ write_state_t write_state = write_state_t::none;
+ // wait until current state changed
+ seastar::shared_promise<> state_changed;
+
+ bool need_keepalive = false;
+ std::optional<utime_t> keepalive_ack = std::nullopt;
+ uint64_t ack_left = 0;
+ bool write_dispatching = false;
+ // If another continuation is trying to close or replace socket when
+ // write_dispatching is true and write_state is open,
+ // it needs to wait for exit_open until writing is stopped or failed.
+ std::optional<seastar::shared_promise<>> exit_open;
+
+ seastar::future<stop_t> try_exit_sweep();
+ seastar::future<> do_write_dispatch_sweep();
+ void write_event();
+};
+
+inline std::ostream& operator<<(std::ostream& out, const Protocol& proto) {
+ proto.print(out);
+ return out;
+}
+
+
+} // namespace crimson::net
diff --git a/src/crimson/net/ProtocolV1.cc b/src/crimson/net/ProtocolV1.cc
new file mode 100644
index 000000000..3c604240d
--- /dev/null
+++ b/src/crimson/net/ProtocolV1.cc
@@ -0,0 +1,1014 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ProtocolV1.h"
+
+#include <seastar/core/shared_future.hh>
+#include <seastar/core/sleep.hh>
+#include <seastar/net/packet.hh>
+
+#include "include/msgr.h"
+#include "include/random.h"
+#include "auth/Auth.h"
+#include "auth/AuthSessionHandler.h"
+
+#include "crimson/auth/AuthClient.h"
+#include "crimson/auth/AuthServer.h"
+#include "crimson/common/log.h"
+#include "chained_dispatchers.h"
+#include "Errors.h"
+#include "Socket.h"
+#include "SocketConnection.h"
+#include "SocketMessenger.h"
+
+WRITE_RAW_ENCODER(ceph_msg_connect);
+WRITE_RAW_ENCODER(ceph_msg_connect_reply);
+
+using crimson::common::local_conf;
+
+std::ostream& operator<<(std::ostream& out, const ceph_msg_connect& c)
+{
+ return out << "connect{features=" << std::hex << c.features << std::dec
+ << " host_type=" << c.host_type
+ << " global_seq=" << c.global_seq
+ << " connect_seq=" << c.connect_seq
+ << " protocol_version=" << c.protocol_version
+ << " authorizer_protocol=" << c.authorizer_protocol
+ << " authorizer_len=" << c.authorizer_len
+ << " flags=" << std::hex << static_cast<uint16_t>(c.flags) << std::dec << '}';
+}
+
+std::ostream& operator<<(std::ostream& out, const ceph_msg_connect_reply& r)
+{
+ return out << "connect_reply{tag=" << static_cast<uint16_t>(r.tag)
+ << " features=" << std::hex << r.features << std::dec
+ << " global_seq=" << r.global_seq
+ << " connect_seq=" << r.connect_seq
+ << " protocol_version=" << r.protocol_version
+ << " authorizer_len=" << r.authorizer_len
+ << " flags=" << std::hex << static_cast<uint16_t>(r.flags) << std::dec << '}';
+}
+
+namespace {
+
+seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_ms);
+}
+
+template <typename T>
+seastar::net::packet make_static_packet(const T& value) {
+ return { reinterpret_cast<const char*>(&value), sizeof(value) };
+}
+
+// store the banner in a non-const string for buffer::create_static()
+char banner[] = CEPH_BANNER;
+constexpr size_t banner_size = sizeof(CEPH_BANNER)-1;
+
+constexpr size_t client_header_size = banner_size + sizeof(ceph_entity_addr);
+constexpr size_t server_header_size = banner_size + 2 * sizeof(ceph_entity_addr);
+
+// check that the buffer starts with a valid banner without requiring it to
+// be contiguous in memory
+void validate_banner(bufferlist::const_iterator& p)
+{
+ auto b = std::cbegin(banner);
+ auto end = b + banner_size;
+ while (b != end) {
+ const char *buf{nullptr};
+ auto remaining = std::distance(b, end);
+ auto len = p.get_ptr_and_advance(remaining, &buf);
+ if (!std::equal(buf, buf + len, b)) {
+ throw std::system_error(
+ make_error_code(crimson::net::error::bad_connect_banner));
+ }
+ b += len;
+ }
+}
+
+// return a static bufferptr to the given object
+template <typename T>
+bufferptr create_static(T& obj)
+{
+ return buffer::create_static(sizeof(obj), reinterpret_cast<char*>(&obj));
+}
+
+uint32_t get_proto_version(entity_type_t peer_type, bool connect)
+{
+ constexpr entity_type_t my_type = CEPH_ENTITY_TYPE_OSD;
+ // see also OSD.h; unlike other connections of the simple/async messenger,
+ // the crimson msgr is only used by the osd
+ constexpr uint32_t CEPH_OSD_PROTOCOL = 10;
+ if (peer_type == my_type) {
+ // internal
+ return CEPH_OSD_PROTOCOL;
+ } else {
+ // public
+ switch (connect ? peer_type : my_type) {
+ case CEPH_ENTITY_TYPE_OSD: return CEPH_OSDC_PROTOCOL;
+ case CEPH_ENTITY_TYPE_MDS: return CEPH_MDSC_PROTOCOL;
+ case CEPH_ENTITY_TYPE_MON: return CEPH_MONC_PROTOCOL;
+ default: return 0;
+ }
+ }
+}
+
+void discard_up_to(std::deque<MessageRef>* queue,
+ crimson::net::seq_num_t seq)
+{
+ while (!queue->empty() &&
+ queue->front()->get_seq() < seq) {
+ queue->pop_front();
+ }
+}
+
+} // namespace anonymous
+
+namespace crimson::net {
+
+ProtocolV1::ProtocolV1(ChainedDispatchers& dispatchers,
+ SocketConnection& conn,
+ SocketMessenger& messenger)
+ : Protocol(proto_t::v1, dispatchers, conn), messenger{messenger} {}
+
+ProtocolV1::~ProtocolV1() {}
+
+bool ProtocolV1::is_connected() const
+{
+ return state == state_t::open;
+}
+
+// connecting state
+
+void ProtocolV1::reset_session()
+{
+ conn.out_q = {};
+ conn.sent = {};
+ conn.in_seq = 0;
+ h.connect_seq = 0;
+ if (HAVE_FEATURE(conn.features, MSG_AUTH)) {
+ // Set out_seq to a random value, so CRC won't be predictable.
+ // Constant to limit starting sequence number to 2^31. Nothing special
+ // about it, just a big number.
+ constexpr uint64_t SEQ_MASK = 0x7fffffff;
+ conn.out_seq = ceph::util::generate_random_number<uint64_t>(0, SEQ_MASK);
+ } else {
+ // previously, seq #'s always started at 0.
+ conn.out_seq = 0;
+ }
+}
+
+seastar::future<stop_t>
+ProtocolV1::handle_connect_reply(msgr_tag_t tag)
+{
+ if (h.auth_payload.length() && !conn.peer_is_mon()) {
+ if (tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) { // more
+ h.auth_more = messenger.get_auth_client()->handle_auth_reply_more(
+ conn.shared_from_this(), auth_meta, h.auth_payload);
+ return seastar::make_ready_future<stop_t>(stop_t::no);
+ } else {
+ int ret = messenger.get_auth_client()->handle_auth_done(
+ conn.shared_from_this(), auth_meta, 0, 0, h.auth_payload);
+ if (ret < 0) {
+ // fault
+ logger().warn("{} AuthClient::handle_auth_done() return {}", conn, ret);
+ throw std::system_error(make_error_code(error::negotiation_failure));
+ }
+ }
+ }
+
+ switch (tag) {
+ case CEPH_MSGR_TAG_FEATURES:
+ logger().error("{} connect protocol feature mismatch", __func__);
+ throw std::system_error(make_error_code(error::negotiation_failure));
+ case CEPH_MSGR_TAG_BADPROTOVER:
+ logger().error("{} connect protocol version mismatch", __func__);
+ throw std::system_error(make_error_code(error::negotiation_failure));
+ case CEPH_MSGR_TAG_BADAUTHORIZER:
+ logger().error("{} got bad authorizer", __func__);
+ throw std::system_error(make_error_code(error::negotiation_failure));
+ case CEPH_MSGR_TAG_RESETSESSION:
+ reset_session();
+ return seastar::make_ready_future<stop_t>(stop_t::no);
+ case CEPH_MSGR_TAG_RETRY_GLOBAL:
+ return messenger.get_global_seq(h.reply.global_seq).then([this] (auto gs) {
+ h.global_seq = gs;
+ return seastar::make_ready_future<stop_t>(stop_t::no);
+ });
+ case CEPH_MSGR_TAG_RETRY_SESSION:
+ ceph_assert(h.reply.connect_seq > h.connect_seq);
+ h.connect_seq = h.reply.connect_seq;
+ return seastar::make_ready_future<stop_t>(stop_t::no);
+ case CEPH_MSGR_TAG_WAIT:
+ // TODO: state wait
+ throw std::system_error(make_error_code(error::negotiation_failure));
+ case CEPH_MSGR_TAG_SEQ:
+ case CEPH_MSGR_TAG_READY:
+ if (auto missing = (conn.policy.features_required & ~(uint64_t)h.reply.features);
+ missing) {
+ logger().error("{} missing required features", __func__);
+ throw std::system_error(make_error_code(error::negotiation_failure));
+ }
+ return seastar::futurize_invoke([this, tag] {
+ if (tag == CEPH_MSGR_TAG_SEQ) {
+ return socket->read_exactly(sizeof(seq_num_t))
+ .then([this] (auto buf) {
+ auto acked_seq = reinterpret_cast<const seq_num_t*>(buf.get());
+ discard_up_to(&conn.out_q, *acked_seq);
+ return socket->write_flush(make_static_packet(conn.in_seq));
+ });
+ }
+ // tag CEPH_MSGR_TAG_READY
+ return seastar::now();
+ }).then([this] {
+ // hooray!
+ h.peer_global_seq = h.reply.global_seq;
+ conn.policy.lossy = h.reply.flags & CEPH_MSG_CONNECT_LOSSY;
+ h.connect_seq++;
+ h.backoff = 0ms;
+ conn.set_features(h.reply.features & h.connect.features);
+ if (auth_meta->authorizer) {
+ session_security.reset(
+ get_auth_session_handler(nullptr,
+ auth_meta->authorizer->protocol,
+ auth_meta->session_key,
+ conn.features));
+ } else {
+ session_security.reset();
+ }
+ return seastar::make_ready_future<stop_t>(stop_t::yes);
+ });
+ break;
+ default:
+ // unknown tag
+ logger().error("{} got unknown tag {}", __func__, int(tag));
+ throw std::system_error(make_error_code(error::negotiation_failure));
+ }
+}
+
+ceph::bufferlist ProtocolV1::get_auth_payload()
+{
+ // only non-mons connecting to mons use MAuth messages
+ if (conn.peer_is_mon() &&
+ messenger.get_mytype() != CEPH_ENTITY_TYPE_MON) {
+ return {};
+ } else {
+ if (h.auth_more.length()) {
+ logger().info("using augmented (challenge) auth payload");
+ return std::move(h.auth_more);
+ } else {
+ auto [auth_method, preferred_modes, auth_bl] =
+ messenger.get_auth_client()->get_auth_request(
+ conn.shared_from_this(), auth_meta);
+ auth_meta->auth_method = auth_method;
+ return auth_bl;
+ }
+ }
+}
+
+seastar::future<stop_t>
+ProtocolV1::repeat_connect()
+{
+ // encode ceph_msg_connect
+ memset(&h.connect, 0, sizeof(h.connect));
+ h.connect.features = conn.policy.features_supported;
+ h.connect.host_type = messenger.get_myname().type();
+ h.connect.global_seq = h.global_seq;
+ h.connect.connect_seq = h.connect_seq;
+ h.connect.protocol_version = get_proto_version(conn.get_peer_type(), true);
+ // this is fyi, actually, server decides!
+ h.connect.flags = conn.policy.lossy ? CEPH_MSG_CONNECT_LOSSY : 0;
+
+ ceph_assert(messenger.get_auth_client());
+
+ bufferlist bl;
+ bufferlist auth_bl = get_auth_payload();
+ if (auth_bl.length()) {
+ h.connect.authorizer_protocol = auth_meta->auth_method;
+ h.connect.authorizer_len = auth_bl.length();
+ bl.append(create_static(h.connect));
+ bl.claim_append(auth_bl);
+ } else {
+ h.connect.authorizer_protocol = 0;
+ h.connect.authorizer_len = 0;
+ bl.append(create_static(h.connect));
+ };
+ return socket->write_flush(std::move(bl))
+ .then([this] {
+ // read the reply
+ return socket->read(sizeof(h.reply));
+ }).then([this] (bufferlist bl) {
+ auto p = bl.cbegin();
+ ::decode(h.reply, p);
+ ceph_assert(p.end());
+ return socket->read(h.reply.authorizer_len);
+ }).then([this] (bufferlist bl) {
+ h.auth_payload = std::move(bl);
+ return handle_connect_reply(h.reply.tag);
+ });
+}
+
+void ProtocolV1::start_connect(const entity_addr_t& _peer_addr,
+ const entity_name_t& _peer_name)
+{
+ ceph_assert(state == state_t::none);
+ logger().trace("{} trigger connecting, was {}", conn, static_cast<int>(state));
+ state = state_t::connecting;
+ set_write_state(write_state_t::delay);
+
+ ceph_assert(!socket);
+ ceph_assert(!gate.is_closed());
+ conn.peer_addr = _peer_addr;
+ conn.target_addr = _peer_addr;
+ conn.set_peer_name(_peer_name);
+ conn.policy = messenger.get_policy(_peer_name.type());
+ messenger.register_conn(
+ seastar::static_pointer_cast<SocketConnection>(conn.shared_from_this()));
+ gate.dispatch_in_background("start_connect", *this, [this] {
+ return Socket::connect(conn.peer_addr)
+ .then([this](SocketRef sock) {
+ socket = std::move(sock);
+ if (state != state_t::connecting) {
+ assert(state == state_t::closing);
+ return socket->close().then([] {
+ throw std::system_error(make_error_code(error::protocol_aborted));
+ });
+ }
+ return seastar::now();
+ }).then([this] {
+ return messenger.get_global_seq();
+ }).then([this] (auto gs) {
+ h.global_seq = gs;
+ // read server's handshake header
+ return socket->read(server_header_size);
+ }).then([this] (bufferlist headerbl) {
+ auto p = headerbl.cbegin();
+ validate_banner(p);
+ entity_addr_t saddr, caddr;
+ ::decode(saddr, p);
+ ::decode(caddr, p);
+ ceph_assert(p.end());
+ if (saddr != conn.peer_addr) {
+ logger().error("{} my peer_addr {} doesn't match what peer advertised {}",
+ conn, conn.peer_addr, saddr);
+ throw std::system_error(
+ make_error_code(crimson::net::error::bad_peer_address));
+ }
+ if (state != state_t::connecting) {
+ assert(state == state_t::closing);
+ throw std::system_error(make_error_code(error::protocol_aborted));
+ }
+ socket->learn_ephemeral_port_as_connector(caddr.get_port());
+ if (unlikely(caddr.is_msgr2())) {
+ logger().warn("{} peer sent a v2 address for me: {}",
+ conn, caddr);
+ throw std::system_error(
+ make_error_code(crimson::net::error::bad_peer_address));
+ }
+ caddr.set_type(entity_addr_t::TYPE_LEGACY);
+ return messenger.learned_addr(caddr, conn);
+ }).then([this] {
+ // encode/send client's handshake header
+ bufferlist bl;
+ bl.append(buffer::create_static(banner_size, banner));
+ ::encode(messenger.get_myaddr(), bl, 0);
+ return socket->write_flush(std::move(bl));
+ }).then([=] {
+ return seastar::repeat([this] {
+ return repeat_connect();
+ });
+ }).then([this] {
+ if (state != state_t::connecting) {
+ assert(state == state_t::closing);
+ throw std::system_error(make_error_code(error::protocol_aborted));
+ }
+ execute_open(open_t::connected);
+ }).handle_exception([this] (std::exception_ptr eptr) {
+ // TODO: handle fault in the connecting state
+ logger().warn("{} connecting fault: {}", conn, eptr);
+ close(true);
+ });
+ });
+}
+
+// accepting state
+
+seastar::future<stop_t> ProtocolV1::send_connect_reply(
+ msgr_tag_t tag, bufferlist&& authorizer_reply)
+{
+ h.reply.tag = tag;
+ h.reply.features = static_cast<uint64_t>((h.connect.features &
+ conn.policy.features_supported) |
+ conn.policy.features_required);
+ h.reply.authorizer_len = authorizer_reply.length();
+ return socket->write(make_static_packet(h.reply))
+ .then([this, reply=std::move(authorizer_reply)]() mutable {
+ return socket->write_flush(std::move(reply));
+ }).then([] {
+ return stop_t::no;
+ });
+}
+
+seastar::future<stop_t> ProtocolV1::send_connect_reply_ready(
+ msgr_tag_t tag, bufferlist&& authorizer_reply)
+{
+ return messenger.get_global_seq(
+ ).then([this, tag, auth_len = authorizer_reply.length()] (auto gs) {
+ h.global_seq = gs;
+ h.reply.tag = tag;
+ h.reply.features = conn.policy.features_supported;
+ h.reply.global_seq = h.global_seq;
+ h.reply.connect_seq = h.connect_seq;
+ h.reply.flags = 0;
+ if (conn.policy.lossy) {
+ h.reply.flags = h.reply.flags | CEPH_MSG_CONNECT_LOSSY;
+ }
+ h.reply.authorizer_len = auth_len;
+
+ session_security.reset(
+ get_auth_session_handler(nullptr,
+ auth_meta->auth_method,
+ auth_meta->session_key,
+ conn.features));
+
+ return socket->write(make_static_packet(h.reply));
+ }).then([this, reply=std::move(authorizer_reply)]() mutable {
+ if (reply.length()) {
+ return socket->write(std::move(reply));
+ } else {
+ return seastar::now();
+ }
+ }).then([this] {
+ if (h.reply.tag == CEPH_MSGR_TAG_SEQ) {
+ return socket->write_flush(make_static_packet(conn.in_seq))
+ .then([this] {
+ return socket->read_exactly(sizeof(seq_num_t));
+ }).then([this] (auto buf) {
+ auto acked_seq = reinterpret_cast<const seq_num_t*>(buf.get());
+ discard_up_to(&conn.out_q, *acked_seq);
+ });
+ } else {
+ return socket->flush();
+ }
+ }).then([] {
+ return stop_t::yes;
+ });
+}
+
+seastar::future<stop_t> ProtocolV1::replace_existing(
+ SocketConnectionRef existing,
+ bufferlist&& authorizer_reply,
+ bool is_reset_from_peer)
+{
+ msgr_tag_t reply_tag;
+ if (HAVE_FEATURE(h.connect.features, RECONNECT_SEQ) &&
+ !is_reset_from_peer) {
+ reply_tag = CEPH_MSGR_TAG_SEQ;
+ } else {
+ reply_tag = CEPH_MSGR_TAG_READY;
+ }
+ if (!existing->is_lossy()) {
+ // XXX: we decided not to support lossless connections in v1, as the
+ // client's default policy is
+ // Messenger::Policy::lossy_client(CEPH_FEATURE_OSDREPLYMUX), which is
+ // lossy; and by the time crimson-osd is generally available, in-cluster
+ // communication will all be performed using the v2 protocol.
+ ceph_abort("lossless policy not supported for v1");
+ }
+ existing->protocol->close(true);
+ return send_connect_reply_ready(reply_tag, std::move(authorizer_reply));
+}
+
+seastar::future<stop_t> ProtocolV1::handle_connect_with_existing(
+ SocketConnectionRef existing, bufferlist&& authorizer_reply)
+{
+ ProtocolV1 *exproto = dynamic_cast<ProtocolV1*>(existing->protocol.get());
+
+ if (h.connect.global_seq < exproto->peer_global_seq()) {
+ h.reply.global_seq = exproto->peer_global_seq();
+ return send_connect_reply(CEPH_MSGR_TAG_RETRY_GLOBAL);
+ } else if (existing->is_lossy()) {
+ return replace_existing(existing, std::move(authorizer_reply));
+ } else if (h.connect.connect_seq == 0 && exproto->connect_seq() > 0) {
+ return replace_existing(existing, std::move(authorizer_reply), true);
+ } else if (h.connect.connect_seq < exproto->connect_seq()) {
+ // old attempt, or we sent READY but they didn't get it.
+ h.reply.connect_seq = exproto->connect_seq() + 1;
+ return send_connect_reply(CEPH_MSGR_TAG_RETRY_SESSION);
+ } else if (h.connect.connect_seq == exproto->connect_seq()) {
+ // if the existing connection successfully opened, and/or
+ // subsequently went to standby, then the peer should bump
+ // their connect_seq and retry: this is not a connection race
+ // we need to resolve here.
+ if (exproto->get_state() == state_t::open ||
+ exproto->get_state() == state_t::standby) {
+ if (conn.policy.resetcheck && exproto->connect_seq() == 0) {
+ return replace_existing(existing, std::move(authorizer_reply));
+ } else {
+ h.reply.connect_seq = exproto->connect_seq() + 1;
+ return send_connect_reply(CEPH_MSGR_TAG_RETRY_SESSION);
+ }
+ } else if (existing->peer_wins()) {
+ return replace_existing(existing, std::move(authorizer_reply));
+ } else {
+ return send_connect_reply(CEPH_MSGR_TAG_WAIT);
+ }
+ } else if (conn.policy.resetcheck &&
+ exproto->connect_seq() == 0) {
+ return send_connect_reply(CEPH_MSGR_TAG_RESETSESSION);
+ } else {
+ return replace_existing(existing, std::move(authorizer_reply));
+ }
+}
+
+bool ProtocolV1::require_auth_feature() const
+{
+ if (h.connect.authorizer_protocol != CEPH_AUTH_CEPHX) {
+ return false;
+ }
+ if (local_conf()->cephx_require_signatures) {
+ return true;
+ }
+ if (h.connect.host_type == CEPH_ENTITY_TYPE_OSD ||
+ h.connect.host_type == CEPH_ENTITY_TYPE_MDS ||
+ h.connect.host_type == CEPH_ENTITY_TYPE_MGR) {
+ return local_conf()->cephx_cluster_require_signatures;
+ } else {
+ return local_conf()->cephx_service_require_signatures;
+ }
+}
+
+bool ProtocolV1::require_cephx_v2_feature() const
+{
+ if (h.connect.authorizer_protocol != CEPH_AUTH_CEPHX) {
+ return false;
+ }
+ if (local_conf()->cephx_require_version >= 2) {
+ return true;
+ }
+ if (h.connect.host_type == CEPH_ENTITY_TYPE_OSD ||
+ h.connect.host_type == CEPH_ENTITY_TYPE_MDS ||
+ h.connect.host_type == CEPH_ENTITY_TYPE_MGR) {
+ return local_conf()->cephx_cluster_require_version >= 2;
+ } else {
+ return local_conf()->cephx_service_require_version >= 2;
+ }
+}
+
+seastar::future<stop_t> ProtocolV1::repeat_handle_connect()
+{
+ return socket->read(sizeof(h.connect))
+ .then([this](bufferlist bl) {
+ auto p = bl.cbegin();
+ ::decode(h.connect, p);
+ if (conn.get_peer_type() != 0 &&
+ conn.get_peer_type() != h.connect.host_type) {
+ logger().error("{} repeat_handle_connect(): my peer type does not match"
+ " what peer advertises {} != {}",
+ conn, conn.get_peer_type(), h.connect.host_type);
+ throw std::system_error(make_error_code(error::protocol_aborted));
+ }
+ conn.set_peer_type(h.connect.host_type);
+ conn.policy = messenger.get_policy(h.connect.host_type);
+ if (!conn.policy.lossy && !conn.policy.server && conn.target_addr.get_port() <= 0) {
+ logger().error("{} we don't know how to reconnect to peer {}",
+ conn, conn.target_addr);
+ throw std::system_error(
+ make_error_code(crimson::net::error::bad_peer_address));
+ }
+ return socket->read(h.connect.authorizer_len);
+ }).then([this] (bufferlist authorizer) {
+ memset(&h.reply, 0, sizeof(h.reply));
+ // TODO: set reply.protocol_version
+ if (h.connect.protocol_version != get_proto_version(h.connect.host_type, false)) {
+ return send_connect_reply(
+ CEPH_MSGR_TAG_BADPROTOVER, bufferlist{});
+ }
+ if (require_auth_feature()) {
+ conn.policy.features_required |= CEPH_FEATURE_MSG_AUTH;
+ }
+ if (require_cephx_v2_feature()) {
+ conn.policy.features_required |= CEPH_FEATUREMASK_CEPHX_V2;
+ }
+ if (auto feat_missing = conn.policy.features_required & ~(uint64_t)h.connect.features;
+ feat_missing != 0) {
+ return send_connect_reply(
+ CEPH_MSGR_TAG_FEATURES, bufferlist{});
+ }
+
+ bufferlist authorizer_reply;
+ auth_meta->auth_method = h.connect.authorizer_protocol;
+ if (!HAVE_FEATURE((uint64_t)h.connect.features, CEPHX_V2)) {
+ // peer doesn't support it and we won't get here if we require it
+ auth_meta->skip_authorizer_challenge = true;
+ }
+ auto more = static_cast<bool>(auth_meta->authorizer_challenge);
+ ceph_assert(messenger.get_auth_server());
+ int r = messenger.get_auth_server()->handle_auth_request(
+ conn.shared_from_this(), auth_meta, more, auth_meta->auth_method, authorizer,
+ &authorizer_reply);
+
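+    // handle_auth_request() semantics: r < 0 rejects the authorizer,
+    // r == 0 sends a challenge back for the peer to retry, and r > 0 means
+    // the authorizer is valid and the handshake can proceed.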
+ if (r < 0) {
+ session_security.reset();
+ return send_connect_reply(
+ CEPH_MSGR_TAG_BADAUTHORIZER, std::move(authorizer_reply));
+ } else if (r == 0) {
+ ceph_assert(authorizer_reply.length());
+ return send_connect_reply(
+ CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER, std::move(authorizer_reply));
+ }
+
+ // r > 0
+ if (auto existing = messenger.lookup_conn(conn.peer_addr); existing) {
+ if (existing->protocol->proto_type != proto_t::v1) {
+ logger().warn("{} existing {} proto version is {} not 1, close existing",
+ conn, *existing,
+ static_cast<int>(existing->protocol->proto_type));
+ // NOTE: this is following async messenger logic, but we may miss the reset event.
+ existing->mark_down();
+ } else {
+ return handle_connect_with_existing(existing, std::move(authorizer_reply));
+ }
+ }
+ if (h.connect.connect_seq > 0) {
+ return send_connect_reply(CEPH_MSGR_TAG_RESETSESSION,
+ std::move(authorizer_reply));
+ }
+ h.connect_seq = h.connect.connect_seq + 1;
+ h.peer_global_seq = h.connect.global_seq;
+ conn.set_features((uint64_t)conn.policy.features_supported & (uint64_t)h.connect.features);
+ // TODO: cct
+ return send_connect_reply_ready(CEPH_MSGR_TAG_READY, std::move(authorizer_reply));
+ });
+}
+
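+// Server side of the v1 handshake: send our banner and addresses, read and
+// validate the client's banner and advertised address, then loop in
+// repeat_handle_connect() until the connection is ready or fails.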
+void ProtocolV1::start_accept(SocketRef&& sock,
+ const entity_addr_t& _peer_addr)
+{
+ ceph_assert(state == state_t::none);
+ logger().trace("{} trigger accepting, was {}",
+ conn, static_cast<int>(state));
+ state = state_t::accepting;
+ set_write_state(write_state_t::delay);
+
+ ceph_assert(!socket);
+ // until we know better
+ conn.target_addr = _peer_addr;
+ socket = std::move(sock);
+ messenger.accept_conn(
+ seastar::static_pointer_cast<SocketConnection>(conn.shared_from_this()));
+ gate.dispatch_in_background("start_accept", *this, [this] {
+ // stop learning my_addr before sending it out, so it won't change
+ return messenger.learned_addr(messenger.get_myaddr(), conn).then([this] {
+ // encode/send server's handshake header
+ bufferlist bl;
+ bl.append(buffer::create_static(banner_size, banner));
+ ::encode(messenger.get_myaddr(), bl, 0);
+ ::encode(conn.target_addr, bl, 0);
+ return socket->write_flush(std::move(bl));
+ }).then([this] {
+ // read client's handshake header and connect request
+ return socket->read(client_header_size);
+ }).then([this] (bufferlist bl) {
+ auto p = bl.cbegin();
+ validate_banner(p);
+ entity_addr_t addr;
+ ::decode(addr, p);
+ ceph_assert(p.end());
+ if ((addr.is_legacy() || addr.is_any()) &&
+ addr.is_same_host(conn.target_addr)) {
+ // good
+ } else {
+ logger().error("{} peer advertized an invalid peer_addr: {},"
+ " which should be v1 and the same host with {}.",
+ conn, addr, conn.peer_addr);
+ throw std::system_error(
+ make_error_code(crimson::net::error::bad_peer_address));
+ }
+ conn.peer_addr = addr;
+ conn.target_addr = conn.peer_addr;
+ return seastar::repeat([this] {
+ return repeat_handle_connect();
+ });
+ }).then([this] {
+ if (state != state_t::accepting) {
+ assert(state == state_t::closing);
+ throw std::system_error(make_error_code(error::protocol_aborted));
+ }
+ messenger.register_conn(
+ seastar::static_pointer_cast<SocketConnection>(conn.shared_from_this()));
+ messenger.unaccept_conn(
+ seastar::static_pointer_cast<SocketConnection>(conn.shared_from_this()));
+ execute_open(open_t::accepted);
+ }).handle_exception([this] (std::exception_ptr eptr) {
+ // TODO: handle fault in the accepting state
+ logger().warn("{} accepting fault: {}", conn, eptr);
+ close(false);
+ });
+ });
+}
+
+// open state
+
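+// Serialize the queued messages (plus any pending keepalive/keepalive-ack)
+// into one bufferlist in v1 wire format: per message a TAG_MSG byte, the
+// header, payload/middle/data and the footer, falling back to the old
+// footer layout when the peer did not negotiate MSG_AUTH.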
+ceph::bufferlist ProtocolV1::do_sweep_messages(
+ const std::deque<MessageRef>& msgs,
+ size_t num_msgs,
+ bool require_keepalive,
+ std::optional<utime_t> _keepalive_ack,
+ bool require_ack)
+{
+ static const size_t RESERVE_MSG_SIZE = sizeof(CEPH_MSGR_TAG_MSG) +
+ sizeof(ceph_msg_header) +
+ sizeof(ceph_msg_footer);
+ static const size_t RESERVE_MSG_SIZE_OLD = sizeof(CEPH_MSGR_TAG_MSG) +
+ sizeof(ceph_msg_header) +
+ sizeof(ceph_msg_footer_old);
+
+ ceph::bufferlist bl;
+ if (likely(num_msgs)) {
+ if (HAVE_FEATURE(conn.features, MSG_AUTH)) {
+ bl.reserve(num_msgs * RESERVE_MSG_SIZE);
+ } else {
+ bl.reserve(num_msgs * RESERVE_MSG_SIZE_OLD);
+ }
+ }
+
+ if (unlikely(require_keepalive)) {
+ k.req.stamp = ceph::coarse_real_clock::to_ceph_timespec(
+ ceph::coarse_real_clock::now());
+ logger().trace("{} write keepalive2 {}", conn, k.req.stamp.tv_sec);
+ bl.append(create_static(k.req));
+ }
+
+ if (unlikely(_keepalive_ack.has_value())) {
+ logger().trace("{} write keepalive2 ack {}", conn, *_keepalive_ack);
+ k.ack.stamp = ceph_timespec(*_keepalive_ack);
+ bl.append(create_static(k.ack));
+ }
+
+ if (require_ack) {
+ // XXX: we decided not to support lossless connection in v1. as the
+ // client's default policy is
+ // Messenger::Policy::lossy_client(CEPH_FEATURE_OSDREPLYMUX) which is
+ // lossy. And by the time of crimson-osd's GA, the in-cluster communication
+ // will all be performed using v2 protocol.
+ ceph_abort("lossless policy not supported for v1");
+ }
+
+ std::for_each(msgs.begin(), msgs.begin()+num_msgs, [this, &bl](const MessageRef& msg) {
+ ceph_assert(!msg->get_seq() && "message already has seq");
+ msg->set_seq(++conn.out_seq);
+ auto& header = msg->get_header();
+ header.src = messenger.get_myname();
+ msg->encode(conn.features, messenger.get_crc_flags());
+ if (session_security) {
+ session_security->sign_message(msg.get());
+ }
+ logger().debug("{} --> #{} === {} ({})",
+ conn, msg->get_seq(), *msg, msg->get_type());
+ bl.append(CEPH_MSGR_TAG_MSG);
+ bl.append((const char*)&header, sizeof(header));
+ bl.append(msg->get_payload());
+ bl.append(msg->get_middle());
+ bl.append(msg->get_data());
+ auto& footer = msg->get_footer();
+ if (HAVE_FEATURE(conn.features, MSG_AUTH)) {
+ bl.append((const char*)&footer, sizeof(footer));
+ } else {
+ ceph_msg_footer_old old_footer;
+ if (messenger.get_crc_flags() & MSG_CRC_HEADER) {
+ old_footer.front_crc = footer.front_crc;
+ old_footer.middle_crc = footer.middle_crc;
+ } else {
+ old_footer.front_crc = old_footer.middle_crc = 0;
+ }
+ if (messenger.get_crc_flags() & MSG_CRC_DATA) {
+ old_footer.data_crc = footer.data_crc;
+ } else {
+ old_footer.data_crc = 0;
+ }
+ old_footer.flags = footer.flags;
+ bl.append((const char*)&old_footer, sizeof(old_footer));
+ }
+ });
+
+ return bl;
+}
+
+seastar::future<> ProtocolV1::handle_keepalive2_ack()
+{
+ return socket->read_exactly(sizeof(ceph_timespec))
+ .then([this] (auto buf) {
+ auto t = reinterpret_cast<const ceph_timespec*>(buf.get());
+ k.ack_stamp = *t;
+ logger().trace("{} got keepalive2 ack {}", conn, t->tv_sec);
+ });
+}
+
+seastar::future<> ProtocolV1::handle_keepalive2()
+{
+ return socket->read_exactly(sizeof(ceph_timespec))
+ .then([this] (auto buf) {
+ utime_t ack{*reinterpret_cast<const ceph_timespec*>(buf.get())};
+ notify_keepalive_ack(ack);
+ });
+}
+
+seastar::future<> ProtocolV1::handle_ack()
+{
+ return socket->read_exactly(sizeof(ceph_le64))
+ .then([this] (auto buf) {
+ auto seq = reinterpret_cast<const ceph_le64*>(buf.get());
+ discard_up_to(&conn.sent, *seq);
+ });
+}
+
+seastar::future<> ProtocolV1::maybe_throttle()
+{
+ if (!conn.policy.throttler_bytes) {
+ return seastar::now();
+ }
+ const auto to_read = (m.header.front_len +
+ m.header.middle_len +
+ m.header.data_len);
+ return conn.policy.throttler_bytes->get(to_read);
+}
+
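+// Read a single message off the wire: the header first (so the byte
+// throttler can be applied), then front/middle/data and the footer; decode,
+// verify the signature if any, drop stale sequence numbers, and dispatch.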
+seastar::future<> ProtocolV1::read_message()
+{
+ return socket->read(sizeof(m.header))
+ .then([this] (bufferlist bl) {
+ // throttle the traffic, maybe
+ auto p = bl.cbegin();
+ ::decode(m.header, p);
+ return maybe_throttle();
+ }).then([this] {
+ // read front
+ return socket->read(m.header.front_len);
+ }).then([this] (bufferlist bl) {
+ m.front = std::move(bl);
+ // read middle
+ return socket->read(m.header.middle_len);
+ }).then([this] (bufferlist bl) {
+ m.middle = std::move(bl);
+ // read data
+ return socket->read(m.header.data_len);
+ }).then([this] (bufferlist bl) {
+ m.data = std::move(bl);
+ // read footer
+ return socket->read(sizeof(m.footer));
+ }).then([this] (bufferlist bl) {
+ auto p = bl.cbegin();
+ ::decode(m.footer, p);
+ auto conn_ref = seastar::static_pointer_cast<SocketConnection>(
+ conn.shared_from_this());
+ auto msg = ::decode_message(nullptr, 0, m.header, m.footer,
+ m.front, m.middle, m.data, conn_ref);
+ if (unlikely(!msg)) {
+ logger().warn("{} decode message failed", conn);
+ throw std::system_error{make_error_code(error::corrupted_message)};
+ }
+ constexpr bool add_ref = false; // Message starts with 1 ref
+ // TODO: change MessageRef with foreign_ptr
+ auto msg_ref = MessageRef{msg, add_ref};
+
+ if (session_security) {
+ if (unlikely(session_security->check_message_signature(msg))) {
+ logger().warn("{} message signature check failed", conn);
+ throw std::system_error{make_error_code(error::corrupted_message)};
+ }
+ }
+ // TODO: set time stamps
+ msg->set_byte_throttler(conn.policy.throttler_bytes);
+
+ if (unlikely(!conn.update_rx_seq(msg->get_seq()))) {
+ // skip this message
+ return seastar::now();
+ }
+
+ logger().debug("{} <== #{} === {} ({})",
+ conn, msg_ref->get_seq(), *msg_ref, msg_ref->get_type());
+ // throttle the reading process by the returned future
+ return dispatchers.ms_dispatch(conn_ref, std::move(msg_ref));
+ });
+}
+
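+// The open-state read loop: read one tag byte at a time and dispatch to the
+// matching handler (message, ack, keepalive, keepalive2, close).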
+seastar::future<> ProtocolV1::handle_tags()
+{
+ return seastar::keep_doing([this] {
+ // read the next tag
+ return socket->read_exactly(1)
+ .then([this] (auto buf) {
+ switch (buf[0]) {
+ case CEPH_MSGR_TAG_MSG:
+ return read_message();
+ case CEPH_MSGR_TAG_ACK:
+ return handle_ack();
+ case CEPH_MSGR_TAG_KEEPALIVE:
+ return seastar::now();
+ case CEPH_MSGR_TAG_KEEPALIVE2:
+ return handle_keepalive2();
+ case CEPH_MSGR_TAG_KEEPALIVE2_ACK:
+ return handle_keepalive2_ack();
+ case CEPH_MSGR_TAG_CLOSE:
+ logger().info("{} got tag close", conn);
+ throw std::system_error(make_error_code(error::protocol_aborted));
+ default:
+ logger().error("{} got unknown msgr tag {}",
+ conn, static_cast<int>(buf[0]));
+ throw std::system_error(make_error_code(error::read_eof));
+ }
+ });
+ });
+}
+
+void ProtocolV1::execute_open(open_t type)
+{
+ logger().trace("{} trigger open, was {}", conn, static_cast<int>(state));
+ state = state_t::open;
+ set_write_state(write_state_t::open);
+
+ if (type == open_t::connected) {
+ dispatchers.ms_handle_connect(
+ seastar::static_pointer_cast<SocketConnection>(conn.shared_from_this()));
+ } else { // type == open_t::accepted
+ dispatchers.ms_handle_accept(
+ seastar::static_pointer_cast<SocketConnection>(conn.shared_from_this()));
+ }
+
+ gate.dispatch_in_background("execute_open", *this, [this] {
+ // start background processing of tags
+ return handle_tags()
+ .handle_exception_type([this] (const std::system_error& e) {
+ logger().warn("{} open fault: {}", conn, e);
+ if (e.code() == error::protocol_aborted ||
+ e.code() == std::errc::connection_reset ||
+ e.code() == error::read_eof) {
+ close(true);
+ return seastar::now();
+ } else {
+ throw e;
+ }
+ }).handle_exception([this] (std::exception_ptr eptr) {
+ // TODO: handle fault in the open state
+ logger().warn("{} open fault: {}", conn, eptr);
+ close(true);
+ });
+ });
+}
+
+// closing state
+
+void ProtocolV1::trigger_close()
+{
+ logger().trace("{} trigger closing, was {}",
+ conn, static_cast<int>(state));
+ messenger.closing_conn(
+ seastar::static_pointer_cast<SocketConnection>(
+ conn.shared_from_this()));
+
+ if (state == state_t::accepting) {
+ messenger.unaccept_conn(seastar::static_pointer_cast<SocketConnection>(
+ conn.shared_from_this()));
+ } else if (state >= state_t::connecting && state < state_t::closing) {
+ messenger.unregister_conn(seastar::static_pointer_cast<SocketConnection>(
+ conn.shared_from_this()));
+ } else {
+ // cannot happen
+ ceph_assert(false);
+ }
+
+ if (!socket) {
+ ceph_assert(state == state_t::connecting);
+ }
+
+ state = state_t::closing;
+}
+
+void ProtocolV1::on_closed()
+{
+ messenger.closed_conn(
+ seastar::static_pointer_cast<SocketConnection>(
+ conn.shared_from_this()));
+}
+
+seastar::future<> ProtocolV1::fault()
+{
+ if (conn.policy.lossy) {
+ messenger.unregister_conn(seastar::static_pointer_cast<SocketConnection>(
+ conn.shared_from_this()));
+ }
+ // XXX: we decided not to support lossless connection in v1. as the
+ // client's default policy is
+ // Messenger::Policy::lossy_client(CEPH_FEATURE_OSDREPLYMUX) which is
+ // lossy. And by the time of crimson-osd's GA, the in-cluster communication
+ // will all be performed using v2 protocol.
+ ceph_abort("lossless policy not supported for v1");
+ return seastar::now();
+}
+
+void ProtocolV1::print(std::ostream& out) const
+{
+ out << conn;
+}
+
+} // namespace crimson::net
diff --git a/src/crimson/net/ProtocolV1.h b/src/crimson/net/ProtocolV1.h
new file mode 100644
index 000000000..ed6df8954
--- /dev/null
+++ b/src/crimson/net/ProtocolV1.h
@@ -0,0 +1,137 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "Protocol.h"
+
+class AuthAuthorizer;
+class AuthSessionHandler;
+
+namespace crimson::net {
+
+class ProtocolV1 final : public Protocol {
+ public:
+ ProtocolV1(ChainedDispatchers& dispatchers,
+ SocketConnection& conn,
+ SocketMessenger& messenger);
+ ~ProtocolV1() override;
+ void print(std::ostream&) const final;
+ private:
+ void on_closed() override;
+ bool is_connected() const override;
+
+ void start_connect(const entity_addr_t& peer_addr,
+ const entity_name_t& peer_name) override;
+
+ void start_accept(SocketRef&& socket,
+ const entity_addr_t& peer_addr) override;
+
+ void trigger_close() override;
+
+ ceph::bufferlist do_sweep_messages(
+ const std::deque<MessageRef>& msgs,
+ size_t num_msgs,
+ bool require_keepalive,
+ std::optional<utime_t> keepalive_ack,
+ bool require_ack) override;
+
+ private:
+ SocketMessenger &messenger;
+
+ enum class state_t {
+ none,
+ accepting,
+ connecting,
+ open,
+ standby,
+ wait,
+ closing
+ };
+ state_t state = state_t::none;
+
+ // state for handshake
+ struct Handshake {
+ ceph_msg_connect connect;
+ ceph_msg_connect_reply reply;
+ ceph::bufferlist auth_payload; // auth(orizer) payload read off the wire
+ ceph::bufferlist auth_more; // connect-side auth retry (we added challenge)
+ std::chrono::milliseconds backoff;
+ uint32_t connect_seq = 0;
+ uint32_t peer_global_seq = 0;
+ uint32_t global_seq;
+ } h;
+
+ std::unique_ptr<AuthSessionHandler> session_security;
+
+ // state for an incoming message
+ struct MessageReader {
+ ceph_msg_header header;
+ ceph_msg_footer footer;
+ bufferlist front;
+ bufferlist middle;
+ bufferlist data;
+ } m;
+
+ struct Keepalive {
+ struct {
+ const char tag = CEPH_MSGR_TAG_KEEPALIVE2;
+ ceph_timespec stamp;
+ } __attribute__((packed)) req;
+ struct {
+ const char tag = CEPH_MSGR_TAG_KEEPALIVE2_ACK;
+ ceph_timespec stamp;
+ } __attribute__((packed)) ack;
+ ceph_timespec ack_stamp;
+ } k;
+
+ private:
+ // connecting
+ void reset_session();
+ seastar::future<stop_t> handle_connect_reply(crimson::net::msgr_tag_t tag);
+ seastar::future<stop_t> repeat_connect();
+ ceph::bufferlist get_auth_payload();
+
+ // accepting
+ seastar::future<stop_t> send_connect_reply(
+ msgr_tag_t tag, bufferlist&& authorizer_reply = {});
+ seastar::future<stop_t> send_connect_reply_ready(
+ msgr_tag_t tag, bufferlist&& authorizer_reply);
+ seastar::future<stop_t> replace_existing(
+ SocketConnectionRef existing,
+ bufferlist&& authorizer_reply,
+ bool is_reset_from_peer = false);
+ seastar::future<stop_t> handle_connect_with_existing(
+ SocketConnectionRef existing, bufferlist&& authorizer_reply);
+ bool require_auth_feature() const;
+ bool require_cephx_v2_feature() const;
+ seastar::future<stop_t> repeat_handle_connect();
+
+ // open
+ seastar::future<> handle_keepalive2_ack();
+ seastar::future<> handle_keepalive2();
+ seastar::future<> handle_ack();
+ seastar::future<> maybe_throttle();
+ seastar::future<> read_message();
+ seastar::future<> handle_tags();
+
+ enum class open_t {
+ connected,
+ accepted
+ };
+ void execute_open(open_t type);
+
+ // replacing
+  // the number of connections initiated in this session, incremented when a
+  // new connection is established
+  uint32_t connect_seq() const { return h.connect_seq; }
+  // the client side should connect to us with a gseq; it is reset to the
+  // existing connection's gseq if that one is greater.
+ uint32_t peer_global_seq() const { return h.peer_global_seq; }
+ // current state of ProtocolV1
+ state_t get_state() const { return state; }
+
+ seastar::future<> fault();
+};
+
+} // namespace crimson::net
diff --git a/src/crimson/net/ProtocolV2.cc b/src/crimson/net/ProtocolV2.cc
new file mode 100644
index 000000000..b7137b8b8
--- /dev/null
+++ b/src/crimson/net/ProtocolV2.cc
@@ -0,0 +1,2139 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ProtocolV2.h"
+
+#include <seastar/core/lowres_clock.hh>
+#include <fmt/format.h>
+#include "include/msgr.h"
+#include "include/random.h"
+
+#include "crimson/auth/AuthClient.h"
+#include "crimson/auth/AuthServer.h"
+#include "crimson/common/formatter.h"
+
+#include "chained_dispatchers.h"
+#include "Errors.h"
+#include "Socket.h"
+#include "SocketConnection.h"
+#include "SocketMessenger.h"
+
+#ifdef UNIT_TESTS_BUILT
+#include "Interceptor.h"
+#endif
+
+using namespace ceph::msgr::v2;
+using crimson::common::local_conf;
+
+namespace {
+
+// TODO: apply the same logging policy to Protocol V1
+// Log levels in V2 Protocol:
+// * error level, errors that cause the connection to terminate:
+// - fatal errors;
+// - bugs;
+// * warn level: unusual events that indicate a connection fault or replacement:
+// - unstable network;
+// - incompatible peer;
+// - auth failure;
+// - connection race;
+// - connection reset;
+// * info level, important connection-lifecycle events, which don't happen
+//   very frequently;
+// * debug level, important logs for debugging, including:
+// - all the messages sent/received (-->/<==);
+// - all the frames exchanged (WRITE/GOT);
+// - important fields updated (UPDATE);
+// - connection state transitions (TRIGGER);
+// * trace level, trivial logs showing:
+// - the exact bytes being sent/received (SEND/RECV(bytes));
+// - detailed information of sub-frames;
+// - integrity checks;
+// - etc.
+seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_ms);
+}
+
+[[noreturn]] void abort_in_fault() {
+ throw std::system_error(make_error_code(crimson::net::error::negotiation_failure));
+}
+
+[[noreturn]] void abort_protocol() {
+ throw std::system_error(make_error_code(crimson::net::error::protocol_aborted));
+}
+
+[[noreturn]] void abort_in_close(crimson::net::ProtocolV2& proto, bool dispatch_reset) {
+ proto.close(dispatch_reset);
+ abort_protocol();
+}
+
+inline void expect_tag(const Tag& expected,
+ const Tag& actual,
+ crimson::net::SocketConnection& conn,
+ const char *where) {
+ if (actual != expected) {
+ logger().warn("{} {} received wrong tag: {}, expected {}",
+ conn, where,
+ static_cast<uint32_t>(actual),
+ static_cast<uint32_t>(expected));
+ abort_in_fault();
+ }
+}
+
+inline void unexpected_tag(const Tag& unexpected,
+ crimson::net::SocketConnection& conn,
+ const char *where) {
+ logger().warn("{} {} received unexpected tag: {}",
+ conn, where, static_cast<uint32_t>(unexpected));
+ abort_in_fault();
+}
+
+inline uint64_t generate_client_cookie() {
+ return ceph::util::generate_random_number<uint64_t>(
+ 1, std::numeric_limits<uint64_t>::max());
+}
+
+} // namespace anonymous
+
+namespace crimson::net {
+
+#ifdef UNIT_TESTS_BUILT
+void intercept(Breakpoint bp, bp_type_t type,
+ SocketConnection& conn, SocketRef& socket) {
+ if (conn.interceptor) {
+ auto action = conn.interceptor->intercept(conn, Breakpoint(bp));
+ socket->set_trap(type, action, &conn.interceptor->blocker);
+ }
+}
+
+#define INTERCEPT_CUSTOM(bp, type) \
+intercept({bp}, type, conn, socket)
+
+#define INTERCEPT_FRAME(tag, type) \
+intercept({static_cast<Tag>(tag), type}, \
+ type, conn, socket)
+
+#define INTERCEPT_N_RW(bp) \
+if (conn.interceptor) { \
+ auto action = conn.interceptor->intercept(conn, {bp}); \
+ ceph_assert(action != bp_action_t::BLOCK); \
+ if (action == bp_action_t::FAULT) { \
+ abort_in_fault(); \
+ } \
+}
+
+#else
+#define INTERCEPT_CUSTOM(bp, type)
+#define INTERCEPT_FRAME(tag, type)
+#define INTERCEPT_N_RW(bp)
+#endif
+
+seastar::future<> ProtocolV2::Timer::backoff(double seconds)
+{
+ logger().warn("{} waiting {} seconds ...", conn, seconds);
+ cancel();
+ last_dur_ = seconds;
+ as = seastar::abort_source();
+ auto dur = std::chrono::duration_cast<seastar::lowres_clock::duration>(
+ std::chrono::duration<double>(seconds));
+ return seastar::sleep_abortable(dur, *as
+ ).handle_exception_type([this] (const seastar::sleep_aborted& e) {
+ logger().debug("{} wait aborted", conn);
+ abort_protocol();
+ });
+}
+
+ProtocolV2::ProtocolV2(ChainedDispatchers& dispatchers,
+ SocketConnection& conn,
+ SocketMessenger& messenger)
+ : Protocol(proto_t::v2, dispatchers, conn),
+ messenger{messenger},
+ protocol_timer{conn}
+{}
+
+ProtocolV2::~ProtocolV2() {}
+
+bool ProtocolV2::is_connected() const {
+ return state == state_t::READY ||
+ state == state_t::ESTABLISHING ||
+ state == state_t::REPLACING;
+}
+
+void ProtocolV2::start_connect(const entity_addr_t& _peer_addr,
+ const entity_name_t& _peer_name)
+{
+ ceph_assert(state == state_t::NONE);
+ ceph_assert(!socket);
+ ceph_assert(!gate.is_closed());
+ conn.peer_addr = _peer_addr;
+ conn.target_addr = _peer_addr;
+ conn.set_peer_name(_peer_name);
+ conn.policy = messenger.get_policy(_peer_name.type());
+ client_cookie = generate_client_cookie();
+ logger().info("{} ProtocolV2::start_connect(): peer_addr={}, peer_name={}, cc={}"
+ " policy(lossy={}, server={}, standby={}, resetcheck={})",
+ conn, _peer_addr, _peer_name, client_cookie,
+ conn.policy.lossy, conn.policy.server,
+ conn.policy.standby, conn.policy.resetcheck);
+ messenger.register_conn(
+ seastar::static_pointer_cast<SocketConnection>(conn.shared_from_this()));
+ execute_connecting();
+}
+
+void ProtocolV2::start_accept(SocketRef&& sock,
+ const entity_addr_t& _peer_addr)
+{
+ ceph_assert(state == state_t::NONE);
+ ceph_assert(!socket);
+ // until we know better
+ conn.target_addr = _peer_addr;
+ socket = std::move(sock);
+ logger().info("{} ProtocolV2::start_accept(): target_addr={}", conn, _peer_addr);
+ messenger.accept_conn(
+ seastar::static_pointer_cast<SocketConnection>(conn.shared_from_this()));
+ execute_accepting();
+}
+
+// TODO: Frame related implementations, probably to a separate class.
+
+void ProtocolV2::enable_recording()
+{
+ rxbuf.clear();
+ txbuf.clear();
+ record_io = true;
+}
+
+seastar::future<Socket::tmp_buf> ProtocolV2::read_exactly(size_t bytes)
+{
+ if (unlikely(record_io)) {
+ return socket->read_exactly(bytes)
+ .then([this] (auto bl) {
+ rxbuf.append(buffer::create(bl.share()));
+ return bl;
+ });
+ } else {
+ return socket->read_exactly(bytes);
+ };
+}
+
+seastar::future<bufferlist> ProtocolV2::read(size_t bytes)
+{
+ if (unlikely(record_io)) {
+ return socket->read(bytes)
+ .then([this] (auto buf) {
+ rxbuf.append(buf);
+ return buf;
+ });
+ } else {
+ return socket->read(bytes);
+ }
+}
+
+seastar::future<> ProtocolV2::write(bufferlist&& buf)
+{
+ if (unlikely(record_io)) {
+ txbuf.append(buf);
+ }
+ return socket->write(std::move(buf));
+}
+
+seastar::future<> ProtocolV2::write_flush(bufferlist&& buf)
+{
+ if (unlikely(record_io)) {
+ txbuf.append(buf);
+ }
+ return socket->write_flush(std::move(buf));
+}
+
+size_t ProtocolV2::get_current_msg_size() const
+{
+ ceph_assert(rx_frame_asm.get_num_segments() > 0);
+ size_t sum = 0;
+ // we don't include SegmentIndex::Msg::HEADER.
+ for (size_t idx = 1; idx < rx_frame_asm.get_num_segments(); idx++) {
+ sum += rx_frame_asm.get_segment_logical_len(idx);
+ }
+ return sum;
+}
+
+seastar::future<Tag> ProtocolV2::read_main_preamble()
+{
+ rx_preamble.clear();
+ return read_exactly(rx_frame_asm.get_preamble_onwire_len())
+ .then([this] (auto bl) {
+ rx_segments_data.clear();
+ try {
+ rx_preamble.append(buffer::create(std::move(bl)));
+ const Tag tag = rx_frame_asm.disassemble_preamble(rx_preamble);
+ INTERCEPT_FRAME(tag, bp_type_t::READ);
+ return tag;
+ } catch (FrameError& e) {
+ logger().warn("{} read_main_preamble: {}", conn, e.what());
+ abort_in_fault();
+ }
+ });
+}
+
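+// Read the rest of a frame after the preamble: each segment as described by
+// the frame assembler, then the epilogue; the assembler then verifies the
+// collected data, and a frame error or bad auth tag aborts the connection.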
+seastar::future<> ProtocolV2::read_frame_payload()
+{
+ ceph_assert(rx_segments_data.empty());
+
+ return seastar::do_until(
+ [this] { return rx_frame_asm.get_num_segments() == rx_segments_data.size(); },
+ [this] {
+ // TODO: create aligned and contiguous buffer from socket
+ const size_t seg_idx = rx_segments_data.size();
+ if (uint16_t alignment = rx_frame_asm.get_segment_align(seg_idx);
+ alignment != segment_t::DEFAULT_ALIGNMENT) {
+ logger().trace("{} cannot allocate {} aligned buffer at segment desc index {}",
+ conn, alignment, rx_segments_data.size());
+ }
+ uint32_t onwire_len = rx_frame_asm.get_segment_onwire_len(seg_idx);
+ // TODO: create aligned and contiguous buffer from socket
+ return read_exactly(onwire_len).then([this] (auto tmp_bl) {
+ logger().trace("{} RECV({}) frame segment[{}]",
+ conn, tmp_bl.size(), rx_segments_data.size());
+ bufferlist segment;
+ segment.append(buffer::create(std::move(tmp_bl)));
+ rx_segments_data.emplace_back(std::move(segment));
+ });
+ }
+ ).then([this] {
+ return read_exactly(rx_frame_asm.get_epilogue_onwire_len());
+ }).then([this] (auto bl) {
+ logger().trace("{} RECV({}) frame epilogue", conn, bl.size());
+ bool ok = false;
+ try {
+ rx_frame_asm.disassemble_first_segment(rx_preamble, rx_segments_data[0]);
+ bufferlist rx_epilogue;
+ rx_epilogue.append(buffer::create(std::move(bl)));
+ ok = rx_frame_asm.disassemble_remaining_segments(rx_segments_data.data(), rx_epilogue);
+ } catch (FrameError& e) {
+ logger().error("read_frame_payload: {} {}", conn, e.what());
+ abort_in_fault();
+ } catch (ceph::crypto::onwire::MsgAuthError&) {
+ logger().error("read_frame_payload: {} bad auth tag", conn);
+ abort_in_fault();
+ }
+    // we do have a mechanism that allows the transmitter to start sending a
+    // message and abort after putting the entire data field on the wire. This
+    // will be used by the kernel client to avoid unnecessary buffering.
+ if (!ok) {
+ // TODO
+ ceph_assert(false);
+ }
+ });
+}
+
+template <class F>
+seastar::future<> ProtocolV2::write_frame(F &frame, bool flush)
+{
+ auto bl = frame.get_buffer(tx_frame_asm);
+ const auto main_preamble = reinterpret_cast<const preamble_block_t*>(bl.front().c_str());
+ logger().trace("{} SEND({}) frame: tag={}, num_segments={}, crc={}",
+ conn, bl.length(), (int)main_preamble->tag,
+ (int)main_preamble->num_segments, main_preamble->crc);
+ INTERCEPT_FRAME(main_preamble->tag, bp_type_t::WRITE);
+ if (flush) {
+ return write_flush(std::move(bl));
+ } else {
+ return write(std::move(bl));
+ }
+}
+
+void ProtocolV2::trigger_state(state_t _state, write_state_t _write_state, bool reentrant)
+{
+ if (!reentrant && _state == state) {
+ logger().error("{} is not allowed to re-trigger state {}",
+ conn, get_state_name(state));
+ ceph_assert(false);
+ }
+ logger().debug("{} TRIGGER {}, was {}",
+ conn, get_state_name(_state), get_state_name(state));
+ state = _state;
+ set_write_state(_write_state);
+}
+
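+// Common fault handling: lossy connections are closed, a server or idle
+// standby connection goes to STANDBY, and otherwise we either back off in
+// WAIT or retry immediately in CONNECTING.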
+void ProtocolV2::fault(bool backoff, const char* func_name, std::exception_ptr eptr)
+{
+ if (conn.policy.lossy) {
+ logger().info("{} {}: fault at {} on lossy channel, going to CLOSING -- {}",
+ conn, func_name, get_state_name(state), eptr);
+ close(true);
+ } else if (conn.policy.server ||
+ (conn.policy.standby &&
+ (!is_queued() && conn.sent.empty()))) {
+ logger().info("{} {}: fault at {} with nothing to send, going to STANDBY -- {}",
+ conn, func_name, get_state_name(state), eptr);
+ execute_standby();
+ } else if (backoff) {
+ logger().info("{} {}: fault at {}, going to WAIT -- {}",
+ conn, func_name, get_state_name(state), eptr);
+ execute_wait(false);
+ } else {
+ logger().info("{} {}: fault at {}, going to CONNECTING -- {}",
+ conn, func_name, get_state_name(state), eptr);
+ execute_connecting();
+ }
+}
+
+void ProtocolV2::reset_session(bool full)
+{
+ server_cookie = 0;
+ connect_seq = 0;
+ conn.in_seq = 0;
+ if (full) {
+ client_cookie = generate_client_cookie();
+ peer_global_seq = 0;
+ reset_write();
+ dispatchers.ms_handle_remote_reset(
+ seastar::static_pointer_cast<SocketConnection>(conn.shared_from_this()));
+ }
+}
+
+seastar::future<std::tuple<entity_type_t, entity_addr_t>>
+ProtocolV2::banner_exchange(bool is_connect)
+{
+ // 1. prepare and send banner
+ bufferlist banner_payload;
+ encode((uint64_t)CEPH_MSGR2_SUPPORTED_FEATURES, banner_payload, 0);
+ encode((uint64_t)CEPH_MSGR2_REQUIRED_FEATURES, banner_payload, 0);
+
+ bufferlist bl;
+ bl.append(CEPH_BANNER_V2_PREFIX, strlen(CEPH_BANNER_V2_PREFIX));
+ auto len_payload = static_cast<uint16_t>(banner_payload.length());
+ encode(len_payload, bl, 0);
+ bl.claim_append(banner_payload);
+ logger().debug("{} SEND({}) banner: len_payload={}, supported={}, "
+ "required={}, banner=\"{}\"",
+ conn, bl.length(), len_payload,
+ CEPH_MSGR2_SUPPORTED_FEATURES, CEPH_MSGR2_REQUIRED_FEATURES,
+ CEPH_BANNER_V2_PREFIX);
+ INTERCEPT_CUSTOM(custom_bp_t::BANNER_WRITE, bp_type_t::WRITE);
+ return write_flush(std::move(bl)).then([this] {
+ // 2. read peer banner
+ unsigned banner_len = strlen(CEPH_BANNER_V2_PREFIX) + sizeof(ceph_le16);
+ INTERCEPT_CUSTOM(custom_bp_t::BANNER_READ, bp_type_t::READ);
+ return read_exactly(banner_len); // or read exactly?
+ }).then([this] (auto bl) {
+ // 3. process peer banner and read banner_payload
+ unsigned banner_prefix_len = strlen(CEPH_BANNER_V2_PREFIX);
+ logger().debug("{} RECV({}) banner: \"{}\"",
+ conn, bl.size(),
+ std::string((const char*)bl.get(), banner_prefix_len));
+
+ if (memcmp(bl.get(), CEPH_BANNER_V2_PREFIX, banner_prefix_len) != 0) {
+ if (memcmp(bl.get(), CEPH_BANNER, strlen(CEPH_BANNER)) == 0) {
+ logger().warn("{} peer is using V1 protocol", conn);
+ } else {
+ logger().warn("{} peer sent bad banner", conn);
+ }
+ abort_in_fault();
+ }
+ bl.trim_front(banner_prefix_len);
+
+ uint16_t payload_len;
+ bufferlist buf;
+ buf.append(buffer::create(std::move(bl)));
+ auto ti = buf.cbegin();
+ try {
+ decode(payload_len, ti);
+ } catch (const buffer::error &e) {
+ logger().warn("{} decode banner payload len failed", conn);
+ abort_in_fault();
+ }
+ logger().debug("{} GOT banner: payload_len={}", conn, payload_len);
+ INTERCEPT_CUSTOM(custom_bp_t::BANNER_PAYLOAD_READ, bp_type_t::READ);
+ return read(payload_len);
+ }).then([this, is_connect] (bufferlist bl) {
+ // 4. process peer banner_payload and send HelloFrame
+ auto p = bl.cbegin();
+ uint64_t peer_supported_features;
+ uint64_t peer_required_features;
+ try {
+ decode(peer_supported_features, p);
+ decode(peer_required_features, p);
+ } catch (const buffer::error &e) {
+ logger().warn("{} decode banner payload failed", conn);
+ abort_in_fault();
+ }
+ logger().debug("{} RECV({}) banner features: supported={} required={}",
+ conn, bl.length(),
+ peer_supported_features, peer_required_features);
+
+ // Check feature bit compatibility
+ uint64_t supported_features = CEPH_MSGR2_SUPPORTED_FEATURES;
+ uint64_t required_features = CEPH_MSGR2_REQUIRED_FEATURES;
+ if ((required_features & peer_supported_features) != required_features) {
+ logger().error("{} peer does not support all required features"
+ " required={} peer_supported={}",
+ conn, required_features, peer_supported_features);
+ abort_in_close(*this, is_connect);
+ }
+ if ((supported_features & peer_required_features) != peer_required_features) {
+ logger().error("{} we do not support all peer required features"
+ " peer_required={} supported={}",
+ conn, peer_required_features, supported_features);
+ abort_in_close(*this, is_connect);
+ }
+ this->peer_required_features = peer_required_features;
+ if (this->peer_required_features == 0) {
+ this->connection_features = msgr2_required;
+ }
+ const bool is_rev1 = HAVE_MSGR2_FEATURE(peer_supported_features, REVISION_1);
+ tx_frame_asm.set_is_rev1(is_rev1);
+ rx_frame_asm.set_is_rev1(is_rev1);
+
+ auto hello = HelloFrame::Encode(messenger.get_mytype(),
+ conn.target_addr);
+ logger().debug("{} WRITE HelloFrame: my_type={}, peer_addr={}",
+ conn, ceph_entity_type_name(messenger.get_mytype()),
+ conn.target_addr);
+ return write_frame(hello);
+ }).then([this] {
+ //5. read peer HelloFrame
+ return read_main_preamble();
+ }).then([this] (Tag tag) {
+ expect_tag(Tag::HELLO, tag, conn, __func__);
+ return read_frame_payload();
+ }).then([this] {
+ // 6. process peer HelloFrame
+ auto hello = HelloFrame::Decode(rx_segments_data.back());
+ logger().debug("{} GOT HelloFrame: my_type={} peer_addr={}",
+ conn, ceph_entity_type_name(hello.entity_type()),
+ hello.peer_addr());
+ return seastar::make_ready_future<std::tuple<entity_type_t, entity_addr_t>>(
+ std::make_tuple(hello.entity_type(), hello.peer_addr()));
+ });
+}
+
+// CONNECTING state
+
+seastar::future<> ProtocolV2::handle_auth_reply()
+{
+ return read_main_preamble()
+ .then([this] (Tag tag) {
+ switch (tag) {
+ case Tag::AUTH_BAD_METHOD:
+ return read_frame_payload().then([this] {
+ // handle_auth_bad_method() logic
+ auto bad_method = AuthBadMethodFrame::Decode(rx_segments_data.back());
+ logger().warn("{} GOT AuthBadMethodFrame: method={} result={}, "
+ "allowed_methods={}, allowed_modes={}",
+ conn, bad_method.method(), cpp_strerror(bad_method.result()),
+ bad_method.allowed_methods(), bad_method.allowed_modes());
+ ceph_assert(messenger.get_auth_client());
+ int r = messenger.get_auth_client()->handle_auth_bad_method(
+ conn.shared_from_this(), auth_meta,
+ bad_method.method(), bad_method.result(),
+ bad_method.allowed_methods(), bad_method.allowed_modes());
+ if (r < 0) {
+ logger().warn("{} auth_client handle_auth_bad_method returned {}",
+ conn, r);
+ abort_in_fault();
+ }
+ return client_auth(bad_method.allowed_methods());
+ });
+ case Tag::AUTH_REPLY_MORE:
+ return read_frame_payload().then([this] {
+ // handle_auth_reply_more() logic
+ auto auth_more = AuthReplyMoreFrame::Decode(rx_segments_data.back());
+ logger().debug("{} GOT AuthReplyMoreFrame: payload_len={}",
+ conn, auth_more.auth_payload().length());
+ ceph_assert(messenger.get_auth_client());
+ // let execute_connecting() take care of the thrown exception
+ auto reply = messenger.get_auth_client()->handle_auth_reply_more(
+ conn.shared_from_this(), auth_meta, auth_more.auth_payload());
+ auto more_reply = AuthRequestMoreFrame::Encode(reply);
+ logger().debug("{} WRITE AuthRequestMoreFrame: payload_len={}",
+ conn, reply.length());
+ return write_frame(more_reply);
+ }).then([this] {
+ return handle_auth_reply();
+ });
+ case Tag::AUTH_DONE:
+ return read_frame_payload().then([this] {
+ // handle_auth_done() logic
+ auto auth_done = AuthDoneFrame::Decode(rx_segments_data.back());
+ logger().debug("{} GOT AuthDoneFrame: gid={}, con_mode={}, payload_len={}",
+ conn, auth_done.global_id(),
+ ceph_con_mode_name(auth_done.con_mode()),
+ auth_done.auth_payload().length());
+ ceph_assert(messenger.get_auth_client());
+ int r = messenger.get_auth_client()->handle_auth_done(
+ conn.shared_from_this(), auth_meta,
+ auth_done.global_id(),
+ auth_done.con_mode(),
+ auth_done.auth_payload());
+ if (r < 0) {
+ logger().warn("{} auth_client handle_auth_done returned {}", conn, r);
+ abort_in_fault();
+ }
+ auth_meta->con_mode = auth_done.con_mode();
+ session_stream_handlers = ceph::crypto::onwire::rxtx_t::create_handler_pair(
+ nullptr, *auth_meta, tx_frame_asm.get_is_rev1(), false);
+ return finish_auth();
+ });
+ default: {
+ unexpected_tag(tag, conn, __func__);
+ return seastar::now();
+ }
+ }
+ });
+}
+
+seastar::future<> ProtocolV2::client_auth(std::vector<uint32_t> &allowed_methods)
+{
+ // send_auth_request() logic
+ ceph_assert(messenger.get_auth_client());
+
+ try {
+ auto [auth_method, preferred_modes, bl] =
+ messenger.get_auth_client()->get_auth_request(conn.shared_from_this(), auth_meta);
+ auth_meta->auth_method = auth_method;
+ auto frame = AuthRequestFrame::Encode(auth_method, preferred_modes, bl);
+ logger().debug("{} WRITE AuthRequestFrame: method={},"
+ " preferred_modes={}, payload_len={}",
+ conn, auth_method, preferred_modes, bl.length());
+ return write_frame(frame).then([this] {
+ return handle_auth_reply();
+ });
+ } catch (const crimson::auth::error& e) {
+ logger().error("{} get_initial_auth_request returned {}", conn, e);
+ abort_in_close(*this, true);
+ return seastar::now();
+ }
+}
+
+seastar::future<ProtocolV2::next_step_t>
+ProtocolV2::process_wait()
+{
+ return read_frame_payload().then([this] {
+ // handle_wait() logic
+ logger().debug("{} GOT WaitFrame", conn);
+ WaitFrame::Decode(rx_segments_data.back());
+ return next_step_t::wait;
+ });
+}
+
+seastar::future<ProtocolV2::next_step_t>
+ProtocolV2::client_connect()
+{
+ // send_client_ident() logic
+ uint64_t flags = 0;
+ if (conn.policy.lossy) {
+ flags |= CEPH_MSG_CONNECT_LOSSY;
+ }
+
+ auto client_ident = ClientIdentFrame::Encode(
+ messenger.get_myaddrs(),
+ conn.target_addr,
+ messenger.get_myname().num(),
+ global_seq,
+ conn.policy.features_supported,
+ conn.policy.features_required | msgr2_required, flags,
+ client_cookie);
+
+ logger().debug("{} WRITE ClientIdentFrame: addrs={}, target={}, gid={},"
+ " gs={}, features_supported={}, features_required={},"
+ " flags={}, cookie={}",
+ conn, messenger.get_myaddrs(), conn.target_addr,
+ messenger.get_myname().num(), global_seq,
+ conn.policy.features_supported,
+ conn.policy.features_required | msgr2_required,
+ flags, client_cookie);
+ return write_frame(client_ident).then([this] {
+ return read_main_preamble();
+ }).then([this] (Tag tag) {
+ switch (tag) {
+ case Tag::IDENT_MISSING_FEATURES:
+ return read_frame_payload().then([this] {
+ // handle_ident_missing_features() logic
+ auto ident_missing = IdentMissingFeaturesFrame::Decode(rx_segments_data.back());
+ logger().warn("{} GOT IdentMissingFeaturesFrame: features={}"
+ " (client does not support all server features)",
+ conn, ident_missing.features());
+ abort_in_fault();
+ return next_step_t::none;
+ });
+ case Tag::WAIT:
+ return process_wait();
+ case Tag::SERVER_IDENT:
+ return read_frame_payload().then([this] {
+ // handle_server_ident() logic
+ requeue_sent();
+ auto server_ident = ServerIdentFrame::Decode(rx_segments_data.back());
+ logger().debug("{} GOT ServerIdentFrame:"
+ " addrs={}, gid={}, gs={},"
+ " features_supported={}, features_required={},"
+ " flags={}, cookie={}",
+ conn,
+ server_ident.addrs(), server_ident.gid(),
+ server_ident.global_seq(),
+ server_ident.supported_features(),
+ server_ident.required_features(),
+ server_ident.flags(), server_ident.cookie());
+
+ // is this who we intended to talk to?
+ // be a bit forgiving here, since we may be connecting based on addresses parsed out
+ // of mon_host or something.
+ if (!server_ident.addrs().contains(conn.target_addr)) {
+ logger().warn("{} peer identifies as {}, does not include {}",
+ conn, server_ident.addrs(), conn.target_addr);
+ throw std::system_error(
+ make_error_code(crimson::net::error::bad_peer_address));
+ }
+
+ server_cookie = server_ident.cookie();
+
+ // TODO: change peer_addr to entity_addrvec_t
+ if (server_ident.addrs().front() != conn.peer_addr) {
+ logger().warn("{} peer advertises as {}, does not match {}",
+ conn, server_ident.addrs(), conn.peer_addr);
+ throw std::system_error(
+ make_error_code(crimson::net::error::bad_peer_address));
+ }
+ if (conn.get_peer_id() != entity_name_t::NEW &&
+ conn.get_peer_id() != server_ident.gid()) {
+ logger().error("{} connection peer id ({}) does not match "
+ "what it should be ({}) during connecting, close",
+ conn, server_ident.gid(), conn.get_peer_id());
+ abort_in_close(*this, true);
+ }
+ conn.set_peer_id(server_ident.gid());
+ conn.set_features(server_ident.supported_features() &
+ conn.policy.features_supported);
+ peer_global_seq = server_ident.global_seq();
+
+ bool lossy = server_ident.flags() & CEPH_MSG_CONNECT_LOSSY;
+ if (lossy != conn.policy.lossy) {
+ logger().warn("{} UPDATE Policy(lossy={}) from server flags", conn, lossy);
+ conn.policy.lossy = lossy;
+ }
+ if (lossy && (connect_seq != 0 || server_cookie != 0)) {
+ logger().warn("{} UPDATE cs=0({}) sc=0({}) for lossy policy",
+ conn, connect_seq, server_cookie);
+ connect_seq = 0;
+ server_cookie = 0;
+ }
+
+ return seastar::make_ready_future<next_step_t>(next_step_t::ready);
+ });
+ default: {
+ unexpected_tag(tag, conn, "post_client_connect");
+ return seastar::make_ready_future<next_step_t>(next_step_t::none);
+ }
+ }
+ });
+}
+
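+// Reconnect an established session: send a ReconnectFrame carrying both
+// cookies and the current sequence numbers, then handle the server's
+// retry/reset/wait/reconnect-ok responses.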
+seastar::future<ProtocolV2::next_step_t>
+ProtocolV2::client_reconnect()
+{
+ // send_reconnect() logic
+ auto reconnect = ReconnectFrame::Encode(messenger.get_myaddrs(),
+ client_cookie,
+ server_cookie,
+ global_seq,
+ connect_seq,
+ conn.in_seq);
+ logger().debug("{} WRITE ReconnectFrame: addrs={}, client_cookie={},"
+ " server_cookie={}, gs={}, cs={}, msg_seq={}",
+ conn, messenger.get_myaddrs(),
+ client_cookie, server_cookie,
+ global_seq, connect_seq, conn.in_seq);
+ return write_frame(reconnect).then([this] {
+ return read_main_preamble();
+ }).then([this] (Tag tag) {
+ switch (tag) {
+ case Tag::SESSION_RETRY_GLOBAL:
+ return read_frame_payload().then([this] {
+ // handle_session_retry_global() logic
+ auto retry = RetryGlobalFrame::Decode(rx_segments_data.back());
+ logger().warn("{} GOT RetryGlobalFrame: gs={}",
+ conn, retry.global_seq());
+ return messenger.get_global_seq(retry.global_seq()).then([this] (auto gs) {
+ global_seq = gs;
+ logger().warn("{} UPDATE: gs={} for retry global", conn, global_seq);
+ return client_reconnect();
+ });
+ });
+ case Tag::SESSION_RETRY:
+ return read_frame_payload().then([this] {
+ // handle_session_retry() logic
+ auto retry = RetryFrame::Decode(rx_segments_data.back());
+ logger().warn("{} GOT RetryFrame: cs={}",
+ conn, retry.connect_seq());
+ connect_seq = retry.connect_seq() + 1;
+ logger().warn("{} UPDATE: cs={}", conn, connect_seq);
+ return client_reconnect();
+ });
+ case Tag::SESSION_RESET:
+ return read_frame_payload().then([this] {
+ // handle_session_reset() logic
+ auto reset = ResetFrame::Decode(rx_segments_data.back());
+ logger().warn("{} GOT ResetFrame: full={}", conn, reset.full());
+ reset_session(reset.full());
+ return client_connect();
+ });
+ case Tag::WAIT:
+ return process_wait();
+ case Tag::SESSION_RECONNECT_OK:
+ return read_frame_payload().then([this] {
+ // handle_reconnect_ok() logic
+ auto reconnect_ok = ReconnectOkFrame::Decode(rx_segments_data.back());
+ logger().debug("{} GOT ReconnectOkFrame: msg_seq={}",
+ conn, reconnect_ok.msg_seq());
+ requeue_up_to(reconnect_ok.msg_seq());
+ return seastar::make_ready_future<next_step_t>(next_step_t::ready);
+ });
+ default: {
+ unexpected_tag(tag, conn, "post_client_reconnect");
+ return seastar::make_ready_future<next_step_t>(next_step_t::none);
+ }
+ }
+ });
+}
+
+void ProtocolV2::execute_connecting()
+{
+ trigger_state(state_t::CONNECTING, write_state_t::delay, true);
+ if (socket) {
+ socket->shutdown();
+ }
+ gated_execute("execute_connecting", [this] {
+ return messenger.get_global_seq().then([this] (auto gs) {
+ global_seq = gs;
+ assert(client_cookie != 0);
+ if (!conn.policy.lossy && server_cookie != 0) {
+ ++connect_seq;
+ logger().debug("{} UPDATE: gs={}, cs={} for reconnect",
+ conn, global_seq, connect_seq);
+ } else { // conn.policy.lossy || server_cookie == 0
+ assert(connect_seq == 0);
+ assert(server_cookie == 0);
+ logger().debug("{} UPDATE: gs={} for connect", conn, global_seq);
+ }
+
+ return wait_write_exit();
+ }).then([this] {
+ if (unlikely(state != state_t::CONNECTING)) {
+ logger().debug("{} triggered {} before Socket::connect()",
+ conn, get_state_name(state));
+ abort_protocol();
+ }
+ if (socket) {
+ gate.dispatch_in_background("close_sockect_connecting", *this,
+ [sock = std::move(socket)] () mutable {
+ return sock->close().then([sock = std::move(sock)] {});
+ });
+ }
+ INTERCEPT_N_RW(custom_bp_t::SOCKET_CONNECTING);
+ return Socket::connect(conn.peer_addr);
+ }).then([this](SocketRef sock) {
+ logger().debug("{} socket connected", conn);
+ if (unlikely(state != state_t::CONNECTING)) {
+ logger().debug("{} triggered {} during Socket::connect()",
+ conn, get_state_name(state));
+ return sock->close().then([sock = std::move(sock)] {
+ abort_protocol();
+ });
+ }
+ socket = std::move(sock);
+ return seastar::now();
+ }).then([this] {
+ auth_meta = seastar::make_lw_shared<AuthConnectionMeta>();
+ session_stream_handlers = { nullptr, nullptr };
+ enable_recording();
+ return banner_exchange(true);
+ }).then([this] (auto&& ret) {
+ auto [_peer_type, _my_addr_from_peer] = std::move(ret);
+ if (conn.get_peer_type() != _peer_type) {
+ logger().warn("{} connection peer type does not match what peer advertises {} != {}",
+ conn, ceph_entity_type_name(conn.get_peer_type()),
+ ceph_entity_type_name(_peer_type));
+ abort_in_close(*this, true);
+ }
+ if (unlikely(state != state_t::CONNECTING)) {
+ logger().debug("{} triggered {} during banner_exchange(), abort",
+ conn, get_state_name(state));
+ abort_protocol();
+ }
+ socket->learn_ephemeral_port_as_connector(_my_addr_from_peer.get_port());
+ if (unlikely(_my_addr_from_peer.is_legacy())) {
+ logger().warn("{} peer sent a legacy address for me: {}",
+ conn, _my_addr_from_peer);
+ throw std::system_error(
+ make_error_code(crimson::net::error::bad_peer_address));
+ }
+ _my_addr_from_peer.set_type(entity_addr_t::TYPE_MSGR2);
+ return messenger.learned_addr(_my_addr_from_peer, conn);
+ }).then([this] {
+ return client_auth();
+ }).then([this] {
+ if (server_cookie == 0) {
+ ceph_assert(connect_seq == 0);
+ return client_connect();
+ } else {
+ ceph_assert(connect_seq > 0);
+ return client_reconnect();
+ }
+ }).then([this] (next_step_t next) {
+ if (unlikely(state != state_t::CONNECTING)) {
+ logger().debug("{} triggered {} at the end of execute_connecting()",
+ conn, get_state_name(state));
+ abort_protocol();
+ }
+ switch (next) {
+ case next_step_t::ready: {
+ logger().info("{} connected:"
+ " gs={}, pgs={}, cs={}, client_cookie={},"
+ " server_cookie={}, in_seq={}, out_seq={}, out_q={}",
+ conn, global_seq, peer_global_seq, connect_seq,
+ client_cookie, server_cookie, conn.in_seq,
+ conn.out_seq, conn.out_q.size());
+ execute_ready(true);
+ break;
+ }
+ case next_step_t::wait: {
+ logger().info("{} execute_connecting(): going to WAIT", conn);
+ execute_wait(true);
+ break;
+ }
+ default: {
+ ceph_abort("impossible next step");
+ }
+ }
+ }).handle_exception([this] (std::exception_ptr eptr) {
+ if (state != state_t::CONNECTING) {
+ logger().info("{} execute_connecting(): protocol aborted at {} -- {}",
+ conn, get_state_name(state), eptr);
+ assert(state == state_t::CLOSING ||
+ state == state_t::REPLACING);
+ return;
+ }
+
+ if (conn.policy.server ||
+ (conn.policy.standby &&
+ (!is_queued() && conn.sent.empty()))) {
+ logger().info("{} execute_connecting(): fault at {} with nothing to send,"
+ " going to STANDBY -- {}",
+ conn, get_state_name(state), eptr);
+ execute_standby();
+ } else {
+ logger().info("{} execute_connecting(): fault at {}, going to WAIT -- {}",
+ conn, get_state_name(state), eptr);
+ execute_wait(false);
+ }
+ });
+ });
+}
+
+// ACCEPTING state
+
+seastar::future<> ProtocolV2::_auth_bad_method(int r)
+{
+ // _auth_bad_method() logic
+ ceph_assert(r < 0);
+ auto [allowed_methods, allowed_modes] =
+ messenger.get_auth_server()->get_supported_auth_methods(conn.get_peer_type());
+ auto bad_method = AuthBadMethodFrame::Encode(
+ auth_meta->auth_method, r, allowed_methods, allowed_modes);
+ logger().warn("{} WRITE AuthBadMethodFrame: method={}, result={}, "
+ "allowed_methods={}, allowed_modes={})",
+ conn, auth_meta->auth_method, cpp_strerror(r),
+ allowed_methods, allowed_modes);
+ return write_frame(bad_method).then([this] {
+ return server_auth();
+ });
+}
+
+seastar::future<> ProtocolV2::_handle_auth_request(bufferlist& auth_payload, bool more)
+{
+ // _handle_auth_request() logic
+ ceph_assert(messenger.get_auth_server());
+ bufferlist reply;
+ int r = messenger.get_auth_server()->handle_auth_request(
+ conn.shared_from_this(), auth_meta,
+ more, auth_meta->auth_method, auth_payload,
+ &reply);
+ switch (r) {
+ // successful
+ case 1: {
+ auto auth_done = AuthDoneFrame::Encode(
+ conn.peer_global_id, auth_meta->con_mode, reply);
+ logger().debug("{} WRITE AuthDoneFrame: gid={}, con_mode={}, payload_len={}",
+ conn, conn.peer_global_id,
+ ceph_con_mode_name(auth_meta->con_mode), reply.length());
+ return write_frame(auth_done).then([this] {
+ ceph_assert(auth_meta);
+ session_stream_handlers = ceph::crypto::onwire::rxtx_t::create_handler_pair(
+ nullptr, *auth_meta, tx_frame_asm.get_is_rev1(), true);
+ return finish_auth();
+ });
+ }
+ // auth more
+ case 0: {
+ auto more = AuthReplyMoreFrame::Encode(reply);
+ logger().debug("{} WRITE AuthReplyMoreFrame: payload_len={}",
+ conn, reply.length());
+ return write_frame(more).then([this] {
+ return read_main_preamble();
+ }).then([this] (Tag tag) {
+ expect_tag(Tag::AUTH_REQUEST_MORE, tag, conn, __func__);
+ return read_frame_payload();
+ }).then([this] {
+ auto auth_more = AuthRequestMoreFrame::Decode(rx_segments_data.back());
+ logger().debug("{} GOT AuthRequestMoreFrame: payload_len={}",
+ conn, auth_more.auth_payload().length());
+ return _handle_auth_request(auth_more.auth_payload(), true);
+ });
+ }
+ case -EBUSY: {
+ logger().warn("{} auth_server handle_auth_request returned -EBUSY", conn);
+ abort_in_fault();
+ return seastar::now();
+ }
+ default: {
+ logger().warn("{} auth_server handle_auth_request returned {}", conn, r);
+ return _auth_bad_method(r);
+ }
+ }
+}
+
+seastar::future<> ProtocolV2::server_auth()
+{
+ return read_main_preamble()
+ .then([this] (Tag tag) {
+ expect_tag(Tag::AUTH_REQUEST, tag, conn, __func__);
+ return read_frame_payload();
+ }).then([this] {
+ // handle_auth_request() logic
+ auto request = AuthRequestFrame::Decode(rx_segments_data.back());
+ logger().debug("{} GOT AuthRequestFrame: method={}, preferred_modes={},"
+ " payload_len={}",
+ conn, request.method(), request.preferred_modes(),
+ request.auth_payload().length());
+ auth_meta->auth_method = request.method();
+ auth_meta->con_mode = messenger.get_auth_server()->pick_con_mode(
+ conn.get_peer_type(), auth_meta->auth_method,
+ request.preferred_modes());
+ if (auth_meta->con_mode == CEPH_CON_MODE_UNKNOWN) {
+ logger().warn("{} auth_server pick_con_mode returned mode CEPH_CON_MODE_UNKNOWN", conn);
+ return _auth_bad_method(-EOPNOTSUPP);
+ }
+ return _handle_auth_request(request.auth_payload(), false);
+ });
+}
+
+bool ProtocolV2::validate_peer_name(const entity_name_t& peer_name) const
+{
+ auto my_peer_name = conn.get_peer_name();
+ if (my_peer_name.type() != peer_name.type()) {
+ return false;
+ }
+ if (my_peer_name.num() != entity_name_t::NEW &&
+ peer_name.num() != entity_name_t::NEW &&
+ my_peer_name.num() != peer_name.num()) {
+ return false;
+ }
+ return true;
+}
+
+seastar::future<ProtocolV2::next_step_t>
+ProtocolV2::send_wait()
+{
+ auto wait = WaitFrame::Encode();
+ logger().debug("{} WRITE WaitFrame", conn);
+ return write_frame(wait).then([] {
+ return next_step_t::wait;
+ });
+}
+
+seastar::future<ProtocolV2::next_step_t>
+ProtocolV2::reuse_connection(
+ ProtocolV2* existing_proto, bool do_reset,
+ bool reconnect, uint64_t conn_seq, uint64_t msg_seq)
+{
+ existing_proto->trigger_replacing(reconnect,
+ do_reset,
+ std::move(socket),
+ std::move(auth_meta),
+ std::move(session_stream_handlers),
+ peer_global_seq,
+ client_cookie,
+ conn.get_peer_name(),
+ connection_features,
+ tx_frame_asm.get_is_rev1(),
+ rx_frame_asm.get_is_rev1(),
+ conn_seq,
+ msg_seq);
+#ifdef UNIT_TESTS_BUILT
+ if (conn.interceptor) {
+ conn.interceptor->register_conn_replaced(conn);
+ }
+#endif
+ // close this connection because all the necessary information is delivered
+  // to the existing connection, and jump to error handling code to abort the
+ // current state.
+ abort_in_close(*this, false);
+ return seastar::make_ready_future<next_step_t>(next_step_t::none);
+}
+
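+// Resolve a collision between this accepted connection and an existing one
+// to the same peer. After the replacing/staleness/lossy checks, a non-zero
+// server_cookie means an established session (reuse it, or detect a peer
+// reset via a client_cookie mismatch); a zero server_cookie means a
+// connection race, resolved by peer_wins().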
+seastar::future<ProtocolV2::next_step_t>
+ProtocolV2::handle_existing_connection(SocketConnectionRef existing_conn)
+{
+ // handle_existing_connection() logic
+ ProtocolV2 *existing_proto = dynamic_cast<ProtocolV2*>(
+ existing_conn->protocol.get());
+ ceph_assert(existing_proto);
+ logger().debug("{}(gs={}, pgs={}, cs={}, cc={}, sc={}) connecting,"
+ " found existing {}(state={}, gs={}, pgs={}, cs={}, cc={}, sc={})",
+ conn, global_seq, peer_global_seq, connect_seq,
+ client_cookie, server_cookie,
+ existing_conn, get_state_name(existing_proto->state),
+ existing_proto->global_seq,
+ existing_proto->peer_global_seq,
+ existing_proto->connect_seq,
+ existing_proto->client_cookie,
+ existing_proto->server_cookie);
+
+ if (!validate_peer_name(existing_conn->get_peer_name())) {
+ logger().error("{} server_connect: my peer_name doesn't match"
+ " the existing connection {}, abort", conn, existing_conn);
+ abort_in_fault();
+ }
+
+ if (existing_proto->state == state_t::REPLACING) {
+ logger().warn("{} server_connect: racing replace happened while"
+ " replacing existing connection {}, send wait.",
+ conn, *existing_conn);
+ return send_wait();
+ }
+
+ if (existing_proto->peer_global_seq > peer_global_seq) {
+ logger().warn("{} server_connect:"
+ " this is a stale connection, because peer_global_seq({})"
+ " < existing->peer_global_seq({}), close this connection"
+ " in favor of existing connection {}",
+ conn, peer_global_seq,
+ existing_proto->peer_global_seq, *existing_conn);
+ abort_in_fault();
+ }
+
+ if (existing_conn->policy.lossy) {
+ // existing connection can be thrown out in favor of this one
+ logger().warn("{} server_connect:"
+ " existing connection {} is a lossy channel. Close existing in favor of"
+ " this connection", conn, *existing_conn);
+ execute_establishing(existing_conn, true);
+ return seastar::make_ready_future<next_step_t>(next_step_t::ready);
+ }
+
+ if (existing_proto->server_cookie != 0) {
+ if (existing_proto->client_cookie != client_cookie) {
+ // Found previous session
+ // peer has reset and we're going to reuse the existing connection
+ // by replacing the socket
+ logger().warn("{} server_connect:"
+ " found new session (cs={})"
+ " when existing {} is with stale session (cs={}, ss={}),"
+ " peer must have reset",
+ conn, client_cookie,
+ *existing_conn, existing_proto->client_cookie,
+ existing_proto->server_cookie);
+ return reuse_connection(existing_proto, conn.policy.resetcheck);
+ } else {
+ // session establishment interrupted between client_ident and server_ident,
+ // continuing...
+ logger().warn("{} server_connect: found client session with existing {}"
+ " matched (cs={}, ss={}), continuing session establishment",
+ conn, *existing_conn, client_cookie, existing_proto->server_cookie);
+ return reuse_connection(existing_proto);
+ }
+ } else {
+ // Looks like a connection race: server and client are both connecting to
+ // each other at the same time.
+ if (existing_proto->client_cookie != client_cookie) {
+ if (existing_conn->peer_wins()) {
+ logger().warn("{} server_connect: connection race detected (cs={}, e_cs={}, ss=0)"
+ " and win, reusing existing {}",
+ conn, client_cookie, existing_proto->client_cookie, *existing_conn);
+ return reuse_connection(existing_proto);
+ } else {
+ logger().warn("{} server_connect: connection race detected (cs={}, e_cs={}, ss=0)"
+ " and lose to existing {}, ask client to wait",
+ conn, client_cookie, existing_proto->client_cookie, *existing_conn);
+ return existing_conn->keepalive().then([this] {
+ return send_wait();
+ });
+ }
+ } else {
+ logger().warn("{} server_connect: found client session with existing {}"
+ " matched (cs={}, ss={}), continuing session establishment",
+ conn, *existing_conn, client_cookie, existing_proto->server_cookie);
+ return reuse_connection(existing_proto);
+ }
+ }
+}
+
+seastar::future<ProtocolV2::next_step_t>
+ProtocolV2::server_connect()
+{
+ return read_frame_payload().then([this] {
+ // handle_client_ident() logic
+ auto client_ident = ClientIdentFrame::Decode(rx_segments_data.back());
+ logger().debug("{} GOT ClientIdentFrame: addrs={}, target={},"
+ " gid={}, gs={}, features_supported={},"
+ " features_required={}, flags={}, cookie={}",
+ conn, client_ident.addrs(), client_ident.target_addr(),
+ client_ident.gid(), client_ident.global_seq(),
+ client_ident.supported_features(),
+ client_ident.required_features(),
+ client_ident.flags(), client_ident.cookie());
+
+ if (client_ident.addrs().empty() ||
+ client_ident.addrs().front() == entity_addr_t()) {
+ logger().warn("{} oops, client_ident.addrs() is empty", conn);
+ throw std::system_error(
+ make_error_code(crimson::net::error::bad_peer_address));
+ }
+ if (!messenger.get_myaddrs().contains(client_ident.target_addr())) {
+ logger().warn("{} peer is trying to reach {} which is not us ({})",
+ conn, client_ident.target_addr(), messenger.get_myaddrs());
+ throw std::system_error(
+ make_error_code(crimson::net::error::bad_peer_address));
+ }
+ // TODO: change peer_addr to entity_addrvec_t
+ entity_addr_t paddr = client_ident.addrs().front();
+ if ((paddr.is_msgr2() || paddr.is_any()) &&
+ paddr.is_same_host(conn.target_addr)) {
+ // good
+ } else {
+ logger().warn("{} peer's address {} is not v2 or not the same host with {}",
+ conn, paddr, conn.target_addr);
+ throw std::system_error(
+ make_error_code(crimson::net::error::bad_peer_address));
+ }
+ conn.peer_addr = paddr;
+ logger().debug("{} UPDATE: peer_addr={}", conn, conn.peer_addr);
+ conn.target_addr = conn.peer_addr;
+ if (!conn.policy.lossy && !conn.policy.server && conn.target_addr.get_port() <= 0) {
+ logger().warn("{} we don't know how to reconnect to peer {}",
+ conn, conn.target_addr);
+ throw std::system_error(
+ make_error_code(crimson::net::error::bad_peer_address));
+ }
+
+ if (conn.get_peer_id() != entity_name_t::NEW &&
+ conn.get_peer_id() != client_ident.gid()) {
+ logger().error("{} client_ident peer_id ({}) does not match"
+ " what it should be ({}) during accepting, abort",
+ conn, client_ident.gid(), conn.get_peer_id());
+ abort_in_fault();
+ }
+ conn.set_peer_id(client_ident.gid());
+ client_cookie = client_ident.cookie();
+
+ uint64_t feat_missing =
+ (conn.policy.features_required | msgr2_required) &
+ ~(uint64_t)client_ident.supported_features();
+ if (feat_missing) {
+ auto ident_missing_features = IdentMissingFeaturesFrame::Encode(feat_missing);
+ logger().warn("{} WRITE IdentMissingFeaturesFrame: features={} (peer missing)",
+ conn, feat_missing);
+ return write_frame(ident_missing_features).then([] {
+ return next_step_t::wait;
+ });
+ }
+ connection_features =
+ client_ident.supported_features() & conn.policy.features_supported;
+ logger().debug("{} UPDATE: connection_features={}", conn, connection_features);
+
+ peer_global_seq = client_ident.global_seq();
+
+ // Looks good so far, let's check if there is already an existing connection
+ // to this peer.
+
+ SocketConnectionRef existing_conn = messenger.lookup_conn(conn.peer_addr);
+
+ if (existing_conn) {
+ if (existing_conn->protocol->proto_type != proto_t::v2) {
+ logger().warn("{} existing connection {} proto version is {}, close existing",
+ conn, *existing_conn,
+ static_cast<int>(existing_conn->protocol->proto_type));
+ // should unregister the existing from msgr atomically
+ // NOTE: this is following async messenger logic, but we may miss the reset event.
+ execute_establishing(existing_conn, false);
+ return seastar::make_ready_future<next_step_t>(next_step_t::ready);
+ } else {
+ return handle_existing_connection(existing_conn);
+ }
+ } else {
+ execute_establishing(nullptr, true);
+ return seastar::make_ready_future<next_step_t>(next_step_t::ready);
+ }
+ });
+}
+
+seastar::future<ProtocolV2::next_step_t>
+ProtocolV2::read_reconnect()
+{
+ return read_main_preamble()
+ .then([this] (Tag tag) {
+ expect_tag(Tag::SESSION_RECONNECT, tag, conn, "read_reconnect");
+ return server_reconnect();
+ });
+}
+
+seastar::future<ProtocolV2::next_step_t>
+ProtocolV2::send_retry(uint64_t connect_seq)
+{
+ auto retry = RetryFrame::Encode(connect_seq);
+ logger().warn("{} WRITE RetryFrame: cs={}", conn, connect_seq);
+ return write_frame(retry).then([this] {
+ return read_reconnect();
+ });
+}
+
+seastar::future<ProtocolV2::next_step_t>
+ProtocolV2::send_retry_global(uint64_t global_seq)
+{
+ auto retry = RetryGlobalFrame::Encode(global_seq);
+ logger().warn("{} WRITE RetryGlobalFrame: gs={}", conn, global_seq);
+ return write_frame(retry).then([this] {
+ return read_reconnect();
+ });
+}
+
+seastar::future<ProtocolV2::next_step_t>
+ProtocolV2::send_reset(bool full)
+{
+ auto reset = ResetFrame::Encode(full);
+ logger().warn("{} WRITE ResetFrame: full={}", conn, full);
+ return write_frame(reset).then([this] {
+ return read_main_preamble();
+ }).then([this] (Tag tag) {
+ expect_tag(Tag::CLIENT_IDENT, tag, conn, "post_send_reset");
+ return server_connect();
+ });
+}
+
+seastar::future<ProtocolV2::next_step_t>
+ProtocolV2::server_reconnect()
+{
+ return read_frame_payload().then([this] {
+ // handle_reconnect() logic
+ auto reconnect = ReconnectFrame::Decode(rx_segments_data.back());
+
+ logger().debug("{} GOT ReconnectFrame: addrs={}, client_cookie={},"
+ " server_cookie={}, gs={}, cs={}, msg_seq={}",
+ conn, reconnect.addrs(),
+ reconnect.client_cookie(), reconnect.server_cookie(),
+ reconnect.global_seq(), reconnect.connect_seq(),
+ reconnect.msg_seq());
+
+ // can peer_addrs be changed on-the-fly?
+ // TODO: change peer_addr to entity_addrvec_t
+ entity_addr_t paddr = reconnect.addrs().front();
+ if (paddr.is_msgr2() || paddr.is_any()) {
+ // good
+ } else {
+ logger().warn("{} peer's address {} is not v2", conn, paddr);
+ throw std::system_error(
+ make_error_code(crimson::net::error::bad_peer_address));
+ }
+ if (conn.peer_addr == entity_addr_t()) {
+ conn.peer_addr = paddr;
+ } else if (conn.peer_addr != paddr) {
+ logger().error("{} peer identifies as {}, while conn.peer_addr={},"
+ " reconnect failed",
+ conn, paddr, conn.peer_addr);
+ throw std::system_error(
+ make_error_code(crimson::net::error::bad_peer_address));
+ }
+ peer_global_seq = reconnect.global_seq();
+
+ SocketConnectionRef existing_conn = messenger.lookup_conn(conn.peer_addr);
+
+ if (!existing_conn) {
+ // there is no existing connection therefore cannot reconnect to previous
+ // session
+ logger().warn("{} server_reconnect: no existing connection from address {},"
+ " reseting client", conn, conn.peer_addr);
+ return send_reset(true);
+ }
+
+ if (existing_conn->protocol->proto_type != proto_t::v2) {
+ logger().warn("{} server_reconnect: existing connection {} proto version is {},"
+ "close existing and reset client.",
+ conn, *existing_conn,
+ static_cast<int>(existing_conn->protocol->proto_type));
+ // NOTE: this is following async messenger logic, but we may miss the reset event.
+ existing_conn->mark_down();
+ return send_reset(true);
+ }
+
+ ProtocolV2 *existing_proto = dynamic_cast<ProtocolV2*>(
+ existing_conn->protocol.get());
+ ceph_assert(existing_proto);
+ logger().debug("{}(gs={}, pgs={}, cs={}, cc={}, sc={}) re-connecting,"
+ " found existing {}(state={}, gs={}, pgs={}, cs={}, cc={}, sc={})",
+ conn, global_seq, peer_global_seq, reconnect.connect_seq(),
+ reconnect.client_cookie(), reconnect.server_cookie(),
+ existing_conn,
+ get_state_name(existing_proto->state),
+ existing_proto->global_seq,
+ existing_proto->peer_global_seq,
+ existing_proto->connect_seq,
+ existing_proto->client_cookie,
+ existing_proto->server_cookie);
+
+ if (!validate_peer_name(existing_conn->get_peer_name())) {
+ logger().error("{} server_reconnect: my peer_name doesn't match"
+ " the existing connection {}, abort", conn, existing_conn);
+ abort_in_fault();
+ }
+
+ if (existing_proto->state == state_t::REPLACING) {
+ logger().warn("{} server_reconnect: racing replace happened while "
+ " replacing existing connection {}, retry global.",
+ conn, *existing_conn);
+ return send_retry_global(existing_proto->peer_global_seq);
+ }
+
+ if (existing_proto->client_cookie != reconnect.client_cookie()) {
+ logger().warn("{} server_reconnect:"
+ " client_cookie mismatch with existing connection {},"
+ " cc={} rcc={}. I must have reset, reseting client.",
+ conn, *existing_conn,
+ existing_proto->client_cookie, reconnect.client_cookie());
+ return send_reset(conn.policy.resetcheck);
+ } else if (existing_proto->server_cookie == 0) {
+ // this happens when:
+ // - a connects to b
+ // - a sends client_ident
+ // - b gets client_ident, sends server_ident and sets cookie X
+ // - connection fault
+ // - b reconnects to a with cookie X, connect_seq=1
+ // - a has cookie==0
+ logger().warn("{} server_reconnect: I was a client (cc={}) and didn't received the"
+ " server_ident with existing connection {}."
+ " Asking peer to resume session establishment",
+ conn, existing_proto->client_cookie, *existing_conn);
+ return send_reset(false);
+ }
+
+ if (existing_proto->peer_global_seq > reconnect.global_seq()) {
+ logger().warn("{} server_reconnect: stale global_seq: exist_pgs({}) > peer_gs({}),"
+ " with existing connection {},"
+ " ask client to retry global",
+ conn, existing_proto->peer_global_seq,
+ reconnect.global_seq(), *existing_conn);
+ return send_retry_global(existing_proto->peer_global_seq);
+ }
+
+ if (existing_proto->connect_seq > reconnect.connect_seq()) {
+ logger().warn("{} server_reconnect: stale peer connect_seq peer_cs({}) < exist_cs({}),"
+ " with existing connection {}, ask client to retry",
+ conn, reconnect.connect_seq(),
+ existing_proto->connect_seq, *existing_conn);
+ return send_retry(existing_proto->connect_seq);
+ } else if (existing_proto->connect_seq == reconnect.connect_seq()) {
+ // reconnect race: both peers are sending reconnect messages
+ if (existing_conn->peer_wins()) {
+ logger().warn("{} server_reconnect: reconnect race detected (cs={})"
+ " and win, reusing existing {}",
+ conn, reconnect.connect_seq(), *existing_conn);
+ return reuse_connection(
+ existing_proto, false,
+ true, reconnect.connect_seq(), reconnect.msg_seq());
+ } else {
+ logger().warn("{} server_reconnect: reconnect race detected (cs={})"
+ " and lose to existing {}, ask client to wait",
+ conn, reconnect.connect_seq(), *existing_conn);
+ return send_wait();
+ }
+ } else { // existing_proto->connect_seq < reconnect.connect_seq()
+ logger().warn("{} server_reconnect: stale exsiting connect_seq exist_cs({}) < peer_cs({}),"
+ " reusing existing {}",
+ conn, existing_proto->connect_seq,
+ reconnect.connect_seq(), *existing_conn);
+ return reuse_connection(
+ existing_proto, false,
+ true, reconnect.connect_seq(), reconnect.msg_seq());
+ }
+ });
+}
+
+void ProtocolV2::execute_accepting()
+{
+ trigger_state(state_t::ACCEPTING, write_state_t::none, false);
+ gate.dispatch_in_background("execute_accepting", *this, [this] {
+ return seastar::futurize_invoke([this] {
+ INTERCEPT_N_RW(custom_bp_t::SOCKET_ACCEPTED);
+ auth_meta = seastar::make_lw_shared<AuthConnectionMeta>();
+ session_stream_handlers = { nullptr, nullptr };
+ enable_recording();
+ return banner_exchange(false);
+ }).then([this] (auto&& ret) {
+ auto [_peer_type, _my_addr_from_peer] = std::move(ret);
+ ceph_assert(conn.get_peer_type() == 0);
+ conn.set_peer_type(_peer_type);
+
+ conn.policy = messenger.get_policy(_peer_type);
+ logger().info("{} UPDATE: peer_type={},"
+ " policy(lossy={} server={} standby={} resetcheck={})",
+ conn, ceph_entity_type_name(_peer_type),
+ conn.policy.lossy, conn.policy.server,
+ conn.policy.standby, conn.policy.resetcheck);
+ if (messenger.get_myaddr().get_port() != _my_addr_from_peer.get_port() ||
+ messenger.get_myaddr().get_nonce() != _my_addr_from_peer.get_nonce()) {
+ logger().warn("{} my_addr_from_peer {} port/nonce doesn't match myaddr {}",
+ conn, _my_addr_from_peer, messenger.get_myaddr());
+ throw std::system_error(
+ make_error_code(crimson::net::error::bad_peer_address));
+ }
+ return messenger.learned_addr(_my_addr_from_peer, conn);
+ }).then([this] {
+ return server_auth();
+ }).then([this] {
+ return read_main_preamble();
+ }).then([this] (Tag tag) {
+ switch (tag) {
+ case Tag::CLIENT_IDENT:
+ return server_connect();
+ case Tag::SESSION_RECONNECT:
+ return server_reconnect();
+ default: {
+ unexpected_tag(tag, conn, "post_server_auth");
+ return seastar::make_ready_future<next_step_t>(next_step_t::none);
+ }
+ }
+ }).then([this] (next_step_t next) {
+ switch (next) {
+ case next_step_t::ready:
+ assert(state != state_t::ACCEPTING);
+ break;
+ case next_step_t::wait:
+ if (unlikely(state != state_t::ACCEPTING)) {
+ logger().debug("{} triggered {} at the end of execute_accepting()",
+ conn, get_state_name(state));
+ abort_protocol();
+ }
+ logger().info("{} execute_accepting(): going to SERVER_WAIT", conn);
+ execute_server_wait();
+ break;
+ default:
+ ceph_abort("impossible next step");
+ }
+ }).handle_exception([this] (std::exception_ptr eptr) {
+ logger().info("{} execute_accepting(): fault at {}, going to CLOSING -- {}",
+ conn, get_state_name(state), eptr);
+ close(false);
+ });
+ });
+}
+
+// CONNECTING or ACCEPTING state
+
+seastar::future<> ProtocolV2::finish_auth()
+{
+ ceph_assert(auth_meta);
+
+ const auto sig = auth_meta->session_key.empty() ? sha256_digest_t() :
+ auth_meta->session_key.hmac_sha256(nullptr, rxbuf);
+ auto sig_frame = AuthSignatureFrame::Encode(sig);
+ ceph_assert(record_io);
+ record_io = false;
+ rxbuf.clear();
+ logger().debug("{} WRITE AuthSignatureFrame: signature={}", conn, sig);
+ return write_frame(sig_frame).then([this] {
+ return read_main_preamble();
+ }).then([this] (Tag tag) {
+ expect_tag(Tag::AUTH_SIGNATURE, tag, conn, "post_finish_auth");
+ return read_frame_payload();
+ }).then([this] {
+ // handle_auth_signature() logic
+ auto sig_frame = AuthSignatureFrame::Decode(rx_segments_data.back());
+ logger().debug("{} GOT AuthSignatureFrame: signature={}", conn, sig_frame.signature());
+
+ const auto actual_tx_sig = auth_meta->session_key.empty() ?
+ sha256_digest_t() : auth_meta->session_key.hmac_sha256(nullptr, txbuf);
+ if (sig_frame.signature() != actual_tx_sig) {
+ logger().warn("{} pre-auth signature mismatch actual_tx_sig={}"
+ " sig_frame.signature()={}",
+ conn, actual_tx_sig, sig_frame.signature());
+ abort_in_fault();
+ }
+ txbuf.clear();
+ });
+}
+
+// ESTABLISHING
+
+void ProtocolV2::execute_establishing(
+ SocketConnectionRef existing_conn, bool dispatch_reset) {
+ if (unlikely(state != state_t::ACCEPTING)) {
+ logger().debug("{} triggered {} before execute_establishing()",
+ conn, get_state_name(state));
+ abort_protocol();
+ }
+
+ auto accept_me = [this] {
+ messenger.register_conn(
+ seastar::static_pointer_cast<SocketConnection>(
+ conn.shared_from_this()));
+ messenger.unaccept_conn(
+ seastar::static_pointer_cast<SocketConnection>(
+ conn.shared_from_this()));
+ };
+
+ trigger_state(state_t::ESTABLISHING, write_state_t::delay, false);
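+ // when replacing, the old connection's close callback runs accept_me so the
+ // registration switch is atomic from the messenger's point of view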
+ if (existing_conn) {
+ existing_conn->protocol->close(dispatch_reset, std::move(accept_me));
+ if (unlikely(state != state_t::ESTABLISHING)) {
+ logger().warn("{} triggered {} during execute_establishing(), "
+ "the accept event will not be delivered!",
+ conn, get_state_name(state));
+ abort_protocol();
+ }
+ } else {
+ accept_me();
+ }
+
+ dispatchers.ms_handle_accept(
+ seastar::static_pointer_cast<SocketConnection>(conn.shared_from_this()));
+
+ gated_execute("execute_establishing", [this] {
+ return seastar::futurize_invoke([this] {
+ return send_server_ident();
+ }).then([this] {
+ if (unlikely(state != state_t::ESTABLISHING)) {
+ logger().debug("{} triggered {} at the end of execute_establishing()",
+ conn, get_state_name(state));
+ abort_protocol();
+ }
+ logger().info("{} established: gs={}, pgs={}, cs={}, client_cookie={},"
+ " server_cookie={}, in_seq={}, out_seq={}, out_q={}",
+ conn, global_seq, peer_global_seq, connect_seq,
+ client_cookie, server_cookie, conn.in_seq,
+ conn.out_seq, conn.out_q.size());
+ execute_ready(false);
+ }).handle_exception([this] (std::exception_ptr eptr) {
+ if (state != state_t::ESTABLISHING) {
+ logger().info("{} execute_establishing() protocol aborted at {} -- {}",
+ conn, get_state_name(state), eptr);
+ assert(state == state_t::CLOSING ||
+ state == state_t::REPLACING);
+ return;
+ }
+ fault(false, "execute_establishing()", eptr);
+ });
+ });
+}
+
+// ESTABLISHING or REPLACING state
+
+seastar::future<>
+ProtocolV2::send_server_ident()
+{
+ // send_server_ident() logic
+
+ // referred to async-conn v2: do not assign gs to global_seq
+ return messenger.get_global_seq().then([this] (auto gs) {
+ logger().debug("{} UPDATE: gs={} for server ident", conn, global_seq);
+
+ // this is required for the case when this connection is being replaced
+ requeue_up_to(0);
+ conn.in_seq = 0;
+
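+ // a non-zero server_cookie identifies this session for future reconnects;
+ // lossy connections never reconnect, so their cookie stays 0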
+ if (!conn.policy.lossy) {
+ server_cookie = ceph::util::generate_random_number<uint64_t>(1, -1ll);
+ }
+
+ uint64_t flags = 0;
+ if (conn.policy.lossy) {
+ flags = flags | CEPH_MSG_CONNECT_LOSSY;
+ }
+
+ auto server_ident = ServerIdentFrame::Encode(
+ messenger.get_myaddrs(),
+ messenger.get_myname().num(),
+ gs,
+ conn.policy.features_supported,
+ conn.policy.features_required | msgr2_required,
+ flags,
+ server_cookie);
+
+ logger().debug("{} WRITE ServerIdentFrame: addrs={}, gid={},"
+ " gs={}, features_supported={}, features_required={},"
+ " flags={}, cookie={}",
+ conn, messenger.get_myaddrs(), messenger.get_myname().num(),
+ gs, conn.policy.features_supported,
+ conn.policy.features_required | msgr2_required,
+ flags, server_cookie);
+
+ conn.set_features(connection_features);
+
+ return write_frame(server_ident);
+ });
+}
+
+// REPLACING state
+
+void ProtocolV2::trigger_replacing(bool reconnect,
+ bool do_reset,
+ SocketRef&& new_socket,
+ AuthConnectionMetaRef&& new_auth_meta,
+ ceph::crypto::onwire::rxtx_t new_rxtx,
+ uint64_t new_peer_global_seq,
+ uint64_t new_client_cookie,
+ entity_name_t new_peer_name,
+ uint64_t new_conn_features,
+ bool tx_is_rev1,
+ bool rx_is_rev1,
+ uint64_t new_connect_seq,
+ uint64_t new_msg_seq)
+{
+ trigger_state(state_t::REPLACING, write_state_t::delay, false);
+ if (socket) {
+ socket->shutdown();
+ }
+ dispatchers.ms_handle_accept(
+ seastar::static_pointer_cast<SocketConnection>(conn.shared_from_this()));
+ gate.dispatch_in_background("trigger_replacing", *this,
+ [this,
+ reconnect,
+ do_reset,
+ new_socket = std::move(new_socket),
+ new_auth_meta = std::move(new_auth_meta),
+ new_rxtx = std::move(new_rxtx),
+ tx_is_rev1, rx_is_rev1,
+ new_client_cookie, new_peer_name,
+ new_conn_features, new_peer_global_seq,
+ new_connect_seq, new_msg_seq] () mutable {
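+ // wait for the pending write to exit and the previous execution to finish
+ // before swapping in the new socket and session state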
+ return wait_write_exit().then([this, do_reset] {
+ if (do_reset) {
+ reset_session(true);
+ }
+ protocol_timer.cancel();
+ return execution_done.get_future();
+ }).then([this,
+ reconnect,
+ new_socket = std::move(new_socket),
+ new_auth_meta = std::move(new_auth_meta),
+ new_rxtx = std::move(new_rxtx),
+ tx_is_rev1, rx_is_rev1,
+ new_client_cookie, new_peer_name,
+ new_conn_features, new_peer_global_seq,
+ new_connect_seq, new_msg_seq] () mutable {
+ if (unlikely(state != state_t::REPLACING)) {
+ return new_socket->close().then([sock = std::move(new_socket)] {
+ abort_protocol();
+ });
+ }
+
+ if (socket) {
+ gate.dispatch_in_background("close_socket_replacing", *this,
+ [sock = std::move(socket)] () mutable {
+ return sock->close().then([sock = std::move(sock)] {});
+ });
+ }
+ socket = std::move(new_socket);
+ auth_meta = std::move(new_auth_meta);
+ session_stream_handlers = std::move(new_rxtx);
+ record_io = false;
+ peer_global_seq = new_peer_global_seq;
+
+ if (reconnect) {
+ connect_seq = new_connect_seq;
+ // send_reconnect_ok() logic
+ requeue_up_to(new_msg_seq);
+ auto reconnect_ok = ReconnectOkFrame::Encode(conn.in_seq);
+ logger().debug("{} WRITE ReconnectOkFrame: msg_seq={}", conn, conn.in_seq);
+ return write_frame(reconnect_ok);
+ } else {
+ client_cookie = new_client_cookie;
+ assert(conn.get_peer_type() == new_peer_name.type());
+ if (conn.get_peer_id() == entity_name_t::NEW) {
+ conn.set_peer_id(new_peer_name.num());
+ }
+ connection_features = new_conn_features;
+ tx_frame_asm.set_is_rev1(tx_is_rev1);
+ rx_frame_asm.set_is_rev1(rx_is_rev1);
+ return send_server_ident();
+ }
+ }).then([this, reconnect] {
+ if (unlikely(state != state_t::REPLACING)) {
+ logger().debug("{} triggered {} at the end of trigger_replacing()",
+ conn, get_state_name(state));
+ abort_protocol();
+ }
+ logger().info("{} replaced ({}):"
+ " gs={}, pgs={}, cs={}, client_cookie={}, server_cookie={},"
+ " in_seq={}, out_seq={}, out_q={}",
+ conn, reconnect ? "reconnected" : "connected",
+ global_seq, peer_global_seq, connect_seq, client_cookie,
+ server_cookie, conn.in_seq, conn.out_seq, conn.out_q.size());
+ execute_ready(false);
+ }).handle_exception([this] (std::exception_ptr eptr) {
+ if (state != state_t::REPLACING) {
+ logger().info("{} trigger_replacing(): protocol aborted at {} -- {}",
+ conn, get_state_name(state), eptr);
+ assert(state == state_t::CLOSING);
+ return;
+ }
+ fault(true, "trigger_replacing()", eptr);
+ });
+ });
+}
+
+// READY state
+
+ceph::bufferlist ProtocolV2::do_sweep_messages(
+ const std::deque<MessageRef>& msgs,
+ size_t num_msgs,
+ bool require_keepalive,
+ std::optional<utime_t> _keepalive_ack,
+ bool require_ack)
+{
+ ceph::bufferlist bl;
+
+ if (unlikely(require_keepalive)) {
+ auto keepalive_frame = KeepAliveFrame::Encode();
+ bl.append(keepalive_frame.get_buffer(tx_frame_asm));
+ INTERCEPT_FRAME(ceph::msgr::v2::Tag::KEEPALIVE2, bp_type_t::WRITE);
+ }
+
+ if (unlikely(_keepalive_ack.has_value())) {
+ auto keepalive_ack_frame = KeepAliveFrameAck::Encode(*_keepalive_ack);
+ bl.append(keepalive_ack_frame.get_buffer(tx_frame_asm));
+ INTERCEPT_FRAME(ceph::msgr::v2::Tag::KEEPALIVE2_ACK, bp_type_t::WRITE);
+ }
+
+ if (require_ack && !num_msgs) {
+ auto ack_frame = AckFrame::Encode(conn.in_seq);
+ bl.append(ack_frame.get_buffer(tx_frame_asm));
+ INTERCEPT_FRAME(ceph::msgr::v2::Tag::ACK, bp_type_t::WRITE);
+ }
+
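+ // encode the queued messages in order, assigning each the next out_seq and
+ // piggybacking the current in_seq as ack_seq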
+ std::for_each(msgs.begin(), msgs.begin()+num_msgs, [this, &bl](const MessageRef& msg) {
+ // TODO: move to common code
+ // set the source entity name
+ msg->get_header().src = messenger.get_myname();
+
+ msg->encode(conn.features, 0);
+
+ ceph_assert(!msg->get_seq() && "message already has seq");
+ msg->set_seq(++conn.out_seq);
+
+ ceph_msg_header &header = msg->get_header();
+ ceph_msg_footer &footer = msg->get_footer();
+
+ ceph_msg_header2 header2{header.seq, header.tid,
+ header.type, header.priority,
+ header.version,
+ init_le32(0), header.data_off,
+ init_le64(conn.in_seq),
+ footer.flags, header.compat_version,
+ header.reserved};
+
+ auto message = MessageFrame::Encode(header2,
+ msg->get_payload(), msg->get_middle(), msg->get_data());
+ logger().debug("{} --> #{} === {} ({})",
+ conn, msg->get_seq(), *msg, msg->get_type());
+ bl.append(message.get_buffer(tx_frame_asm));
+ INTERCEPT_FRAME(ceph::msgr::v2::Tag::MESSAGE, bp_type_t::WRITE);
+ });
+
+ return bl;
+}
+
+seastar::future<> ProtocolV2::read_message(utime_t throttle_stamp)
+{
+ return read_frame_payload()
+ .then([this, throttle_stamp] {
+ utime_t recv_stamp{seastar::lowres_system_clock::now()};
+
+ // we need to get the size before std::moving segments data
+ const size_t cur_msg_size = get_current_msg_size();
+ auto msg_frame = MessageFrame::Decode(rx_segments_data);
+ // XXX: paranoid copy just to avoid oops
+ ceph_msg_header2 current_header = msg_frame.header();
+
+ logger().trace("{} got {} + {} + {} byte message,"
+ " envelope type={} src={} off={} seq={}",
+ conn, msg_frame.front_len(), msg_frame.middle_len(),
+ msg_frame.data_len(), current_header.type, conn.get_peer_name(),
+ current_header.data_off, current_header.seq);
+
+ ceph_msg_header header{current_header.seq,
+ current_header.tid,
+ current_header.type,
+ current_header.priority,
+ current_header.version,
+ init_le32(msg_frame.front_len()),
+ init_le32(msg_frame.middle_len()),
+ init_le32(msg_frame.data_len()),
+ current_header.data_off,
+ conn.get_peer_name(),
+ current_header.compat_version,
+ current_header.reserved,
+ init_le32(0)};
+ ceph_msg_footer footer{init_le32(0), init_le32(0),
+ init_le32(0), init_le64(0), current_header.flags};
+
+ auto conn_ref = seastar::static_pointer_cast<SocketConnection>(
+ conn.shared_from_this());
+ Message *message = decode_message(nullptr, 0, header, footer,
+ msg_frame.front(), msg_frame.middle(), msg_frame.data(), conn_ref);
+ if (!message) {
+ logger().warn("{} decode message failed", conn);
+ abort_in_fault();
+ }
+
+ // store reservation size in message, so we don't get confused
+ // by messages entering the dispatch queue through other paths.
+ message->set_dispatch_throttle_size(cur_msg_size);
+
+ message->set_throttle_stamp(throttle_stamp);
+ message->set_recv_stamp(recv_stamp);
+ message->set_recv_complete_stamp(utime_t{seastar::lowres_system_clock::now()});
+
+ // check received seq#. if it is old, drop the message.
+ // note that incoming messages may skip ahead. this is convenient for the
+ // client side queueing because messages can't be renumbered, but the (kernel)
+ // client will occasionally pull a message out of the sent queue to send
+ // elsewhere. in that case it doesn't matter if we "got" it or not.
+ uint64_t cur_seq = conn.in_seq;
+ if (message->get_seq() <= cur_seq) {
+ logger().error("{} got old message {} <= {} {}, discarding",
+ conn, message->get_seq(), cur_seq, *message);
+ if (HAVE_FEATURE(conn.features, RECONNECT_SEQ) &&
+ local_conf()->ms_die_on_old_message) {
+ ceph_assert(0 == "old msgs despite reconnect_seq feature");
+ }
+ return seastar::now();
+ } else if (message->get_seq() > cur_seq + 1) {
+ logger().error("{} missed message? skipped from seq {} to {}",
+ conn, cur_seq, message->get_seq());
+ if (local_conf()->ms_die_on_skipped_message) {
+ ceph_assert(0 == "skipped incoming seq");
+ }
+ }
+
+ // note last received message.
+ conn.in_seq = message->get_seq();
+ logger().debug("{} <== #{} === {} ({})",
+ conn, message->get_seq(), *message, message->get_type());
+ notify_ack();
+ ack_writes(current_header.ack_seq);
+
+ // TODO: change MessageRef with seastar::shared_ptr
+ auto msg_ref = MessageRef{message, false};
+ // throttle the reading process by the returned future
+ return dispatchers.ms_dispatch(conn_ref, std::move(msg_ref));
+ });
+}
+
+void ProtocolV2::execute_ready(bool dispatch_connect)
+{
+ assert(conn.policy.lossy || (client_cookie != 0 && server_cookie != 0));
+ trigger_state(state_t::READY, write_state_t::open, false);
+ if (dispatch_connect) {
+ dispatchers.ms_handle_connect(
+ seastar::static_pointer_cast<SocketConnection>(conn.shared_from_this()));
+ }
+#ifdef UNIT_TESTS_BUILT
+ if (conn.interceptor) {
+ conn.interceptor->register_conn_ready(conn);
+ }
+#endif
+ gated_execute("execute_ready", [this] {
+ protocol_timer.cancel();
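+ // main read loop: read each frame preamble and dispatch on its tag until the
+ // protocol leaves READY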
+ return seastar::keep_doing([this] {
+ return read_main_preamble()
+ .then([this] (Tag tag) {
+ switch (tag) {
+ case Tag::MESSAGE: {
+ return seastar::futurize_invoke([this] {
+ // throttle_message() logic
+ if (!conn.policy.throttler_messages) {
+ return seastar::now();
+ }
+ // TODO: message throttler
+ ceph_assert(false);
+ return seastar::now();
+ }).then([this] {
+ // throttle_bytes() logic
+ if (!conn.policy.throttler_bytes) {
+ return seastar::now();
+ }
+ size_t cur_msg_size = get_current_msg_size();
+ if (!cur_msg_size) {
+ return seastar::now();
+ }
+ logger().trace("{} wants {} bytes from policy throttler {}/{}",
+ conn, cur_msg_size,
+ conn.policy.throttler_bytes->get_current(),
+ conn.policy.throttler_bytes->get_max());
+ return conn.policy.throttler_bytes->get(cur_msg_size);
+ }).then([this] {
+ // TODO: throttle_dispatch_queue() logic
+ utime_t throttle_stamp{seastar::lowres_system_clock::now()};
+ return read_message(throttle_stamp);
+ });
+ }
+ case Tag::ACK:
+ return read_frame_payload().then([this] {
+ // handle_message_ack() logic
+ auto ack = AckFrame::Decode(rx_segments_data.back());
+ logger().debug("{} GOT AckFrame: seq={}", conn, ack.seq());
+ ack_writes(ack.seq());
+ });
+ case Tag::KEEPALIVE2:
+ return read_frame_payload().then([this] {
+ // handle_keepalive2() logic
+ auto keepalive_frame = KeepAliveFrame::Decode(rx_segments_data.back());
+ logger().debug("{} GOT KeepAliveFrame: timestamp={}",
+ conn, keepalive_frame.timestamp());
+ notify_keepalive_ack(keepalive_frame.timestamp());
+ conn.set_last_keepalive(seastar::lowres_system_clock::now());
+ });
+ case Tag::KEEPALIVE2_ACK:
+ return read_frame_payload().then([this] {
+ // handle_keepalive2_ack() logic
+ auto keepalive_ack_frame = KeepAliveFrameAck::Decode(rx_segments_data.back());
+ conn.set_last_keepalive_ack(
+ seastar::lowres_system_clock::time_point{keepalive_ack_frame.timestamp()});
+ logger().debug("{} GOT KeepAliveFrameAck: timestamp={}",
+ conn, conn.last_keepalive_ack);
+ });
+ default: {
+ unexpected_tag(tag, conn, "execute_ready");
+ return seastar::now();
+ }
+ }
+ });
+ }).handle_exception([this] (std::exception_ptr eptr) {
+ if (state != state_t::READY) {
+ logger().info("{} execute_ready(): protocol aborted at {} -- {}",
+ conn, get_state_name(state), eptr);
+ assert(state == state_t::REPLACING ||
+ state == state_t::CLOSING);
+ return;
+ }
+ fault(false, "execute_ready()", eptr);
+ });
+ });
+}
+
+// STANDBY state
+
+void ProtocolV2::execute_standby()
+{
+ trigger_state(state_t::STANDBY, write_state_t::delay, true);
+ if (socket) {
+ socket->shutdown();
+ }
+}
+
+void ProtocolV2::notify_write()
+{
+ if (unlikely(state == state_t::STANDBY && !conn.policy.server)) {
+ logger().info("{} notify_write(): at {}, going to CONNECTING",
+ conn, get_state_name(state));
+ execute_connecting();
+ }
+}
+
+// WAIT state
+
+void ProtocolV2::execute_wait(bool max_backoff)
+{
+ trigger_state(state_t::WAIT, write_state_t::delay, true);
+ if (socket) {
+ socket->shutdown();
+ }
+ gated_execute("execute_wait", [this, max_backoff] {
+ double backoff = protocol_timer.last_dur();
+ if (max_backoff) {
+ backoff = local_conf().get_val<double>("ms_max_backoff");
+ } else if (backoff > 0) {
+ backoff = std::min(local_conf().get_val<double>("ms_max_backoff"), 2 * backoff);
+ } else {
+ backoff = local_conf().get_val<double>("ms_initial_backoff");
+ }
+ return protocol_timer.backoff(backoff).then([this] {
+ if (unlikely(state != state_t::WAIT)) {
+ logger().debug("{} triggered {} at the end of execute_wait()",
+ conn, get_state_name(state));
+ abort_protocol();
+ }
+ logger().info("{} execute_wait(): going to CONNECTING", conn);
+ execute_connecting();
+ }).handle_exception([this] (std::exception_ptr eptr) {
+ logger().info("{} execute_wait(): protocol aborted at {} -- {}",
+ conn, get_state_name(state), eptr);
+ assert(state == state_t::REPLACING ||
+ state == state_t::CLOSING);
+ });
+ });
+}
+
+// SERVER_WAIT state
+
+void ProtocolV2::execute_server_wait()
+{
+ trigger_state(state_t::SERVER_WAIT, write_state_t::delay, false);
+ gated_execute("execute_server_wait", [this] {
+ return read_exactly(1).then([this] (auto bl) {
+ logger().warn("{} SERVER_WAIT got read, abort", conn);
+ abort_in_fault();
+ }).handle_exception([this] (std::exception_ptr eptr) {
+ logger().info("{} execute_server_wait(): fault at {}, going to CLOSING -- {}",
+ conn, get_state_name(state), eptr);
+ close(false);
+ });
+ });
+}
+
+// CLOSING state
+
+void ProtocolV2::trigger_close()
+{
+ messenger.closing_conn(
+ seastar::static_pointer_cast<SocketConnection>(
+ conn.shared_from_this()));
+
+ if (state == state_t::ACCEPTING || state == state_t::SERVER_WAIT) {
+ messenger.unaccept_conn(
+ seastar::static_pointer_cast<SocketConnection>(
+ conn.shared_from_this()));
+ } else if (state >= state_t::ESTABLISHING && state < state_t::CLOSING) {
+ messenger.unregister_conn(
+ seastar::static_pointer_cast<SocketConnection>(
+ conn.shared_from_this()));
+ } else {
+ // cannot happen
+ ceph_assert(false);
+ }
+
+ protocol_timer.cancel();
+ trigger_state(state_t::CLOSING, write_state_t::drop, false);
+}
+
+void ProtocolV2::on_closed()
+{
+ messenger.closed_conn(
+ seastar::static_pointer_cast<SocketConnection>(
+ conn.shared_from_this()));
+}
+
+void ProtocolV2::print(std::ostream& out) const
+{
+ out << conn;
+}
+
+} // namespace crimson::net
diff --git a/src/crimson/net/ProtocolV2.h b/src/crimson/net/ProtocolV2.h
new file mode 100644
index 000000000..be9a22816
--- /dev/null
+++ b/src/crimson/net/ProtocolV2.h
@@ -0,0 +1,225 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/sleep.hh>
+
+#include "Protocol.h"
+#include "msg/async/frames_v2.h"
+#include "msg/async/crypto_onwire.h"
+
+namespace crimson::net {
+
+class ProtocolV2 final : public Protocol {
+ public:
+ ProtocolV2(ChainedDispatchers& dispatchers,
+ SocketConnection& conn,
+ SocketMessenger& messenger);
+ ~ProtocolV2() override;
+ void print(std::ostream&) const final;
+ private:
+ void on_closed() override;
+ bool is_connected() const override;
+
+ void start_connect(const entity_addr_t& peer_addr,
+ const entity_name_t& peer_name) override;
+
+ void start_accept(SocketRef&& socket,
+ const entity_addr_t& peer_addr) override;
+
+ void trigger_close() override;
+
+ ceph::bufferlist do_sweep_messages(
+ const std::deque<MessageRef>& msgs,
+ size_t num_msgs,
+ bool require_keepalive,
+ std::optional<utime_t> keepalive_ack,
+ bool require_ack) override;
+
+ void notify_write() override;
+
+ private:
+ SocketMessenger &messenger;
+
+ enum class state_t {
+ NONE = 0,
+ ACCEPTING,
+ SERVER_WAIT,
+ ESTABLISHING,
+ CONNECTING,
+ READY,
+ STANDBY,
+ WAIT,
+ REPLACING,
+ CLOSING
+ };
+ state_t state = state_t::NONE;
+
+ static const char *get_state_name(state_t state) {
+ const char *const statenames[] = {"NONE",
+ "ACCEPTING",
+ "SERVER_WAIT",
+ "ESTABLISHING",
+ "CONNECTING",
+ "READY",
+ "STANDBY",
+ "WAIT",
+ "REPLACING",
+ "CLOSING"};
+ return statenames[static_cast<int>(state)];
+ }
+
+ void trigger_state(state_t state, write_state_t write_state, bool reentrant);
+
+ uint64_t connection_features = 0;
+ uint64_t peer_required_features = 0;
+
+ uint64_t client_cookie = 0;
+ uint64_t server_cookie = 0;
+ uint64_t global_seq = 0;
+ uint64_t peer_global_seq = 0;
+ uint64_t connect_seq = 0;
+
+ seastar::shared_future<> execution_done = seastar::now();
+
+ template <typename Func>
+ void gated_execute(const char* what, Func&& func) {
+ gate.dispatch_in_background(what, *this, [this, &func] {
+ execution_done = seastar::futurize_invoke(std::forward<Func>(func));
+ return execution_done.get_future();
+ });
+ }
+
+ class Timer {
+ double last_dur_ = 0.0;
+ const SocketConnection& conn;
+ std::optional<seastar::abort_source> as;
+ public:
+ Timer(SocketConnection& conn) : conn(conn) {}
+ double last_dur() const { return last_dur_; }
+ seastar::future<> backoff(double seconds);
+ void cancel() {
+ last_dur_ = 0.0;
+ if (as) {
+ as->request_abort();
+ as = std::nullopt;
+ }
+ }
+ };
+ Timer protocol_timer;
+
+ // TODO: Frame related implementations, probably to a separate class.
+ private:
+ bool record_io = false;
+ ceph::bufferlist rxbuf;
+ ceph::bufferlist txbuf;
+
+ void enable_recording();
+ seastar::future<Socket::tmp_buf> read_exactly(size_t bytes);
+ seastar::future<bufferlist> read(size_t bytes);
+ seastar::future<> write(bufferlist&& buf);
+ seastar::future<> write_flush(bufferlist&& buf);
+
+ ceph::crypto::onwire::rxtx_t session_stream_handlers;
+ ceph::msgr::v2::FrameAssembler tx_frame_asm{&session_stream_handlers, false};
+ ceph::msgr::v2::FrameAssembler rx_frame_asm{&session_stream_handlers, false};
+ ceph::bufferlist rx_preamble;
+ ceph::msgr::v2::segment_bls_t rx_segments_data;
+
+ size_t get_current_msg_size() const;
+ seastar::future<ceph::msgr::v2::Tag> read_main_preamble();
+ seastar::future<> read_frame_payload();
+ template <class F>
+ seastar::future<> write_frame(F &frame, bool flush=true);
+
+ private:
+ void fault(bool backoff, const char* func_name, std::exception_ptr eptr);
+ void reset_session(bool full);
+ seastar::future<std::tuple<entity_type_t, entity_addr_t>>
+ banner_exchange(bool is_connect);
+
+ enum class next_step_t {
+ ready,
+ wait,
+ none, // protocol should have been aborted or failed
+ };
+
+ // CONNECTING (client)
+ seastar::future<> handle_auth_reply();
+ inline seastar::future<> client_auth() {
+ std::vector<uint32_t> empty;
+ return client_auth(empty);
+ }
+ seastar::future<> client_auth(std::vector<uint32_t> &allowed_methods);
+
+ seastar::future<next_step_t> process_wait();
+ seastar::future<next_step_t> client_connect();
+ seastar::future<next_step_t> client_reconnect();
+ void execute_connecting();
+
+ // ACCEPTING (server)
+ seastar::future<> _auth_bad_method(int r);
+ seastar::future<> _handle_auth_request(bufferlist& auth_payload, bool more);
+ seastar::future<> server_auth();
+
+ bool validate_peer_name(const entity_name_t& peer_name) const;
+ seastar::future<next_step_t> send_wait();
+ seastar::future<next_step_t> reuse_connection(ProtocolV2* existing_proto,
+ bool do_reset=false,
+ bool reconnect=false,
+ uint64_t conn_seq=0,
+ uint64_t msg_seq=0);
+
+ seastar::future<next_step_t> handle_existing_connection(SocketConnectionRef existing_conn);
+ seastar::future<next_step_t> server_connect();
+
+ seastar::future<next_step_t> read_reconnect();
+ seastar::future<next_step_t> send_retry(uint64_t connect_seq);
+ seastar::future<next_step_t> send_retry_global(uint64_t global_seq);
+ seastar::future<next_step_t> send_reset(bool full);
+ seastar::future<next_step_t> server_reconnect();
+
+ void execute_accepting();
+
+ // CONNECTING/ACCEPTING
+ seastar::future<> finish_auth();
+
+ // ESTABLISHING
+ void execute_establishing(SocketConnectionRef existing_conn, bool dispatch_reset);
+
+ // ESTABLISHING/REPLACING (server)
+ seastar::future<> send_server_ident();
+
+ // REPLACING (server)
+ void trigger_replacing(bool reconnect,
+ bool do_reset,
+ SocketRef&& new_socket,
+ AuthConnectionMetaRef&& new_auth_meta,
+ ceph::crypto::onwire::rxtx_t new_rxtx,
+ uint64_t new_peer_global_seq,
+ // !reconnect
+ uint64_t new_client_cookie,
+ entity_name_t new_peer_name,
+ uint64_t new_conn_features,
+ bool tx_is_rev1,
+ bool rx_is_rev1,
+ // reconnect
+ uint64_t new_connect_seq,
+ uint64_t new_msg_seq);
+
+ // READY
+ seastar::future<> read_message(utime_t throttle_stamp);
+ void execute_ready(bool dispatch_connect);
+
+ // STANDBY
+ void execute_standby();
+
+ // WAIT
+ void execute_wait(bool max_backoff);
+
+ // SERVER_WAIT
+ void execute_server_wait();
+};
+
+} // namespace crimson::net
diff --git a/src/crimson/net/Socket.cc b/src/crimson/net/Socket.cc
new file mode 100644
index 000000000..8ad106dbd
--- /dev/null
+++ b/src/crimson/net/Socket.cc
@@ -0,0 +1,276 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Socket.h"
+
+#include <seastar/core/when_all.hh>
+
+#include "crimson/common/log.h"
+#include "Errors.h"
+
+namespace crimson::net {
+
+namespace {
+
+seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_ms);
+}
+
+// an input_stream consumer that reads buffer segments into a bufferlist up to
+// the given number of remaining bytes
+struct bufferlist_consumer {
+ bufferlist& bl;
+ size_t& remaining;
+
+ bufferlist_consumer(bufferlist& bl, size_t& remaining)
+ : bl(bl), remaining(remaining) {}
+
+ using tmp_buf = seastar::temporary_buffer<char>;
+ using consumption_result_type = typename seastar::input_stream<char>::consumption_result_type;
+
+ // consume some or all of a buffer segment
+ seastar::future<consumption_result_type> operator()(tmp_buf&& data) {
+ if (remaining >= data.size()) {
+ // consume the whole buffer
+ remaining -= data.size();
+ bl.append(buffer::create_foreign(std::move(data)));
+ if (remaining > 0) {
+ // return none to request more segments
+ return seastar::make_ready_future<consumption_result_type>(
+ seastar::continue_consuming{});
+ } else {
+ // return an empty buffer to signal that we're done
+ return seastar::make_ready_future<consumption_result_type>(
+ consumption_result_type::stop_consuming_type({}));
+ }
+ }
+ if (remaining > 0) {
+ // consume the front
+ bl.append(buffer::create_foreign(data.share(0, remaining)));
+ data.trim_front(remaining);
+ remaining = 0;
+ }
+ // give the rest back to signal that we're done
+ return seastar::make_ready_future<consumption_result_type>(
+ consumption_result_type::stop_consuming_type{std::move(data)});
+ };
+};
+
+} // anonymous namespace
+
+seastar::future<bufferlist> Socket::read(size_t bytes)
+{
+#ifdef UNIT_TESTS_BUILT
+ return try_trap_pre(next_trap_read).then([bytes, this] {
+#endif
+ if (bytes == 0) {
+ return seastar::make_ready_future<bufferlist>();
+ }
+ r.buffer.clear();
+ r.remaining = bytes;
+ return in.consume(bufferlist_consumer{r.buffer, r.remaining}).then([this] {
+ if (r.remaining) { // throw on short reads
+ throw std::system_error(make_error_code(error::read_eof));
+ }
+ return seastar::make_ready_future<bufferlist>(std::move(r.buffer));
+ });
+#ifdef UNIT_TESTS_BUILT
+ }).then([this] (auto buf) {
+ return try_trap_post(next_trap_read
+ ).then([buf = std::move(buf)] () mutable {
+ return std::move(buf);
+ });
+ });
+#endif
+}
+
+seastar::future<seastar::temporary_buffer<char>>
+Socket::read_exactly(size_t bytes) {
+#ifdef UNIT_TESTS_BUILT
+ return try_trap_pre(next_trap_read).then([bytes, this] {
+#endif
+ if (bytes == 0) {
+ return seastar::make_ready_future<seastar::temporary_buffer<char>>();
+ }
+ return in.read_exactly(bytes).then([](auto buf) {
+ if (buf.empty()) {
+ throw std::system_error(make_error_code(error::read_eof));
+ }
+ return seastar::make_ready_future<tmp_buf>(std::move(buf));
+ });
+#ifdef UNIT_TESTS_BUILT
+ }).then([this] (auto buf) {
+ return try_trap_post(next_trap_read
+ ).then([buf = std::move(buf)] () mutable {
+ return std::move(buf);
+ });
+ });
+#endif
+}
+
+void Socket::shutdown() {
+ socket.shutdown_input();
+ socket.shutdown_output();
+}
+
+static inline seastar::future<>
+close_and_handle_errors(seastar::output_stream<char>& out)
+{
+ return out.close().handle_exception_type([] (const std::system_error& e) {
+ if (e.code() != std::errc::broken_pipe &&
+ e.code() != std::errc::connection_reset) {
+ logger().error("Socket::close(): unexpected error {}", e);
+ ceph_abort();
+ }
+ // can happen when out is already shutdown, ignore
+ });
+}
+
+seastar::future<> Socket::close() {
+#ifndef NDEBUG
+ ceph_assert(!closed);
+ closed = true;
+#endif
+ return seastar::when_all_succeed(
+ in.close(),
+ close_and_handle_errors(out)
+ ).then_unpack([] {
+ return seastar::make_ready_future<>();
+ }).handle_exception([] (auto eptr) {
+ logger().error("Socket::close(): unexpected exception {}", eptr);
+ ceph_abort();
+ });
+}
+
+#ifdef UNIT_TESTS_BUILT
+seastar::future<> Socket::try_trap_pre(bp_action_t& trap) {
+ auto action = trap;
+ trap = bp_action_t::CONTINUE;
+ switch (action) {
+ case bp_action_t::CONTINUE:
+ break;
+ case bp_action_t::FAULT:
+ logger().info("[Test] got FAULT");
+ throw std::system_error(make_error_code(crimson::net::error::negotiation_failure));
+ case bp_action_t::BLOCK:
+ logger().info("[Test] got BLOCK");
+ return blocker->block();
+ case bp_action_t::STALL:
+ trap = action;
+ break;
+ default:
+ ceph_abort("unexpected action from trap");
+ }
+ return seastar::make_ready_future<>();
+}
+
+seastar::future<> Socket::try_trap_post(bp_action_t& trap) {
+ auto action = trap;
+ trap = bp_action_t::CONTINUE;
+ switch (action) {
+ case bp_action_t::CONTINUE:
+ break;
+ case bp_action_t::STALL:
+ logger().info("[Test] got STALL and block");
+ shutdown();
+ return blocker->block();
+ default:
+ ceph_abort("unexpected action from trap");
+ }
+ return seastar::make_ready_future<>();
+}
+
+void Socket::set_trap(bp_type_t type, bp_action_t action, socket_blocker* blocker_) {
+ blocker = blocker_;
+ if (type == bp_type_t::READ) {
+ ceph_assert(next_trap_read == bp_action_t::CONTINUE);
+ next_trap_read = action;
+ } else { // type == bp_type_t::WRITE
+ if (next_trap_write == bp_action_t::CONTINUE) {
+ next_trap_write = action;
+ } else if (next_trap_write == bp_action_t::FAULT) {
+ // do_sweep_messages() may combine multiple write events into one socket write
+ ceph_assert(action == bp_action_t::FAULT || action == bp_action_t::CONTINUE);
+ } else {
+ ceph_abort();
+ }
+ }
+}
+#endif
+
+FixedCPUServerSocket::listen_ertr::future<>
+FixedCPUServerSocket::listen(entity_addr_t addr)
+{
+ assert(seastar::this_shard_id() == cpu);
+ logger().trace("FixedCPUServerSocket::listen({})...", addr);
+ return container().invoke_on_all([addr] (auto& ss) {
+ ss.addr = addr;
+ seastar::socket_address s_addr(addr.in4_addr());
+ seastar::listen_options lo;
+ lo.reuse_address = true;
+ lo.set_fixed_cpu(ss.cpu);
+ ss.listener = seastar::listen(s_addr, lo);
+ }).then([] {
+ return true;
+ }).handle_exception_type([addr] (const std::system_error& e) {
+ if (e.code() == std::errc::address_in_use) {
+ logger().trace("FixedCPUServerSocket::listen({}): address in use", addr);
+ } else {
+ logger().error("FixedCPUServerSocket::listen({}): "
+ "got unexpeted error {}", addr, e);
+ ceph_abort();
+ }
+ return false;
+ }).then([] (bool success) -> listen_ertr::future<> {
+ if (success) {
+ return listen_ertr::now();
+ } else {
+ return crimson::ct_error::address_in_use::make();
+ }
+ });
+}
+
+seastar::future<> FixedCPUServerSocket::shutdown()
+{
+ assert(seastar::this_shard_id() == cpu);
+ logger().trace("FixedCPUServerSocket({})::shutdown()...", addr);
+ return container().invoke_on_all([] (auto& ss) {
+ if (ss.listener) {
+ ss.listener->abort_accept();
+ }
+ return ss.shutdown_gate.close();
+ }).then([this] {
+ return reset();
+ });
+}
+
+seastar::future<> FixedCPUServerSocket::destroy()
+{
+ assert(seastar::this_shard_id() == cpu);
+ return shutdown().then([this] {
+ // we should only construct/stop shards on #0
+ return container().invoke_on(0, [] (auto& ss) {
+ assert(ss.service);
+ return ss.service->stop().finally([cleanup = std::move(ss.service)] {});
+ });
+ });
+}
+
+seastar::future<FixedCPUServerSocket*> FixedCPUServerSocket::create()
+{
+ auto cpu = seastar::this_shard_id();
+ // we should only construct/stop shards on #0
+ return seastar::smp::submit_to(0, [cpu] {
+ auto service = std::make_unique<sharded_service_t>();
+ return service->start(cpu, construct_tag{}
+ ).then([service = std::move(service)] () mutable {
+ auto p_shard = service.get();
+ p_shard->local().service = std::move(service);
+ return p_shard;
+ });
+ }).then([] (auto p_shard) {
+ return &p_shard->local();
+ });
+}
+
+} // namespace crimson::net
diff --git a/src/crimson/net/Socket.h b/src/crimson/net/Socket.h
new file mode 100644
index 000000000..d39a2517f
--- /dev/null
+++ b/src/crimson/net/Socket.h
@@ -0,0 +1,268 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/gate.hh>
+#include <seastar/core/reactor.hh>
+#include <seastar/core/sharded.hh>
+#include <seastar/net/packet.hh>
+
+#include "include/buffer.h"
+
+#include "crimson/common/log.h"
+#include "Errors.h"
+#include "Fwd.h"
+
+#ifdef UNIT_TESTS_BUILT
+#include "Interceptor.h"
+#endif
+
+namespace crimson::net {
+
+class Socket;
+using SocketRef = std::unique_ptr<Socket>;
+
+class Socket
+{
+ struct construct_tag {};
+
+ public:
+ // if acceptor side, peer is using a different port (ephemeral_port)
+ // if connector side, I'm using a different port (ephemeral_port)
+ enum class side_t {
+ acceptor,
+ connector
+ };
+
+ Socket(seastar::connected_socket&& _socket, side_t _side, uint16_t e_port, construct_tag)
+ : sid{seastar::this_shard_id()},
+ socket(std::move(_socket)),
+ in(socket.input()),
+ // the default buffer size of 8192 is too small and may hurt our write
+ // performance; see seastar::net::connected_socket::output()
+ out(socket.output(65536)),
+ side(_side),
+ ephemeral_port(e_port) {}
+
+ ~Socket() {
+#ifndef NDEBUG
+ assert(closed);
+#endif
+ }
+
+ Socket(Socket&& o) = delete;
+
+ static seastar::future<SocketRef>
+ connect(const entity_addr_t& peer_addr) {
+ return seastar::connect(peer_addr.in4_addr()
+ ).then([] (seastar::connected_socket socket) {
+ return std::make_unique<Socket>(
+ std::move(socket), side_t::connector, 0, construct_tag{});
+ });
+ }
+
+ /// read the requested number of bytes into a bufferlist
+ seastar::future<bufferlist> read(size_t bytes);
+ using tmp_buf = seastar::temporary_buffer<char>;
+ using packet = seastar::net::packet;
+ seastar::future<tmp_buf> read_exactly(size_t bytes);
+
+ seastar::future<> write(packet&& buf) {
+#ifdef UNIT_TESTS_BUILT
+ return try_trap_pre(next_trap_write).then([buf = std::move(buf), this] () mutable {
+#endif
+ return out.write(std::move(buf));
+#ifdef UNIT_TESTS_BUILT
+ }).then([this] {
+ return try_trap_post(next_trap_write);
+ });
+#endif
+ }
+ seastar::future<> flush() {
+ return out.flush();
+ }
+ seastar::future<> write_flush(packet&& buf) {
+#ifdef UNIT_TESTS_BUILT
+ return try_trap_pre(next_trap_write).then([buf = std::move(buf), this] () mutable {
+#endif
+ return out.write(std::move(buf)).then([this] { return out.flush(); });
+#ifdef UNIT_TESTS_BUILT
+ }).then([this] {
+ return try_trap_post(next_trap_write);
+ });
+#endif
+ }
+
+ // preemptively disable further reads or writes; can only be shut down once.
+ void shutdown();
+
+ /// Socket can only be closed once.
+ seastar::future<> close();
+
+ // shutdown input_stream only, for tests
+ void force_shutdown_in() {
+ socket.shutdown_input();
+ }
+
+ // shutdown output_stream only, for tests
+ void force_shutdown_out() {
+ socket.shutdown_output();
+ }
+
+ side_t get_side() const {
+ return side;
+ }
+
+ uint16_t get_ephemeral_port() const {
+ return ephemeral_port;
+ }
+
+ // learn my ephemeral_port as connector.
+ // unfortunately, there's no way to identify which port I'm using as
+ // connector with the current seastar interface.
+ void learn_ephemeral_port_as_connector(uint16_t port) {
+ assert(side == side_t::connector &&
+ (ephemeral_port == 0 || ephemeral_port == port));
+ ephemeral_port = port;
+ }
+
+ private:
+ const seastar::shard_id sid;
+ seastar::connected_socket socket;
+ seastar::input_stream<char> in;
+ seastar::output_stream<char> out;
+ side_t side;
+ uint16_t ephemeral_port;
+
+#ifndef NDEBUG
+ bool closed = false;
+#endif
+
+ /// buffer state for read()
+ struct {
+ bufferlist buffer;
+ size_t remaining;
+ } r;
+
+#ifdef UNIT_TESTS_BUILT
+ public:
+ void set_trap(bp_type_t type, bp_action_t action, socket_blocker* blocker_);
+
+ private:
+ bp_action_t next_trap_read = bp_action_t::CONTINUE;
+ bp_action_t next_trap_write = bp_action_t::CONTINUE;
+ socket_blocker* blocker = nullptr;
+ seastar::future<> try_trap_pre(bp_action_t& trap);
+ seastar::future<> try_trap_post(bp_action_t& trap);
+
+#endif
+ friend class FixedCPUServerSocket;
+};
+
+class FixedCPUServerSocket
+ : public seastar::peering_sharded_service<FixedCPUServerSocket> {
+ const seastar::shard_id cpu;
+ entity_addr_t addr;
+ std::optional<seastar::server_socket> listener;
+ seastar::gate shutdown_gate;
+
+ using sharded_service_t = seastar::sharded<FixedCPUServerSocket>;
+ std::unique_ptr<sharded_service_t> service;
+
+ struct construct_tag {};
+
+ static seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_ms);
+ }
+
+ seastar::future<> reset() {
+ return container().invoke_on_all([] (auto& ss) {
+ assert(ss.shutdown_gate.is_closed());
+ ss.shutdown_gate = seastar::gate();
+ ss.addr = entity_addr_t();
+ ss.listener.reset();
+ });
+ }
+
+public:
+ FixedCPUServerSocket(seastar::shard_id cpu, construct_tag) : cpu{cpu} {}
+ ~FixedCPUServerSocket() {
+ assert(!listener);
+ // detect whether the user has called destroy() properly
+ ceph_assert(!service);
+ }
+
+ FixedCPUServerSocket(FixedCPUServerSocket&&) = delete;
+ FixedCPUServerSocket(const FixedCPUServerSocket&) = delete;
+ FixedCPUServerSocket& operator=(const FixedCPUServerSocket&) = delete;
+
+ using listen_ertr = crimson::errorator<
+ crimson::ct_error::address_in_use // The address is already bound
+ >;
+ listen_ertr::future<> listen(entity_addr_t addr);
+
+ // fn_accept should be a nothrow function of type
+ // seastar::future<>(SocketRef, entity_addr_t)
+ template <typename Func>
+ seastar::future<> accept(Func&& fn_accept) {
+ assert(seastar::this_shard_id() == cpu);
+ logger().trace("FixedCPUServerSocket({})::accept()...", addr);
+ return container().invoke_on_all(
+ [fn_accept = std::move(fn_accept)] (auto& ss) mutable {
+ assert(ss.listener);
+ // gate accepting
+ // FixedCPUServerSocket::shutdown() will drain the continuations in the gate
+ // so ignore the returned future
+ std::ignore = seastar::with_gate(ss.shutdown_gate,
+ [&ss, fn_accept = std::move(fn_accept)] () mutable {
+ return seastar::keep_doing([&ss, fn_accept = std::move(fn_accept)] () mutable {
+ return ss.listener->accept().then(
+ [&ss, fn_accept = std::move(fn_accept)]
+ (seastar::accept_result accept_result) mutable {
+ // assert seastar::listen_options::set_fixed_cpu() works
+ assert(seastar::this_shard_id() == ss.cpu);
+ auto [socket, paddr] = std::move(accept_result);
+ entity_addr_t peer_addr;
+ peer_addr.set_sockaddr(&paddr.as_posix_sockaddr());
+ peer_addr.set_type(entity_addr_t::TYPE_ANY);
+ SocketRef _socket = std::make_unique<Socket>(
+ std::move(socket), Socket::side_t::acceptor,
+ peer_addr.get_port(), Socket::construct_tag{});
+ std::ignore = seastar::with_gate(ss.shutdown_gate,
+ [socket = std::move(_socket), peer_addr,
+ &ss, fn_accept = std::move(fn_accept)] () mutable {
+ logger().trace("FixedCPUServerSocket({})::accept(): "
+ "accepted peer {}", ss.addr, peer_addr);
+ return fn_accept(std::move(socket), peer_addr
+ ).handle_exception([&ss, peer_addr] (auto eptr) {
+ logger().error("FixedCPUServerSocket({})::accept(): "
+ "fn_accept(s, {}) got unexpected exception {}",
+ ss.addr, peer_addr, eptr);
+ ceph_abort();
+ });
+ });
+ });
+ }).handle_exception_type([&ss] (const std::system_error& e) {
+ if (e.code() == std::errc::connection_aborted ||
+ e.code() == std::errc::invalid_argument) {
+ logger().trace("FixedCPUServerSocket({})::accept(): stopped ({})",
+ ss.addr, e);
+ } else {
+ throw;
+ }
+ }).handle_exception([&ss] (auto eptr) {
+ logger().error("FixedCPUServerSocket({})::accept(): "
+ "got unexpected exception {}", ss.addr, eptr);
+ ceph_abort();
+ });
+ });
+ });
+ }
+
+ seastar::future<> shutdown();
+ seastar::future<> destroy();
+ static seastar::future<FixedCPUServerSocket*> create();
+};
+
+} // namespace crimson::net
diff --git a/src/crimson/net/SocketConnection.cc b/src/crimson/net/SocketConnection.cc
new file mode 100644
index 000000000..623dca32f
--- /dev/null
+++ b/src/crimson/net/SocketConnection.cc
@@ -0,0 +1,150 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "SocketConnection.h"
+
+#include "ProtocolV1.h"
+#include "ProtocolV2.h"
+#include "SocketMessenger.h"
+
+#ifdef UNIT_TESTS_BUILT
+#include "Interceptor.h"
+#endif
+
+using namespace crimson::net;
+using crimson::common::local_conf;
+
+SocketConnection::SocketConnection(SocketMessenger& messenger,
+ ChainedDispatchers& dispatchers,
+ bool is_msgr2)
+ : messenger(messenger)
+{
+ if (is_msgr2) {
+ protocol = std::make_unique<ProtocolV2>(dispatchers, *this, messenger);
+ } else {
+ protocol = std::make_unique<ProtocolV1>(dispatchers, *this, messenger);
+ }
+#ifdef UNIT_TESTS_BUILT
+ if (messenger.interceptor) {
+ interceptor = messenger.interceptor;
+ interceptor->register_conn(*this);
+ }
+#endif
+}
+
+SocketConnection::~SocketConnection() {}
+
+crimson::net::Messenger*
+SocketConnection::get_messenger() const {
+ return &messenger;
+}
+
+bool SocketConnection::is_connected() const
+{
+ assert(seastar::this_shard_id() == shard_id());
+ return protocol->is_connected();
+}
+
+#ifdef UNIT_TESTS_BUILT
+bool SocketConnection::is_closed() const
+{
+ assert(seastar::this_shard_id() == shard_id());
+ return protocol->is_closed();
+}
+
+bool SocketConnection::is_closed_clean() const
+{
+ assert(seastar::this_shard_id() == shard_id());
+ return protocol->is_closed_clean;
+}
+
+#endif
+bool SocketConnection::peer_wins() const
+{
+ return (messenger.get_myaddr() > peer_addr || policy.server);
+}
+
+seastar::future<> SocketConnection::send(MessageRef msg)
+{
+ assert(seastar::this_shard_id() == shard_id());
+ return protocol->send(std::move(msg));
+}
+
+seastar::future<> SocketConnection::keepalive()
+{
+ assert(seastar::this_shard_id() == shard_id());
+ return protocol->keepalive();
+}
+
+void SocketConnection::mark_down()
+{
+ assert(seastar::this_shard_id() == shard_id());
+ protocol->close(false);
+}
+
+bool SocketConnection::update_rx_seq(seq_num_t seq)
+{
+ if (seq <= in_seq) {
+ if (HAVE_FEATURE(features, RECONNECT_SEQ) &&
+ local_conf()->ms_die_on_old_message) {
+ ceph_abort_msg("old msgs despite reconnect_seq feature");
+ }
+ return false;
+ } else if (seq > in_seq + 1) {
+ if (local_conf()->ms_die_on_skipped_message) {
+ ceph_abort_msg("skipped incoming seq");
+ }
+ return false;
+ } else {
+ in_seq = seq;
+ return true;
+ }
+}
+
+void
+SocketConnection::start_connect(const entity_addr_t& _peer_addr,
+ const entity_name_t& _peer_name)
+{
+ protocol->start_connect(_peer_addr, _peer_name);
+}
+
+void
+SocketConnection::start_accept(SocketRef&& sock,
+ const entity_addr_t& _peer_addr)
+{
+ protocol->start_accept(std::move(sock), _peer_addr);
+}
+
+seastar::future<>
+SocketConnection::close_clean(bool dispatch_reset)
+{
+ return protocol->close_clean(dispatch_reset);
+}
+
+seastar::shard_id SocketConnection::shard_id() const {
+ return messenger.shard_id();
+}
+
+void SocketConnection::print(ostream& out) const {
+ messenger.print(out);
+ if (!protocol->socket) {
+ out << " >> " << get_peer_name() << " " << peer_addr;
+ } else if (protocol->socket->get_side() == Socket::side_t::acceptor) {
+ out << " >> " << get_peer_name() << " " << peer_addr
+ << "@" << protocol->socket->get_ephemeral_port();
+ } else { // protocol->socket->get_side() == Socket::side_t::connector
+ out << "@" << protocol->socket->get_ephemeral_port()
+ << " >> " << get_peer_name() << " " << peer_addr;
+ }
+}
diff --git a/src/crimson/net/SocketConnection.h b/src/crimson/net/SocketConnection.h
new file mode 100644
index 000000000..9c977c7cf
--- /dev/null
+++ b/src/crimson/net/SocketConnection.h
@@ -0,0 +1,106 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <seastar/core/sharded.hh>
+
+#include "msg/Policy.h"
+#include "crimson/common/throttle.h"
+#include "crimson/net/Connection.h"
+#include "crimson/net/Socket.h"
+
+namespace crimson::net {
+
+class Protocol;
+class SocketMessenger;
+class SocketConnection;
+using SocketConnectionRef = seastar::shared_ptr<SocketConnection>;
+
+class SocketConnection : public Connection {
+ SocketMessenger& messenger;
+ std::unique_ptr<Protocol> protocol;
+
+ ceph::net::Policy<crimson::common::Throttle> policy;
+
+ /// the seq num of the last transmitted message
+ seq_num_t out_seq = 0;
+ /// the seq num of the last received message
+ seq_num_t in_seq = 0;
+ /// update the seq num of last received message
+ /// @returns true if the @c seq is valid, and @c in_seq is updated,
+ /// false otherwise.
+ bool update_rx_seq(seq_num_t seq);
+
+ // messages to be resent after connection gets reset
+ std::deque<MessageRef> out_q;
+ std::deque<MessageRef> pending_q;
+ // messages sent, but not yet acked by peer
+ std::deque<MessageRef> sent;
+
+ seastar::shard_id shard_id() const;
+
+ public:
+ SocketConnection(SocketMessenger& messenger,
+ ChainedDispatchers& dispatchers,
+ bool is_msgr2);
+ ~SocketConnection() override;
+
+ Messenger* get_messenger() const override;
+
+ bool is_connected() const override;
+
+#ifdef UNIT_TESTS_BUILT
+ bool is_closed_clean() const override;
+
+ bool is_closed() const override;
+
+ bool peer_wins() const override;
+#else
+ bool peer_wins() const;
+#endif
+
+ seastar::future<> send(MessageRef msg) override;
+
+ seastar::future<> keepalive() override;
+
+ void mark_down() override;
+
+ void print(ostream& out) const override;
+
+ /// start a handshake from the client's perspective;
+ /// only call this when the SocketConnection is first constructed
+ void start_connect(const entity_addr_t& peer_addr,
+ const entity_name_t& peer_name);
+ /// start a handshake from the server's perspective;
+ /// only call this when the SocketConnection is first constructed
+ void start_accept(SocketRef&& socket,
+ const entity_addr_t& peer_addr);
+
+ seastar::future<> close_clean(bool dispatch_reset);
+
+ bool is_server_side() const {
+ return policy.server;
+ }
+
+ bool is_lossy() const {
+ return policy.lossy;
+ }
+
+ friend class Protocol;
+ friend class ProtocolV1;
+ friend class ProtocolV2;
+};
+
+} // namespace crimson::net
diff --git a/src/crimson/net/SocketMessenger.cc b/src/crimson/net/SocketMessenger.cc
new file mode 100644
index 000000000..db9421e79
--- /dev/null
+++ b/src/crimson/net/SocketMessenger.cc
@@ -0,0 +1,351 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "SocketMessenger.h"
+
+#include <tuple>
+#include <boost/functional/hash.hpp>
+
+#include "auth/Auth.h"
+#include "Errors.h"
+#include "Socket.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_ms);
+ }
+}
+
+namespace crimson::net {
+
+SocketMessenger::SocketMessenger(const entity_name_t& myname,
+ const std::string& logic_name,
+ uint32_t nonce)
+ : Messenger{myname},
+ master_sid{seastar::this_shard_id()},
+ logic_name{logic_name},
+ nonce{nonce}
+{}
+
+seastar::future<> SocketMessenger::set_myaddrs(const entity_addrvec_t& addrs)
+{
+ assert(seastar::this_shard_id() == master_sid);
+ auto my_addrs = addrs;
+ for (auto& addr : my_addrs.v) {
+ addr.nonce = nonce;
+ }
+ return Messenger::set_myaddrs(my_addrs);
+}
+
+SocketMessenger::bind_ertr::future<> SocketMessenger::do_bind(const entity_addrvec_t& addrs)
+{
+ assert(seastar::this_shard_id() == master_sid);
+ ceph_assert(addrs.front().get_family() == AF_INET);
+ return set_myaddrs(addrs).then([this] {
+ if (!listener) {
+ return FixedCPUServerSocket::create().then([this] (auto _listener) {
+ listener = _listener;
+ });
+ } else {
+ return seastar::now();
+ }
+ }).then([this] () -> bind_ertr::future<> {
+ const entity_addr_t listen_addr = get_myaddr();
+ logger().debug("{} do_bind: try listen {}...", *this, listen_addr);
+ if (!listener) {
+ logger().warn("{} do_bind: listener doesn't exist", *this);
+ return bind_ertr::now();
+ }
+ return listener->listen(listen_addr);
+ });
+}
+
+SocketMessenger::bind_ertr::future<>
+SocketMessenger::bind(const entity_addrvec_t& addrs)
+{
+ return do_bind(addrs).safe_then([this] {
+ logger().info("{} bind: done", *this);
+ });
+}
+
+SocketMessenger::bind_ertr::future<>
+SocketMessenger::try_bind(const entity_addrvec_t& addrs,
+ uint32_t min_port, uint32_t max_port)
+{
+ auto addr = addrs.front();
+ if (addr.get_port() != 0) {
+ return do_bind(addrs).safe_then([this] {
+ logger().info("{} try_bind: done", *this);
+ });
+ }
+ ceph_assert(min_port <= max_port);
+ return seastar::do_with(uint32_t(min_port),
+ [this, max_port, addr] (auto& port) {
+ return seastar::repeat_until_value([this, max_port, addr, &port] {
+ auto to_bind = addr;
+ to_bind.set_port(port);
+ return do_bind(entity_addrvec_t{to_bind}
+ ).safe_then([this] () -> seastar::future<std::optional<bool>> {
+ logger().info("{} try_bind: done", *this);
+ return seastar::make_ready_future<std::optional<bool>>(
+ std::make_optional<bool>(true));
+ }, bind_ertr::all_same_way([this, max_port, &port]
+ (const std::error_code& e) mutable
+ -> seastar::future<std::optional<bool>> {
+ assert(e == std::errc::address_in_use);
+ logger().trace("{} try_bind: {} already used", *this, port);
+ if (port == max_port) {
+ return seastar::make_ready_future<std::optional<bool>>(
+ std::make_optional<bool>(false));
+ }
+ ++port;
+ return seastar::make_ready_future<std::optional<bool>>();
+ }));
+ }).then([] (bool success) -> bind_ertr::future<> {
+ if (success) {
+ return bind_ertr::now();
+ } else {
+ return crimson::ct_error::address_in_use::make();
+ }
+ });
+ });
+}
+
+seastar::future<> SocketMessenger::start(
+ const dispatchers_t& _dispatchers) {
+ assert(seastar::this_shard_id() == master_sid);
+
+ dispatchers.assign(_dispatchers);
+ if (listener) {
+ // make sure we have already bound to a valid address
+ ceph_assert(get_myaddr().is_legacy() || get_myaddr().is_msgr2());
+ ceph_assert(get_myaddr().get_port() > 0);
+
+ return listener->accept([this] (SocketRef socket, entity_addr_t peer_addr) {
+ assert(seastar::this_shard_id() == master_sid);
+ SocketConnectionRef conn = seastar::make_shared<SocketConnection>(
+ *this, dispatchers, get_myaddr().is_msgr2());
+ conn->start_accept(std::move(socket), peer_addr);
+ return seastar::now();
+ });
+ }
+ return seastar::now();
+}
+
+crimson::net::ConnectionRef
+SocketMessenger::connect(const entity_addr_t& peer_addr, const entity_name_t& peer_name)
+{
+ assert(seastar::this_shard_id() == master_sid);
+
+ // make sure we connect to a valid peer_addr
+ ceph_assert(peer_addr.is_legacy() || peer_addr.is_msgr2());
+ ceph_assert(peer_addr.get_port() > 0);
+
+ if (auto found = lookup_conn(peer_addr); found) {
+ logger().debug("{} connect to existing", *found);
+ return found->shared_from_this();
+ }
+ SocketConnectionRef conn = seastar::make_shared<SocketConnection>(
+ *this, dispatchers, peer_addr.is_msgr2());
+ conn->start_connect(peer_addr, peer_name);
+ return conn->shared_from_this();
+}
+
+seastar::future<> SocketMessenger::shutdown()
+{
+ assert(seastar::this_shard_id() == master_sid);
+ return seastar::futurize_invoke([this] {
+ assert(dispatchers.empty());
+ if (listener) {
+ auto d_listener = listener;
+ listener = nullptr;
+ return d_listener->destroy();
+ } else {
+ return seastar::now();
+ }
+ // close all connections
+ }).then([this] {
+ return seastar::parallel_for_each(accepting_conns, [] (auto conn) {
+ return conn->close_clean(false);
+ });
+ }).then([this] {
+ ceph_assert(accepting_conns.empty());
+ return seastar::parallel_for_each(connections, [] (auto conn) {
+ return conn.second->close_clean(false);
+ });
+ }).then([this] {
+ return seastar::parallel_for_each(closing_conns, [] (auto conn) {
+ return conn->close_clean(false);
+ });
+ }).then([this] {
+ ceph_assert(connections.empty());
+ shutdown_promise.set_value();
+ });
+}
+
+seastar::future<> SocketMessenger::learned_addr(const entity_addr_t &peer_addr_for_me, const SocketConnection& conn)
+{
+ assert(seastar::this_shard_id() == master_sid);
+ if (!need_addr) {
+ if ((!get_myaddr().is_any() &&
+ get_myaddr().get_type() != peer_addr_for_me.get_type()) ||
+ get_myaddr().get_family() != peer_addr_for_me.get_family() ||
+ !get_myaddr().is_same_host(peer_addr_for_me)) {
+ logger().warn("{} peer_addr_for_me {} type/family/IP doesn't match myaddr {}",
+ conn, peer_addr_for_me, get_myaddr());
+ throw std::system_error(
+ make_error_code(crimson::net::error::bad_peer_address));
+ }
+ return seastar::now();
+ }
+
+ if (get_myaddr().get_type() == entity_addr_t::TYPE_NONE) {
+ // Not bound
+ entity_addr_t addr = peer_addr_for_me;
+ addr.set_type(entity_addr_t::TYPE_ANY);
+ addr.set_port(0);
+ need_addr = false;
+ return set_myaddrs(entity_addrvec_t{addr}
+ ).then([this, &conn, peer_addr_for_me] {
+ logger().info("{} learned myaddr={} (unbound) from {}",
+ conn, get_myaddr(), peer_addr_for_me);
+ });
+ } else {
+ // Already bound
+ if (!get_myaddr().is_any() &&
+ get_myaddr().get_type() != peer_addr_for_me.get_type()) {
+ logger().warn("{} peer_addr_for_me {} type doesn't match myaddr {}",
+ conn, peer_addr_for_me, get_myaddr());
+ throw std::system_error(
+ make_error_code(crimson::net::error::bad_peer_address));
+ }
+ if (get_myaddr().get_family() != peer_addr_for_me.get_family()) {
+ logger().warn("{} peer_addr_for_me {} family doesn't match myaddr {}",
+ conn, peer_addr_for_me, get_myaddr());
+ throw std::system_error(
+ make_error_code(crimson::net::error::bad_peer_address));
+ }
+ if (get_myaddr().is_blank_ip()) {
+ entity_addr_t addr = peer_addr_for_me;
+ addr.set_type(get_myaddr().get_type());
+ addr.set_port(get_myaddr().get_port());
+ need_addr = false;
+ return set_myaddrs(entity_addrvec_t{addr}
+ ).then([this, &conn, peer_addr_for_me] {
+ logger().info("{} learned myaddr={} (blank IP) from {}",
+ conn, get_myaddr(), peer_addr_for_me);
+ });
+ } else if (!get_myaddr().is_same_host(peer_addr_for_me)) {
+ logger().warn("{} peer_addr_for_me {} IP doesn't match myaddr {}",
+ conn, peer_addr_for_me, get_myaddr());
+ throw std::system_error(
+ make_error_code(crimson::net::error::bad_peer_address));
+ } else {
+ need_addr = false;
+ return seastar::now();
+ }
+ }
+}
+
+SocketPolicy SocketMessenger::get_policy(entity_type_t peer_type) const
+{
+ return policy_set.get(peer_type);
+}
+
+SocketPolicy SocketMessenger::get_default_policy() const
+{
+ return policy_set.get_default();
+}
+
+void SocketMessenger::set_default_policy(const SocketPolicy& p)
+{
+ policy_set.set_default(p);
+}
+
+void SocketMessenger::set_policy(entity_type_t peer_type,
+ const SocketPolicy& p)
+{
+ policy_set.set(peer_type, p);
+}
+
+void SocketMessenger::set_policy_throttler(entity_type_t peer_type,
+ Throttle* throttle)
+{
+ // only byte throttler is used in OSD
+ policy_set.set_throttlers(peer_type, throttle, nullptr);
+}
+
+crimson::net::SocketConnectionRef SocketMessenger::lookup_conn(const entity_addr_t& addr)
+{
+ if (auto found = connections.find(addr);
+ found != connections.end()) {
+ return found->second;
+ } else {
+ return nullptr;
+ }
+}
+
+void SocketMessenger::accept_conn(SocketConnectionRef conn)
+{
+ accepting_conns.insert(conn);
+}
+
+void SocketMessenger::unaccept_conn(SocketConnectionRef conn)
+{
+ accepting_conns.erase(conn);
+}
+
+void SocketMessenger::register_conn(SocketConnectionRef conn)
+{
+ auto [i, added] = connections.emplace(conn->get_peer_addr(), conn);
+ std::ignore = i;
+ ceph_assert(added);
+}
+
+void SocketMessenger::unregister_conn(SocketConnectionRef conn)
+{
+ ceph_assert(conn);
+ auto found = connections.find(conn->get_peer_addr());
+ ceph_assert(found != connections.end());
+ ceph_assert(found->second == conn);
+ connections.erase(found);
+}
+
+void SocketMessenger::closing_conn(SocketConnectionRef conn)
+{
+ closing_conns.push_back(conn);
+}
+
+void SocketMessenger::closed_conn(SocketConnectionRef conn)
+{
+ for (auto it = closing_conns.begin();
+ it != closing_conns.end();) {
+ if (*it == conn) {
+ it = closing_conns.erase(it);
+ } else {
+ it++;
+ }
+ }
+}
+
+seastar::future<uint32_t>
+SocketMessenger::get_global_seq(uint32_t old)
+{
+ if (old > global_seq) {
+ global_seq = old;
+ }
+ return seastar::make_ready_future<uint32_t>(++global_seq);
+}
+
+} // namespace crimson::net
diff --git a/src/crimson/net/SocketMessenger.h b/src/crimson/net/SocketMessenger.h
new file mode 100644
index 000000000..44c1d3c21
--- /dev/null
+++ b/src/crimson/net/SocketMessenger.h
@@ -0,0 +1,122 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <map>
+#include <set>
+#include <vector>
+#include <seastar/core/gate.hh>
+#include <seastar/core/reactor.hh>
+#include <seastar/core/sharded.hh>
+#include <seastar/core/shared_future.hh>
+
+#include "crimson/net/chained_dispatchers.h"
+#include "Messenger.h"
+#include "SocketConnection.h"
+
+namespace crimson::net {
+
+class FixedCPUServerSocket;
+
+class SocketMessenger final : public Messenger {
+ const seastar::shard_id master_sid;
+ seastar::promise<> shutdown_promise;
+
+ FixedCPUServerSocket* listener = nullptr;
+ ChainedDispatchers dispatchers;
+ std::map<entity_addr_t, SocketConnectionRef> connections;
+ std::set<SocketConnectionRef> accepting_conns;
+ std::vector<SocketConnectionRef> closing_conns;
+ ceph::net::PolicySet<Throttle> policy_set;
+ // Distinguish messengers with meaningful names for debugging
+ const std::string logic_name;
+ const uint32_t nonce;
+ // true as long as we haven't learned our addr; set to false once we learn it.
+ bool need_addr = true;
+ uint32_t global_seq = 0;
+ bool started = false;
+
+ bind_ertr::future<> do_bind(const entity_addrvec_t& addr);
+
+ public:
+ SocketMessenger(const entity_name_t& myname,
+ const std::string& logic_name,
+ uint32_t nonce);
+ ~SocketMessenger() override { ceph_assert(!listener); }
+
+ seastar::future<> set_myaddrs(const entity_addrvec_t& addr) override;
+
+ // Messenger interfaces are assumed to be called from the messenger's own
+ // shard, but their behavior should be symmetric when called from any shard.
+ bind_ertr::future<> bind(const entity_addrvec_t& addr) override;
+
+ bind_ertr::future<> try_bind(const entity_addrvec_t& addr,
+ uint32_t min_port, uint32_t max_port) override;
+
+ seastar::future<> start(const dispatchers_t& dispatchers) override;
+
+ ConnectionRef connect(const entity_addr_t& peer_addr,
+ const entity_name_t& peer_name) override;
+ // can only wait once
+ seastar::future<> wait() override {
+ assert(seastar::this_shard_id() == master_sid);
+ return shutdown_promise.get_future();
+ }
+
+ void stop() override {
+ dispatchers.clear();
+ }
+
+ bool is_started() const override {
+ return !dispatchers.empty();
+ }
+
+ seastar::future<> shutdown() override;
+
+ void print(ostream& out) const override {
+ out << get_myname()
+ << "(" << logic_name
+ << ") " << get_myaddr();
+ }
+
+ SocketPolicy get_policy(entity_type_t peer_type) const override;
+
+ SocketPolicy get_default_policy() const override;
+
+ void set_default_policy(const SocketPolicy& p) override;
+
+ void set_policy(entity_type_t peer_type, const SocketPolicy& p) override;
+
+ void set_policy_throttler(entity_type_t peer_type, Throttle* throttle) override;
+
+ public:
+ seastar::future<uint32_t> get_global_seq(uint32_t old=0);
+ seastar::future<> learned_addr(const entity_addr_t &peer_addr_for_me,
+ const SocketConnection& conn);
+
+ SocketConnectionRef lookup_conn(const entity_addr_t& addr);
+ void accept_conn(SocketConnectionRef);
+ void unaccept_conn(SocketConnectionRef);
+ void register_conn(SocketConnectionRef);
+ void unregister_conn(SocketConnectionRef);
+ void closing_conn(SocketConnectionRef);
+ void closed_conn(SocketConnectionRef);
+ seastar::shard_id shard_id() const {
+ assert(seastar::this_shard_id() == master_sid);
+ return master_sid;
+ }
+};
+
+} // namespace crimson::net
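
[editor's note] As a rough illustration (not part of the patch) of the bind()/start() contract declared above, a server-side bring-up could look like the sketch below. The helper name, the single dispatcher pointer, and the assumption that dispatchers_t is constructible from a braced list of Dispatcher pointers are for illustration only; error handling is reduced to the single address_in_use case advertised by bind().

seastar::future<> demo_bringup(crimson::net::SocketMessenger& msgr,
                               crimson::net::Dispatcher* disp,
                               const entity_addrvec_t& addrs) {
  return msgr.bind(addrs).safe_then([&msgr, disp] {
    // start() installs the dispatcher chain and begins accepting if bound
    return msgr.start({disp});
  }, crimson::net::SocketMessenger::bind_ertr::all_same_way(
    [] (const std::error_code& e) {
      // bind() only advertises address_in_use
      return seastar::now();
    }));
}
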
diff --git a/src/crimson/net/chained_dispatchers.cc b/src/crimson/net/chained_dispatchers.cc
new file mode 100644
index 000000000..b13d40c8f
--- /dev/null
+++ b/src/crimson/net/chained_dispatchers.cc
@@ -0,0 +1,93 @@
+#include "crimson/common/log.h"
+#include "crimson/net/chained_dispatchers.h"
+#include "crimson/net/Connection.h"
+#include "crimson/net/Dispatcher.h"
+#include "msg/Message.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_ms);
+ }
+}
+
+namespace crimson::net {
+
+seastar::future<>
+ChainedDispatchers::ms_dispatch(crimson::net::ConnectionRef conn,
+ MessageRef m) {
+ try {
+ for (auto& dispatcher : dispatchers) {
+ auto dispatched = dispatcher->ms_dispatch(conn, m);
+ if (dispatched.has_value()) {
+ return std::move(*dispatched
+ ).handle_exception([conn] (std::exception_ptr eptr) {
+ logger().error("{} got unexpected exception in ms_dispatch() throttling {}",
+ *conn, eptr);
+ ceph_abort();
+ });
+ }
+ }
+ } catch (...) {
+ logger().error("{} got unexpected exception in ms_dispatch() {}",
+ *conn, std::current_exception());
+ ceph_abort();
+ }
+ if (!dispatchers.empty()) {
+ logger().error("ms_dispatch unhandled message {}", *m);
+ }
+ return seastar::now();
+}
+
+void
+ChainedDispatchers::ms_handle_accept(crimson::net::ConnectionRef conn) {
+ try {
+ for (auto& dispatcher : dispatchers) {
+ dispatcher->ms_handle_accept(conn);
+ }
+ } catch (...) {
+ logger().error("{} got unexpected exception in ms_handle_accept() {}",
+ *conn, std::current_exception());
+ ceph_abort();
+ }
+}
+
+void
+ChainedDispatchers::ms_handle_connect(crimson::net::ConnectionRef conn) {
+ try {
+ for(auto& dispatcher : dispatchers) {
+ dispatcher->ms_handle_connect(conn);
+ }
+ } catch (...) {
+ logger().error("{} got unexpected exception in ms_handle_connect() {}",
+ *conn, std::current_exception());
+ ceph_abort();
+ }
+}
+
+void
+ChainedDispatchers::ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace) {
+ try {
+ for (auto& dispatcher : dispatchers) {
+ dispatcher->ms_handle_reset(conn, is_replace);
+ }
+ } catch (...) {
+ logger().error("{} got unexpected exception in ms_handle_reset() {}",
+ *conn, std::current_exception());
+ ceph_abort();
+ }
+}
+
+void
+ChainedDispatchers::ms_handle_remote_reset(crimson::net::ConnectionRef conn) {
+ try {
+ for (auto& dispatcher : dispatchers) {
+ dispatcher->ms_handle_remote_reset(conn);
+ }
+ } catch (...) {
+ logger().error("{} got unexpected exception in ms_handle_remote_reset() {}",
+ *conn, std::current_exception());
+ ceph_abort();
+ }
+}
+
+}
diff --git a/src/crimson/net/chained_dispatchers.h b/src/crimson/net/chained_dispatchers.h
new file mode 100644
index 000000000..712b0894b
--- /dev/null
+++ b/src/crimson/net/chained_dispatchers.h
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "Fwd.h"
+#include "crimson/common/log.h"
+
+namespace crimson::net {
+
+class Dispatcher;
+
+class ChainedDispatchers {
+public:
+ void assign(const dispatchers_t& _dispatchers) {
+ assert(empty());
+ assert(!_dispatchers.empty());
+ dispatchers = _dispatchers;
+ }
+ void clear() {
+ dispatchers.clear();
+ }
+ bool empty() const {
+ return dispatchers.empty();
+ }
+ seastar::future<> ms_dispatch(crimson::net::ConnectionRef, MessageRef);
+ void ms_handle_accept(crimson::net::ConnectionRef conn);
+ void ms_handle_connect(crimson::net::ConnectionRef conn);
+ void ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace);
+ void ms_handle_remote_reset(crimson::net::ConnectionRef conn);
+
+ private:
+ dispatchers_t dispatchers;
+};
+
+}
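
[editor's note] The chain semantics above mean the first dispatcher whose ms_dispatch() returns an engaged optional future consumes the message; the others are not consulted. A hypothetical dispatcher (not part of the patch) could look like the sketch below. It assumes, based on the has_value() check in chained_dispatchers.cc, that Dispatcher::ms_dispatch returns std::optional<seastar::future<>> and that the other handlers have default implementations; the class name and the CEPH_MSG_PING filter are illustrative only.

class DemoDispatcher final : public crimson::net::Dispatcher {
  std::optional<seastar::future<>> ms_dispatch(crimson::net::ConnectionRef conn,
                                               MessageRef m) override {
    if (m->get_type() != CEPH_MSG_PING) {
      return std::nullopt;   // not ours; let the next dispatcher in the chain try
    }
    // handle the message and claim it
    return seastar::now();
  }
};
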
diff --git a/src/crimson/os/CMakeLists.txt b/src/crimson/os/CMakeLists.txt
new file mode 100644
index 000000000..f221dd7c1
--- /dev/null
+++ b/src/crimson/os/CMakeLists.txt
@@ -0,0 +1,15 @@
+add_library(crimson-os STATIC
+ futurized_store.cc
+ ${PROJECT_SOURCE_DIR}/src/os/Transaction.cc)
+add_subdirectory(cyanstore)
+
+if(WITH_BLUESTORE)
+ add_subdirectory(alienstore)
+endif()
+
+add_subdirectory(seastore)
+target_link_libraries(crimson-os
+ crimson-cyanstore
+ crimson-alienstore
+ crimson-seastore
+ crimson)
diff --git a/src/crimson/os/alienstore/CMakeLists.txt b/src/crimson/os/alienstore/CMakeLists.txt
new file mode 100644
index 000000000..659a3c6ce
--- /dev/null
+++ b/src/crimson/os/alienstore/CMakeLists.txt
@@ -0,0 +1,76 @@
+include_directories(SYSTEM "${CMAKE_SOURCE_DIR}/src/rocksdb/include")
+
+add_library(alien::cflags INTERFACE IMPORTED)
+set_target_properties(alien::cflags PROPERTIES
+ INTERFACE_COMPILE_DEFINITIONS "WITH_SEASTAR;WITH_ALIEN"
+ INTERFACE_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:Seastar::seastar,INTERFACE_INCLUDE_DIRECTORIES>)
+
+add_library(crimson-alien-common STATIC
+ ${PROJECT_SOURCE_DIR}/src/common/admin_socket.cc
+ ${PROJECT_SOURCE_DIR}/src/common/blkdev.cc
+ ${PROJECT_SOURCE_DIR}/src/common/ceph_context.cc
+ ${PROJECT_SOURCE_DIR}/src/common/ceph_crypto.cc
+ ${PROJECT_SOURCE_DIR}/src/common/condition_variable_debug.cc
+ ${PROJECT_SOURCE_DIR}/src/common/cmdparse.cc
+ ${PROJECT_SOURCE_DIR}/src/common/Finisher.cc
+ ${PROJECT_SOURCE_DIR}/src/common/HeartbeatMap.cc
+ ${PROJECT_SOURCE_DIR}/src/common/PluginRegistry.cc
+ ${PROJECT_SOURCE_DIR}/src/common/lockdep.cc
+ ${PROJECT_SOURCE_DIR}/src/common/mutex_debug.cc
+ ${PROJECT_SOURCE_DIR}/src/common/perf_counters.cc
+ ${PROJECT_SOURCE_DIR}/src/common/perf_counters_collection.cc
+ ${PROJECT_SOURCE_DIR}/src/common/RefCountedObj.cc
+ ${PROJECT_SOURCE_DIR}/src/common/shared_mutex_debug.cc
+ ${PROJECT_SOURCE_DIR}/src/common/SubProcess.cc
+ ${PROJECT_SOURCE_DIR}/src/common/Throttle.cc
+ ${PROJECT_SOURCE_DIR}/src/common/Timer.cc
+ ${PROJECT_SOURCE_DIR}/src/common/TrackedOp.cc
+ ${PROJECT_SOURCE_DIR}/src/common/WorkQueue.cc
+ ${PROJECT_SOURCE_DIR}/src/common/util.cc
+ ${PROJECT_SOURCE_DIR}/src/crush/CrushLocation.cc
+ ${PROJECT_SOURCE_DIR}/src/global/global_context.cc
+ $<TARGET_OBJECTS:compressor_objs>
+ $<TARGET_OBJECTS:common_prioritycache_obj>)
+target_link_libraries(crimson-alien-common
+ crimson-common
+ alien::cflags)
+
+set(alien_store_srcs
+ alien_store.cc
+ thread_pool.cc
+ ${PROJECT_SOURCE_DIR}/src/os/ObjectStore.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/Allocator.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/AvlAllocator.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/BitmapFreelistManager.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/BlueFS.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/bluefs_types.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/BlueRocksEnv.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/BlueStore.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/bluestore_types.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/fastbmap_allocator_impl.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/FreelistManager.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/HybridAllocator.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/StupidAllocator.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/BitmapAllocator.cc)
+if(WITH_ZBD)
+ list(APPEND alien_store_srcs
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/zoned_types.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/ZonedFreelistManager.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/ZonedAllocator.cc)
+endif()
+add_library(crimson-alienstore STATIC
+ ${alien_store_srcs})
+if(WITH_LTTNG)
+ add_dependencies(crimson-alienstore bluestore-tp)
+endif()
+target_link_libraries(crimson-alienstore
+ PRIVATE
+ alien::cflags
+ fmt::fmt
+ kv
+ heap_profiler
+ crimson-alien-common
+ ${BLKID_LIBRARIES}
+ ${UDEV_LIBRARIES}
+ crimson
+ blk)
diff --git a/src/crimson/os/alienstore/alien_collection.h b/src/crimson/os/alienstore/alien_collection.h
new file mode 100644
index 000000000..98b8fdef4
--- /dev/null
+++ b/src/crimson/os/alienstore/alien_collection.h
@@ -0,0 +1,26 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "os/ObjectStore.h"
+
+#include "crimson/os/futurized_collection.h"
+#include "crimson/os/futurized_store.h"
+#include "alien_store.h"
+
+namespace crimson::os {
+
+class AlienCollection final : public FuturizedCollection {
+public:
+ AlienCollection(ObjectStore::CollectionHandle ch)
+ : FuturizedCollection(ch->cid),
+ collection(ch) {}
+
+ ~AlienCollection() {}
+
+private:
+ ObjectStore::CollectionHandle collection;
+ friend AlienStore;
+};
+}
diff --git a/src/crimson/os/alienstore/alien_store.cc b/src/crimson/os/alienstore/alien_store.cc
new file mode 100644
index 000000000..cb5553254
--- /dev/null
+++ b/src/crimson/os/alienstore/alien_store.cc
@@ -0,0 +1,575 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "alien_collection.h"
+#include "alien_store.h"
+
+#include <map>
+#include <string_view>
+#include <boost/algorithm/string/trim.hpp>
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+
+#include <seastar/core/alien.hh>
+#include <seastar/core/future-util.hh>
+#include <seastar/core/reactor.hh>
+
+#include "common/ceph_context.h"
+#include "global/global_context.h"
+#include "include/Context.h"
+#include "os/bluestore/BlueStore.h"
+#include "os/ObjectStore.h"
+#include "os/Transaction.h"
+
+#include "crimson/common/log.h"
+#include "crimson/os/futurized_store.h"
+
+namespace {
+ seastar::logger& logger()
+ {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+
+class OnCommit final: public Context
+{
+ int cpuid;
+ Context *oncommit;
+ seastar::promise<> &alien_done;
+public:
+ OnCommit(
+ int id,
+ seastar::promise<> &done,
+ Context *oncommit,
+ ceph::os::Transaction& txn)
+ : cpuid(id), oncommit(oncommit),
+ alien_done(done) {}
+
+ void finish(int) final {
+ return seastar::alien::submit_to(cpuid, [this] {
+ if (oncommit) oncommit->complete(0);
+ alien_done.set_value();
+ return seastar::make_ready_future<>();
+ }).wait();
+ }
+};
+}
+
+namespace crimson::os {
+
+AlienStore::AlienStore(const std::string& path, const ConfigValues& values)
+ : path{path}
+{
+ cct = std::make_unique<CephContext>(CEPH_ENTITY_TYPE_OSD);
+ g_ceph_context = cct.get();
+ cct->_conf.set_config_values(values);
+ store = std::make_unique<BlueStore>(cct.get(), path);
+
+ long cpu_id = 0;
+ if (long nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); nr_cpus != -1) {
+ cpu_id = nr_cpus - 1;
+ } else {
+ logger().error("{}: unable to get nproc: {}", __func__, errno);
+ cpu_id = -1;
+ }
+ tp = std::make_unique<crimson::os::ThreadPool>(1, 128, cpu_id);
+}
+
+seastar::future<> AlienStore::start()
+{
+ return tp->start();
+}
+
+seastar::future<> AlienStore::stop()
+{
+ return tp->submit([this] {
+ for (auto [cid, ch]: coll_map)
+ static_cast<AlienCollection*>(ch.get())->collection.reset();
+ store.reset();
+ }).then([this] {
+ return tp->stop();
+ });
+}
+
+AlienStore::~AlienStore() = default;
+
+seastar::future<> AlienStore::mount()
+{
+ logger().debug("{}", __func__);
+ return tp->submit([this] {
+ return store->mount();
+ }).then([] (int r) {
+ assert(r == 0);
+ return seastar::now();
+ });
+}
+
+seastar::future<> AlienStore::umount()
+{
+ logger().info("{}", __func__);
+ return transaction_gate.close().then([this] {
+ return tp->submit([this] {
+ return store->umount();
+ });
+ }).then([] (int r) {
+ assert(r == 0);
+ return seastar::now();
+ });
+}
+
+seastar::future<> AlienStore::mkfs(uuid_d osd_fsid)
+{
+ logger().debug("{}", __func__);
+ store->set_fsid(osd_fsid);
+ return tp->submit([this] {
+ return store->mkfs();
+ }).then([] (int r) {
+ assert(r == 0);
+ return seastar::now();
+ });
+}
+
+seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>>
+AlienStore::list_objects(CollectionRef ch,
+ const ghobject_t& start,
+ const ghobject_t& end,
+ uint64_t limit) const
+{
+ logger().debug("{}", __func__);
+ return seastar::do_with(std::vector<ghobject_t>(), ghobject_t(),
+ [=] (auto &objects, auto &next) {
+ objects.reserve(limit);
+ return tp->submit([=, &objects, &next] {
+ auto c = static_cast<AlienCollection*>(ch.get());
+ return store->collection_list(c->collection, start, end,
+ store->get_ideal_list_max(),
+ &objects, &next);
+ }).then([&objects, &next] (int r) {
+ assert(r == 0);
+ return seastar::make_ready_future<std::tuple<std::vector<ghobject_t>, ghobject_t>>(
+ std::make_tuple(std::move(objects), std::move(next)));
+ });
+ });
+}
+
+seastar::future<CollectionRef> AlienStore::create_new_collection(const coll_t& cid)
+{
+ logger().debug("{}", __func__);
+ return tp->submit([this, cid] {
+ return store->create_new_collection(cid);
+ }).then([this, cid] (ObjectStore::CollectionHandle c) {
+ CollectionRef ch;
+ auto cp = coll_map.find(c->cid);
+ if (cp == coll_map.end()) {
+ ch = new AlienCollection(c);
+ coll_map[c->cid] = ch;
+ } else {
+ ch = cp->second;
+ auto ach = static_cast<AlienCollection*>(ch.get());
+ if (ach->collection != c) {
+ ach->collection = c;
+ }
+ }
+ return seastar::make_ready_future<CollectionRef>(ch);
+ });
+
+}
+
+seastar::future<CollectionRef> AlienStore::open_collection(const coll_t& cid)
+{
+ logger().debug("{}", __func__);
+ return tp->submit([this, cid] {
+ return store->open_collection(cid);
+ }).then([this] (ObjectStore::CollectionHandle c) {
+ CollectionRef ch;
+ auto cp = coll_map.find(c->cid);
+ if (cp == coll_map.end()){
+ ch = new AlienCollection(c);
+ coll_map[c->cid] = ch;
+ } else {
+ ch = cp->second;
+ auto ach = static_cast<AlienCollection*>(ch.get());
+ if (ach->collection != c){
+ ach->collection = c;
+ }
+ }
+ return seastar::make_ready_future<CollectionRef>(ch);
+ });
+}
+
+seastar::future<std::vector<coll_t>> AlienStore::list_collections()
+{
+ logger().debug("{}", __func__);
+
+ return seastar::do_with(std::vector<coll_t>{}, [=] (auto &ls) {
+ return tp->submit([this, &ls] {
+ return store->list_collections(ls);
+ }).then([&ls] (int r) {
+ assert(r == 0);
+ return seastar::make_ready_future<std::vector<coll_t>>(std::move(ls));
+ });
+ });
+}
+
+AlienStore::read_errorator::future<ceph::bufferlist>
+AlienStore::read(CollectionRef ch,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ uint32_t op_flags)
+{
+ logger().debug("{}", __func__);
+ return seastar::do_with(ceph::bufferlist{}, [=] (auto &bl) {
+ return tp->submit([=, &bl] {
+ auto c = static_cast<AlienCollection*>(ch.get());
+ return store->read(c->collection, oid, offset, len, bl, op_flags);
+ }).then([&bl] (int r) -> read_errorator::future<ceph::bufferlist> {
+ if (r == -ENOENT) {
+ return crimson::ct_error::enoent::make();
+ } else if (r == -EIO) {
+ return crimson::ct_error::input_output_error::make();
+ } else {
+ return read_errorator::make_ready_future<ceph::bufferlist>(std::move(bl));
+ }
+ });
+ });
+}
+
+AlienStore::read_errorator::future<ceph::bufferlist>
+AlienStore::readv(CollectionRef ch,
+ const ghobject_t& oid,
+ interval_set<uint64_t>& m,
+ uint32_t op_flags)
+{
+ logger().debug("{}", __func__);
+ return seastar::do_with(ceph::bufferlist{},
+ [this, ch, oid, &m, op_flags](auto& bl) {
+ return tp->submit([this, ch, oid, &m, op_flags, &bl] {
+ auto c = static_cast<AlienCollection*>(ch.get());
+ return store->readv(c->collection, oid, m, bl, op_flags);
+ }).then([&bl](int r) -> read_errorator::future<ceph::bufferlist> {
+ if (r == -ENOENT) {
+ return crimson::ct_error::enoent::make();
+ } else if (r == -EIO) {
+ return crimson::ct_error::input_output_error::make();
+ } else {
+ return read_errorator::make_ready_future<ceph::bufferlist>(std::move(bl));
+ }
+ });
+ });
+}
+
+AlienStore::get_attr_errorator::future<ceph::bufferptr>
+AlienStore::get_attr(CollectionRef ch,
+ const ghobject_t& oid,
+ std::string_view name) const
+{
+ logger().debug("{}", __func__);
+ return seastar::do_with(ceph::bufferptr{}, [=] (auto &value) {
+ return tp->submit([=, &value] {
+ auto c =static_cast<AlienCollection*>(ch.get());
+ return store->getattr(c->collection, oid,
+ static_cast<std::string>(name).c_str(), value);
+ }).then([oid, &value] (int r) -> get_attr_errorator::future<ceph::bufferptr> {
+ if (r == -ENOENT) {
+ return crimson::ct_error::enoent::make();
+ } else if (r == -ENODATA) {
+ return crimson::ct_error::enodata::make();
+ } else {
+ return get_attr_errorator::make_ready_future<ceph::bufferptr>(
+ std::move(value));
+ }
+ });
+ });
+}
+
+AlienStore::get_attrs_ertr::future<AlienStore::attrs_t>
+AlienStore::get_attrs(CollectionRef ch,
+ const ghobject_t& oid)
+{
+ logger().debug("{}", __func__);
+ return seastar::do_with(attrs_t{}, [=] (auto &aset) {
+ return tp->submit([=, &aset] {
+ auto c = static_cast<AlienCollection*>(ch.get());
+ return store->getattrs(c->collection, oid,
+ reinterpret_cast<map<string,bufferptr>&>(aset));
+ }).then([&aset] (int r) -> get_attrs_ertr::future<attrs_t> {
+ if (r == -ENOENT) {
+ return crimson::ct_error::enoent::make();
+ } else {
+ return get_attrs_ertr::make_ready_future<attrs_t>(std::move(aset));
+ }
+ });
+ });
+}
+
+auto AlienStore::omap_get_values(CollectionRef ch,
+ const ghobject_t& oid,
+ const set<string>& keys)
+ -> read_errorator::future<omap_values_t>
+{
+ logger().debug("{}", __func__);
+ return seastar::do_with(omap_values_t{}, [=] (auto &values) {
+ return tp->submit([=, &values] {
+ auto c = static_cast<AlienCollection*>(ch.get());
+ return store->omap_get_values(c->collection, oid, keys,
+ reinterpret_cast<map<string, bufferlist>*>(&values));
+ }).then([&values] (int r) -> read_errorator::future<omap_values_t> {
+ if (r == -ENOENT) {
+ return crimson::ct_error::enoent::make();
+ } else {
+ assert(r == 0);
+ return read_errorator::make_ready_future<omap_values_t>(std::move(values));
+ }
+ });
+ });
+}
+
+auto AlienStore::omap_get_values(CollectionRef ch,
+ const ghobject_t &oid,
+ const std::optional<string> &start)
+ -> read_errorator::future<std::tuple<bool, omap_values_t>>
+{
+ logger().debug("{} with_start", __func__);
+ return seastar::do_with(omap_values_t{}, [=] (auto &values) {
+ return tp->submit([=, &values] {
+ auto c = static_cast<AlienCollection*>(ch.get());
+ return store->omap_get_values(c->collection, oid, start,
+ reinterpret_cast<map<string, bufferlist>*>(&values));
+ }).then([&values] (int r)
+ -> read_errorator::future<std::tuple<bool, omap_values_t>> {
+ if (r == -ENOENT) {
+ return crimson::ct_error::enoent::make();
+ } else if (r < 0){
+ logger().error("omap_get_values(start): {}", r);
+ return crimson::ct_error::input_output_error::make();
+ } else {
+ return read_errorator::make_ready_future<std::tuple<bool, omap_values_t>>(
+ std::make_tuple(true, std::move(values)));
+ }
+ });
+ });
+}
+
+seastar::future<> AlienStore::do_transaction(CollectionRef ch,
+ ceph::os::Transaction&& txn)
+{
+ logger().debug("{}", __func__);
+ auto id = seastar::this_shard_id();
+ auto done = seastar::promise<>();
+ return seastar::do_with(
+ std::move(txn),
+ std::move(done),
+ [this, ch, id] (auto &txn, auto &done) {
+ return seastar::with_gate(transaction_gate, [this, ch, id, &txn, &done] {
+ return tp_mutex.lock().then ([this, ch, id, &txn, &done] {
+ Context *crimson_wrapper =
+ ceph::os::Transaction::collect_all_contexts(txn);
+ return tp->submit([this, ch, id, crimson_wrapper, &txn, &done] {
+ txn.register_on_commit(new OnCommit(id, done, crimson_wrapper, txn));
+ auto c = static_cast<AlienCollection*>(ch.get());
+ return store->queue_transaction(c->collection, std::move(txn));
+ });
+ }).then([this, &done] (int r) {
+ assert(r == 0);
+ tp_mutex.unlock();
+ return done.get_future();
+ });
+ });
+ });
+}
+
+seastar::future<> AlienStore::write_meta(const std::string& key,
+ const std::string& value)
+{
+ logger().debug("{}", __func__);
+ return tp->submit([=] {
+ return store->write_meta(key, value);
+ }).then([] (int r) {
+ assert(r == 0);
+ return seastar::make_ready_future<>();
+ });
+}
+
+seastar::future<std::tuple<int, std::string>>
+AlienStore::read_meta(const std::string& key)
+{
+ logger().debug("{}", __func__);
+ return tp->submit([this, key] {
+ std::string value;
+ int r = store->read_meta(key, &value);
+ if (r > 0) {
+ value.resize(r);
+ boost::algorithm::trim_right_if(value,
+ [] (unsigned char c) {return isspace(c);});
+ } else {
+ value.clear();
+ }
+ return std::make_pair(r, value);
+ }).then([] (auto entry) {
+ return seastar::make_ready_future<std::tuple<int, std::string>>(
+ std::move(entry));
+ });
+}
+
+uuid_d AlienStore::get_fsid() const
+{
+ logger().debug("{}", __func__);
+ return store->get_fsid();
+}
+
+seastar::future<store_statfs_t> AlienStore::stat() const
+{
+ logger().info("{}", __func__);
+ return seastar::do_with(store_statfs_t{}, [this] (store_statfs_t &st) {
+ return tp->submit([this, &st] {
+ return store->statfs(&st, nullptr);
+ }).then([&st] (int r) {
+ assert(r == 0);
+ return seastar::make_ready_future<store_statfs_t>(std::move(st));
+ });
+ });
+}
+
+unsigned AlienStore::get_max_attr_name_length() const
+{
+ logger().info("{}", __func__);
+ return 256;
+}
+
+seastar::future<struct stat> AlienStore::stat(
+ CollectionRef ch,
+ const ghobject_t& oid)
+{
+ return seastar::do_with((struct stat){}, [this, ch, oid](auto& st) {
+ return tp->submit([this, ch, oid, &st] {
+ auto c = static_cast<AlienCollection*>(ch.get());
+ store->stat(c->collection, oid, &st);
+ return st;
+ });
+ });
+}
+
+auto AlienStore::omap_get_header(CollectionRef ch,
+ const ghobject_t& oid)
+ -> read_errorator::future<ceph::bufferlist>
+{
+ return seastar::do_with(ceph::bufferlist(), [=](auto& bl) {
+ return tp->submit([=, &bl] {
+ auto c = static_cast<AlienCollection*>(ch.get());
+ return store->omap_get_header(c->collection, oid, &bl);
+ }).then([&bl] (int r) -> read_errorator::future<ceph::bufferlist> {
+ if (r == -ENOENT) {
+ return crimson::ct_error::enoent::make();
+ } else if (r < 0) {
+ logger().error("omap_get_header: {}", r);
+ return crimson::ct_error::input_output_error::make();
+ } else {
+ return read_errorator::make_ready_future<ceph::bufferlist>(std::move(bl));
+ }
+ });
+ });
+}
+
+seastar::future<std::map<uint64_t, uint64_t>> AlienStore::fiemap(
+ CollectionRef ch,
+ const ghobject_t& oid,
+ uint64_t off,
+ uint64_t len)
+{
+ return seastar::do_with(std::map<uint64_t, uint64_t>(), [=](auto& destmap) {
+ return tp->submit([=, &destmap] {
+ auto c = static_cast<AlienCollection*>(ch.get());
+ return store->fiemap(c->collection, oid, off, len, destmap);
+ }).then([&destmap] (int i) {
+ return seastar::make_ready_future
+ <std::map<uint64_t, uint64_t>>
+ (std::move(destmap));
+ });
+ });
+}
+
+seastar::future<FuturizedStore::OmapIteratorRef> AlienStore::get_omap_iterator(
+ CollectionRef ch,
+ const ghobject_t& oid)
+{
+ return tp->submit([=] {
+ auto c = static_cast<AlienCollection*>(ch.get());
+ auto iter = store->get_omap_iterator(c->collection, oid);
+ return FuturizedStore::OmapIteratorRef(
+ new AlienStore::AlienOmapIterator(iter,
+ this));
+ });
+}
+
+// TODO: each iterator op needs one submit to the thread pool; this is not
+// efficient and needs further optimization.
+seastar::future<> AlienStore::AlienOmapIterator::seek_to_first()
+{
+ return store->tp->submit([=] {
+ return iter->seek_to_first();
+ }).then([] (int r) {
+ assert(r == 0);
+ return seastar::now();
+ });
+}
+
+seastar::future<> AlienStore::AlienOmapIterator::upper_bound(
+ const std::string& after)
+{
+ return store->tp->submit([this, after] {
+ return iter->upper_bound(after);
+ }).then([] (int r) {
+ assert(r == 0);
+ return seastar::now();
+ });
+}
+
+seastar::future<> AlienStore::AlienOmapIterator::lower_bound(
+ const std::string& to)
+{
+ return store->tp->submit([this, to] {
+ return iter->lower_bound(to);
+ }).then([] (int r) {
+ assert(r == 0);
+ return seastar::now();
+ });
+}
+
+seastar::future<> AlienStore::AlienOmapIterator::next()
+{
+ return store->tp->submit([this] {
+ return iter->next();
+ }).then([] (int r) {
+ assert(r == 0);
+ return seastar::now();
+ });
+}
+
+bool AlienStore::AlienOmapIterator::valid() const
+{
+ return iter->valid();
+}
+
+std::string AlienStore::AlienOmapIterator::key()
+{
+ return iter->key();
+}
+
+seastar::future<std::string> AlienStore::AlienOmapIterator::tail_key()
+{
+ return store->tp->submit([this] {
+ return iter->tail_key();
+ });
+}
+
+ceph::buffer::list AlienStore::AlienOmapIterator::value()
+{
+ return iter->value();
+}
+
+int AlienStore::AlienOmapIterator::status() const
+{
+ return iter->status();
+}
+
+}
diff --git a/src/crimson/os/alienstore/alien_store.h b/src/crimson/os/alienstore/alien_store.h
new file mode 100644
index 000000000..92739340e
--- /dev/null
+++ b/src/crimson/os/alienstore/alien_store.h
@@ -0,0 +1,125 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/core/shared_mutex.hh>
+
+#include "common/ceph_context.h"
+#include "os/ObjectStore.h"
+#include "osd/osd_types.h"
+
+#include "crimson/os/alienstore/thread_pool.h"
+#include "crimson/os/futurized_collection.h"
+#include "crimson/os/futurized_store.h"
+
+namespace ceph::os {
+class Transaction;
+}
+
+namespace crimson::os {
+class AlienStore final : public FuturizedStore {
+public:
+ class AlienOmapIterator final : public OmapIterator {
+ public:
+ AlienOmapIterator(ObjectMap::ObjectMapIterator& it,
+ AlienStore* store) : iter(it), store(store) {}
+ seastar::future<> seek_to_first();
+ seastar::future<> upper_bound(const std::string& after);
+ seastar::future<> lower_bound(const std::string& to);
+ bool valid() const;
+ seastar::future<> next();
+ std::string key();
+ seastar::future<std::string> tail_key();
+ ceph::buffer::list value();
+ int status() const;
+ private:
+ ObjectMap::ObjectMapIterator iter;
+ AlienStore* store;
+ };
+ AlienStore(const std::string& path, const ConfigValues& values);
+ ~AlienStore() final;
+
+ seastar::future<> start() final;
+ seastar::future<> stop() final;
+ seastar::future<> mount() final;
+ seastar::future<> umount() final;
+
+ seastar::future<> mkfs(uuid_d new_osd_fsid) final;
+ read_errorator::future<ceph::bufferlist> read(CollectionRef c,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ uint32_t op_flags = 0) final;
+ read_errorator::future<ceph::bufferlist> readv(CollectionRef c,
+ const ghobject_t& oid,
+ interval_set<uint64_t>& m,
+ uint32_t op_flags = 0) final;
+
+
+ get_attr_errorator::future<ceph::bufferptr> get_attr(CollectionRef c,
+ const ghobject_t& oid,
+ std::string_view name) const final;
+ get_attrs_ertr::future<attrs_t> get_attrs(CollectionRef c,
+ const ghobject_t& oid) final;
+
+ read_errorator::future<omap_values_t> omap_get_values(
+ CollectionRef c,
+ const ghobject_t& oid,
+ const omap_keys_t& keys) final;
+
+ /// Retrieves paged set of values > start (if present)
+ read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values(
+ CollectionRef c, ///< [in] collection
+ const ghobject_t &oid, ///< [in] oid
+ const std::optional<std::string> &start ///< [in] start, empty for begin
+ ) final; ///< @return <done, values> values.empty() iff done
+
+ seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects(
+ CollectionRef c,
+ const ghobject_t& start,
+ const ghobject_t& end,
+ uint64_t limit) const final;
+
+ seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final;
+ seastar::future<CollectionRef> open_collection(const coll_t& cid) final;
+ seastar::future<std::vector<coll_t>> list_collections() final;
+
+ seastar::future<> do_transaction(CollectionRef c,
+ ceph::os::Transaction&& txn) final;
+
+ seastar::future<> write_meta(const std::string& key,
+ const std::string& value) final;
+ seastar::future<std::tuple<int, std::string>> read_meta(
+ const std::string& key) final;
+ uuid_d get_fsid() const final;
+ seastar::future<store_statfs_t> stat() const final;
+ unsigned get_max_attr_name_length() const final;
+ seastar::future<struct stat> stat(
+ CollectionRef,
+ const ghobject_t&) final;
+ read_errorator::future<ceph::bufferlist> omap_get_header(
+ CollectionRef,
+ const ghobject_t&) final;
+ seastar::future<std::map<uint64_t, uint64_t>> fiemap(
+ CollectionRef,
+ const ghobject_t&,
+ uint64_t off,
+ uint64_t len) final;
+ seastar::future<FuturizedStore::OmapIteratorRef> get_omap_iterator(
+ CollectionRef ch,
+ const ghobject_t& oid) final;
+
+private:
+ constexpr static unsigned MAX_KEYS_PER_OMAP_GET_CALL = 32;
+ mutable std::unique_ptr<crimson::os::ThreadPool> tp;
+ const std::string path;
+ uint64_t used_bytes = 0;
+ std::unique_ptr<ObjectStore> store;
+ std::unique_ptr<CephContext> cct;
+ seastar::gate transaction_gate;
+ std::unordered_map<coll_t, CollectionRef> coll_map;
+ seastar::shared_mutex tp_mutex;
+};
+}
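
[editor's note] As a loose illustration (not part of the patch) of the paged omap_get_values() contract documented above, one page of values greater than `start` could be fetched as sketched below. The helper name and size-returning behavior are assumptions, and errors are collapsed to an empty result for brevity.

seastar::future<size_t> demo_omap_page(crimson::os::AlienStore& store,
                                       crimson::os::CollectionRef ch,
                                       const ghobject_t& oid,
                                       const std::optional<std::string>& start)
{
  return store.omap_get_values(ch, oid, start).safe_then(
    [] (auto done_and_values) {
      auto& [done, values] = done_and_values;
      if (done) {
        // per the interface comment above: values.empty() iff done
      }
      return seastar::make_ready_future<size_t>(values.size());
    }, crimson::os::AlienStore::read_errorator::all_same_way(
      [] (const std::error_code&) {
        return seastar::make_ready_future<size_t>(0);  // treat errors as "no values"
      }));
}
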
diff --git a/src/crimson/os/alienstore/thread_pool.cc b/src/crimson/os/alienstore/thread_pool.cc
new file mode 100644
index 000000000..e127d87d5
--- /dev/null
+++ b/src/crimson/os/alienstore/thread_pool.cc
@@ -0,0 +1,80 @@
+#include "thread_pool.h"
+
+#include <chrono>
+#include <pthread.h>
+
+#include "include/ceph_assert.h"
+#include "crimson/common/config_proxy.h"
+
+using crimson::common::local_conf;
+
+namespace crimson::os {
+
+ThreadPool::ThreadPool(size_t n_threads,
+ size_t queue_sz,
+ long cpu_id)
+ : queue_size{round_up_to(queue_sz, seastar::smp::count)},
+ pending{queue_size}
+{
+ auto queue_max_wait = std::chrono::seconds(local_conf()->threadpool_empty_queue_max_wait);
+ for (size_t i = 0; i < n_threads; i++) {
+ threads.emplace_back([this, cpu_id, queue_max_wait] {
+ if (cpu_id >= 0) {
+ pin(cpu_id);
+ }
+ loop(queue_max_wait);
+ });
+ }
+}
+
+ThreadPool::~ThreadPool()
+{
+ for (auto& thread : threads) {
+ thread.join();
+ }
+}
+
+void ThreadPool::pin(unsigned cpu_id)
+{
+ cpu_set_t cs;
+ CPU_ZERO(&cs);
+ CPU_SET(cpu_id, &cs);
+ [[maybe_unused]] auto r = pthread_setaffinity_np(pthread_self(),
+ sizeof(cs), &cs);
+ ceph_assert(r == 0);
+}
+
+void ThreadPool::loop(std::chrono::milliseconds queue_max_wait)
+{
+ for (;;) {
+ WorkItem* work_item = nullptr;
+ {
+ std::unique_lock lock{mutex};
+ cond.wait_for(lock, queue_max_wait,
+ [this, &work_item] {
+ return pending.pop(work_item) || is_stopping();
+ });
+ }
+ if (work_item) {
+ work_item->process();
+ } else if (is_stopping()) {
+ break;
+ }
+ }
+}
+
+seastar::future<> ThreadPool::start()
+{
+ auto slots_per_shard = queue_size / seastar::smp::count;
+ return submit_queue.start(slots_per_shard);
+}
+
+seastar::future<> ThreadPool::stop()
+{
+ return submit_queue.stop().then([this] {
+ stopping = true;
+ cond.notify_all();
+ });
+}
+
+} // namespace crimson::os
diff --git a/src/crimson/os/alienstore/thread_pool.h b/src/crimson/os/alienstore/thread_pool.h
new file mode 100644
index 000000000..27840da18
--- /dev/null
+++ b/src/crimson/os/alienstore/thread_pool.h
@@ -0,0 +1,132 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+#include <atomic>
+#include <condition_variable>
+#include <tuple>
+#include <type_traits>
+#include <boost/lockfree/queue.hpp>
+#include <boost/optional.hpp>
+#include <seastar/core/future.hh>
+#include <seastar/core/gate.hh>
+#include <seastar/core/reactor.hh>
+#include <seastar/core/semaphore.hh>
+#include <seastar/core/sharded.hh>
+
+namespace crimson::os {
+
+struct WorkItem {
+ virtual ~WorkItem() {}
+ virtual void process() = 0;
+};
+
+template<typename Func>
+struct Task final : WorkItem {
+ using T = std::invoke_result_t<Func>;
+ using future_stored_type_t =
+ std::conditional_t<std::is_void_v<T>,
+ seastar::internal::future_stored_type_t<>,
+ seastar::internal::future_stored_type_t<T>>;
+ using futurator_t = seastar::futurize<T>;
+public:
+ explicit Task(Func&& f)
+ : func(std::move(f))
+ {}
+ void process() override {
+ try {
+ if constexpr (std::is_void_v<T>) {
+ func();
+ state.set();
+ } else {
+ state.set(func());
+ }
+ } catch (...) {
+ state.set_exception(std::current_exception());
+ }
+ on_done.write_side().signal(1);
+ }
+ typename futurator_t::type get_future() {
+ return on_done.wait().then([this](size_t) {
+ if (state.failed()) {
+ return futurator_t::make_exception_future(state.get_exception());
+ } else {
+ return futurator_t::from_tuple(state.get_value());
+ }
+ });
+ }
+private:
+ Func func;
+ seastar::future_state<future_stored_type_t> state;
+ seastar::readable_eventfd on_done;
+};
+
+struct SubmitQueue {
+ seastar::semaphore free_slots;
+ seastar::gate pending_tasks;
+ explicit SubmitQueue(size_t num_free_slots)
+ : free_slots(num_free_slots)
+ {}
+ seastar::future<> stop() {
+ return pending_tasks.close();
+ }
+};
+
+/// an engine for scheduling non-seastar tasks from seastar fibers
+class ThreadPool {
+ std::atomic<bool> stopping = false;
+ std::mutex mutex;
+ std::condition_variable cond;
+ std::vector<std::thread> threads;
+ seastar::sharded<SubmitQueue> submit_queue;
+ const size_t queue_size;
+ boost::lockfree::queue<WorkItem*> pending;
+
+ void loop(std::chrono::milliseconds queue_max_wait);
+ bool is_stopping() const {
+ return stopping.load(std::memory_order_relaxed);
+ }
+ static void pin(unsigned cpu_id);
+ seastar::semaphore& local_free_slots() {
+ return submit_queue.local().free_slots;
+ }
+ ThreadPool(const ThreadPool&) = delete;
+ ThreadPool& operator=(const ThreadPool&) = delete;
+public:
+ /**
+ * @param n_threads the number of threads in this thread pool
+ * @param queue_sz the depth of the pending queue. before a task is scheduled,
+ * it waits in this queue. we round this number up to a multiple of
+ * the number of cores.
+ * @param cpu the CPU core to which this thread pool is pinned
+ * @note each @c Task owns a seastar::readable_eventfd, which possesses an fd,
+ * so we should keep the size of the queue under a reasonable limit.
+ */
+ ThreadPool(size_t n_threads, size_t queue_sz, long cpu);
+ ~ThreadPool();
+ seastar::future<> start();
+ seastar::future<> stop();
+ template<typename Func, typename...Args>
+ auto submit(Func&& func, Args&&... args) {
+ auto packaged = [func=std::move(func),
+ args=std::forward_as_tuple(args...)] {
+ return std::apply(std::move(func), std::move(args));
+ };
+ return seastar::with_gate(submit_queue.local().pending_tasks,
+ [packaged=std::move(packaged), this] {
+ return local_free_slots().wait()
+ .then([packaged=std::move(packaged), this] {
+ auto task = new Task{std::move(packaged)};
+ auto fut = task->get_future();
+ pending.push(task);
+ cond.notify_one();
+ return fut.finally([task, this] {
+ local_free_slots().signal();
+ delete task;
+ });
+ });
+ });
+ }
+};
+
+} // namespace crimson::os
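
[editor's note] To make the submit() contract above concrete, here is a hypothetical sketch (not part of the patch): the callable runs on a pool thread and may block, while the calling seastar fiber only waits on the returned future. The helper name and the constant stand-in for a blocking call are assumptions.

seastar::future<int> demo_offload(crimson::os::ThreadPool& tp) {
  return tp.submit([] {
    // runs on a ThreadPool thread; may block without stalling the reactor
    return 42;   // stand-in for a blocking ObjectStore-style call
  }).then([] (int r) {
    // resumes on the seastar shard that called submit()
    return seastar::make_ready_future<int>(r);
  });
}
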
diff --git a/src/crimson/os/cyanstore/CMakeLists.txt b/src/crimson/os/cyanstore/CMakeLists.txt
new file mode 100644
index 000000000..65f2b5498
--- /dev/null
+++ b/src/crimson/os/cyanstore/CMakeLists.txt
@@ -0,0 +1,7 @@
+add_library(crimson-cyanstore STATIC
+ cyan_store.cc
+ cyan_collection.cc
+ cyan_object.cc)
+target_link_libraries(crimson-cyanstore
+ crimson
+ crimson-os)
diff --git a/src/crimson/os/cyanstore/cyan_collection.cc b/src/crimson/os/cyanstore/cyan_collection.cc
new file mode 100644
index 000000000..f44234e84
--- /dev/null
+++ b/src/crimson/os/cyanstore/cyan_collection.cc
@@ -0,0 +1,76 @@
+#include "cyan_collection.h"
+
+#include "cyan_object.h"
+
+namespace crimson::os
+{
+
+Collection::Collection(const coll_t& c)
+ : FuturizedCollection{c}
+{}
+
+Collection::~Collection() = default;
+
+Collection::ObjectRef Collection::create_object() const
+{
+ return new crimson::os::Object;
+}
+
+Collection::ObjectRef Collection::get_object(ghobject_t oid)
+{
+ auto o = object_hash.find(oid);
+ if (o == object_hash.end())
+ return ObjectRef();
+ return o->second;
+}
+
+Collection::ObjectRef Collection::get_or_create_object(ghobject_t oid)
+{
+ auto result = object_hash.emplace(oid, ObjectRef{});
+ if (result.second)
+ object_map[oid] = result.first->second = create_object();
+ return result.first->second;
+}
+
+uint64_t Collection::used_bytes() const
+{
+ uint64_t result = 0;
+ for (auto& obj : object_map) {
+ result += obj.second->get_size();
+ }
+ return result;
+}
+
+void Collection::encode(bufferlist& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(xattr, bl);
+ encode(use_page_set, bl);
+ uint32_t s = object_map.size();
+ encode(s, bl);
+ for (auto& [oid, obj] : object_map) {
+ encode(oid, bl);
+ obj->encode(bl);
+ }
+ ENCODE_FINISH(bl);
+}
+
+void Collection::decode(bufferlist::const_iterator& p)
+{
+ DECODE_START(1, p);
+ decode(xattr, p);
+ decode(use_page_set, p);
+ uint32_t s;
+ decode(s, p);
+ while (s--) {
+ ghobject_t k;
+ decode(k, p);
+ auto o = create_object();
+ o->decode(p);
+ object_map.insert(make_pair(k, o));
+ object_hash.insert(make_pair(k, o));
+ }
+ DECODE_FINISH(p);
+}
+
+}
diff --git a/src/crimson/os/cyanstore/cyan_collection.h b/src/crimson/os/cyanstore/cyan_collection.h
new file mode 100644
index 000000000..068e427d8
--- /dev/null
+++ b/src/crimson/os/cyanstore/cyan_collection.h
@@ -0,0 +1,51 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include "include/buffer.h"
+#include "osd/osd_types.h"
+
+#include "crimson/os/futurized_collection.h"
+
+namespace crimson::os {
+
+class Object;
+/**
+ * a collection also orders transactions
+ *
+ * Any transactions queued under a given collection will be applied in
+ * sequence. Transactions queued under different collections may run
+ * in parallel.
+ *
+ * ObjectStore users may get collection handles with open_collection() (or,
+ * for bootstrapping a new collection, create_new_collection()).
+ */
+struct Collection final : public FuturizedCollection {
+ using ObjectRef = boost::intrusive_ptr<Object>;
+ int bits = 0;
+ // always use bufferlist object for testing
+ bool use_page_set = false;
+ std::unordered_map<ghobject_t, ObjectRef> object_hash; ///< for lookup
+ std::map<ghobject_t, ObjectRef> object_map; ///< for iteration
+ std::map<std::string,bufferptr> xattr;
+ bool exists = true;
+
+ Collection(const coll_t& c);
+ ~Collection() final;
+
+ ObjectRef create_object() const;
+ ObjectRef get_object(ghobject_t oid);
+ ObjectRef get_or_create_object(ghobject_t oid);
+ uint64_t used_bytes() const;
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& p);
+};
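+
+// Sketch of the intended handle flow (illustrative; `store`, `cid` and `oid`
+// are placeholders): a CyanStore front-end creates or opens the collection
+// and then resolves objects through it, e.g.
+//
+//   store.open_collection(cid).then([oid](CollectionRef ch) {
+//     auto c = static_cast<Collection*>(ch.get());
+//     Collection::ObjectRef o = c->get_object(oid);  // null if absent
+//   });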
+
+}
diff --git a/src/crimson/os/cyanstore/cyan_object.cc b/src/crimson/os/cyanstore/cyan_object.cc
new file mode 100644
index 000000000..34bc13b7f
--- /dev/null
+++ b/src/crimson/os/cyanstore/cyan_object.cc
@@ -0,0 +1,89 @@
+#include "cyan_object.h"
+#include "include/encoding.h"
+
+namespace crimson::os {
+
+size_t Object::get_size() const {
+ return data.length();
+}
+
+ceph::bufferlist Object::read(uint64_t offset, uint64_t len)
+{
+ bufferlist ret;
+ ret.substr_of(data, offset, len);
+ return ret;
+}
+
+int Object::write(uint64_t offset, const bufferlist &src)
+{
+ unsigned len = src.length();
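+  // rebuild the object as [head][src][tail]: keep the bytes before `offset`
+  // (zero-filling any hole past the current end), append `src`, then
+  // re-append whatever data followed the overwritten range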
+ // before
+ bufferlist newdata;
+ if (get_size() >= offset) {
+ newdata.substr_of(data, 0, offset);
+ } else {
+ if (get_size()) {
+ newdata.substr_of(data, 0, get_size());
+ }
+ newdata.append_zero(offset - get_size());
+ }
+
+ newdata.append(src);
+
+ // after
+ if (get_size() > offset + len) {
+ bufferlist tail;
+ tail.substr_of(data, offset + len, get_size() - (offset + len));
+ newdata.append(tail);
+ }
+
+ data = std::move(newdata);
+ return 0;
+}
+
+int Object::clone(Object *src, uint64_t srcoff, uint64_t len,
+ uint64_t dstoff)
+{
+ bufferlist bl;
+ if (srcoff == dstoff && len == src->get_size()) {
+ data = src->data;
+ return 0;
+ }
+ bl.substr_of(src->data, srcoff, len);
+ return write(dstoff, bl);
+
+}
+
+int Object::truncate(uint64_t size)
+{
+ if (get_size() > size) {
+ bufferlist bl;
+ bl.substr_of(data, 0, size);
+ data = std::move(bl);
+ } else if (get_size() == size) {
+ // do nothing
+ } else {
+ data.append_zero(size - get_size());
+ }
+ return 0;
+}
+
+void Object::encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(data, bl);
+ encode(xattr, bl);
+ encode(omap_header, bl);
+ encode(omap, bl);
+ ENCODE_FINISH(bl);
+}
+
+void Object::decode(bufferlist::const_iterator& p) {
+ DECODE_START(1, p);
+ decode(data, p);
+ decode(xattr, p);
+ decode(omap_header, p);
+ decode(omap, p);
+ DECODE_FINISH(p);
+}
+
+}
diff --git a/src/crimson/os/cyanstore/cyan_object.h b/src/crimson/os/cyanstore/cyan_object.h
new file mode 100644
index 000000000..f19b87212
--- /dev/null
+++ b/src/crimson/os/cyanstore/cyan_object.h
@@ -0,0 +1,45 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+#include <cstddef>
+#include <map>
+#include <string>
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include "include/buffer.h"
+
+namespace crimson::os {
+
+struct Object : public boost::intrusive_ref_counter<
+ Object,
+ boost::thread_unsafe_counter>
+{
+ using bufferlist = ceph::bufferlist;
+
+ bufferlist data;
+ // use transparent comparator for better performance, see
+ // https://en.cppreference.com/w/cpp/utility/functional/less_void
+ std::map<std::string,bufferptr,std::less<>> xattr;
+ bufferlist omap_header;
+ std::map<std::string,bufferlist> omap;
+
+ typedef boost::intrusive_ptr<Object> Ref;
+
+ Object() = default;
+
+ // interface for object data
+ size_t get_size() const;
+ ceph::bufferlist read(uint64_t offset, uint64_t len);
+ int write(uint64_t offset, const bufferlist &bl);
+ int clone(Object *src, uint64_t srcoff, uint64_t len,
+ uint64_t dstoff);
+ int truncate(uint64_t offset);
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& p);
+};
+using ObjectRef = boost::intrusive_ptr<Object>;
+
+}
diff --git a/src/crimson/os/cyanstore/cyan_store.cc b/src/crimson/os/cyanstore/cyan_store.cc
new file mode 100644
index 000000000..eb93d72ec
--- /dev/null
+++ b/src/crimson/os/cyanstore/cyan_store.cc
@@ -0,0 +1,835 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "cyan_store.h"
+
+#include <boost/algorithm/string/trim.hpp>
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+
+#include "common/safe_io.h"
+#include "os/Transaction.h"
+
+#include "crimson/common/buffer_io.h"
+#include "crimson/common/config_proxy.h"
+#include "cyan_collection.h"
+#include "cyan_object.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+}
+
+using crimson::common::local_conf;
+
+namespace crimson::os {
+
+using ObjectRef = boost::intrusive_ptr<Object>;
+
+CyanStore::CyanStore(const std::string& path)
+ : path{path}
+{}
+
+CyanStore::~CyanStore() = default;
+
+seastar::future<> CyanStore::mount()
+{
+ ceph::bufferlist bl;
+ std::string fn = path + "/collections";
+ std::string err;
+ if (int r = bl.read_file(fn.c_str(), &err); r < 0) {
+ throw std::runtime_error("read_file");
+ }
+
+ std::set<coll_t> collections;
+ auto p = bl.cbegin();
+ ceph::decode(collections, p);
+
+ for (auto& coll : collections) {
+ std::string fn = fmt::format("{}/{}", path, coll);
+ ceph::bufferlist cbl;
+ if (int r = cbl.read_file(fn.c_str(), &err); r < 0) {
+ throw std::runtime_error("read_file");
+ }
+ boost::intrusive_ptr<Collection> c{new Collection{coll}};
+ auto p = cbl.cbegin();
+ c->decode(p);
+ coll_map[coll] = c;
+ used_bytes += c->used_bytes();
+ }
+ return seastar::now();
+}
+
+seastar::future<> CyanStore::umount()
+{
+ return seastar::do_with(std::set<coll_t>{}, [this](auto& collections) {
+ return seastar::do_for_each(coll_map, [&collections, this](auto& coll) {
+ auto& [col, ch] = coll;
+ collections.insert(col);
+ ceph::bufferlist bl;
+ ceph_assert(ch);
+ ch->encode(bl);
+ std::string fn = fmt::format("{}/{}", path, col);
+ return crimson::write_file(std::move(bl), fn);
+ }).then([&collections, this] {
+ ceph::bufferlist bl;
+ ceph::encode(collections, bl);
+ std::string fn = fmt::format("{}/collections", path);
+ return crimson::write_file(std::move(bl), fn);
+ });
+ });
+}
+
+seastar::future<> CyanStore::mkfs(uuid_d new_osd_fsid)
+{
+ return read_meta("fsid").then([=](auto&& ret) {
+ auto& [r, fsid_str] = ret;
+ if (r == -ENOENT) {
+ if (new_osd_fsid.is_zero()) {
+ osd_fsid.generate_random();
+ } else {
+ osd_fsid = new_osd_fsid;
+ }
+ return write_meta("fsid", fmt::format("{}", osd_fsid));
+ } else if (r < 0) {
+ throw std::runtime_error("read_meta");
+ } else {
+ logger().info("{} already has fsid {}", __func__, fsid_str);
+ if (!osd_fsid.parse(fsid_str.c_str())) {
+ throw std::runtime_error("failed to parse fsid");
+    } else if (!new_osd_fsid.is_zero() && osd_fsid != new_osd_fsid) {
+ logger().error("on-disk fsid {} != provided {}", osd_fsid, new_osd_fsid);
+ throw std::runtime_error("unmatched osd_fsid");
+ } else {
+ return seastar::now();
+ }
+ }
+ }).then([this]{
+ std::string fn = path + "/collections";
+ ceph::bufferlist bl;
+ std::set<coll_t> collections;
+ ceph::encode(collections, bl);
+ return crimson::write_file(std::move(bl), fn);
+ }).then([this] {
+ return write_meta("type", "memstore");
+ });
+}
+
+seastar::future<store_statfs_t> CyanStore::stat() const
+{
+ logger().debug("{}", __func__);
+ store_statfs_t st;
+ st.total = crimson::common::local_conf().get_val<Option::size_t>("memstore_device_bytes");
+ st.available = st.total - used_bytes;
+ return seastar::make_ready_future<store_statfs_t>(std::move(st));
+}
+
+seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>>
+CyanStore::list_objects(CollectionRef ch,
+ const ghobject_t& start,
+ const ghobject_t& end,
+ uint64_t limit) const
+{
+ auto c = static_cast<Collection*>(ch.get());
+ logger().debug("{} {} {} {} {}",
+ __func__, c->get_cid(), start, end, limit);
+ std::vector<ghobject_t> objects;
+ objects.reserve(limit);
+ ghobject_t next = ghobject_t::get_max();
+ for (const auto& [oid, obj] :
+ boost::make_iterator_range(c->object_map.lower_bound(start),
+ c->object_map.end())) {
+ std::ignore = obj;
+ if (oid >= end || objects.size() >= limit) {
+ next = oid;
+ break;
+ }
+ objects.push_back(oid);
+ }
+ return seastar::make_ready_future<std::tuple<std::vector<ghobject_t>, ghobject_t>>(
+ std::make_tuple(std::move(objects), next));
+}
+
+seastar::future<CollectionRef> CyanStore::create_new_collection(const coll_t& cid)
+{
+ auto c = new Collection{cid};
+ new_coll_map[cid] = c;
+ return seastar::make_ready_future<CollectionRef>(c);
+}
+
+seastar::future<CollectionRef> CyanStore::open_collection(const coll_t& cid)
+{
+ return seastar::make_ready_future<CollectionRef>(_get_collection(cid));
+}
+
+seastar::future<std::vector<coll_t>> CyanStore::list_collections()
+{
+ std::vector<coll_t> collections;
+ for (auto& coll : coll_map) {
+ collections.push_back(coll.first);
+ }
+ return seastar::make_ready_future<std::vector<coll_t>>(std::move(collections));
+}
+
+CyanStore::read_errorator::future<ceph::bufferlist> CyanStore::read(
+ CollectionRef ch,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ uint32_t op_flags)
+{
+ auto c = static_cast<Collection*>(ch.get());
+ logger().debug("{} {} {} {}~{}",
+ __func__, c->get_cid(), oid, offset, len);
+ if (!c->exists) {
+ return crimson::ct_error::enoent::make();
+ }
+ ObjectRef o = c->get_object(oid);
+ if (!o) {
+ return crimson::ct_error::enoent::make();
+ }
+ if (offset >= o->get_size())
+ return read_errorator::make_ready_future<ceph::bufferlist>();
+ size_t l = len;
+ if (l == 0 && offset == 0) // note: len == 0 means read the entire object
+ l = o->get_size();
+ else if (offset + l > o->get_size())
+ l = o->get_size() - offset;
+ return read_errorator::make_ready_future<ceph::bufferlist>(o->read(offset, l));
+}
+
+CyanStore::read_errorator::future<ceph::bufferlist> CyanStore::readv(
+ CollectionRef ch,
+ const ghobject_t& oid,
+ interval_set<uint64_t>& m,
+ uint32_t op_flags)
+{
+ return seastar::do_with(ceph::bufferlist{},
+ [this, ch, oid, &m, op_flags](auto& bl) {
+ return crimson::do_for_each(m,
+ [this, ch, oid, op_flags, &bl](auto& p) {
+ return read(ch, oid, p.first, p.second, op_flags)
+ .safe_then([&bl](auto ret) {
+ bl.claim_append(ret);
+ });
+ }).safe_then([&bl] {
+ return read_errorator::make_ready_future<ceph::bufferlist>(std::move(bl));
+ });
+ });
+}
+
+
+CyanStore::get_attr_errorator::future<ceph::bufferptr> CyanStore::get_attr(
+ CollectionRef ch,
+ const ghobject_t& oid,
+ std::string_view name) const
+{
+ auto c = static_cast<Collection*>(ch.get());
+ logger().debug("{} {} {}",
+ __func__, c->get_cid(), oid);
+ auto o = c->get_object(oid);
+ if (!o) {
+ return crimson::ct_error::enoent::make();
+ }
+ if (auto found = o->xattr.find(name); found != o->xattr.end()) {
+ return get_attr_errorator::make_ready_future<ceph::bufferptr>(found->second);
+ } else {
+ return crimson::ct_error::enodata::make();
+ }
+}
+
+CyanStore::get_attrs_ertr::future<CyanStore::attrs_t> CyanStore::get_attrs(
+ CollectionRef ch,
+ const ghobject_t& oid)
+{
+ auto c = static_cast<Collection*>(ch.get());
+ logger().debug("{} {} {}",
+ __func__, c->get_cid(), oid);
+ auto o = c->get_object(oid);
+ if (!o) {
+ return crimson::ct_error::enoent::make();
+ }
+ return get_attrs_ertr::make_ready_future<attrs_t>(o->xattr);
+}
+
+auto CyanStore::omap_get_values(CollectionRef ch,
+ const ghobject_t& oid,
+ const omap_keys_t& keys)
+ -> read_errorator::future<omap_values_t>
+{
+ auto c = static_cast<Collection*>(ch.get());
+ logger().debug("{} {} {}", __func__, c->get_cid(), oid);
+ auto o = c->get_object(oid);
+ if (!o) {
+ return crimson::ct_error::enoent::make();
+ }
+ omap_values_t values;
+ for (auto& key : keys) {
+ if (auto found = o->omap.find(key); found != o->omap.end()) {
+ values.insert(*found);
+ }
+ }
+ return seastar::make_ready_future<omap_values_t>(std::move(values));
+}
+
+auto
+CyanStore::omap_get_values(CollectionRef ch,
+                           const ghobject_t &oid,
+                           const std::optional<std::string> &start)
+ -> read_errorator::future<std::tuple<bool, omap_values_t>>
+{
+ auto c = static_cast<Collection*>(ch.get());
+ logger().debug("{} {} {}", __func__, c->get_cid(), oid);
+ auto o = c->get_object(oid);
+ if (!o) {
+ return crimson::ct_error::enoent::make();
+ }
+ omap_values_t values;
+ for (auto i = start ? o->omap.upper_bound(*start) : o->omap.begin();
+ values.size() < MAX_KEYS_PER_OMAP_GET_CALL && i != o->omap.end();
+ ++i) {
+ values.insert(*i);
+ }
+ return seastar::make_ready_future<std::tuple<bool, omap_values_t>>(
+ std::make_tuple(true, std::move(values)));
+}
+
+auto
+CyanStore::omap_get_header(CollectionRef ch,
+ const ghobject_t& oid)
+ -> read_errorator::future<ceph::bufferlist>
+{
+ auto c = static_cast<Collection*>(ch.get());
+ auto o = c->get_object(oid);
+ if (!o) {
+ return crimson::ct_error::enoent::make();
+ }
+
+ return read_errorator::make_ready_future<ceph::bufferlist>(
+ o->omap_header);
+}
+
+seastar::future<> CyanStore::do_transaction(CollectionRef ch,
+ ceph::os::Transaction&& t)
+{
+ using ceph::os::Transaction;
+ int r = 0;
+ try {
+ auto i = t.begin();
+ while (i.have_op()) {
+ r = 0;
+ switch (auto op = i.decode_op(); op->op) {
+ case Transaction::OP_NOP:
+ break;
+ case Transaction::OP_REMOVE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ r = _remove(cid, oid);
+ if (r == -ENOENT) {
+ r = 0;
+ }
+ }
+ break;
+ case Transaction::OP_TOUCH:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ r = _touch(cid, oid);
+ }
+ break;
+ case Transaction::OP_WRITE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ uint32_t fadvise_flags = i.get_fadvise_flags();
+ ceph::bufferlist bl;
+ i.decode_bl(bl);
+ r = _write(cid, oid, off, len, bl, fadvise_flags);
+ }
+ break;
+ case Transaction::OP_ZERO:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ r = _zero(cid, oid, off, len);
+ }
+ break;
+ case Transaction::OP_TRUNCATE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ uint64_t off = op->off;
+ r = _truncate(cid, oid, off);
+ }
+ break;
+ case Transaction::OP_SETATTR:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ std::string name = i.decode_string();
+ ceph::bufferlist bl;
+ i.decode_bl(bl);
+ std::map<std::string, bufferptr> to_set;
+ to_set[name] = bufferptr(bl.c_str(), bl.length());
+ r = _setattrs(cid, oid, to_set);
+ }
+ break;
+ case Transaction::OP_RMATTR:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ std::string name = i.decode_string();
+ r = _rm_attr(cid, oid, name);
+ }
+ break;
+ case Transaction::OP_MKCOLL:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ r = _create_collection(cid, op->split_bits);
+ }
+ break;
+ case Transaction::OP_OMAP_CLEAR:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ r = _omap_clear(cid, oid);
+ }
+ break;
+ case Transaction::OP_OMAP_SETKEYS:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ std::map<std::string, ceph::bufferlist> aset;
+ i.decode_attrset(aset);
+ r = _omap_set_values(cid, oid, std::move(aset));
+ }
+ break;
+ case Transaction::OP_OMAP_SETHEADER:
+ {
+ const coll_t &cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ ceph::bufferlist bl;
+ i.decode_bl(bl);
+ r = _omap_set_header(cid, oid, bl);
+ }
+ break;
+ case Transaction::OP_OMAP_RMKEYS:
+ {
+ const coll_t &cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ omap_keys_t keys;
+ i.decode_keyset(keys);
+ r = _omap_rmkeys(cid, oid, keys);
+ }
+ break;
+ case Transaction::OP_OMAP_RMKEYRANGE:
+ {
+ const coll_t &cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ string first, last;
+ first = i.decode_string();
+ last = i.decode_string();
+ r = _omap_rmkeyrange(cid, oid, first, last);
+ }
+ break;
+ case Transaction::OP_COLL_HINT:
+ {
+ ceph::bufferlist hint;
+ i.decode_bl(hint);
+ // ignored
+ break;
+ }
+ default:
+ logger().error("bad op {}", static_cast<unsigned>(op->op));
+ abort();
+ }
+ if (r < 0) {
+ break;
+ }
+ }
+ } catch (std::exception &e) {
+ logger().error("{} got exception {}", __func__, e);
+ r = -EINVAL;
+ }
+ if (r < 0) {
+ logger().error(" transaction dump:\n");
+ JSONFormatter f(true);
+ f.open_object_section("transaction");
+ t.dump(&f);
+ f.close_section();
+ std::stringstream str;
+ f.flush(str);
+ logger().error("{}", str.str());
+ ceph_assert(r == 0);
+ }
+ for (auto i : {
+ t.get_on_applied(),
+ t.get_on_commit(),
+ t.get_on_applied_sync()}) {
+ if (i) {
+ i->complete(0);
+ }
+ }
+ return seastar::now();
+}
+
+int CyanStore::_remove(const coll_t& cid, const ghobject_t& oid)
+{
+ logger().debug("{} cid={} oid={}",
+ __func__, cid, oid);
+ auto c = _get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ auto i = c->object_hash.find(oid);
+ if (i == c->object_hash.end())
+ return -ENOENT;
+ used_bytes -= i->second->get_size();
+ c->object_hash.erase(i);
+ c->object_map.erase(oid);
+ return 0;
+}
+
+int CyanStore::_touch(const coll_t& cid, const ghobject_t& oid)
+{
+ logger().debug("{} cid={} oid={}",
+ __func__, cid, oid);
+ auto c = _get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ c->get_or_create_object(oid);
+ return 0;
+}
+
+int CyanStore::_write(const coll_t& cid, const ghobject_t& oid,
+ uint64_t offset, size_t len, const ceph::bufferlist& bl,
+ uint32_t fadvise_flags)
+{
+ logger().debug("{} {} {} {} ~ {}",
+ __func__, cid, oid, offset, len);
+ assert(len == bl.length());
+
+ auto c = _get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_or_create_object(oid);
+ if (len > 0 && !local_conf()->memstore_debug_omit_block_device_write) {
+ const ssize_t old_size = o->get_size();
+ o->write(offset, bl);
+ used_bytes += (o->get_size() - old_size);
+ }
+
+ return 0;
+}
+
+int CyanStore::_zero(const coll_t& cid, const ghobject_t& oid,
+ uint64_t offset, size_t len)
+{
+ logger().debug("{} {} {} {} ~ {}",
+ __func__, cid, oid, offset, len);
+
+ ceph::buffer::list bl;
+ bl.append_zero(len);
+ return _write(cid, oid, offset, len, bl, 0);
+}
+
+int CyanStore::_omap_clear(
+ const coll_t& cid,
+ const ghobject_t& oid)
+{
+ logger().debug("{} {} {}", __func__, cid, oid);
+
+ auto c = _get_collection(cid);
+ if (!c) {
+ return -ENOENT;
+ }
+ ObjectRef o = c->get_object(oid);
+ if (!o) {
+ return -ENOENT;
+ }
+ o->omap.clear();
+ o->omap_header.clear();
+ return 0;
+}
+
+int CyanStore::_omap_set_values(
+ const coll_t& cid,
+ const ghobject_t& oid,
+ std::map<std::string, ceph::bufferlist> &&aset)
+{
+ logger().debug(
+ "{} {} {} {} keys",
+ __func__, cid, oid, aset.size());
+
+ auto c = _get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_or_create_object(oid);
+ for (auto &&i: aset) {
+ o->omap.insert(std::move(i));
+ }
+ return 0;
+}
+
+int CyanStore::_omap_set_header(
+ const coll_t& cid,
+ const ghobject_t& oid,
+ const ceph::bufferlist &header)
+{
+ logger().debug(
+ "{} {} {} {} bytes",
+ __func__, cid, oid, header.length());
+
+ auto c = _get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_or_create_object(oid);
+ o->omap_header = header;
+ return 0;
+}
+
+int CyanStore::_omap_rmkeys(
+ const coll_t& cid,
+ const ghobject_t& oid,
+ const omap_keys_t& aset)
+{
+ logger().debug(
+ "{} {} {} {} keys",
+ __func__, cid, oid, aset.size());
+
+ auto c = _get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_or_create_object(oid);
+ for (auto &i: aset) {
+ o->omap.erase(i);
+ }
+ return 0;
+}
+
+int CyanStore::_omap_rmkeyrange(
+ const coll_t& cid,
+ const ghobject_t& oid,
+ const std::string &first,
+ const std::string &last)
+{
+ logger().debug(
+ "{} {} {} first={} last={}",
+ __func__, cid, oid, first, last);
+
+ auto c = _get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_or_create_object(oid);
+ for (auto i = o->omap.lower_bound(first);
+ i != o->omap.end() && i->first <= last;
+ o->omap.erase(i++));
+ return 0;
+}
+
+int CyanStore::_truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size)
+{
+ logger().debug("{} cid={} oid={} size={}",
+ __func__, cid, oid, size);
+ auto c = _get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ if (local_conf()->memstore_debug_omit_block_device_write)
+ return 0;
+ const ssize_t old_size = o->get_size();
+ int r = o->truncate(size);
+ used_bytes += (o->get_size() - old_size);
+ return r;
+}
+
+int CyanStore::_setattrs(const coll_t& cid, const ghobject_t& oid,
+ std::map<std::string,bufferptr>& aset)
+{
+ logger().debug("{} cid={} oid={}",
+ __func__, cid, oid);
+ auto c = _get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+  for (auto& [key, val] : aset)
+    o->xattr[key] = val;
+ return 0;
+}
+
+int CyanStore::_rm_attr(const coll_t& cid, const ghobject_t& oid,
+ std::string_view name)
+{
+ logger().debug("{} cid={} oid={} name={}", __func__, cid, oid, name);
+ auto c = _get_collection(cid);
+ if (!c) {
+ return -ENOENT;
+ }
+ ObjectRef o = c->get_object(oid);
+ if (!o) {
+ return -ENOENT;
+ }
+ auto i = o->xattr.find(name);
+ if (i == o->xattr.end()) {
+ return -ENODATA;
+ }
+ o->xattr.erase(i);
+ return 0;
+}
+
+int CyanStore::_create_collection(const coll_t& cid, int bits)
+{
+ auto result = coll_map.try_emplace(cid);
+ if (!result.second)
+ return -EEXIST;
+ auto p = new_coll_map.find(cid);
+ assert(p != new_coll_map.end());
+ result.first->second = p->second;
+ result.first->second->bits = bits;
+ new_coll_map.erase(p);
+ return 0;
+}
+
+boost::intrusive_ptr<Collection> CyanStore::_get_collection(const coll_t& cid)
+{
+ auto cp = coll_map.find(cid);
+ if (cp == coll_map.end())
+ return {};
+ return cp->second;
+}
+
+seastar::future<> CyanStore::write_meta(const std::string& key,
+ const std::string& value)
+{
+ std::string v = value;
+ v += "\n";
+ if (int r = safe_write_file(path.c_str(), key.c_str(),
+ v.c_str(), v.length(), 0600);
+ r < 0) {
+ throw std::runtime_error{fmt::format("unable to write_meta({})", key)};
+ }
+ return seastar::make_ready_future<>();
+}
+
+seastar::future<std::tuple<int, std::string>>
+CyanStore::read_meta(const std::string& key)
+{
+ std::string fsid(4096, '\0');
+ int r = safe_read_file(path.c_str(), key.c_str(), fsid.data(), fsid.size());
+ if (r > 0) {
+ fsid.resize(r);
+ // drop trailing newlines
+ boost::algorithm::trim_right_if(fsid,
+ [](unsigned char c) {return isspace(c);});
+ } else {
+ fsid.clear();
+ }
+ return seastar::make_ready_future<std::tuple<int, std::string>>(
+ std::make_tuple(r, fsid));
+}
+
+uuid_d CyanStore::get_fsid() const
+{
+ return osd_fsid;
+}
+
+unsigned CyanStore::get_max_attr_name_length() const
+{
+ // arbitrary limitation exactly like in the case of MemStore.
+ return 256;
+}
+
+seastar::future<FuturizedStore::OmapIteratorRef> CyanStore::get_omap_iterator(
+ CollectionRef ch,
+ const ghobject_t& oid)
+{
+ auto c = static_cast<Collection*>(ch.get());
+ auto o = c->get_object(oid);
+ if (!o) {
+ throw std::runtime_error(fmt::format("object does not exist: {}", oid));
+ }
+ return seastar::make_ready_future<FuturizedStore::OmapIteratorRef>(
+ new CyanStore::CyanOmapIterator(o));
+}
+
+seastar::future<std::map<uint64_t, uint64_t>>
+CyanStore::fiemap(
+ CollectionRef ch,
+ const ghobject_t& oid,
+ uint64_t off,
+ uint64_t len)
+{
+ auto c = static_cast<Collection*>(ch.get());
+
+ ObjectRef o = c->get_object(oid);
+ if (!o) {
+ throw std::runtime_error(fmt::format("object does not exist: {}", oid));
+ }
+ std::map<uint64_t, uint64_t> m{{0, o->get_size()}};
+ return seastar::make_ready_future<std::map<uint64_t, uint64_t>>(std::move(m));
+}
+
+seastar::future<struct stat>
+CyanStore::stat(
+ CollectionRef ch,
+ const ghobject_t& oid)
+{
+ auto c = static_cast<Collection*>(ch.get());
+ auto o = c->get_object(oid);
+ if (!o) {
+ throw std::runtime_error(fmt::format("object does not exist: {}", oid));
+ }
+  struct stat st = {};
+ st.st_size = o->get_size();
+ return seastar::make_ready_future<struct stat>(std::move(st));
+}
+
+seastar::future<> CyanStore::CyanOmapIterator::seek_to_first()
+{
+ iter = obj->omap.begin();
+ return seastar::make_ready_future<>();
+}
+
+seastar::future<> CyanStore::CyanOmapIterator::upper_bound(const std::string& after)
+{
+ iter = obj->omap.upper_bound(after);
+ return seastar::make_ready_future<>();
+}
+
+seastar::future<> CyanStore::CyanOmapIterator::lower_bound(const std::string &to)
+{
+ iter = obj->omap.lower_bound(to);
+ return seastar::make_ready_future<>();
+}
+
+bool CyanStore::CyanOmapIterator::valid() const
+{
+ return iter != obj->omap.end();
+}
+
+seastar::future<> CyanStore::CyanOmapIterator::next()
+{
+ ++iter;
+ return seastar::make_ready_future<>();
+}
+
+}
diff --git a/src/crimson/os/cyanstore/cyan_store.h b/src/crimson/os/cyanstore/cyan_store.h
new file mode 100644
index 000000000..07a8ff29e
--- /dev/null
+++ b/src/crimson/os/cyanstore/cyan_store.h
@@ -0,0 +1,185 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include <map>
+#include <typeinfo>
+#include <vector>
+
+#include <optional>
+#include <seastar/core/future.hh>
+#include <seastar/core/future-util.hh>
+
+#include "osd/osd_types.h"
+#include "include/uuid.h"
+
+#include "crimson/os/cyanstore/cyan_object.h"
+#include "crimson/os/futurized_store.h"
+
+namespace ceph::os {
+class Transaction;
+}
+
+namespace crimson::os {
+class Collection;
+
+class CyanStore final : public FuturizedStore {
+ constexpr static unsigned MAX_KEYS_PER_OMAP_GET_CALL = 32;
+
+ const std::string path;
+ std::unordered_map<coll_t, boost::intrusive_ptr<Collection>> coll_map;
+ std::map<coll_t, boost::intrusive_ptr<Collection>> new_coll_map;
+ uint64_t used_bytes = 0;
+ uuid_d osd_fsid;
+
+public:
+ class CyanOmapIterator final : public OmapIterator {
+ public:
+ CyanOmapIterator() {}
+ CyanOmapIterator(ObjectRef obj) : obj(obj) {
+ iter = obj->omap.begin();
+ }
+ seastar::future<> seek_to_first() final;
+ seastar::future<> upper_bound(const std::string &after) final;
+ seastar::future<> lower_bound(const std::string &to) final;
+ bool valid() const final;
+ seastar::future<> next() final;
+ std::string key() final {
+ return iter->first;
+ }
+    virtual seastar::future<std::string> tail_key() {
+      // return the last key in the object's omap
+      return seastar::make_ready_future<std::string>(obj->omap.rbegin()->first);
+    }
+ virtual ceph::buffer::list value() {
+ return iter->second;
+ }
+ virtual int status() const {
+ return iter != obj->omap.end() ? 0 : -1;
+ }
+ virtual ~CyanOmapIterator() {}
+ private:
+ std::map<std::string, bufferlist>::const_iterator iter;
+ ObjectRef obj;
+ };
+
+ CyanStore(const std::string& path);
+ ~CyanStore() final;
+
+ seastar::future<> stop() final {
+ return seastar::now();
+ }
+ seastar::future<> mount() final;
+ seastar::future<> umount() final;
+
+ seastar::future<> mkfs(uuid_d new_osd_fsid) final;
+ seastar::future<store_statfs_t> stat() const final;
+ seastar::future<struct stat> stat(
+ CollectionRef c,
+ const ghobject_t& oid) final;
+
+ read_errorator::future<ceph::bufferlist> read(
+ CollectionRef c,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ uint32_t op_flags = 0) final;
+ read_errorator::future<ceph::bufferlist> readv(
+ CollectionRef c,
+ const ghobject_t& oid,
+ interval_set<uint64_t>& m,
+ uint32_t op_flags = 0) final;
+
+ get_attr_errorator::future<ceph::bufferptr> get_attr(
+ CollectionRef c,
+ const ghobject_t& oid,
+ std::string_view name) const final;
+ get_attrs_ertr::future<attrs_t> get_attrs(
+ CollectionRef c,
+ const ghobject_t& oid);
+
+ read_errorator::future<omap_values_t> omap_get_values(
+ CollectionRef c,
+ const ghobject_t& oid,
+ const omap_keys_t& keys) final;
+
+ /// Retrieves paged set of values > start (if present)
+ read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values(
+ CollectionRef c, ///< [in] collection
+ const ghobject_t &oid, ///< [in] oid
+ const std::optional<std::string> &start ///< [in] start, empty for begin
+ ) final; ///< @return <done, values> values.empty() iff done
+
+ seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects(
+ CollectionRef c,
+ const ghobject_t& start,
+ const ghobject_t& end,
+ uint64_t limit) const final;
+
+ read_errorator::future<ceph::bufferlist> omap_get_header(
+ CollectionRef c,
+ const ghobject_t& oid) final;
+
+ seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final;
+ seastar::future<CollectionRef> open_collection(const coll_t& cid) final;
+ seastar::future<std::vector<coll_t>> list_collections() final;
+
+ seastar::future<> do_transaction(CollectionRef ch,
+ ceph::os::Transaction&& txn) final;
+
+ seastar::future<> write_meta(const std::string& key,
+ const std::string& value) final;
+ seastar::future<std::tuple<int, std::string>>
+ read_meta(const std::string& key) final;
+ uuid_d get_fsid() const final;
+ unsigned get_max_attr_name_length() const final;
+
+ seastar::future<OmapIteratorRef> get_omap_iterator(
+ CollectionRef c,
+ const ghobject_t& oid);
+
+ seastar::future<std::map<uint64_t, uint64_t>> fiemap(CollectionRef c,
+ const ghobject_t& oid,
+ uint64_t off,
+ uint64_t len);
+
+private:
+ int _remove(const coll_t& cid, const ghobject_t& oid);
+ int _touch(const coll_t& cid, const ghobject_t& oid);
+ int _write(const coll_t& cid, const ghobject_t& oid,
+ uint64_t offset, size_t len, const ceph::bufferlist& bl,
+ uint32_t fadvise_flags);
+ int _zero(const coll_t& cid, const ghobject_t& oid,
+ uint64_t offset, size_t len);
+ int _omap_clear(
+ const coll_t& cid,
+ const ghobject_t& oid);
+ int _omap_set_values(
+ const coll_t& cid,
+ const ghobject_t& oid,
+ std::map<std::string, ceph::bufferlist> &&aset);
+ int _omap_set_header(
+ const coll_t& cid,
+ const ghobject_t& oid,
+ const ceph::bufferlist &header);
+ int _omap_rmkeys(
+ const coll_t& cid,
+ const ghobject_t& oid,
+ const omap_keys_t& aset);
+ int _omap_rmkeyrange(
+ const coll_t& cid,
+ const ghobject_t& oid,
+ const std::string &first,
+ const std::string &last);
+ int _truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size);
+ int _setattrs(const coll_t& cid, const ghobject_t& oid,
+ std::map<std::string,bufferptr>& aset);
+  int _rm_attr(const coll_t& cid, const ghobject_t& oid,
+               std::string_view name);
+ int _create_collection(const coll_t& cid, int bits);
+ boost::intrusive_ptr<Collection> _get_collection(const coll_t& cid);
+};
+
+}
diff --git a/src/crimson/os/futurized_collection.h b/src/crimson/os/futurized_collection.h
new file mode 100644
index 000000000..06f7d2f47
--- /dev/null
+++ b/src/crimson/os/futurized_collection.h
@@ -0,0 +1,37 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <seastar/core/future.hh>
+
+#include "osd/osd_types.h"
+
+namespace crimson::os {
+class FuturizedStore;
+
+class FuturizedCollection
+ : public boost::intrusive_ref_counter<FuturizedCollection,
+ boost::thread_unsafe_counter>
+{
+public:
+ FuturizedCollection(const coll_t& cid)
+ : cid{cid} {}
+ virtual ~FuturizedCollection() {}
+ virtual seastar::future<> flush() {
+ return seastar::make_ready_future<>();
+ }
+ virtual seastar::future<bool> flush_commit() {
+ return seastar::make_ready_future<bool>(true);
+ }
+ const coll_t& get_cid() const {
+ return cid;
+ }
+private:
+ const coll_t cid;
+};
+
+using CollectionRef = boost::intrusive_ptr<FuturizedCollection>;
+}
diff --git a/src/crimson/os/futurized_store.cc b/src/crimson/os/futurized_store.cc
new file mode 100644
index 000000000..bb73c3478
--- /dev/null
+++ b/src/crimson/os/futurized_store.cc
@@ -0,0 +1,22 @@
+#include "futurized_store.h"
+#include "cyanstore/cyan_store.h"
+#include "alienstore/alien_store.h"
+
+namespace crimson::os {
+
+std::unique_ptr<FuturizedStore>
+FuturizedStore::create(const std::string& type,
+ const std::string& data,
+ const ConfigValues& values)
+{
+ if (type == "memstore") {
+ return std::make_unique<crimson::os::CyanStore>(data);
+ } else if (type == "bluestore") {
+ return std::make_unique<crimson::os::AlienStore>(data, values);
+ } else {
+ ceph_abort_msgf("unsupported objectstore type: %s", type.c_str());
+ return {};
+ }
+}
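+
+// Illustrative call site (sketch; the data path and config object are
+// placeholders, not taken from this patch):
+//
+//   std::unique_ptr<FuturizedStore> store =
+//     FuturizedStore::create("memstore", "/tmp/osd-data", values);
+//   store->start().then([&] { return store->mount(); });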
+
+}
diff --git a/src/crimson/os/futurized_store.h b/src/crimson/os/futurized_store.h
new file mode 100644
index 000000000..bb173056b
--- /dev/null
+++ b/src/crimson/os/futurized_store.h
@@ -0,0 +1,167 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <string>
+#include <map>
+#include <optional>
+#include <vector>
+
+#include <seastar/core/future.hh>
+
+#include "crimson/osd/exceptions.h"
+#include "include/buffer_fwd.h"
+#include "include/uuid.h"
+#include "osd/osd_types.h"
+
+namespace ceph::os {
+class Transaction;
+}
+
+namespace crimson::os {
+class FuturizedCollection;
+
+class FuturizedStore {
+
+public:
+ class OmapIterator {
+ public:
+ virtual seastar::future<> seek_to_first() = 0;
+ virtual seastar::future<> upper_bound(const std::string &after) = 0;
+ virtual seastar::future<> lower_bound(const std::string &to) = 0;
+ virtual bool valid() const {
+ return false;
+ }
+ virtual seastar::future<> next() = 0;
+ virtual std::string key() {
+ return {};
+ }
+ virtual seastar::future<std::string> tail_key() {
+ return seastar::make_ready_future<std::string>();
+ }
+ virtual ceph::buffer::list value() {
+ return {};
+ }
+ virtual int status() const {
+ return 0;
+ }
+ virtual ~OmapIterator() {}
+ private:
+ unsigned count = 0;
+ friend void intrusive_ptr_add_ref(FuturizedStore::OmapIterator* iter);
+ friend void intrusive_ptr_release(FuturizedStore::OmapIterator* iter);
+ };
+ using OmapIteratorRef = boost::intrusive_ptr<OmapIterator>;
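+
+  // Illustrative iteration pattern (sketch; `visit` is a hypothetical
+  // callback and error handling is elided):
+  //
+  //   it->seek_to_first().then([it] {
+  //     return seastar::repeat([it] {
+  //       if (!it->valid()) {
+  //         return seastar::make_ready_future<seastar::stop_iteration>(
+  //           seastar::stop_iteration::yes);
+  //       }
+  //       visit(it->key(), it->value());
+  //       return it->next().then([] { return seastar::stop_iteration::no; });
+  //     });
+  //   });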
+
+ static std::unique_ptr<FuturizedStore> create(const std::string& type,
+ const std::string& data,
+ const ConfigValues& values);
+ FuturizedStore() = default;
+ virtual ~FuturizedStore() = default;
+
+ // no copying
+ explicit FuturizedStore(const FuturizedStore& o) = delete;
+ const FuturizedStore& operator=(const FuturizedStore& o) = delete;
+
+ virtual seastar::future<> start() {
+ return seastar::now();
+ }
+ virtual seastar::future<> stop() = 0;
+ virtual seastar::future<> mount() = 0;
+ virtual seastar::future<> umount() = 0;
+
+ virtual seastar::future<> mkfs(uuid_d new_osd_fsid) = 0;
+ virtual seastar::future<store_statfs_t> stat() const = 0;
+
+ using CollectionRef = boost::intrusive_ptr<FuturizedCollection>;
+ using read_errorator = crimson::errorator<crimson::ct_error::enoent,
+ crimson::ct_error::input_output_error>;
+ virtual read_errorator::future<ceph::bufferlist> read(
+ CollectionRef c,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ uint32_t op_flags = 0) = 0;
+ virtual read_errorator::future<ceph::bufferlist> readv(
+ CollectionRef c,
+ const ghobject_t& oid,
+ interval_set<uint64_t>& m,
+ uint32_t op_flags = 0) = 0;
+
+ using get_attr_errorator = crimson::errorator<
+ crimson::ct_error::enoent,
+ crimson::ct_error::enodata>;
+ virtual get_attr_errorator::future<ceph::bufferptr> get_attr(
+ CollectionRef c,
+ const ghobject_t& oid,
+ std::string_view name) const = 0;
+
+ using get_attrs_ertr = crimson::errorator<
+ crimson::ct_error::enoent>;
+ using attrs_t = std::map<std::string, ceph::bufferptr, std::less<>>;
+ virtual get_attrs_ertr::future<attrs_t> get_attrs(
+ CollectionRef c,
+ const ghobject_t& oid) = 0;
+ virtual seastar::future<struct stat> stat(
+ CollectionRef c,
+ const ghobject_t& oid) = 0;
+
+ using omap_values_t = std::map<std::string, bufferlist, std::less<>>;
+ using omap_keys_t = std::set<std::string>;
+ virtual read_errorator::future<omap_values_t> omap_get_values(
+ CollectionRef c,
+ const ghobject_t& oid,
+ const omap_keys_t& keys) = 0;
+ virtual seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects(
+ CollectionRef c,
+ const ghobject_t& start,
+ const ghobject_t& end,
+ uint64_t limit) const = 0;
+ virtual read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values(
+ CollectionRef c, ///< [in] collection
+ const ghobject_t &oid, ///< [in] oid
+ const std::optional<std::string> &start ///< [in] start, empty for begin
+ ) = 0; ///< @return <done, values> values.empty() iff done
+
+ virtual read_errorator::future<bufferlist> omap_get_header(
+ CollectionRef c,
+ const ghobject_t& oid) = 0;
+
+ virtual seastar::future<CollectionRef> create_new_collection(const coll_t& cid) = 0;
+ virtual seastar::future<CollectionRef> open_collection(const coll_t& cid) = 0;
+ virtual seastar::future<std::vector<coll_t>> list_collections() = 0;
+
+ virtual seastar::future<> do_transaction(CollectionRef ch,
+ ceph::os::Transaction&& txn) = 0;
+ virtual seastar::future<OmapIteratorRef> get_omap_iterator(
+ CollectionRef ch,
+ const ghobject_t& oid) = 0;
+ virtual seastar::future<std::map<uint64_t, uint64_t>> fiemap(
+ CollectionRef ch,
+ const ghobject_t& oid,
+ uint64_t off,
+ uint64_t len) = 0;
+
+ virtual seastar::future<> write_meta(const std::string& key,
+ const std::string& value) = 0;
+ virtual seastar::future<std::tuple<int, std::string>> read_meta(
+ const std::string& key) = 0;
+ virtual uuid_d get_fsid() const = 0;
+ virtual unsigned get_max_attr_name_length() const = 0;
+};
+
+inline void intrusive_ptr_add_ref(FuturizedStore::OmapIterator* iter) {
+ assert(iter);
+ iter->count++;
+}
+
+inline void intrusive_ptr_release(FuturizedStore::OmapIterator* iter) {
+ assert(iter);
+ assert(iter->count > 0);
+ if ((--iter->count) == 0) {
+ delete iter;
+ }
+}
+
+}
diff --git a/src/crimson/os/seastore/CMakeLists.txt b/src/crimson/os/seastore/CMakeLists.txt
new file mode 100644
index 000000000..77f8465cf
--- /dev/null
+++ b/src/crimson/os/seastore/CMakeLists.txt
@@ -0,0 +1,37 @@
+add_library(crimson-seastore STATIC
+ cached_extent.cc
+ seastore_types.cc
+ segment_manager/ephemeral.cc
+ segment_manager/block.cc
+ transaction_manager.cc
+ journal.cc
+ cache.cc
+ lba_manager.cc
+ segment_cleaner.cc
+ lba_manager/btree/btree_lba_manager.cc
+ lba_manager/btree/lba_btree_node_impl.cc
+ lba_manager/btree/btree_range_pin.cc
+ onode.cc
+ onode_manager/simple-fltree/onode_block.cc
+ onode_manager/simple-fltree/onode_delta.cc
+ onode_manager/simple-fltree/onode_node.cc
+ onode_manager/staged-fltree/node.cc
+ onode_manager/staged-fltree/node_extent_manager.cc
+ onode_manager/staged-fltree/node_extent_manager/seastore.cc
+ onode_manager/staged-fltree/node_extent_mutable.cc
+ onode_manager/staged-fltree/node_impl.cc
+ onode_manager/staged-fltree/stages/item_iterator_stage.cc
+ onode_manager/staged-fltree/stages/key_layout.cc
+ onode_manager/staged-fltree/stages/node_stage_layout.cc
+ onode_manager/staged-fltree/stages/node_stage.cc
+ onode_manager/staged-fltree/stages/sub_items_stage.cc
+ onode_manager/staged-fltree/super.cc
+ onode_manager/staged-fltree/tree.cc
+ extentmap_manager.cc
+ extentmap_manager/btree/extentmap_btree_node_impl.cc
+ extentmap_manager/btree/btree_extentmap_manager.cc
+ seastore.cc
+ ../../../test/crimson/seastore/test_block.cc
+ )
+target_link_libraries(crimson-seastore
+ crimson)
diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc
new file mode 100644
index 000000000..6a406c1b8
--- /dev/null
+++ b/src/crimson/os/seastore/cache.cc
@@ -0,0 +1,541 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/os/seastore/cache.h"
+#include "crimson/common/log.h"
+
+// included for get_extent_by_type
+#include "crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.h"
+#include "crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h"
+#include "crimson/os/seastore/onode_manager/simple-fltree/onode_block.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h"
+#include "test/crimson/seastore/test_block.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+}
+
+namespace crimson::os::seastore {
+
+Cache::Cache(SegmentManager &segment_manager) :
+ segment_manager(segment_manager) {}
+
+Cache::~Cache()
+{
+ for (auto &i: extents) {
+ logger().error("~Cache: extent {} still alive", i);
+ }
+ ceph_assert(extents.empty());
+}
+
+Cache::retire_extent_ret Cache::retire_extent_if_cached(
+ Transaction &t, paddr_t addr)
+{
+ if (auto ext = t.write_set.find_offset(addr); ext != t.write_set.end()) {
+ logger().debug("{}: found {} in t.write_set", __func__, addr);
+ t.add_to_retired_set(CachedExtentRef(&*ext));
+ return retire_extent_ertr::now();
+ } else if (auto iter = extents.find_offset(addr);
+ iter != extents.end()) {
+ auto ret = CachedExtentRef(&*iter);
+ return ret->wait_io().then([&t, ret=std::move(ret)]() mutable {
+ t.add_to_retired_set(ret);
+ return retire_extent_ertr::now();
+ });
+ } else {
+ return retire_extent_ertr::now();
+ }
+}
+
+void Cache::add_extent(CachedExtentRef ref)
+{
+ assert(ref->is_valid());
+ extents.insert(*ref);
+
+ if (ref->is_dirty()) {
+ add_to_dirty(ref);
+ } else {
+ ceph_assert(!ref->primary_ref_list_hook.is_linked());
+ }
+ logger().debug("add_extent: {}", *ref);
+}
+
+void Cache::mark_dirty(CachedExtentRef ref)
+{
+ if (ref->is_dirty()) {
+ assert(ref->primary_ref_list_hook.is_linked());
+ return;
+ }
+
+ add_to_dirty(ref);
+ ref->state = CachedExtent::extent_state_t::DIRTY;
+
+ logger().debug("mark_dirty: {}", *ref);
+}
+
+void Cache::add_to_dirty(CachedExtentRef ref)
+{
+ assert(ref->is_valid());
+ assert(!ref->primary_ref_list_hook.is_linked());
+ intrusive_ptr_add_ref(&*ref);
+ dirty.push_back(*ref);
+}
+
+void Cache::remove_extent(CachedExtentRef ref)
+{
+ logger().debug("remove_extent: {}", *ref);
+ assert(ref->is_valid());
+ extents.erase(*ref);
+
+ if (ref->is_dirty()) {
+ ceph_assert(ref->primary_ref_list_hook.is_linked());
+ dirty.erase(dirty.s_iterator_to(*ref));
+ intrusive_ptr_release(&*ref);
+ } else {
+ ceph_assert(!ref->primary_ref_list_hook.is_linked());
+ }
+}
+
+void Cache::replace_extent(CachedExtentRef next, CachedExtentRef prev)
+{
+ assert(next->get_paddr() == prev->get_paddr());
+ assert(next->version == prev->version + 1);
+ extents.replace(*next, *prev);
+
+ if (prev->is_dirty()) {
+ ceph_assert(prev->primary_ref_list_hook.is_linked());
+ auto prev_it = dirty.iterator_to(*prev);
+ dirty.insert(prev_it, *next);
+ dirty.erase(prev_it);
+ intrusive_ptr_release(&*prev);
+ intrusive_ptr_add_ref(&*next);
+ } else {
+ add_to_dirty(next);
+ }
+}
+
+CachedExtentRef Cache::alloc_new_extent_by_type(
+ Transaction &t, ///< [in, out] current transaction
+ extent_types_t type, ///< [in] type tag
+ segment_off_t length ///< [in] length
+)
+{
+ switch (type) {
+ case extent_types_t::ROOT:
+ assert(0 == "ROOT is never directly alloc'd");
+ return CachedExtentRef();
+ case extent_types_t::LADDR_INTERNAL:
+ return alloc_new_extent<lba_manager::btree::LBAInternalNode>(t, length);
+ case extent_types_t::LADDR_LEAF:
+ return alloc_new_extent<lba_manager::btree::LBALeafNode>(t, length);
+ case extent_types_t::ONODE_BLOCK:
+ return alloc_new_extent<OnodeBlock>(t, length);
+ case extent_types_t::EXTMAP_INNER:
+ return alloc_new_extent<extentmap_manager::ExtMapInnerNode>(t, length);
+ case extent_types_t::EXTMAP_LEAF:
+ return alloc_new_extent<extentmap_manager::ExtMapLeafNode>(t, length);
+ case extent_types_t::TEST_BLOCK:
+ return alloc_new_extent<TestBlock>(t, length);
+ case extent_types_t::TEST_BLOCK_PHYSICAL:
+ return alloc_new_extent<TestBlockPhysical>(t, length);
+ case extent_types_t::NONE: {
+ ceph_assert(0 == "NONE is an invalid extent type");
+ return CachedExtentRef();
+ }
+ default:
+ ceph_assert(0 == "impossible");
+ return CachedExtentRef();
+ }
+}
+
+CachedExtentRef Cache::duplicate_for_write(
+ Transaction &t,
+ CachedExtentRef i) {
+ if (i->is_pending())
+ return i;
+
+ auto ret = i->duplicate_for_write();
+ if (ret->get_type() == extent_types_t::ROOT) {
+ // root must be loaded before mutate
+ assert(t.root == i);
+ t.root = ret->cast<RootBlock>();
+ } else {
+ ret->last_committed_crc = i->last_committed_crc;
+ ret->prior_instance = i;
+ t.add_mutated_extent(ret);
+ }
+
+ ret->version++;
+ ret->state = CachedExtent::extent_state_t::MUTATION_PENDING;
+ logger().debug("Cache::duplicate_for_write: {} -> {}", *i, *ret);
+ return ret;
+}
+
+std::optional<record_t> Cache::try_construct_record(Transaction &t)
+{
+ // First, validate read set
+ for (auto &i: t.read_set) {
+ if (i->state == CachedExtent::extent_state_t::INVALID)
+ return std::nullopt;
+ }
+
+ record_t record;
+
+ t.write_set.clear();
+
+ // Add new copy of mutated blocks, set_io_wait to block until written
+ record.deltas.reserve(t.mutated_block_list.size());
+ for (auto &i: t.mutated_block_list) {
+ if (!i->is_valid()) {
+ logger().debug("try_construct_record: ignoring invalid {}", *i);
+ continue;
+ }
+ logger().debug("try_construct_record: mutating {}", *i);
+
+ assert(i->prior_instance);
+ replace_extent(i, i->prior_instance);
+
+ i->prepare_write();
+ i->set_io_wait();
+
+ assert(i->get_version() > 0);
+ auto final_crc = i->get_crc32c();
+ record.deltas.push_back(
+ delta_info_t{
+ i->get_type(),
+ i->get_paddr(),
+ (i->is_logical()
+ ? i->cast<LogicalCachedExtent>()->get_laddr()
+ : L_ADDR_NULL),
+ i->last_committed_crc,
+ final_crc,
+ (segment_off_t)i->get_length(),
+ i->get_version() - 1,
+ i->get_delta()
+ });
+ i->last_committed_crc = final_crc;
+ }
+
+ if (t.root) {
+ logger().debug(
+ "{}: writing out root delta for {}",
+ __func__,
+ *t.root);
+ record.deltas.push_back(
+ delta_info_t{
+ extent_types_t::ROOT,
+ paddr_t{},
+ L_ADDR_NULL,
+ 0,
+ 0,
+ 0,
+ t.root->get_version() - 1,
+ t.root->get_delta()
+ });
+ }
+
+ // Transaction is now a go, set up in-memory cache state
+ // invalidate now invalid blocks
+ for (auto &i: t.retired_set) {
+ logger().debug("try_construct_record: retiring {}", *i);
+ ceph_assert(i->is_valid());
+ remove_extent(i);
+ i->state = CachedExtent::extent_state_t::INVALID;
+ }
+
+ record.extents.reserve(t.fresh_block_list.size());
+ for (auto &i: t.fresh_block_list) {
+ logger().debug("try_construct_record: fresh block {}", *i);
+ bufferlist bl;
+ i->prepare_write();
+ bl.append(i->get_bptr());
+ if (i->get_type() == extent_types_t::ROOT) {
+ assert(0 == "ROOT never gets written as a fresh block");
+ }
+
+ assert(bl.length() == i->get_length());
+ record.extents.push_back(extent_t{
+ i->get_type(),
+ i->is_logical()
+ ? i->cast<LogicalCachedExtent>()->get_laddr()
+ : L_ADDR_NULL,
+ std::move(bl)
+ });
+ }
+
+ return std::make_optional<record_t>(std::move(record));
+}
+
+void Cache::complete_commit(
+ Transaction &t,
+ paddr_t final_block_start,
+ journal_seq_t seq,
+ SegmentCleaner *cleaner)
+{
+ if (t.root) {
+ remove_extent(root);
+ root = t.root;
+ root->state = CachedExtent::extent_state_t::DIRTY;
+ root->on_delta_write(final_block_start);
+ root->dirty_from = seq;
+ add_extent(root);
+ logger().debug("complete_commit: new root {}", *t.root);
+ }
+
+ for (auto &i: t.fresh_block_list) {
+ i->set_paddr(final_block_start.add_relative(i->get_paddr()));
+ i->last_committed_crc = i->get_crc32c();
+ i->on_initial_write();
+
+ if (!i->is_valid()) {
+ logger().debug("complete_commit: invalid {}", *i);
+ continue;
+ }
+
+ i->state = CachedExtent::extent_state_t::CLEAN;
+ logger().debug("complete_commit: fresh {}", *i);
+ add_extent(i);
+ if (cleaner) {
+ cleaner->mark_space_used(
+ i->get_paddr(),
+ i->get_length());
+ }
+ }
+
+ // Add new copy of mutated blocks, set_io_wait to block until written
+ for (auto &i: t.mutated_block_list) {
+ logger().debug("complete_commit: mutated {}", *i);
+ assert(i->prior_instance);
+ i->on_delta_write(final_block_start);
+ i->prior_instance = CachedExtentRef();
+ if (!i->is_valid()) {
+ logger().debug("complete_commit: not dirtying invalid {}", *i);
+ continue;
+ }
+ i->state = CachedExtent::extent_state_t::DIRTY;
+ if (i->version == 1) {
+ i->dirty_from = seq;
+ }
+ }
+
+ if (cleaner) {
+ for (auto &i: t.retired_set) {
+ cleaner->mark_space_free(
+ i->get_paddr(),
+ i->get_length());
+ }
+ }
+
+ for (auto &i: t.mutated_block_list) {
+ i->complete_io();
+ }
+}
+
+void Cache::init() {
+ if (root) {
+ // initial creation will do mkfs followed by mount each of which calls init
+ remove_extent(root);
+ root = nullptr;
+ }
+ root = new RootBlock();
+ root->state = CachedExtent::extent_state_t::DIRTY;
+ add_extent(root);
+}
+
+Cache::mkfs_ertr::future<> Cache::mkfs(Transaction &t)
+{
+ return get_root(t).safe_then([this, &t](auto croot) {
+ duplicate_for_write(t, croot);
+ return mkfs_ertr::now();
+ });
+}
+
+Cache::close_ertr::future<> Cache::close()
+{
+ root.reset();
+ for (auto i = dirty.begin(); i != dirty.end(); ) {
+ auto ptr = &*i;
+ dirty.erase(i++);
+ intrusive_ptr_release(ptr);
+ }
+ return close_ertr::now();
+}
+
+Cache::replay_delta_ret
+Cache::replay_delta(
+ journal_seq_t journal_seq,
+ paddr_t record_base,
+ const delta_info_t &delta)
+{
+ if (delta.type == extent_types_t::ROOT) {
+ logger().debug("replay_delta: found root delta");
+ root->apply_delta_and_adjust_crc(record_base, delta.bl);
+ root->dirty_from = journal_seq;
+ return replay_delta_ertr::now();
+ } else {
+ auto get_extent_if_cached = [this](paddr_t addr)
+ -> replay_delta_ertr::future<CachedExtentRef> {
+ auto retiter = extents.find_offset(addr);
+ if (retiter != extents.end()) {
+ return replay_delta_ertr::make_ready_future<CachedExtentRef>(&*retiter);
+ } else {
+ return replay_delta_ertr::make_ready_future<CachedExtentRef>();
+ }
+ };
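+    // a delta with pversion == 0 is the first change against its extent, so
+    // the extent is fetched (and cached) by type; for later deltas we only
+    // look in the cache -- if the extent is not there the delta is obsolete
+    // and is skipped (see the !extent branch below)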
+ auto extent_fut = delta.pversion == 0 ?
+ get_extent_by_type(
+ delta.type,
+ delta.paddr,
+ delta.laddr,
+ delta.length) :
+ get_extent_if_cached(
+ delta.paddr);
+ return extent_fut.safe_then([=, &delta](auto extent) {
+ if (!extent) {
+ assert(delta.pversion > 0);
+ logger().debug(
+ "replay_delta: replaying {}, extent not present so delta is obsolete",
+ delta);
+ return;
+ }
+
+ logger().debug(
+ "replay_delta: replaying {} on {}",
+ *extent,
+ delta);
+
+ assert(extent->version == delta.pversion);
+
+ assert(extent->last_committed_crc == delta.prev_crc);
+ extent->apply_delta_and_adjust_crc(record_base, delta.bl);
+ assert(extent->last_committed_crc == delta.final_crc);
+
+ if (extent->version == 0) {
+ extent->dirty_from = journal_seq;
+ }
+ extent->version++;
+ mark_dirty(extent);
+ });
+ }
+}
+
+Cache::get_next_dirty_extents_ret Cache::get_next_dirty_extents(
+ journal_seq_t seq)
+{
+ std::vector<CachedExtentRef> ret;
+ for (auto i = dirty.begin(); i != dirty.end(); ++i) {
+ if (i->dirty_from < seq) {
+ assert(ret.empty() || ret.back()->dirty_from <= i->dirty_from);
+ ret.push_back(&*i);
+ } else {
+ break;
+ }
+ }
+ return seastar::do_with(
+ std::move(ret),
+ [](auto &ret) {
+ return seastar::do_for_each(
+ ret,
+ [](auto &ext) {
+ logger().debug(
+ "get_next_dirty_extents: waiting on {}",
+ *ext);
+ return ext->wait_io();
+ }).then([&ret]() mutable {
+ return seastar::make_ready_future<std::vector<CachedExtentRef>>(
+ std::move(ret));
+ });
+ });
+}
+
+Cache::get_root_ret Cache::get_root(Transaction &t)
+{
+ if (t.root) {
+ return get_root_ret(
+ get_root_ertr::ready_future_marker{},
+ t.root);
+ } else {
+ auto ret = root;
+ return ret->wait_io().then([ret, &t] {
+ t.root = ret;
+ return get_root_ret(
+ get_root_ertr::ready_future_marker{},
+ ret);
+ });
+ }
+}
+
+using StagedOnodeBlock = crimson::os::seastore::onode::SeastoreNodeExtent;
+
+Cache::get_extent_ertr::future<CachedExtentRef> Cache::get_extent_by_type(
+ extent_types_t type,
+ paddr_t offset,
+ laddr_t laddr,
+ segment_off_t length)
+{
+ return [=] {
+ switch (type) {
+ case extent_types_t::ROOT:
+ assert(0 == "ROOT is never directly read");
+ return get_extent_ertr::make_ready_future<CachedExtentRef>();
+ case extent_types_t::LADDR_INTERNAL:
+ return get_extent<lba_manager::btree::LBAInternalNode>(offset, length
+ ).safe_then([](auto extent) {
+ return CachedExtentRef(extent.detach(), false /* add_ref */);
+ });
+ case extent_types_t::LADDR_LEAF:
+ return get_extent<lba_manager::btree::LBALeafNode>(offset, length
+ ).safe_then([](auto extent) {
+ return CachedExtentRef(extent.detach(), false /* add_ref */);
+ });
+ case extent_types_t::EXTMAP_INNER:
+ return get_extent<extentmap_manager::ExtMapInnerNode>(offset, length
+ ).safe_then([](auto extent) {
+ return CachedExtentRef(extent.detach(), false /* add_ref */);
+ });
+ case extent_types_t::EXTMAP_LEAF:
+ return get_extent<extentmap_manager::ExtMapLeafNode>(offset, length
+ ).safe_then([](auto extent) {
+ return CachedExtentRef(extent.detach(), false /* add_ref */);
+ });
+ case extent_types_t::ONODE_BLOCK:
+ return get_extent<OnodeBlock>(offset, length
+ ).safe_then([](auto extent) {
+ return CachedExtentRef(extent.detach(), false /* add_ref */);
+ });
+ case extent_types_t::ONODE_BLOCK_STAGED:
+ return get_extent<StagedOnodeBlock>(offset, length
+ ).safe_then([](auto extent) {
+ return CachedExtentRef(extent.detach(), false /* add_ref */);
+ });
+ case extent_types_t::TEST_BLOCK:
+ return get_extent<TestBlock>(offset, length
+ ).safe_then([](auto extent) {
+ return CachedExtentRef(extent.detach(), false /* add_ref */);
+ });
+ case extent_types_t::TEST_BLOCK_PHYSICAL:
+ return get_extent<TestBlockPhysical>(offset, length
+ ).safe_then([](auto extent) {
+ return CachedExtentRef(extent.detach(), false /* add_ref */);
+ });
+ case extent_types_t::NONE: {
+ ceph_assert(0 == "NONE is an invalid extent type");
+ return get_extent_ertr::make_ready_future<CachedExtentRef>();
+ }
+ default:
+ ceph_assert(0 == "impossible");
+ return get_extent_ertr::make_ready_future<CachedExtentRef>();
+ }
+ }().safe_then([laddr](CachedExtentRef e) {
+ assert(e->is_logical() == (laddr != L_ADDR_NULL));
+ if (e->is_logical()) {
+ e->cast<LogicalCachedExtent>()->set_laddr(laddr);
+ }
+ return get_extent_ertr::make_ready_future<CachedExtentRef>(e);
+ });
+}
+
+}
diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h
new file mode 100644
index 000000000..624272162
--- /dev/null
+++ b/src/crimson/os/seastore/cache.h
@@ -0,0 +1,516 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+
+#include "seastar/core/shared_future.hh"
+
+#include "include/buffer.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/transaction.h"
+#include "crimson/os/seastore/segment_manager.h"
+#include "crimson/common/errorator.h"
+#include "crimson/os/seastore/cached_extent.h"
+#include "crimson/os/seastore/root_block.h"
+#include "crimson/os/seastore/segment_cleaner.h"
+
+namespace crimson::os::seastore {
+
+/**
+ * Cache
+ *
+ * This component is responsible for buffer management, including
+ * transaction lifecycle.
+ *
+ * Seastore transactions are expressed as an atomic combination of
+ * 1) newly written blocks
+ * 2) logical mutations to existing physical blocks
+ *
+ * See record_t
+ *
+ * As such, any transaction has 3 components:
+ * 1) read_set: references to extents read during the transaction
+ * See Transaction::read_set
+ * 2) write_set: references to extents to be written as:
+ * a) new physical blocks, see Transaction::fresh_block_list
+ * b) mutations to existing physical blocks,
+ * see Transaction::mutated_block_list
+ * 3) retired_set: extent refs to be retired either due to 2b or
+ * due to releasing the extent generally.
+ *
+ * In the case of 2b, the CachedExtent will have been copied into
+ * a fresh CachedExtentRef such that the source extent ref is present
+ * in the read set and the newly allocated extent is present in the
+ * write_set.
+ *
+ * A transaction has 3 phases:
+ * 1) construction: user calls Cache::get_transaction() and populates
+ * the returned transaction by calling Cache methods
+ * 2) submission: user calls Cache::try_construct_record(). If
+ * successful, the user may submit the returned record to the
+ * journal.
+ * 3) completion: once the transaction is durable, the user must call
+ * Cache::complete_commit() with the block offset to complete
+ * the transaction.
+ *
+ * Internally, in phase 1, the fields in Transaction are filled in.
+ * - reads may block if the referenced extent is being written
+ * - once a read obtains a particular CachedExtentRef for a paddr_t,
+ * it'll always get the same one until overwritten
+ * - once a paddr_t is overwritten or written, subsequent reads of
+ * that addr will get the new ref
+ *
+ * In phase 2, if all extents in the read set are valid (not expired),
+ * we can commit (otherwise, we fail and the user must retry).
+ * - Expire all extents in the retired_set (they must all be valid)
+ * - Remove all extents in the retired_set from Cache::extents
+ * - Mark all extents in the write_set wait_io(), add promises to
+ * transaction
+ * - Merge Transaction::write_set into Cache::extents
+ *
+ * After phase 2, the user will submit the record to the journal.
+ * Once complete, we perform phase 3:
+ * - For each CachedExtent in block_list, call
+ * CachedExtent::complete_initial_write(paddr_t) with the block's
+ * final offset (inferred from the extent's position in the block_list
+ * and extent lengths).
+ * - For each block in mutation_list, call
+ * CachedExtent::delta_written(paddr_t) with the address of the start
+ * of the record
+ * - Complete all promises with the final record start paddr_t
+ */
+class Cache {
+public:
+ Cache(SegmentManager &segment_manager);
+ ~Cache();
+
+ /**
+ * drop_from_cache
+ *
+ * Drop extent from cache. Intended for use when
+ * ref refers to a logically dead extent as during
+ * replay.
+ */
+ void drop_from_cache(CachedExtentRef ref) {
+ remove_extent(ref);
+ }
+
+ /// Declare ref retired in t
+ void retire_extent(Transaction &t, CachedExtentRef ref) {
+ t.add_to_retired_set(ref);
+ }
+
+ /// Declare paddr retired in t, noop if not cached
+ using retire_extent_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using retire_extent_ret = retire_extent_ertr::future<>;
+ retire_extent_ret retire_extent_if_cached(
+ Transaction &t, paddr_t addr);
+
+ /**
+ * get_root
+ *
+ * returns ref to current root or t.root if modified in t
+ */
+ using get_root_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using get_root_ret = get_root_ertr::future<RootBlockRef>;
+ get_root_ret get_root(Transaction &t);
+
+ /**
+ * get_root_fast
+ *
+ * returns t.root, assuming it is already present/read in t
+ */
+ RootBlockRef get_root_fast(Transaction &t) {
+ assert(t.root);
+ return t.root;
+ }
+
+ /**
+ * get_extent
+ *
+ * returns ref to extent at offset~length of type T either from
+ * - extent_set if already in cache
+ * - disk
+ */
+ using get_extent_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ template <typename T>
+ get_extent_ertr::future<TCachedExtentRef<T>> get_extent(
+ paddr_t offset, ///< [in] starting addr
+ segment_off_t length ///< [in] length
+ ) {
+ if (auto iter = extents.find_offset(offset);
+ iter != extents.end()) {
+ auto ret = TCachedExtentRef<T>(static_cast<T*>(&*iter));
+ return ret->wait_io().then([ret=std::move(ret)]() mutable {
+ return get_extent_ertr::make_ready_future<TCachedExtentRef<T>>(
+ std::move(ret));
+ });
+ } else {
+ auto ref = CachedExtent::make_cached_extent_ref<T>(
+ alloc_cache_buf(length));
+ ref->set_io_wait();
+ ref->set_paddr(offset);
+ ref->state = CachedExtent::extent_state_t::CLEAN;
+
+ return segment_manager.read(
+ offset,
+ length,
+ ref->get_bptr()).safe_then(
+ [this, ref=std::move(ref)]() mutable {
+ /* TODO: crc should be checked against LBA manager */
+ ref->last_committed_crc = ref->get_crc32c();
+
+ ref->on_clean_read();
+ ref->complete_io();
+ add_extent(ref);
+ return get_extent_ertr::make_ready_future<TCachedExtentRef<T>>(
+ std::move(ref));
+ },
+ get_extent_ertr::pass_further{},
+ crimson::ct_error::discard_all{});
+ }
+ }
+
+ /**
+ * get_extent_if_cached
+ *
+ * Returns extent at offset if in cache
+ */
+ Transaction::get_extent_ret get_extent_if_cached(
+ Transaction &t,
+ paddr_t offset,
+ CachedExtentRef *out) {
+ auto result = t.get_extent(offset, out);
+ if (result != Transaction::get_extent_ret::ABSENT) {
+ return result;
+ } else if (auto iter = extents.find_offset(offset);
+ iter != extents.end()) {
+ if (out)
+ *out = &*iter;
+ return Transaction::get_extent_ret::PRESENT;
+ } else {
+ return Transaction::get_extent_ret::ABSENT;
+ }
+ }
+
+ /**
+ * get_extent
+ *
+ * returns ref to extent at offset~length of type T either from
+ * - t if modified by t
+ * - extent_set if already in cache
+ * - disk
+ *
+ * t *must not* have retired offset
+ */
+ template <typename T>
+ get_extent_ertr::future<TCachedExtentRef<T>> get_extent(
+ Transaction &t, ///< [in,out] current transaction
+ paddr_t offset, ///< [in] starting addr
+ segment_off_t length ///< [in] length
+ ) {
+ CachedExtentRef ret;
+ auto result = t.get_extent(offset, &ret);
+ if (result != Transaction::get_extent_ret::ABSENT) {
+ assert(result != Transaction::get_extent_ret::RETIRED);
+ return get_extent_ertr::make_ready_future<TCachedExtentRef<T>>(
+ ret->cast<T>());
+ } else {
+ return get_extent<T>(offset, length).safe_then(
+ [&t](auto ref) mutable {
+ t.add_to_read_set(ref);
+ return get_extent_ertr::make_ready_future<TCachedExtentRef<T>>(
+ std::move(ref));
+ });
+ }
+ }
+
+ /**
+ * get_extent_by_type
+ *
+ * Based on type, instantiate the correct concrete type
+ * and read in the extent at location offset~length.
+ */
+ get_extent_ertr::future<CachedExtentRef> get_extent_by_type(
+ extent_types_t type, ///< [in] type tag
+ paddr_t offset, ///< [in] starting addr
+ laddr_t laddr, ///< [in] logical address if logical
+ segment_off_t length ///< [in] length
+ );
+
+ get_extent_ertr::future<CachedExtentRef> get_extent_by_type(
+ Transaction &t,
+ extent_types_t type,
+ paddr_t offset,
+ laddr_t laddr,
+ segment_off_t length) {
+ CachedExtentRef ret;
+ auto status = get_extent_if_cached(t, offset, &ret);
+ if (status == Transaction::get_extent_ret::RETIRED) {
+ return get_extent_ertr::make_ready_future<CachedExtentRef>();
+ } else if (status == Transaction::get_extent_ret::PRESENT) {
+ return get_extent_ertr::make_ready_future<CachedExtentRef>(ret);
+ } else {
+ return get_extent_by_type(type, offset, laddr, length
+ ).safe_then([=, &t](CachedExtentRef ret) {
+ t.add_to_read_set(ret);
+ return get_extent_ertr::make_ready_future<CachedExtentRef>(
+ std::move(ret));
+ });
+ }
+ }
+
+ /**
+ * get_extents
+ *
+ * returns refs to extents in extents from:
+ * - t if modified by t
+ * - extent_set if already in cache
+ * - disk
+ */
+ template<typename T>
+ get_extent_ertr::future<t_pextent_list_t<T>> get_extents(
+ Transaction &t, ///< [in, out] current transaction
+ paddr_list_t &&extents ///< [in] extent list for lookup
+ ) {
+ auto retref = std::make_unique<t_pextent_list_t<T>>();
+ auto &ret = *retref;
+ auto ext = std::make_unique<paddr_list_t>(std::move(extents));
+ return crimson::do_for_each(
+ ext->begin(),
+ ext->end(),
+ [this, &t, &ret](auto &p) {
+ auto &[offset, len] = p;
+ return get_extent(t, offset, len).safe_then([&ret](auto cext) {
+ ret.push_back(std::move(cext));
+ });
+ }).safe_then([retref=std::move(retref), ext=std::move(ext)]() mutable {
+ return get_extent_ertr::make_ready_future<t_pextent_list_t<T>>(
+ std::move(*retref));
+ });
+ }
+
+ /**
+ * alloc_new_extent
+ *
+ * Allocates a fresh extent. addr will be relative until commit.
+ */
+ template <typename T>
+ TCachedExtentRef<T> alloc_new_extent(
+ Transaction &t, ///< [in, out] current transaction
+ segment_off_t length ///< [in] length
+ ) {
+ auto ret = CachedExtent::make_cached_extent_ref<T>(
+ alloc_cache_buf(length));
+ t.add_fresh_extent(ret);
+ ret->state = CachedExtent::extent_state_t::INITIAL_WRITE_PENDING;
+ return ret;
+ }
+
+ /**
+ * alloc_new_extent
+ *
+ * Allocates a fresh extent. addr will be relative until commit.
+ */
+ CachedExtentRef alloc_new_extent_by_type(
+ Transaction &t, ///< [in, out] current transaction
+ extent_types_t type, ///< [in] type tag
+ segment_off_t length ///< [in] length
+ );
+
+ /**
+ * Allocates mutable buffer from extent_set on offset~len
+ *
+ * TODO: Note, currently all implementations literally copy the
+ * buffer. This needn't be true, CachedExtent implementations could
+ * choose to refer to the same buffer unmodified until commit and just
+ * buffer the mutations in an ancillary data structure.
+ *
+ * @param current transaction
+ * @param extent to duplicate
+ * @return mutable extent
+ */
+ CachedExtentRef duplicate_for_write(
+ Transaction &t, ///< [in, out] current transaction
+ CachedExtentRef i ///< [in] ref to existing extent
+ );
+
+ /**
+ * try_construct_record
+ *
+ * First checks for conflicts. If a racing write has mutated/retired
+ * an extent mutated by this transaction, nullopt will be returned.
+ *
+ * Otherwise, a record will be returned valid for use with Journal.
+ */
+ std::optional<record_t> try_construct_record(
+ Transaction &t ///< [in, out] current transaction
+ );
+
+ /**
+ * complete_commit
+ *
+ * Must be called upon completion of write. Releases blocks on mutating
+ * extents, fills in addresses, and calls relevant callbacks on fresh
+ * and mutated extents.
+ */
+ void complete_commit(
+ Transaction &t, ///< [in, out] current transaction
+ paddr_t final_block_start, ///< [in] offset of initial block
+ journal_seq_t seq, ///< [in] journal commit seq
+ SegmentCleaner *cleaner=nullptr ///< [out] optional segment stat listener
+ );
+
+ /**
+ * init
+ */
+ void init();
+
+ /**
+ * mkfs
+ *
+ * Alloc initial root node and add to t. The intention is for other
+ * components to use t to adjust the resulting root ref prior to commit.
+ */
+ using mkfs_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ mkfs_ertr::future<> mkfs(Transaction &t);
+
+ /**
+ * close
+ *
+ * TODO: should flush dirty blocks
+ */
+ using close_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ close_ertr::future<> close();
+
+ /**
+ * replay_delta
+ *
+ * Intended for use in Journal::delta. For each delta, should decode delta,
+ * read relevant block from disk or cache (using correct type), and call
+ * CachedExtent::apply_delta marking the extent dirty.
+ */
+ using replay_delta_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using replay_delta_ret = replay_delta_ertr::future<>;
+ replay_delta_ret replay_delta(
+ journal_seq_t seq,
+ paddr_t record_block_base,
+ const delta_info_t &delta);
+
+ /**
+ * init_cached_extents
+ *
+ * Calls passed lambda for each dirty cached block. Intended for use
+ * after replay to allow lba_manager (or whatever component) to read in any ancestor
+ * blocks.
+ */
+ using init_cached_extents_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using init_cached_extents_ret = init_cached_extents_ertr::future<>;
+ template <typename F>
+ init_cached_extents_ret init_cached_extents(
+ Transaction &t,
+ F &&f)
+ {
+ std::vector<CachedExtentRef> dirty;
+ for (auto &e : extents) {
+ dirty.push_back(CachedExtentRef(&e));
+ }
+ return seastar::do_with(
+ std::forward<F>(f),
+ std::move(dirty),
+ [&t](auto &f, auto &refs) mutable {
+ return crimson::do_for_each(
+ refs,
+ [&t, &f](auto &e) { return f(t, e); });
+ });
+ }
+
+ /**
+ * update_extent_from_transaction
+ *
+ * Updates passed extent based on t. If extent has been retired,
+ * a null result will be returned.
+ */
+ CachedExtentRef update_extent_from_transaction(
+ Transaction &t,
+ CachedExtentRef extent) {
+ if (extent->get_type() == extent_types_t::ROOT) {
+ if (t.root) {
+ return t.root;
+ } else {
+ return extent;
+ }
+ } else {
+ auto result = t.get_extent(extent->get_paddr(), &extent);
+ if (result == Transaction::get_extent_ret::RETIRED) {
+ return CachedExtentRef();
+ } else {
+ return extent;
+ }
+ }
+ }
+
+ /**
+ * print
+ *
+ * Dump summary of contents (TODO)
+ */
+ std::ostream &print(
+ std::ostream &out) const {
+ return out;
+ }
+
+ /// returns extents with dirty_from < seq
+ using get_next_dirty_extents_ertr = crimson::errorator<>;
+ using get_next_dirty_extents_ret = get_next_dirty_extents_ertr::future<
+ std::vector<CachedExtentRef>>;
+ get_next_dirty_extents_ret get_next_dirty_extents(
+ journal_seq_t seq);
+
+private:
+ SegmentManager &segment_manager; ///< ref to segment_manager
+ RootBlockRef root; ///< ref to current root
+ ExtentIndex extents; ///< set of live extents
+
+ /**
+ * dirty
+ *
+ * holds refs to dirty extents. Ordered by CachedExtent::dirty_from.
+ */
+ CachedExtent::list dirty;
+
+ /// alloc buffer for cached extent
+ bufferptr alloc_cache_buf(size_t size) {
+ // TODO: memory pooling etc
+ auto bp = ceph::bufferptr(
+ buffer::create_page_aligned(size));
+ bp.zero();
+ return bp;
+ }
+
+ /// Add extent to extents handling dirty and refcounting
+ void add_extent(CachedExtentRef ref);
+
+ /// Mark existing extent ref dirty -- mainly for replay
+ void mark_dirty(CachedExtentRef ref);
+
+ /// Add dirty extent to dirty list
+ void add_to_dirty(CachedExtentRef ref);
+
+ /// Remove extent from extents handling dirty and refcounting
+ void remove_extent(CachedExtentRef ref);
+
+ /// Replace prev with next
+ void replace_extent(CachedExtentRef next, CachedExtentRef prev);
+};
+
+}
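To connect the phases described in the class comment to the methods declared above, here is a rough sketch of one commit cycle. The journal submission step is elided behind an assumed `journal.submit_record()` call, and `cache`, `journal`, `t`, and `retry` are assumed to be provided by the surrounding code (e.g. a TransactionManager); none of this is part of the patch:

// Phase 1: populate t via get_extent / alloc_new_extent / duplicate_for_write,
// then:
auto record = cache.try_construct_record(t);    // phase 2: conflict check
if (!record) {
  // a racing transaction invalidated something this transaction read
  return retry(t);                              // assumed retry path
}
return journal.submit_record(std::move(*record) // assumed Journal API
).safe_then([&](auto submit_result) {
  auto [start_paddr, seq] = submit_result;      // result shape assumed
  cache.complete_commit(t, start_paddr, seq);   // phase 3
});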
diff --git a/src/crimson/os/seastore/cached_extent.cc b/src/crimson/os/seastore/cached_extent.cc
new file mode 100644
index 000000000..7019b9fb8
--- /dev/null
+++ b/src/crimson/os/seastore/cached_extent.cc
@@ -0,0 +1,96 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/os/seastore/cached_extent.h"
+
+#include "crimson/common/log.h"
+
+namespace {
+ [[maybe_unused]] seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+}
+
+namespace crimson::os::seastore {
+
+#ifdef DEBUG_CACHED_EXTENT_REF
+
+void intrusive_ptr_add_ref(CachedExtent *ptr)
+{
+ intrusive_ptr_add_ref(
+ static_cast<boost::intrusive_ref_counter<
+ CachedExtent,
+ boost::thread_unsafe_counter>*>(ptr));
+ logger().debug("intrusive_ptr_add_ref: {}", *ptr);
+}
+
+void intrusive_ptr_release(CachedExtent *ptr)
+{
+ logger().debug("intrusive_ptr_release: {}", *ptr);
+ intrusive_ptr_release(
+ static_cast<boost::intrusive_ref_counter<
+ CachedExtent,
+ boost::thread_unsafe_counter>*>(ptr));
+}
+
+#endif
+
+std::ostream &operator<<(std::ostream &out, CachedExtent::extent_state_t state)
+{
+ switch (state) {
+ case CachedExtent::extent_state_t::INITIAL_WRITE_PENDING:
+ return out << "INITIAL_WRITE_PENDING";
+ case CachedExtent::extent_state_t::MUTATION_PENDING:
+ return out << "MUTATION_PENDING";
+ case CachedExtent::extent_state_t::CLEAN:
+ return out << "CLEAN";
+ case CachedExtent::extent_state_t::DIRTY:
+ return out << "DIRTY";
+ case CachedExtent::extent_state_t::INVALID:
+ return out << "INVALID";
+ default:
+ return out << "UNKNOWN";
+ }
+}
+
+std::ostream &operator<<(std::ostream &out, const CachedExtent &ext)
+{
+ return ext.print(out);
+}
+
+CachedExtent::~CachedExtent()
+{
+ if (parent_index) {
+ parent_index->erase(*this);
+ }
+}
+
+std::ostream &LogicalCachedExtent::print_detail(std::ostream &out) const
+{
+ out << ", laddr=" << laddr;
+ if (pin) {
+ out << ", pin=" << *pin;
+ } else {
+ out << ", pin=empty";
+ }
+ return print_detail_l(out);
+}
+
+std::ostream &operator<<(std::ostream &out, const LBAPin &rhs)
+{
+ return out << "LBAPin(" << rhs.get_laddr() << "~" << rhs.get_length()
+ << "->" << rhs.get_paddr();
+}
+
+std::ostream &operator<<(std::ostream &out, const lba_pin_list_t &rhs)
+{
+ bool first = true;
+ out << '[';
+ for (auto &i: rhs) {
+ out << (first ? "" : ",") << *i;
+ first = false;
+ }
+ return out << ']';
+}
+
+}
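The add_ref/release overloads at the top of this file only exist when the debug define (left commented out in cached_extent.h below) is enabled. A minimal sketch of turning it on:

// In cached_extent.h (sketch): uncommenting the define routes every
// intrusive_ptr add_ref/release on CachedExtent through the logging
// overloads above.
#define DEBUG_CACHED_EXTENT_REF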
diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h
new file mode 100644
index 000000000..974988489
--- /dev/null
+++ b/src/crimson/os/seastore/cached_extent.h
@@ -0,0 +1,659 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+
+#include <boost/intrusive/list.hpp>
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include "seastar/core/shared_future.hh"
+
+#include "include/buffer.h"
+#include "crimson/common/errorator.h"
+#include "crimson/os/seastore/seastore_types.h"
+
+namespace crimson::os::seastore {
+
+class CachedExtent;
+using CachedExtentRef = boost::intrusive_ptr<CachedExtent>;
+
+// #define DEBUG_CACHED_EXTENT_REF
+#ifdef DEBUG_CACHED_EXTENT_REF
+
+void intrusive_ptr_add_ref(CachedExtent *);
+void intrusive_ptr_release(CachedExtent *);
+
+#endif
+
+template <typename T>
+using TCachedExtentRef = boost::intrusive_ptr<T>;
+
+/**
+ * CachedExtent
+ */
+namespace onode {
+ class DummyNodeExtent;
+ class TestReplayExtent;
+}
+class ExtentIndex;
+class CachedExtent : public boost::intrusive_ref_counter<
+ CachedExtent, boost::thread_unsafe_counter> {
+ enum class extent_state_t : uint8_t {
+ INITIAL_WRITE_PENDING, // In Transaction::write_set and fresh_block_list
+ MUTATION_PENDING, // In Transaction::write_set and mutated_block_list
+ CLEAN, // In Cache::extent_index, Transaction::read_set
+ // during write, contents match disk, version == 0
+ DIRTY, // Same as CLEAN, but contents do not match disk,
+ // version > 0
+ INVALID // Part of no ExtentIndex set
+ } state = extent_state_t::INVALID;
+ friend std::ostream &operator<<(std::ostream &, extent_state_t);
+ // allow a dummy extent to pretend it is at a specific state
+ friend class onode::DummyNodeExtent;
+ friend class onode::TestReplayExtent;
+
+ uint32_t last_committed_crc = 0;
+
+ // Points at current version while in state MUTATION_PENDING
+ CachedExtentRef prior_instance;
+
+ /**
+ * dirty_from
+ *
+ * When dirty, indicates the oldest journal entry which mutates
+ * this extent.
+ */
+ journal_seq_t dirty_from;
+
+public:
+ /**
+ * duplicate_for_write
+ *
+ * Implementation should return a fresh CachedExtentRef
+ * which represents a copy of *this until on_delta_write()
+ * is complete, at which point the user may assume *this
+ * will be in state INVALID. As such, the implementation
+ * may involve a copy of get_bptr(), or an ancillary
+ * structure which defers updating the actual buffer until
+ * on_delta_write().
+ */
+ virtual CachedExtentRef duplicate_for_write() = 0;
+
+ /**
+ * prepare_write
+ *
+ * Called prior to reading buffer.
+ * Implementation may use this callback to fully write out
+ * updates to the buffer.
+ */
+ virtual void prepare_write() {}
+
+ /**
+ * on_initial_write
+ *
+ * Called after commit of extent. State will be CLEAN.
+ * Implementation may use this call to fixup the buffer
+ * with the newly available absolute get_paddr().
+ */
+ virtual void on_initial_write() {}
+
+ /**
+ * on_clean_read
+ *
+ * Called after read of initially written extent.
+ * State will be CLEAN. Implementation may use this
+ * call to fixup the buffer with the newly available
+ * absolute get_paddr().
+ */
+ virtual void on_clean_read() {}
+
+ /**
+ * on_delta_write
+ *
+ * Called after commit of delta. State will be DIRTY.
+ * Implementation may use this call to fixup any relative
+ * references in the buffer with the passed
+ * record_block_offset record location.
+ */
+ virtual void on_delta_write(paddr_t record_block_offset) {}
+
+ /**
+ * get_type
+ *
+ * Returns concrete type.
+ */
+ virtual extent_types_t get_type() const = 0;
+
+ virtual bool is_logical() const {
+ return false;
+ }
+
+ friend std::ostream &operator<<(std::ostream &, extent_state_t);
+ virtual std::ostream &print_detail(std::ostream &out) const { return out; }
+ std::ostream &print(std::ostream &out) const {
+ out << "CachedExtent(addr=" << this
+ << ", type=" << get_type()
+ << ", version=" << version
+ << ", dirty_from=" << dirty_from
+ << ", paddr=" << get_paddr()
+ << ", state=" << state
+ << ", last_committed_crc=" << last_committed_crc
+ << ", refcount=" << use_count();
+ print_detail(out);
+ return out << ")";
+ }
+
+ /**
+ * get_delta
+ *
+ * Must return a valid delta usable in apply_delta() in submit_transaction
+ * if state == MUTATION_PENDING.
+ */
+ virtual ceph::bufferlist get_delta() = 0;
+
+ /**
+ * apply_delta
+ *
+ * bl is a delta obtained previously from get_delta. The versions will
+ * match. Implementation should mutate buffer based on bl. base matches
+ * the address passed on_delta_write.
+ *
+ * Implementation *must* use set_last_committed_crc to update the crc to
+ * what the crc of the buffer would have been at submission. For physical
+ * extents that use base to adjust internal record-relative deltas, this
+ * means that the crc should be of the buffer after applying the delta,
+ * but before that adjustment. We do it this way because the crc in the
+ * commit path does not yet know the record base address.
+ *
+ * LogicalCachedExtent overrides this method and provides a simpler
+ * apply_delta override for LogicalCachedExtent implementers.
+ */
+ virtual void apply_delta_and_adjust_crc(
+ paddr_t base, const ceph::bufferlist &bl) = 0;
+
+ /**
+ * Called on dirty CachedExtent implementation after replay.
+ * Implementation should perform any reads/in-memory-setup
+ * necessary. (for instance, the lba implementation will use this
+ * to load in lba_manager blocks)
+ */
+ using complete_load_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ virtual complete_load_ertr::future<> complete_load() {
+ return complete_load_ertr::now();
+ }
+
+ /**
+ * cast
+ *
+ * Returns a TCachedExtentRef of the specified type.
+ * TODO: add dynamic check that the requested type is actually correct.
+ */
+ template <typename T>
+ TCachedExtentRef<T> cast() {
+ return TCachedExtentRef<T>(static_cast<T*>(this));
+ }
+ template <typename T>
+ TCachedExtentRef<const T> cast() const {
+ return TCachedExtentRef<const T>(static_cast<const T*>(this));
+ }
+
+ /// Returns true if extent is part of an open transaction
+ bool is_pending() const {
+ return state == extent_state_t::INITIAL_WRITE_PENDING ||
+ state == extent_state_t::MUTATION_PENDING;
+ }
+
+ /// Returns true if extent has a pending delta
+ bool is_mutation_pending() const {
+ return state == extent_state_t::MUTATION_PENDING;
+ }
+
+ /// Returns true if extent is a fresh extent
+ bool is_initial_pending() const {
+ return state == extent_state_t::INITIAL_WRITE_PENDING;
+ }
+
+ /// Returns true if extent is clean (does not have deltas on disk)
+ bool is_clean() const {
+ ceph_assert(is_valid());
+ return state == extent_state_t::INITIAL_WRITE_PENDING ||
+ state == extent_state_t::CLEAN;
+ }
+
+ /// Returns true if extent is dirty (has deltas on disk)
+ bool is_dirty() const {
+ ceph_assert(is_valid());
+ return !is_clean();
+ }
+
+ /// Returns true if extent has not been superseded or retired
+ bool is_valid() const {
+ return state != extent_state_t::INVALID;
+ }
+
+ /**
+ * get_dirty_from
+ *
+ * Return journal location of oldest relevant delta.
+ */
+ auto get_dirty_from() const { return dirty_from; }
+
+
+ /**
+ * get_paddr
+ *
+ * Returns current address of extent. If is_initial_pending(), address will
+ * be relative, otherwise address will be absolute.
+ */
+ paddr_t get_paddr() const { return poffset; }
+
+ /// Returns length of extent
+ extent_len_t get_length() const { return ptr.length(); }
+
+ /// Returns version, get_version() == 0 iff is_clean()
+ extent_version_t get_version() const {
+ return version;
+ }
+
+ /// Returns crc32c of buffer
+ uint32_t get_crc32c() {
+ return ceph_crc32c(
+ 1,
+ reinterpret_cast<const unsigned char *>(get_bptr().c_str()),
+ get_length());
+ }
+
+ /// Get ref to raw buffer
+ bufferptr &get_bptr() { return ptr; }
+ const bufferptr &get_bptr() const { return ptr; }
+
+ /// Compare by paddr
+ friend bool operator< (const CachedExtent &a, const CachedExtent &b) {
+ return a.poffset < b.poffset;
+ }
+ friend bool operator> (const CachedExtent &a, const CachedExtent &b) {
+ return a.poffset > b.poffset;
+ }
+ friend bool operator== (const CachedExtent &a, const CachedExtent &b) {
+ return a.poffset == b.poffset;
+ }
+
+ virtual ~CachedExtent();
+
+private:
+ friend struct paddr_cmp;
+ friend struct ref_paddr_cmp;
+ friend class ExtentIndex;
+
+ /// Pointer to containing index (or null)
+ ExtentIndex *parent_index = nullptr;
+
+ /// hook for intrusive extent_index
+ boost::intrusive::set_member_hook<> extent_index_hook;
+ using index_member_options = boost::intrusive::member_hook<
+ CachedExtent,
+ boost::intrusive::set_member_hook<>,
+ &CachedExtent::extent_index_hook>;
+ using index = boost::intrusive::set<CachedExtent, index_member_options>;
+ friend class ExtentIndex;
+ friend class Transaction;
+
+ /// hook for intrusive ref list (mainly dirty or lru list)
+ boost::intrusive::list_member_hook<> primary_ref_list_hook;
+ using primary_ref_list_member_options = boost::intrusive::member_hook<
+ CachedExtent,
+ boost::intrusive::list_member_hook<>,
+ &CachedExtent::primary_ref_list_hook>;
+ using list = boost::intrusive::list<
+ CachedExtent,
+ primary_ref_list_member_options>;
+
+ /// Actual data contents
+ ceph::bufferptr ptr;
+
+ /// number of deltas since initial write
+ extent_version_t version = EXTENT_VERSION_NULL;
+
+ /// address of original block -- relative iff is_pending() and is_clean()
+ paddr_t poffset;
+
+ /// used to wait while in-progress commit completes
+ std::optional<seastar::shared_promise<>> io_wait_promise;
+ void set_io_wait() {
+ ceph_assert(!io_wait_promise);
+ io_wait_promise = seastar::shared_promise<>();
+ }
+ void complete_io() {
+ ceph_assert(io_wait_promise);
+ io_wait_promise->set_value();
+ io_wait_promise = std::nullopt;
+ }
+ seastar::future<> wait_io() {
+ if (!io_wait_promise) {
+ return seastar::now();
+ } else {
+ return io_wait_promise->get_shared_future();
+ }
+ }
+
+protected:
+ CachedExtent(CachedExtent &&other) = delete;
+ CachedExtent(ceph::bufferptr &&ptr) : ptr(std::move(ptr)) {}
+ CachedExtent(const CachedExtent &other)
+ : state(other.state),
+ dirty_from(other.dirty_from),
+ ptr(other.ptr.c_str(), other.ptr.length()),
+ version(other.version),
+ poffset(other.poffset) {}
+
+ struct share_buffer_t {};
+ CachedExtent(const CachedExtent &other, share_buffer_t) :
+ state(other.state),
+ dirty_from(other.dirty_from),
+ ptr(other.ptr),
+ version(other.version),
+ poffset(other.poffset) {}
+
+
+ friend class Cache;
+ template <typename T>
+ static TCachedExtentRef<T> make_cached_extent_ref(bufferptr &&ptr) {
+ return new T(std::move(ptr));
+ }
+
+ CachedExtentRef get_prior_instance() {
+ return prior_instance;
+ }
+
+ /// Sets last_committed_crc
+ void set_last_committed_crc(uint32_t crc) {
+ last_committed_crc = crc;
+ }
+
+ void set_paddr(paddr_t offset) { poffset = offset; }
+
+ /**
+ * maybe_generate_relative
+ *
+ * There are three kinds of addresses one might want to
+ * store within an extent:
+ * - addr for a block within the same transaction relative to the
+ * physical location of this extent in the
+ * event that we will read it in the initial read of the extent
+ * - addr relative to the physical location of the next record to a
+ * block within that record to contain a delta for this extent in
+ * the event that we'll read it from a delta and overlay it onto a
+ * dirty representation of the extent.
+ * - absolute addr to a block already written outside of the current
+ * transaction.
+ *
+ * This helper checks addr and the current state to create the correct
+ * reference.
+ */
+ paddr_t maybe_generate_relative(paddr_t addr) {
+ if (!addr.is_relative()) {
+ return addr;
+ } else if (is_mutation_pending()) {
+ return addr;
+ } else {
+ ceph_assert(is_initial_pending());
+ ceph_assert(get_paddr().is_record_relative());
+ return addr - get_paddr();
+ }
+ }
+
+};
+
+std::ostream &operator<<(std::ostream &, CachedExtent::extent_state_t);
+std::ostream &operator<<(std::ostream &, const CachedExtent&);
+
+/// Compare extents by paddr
+struct paddr_cmp {
+ bool operator()(paddr_t lhs, const CachedExtent &rhs) const {
+ return lhs < rhs.poffset;
+ }
+ bool operator()(const CachedExtent &lhs, paddr_t rhs) const {
+ return lhs.poffset < rhs;
+ }
+};
+
+/// Compare extent refs by paddr
+struct ref_paddr_cmp {
+ using is_transparent = paddr_t;
+ bool operator()(const CachedExtentRef &lhs, const CachedExtentRef &rhs) const {
+ return lhs->poffset < rhs->poffset;
+ }
+ bool operator()(const paddr_t &lhs, const CachedExtentRef &rhs) const {
+ return lhs < rhs->poffset;
+ }
+ bool operator()(const CachedExtentRef &lhs, const paddr_t &rhs) const {
+ return lhs->poffset < rhs;
+ }
+};
+
+template <typename T, typename C>
+class addr_extent_list_base_t
+ : public std::list<std::pair<T, C>> {};
+
+using pextent_list_t = addr_extent_list_base_t<paddr_t, CachedExtentRef>;
+
+template <typename T, typename C, typename Cmp>
+class addr_extent_set_base_t
+ : public std::set<C, Cmp> {};
+
+using pextent_set_t = addr_extent_set_base_t<
+ paddr_t,
+ CachedExtentRef,
+ ref_paddr_cmp
+ >;
+
+template <typename T>
+using t_pextent_list_t = addr_extent_list_base_t<paddr_t, TCachedExtentRef<T>>;
+
+/**
+ * ExtentIndex
+ *
+ * Index of CachedExtent refs by poffset; does not hold a reference,
+ * so the user must ensure each extent is removed prior to deletion
+ */
+class ExtentIndex {
+ friend class Cache;
+ CachedExtent::index extent_index;
+public:
+ auto get_overlap(paddr_t addr, segment_off_t len) {
+ auto bottom = extent_index.upper_bound(addr, paddr_cmp());
+ if (bottom != extent_index.begin())
+ --bottom;
+ if (bottom != extent_index.end() &&
+ bottom->get_paddr().add_offset(bottom->get_length()) <= addr)
+ ++bottom;
+
+ auto top = extent_index.lower_bound(addr.add_offset(len), paddr_cmp());
+ return std::make_pair(
+ bottom,
+ top
+ );
+ }
+
+ void clear() {
+ extent_index.clear();
+ }
+
+ void insert(CachedExtent &extent) {
+ // sanity check
+ auto [a, b] = get_overlap(
+ extent.get_paddr(),
+ extent.get_length());
+ ceph_assert(a == b);
+
+ extent_index.insert(extent);
+ extent.parent_index = this;
+ }
+
+ void erase(CachedExtent &extent) {
+ extent_index.erase(extent);
+ extent.parent_index = nullptr;
+ }
+
+ void replace(CachedExtent &to, CachedExtent &from) {
+ extent_index.replace_node(extent_index.s_iterator_to(from), to);
+ from.parent_index = nullptr;
+ to.parent_index = this;
+ }
+
+ bool empty() const {
+ return extent_index.empty();
+ }
+
+ auto find_offset(paddr_t offset) {
+ return extent_index.find(offset, paddr_cmp());
+ }
+
+ auto begin() {
+ return extent_index.begin();
+ }
+
+ auto end() {
+ return extent_index.end();
+ }
+
+ void merge(ExtentIndex &&other) {
+ for (auto it = other.extent_index.begin();
+ it != other.extent_index.end();
+ ) {
+ auto &ext = *it;
+ ++it;
+ other.extent_index.erase(ext);
+ extent_index.insert(ext);
+ }
+ }
+
+ template <typename T>
+ void remove(T &l) {
+ for (auto &ext : l) {
+ extent_index.erase(ext);
+ }
+ }
+};
+
+class LogicalCachedExtent;
+class LBAPin;
+using LBAPinRef = std::unique_ptr<LBAPin>;
+class LBAPin {
+public:
+ virtual void link_extent(LogicalCachedExtent *ref) = 0;
+ virtual void take_pin(LBAPin &pin) = 0;
+ virtual extent_len_t get_length() const = 0;
+ virtual paddr_t get_paddr() const = 0;
+ virtual laddr_t get_laddr() const = 0;
+ virtual LBAPinRef duplicate() const = 0;
+
+ virtual ~LBAPin() {}
+};
+std::ostream &operator<<(std::ostream &out, const LBAPin &rhs);
+
+using lba_pin_list_t = std::list<LBAPinRef>;
+
+std::ostream &operator<<(std::ostream &out, const lba_pin_list_t &rhs);
+
+
+/**
+ * LogicalCachedExtent
+ *
+ * CachedExtent with associated lba mapping.
+ *
+ * Users of TransactionManager should be using extents derived from
+ * LogicalCachedExtent.
+ */
+class LogicalCachedExtent : public CachedExtent {
+public:
+ template <typename... T>
+ LogicalCachedExtent(T&&... t) : CachedExtent(std::forward<T>(t)...) {}
+
+ void set_pin(LBAPinRef &&npin) {
+ assert(!pin);
+ pin = std::move(npin);
+ laddr = pin->get_laddr();
+ pin->link_extent(this);
+ }
+
+ bool has_pin() const {
+ return !!pin;
+ }
+
+ LBAPin &get_pin() {
+ assert(pin);
+ return *pin;
+ }
+
+ laddr_t get_laddr() const {
+ assert(laddr != L_ADDR_NULL);
+ return laddr;
+ }
+
+ void set_laddr(laddr_t nladdr) {
+ laddr = nladdr;
+ }
+
+ void apply_delta_and_adjust_crc(
+ paddr_t base, const ceph::bufferlist &bl) final {
+ apply_delta(bl);
+ set_last_committed_crc(get_crc32c());
+ }
+
+ bool is_logical() const final {
+ return true;
+ }
+
+ std::ostream &print_detail(std::ostream &out) const final;
+protected:
+ virtual void apply_delta(const ceph::bufferlist &bl) = 0;
+ virtual std::ostream &print_detail_l(std::ostream &out) const {
+ return out;
+ }
+
+ virtual void logical_on_delta_write() {}
+
+ void on_delta_write(paddr_t record_block_offset) final {
+ assert(get_prior_instance());
+ pin->take_pin(*(get_prior_instance()->cast<LogicalCachedExtent>()->pin));
+ logical_on_delta_write();
+ }
+
+private:
+ laddr_t laddr = L_ADDR_NULL;
+ LBAPinRef pin;
+};
+
+using LogicalCachedExtentRef = TCachedExtentRef<LogicalCachedExtent>;
+struct ref_laddr_cmp {
+ using is_transparent = laddr_t;
+ bool operator()(const LogicalCachedExtentRef &lhs,
+ const LogicalCachedExtentRef &rhs) const {
+ return lhs->get_laddr() < rhs->get_laddr();
+ }
+ bool operator()(const laddr_t &lhs,
+ const LogicalCachedExtentRef &rhs) const {
+ return lhs < rhs->get_laddr();
+ }
+ bool operator()(const LogicalCachedExtentRef &lhs,
+ const laddr_t &rhs) const {
+ return lhs->get_laddr() < rhs;
+ }
+};
+
+using lextent_set_t = addr_extent_set_base_t<
+ laddr_t,
+ LogicalCachedExtentRef,
+ ref_laddr_cmp
+ >;
+
+template <typename T>
+using lextent_list_t = addr_extent_list_base_t<
+ laddr_t, TCachedExtentRef<T>>;
+
+}
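For orientation, a minimal sketch of what a LogicalCachedExtent implementer has to provide, loosely modeled on the TestBlock extents referenced in cache.cc. The whole-buffer delta format and the reuse of the TEST_BLOCK type tag are illustrative assumptions only; real implementers (lba nodes, onode blocks) encode targeted deltas instead:

struct ExampleBlock : LogicalCachedExtent {
  ExampleBlock(ceph::bufferptr &&ptr) : LogicalCachedExtent(std::move(ptr)) {}
  ExampleBlock(const ExampleBlock &other) : LogicalCachedExtent(other) {}

  CachedExtentRef duplicate_for_write() final {
    // copy the buffer; mutations land in the copy until commit
    return CachedExtentRef(new ExampleBlock(*this));
  }

  extent_types_t get_type() const final {
    return extent_types_t::TEST_BLOCK;  // assumption: borrow the test tag
  }

  ceph::bufferlist get_delta() final {
    // trivial delta: ship the whole mutated buffer
    ceph::bufferlist bl;
    bl.append(get_bptr());
    return bl;
  }

  void apply_delta(const ceph::bufferlist &bl) final {
    // overlay the recorded buffer onto the cached one
    assert(bl.length() == get_length());
    auto p = bl.cbegin();
    p.copy(bl.length(), get_bptr().c_str());
  }
};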
diff --git a/src/crimson/os/seastore/extentmap_manager.cc b/src/crimson/os/seastore/extentmap_manager.cc
new file mode 100644
index 000000000..32de3a6ed
--- /dev/null
+++ b/src/crimson/os/seastore/extentmap_manager.cc
@@ -0,0 +1,32 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include <experimental/iterator>
+#include <iostream>
+
+#include "crimson/os/seastore/transaction_manager.h"
+#include "crimson/os/seastore/extentmap_manager.h"
+#include "crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.h"
+namespace crimson::os::seastore::extentmap_manager {
+
+ExtentMapManagerRef create_extentmap_manager(TransactionManager &trans_manager) {
+ return ExtentMapManagerRef(new BtreeExtentMapManager(trans_manager));
+}
+
+}
+
+namespace crimson::os::seastore {
+
+std::ostream &operator<<(std::ostream &out, const extent_mapping_t &rhs)
+{
+ return out << "extent_mapping_t (" << rhs.logical_offset << "~" << rhs.length
+ << "->" << rhs.laddr << ")";
+}
+
+std::ostream &operator<<(std::ostream &out, const extent_map_list_t &rhs)
+{
+ out << '[';
+ std::copy(std::begin(rhs), std::end(rhs), std::experimental::make_ostream_joiner(out, ", "));
+ return out << ']';
+}
+
+}
diff --git a/src/crimson/os/seastore/extentmap_manager.h b/src/crimson/os/seastore/extentmap_manager.h
new file mode 100644
index 000000000..7d5223b94
--- /dev/null
+++ b/src/crimson/os/seastore/extentmap_manager.h
@@ -0,0 +1,124 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iosfwd>
+#include <list>
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include <seastar/core/future.hh>
+
+#include "crimson/osd/exceptions.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/transaction_manager.h"
+
+#define PAGE_SIZE 4096
+#define EXTMAP_BLOCK_SIZE 4096
+
+namespace crimson::os::seastore {
+
+struct lext_map_val_t {
+ laddr_t laddr;
+ extent_len_t length = 0;
+
+ lext_map_val_t(
+ laddr_t laddr,
+ extent_len_t length)
+ : laddr(laddr), length(length) {}
+
+};
+
+class extent_mapping_t
+{
+public:
+ objaddr_t logical_offset = 0; //offset in object
+ laddr_t laddr; // lextent start address aligned with block size.
+ extent_len_t length = 0;
+ explicit extent_mapping_t(objaddr_t lo) : logical_offset(lo) { }
+
+ extent_mapping_t(
+ objaddr_t lo,
+ laddr_t laddr,
+ extent_len_t length)
+ : logical_offset(lo), laddr(laddr), length(length) {}
+
+ ~extent_mapping_t() {}
+};
+
+enum class extmap_root_state_t : uint8_t {
+ INITIAL = 0,
+ MUTATED = 1,
+ NONE = 0xFF
+};
+
+using extent_map_list_t = std::list<extent_mapping_t>;
+std::ostream &operator<<(std::ostream &out, const extent_mapping_t &rhs);
+std::ostream &operator<<(std::ostream &out, const extent_map_list_t &rhs);
+
+struct extmap_root_t {
+ depth_t depth = 0;
+ extmap_root_state_t state;
+ laddr_t extmap_root_laddr;
+ extmap_root_t(depth_t dep, laddr_t laddr)
+ : depth(dep),
+ extmap_root_laddr(laddr) { state = extmap_root_state_t::INITIAL; }
+};
+
+/**
+ * Abstract interface for managing the mapping from object-internal offsets to
+ * logical addresses. Each onode has its own extentmap tree.
+ */
+class ExtentMapManager {
+public:
+ using initialize_extmap_ertr = TransactionManager::alloc_extent_ertr;
+ using initialize_extmap_ret = initialize_extmap_ertr::future<extmap_root_t>;
+ virtual initialize_extmap_ret initialize_extmap(Transaction &t) = 0;
+
+ /* find_lextent
+ *
+ * Returns a list of all extent_mapping_t overlapping any portion of lo~len;
+ * if no overlapping mapping is found, returns the next extent after the range.
+ */
+ using find_lextent_ertr = TransactionManager::read_extent_ertr;
+ using find_lextent_ret = find_lextent_ertr::future<extent_map_list_t>;
+ virtual find_lextent_ret
+ find_lextent(const extmap_root_t &extmap_root, Transaction &t, objaddr_t lo, extent_len_t len) = 0;
+
+ /* add_lextent
+ *
+ * Adds a new mapping (object offset -> laddr, length) to the extent map
+ * and returns the added extent_mapping_t.
+ */
+ using add_lextent_ertr = TransactionManager::read_extent_ertr;
+ using add_lextent_ret = add_lextent_ertr::future<extent_mapping_t>;
+ virtual add_lextent_ret
+ add_lextent(extmap_root_t &extmap_root, Transaction &t, objaddr_t lo, lext_map_val_t val) = 0;
+
+ /* rm_lextent
+ *
+ * Removes an existing extent mapping from the extent map;
+ * returns true if the mapping was removed, false otherwise.
+ */
+ using rm_lextent_ertr = TransactionManager::read_extent_ertr;
+ using rm_lextent_ret = rm_lextent_ertr::future<bool>;
+ virtual rm_lextent_ret rm_lextent(extmap_root_t &extmap_root, Transaction &t, objaddr_t lo, lext_map_val_t val) = 0;
+
+ virtual ~ExtentMapManager() {}
+};
+using ExtentMapManagerRef = std::unique_ptr<ExtentMapManager>;
+
+namespace extentmap_manager {
+/* create an ExtentMapManager for an extentmap
+ * For a new extmap, initialize_extmap must be called after create_extentmap_manager
+ * to initialize the extent map before using it.
+ * For an existing extmap, initialize_extmap is not needed.
+ */
+ExtentMapManagerRef create_extentmap_manager(
+ TransactionManager &trans_manager);
+
+}
+
+}
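A hedged usage sketch of the interface above; `extmap_manager`, `t`, and `blk_laddr` are assumed to outlive the returned future (e.g. members of the calling object), and the call order shown — initialize once for a fresh map, then add/find/rm per operation — mirrors the comment on create_extentmap_manager:

return extmap_manager.initialize_extmap(t
).safe_then([&extmap_manager, &t, blk_laddr](extmap_root_t extmap_root) {
  return seastar::do_with(
    std::move(extmap_root),
    [&extmap_manager, &t, blk_laddr](extmap_root_t &root) {
      // map object offset 0 -> blk_laddr for one EXTMAP_BLOCK_SIZE block
      return extmap_manager.add_lextent(
        root, t, 0, lext_map_val_t{blk_laddr, EXTMAP_BLOCK_SIZE}
      ).safe_then([&extmap_manager, &t, &root](extent_mapping_t) {
        // look the mapping back up
        return extmap_manager.find_lextent(root, t, 0, EXTMAP_BLOCK_SIZE);
      });
    });
});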
diff --git a/src/crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.cc b/src/crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.cc
new file mode 100644
index 000000000..f7609d3e8
--- /dev/null
+++ b/src/crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.cc
@@ -0,0 +1,118 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include <sys/mman.h>
+#include <string.h>
+
+#include "crimson/common/log.h"
+
+#include "include/buffer.h"
+#include "crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.h"
+#include "crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+}
+
+namespace crimson::os::seastore::extentmap_manager {
+
+BtreeExtentMapManager::BtreeExtentMapManager(
+ TransactionManager &tm)
+ : tm(tm) {}
+
+BtreeExtentMapManager::initialize_extmap_ret
+BtreeExtentMapManager::initialize_extmap(Transaction &t)
+{
+
+ logger().debug("{}", __func__);
+ return tm.alloc_extent<ExtMapLeafNode>(t, L_ADDR_MIN, EXTMAP_BLOCK_SIZE)
+ .safe_then([](auto&& root_extent) {
+ root_extent->set_size(0);
+ extmap_node_meta_t meta{1};
+ root_extent->set_meta(meta);
+ extmap_root_t extmap_root = extmap_root_t(1, root_extent->get_laddr());
+ return initialize_extmap_ertr::make_ready_future<extmap_root_t>(extmap_root);
+ });
+}
+
+BtreeExtentMapManager::get_root_ret
+BtreeExtentMapManager::get_extmap_root(const extmap_root_t &extmap_root, Transaction &t)
+{
+ assert(extmap_root.extmap_root_laddr != L_ADDR_NULL);
+ laddr_t laddr = extmap_root.extmap_root_laddr;
+ return extmap_load_extent(get_ext_context(t), laddr, extmap_root.depth);
+}
+
+BtreeExtentMapManager::find_lextent_ret
+BtreeExtentMapManager::find_lextent(const extmap_root_t &extmap_root, Transaction &t,
+ objaddr_t lo, extent_len_t len)
+{
+ logger().debug("{}: {}, {}", __func__, lo, len);
+ return get_extmap_root(extmap_root, t).safe_then([this, &t, lo, len](auto&& extent) {
+ return extent->find_lextent(get_ext_context(t), lo, len);
+ }).safe_then([](auto &&e) {
+ logger().debug("{}: found_lextent {}", __func__, e);
+ return find_lextent_ret(
+ find_lextent_ertr::ready_future_marker{},
+ std::move(e));
+ });
+
+}
+
+BtreeExtentMapManager::add_lextent_ret
+BtreeExtentMapManager::add_lextent(extmap_root_t &extmap_root, Transaction &t,
+ objaddr_t lo, lext_map_val_t val)
+{
+ logger().debug("{}: {}, {}, {}", __func__, lo, val.laddr, val.length);
+ return get_extmap_root(extmap_root, t).safe_then([this, &extmap_root, &t, lo, val](auto &&root) {
+ return insert_lextent(extmap_root, t, root, lo, val);
+ }).safe_then([](auto ret) {
+ logger().debug("{}: {}", __func__, ret);
+ return add_lextent_ret(
+ add_lextent_ertr::ready_future_marker{},
+ std::move(ret));
+ });
+
+}
+
+BtreeExtentMapManager::insert_lextent_ret
+BtreeExtentMapManager::insert_lextent(extmap_root_t &extmap_root, Transaction &t,
+ ExtMapNodeRef root, objaddr_t logical_offset, lext_map_val_t val)
+{
+ auto split = insert_lextent_ertr::make_ready_future<ExtMapNodeRef>(root);
+ if (root->at_max_capacity()) {
+ logger().debug("{}::splitting root {}", __func__, *root);
+ split = root->extmap_alloc_extent<ExtMapInnerNode>(get_ext_context(t), EXTMAP_BLOCK_SIZE)
+ .safe_then([this, &extmap_root, root, &t, logical_offset](auto&& nroot) {
+ extmap_node_meta_t meta{root->get_node_meta().depth + 1};
+ nroot->set_meta(meta);
+ nroot->journal_insert(nroot->begin(), OBJ_ADDR_MIN,
+ root->get_laddr(), nullptr);
+ extmap_root.extmap_root_laddr = nroot->get_laddr();
+ extmap_root.depth = root->get_node_meta().depth + 1;
+ extmap_root.state = extmap_root_state_t::MUTATED;
+ return nroot->split_entry(get_ext_context(t), logical_offset, nroot->begin(), root);
+ });
+ }
+ return split.safe_then([this, &t, logical_offset, val](ExtMapNodeRef node) {
+ return node->insert(get_ext_context(t), logical_offset, val);
+ });
+}
+
+BtreeExtentMapManager::rm_lextent_ret
+BtreeExtentMapManager::rm_lextent(extmap_root_t &extmap_root, Transaction &t, objaddr_t lo, lext_map_val_t val)
+{
+ logger().debug("{}: {}, {}, {}", __func__, lo, val.laddr, val.length);
+ return get_extmap_root(extmap_root, t).safe_then([this, &t, lo, val](auto extent) {
+ return extent->rm_lextent(get_ext_context(t), lo, val);
+ }).safe_then([](auto removed) {
+ logger().debug("{}: {}", __func__, removed);
+ return rm_lextent_ret(
+ rm_lextent_ertr::ready_future_marker{},
+ removed);
+ });
+}
+
+
+}
diff --git a/src/crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.h b/src/crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.h
new file mode 100644
index 000000000..db676f41d
--- /dev/null
+++ b/src/crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.h
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <seastar/core/future.hh>
+
+#include "include/ceph_assert.h"
+#include "crimson/osd/exceptions.h"
+
+#include "crimson/os/seastore/extentmap_manager.h"
+#include "crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/transaction_manager.h"
+
+namespace crimson::os::seastore::extentmap_manager {
+/**
+ * BtreeExtentMapManager
+ *
+ * Uses a btree to track:
+ * objaddr_t -> laddr_t mapping for each onode extentmap
+ */
+
+class BtreeExtentMapManager : public ExtentMapManager {
+ TransactionManager &tm;
+
+ ext_context_t get_ext_context(Transaction &t) {
+ return ext_context_t{tm,t};
+ }
+
+ /* get_extmap_root
+ *
+ * load extent map tree root node
+ */
+ using get_root_ertr = TransactionManager::read_extent_ertr;
+ using get_root_ret = get_root_ertr::future<ExtMapNodeRef>;
+ get_root_ret get_extmap_root(const extmap_root_t &extmap_root, Transaction &t);
+
+ using insert_lextent_ertr = TransactionManager::read_extent_ertr;
+ using insert_lextent_ret = insert_lextent_ertr::future<extent_mapping_t >;
+ insert_lextent_ret insert_lextent(extmap_root_t &extmap_root, Transaction &t,
+ ExtMapNodeRef extent, objaddr_t lo,
+ lext_map_val_t val);
+
+public:
+ explicit BtreeExtentMapManager(TransactionManager &tm);
+
+ initialize_extmap_ret initialize_extmap(Transaction &t) final;
+
+ find_lextent_ret find_lextent(const extmap_root_t &extmap_root, Transaction &t, objaddr_t lo, extent_len_t len) final;
+
+ add_lextent_ret add_lextent(extmap_root_t &extmap_root, Transaction &t, objaddr_t lo, lext_map_val_t val) final;
+
+ rm_lextent_ret rm_lextent(extmap_root_t &extmap_root, Transaction &t, objaddr_t lo, lext_map_val_t val) final;
+
+
+};
+using BtreeExtentMapManagerRef = std::unique_ptr<BtreeExtentMapManager>;
+
+}
diff --git a/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node.h b/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node.h
new file mode 100644
index 000000000..3937bd049
--- /dev/null
+++ b/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node.h
@@ -0,0 +1,145 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+
+#pragma once
+
+#include <boost/iterator/counting_iterator.hpp>
+
+#include "crimson/common/log.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/transaction_manager.h"
+#include "crimson/os/seastore/extentmap_manager.h"
+
+namespace crimson::os::seastore::extentmap_manager{
+
+struct ext_context_t {
+ TransactionManager &tm;
+ Transaction &t;
+};
+
+struct extmap_node_meta_t {
+ depth_t depth = 0;
+
+ std::pair<extmap_node_meta_t, extmap_node_meta_t> split_into(objaddr_t pivot) const {
+ return std::make_pair(
+ extmap_node_meta_t{depth},
+ extmap_node_meta_t{depth});
+ }
+
+ static extmap_node_meta_t merge_from(
+ const extmap_node_meta_t &lhs, const extmap_node_meta_t &rhs) {
+ assert(lhs.depth == rhs.depth);
+ return extmap_node_meta_t{lhs.depth};
+ }
+
+ static std::pair<extmap_node_meta_t, extmap_node_meta_t>
+ rebalance(const extmap_node_meta_t &lhs, const extmap_node_meta_t &rhs, laddr_t pivot) {
+ assert(lhs.depth == rhs.depth);
+ return std::make_pair(
+ extmap_node_meta_t{lhs.depth},
+ extmap_node_meta_t{lhs.depth});
+ }
+};
+
+struct ExtMapNode : LogicalCachedExtent {
+ using ExtMapNodeRef = TCachedExtentRef<ExtMapNode>;
+
+ ExtMapNode(ceph::bufferptr &&ptr) : LogicalCachedExtent(std::move(ptr)) {}
+ ExtMapNode(const ExtMapNode &other)
+ : LogicalCachedExtent(other) {}
+
+ using find_lextent_ertr = ExtentMapManager::find_lextent_ertr;
+ using find_lextent_ret = ExtentMapManager::find_lextent_ret;
+ virtual find_lextent_ret find_lextent(ext_context_t ec,
+ objaddr_t lo, extent_len_t len) = 0;
+
+ using insert_ertr = TransactionManager::read_extent_ertr;
+ using insert_ret = insert_ertr::future<extent_mapping_t>;
+ virtual insert_ret insert(ext_context_t ec, objaddr_t lo, lext_map_val_t val) = 0;
+
+ using rm_lextent_ertr = TransactionManager::read_extent_ertr;
+ using rm_lextent_ret = rm_lextent_ertr::future<bool>;
+ virtual rm_lextent_ret rm_lextent(ext_context_t ec, objaddr_t lo, lext_map_val_t val) = 0;
+
+ using split_children_ertr = TransactionManager::alloc_extent_ertr;
+ using split_children_ret = split_children_ertr::future
+ <std::tuple<ExtMapNodeRef, ExtMapNodeRef, uint32_t>>;
+ virtual split_children_ret make_split_children(ext_context_t ec) = 0;
+
+ using full_merge_ertr = TransactionManager::alloc_extent_ertr;
+ using full_merge_ret = full_merge_ertr::future<ExtMapNodeRef>;
+ virtual full_merge_ret make_full_merge(ext_context_t ec, ExtMapNodeRef right) = 0;
+
+ using make_balanced_ertr = TransactionManager::alloc_extent_ertr;
+ using make_balanced_ret = make_balanced_ertr::future
+ <std::tuple<ExtMapNodeRef, ExtMapNodeRef, uint32_t>>;
+ virtual make_balanced_ret
+ make_balanced(ext_context_t ec, ExtMapNodeRef right, bool prefer_left) = 0;
+
+ virtual extmap_node_meta_t get_node_meta() const = 0;
+
+ virtual bool at_max_capacity() const = 0;
+ virtual bool at_min_capacity() const = 0;
+ virtual unsigned get_node_size() const = 0;
+ virtual ~ExtMapNode() = default;
+
+ using alloc_ertr = TransactionManager::alloc_extent_ertr;
+ template<class T>
+ alloc_ertr::future<TCachedExtentRef<T>>
+ extmap_alloc_extent(ext_context_t ec, extent_len_t len) {
+ return ec.tm.alloc_extent<T>(ec.t, L_ADDR_MIN, len).safe_then(
+ [](auto&& extent) {
+ return alloc_ertr::make_ready_future<TCachedExtentRef<T>>(std::move(extent));
+ });
+ }
+
+ template<class T>
+ alloc_ertr::future<std::pair<TCachedExtentRef<T>, TCachedExtentRef<T>>>
+ extmap_alloc_2extents(ext_context_t ec, extent_len_t len) {
+ return seastar::do_with(std::pair<TCachedExtentRef<T>, TCachedExtentRef<T>>(),
+ [ec, len] (auto &extents) {
+ return crimson::do_for_each(boost::make_counting_iterator(0),
+ boost::make_counting_iterator(2),
+ [ec, len, &extents] (auto i) {
+ return ec.tm.alloc_extent<T>(ec.t, L_ADDR_MIN, len).safe_then(
+ [i, &extents](auto &&node) {
+ if (i == 0)
+ extents.first = node;
+ if (i == 1)
+ extents.second = node;
+ });
+ }).safe_then([&extents] {
+ return alloc_ertr::make_ready_future
+ <std::pair<TCachedExtentRef<T>, TCachedExtentRef<T>>>(std::move(extents));
+ });
+ });
+ }
+
+ using retire_ertr = crimson::errorator<
+ crimson::ct_error::enoent,
+ crimson::ct_error::input_output_error>;
+ using retire_ret = retire_ertr::future<std::list<unsigned>>;
+ retire_ret
+ extmap_retire_node(ext_context_t ec, std::list<laddr_t> dec_laddrs) {
+ return seastar::do_with(std::move(dec_laddrs), std::list<unsigned>(),
+ [ec] (auto &&dec_laddrs, auto &refcnt) {
+ return crimson::do_for_each(dec_laddrs.begin(), dec_laddrs.end(),
+ [ec, &refcnt] (auto &laddr) {
+ return ec.tm.dec_ref(ec.t, laddr).safe_then([&refcnt] (auto ref) {
+ refcnt.push_back(ref);
+ });
+ }).safe_then([&refcnt] {
+ return retire_ertr::make_ready_future<std::list<unsigned>>(std::move(refcnt));
+ });
+ });
+ }
+
+};
+
+using ExtMapNodeRef = ExtMapNode::ExtMapNodeRef;
+
+TransactionManager::read_extent_ertr::future<ExtMapNodeRef>
+extmap_load_extent(ext_context_t ec, laddr_t laddr, depth_t depth);
+
+}
diff --git a/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.cc b/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.cc
new file mode 100644
index 000000000..7bf8680a5
--- /dev/null
+++ b/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.cc
@@ -0,0 +1,373 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <sys/mman.h>
+#include <string.h>
+
+#include <memory>
+#include <string.h>
+
+#include "include/buffer.h"
+#include "include/byteorder.h"
+#include "crimson/os/seastore/transaction_manager.h"
+#include "crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node.h"
+#include "crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+}
+
+namespace crimson::os::seastore::extentmap_manager {
+
+std::ostream &ExtMapInnerNode::print_detail_l(std::ostream &out) const
+{
+ return out << ", size=" << get_size()
+ << ", depth=" << get_meta().depth;
+}
+
+ExtMapInnerNode::find_lextent_ret
+ExtMapInnerNode::find_lextent(ext_context_t ec, objaddr_t lo, extent_len_t len)
+{
+ auto [begin, end] = bound(lo, lo + len);
+ auto result_up = std::make_unique<extent_map_list_t>();
+ auto &result = *result_up;
+ return crimson::do_for_each(
+ std::move(begin),
+ std::move(end),
+ [this, ec, &result, lo, len](const auto &val) mutable {
+ return extmap_load_extent(ec, val.get_val(), get_meta().depth - 1).safe_then(
+ [ec, &result, lo, len](auto extent) mutable {
+ return extent->find_lextent(ec, lo, len).safe_then(
+ [&result](auto item_list) mutable {
+ result.splice(result.end(), item_list,
+ item_list.begin(), item_list.end());
+ });
+ });
+ }).safe_then([result=std::move(result_up)] {
+ return find_lextent_ret(
+ find_lextent_ertr::ready_future_marker{},
+ std::move(*result));
+ });
+}
+
+ExtMapInnerNode::insert_ret
+ExtMapInnerNode::insert(ext_context_t ec, objaddr_t lo, lext_map_val_t val)
+{
+ auto insertion_pt = get_containing_child(lo);
+ assert(insertion_pt != end());
+ return extmap_load_extent(ec, insertion_pt->get_val(), get_meta().depth - 1).safe_then(
+ [this, ec, insertion_pt, lo, val=std::move(val)](auto extent) mutable {
+ return extent->at_max_capacity() ?
+ split_entry(ec, lo, insertion_pt, extent) :
+ insert_ertr::make_ready_future<ExtMapNodeRef>(std::move(extent));
+ }).safe_then([ec, lo, val=std::move(val)](ExtMapNodeRef extent) mutable {
+ return extent->insert(ec, lo, val);
+ });
+}
+
+ExtMapInnerNode::rm_lextent_ret
+ExtMapInnerNode::rm_lextent(ext_context_t ec, objaddr_t lo, lext_map_val_t val)
+{
+ auto rm_pt = get_containing_child(lo);
+ return extmap_load_extent(ec, rm_pt->get_val(), get_meta().depth - 1).safe_then(
+ [this, ec, rm_pt, lo, val=std::move(val)](auto extent) mutable {
+ if (extent->at_min_capacity() && get_node_size() > 1) {
+ return merge_entry(ec, lo, rm_pt, extent);
+ } else {
+ return merge_entry_ertr::make_ready_future<ExtMapNodeRef>(std::move(extent));
+ }
+ }).safe_then([ec, lo, val](ExtMapNodeRef extent) mutable {
+ return extent->rm_lextent(ec, lo, val);
+ });
+}
+
+ExtMapInnerNode::split_children_ret
+ExtMapInnerNode::make_split_children(ext_context_t ec)
+{
+ logger().debug("{}: {}", "ExtMapInnerNode", __func__);
+ return extmap_alloc_2extents<ExtMapInnerNode>(ec, EXTMAP_BLOCK_SIZE)
+ .safe_then([this] (auto &&ext_pair) {
+ auto [left, right] = ext_pair;
+ return split_children_ret(
+ split_children_ertr::ready_future_marker{},
+ std::make_tuple(left, right, split_into(*left, *right)));
+ });
+}
+
+ExtMapInnerNode::full_merge_ret
+ExtMapInnerNode::make_full_merge(ext_context_t ec, ExtMapNodeRef right)
+{
+ logger().debug("{}: {}", "ExtMapInnerNode", __func__);
+ return extmap_alloc_extent<ExtMapInnerNode>(ec, EXTMAP_BLOCK_SIZE)
+ .safe_then([this, right] (auto &&replacement) {
+ replacement->merge_from(*this, *right->cast<ExtMapInnerNode>());
+ return full_merge_ret(
+ full_merge_ertr::ready_future_marker{},
+ std::move(replacement));
+ });
+}
+
+ExtMapInnerNode::make_balanced_ret
+ExtMapInnerNode::make_balanced(ext_context_t ec, ExtMapNodeRef _right, bool prefer_left)
+{
+ logger().debug("{}: {}", "ExtMapInnerNode", __func__);
+ ceph_assert(_right->get_type() == type);
+ return extmap_alloc_2extents<ExtMapInnerNode>(ec, EXTMAP_BLOCK_SIZE)
+ .safe_then([this, _right, prefer_left] (auto &&replacement_pair){
+ auto [replacement_left, replacement_right] = replacement_pair;
+ auto &right = *_right->cast<ExtMapInnerNode>();
+ return make_balanced_ret(
+ make_balanced_ertr::ready_future_marker{},
+ std::make_tuple(replacement_left, replacement_right,
+ balance_into_new_nodes(*this, right, prefer_left,
+ *replacement_left, *replacement_right)));
+ });
+}
+
+ExtMapInnerNode::split_entry_ret
+ExtMapInnerNode::split_entry(ext_context_t ec, objaddr_t lo,
+ internal_iterator_t iter, ExtMapNodeRef entry)
+{
+ logger().debug("{}: {}", "ExtMapInnerNode", __func__);
+ if (!is_pending()) {
+ auto mut = ec.tm.get_mutable_extent(ec.t, this)->cast<ExtMapInnerNode>();
+ auto mut_iter = mut->iter_idx(iter->get_offset());
+ return mut->split_entry(ec, lo, mut_iter, entry);
+ }
+ ceph_assert(!at_max_capacity());
+ return entry->make_split_children(ec)
+ .safe_then([this, ec, lo, iter, entry] (auto tuple){
+ auto [left, right, pivot] = tuple;
+ journal_update(iter, left->get_laddr(), maybe_get_delta_buffer());
+ journal_insert(iter + 1, pivot, right->get_laddr(), maybe_get_delta_buffer());
+ logger().debug(
+ "ExtMapInnerNode::split_entry *this {} entry {} into left {} right {}",
+ *this, *entry, *left, *right);
+ //retire extent
+ return ec.tm.dec_ref(ec.t, entry->get_laddr())
+ .safe_then([lo, left = left, right = right, pivot = pivot] (auto ret) {
+ return split_entry_ertr::make_ready_future<ExtMapNodeRef>(
+ pivot > lo ? left : right);
+ });
+ });
+}
+
+ExtMapInnerNode::merge_entry_ret
+ExtMapInnerNode::merge_entry(ext_context_t ec, objaddr_t lo,
+ internal_iterator_t iter, ExtMapNodeRef entry)
+{
+ if (!is_pending()) {
+ auto mut = ec.tm.get_mutable_extent(ec.t, this)->cast<ExtMapInnerNode>();
+ auto mut_iter = mut->iter_idx(iter->get_offset());
+ return mut->merge_entry(ec, lo, mut_iter, entry);
+ }
+ logger().debug("ExtMapInnerNode: merge_entry: {}, {}", *this, *entry);
+ auto is_left = (iter + 1) == end();
+ auto donor_iter = is_left ? iter - 1 : iter + 1;
+ return extmap_load_extent(ec, donor_iter->get_val(), get_meta().depth - 1)
+ .safe_then([this, ec, lo, iter, entry, donor_iter, is_left]
+ (auto &&donor) mutable {
+ auto [l, r] = is_left ?
+ std::make_pair(donor, entry) : std::make_pair(entry, donor);
+ auto [liter, riter] = is_left ?
+ std::make_pair(donor_iter, iter) : std::make_pair(iter, donor_iter);
+ if (donor->at_min_capacity()) {
+ return l->make_full_merge(ec, r)
+ .safe_then([this, ec, entry, l = l, r = r, liter = liter, riter = riter]
+ (auto &&replacement){
+ journal_update(liter, replacement->get_laddr(), maybe_get_delta_buffer());
+ journal_remove(riter, maybe_get_delta_buffer());
+ //retire extent
+ std::list<laddr_t> dec_laddrs;
+ dec_laddrs.push_back(l->get_laddr());
+ dec_laddrs.push_back(r->get_laddr());
+ return extmap_retire_node(ec, dec_laddrs)
+ .safe_then([replacement] (auto &&ret) {
+ return merge_entry_ertr::make_ready_future<ExtMapNodeRef>(replacement);
+ });
+ });
+ } else {
+ logger().debug("ExtMapInnerNode::merge_entry balanced l {} r {}",
+ *l, *r);
+ return l->make_balanced(ec, r, !is_left)
+ .safe_then([this, ec, lo, entry, l = l, r = r, liter = liter, riter = riter]
+ (auto tuple) {
+ auto [replacement_l, replacement_r, pivot] = tuple;
+ journal_update(liter, replacement_l->get_laddr(), maybe_get_delta_buffer());
+ journal_replace(riter, pivot, replacement_r->get_laddr(),
+ maybe_get_delta_buffer());
+ // retire extent
+ std::list<laddr_t> dec_laddrs;
+ dec_laddrs.push_back(l->get_laddr());
+ dec_laddrs.push_back(r->get_laddr());
+ return extmap_retire_node(ec, dec_laddrs)
+ .safe_then([lo, pivot = pivot, replacement_l = replacement_l, replacement_r = replacement_r]
+ (auto &&ret) {
+ return merge_entry_ertr::make_ready_future<ExtMapNodeRef>(
+ lo >= pivot ? replacement_r : replacement_l);
+ });
+ });
+ }
+ });
+}
+
+
+ExtMapInnerNode::internal_iterator_t
+ExtMapInnerNode::get_containing_child(objaddr_t lo)
+{
+ // TODO: binary search
+ for (auto i = begin(); i != end(); ++i) {
+ if (i.contains(lo))
+ return i;
+ }
+ ceph_assert(0 == "invalid");
+ return end();
+}
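The TODO above could be served by a binary search over the node's keys. A minimal sketch of what that might look like, assuming iter_idx() gives random access to entries (as it is used elsewhere in this file), keys are stored in ascending order, and the node is non-empty:

    // possible replacement for the linear scan above
    ExtMapInnerNode::internal_iterator_t
    ExtMapInnerNode::get_containing_child(objaddr_t lo)
    {
      // find the last entry whose key is <= lo
      unsigned left = 0, right = get_node_size();
      while (left + 1 < right) {
        unsigned mid = left + (right - left) / 2;
        if (iter_idx(mid)->get_key() <= lo)
          left = mid;
        else
          right = mid;
      }
      auto child = iter_idx(left);
      ceph_assert(child.contains(lo));
      return child;
    }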
+
+std::ostream &ExtMapLeafNode::print_detail_l(std::ostream &out) const
+{
+ return out << ", size=" << get_size()
+ << ", depth=" << get_meta().depth;
+}
+
+ExtMapLeafNode::find_lextent_ret
+ExtMapLeafNode::find_lextent(ext_context_t ec, objaddr_t lo, extent_len_t len)
+{
+ logger().debug(
+ "ExtMapLeafNode::find_lextent {}~{}", lo, len);
+ auto ret = extent_map_list_t();
+ auto [from, to] = get_leaf_entries(lo, len);
+ if (from == to && to != end())
+ ++to;
+ for (; from != to; ++from) {
+ auto val = (*from).get_val();
+ ret.emplace_back(
+ extent_mapping_t(
+ (*from).get_key(),
+ val.laddr,
+ val.length));
+ logger().debug("ExtMapLeafNode::find_lextent find {}~{}", lo, val.laddr);
+ }
+ return find_lextent_ertr::make_ready_future<extent_map_list_t>(
+ std::move(ret));
+}
+
+ExtMapLeafNode::insert_ret
+ExtMapLeafNode::insert(ext_context_t ec, objaddr_t lo, lext_map_val_t val)
+{
+ ceph_assert(!at_max_capacity());
+ if (!is_pending()) {
+ auto mut = ec.tm.get_mutable_extent(ec.t, this)->cast<ExtMapLeafNode>();
+ return mut->insert(ec, lo, val);
+ }
+ auto insert_pt = lower_bound(lo);
+ journal_insert(insert_pt, lo, val, maybe_get_delta_buffer());
+
+ logger().debug(
+ "ExtMapLeafNode::insert: inserted {}->{} {}",
+ insert_pt.get_key(),
+ insert_pt.get_val().laddr,
+ insert_pt.get_val().length);
+ return insert_ertr::make_ready_future<extent_mapping_t>(
+ extent_mapping_t(lo, val.laddr, val.length));
+}
+
+ExtMapLeafNode::rm_lextent_ret
+ExtMapLeafNode::rm_lextent(ext_context_t ec, objaddr_t lo, lext_map_val_t val)
+{
+ if (!is_pending()) {
+ auto mut = ec.tm.get_mutable_extent(ec.t, this)->cast<ExtMapLeafNode>();
+ return mut->rm_lextent(ec, lo, val);
+ }
+
+ auto [rm_pt, rm_end] = get_leaf_entries(lo, val.length);
+ if (lo == rm_pt->get_key() && val.laddr == rm_pt->get_val().laddr
+ && val.length == rm_pt->get_val().length) {
+ journal_remove(rm_pt, maybe_get_delta_buffer());
+ logger().debug(
+ "ExtMapLeafNode::rm_lextent: removed {}->{} {}",
+ rm_pt.get_key(),
+ rm_pt.get_val().laddr,
+ rm_pt.get_val().length);
+ return rm_lextent_ertr::make_ready_future<bool>(true);
+ } else {
+ return rm_lextent_ertr::make_ready_future<bool>(false);
+ }
+}
+
+ExtMapLeafNode::split_children_ret
+ExtMapLeafNode::make_split_children(ext_context_t ec)
+{
+ logger().debug("{}: {}", "ExtMapLeafNode", __func__);
+ return extmap_alloc_2extents<ExtMapLeafNode>(ec, EXTMAP_BLOCK_SIZE)
+ .safe_then([this] (auto &&ext_pair) {
+ auto [left, right] = ext_pair;
+ return split_children_ret(
+ split_children_ertr::ready_future_marker{},
+ std::make_tuple(left, right, split_into(*left, *right)));
+ });
+}
+
+ExtMapLeafNode::full_merge_ret
+ExtMapLeafNode::make_full_merge(ext_context_t ec, ExtMapNodeRef right)
+{
+ logger().debug("{}: {}", "ExtMapLeafNode", __func__);
+ return extmap_alloc_extent<ExtMapLeafNode>(ec, EXTMAP_BLOCK_SIZE)
+ .safe_then([this, right] (auto &&replacement) {
+ replacement->merge_from(*this, *right->cast<ExtMapLeafNode>());
+ return full_merge_ret(
+ full_merge_ertr::ready_future_marker{},
+ std::move(replacement));
+ });
+}
+ExtMapLeafNode::make_balanced_ret
+ExtMapLeafNode::make_balanced(ext_context_t ec, ExtMapNodeRef _right, bool prefer_left)
+{
+ logger().debug("{}: {}", "ExtMapLeafNode", __func__);
+ ceph_assert(_right->get_type() == type);
+ return extmap_alloc_2extents<ExtMapLeafNode>(ec, EXTMAP_BLOCK_SIZE)
+ .safe_then([this, _right, prefer_left] (auto &&replacement_pair) {
+ auto [replacement_left, replacement_right] = replacement_pair;
+ auto &right = *_right->cast<ExtMapLeafNode>();
+ return make_balanced_ret(
+ make_balanced_ertr::ready_future_marker{},
+ std::make_tuple(
+ replacement_left, replacement_right,
+ balance_into_new_nodes(
+ *this, right, prefer_left,
+ *replacement_left, *replacement_right)));
+ });
+}
+
+
+std::pair<ExtMapLeafNode::internal_iterator_t, ExtMapLeafNode::internal_iterator_t>
+ExtMapLeafNode::get_leaf_entries(objaddr_t addr, extent_len_t len)
+{
+ return bound(addr, addr + len);
+}
+
+
+TransactionManager::read_extent_ertr::future<ExtMapNodeRef>
+extmap_load_extent(ext_context_t ec, laddr_t laddr, depth_t depth)
+{
+ ceph_assert(depth > 0);
+ if (depth > 1) {
+ return ec.tm.read_extents<ExtMapInnerNode>(ec.t, laddr, EXTMAP_BLOCK_SIZE).safe_then(
+ [](auto&& extents) {
+ assert(extents.size() == 1);
+ [[maybe_unused]] auto [laddr, e] = extents.front();
+ return TransactionManager::read_extent_ertr::make_ready_future<ExtMapNodeRef>(std::move(e));
+ });
+ } else {
+ return ec.tm.read_extents<ExtMapLeafNode>(ec.t, laddr, EXTMAP_BLOCK_SIZE).safe_then(
+ [](auto&& extents) {
+ assert(extents.size() == 1);
+ [[maybe_unused]] auto [laddr, e] = extents.front();
+ return TransactionManager::read_extent_ertr::make_ready_future<ExtMapNodeRef>(std::move(e));
+ });
+ }
+}
+
+}
diff --git a/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.h b/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.h
new file mode 100644
index 000000000..f5da8cdc2
--- /dev/null
+++ b/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.h
@@ -0,0 +1,281 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+#include "include/buffer.h"
+
+#include "crimson/common/fixed_kv_node_layout.h"
+#include "crimson/common/errorator.h"
+#include "crimson/os/seastore/extentmap_manager.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node.h"
+
+namespace crimson::os::seastore::extentmap_manager {
+
+struct extmap_node_meta_le_t {
+ depth_le_t depth = init_les32(0);
+
+ extmap_node_meta_le_t() = default;
+ extmap_node_meta_le_t(const extmap_node_meta_le_t &) = default;
+ explicit extmap_node_meta_le_t(const extmap_node_meta_t &val)
+ : depth(init_les32(val.depth)) {}
+
+ operator extmap_node_meta_t() const {
+ return extmap_node_meta_t{ depth };
+ }
+};
+
+/**
+ * ExtMapInnerNode
+ *
+ * Abstracts operations on and layout of internal nodes for the
+ * Extentmap Tree.
+ *
+ * Layout (4k):
+ * num_entries: uint32_t 4b
+ * meta : depth 4b
+ * (padding) : 8b
+ * keys : objaddr_t[340] (340*4)b
+ * values : laddr_t[340] (340*8)b
+ * = 4096
+ */
+constexpr size_t INNER_NODE_CAPACITY =
+ (EXTMAP_BLOCK_SIZE - sizeof(uint32_t) - sizeof(extmap_node_meta_t))
+ / (sizeof (objaddr_t) + sizeof(laddr_t));
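Working through the arithmetic behind the layout comment above, assuming a 4 KiB EXTMAP_BLOCK_SIZE, a 4-byte objaddr_t and an 8-byte laddr_t (the sizes the comment implies):

    //   INNER_NODE_CAPACITY = (4096 - 4 - 4) / (4 + 8)
    //                       = 4088 / 12
    //                       = 340   (integer division)
    // which matches the keys[340] / values[340] figures in the layout comment.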
+
+struct ExtMapInnerNode
+ : ExtMapNode,
+ common::FixedKVNodeLayout<
+ INNER_NODE_CAPACITY,
+ extmap_node_meta_t, extmap_node_meta_le_t,
+ objaddr_t, ceph_le32,
+ laddr_t, laddr_le_t> {
+ using internal_iterator_t = const_iterator;
+ template <typename... T>
+ ExtMapInnerNode(T&&... t) :
+ ExtMapNode(std::forward<T>(t)...),
+ FixedKVNodeLayout(get_bptr().c_str()) {}
+
+ static constexpr extent_types_t type = extent_types_t::EXTMAP_INNER;
+
+ extmap_node_meta_t get_node_meta() const final {return get_meta();}
+
+ CachedExtentRef duplicate_for_write() final {
+ assert(delta_buffer.empty());
+ return CachedExtentRef(new ExtMapInnerNode(*this));
+ };
+
+ delta_buffer_t delta_buffer;
+ delta_buffer_t *maybe_get_delta_buffer() {
+ return is_mutation_pending() ? &delta_buffer : nullptr;
+ }
+
+ find_lextent_ret find_lextent(ext_context_t ec, objaddr_t lo, extent_len_t len) final;
+
+ insert_ret insert(ext_context_t ec, objaddr_t lo, lext_map_val_t val) final;
+
+ rm_lextent_ret rm_lextent(ext_context_t ec, objaddr_t lo, lext_map_val_t val) final;
+
+ split_children_ret make_split_children(ext_context_t ec) final;
+
+ full_merge_ret make_full_merge(ext_context_t ec, ExtMapNodeRef right) final;
+
+ make_balanced_ret make_balanced(ext_context_t ec, ExtMapNodeRef _right, bool prefer_left) final;
+
+ std::ostream &print_detail_l(std::ostream &out) const final;
+
+ extent_types_t get_type() const final {
+ return type;
+ }
+
+ ceph::bufferlist get_delta() final {
+ assert(!delta_buffer.empty());
+ ceph::buffer::ptr bptr(delta_buffer.get_bytes());
+ delta_buffer.copy_out(bptr.c_str(), bptr.length());
+ ceph::bufferlist bl;
+ bl.push_back(bptr);
+ return bl;
+ }
+
+ void apply_delta(const ceph::bufferlist &_bl) final {
+ assert(_bl.length());
+ ceph::bufferlist bl = _bl;
+ bl.rebuild();
+ delta_buffer_t buffer;
+ buffer.copy_in(bl.front().c_str(), bl.front().length());
+ buffer.replay(*this);
+ }
+
+ bool at_max_capacity() const final {
+ return get_size() == get_capacity();
+ }
+
+ bool at_min_capacity() const {
+ return get_size() == get_capacity() / 2;
+ }
+
+ unsigned get_node_size() const {
+ return get_size();
+ }
+
+ /* get a pair of iterators spanning the entries that overlap [l, r)
+ */
+ std::pair<internal_iterator_t, internal_iterator_t> bound(
+ objaddr_t l, objaddr_t r) {
+ auto retl = begin();
+ for (; retl != end(); ++retl) {
+ if (retl->get_next_key_or_max() > l)
+ break;
+ }
+ auto retr = retl;
+ for (; retr != end(); ++retr) {
+ if (retr->get_key() >= r)
+ break;
+ }
+ return {retl, retr};
+ }
+
+ using split_entry_ertr = TransactionManager::read_extent_ertr;
+ using split_entry_ret = split_entry_ertr::future<ExtMapNodeRef>;
+ split_entry_ret split_entry(ext_context_t ec, objaddr_t lo,
+ internal_iterator_t, ExtMapNodeRef entry);
+ using merge_entry_ertr = TransactionManager::read_extent_ertr;
+ using merge_entry_ret = merge_entry_ertr::future<ExtMapNodeRef>;
+ merge_entry_ret merge_entry(ext_context_t ec, objaddr_t lo,
+ internal_iterator_t iter, ExtMapNodeRef entry);
+ internal_iterator_t get_containing_child(objaddr_t lo);
+
+};
+
+/**
+ * ExtMapLeafNode
+ *
+ * Abstracts operations on and layout of leaf nodes for the
+ * ExtentMap Tree.
+ *
+ * Layout (4k):
+ * num_entries: uint32_t 4b
+ * meta : depth 4b
+ * (padding) : 8b
+ * keys : objaddr_t[204] (204*4)b
+ * values : lext_map_val_t[204] (204*16)b
+ * = 4096
+ */
+constexpr size_t LEAF_NODE_CAPACITY =
+ (EXTMAP_BLOCK_SIZE - sizeof(uint32_t) - sizeof(extmap_node_meta_t))
+ / (sizeof(objaddr_t) + sizeof(lext_map_val_t));
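The same arithmetic for the leaf layout, assuming a 4 KiB block, a 4-byte objaddr_t and a 16-byte lext_map_val_t (8-byte laddr plus length and padding, per the comment):

    //   LEAF_NODE_CAPACITY = (4096 - 4 - 4) / (4 + 16)
    //                      = 4088 / 20
    //                      = 204   (integer division)
    // consistent with the keys[204] / values[204] figures above.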
+
+struct lext_map_val_le_t {
+ laddr_le_t laddr;
+ extent_len_le_t length = init_extent_len_le_t(0);
+
+ lext_map_val_le_t() = default;
+ lext_map_val_le_t(const lext_map_val_le_t &) = default;
+ explicit lext_map_val_le_t(const lext_map_val_t &val)
+ : laddr(laddr_le_t(val.laddr)),
+ length(init_extent_len_le_t(val.length)) {}
+
+ operator lext_map_val_t() const {
+ return lext_map_val_t{laddr, length};
+ }
+};
+
+struct ExtMapLeafNode
+ : ExtMapNode,
+ common::FixedKVNodeLayout<
+ LEAF_NODE_CAPACITY,
+ extmap_node_meta_t, extmap_node_meta_le_t,
+ objaddr_t, ceph_le32,
+ lext_map_val_t, lext_map_val_le_t> {
+ using internal_iterator_t = const_iterator;
+ template <typename... T>
+ ExtMapLeafNode(T&&... t) :
+ ExtMapNode(std::forward<T>(t)...),
+ FixedKVNodeLayout(get_bptr().c_str()) {}
+
+ static constexpr extent_types_t type = extent_types_t::EXTMAP_LEAF;
+
+ extmap_node_meta_t get_node_meta() const final { return get_meta(); }
+
+ CachedExtentRef duplicate_for_write() final {
+ assert(delta_buffer.empty());
+ return CachedExtentRef(new ExtMapLeafNode(*this));
+ };
+
+ delta_buffer_t delta_buffer;
+ delta_buffer_t *maybe_get_delta_buffer() {
+ return is_mutation_pending() ? &delta_buffer : nullptr;
+ }
+
+ find_lextent_ret find_lextent(ext_context_t ec, objaddr_t lo, extent_len_t len) final;
+
+ insert_ret insert(ext_context_t ec, objaddr_t lo, lext_map_val_t val) final;
+
+ rm_lextent_ret rm_lextent(ext_context_t ec, objaddr_t lo, lext_map_val_t val) final;
+
+ split_children_ret make_split_children(ext_context_t ec) final;
+
+ full_merge_ret make_full_merge(ext_context_t ec, ExtMapNodeRef right) final;
+
+ make_balanced_ret make_balanced(ext_context_t ec, ExtMapNodeRef _right, bool prefer_left) final;
+
+ extent_types_t get_type() const final {
+ return type;
+ }
+
+ ceph::bufferlist get_delta() final {
+ assert(!delta_buffer.empty());
+ ceph::buffer::ptr bptr(delta_buffer.get_bytes());
+ delta_buffer.copy_out(bptr.c_str(), bptr.length());
+ ceph::bufferlist bl;
+ bl.push_back(bptr);
+ return bl;
+ }
+
+ void apply_delta(const ceph::bufferlist &_bl) final {
+ assert(_bl.length());
+ ceph::bufferlist bl = _bl;
+ bl.rebuild();
+ delta_buffer_t buffer;
+ buffer.copy_in(bl.front().c_str(), bl.front().length());
+ buffer.replay(*this);
+ }
+
+ std::ostream &print_detail_l(std::ostream &out) const final;
+
+ bool at_max_capacity() const final {
+ return get_size() == get_capacity();
+ }
+
+ bool at_min_capacity() const final {
+ return get_size() == get_capacity() / 2;
+ }
+
+ unsigned get_node_size() const {
+ return get_size();
+ }
+
+ /* get a pair of iterators spanning the entries that overlap [l, r)
+ */
+ std::pair<internal_iterator_t, internal_iterator_t> bound(
+ objaddr_t l, objaddr_t r) {
+ auto retl = begin();
+ for (; retl != end(); ++retl) {
+ if (retl->get_key() >= l || (retl->get_key() + retl->get_val().length) > l)
+ break;
+ }
+ auto retr = retl;
+ for (; retr != end(); ++retr) {
+ if (retr->get_key() >= r)
+ break;
+ }
+ return {retl, retr};
+ }
+
+ std::pair<internal_iterator_t, internal_iterator_t>
+ get_leaf_entries(objaddr_t lo, extent_len_t len);
+
+};
+using ExtentMapLeafNodeRef = TCachedExtentRef<ExtMapLeafNode>;
+
+}
diff --git a/src/crimson/os/seastore/journal.cc b/src/crimson/os/seastore/journal.cc
new file mode 100644
index 000000000..39875fb56
--- /dev/null
+++ b/src/crimson/os/seastore/journal.cc
@@ -0,0 +1,756 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <iostream>
+
+#include <boost/iterator/counting_iterator.hpp>
+
+#include "crimson/os/seastore/journal.h"
+
+#include "include/intarith.h"
+#include "crimson/os/seastore/segment_manager.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+}
+
+namespace crimson::os::seastore {
+
+std::ostream &operator<<(std::ostream &out, const segment_header_t &header)
+{
+ return out << "segment_header_t("
+ << "segment_seq=" << header.journal_segment_seq
+ << ", physical_segment_id=" << header.physical_segment_id
+ << ", journal_tail=" << header.journal_tail
+ << ", segment_nonce=" << header.segment_nonce
+ << ")";
+}
+
+segment_nonce_t generate_nonce(
+ segment_seq_t seq,
+ const seastore_meta_t &meta)
+{
+ return ceph_crc32c(
+ seq,
+ reinterpret_cast<const unsigned char *>(meta.seastore_id.bytes()),
+ sizeof(meta.seastore_id.uuid));
+}
+
+Journal::Journal(SegmentManager &segment_manager)
+ : block_size(segment_manager.get_block_size()),
+ max_record_length(
+ segment_manager.get_segment_size() -
+ p2align(ceph::encoded_sizeof_bounded<segment_header_t>(),
+ size_t(block_size))),
+ segment_manager(segment_manager) {}
+
+
+Journal::initialize_segment_ertr::future<segment_seq_t>
+Journal::initialize_segment(Segment &segment)
+{
+ auto new_tail = segment_provider->get_journal_tail_target();
+ logger().debug(
+ "initialize_segment {} journal_tail_target {}",
+ segment.get_segment_id(),
+ new_tail);
+ // write out header
+ ceph_assert(segment.get_write_ptr() == 0);
+ bufferlist bl;
+
+ segment_seq_t seq = next_journal_segment_seq++;
+ current_segment_nonce = generate_nonce(
+ seq, segment_manager.get_meta());
+ auto header = segment_header_t{
+ seq,
+ segment.get_segment_id(),
+ segment_provider->get_journal_tail_target(),
+ current_segment_nonce};
+ encode(header, bl);
+
+ bufferptr bp(
+ ceph::buffer::create_page_aligned(
+ segment_manager.get_block_size()));
+ bp.zero();
+ auto iter = bl.cbegin();
+ iter.copy(bl.length(), bp.c_str());
+ bl.clear();
+ bl.append(bp);
+
+ written_to = segment_manager.get_block_size();
+ committed_to = 0;
+ return segment.write(0, bl).safe_then(
+ [=] {
+ segment_provider->update_journal_tail_committed(new_tail);
+ return seq;
+ },
+ initialize_segment_ertr::pass_further{},
+ crimson::ct_error::assert_all{ "TODO" });
+}
+
+ceph::bufferlist Journal::encode_record(
+ record_size_t rsize,
+ record_t &&record)
+{
+ bufferlist data_bl;
+ for (auto &i: record.extents) {
+ data_bl.append(i.bl);
+ }
+
+ bufferlist bl;
+ record_header_t header{
+ rsize.mdlength,
+ rsize.dlength,
+ (uint32_t)record.deltas.size(),
+ (uint32_t)record.extents.size(),
+ current_segment_nonce,
+ committed_to,
+ data_bl.crc32c(-1)
+ };
+ encode(header, bl);
+
+ auto metadata_crc_filler = bl.append_hole(sizeof(uint32_t));
+
+ for (const auto &i: record.extents) {
+ encode(extent_info_t(i), bl);
+ }
+ for (const auto &i: record.deltas) {
+ encode(i, bl);
+ }
+ if (bl.length() % block_size != 0) {
+ bl.append_zero(
+ block_size - (bl.length() % block_size));
+ }
+ ceph_assert(bl.length() == rsize.mdlength);
+
+
+ auto bliter = bl.cbegin();
+ auto metadata_crc = bliter.crc32c(
+ ceph::encoded_sizeof_bounded<record_header_t>(),
+ -1);
+ bliter += sizeof(checksum_t); /* crc hole again */
+ metadata_crc = bliter.crc32c(
+ bliter.get_remaining(),
+ metadata_crc);
+ ceph_le32 metadata_crc_le;
+ metadata_crc_le = metadata_crc;
+ metadata_crc_filler.copy_in(
+ sizeof(checksum_t),
+ reinterpret_cast<const char *>(&metadata_crc_le));
+
+ bl.claim_append(data_bl);
+ ceph_assert(bl.length() == (rsize.dlength + rsize.mdlength));
+
+ return bl;
+}
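For reference, the on-disk record layout this function produces, in the order of the encode calls above (metadata portion first, zero-padded to a block boundary, then the raw extent data):

    //   +----------------------------------------------+
    //   | record_header_t                              |
    //   | metadata crc                    (4 bytes)    |
    //   | extent_info_t  x header.extents              |
    //   | delta_info_t   x header.deltas               |
    //   | zero padding to block boundary               |  <- mdlength bytes
    //   +----------------------------------------------+
    //   | extent data (record.extents[i].bl, in order) |  <- dlength bytes
    //   +----------------------------------------------+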
+
+bool Journal::validate_metadata(const bufferlist &bl)
+{
+ auto bliter = bl.cbegin();
+ auto test_crc = bliter.crc32c(
+ ceph::encoded_sizeof_bounded<record_header_t>(),
+ -1);
+ ceph_le32 recorded_crc_le;
+ ::decode(recorded_crc_le, bliter);
+ uint32_t recorded_crc = recorded_crc_le;
+ test_crc = bliter.crc32c(
+ bliter.get_remaining(),
+ test_crc);
+ return test_crc == recorded_crc;
+}
+
+Journal::read_validate_data_ret Journal::read_validate_data(
+ paddr_t record_base,
+ const record_header_t &header)
+{
+ return segment_manager.read(
+ record_base.add_offset(header.mdlength),
+ header.dlength
+ ).safe_then([=, &header](auto bptr) {
+ bufferlist bl;
+ bl.append(bptr);
+ return bl.crc32c(-1) == header.data_crc;
+ });
+}
+
+Journal::write_record_ret Journal::write_record(
+ record_size_t rsize,
+ record_t &&record)
+{
+ ceph::bufferlist to_write = encode_record(
+ rsize, std::move(record));
+ auto target = written_to;
+ assert((to_write.length() % block_size) == 0);
+ written_to += to_write.length();
+ logger().debug(
+ "write_record, mdlength {}, dlength {}, target {}",
+ rsize.mdlength,
+ rsize.dlength,
+ target);
+ return current_journal_segment->write(target, to_write).handle_error(
+ write_record_ertr::pass_further{},
+ crimson::ct_error::assert_all{ "TODO" }).safe_then([this, target] {
+ committed_to = target;
+ return write_record_ret(
+ write_record_ertr::ready_future_marker{},
+ paddr_t{
+ current_journal_segment->get_segment_id(),
+ target});
+ });
+}
+
+Journal::record_size_t Journal::get_encoded_record_length(
+ const record_t &record) const {
+ extent_len_t metadata =
+ (extent_len_t)ceph::encoded_sizeof_bounded<record_header_t>();
+ metadata += sizeof(checksum_t) /* crc */;
+ metadata += record.extents.size() *
+ ceph::encoded_sizeof_bounded<extent_info_t>();
+ extent_len_t data = 0;
+ for (const auto &i: record.deltas) {
+ metadata += ceph::encoded_sizeof(i);
+ }
+ for (const auto &i: record.extents) {
+ data += i.bl.length();
+ }
+ metadata = p2roundup(metadata, block_size);
+ return record_size_t{metadata, data};
+}
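A worked example, assuming a hypothetical 4 KiB block size: a record carrying a single 4 KiB extent and a few small deltas has a metadata portion well under one block, so the record occupies two blocks on disk:

    //   metadata = encoded record_header_t + 4 (crc)
    //            + 1 * encoded extent_info_t + sum(encoded deltas)  << 4096
    //   mdlength = p2roundup(metadata, 4096) = 4096
    //   dlength  = 4096                       (the single extent's buffer)
    //   record length = mdlength + dlength    = 8192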
+
+bool Journal::needs_roll(segment_off_t length) const
+{
+ return length + written_to >
+ current_journal_segment->get_write_capacity();
+}
+
+Journal::roll_journal_segment_ertr::future<segment_seq_t>
+Journal::roll_journal_segment()
+{
+ auto old_segment_id = current_journal_segment ?
+ current_journal_segment->get_segment_id() :
+ NULL_SEG_ID;
+
+ return (current_journal_segment ?
+ current_journal_segment->close() :
+ Segment::close_ertr::now()).safe_then([this] {
+ return segment_provider->get_segment();
+ }).safe_then([this](auto segment) {
+ return segment_manager.open(segment);
+ }).safe_then([this](auto sref) {
+ current_journal_segment = sref;
+ written_to = 0;
+ return initialize_segment(*current_journal_segment);
+ }).safe_then([=](auto seq) {
+ if (old_segment_id != NULL_SEG_ID) {
+ segment_provider->close_segment(old_segment_id);
+ }
+ segment_provider->set_journal_segment(
+ current_journal_segment->get_segment_id(),
+ seq);
+ return seq;
+ }).handle_error(
+ roll_journal_segment_ertr::pass_further{},
+ crimson::ct_error::all_same_way([] { ceph_assert(0 == "TODO"); })
+ );
+}
+
+Journal::read_segment_header_ret
+Journal::read_segment_header(segment_id_t segment)
+{
+ return segment_manager.read(paddr_t{segment, 0}, block_size
+ ).handle_error(
+ read_segment_header_ertr::pass_further{},
+ crimson::ct_error::assert_all{}
+ ).safe_then([=](bufferptr bptr) -> read_segment_header_ret {
+ logger().debug("segment {} bptr size {}", segment, bptr.length());
+
+ segment_header_t header;
+ bufferlist bl;
+ bl.push_back(bptr);
+
+ logger().debug(
+ "Journal::read_segment_header: segment {} block crc {}",
+ segment,
+ bl.begin().crc32c(block_size, 0));
+
+ auto bp = bl.cbegin();
+ try {
+ decode(header, bp);
+ } catch (ceph::buffer::error &e) {
+ logger().debug(
+ "Journal::read_segment_header: segment {} unable to decode "
+ "header, skipping",
+ segment);
+ return crimson::ct_error::enodata::make();
+ }
+ logger().debug(
+ "Journal::read_segment_header: segment {} header {}",
+ segment,
+ header);
+ return read_segment_header_ret(
+ read_segment_header_ertr::ready_future_marker{},
+ header);
+ });
+}
+
+Journal::open_for_write_ret Journal::open_for_write()
+{
+ return roll_journal_segment().safe_then([this](auto seq) {
+ return open_for_write_ret(
+ open_for_write_ertr::ready_future_marker{},
+ journal_seq_t{
+ seq,
+ paddr_t{
+ current_journal_segment->get_segment_id(),
+ static_cast<segment_off_t>(block_size)}
+ });
+ });
+}
+
+Journal::find_replay_segments_fut Journal::find_replay_segments()
+{
+ return seastar::do_with(
+ std::vector<std::pair<segment_id_t, segment_header_t>>(),
+ [this](auto &&segments) mutable {
+ return crimson::do_for_each(
+ boost::make_counting_iterator(segment_id_t{0}),
+ boost::make_counting_iterator(segment_manager.get_num_segments()),
+ [this, &segments](auto i) {
+ return read_segment_header(i
+ ).safe_then([this, &segments, i](auto header) mutable {
+ if (generate_nonce(
+ header.journal_segment_seq,
+ segment_manager.get_meta()) != header.segment_nonce) {
+ logger().debug(
+ "find_replay_segments: nonce mismatch segment {} header {}",
+ i,
+ header);
+ assert(0 == "impossible");
+ return find_replay_segments_ertr::now();
+ }
+
+ segments.emplace_back(i, std::move(header));
+ return find_replay_segments_ertr::now();
+ }).handle_error(
+ crimson::ct_error::enoent::handle([i](auto) {
+ logger().debug(
+ "find_replay_segments: segment {} not available for read",
+ i);
+ return find_replay_segments_ertr::now();
+ }),
+ crimson::ct_error::enodata::handle([i](auto) {
+ logger().debug(
+ "find_replay_segments: segment {} header undecodable",
+ i);
+ return find_replay_segments_ertr::now();
+ }),
+ find_replay_segments_ertr::pass_further{},
+ crimson::ct_error::assert_all{}
+ );
+ }).safe_then([this, &segments]() mutable -> find_replay_segments_fut {
+ logger().debug(
+ "find_replay_segments: have {} segments",
+ segments.size());
+ if (segments.empty()) {
+ return crimson::ct_error::input_output_error::make();
+ }
+ std::sort(
+ segments.begin(),
+ segments.end(),
+ [](const auto &lt, const auto &rt) {
+ return lt.second.journal_segment_seq <
+ rt.second.journal_segment_seq;
+ });
+
+ next_journal_segment_seq =
+ segments.rbegin()->second.journal_segment_seq + 1;
+ std::for_each(
+ segments.begin(),
+ segments.end(),
+ [this](auto &seg) {
+ segment_provider->init_mark_segment_closed(
+ seg.first,
+ seg.second.journal_segment_seq);
+ });
+
+ auto journal_tail = segments.rbegin()->second.journal_tail;
+ segment_provider->update_journal_tail_committed(journal_tail);
+ auto replay_from = journal_tail.offset;
+ logger().debug(
+ "Journal::find_replay_segments: journal_tail={}",
+ journal_tail);
+ auto from = segments.begin();
+ if (replay_from != P_ADDR_NULL) {
+ from = std::find_if(
+ segments.begin(),
+ segments.end(),
+ [&replay_from](const auto &seg) -> bool {
+ return seg.first == replay_from.segment;
+ });
+ if (from->second.journal_segment_seq != journal_tail.segment_seq) {
+ logger().error(
+ "find_replay_segments: journal_tail {} does not match {}",
+ journal_tail,
+ from->second);
+ assert(0 == "invalid");
+ }
+ } else {
+ replay_from = paddr_t{from->first, (segment_off_t)block_size};
+ }
+ auto ret = replay_segments_t(segments.end() - from);
+ std::transform(
+ from, segments.end(), ret.begin(),
+ [this](const auto &p) {
+ auto ret = journal_seq_t{
+ p.second.journal_segment_seq,
+ paddr_t{p.first, (segment_off_t)block_size}};
+ logger().debug(
+ "Journal::find_replay_segments: replaying from {}",
+ ret);
+ return std::make_pair(ret, p.second);
+ });
+ ret[0].first.offset = replay_from;
+ return find_replay_segments_fut(
+ find_replay_segments_ertr::ready_future_marker{},
+ std::move(ret));
+ });
+ });
+}
+
+Journal::read_validate_record_metadata_ret Journal::read_validate_record_metadata(
+ paddr_t start,
+ segment_nonce_t nonce)
+{
+ if (start.offset + block_size > (int64_t)segment_manager.get_segment_size()) {
+ return read_validate_record_metadata_ret(
+ read_validate_record_metadata_ertr::ready_future_marker{},
+ std::nullopt);
+ }
+ return segment_manager.read(start, block_size
+ ).safe_then(
+ [=](bufferptr bptr) mutable
+ -> read_validate_record_metadata_ret {
+ logger().debug("read_validate_record_metadata: reading {}", start);
+ bufferlist bl;
+ bl.append(bptr);
+ auto bp = bl.cbegin();
+ record_header_t header;
+ try {
+ decode(header, bp);
+ } catch (ceph::buffer::error &e) {
+ return read_validate_record_metadata_ret(
+ read_validate_record_metadata_ertr::ready_future_marker{},
+ std::nullopt);
+ }
+ if (header.segment_nonce != nonce) {
+ return read_validate_record_metadata_ret(
+ read_validate_record_metadata_ertr::ready_future_marker{},
+ std::nullopt);
+ }
+ if (header.mdlength > block_size) {
+ if (start.offset + header.mdlength >
+ (int64_t)segment_manager.get_segment_size()) {
+ return crimson::ct_error::input_output_error::make();
+ }
+ return segment_manager.read(
+ {start.segment, start.offset + (segment_off_t)block_size},
+ header.mdlength - block_size).safe_then(
+ [header=std::move(header), bl=std::move(bl)](
+ auto &&bptail) mutable {
+ bl.push_back(bptail);
+ return read_validate_record_metadata_ret(
+ read_validate_record_metadata_ertr::ready_future_marker{},
+ std::make_pair(std::move(header), std::move(bl)));
+ });
+ } else {
+ return read_validate_record_metadata_ret(
+ read_validate_record_metadata_ertr::ready_future_marker{},
+ std::make_pair(std::move(header), std::move(bl))
+ );
+ }
+ }).safe_then([=](auto p) {
+ if (p && validate_metadata(p->second)) {
+ return read_validate_record_metadata_ret(
+ read_validate_record_metadata_ertr::ready_future_marker{},
+ std::move(*p)
+ );
+ } else {
+ return read_validate_record_metadata_ret(
+ read_validate_record_metadata_ertr::ready_future_marker{},
+ std::nullopt);
+ }
+ });
+}
+
+std::optional<std::vector<delta_info_t>> Journal::try_decode_deltas(
+ record_header_t header,
+ const bufferlist &bl)
+{
+ auto bliter = bl.cbegin();
+ bliter += ceph::encoded_sizeof_bounded<record_header_t>();
+ bliter += sizeof(checksum_t) /* crc */;
+ bliter += header.extents * ceph::encoded_sizeof_bounded<extent_info_t>();
+ logger().debug("{}: decoding {} deltas", __func__, header.deltas);
+ std::vector<delta_info_t> deltas(header.deltas);
+ for (auto &&i : deltas) {
+ try {
+ decode(i, bliter);
+ } catch (ceph::buffer::error &e) {
+ return std::nullopt;
+ }
+ }
+ return deltas;
+}
+
+std::optional<std::vector<extent_info_t>> Journal::try_decode_extent_infos(
+ record_header_t header,
+ const bufferlist &bl)
+{
+ auto bliter = bl.cbegin();
+ bliter += ceph::encoded_sizeof_bounded<record_header_t>();
+ bliter += sizeof(checksum_t) /* crc */;
+ logger().debug("{}: decoding {} extents", __func__, header.extents);
+ std::vector<extent_info_t> extent_infos(header.extents);
+ for (auto &&i : extent_infos) {
+ try {
+ decode(i, bliter);
+ } catch (ceph::buffer::error &e) {
+ return std::nullopt;
+ }
+ }
+ return extent_infos;
+}
+
+Journal::replay_ertr::future<>
+Journal::replay_segment(
+ journal_seq_t seq,
+ segment_header_t header,
+ delta_handler_t &handler)
+{
+ logger().debug("replay_segment: starting at {}", seq);
+ return seastar::do_with(
+ scan_valid_records_cursor(seq.offset),
+ found_record_handler_t(
+ [=, &handler](paddr_t base,
+ const record_header_t &header,
+ const bufferlist &mdbuf) {
+ auto deltas = try_decode_deltas(
+ header,
+ mdbuf);
+ if (!deltas) {
+ // This should be impossible, we did check the crc on the mdbuf
+ logger().error(
+ "Journal::replay_segment unable to decode deltas for record {}",
+ base);
+ assert(deltas);
+ }
+
+ return seastar::do_with(
+ std::move(*deltas),
+ [=](auto &deltas) {
+ return crimson::do_for_each(
+ deltas,
+ [=](auto &delta) {
+ /* The journal may validly contain deltas for extents in
+ * since-released segments. We can detect those cases by
+ * checking whether the segment in question currently has a
+ * sequence number > the current journal segment seq. We can
+ * safely skip these deltas because the extent must already
+ * have been rewritten.
+ *
+ * Note, this comparison exploits the fact that
+ * SEGMENT_SEQ_NULL is a large number.
+ */
+ if (delta.paddr != P_ADDR_NULL &&
+ (segment_provider->get_seq(delta.paddr.segment) >
+ seq.segment_seq)) {
+ return replay_ertr::now();
+ } else {
+ return handler(
+ journal_seq_t{seq.segment_seq, base},
+ base.add_offset(header.mdlength),
+ delta);
+ }
+ });
+ });
+ }),
+ [=](auto &cursor, auto &dhandler) {
+ return scan_valid_records(
+ cursor,
+ header.segment_nonce,
+ std::numeric_limits<size_t>::max(),
+ dhandler).safe_then([](auto){});
+ });
+}
+
+Journal::replay_ret Journal::replay(delta_handler_t &&delta_handler)
+{
+ return seastar::do_with(
+ std::move(delta_handler), replay_segments_t(),
+ [this](auto &handler, auto &segments) mutable -> replay_ret {
+ return find_replay_segments().safe_then(
+ [this, &handler, &segments](auto replay_segs) mutable {
+ logger().debug("replay: found {} segments", replay_segs.size());
+ segments = std::move(replay_segs);
+ return crimson::do_for_each(segments, [this, &handler](auto i) mutable {
+ return replay_segment(i.first, i.second, handler);
+ });
+ });
+ });
+}
+
+Journal::scan_extents_ret Journal::scan_extents(
+ scan_extents_cursor &cursor,
+ extent_len_t bytes_to_read)
+{
+ auto ret = std::make_unique<scan_extents_ret_bare>();
+ auto &retref = *ret;
+ return read_segment_header(cursor.get_offset().segment
+ ).handle_error(
+ scan_extents_ertr::pass_further{},
+ crimson::ct_error::assert_all{}
+ ).safe_then([&](auto segment_header) {
+ auto segment_nonce = segment_header.segment_nonce;
+ return seastar::do_with(
+ found_record_handler_t(
+ [&](
+ paddr_t base,
+ const record_header_t &header,
+ const bufferlist &mdbuf) mutable {
+
+ auto infos = try_decode_extent_infos(
+ header,
+ mdbuf);
+ if (!infos) {
+ // This should be impossible, we did check the crc on the mdbuf
+ logger().error(
+ "Journal::scan_extents unable to decode extents for record {}",
+ base);
+ assert(infos);
+ }
+
+ paddr_t extent_offset = base.add_offset(header.mdlength);
+ for (const auto &i : *infos) {
+ retref.emplace_back(extent_offset, i);
+ extent_offset.offset += i.len;
+ }
+ return scan_extents_ertr::now();
+ }),
+ [=, &cursor](auto &dhandler) {
+ return scan_valid_records(
+ cursor,
+ segment_nonce,
+ std::numeric_limits<size_t>::max(),
+ dhandler).safe_then([](auto){});
+ });
+ }).safe_then([ret=std::move(ret)] {
+ return std::move(*ret);
+ });
+}
+
+Journal::scan_valid_records_ret Journal::scan_valid_records(
+ scan_valid_records_cursor &cursor,
+ segment_nonce_t nonce,
+ size_t budget,
+ found_record_handler_t &handler)
+{
+ if (cursor.offset.offset == 0) {
+ cursor.offset.offset = block_size;
+ }
+ auto retref = std::make_unique<size_t>(0);
+ auto &budget_used = *retref;
+ return crimson::do_until(
+ [=, &cursor, &budget_used, &handler]() mutable
+ -> scan_valid_records_ertr::future<bool> {
+ return [=, &handler, &cursor, &budget_used] {
+ if (!cursor.last_valid_header_found) {
+ return read_validate_record_metadata(cursor.offset, nonce
+ ).safe_then([=, &cursor](auto md) {
+ logger().debug(
+ "Journal::scan_valid_records: read complete {}",
+ cursor.offset);
+ if (!md) {
+ logger().debug(
+ "Journal::scan_valid_records: found invalid header at {}, presumably at end",
+ cursor.offset);
+ cursor.last_valid_header_found = true;
+ return scan_valid_records_ertr::now();
+ } else {
+ logger().debug(
+ "Journal::scan_valid_records: valid record read at {}",
+ cursor.offset);
+ cursor.last_committed = paddr_t{
+ cursor.offset.segment,
+ md->first.committed_to};
+ cursor.pending_records.emplace_back(
+ cursor.offset,
+ md->first,
+ md->second);
+ cursor.offset.offset +=
+ md->first.dlength + md->first.mdlength;
+ return scan_valid_records_ertr::now();
+ }
+ }).safe_then([=, &cursor, &budget_used, &handler] {
+ return crimson::do_until(
+ [=, &budget_used, &cursor, &handler] {
+ logger().debug(
+ "Journal::scan_valid_records: valid record read, processing queue");
+ if (cursor.pending_records.empty()) {
+ /* This is only possible if the segment is empty.
+ * A record's last_committed must be prior to its own
+ * location, since the record cannot have been committed
+ * at its own time of submission. Thus, the most recently
+ * read record must always fall after cursor.last_committed */
+ return scan_valid_records_ertr::make_ready_future<bool>(true);
+ }
+ auto &next = cursor.pending_records.front();
+ if (next.offset > cursor.last_committed) {
+ return scan_valid_records_ertr::make_ready_future<bool>(true);
+ }
+ budget_used +=
+ next.header.dlength + next.header.mdlength;
+ return handler(
+ next.offset,
+ next.header,
+ next.mdbuffer
+ ).safe_then([&cursor] {
+ cursor.pending_records.pop_front();
+ return scan_valid_records_ertr::make_ready_future<bool>(false);
+ });
+ });
+ });
+ } else {
+ assert(!cursor.pending_records.empty());
+ auto &next = cursor.pending_records.front();
+ return read_validate_data(next.offset, next.header
+ ).safe_then([=, &budget_used, &next, &cursor, &handler](auto valid) {
+ if (!valid) {
+ cursor.pending_records.clear();
+ return scan_valid_records_ertr::now();
+ }
+ budget_used +=
+ next.header.dlength + next.header.mdlength;
+ return handler(
+ next.offset,
+ next.header,
+ next.mdbuffer
+ ).safe_then([&cursor] {
+ cursor.pending_records.pop_front();
+ return scan_valid_records_ertr::now();
+ });
+ });
+ }
+ }().safe_then([=, &budget_used, &cursor] {
+ return scan_valid_records_ertr::make_ready_future<bool>(
+ cursor.is_complete() || budget_used >= budget);
+ });
+ }).safe_then([retref=std::move(retref)]() mutable -> scan_valid_records_ret {
+ return scan_valid_records_ret(
+ scan_valid_records_ertr::ready_future_marker{},
+ std::move(*retref));
+ });
+}
+
+
+}
diff --git a/src/crimson/os/seastore/journal.h b/src/crimson/os/seastore/journal.h
new file mode 100644
index 000000000..7424d78b3
--- /dev/null
+++ b/src/crimson/os/seastore/journal.h
@@ -0,0 +1,405 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/common/log.h"
+
+#include <boost/intrusive_ptr.hpp>
+
+#include <seastar/core/future.hh>
+
+#include "include/ceph_assert.h"
+#include "include/buffer.h"
+#include "include/denc.h"
+
+#include "crimson/os/seastore/segment_manager.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/osd/exceptions.h"
+
+namespace crimson::os::seastore {
+
+using segment_nonce_t = uint32_t;
+
+
+/**
+ * Segment header
+ *
+ * Every segment contains an encoded segment_header_t in its first block.
+ * Our strategy for finding the journal replay point is:
+ * 1) Find the segment with the highest journal_segment_seq
+ * 2) Replay starting at record located at that segment's journal_tail
+ */
+struct segment_header_t {
+ segment_seq_t journal_segment_seq;
+ segment_id_t physical_segment_id; // debugging
+
+ journal_seq_t journal_tail;
+ segment_nonce_t segment_nonce;
+
+ DENC(segment_header_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.journal_segment_seq, p);
+ denc(v.physical_segment_id, p);
+ denc(v.journal_tail, p);
+ denc(v.segment_nonce, p);
+ DENC_FINISH(p);
+ }
+};
+std::ostream &operator<<(std::ostream &out, const segment_header_t &header);
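A minimal sketch of the replay-point selection the comment above describes; Journal::find_replay_segments() in journal.cc is the real implementation. `headers` is a hypothetical, non-empty vector of decoded segment headers, and <algorithm>/<vector> are assumed available:

    segment_header_t pick_replay_head(
      const std::vector<segment_header_t> &headers)
    {
      // 1) the segment with the highest journal_segment_seq ...
      auto head = *std::max_element(
        headers.begin(), headers.end(),
        [](const auto &a, const auto &b) {
          return a.journal_segment_seq < b.journal_segment_seq;
        });
      // 2) ... tells us where replay starts: its journal_tail
      return head;  // replay begins at head.journal_tail
    }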
+
+struct record_header_t {
+ // Fixed portion
+ extent_len_t mdlength; // block aligned, length of metadata
+ extent_len_t dlength; // block aligned, length of data
+ uint32_t deltas; // number of deltas
+ uint32_t extents; // number of extents
+ segment_nonce_t segment_nonce;// nonce of containing segment
+ segment_off_t committed_to; // records in this segment prior to committed_to
+ // have been fully written
+ checksum_t data_crc; // crc of data payload
+
+
+ DENC(record_header_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.mdlength, p);
+ denc(v.dlength, p);
+ denc(v.deltas, p);
+ denc(v.extents, p);
+ denc(v.segment_nonce, p);
+ denc(v.committed_to, p);
+ denc(v.data_crc, p);
+ DENC_FINISH(p);
+ }
+};
+
+struct extent_info_t {
+ extent_types_t type = extent_types_t::NONE;
+ laddr_t addr = L_ADDR_NULL;
+ extent_len_t len = 0;
+
+ extent_info_t() = default;
+ extent_info_t(const extent_t &et)
+ : type(et.type), addr(et.addr), len(et.bl.length()) {}
+
+ DENC(extent_info_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.type, p);
+ denc(v.addr, p);
+ denc(v.len, p);
+ DENC_FINISH(p);
+ }
+};
+
+/**
+ * Callback interface for managing available segments
+ */
+class JournalSegmentProvider {
+public:
+ using get_segment_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using get_segment_ret = get_segment_ertr::future<segment_id_t>;
+ virtual get_segment_ret get_segment() = 0;
+
+ virtual void close_segment(segment_id_t) {}
+
+ virtual void set_journal_segment(
+ segment_id_t segment,
+ segment_seq_t seq) {}
+
+ virtual journal_seq_t get_journal_tail_target() const = 0;
+ virtual void update_journal_tail_committed(journal_seq_t tail_committed) = 0;
+
+ virtual void init_mark_segment_closed(
+ segment_id_t segment, segment_seq_t seq) {}
+
+ virtual segment_seq_t get_seq(segment_id_t id) { return 0; }
+
+ virtual ~JournalSegmentProvider() {}
+};
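A hypothetical bare-bones provider, shown only to illustrate the callback contract; in practice the TransactionManager implements this interface and tracks real segment and journal-tail state:

    class TrivialSegmentProvider final : public JournalSegmentProvider {
      segment_id_t next = 0;
      journal_seq_t tail = {};
    public:
      get_segment_ret get_segment() final {
        // hand out segment ids in order -- illustration only, never reclaims
        return get_segment_ertr::make_ready_future<segment_id_t>(next++);
      }
      journal_seq_t get_journal_tail_target() const final { return tail; }
      void update_journal_tail_committed(journal_seq_t committed) final {
        // a real provider would record this for cleaning/trimming decisions
      }
    };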
+
+/**
+ * Manages stream of atomically written records to a SegmentManager.
+ */
+class Journal {
+public:
+ Journal(SegmentManager &segment_manager);
+
+ /**
+ * Sets the JournalSegmentProvider.
+ *
+ * Not provided in constructor to allow the provider to not own
+ * or construct the Journal (TransactionManager).
+ *
+ * Note, Journal does not own this ptr, user must ensure that
+ * *provider outlives Journal.
+ */
+ void set_segment_provider(JournalSegmentProvider *provider) {
+ segment_provider = provider;
+ }
+
+ /**
+ * initializes journal for new writes -- must run prior to calls
+ * to submit_record. Should be called after replay if not a new
+ * Journal.
+ */
+ using open_for_write_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error
+ >;
+ using open_for_write_ret = open_for_write_ertr::future<journal_seq_t>;
+ open_for_write_ret open_for_write();
+
+ /**
+ * close journal
+ *
+ * TODO: should probably flush and disallow further writes
+ */
+ using close_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ close_ertr::future<> close() { return close_ertr::now(); }
+
+ /**
+ * submit_record
+ *
+ * Writes the passed record and returns the paddr of its first data block
+ * together with the journal seq assigned to the record
+ */
+ using submit_record_ertr = crimson::errorator<
+ crimson::ct_error::erange,
+ crimson::ct_error::input_output_error
+ >;
+ using submit_record_ret = submit_record_ertr::future<
+ std::pair<paddr_t, journal_seq_t>
+ >;
+ submit_record_ret submit_record(record_t &&record) {
+ auto rsize = get_encoded_record_length(record);
+ auto total = rsize.mdlength + rsize.dlength;
+ if (total > max_record_length) {
+ return crimson::ct_error::erange::make();
+ }
+ auto roll = needs_roll(total)
+ ? roll_journal_segment().safe_then([](auto){})
+ : roll_journal_segment_ertr::now();
+ return roll.safe_then(
+ [this, rsize, record=std::move(record)]() mutable {
+ return write_record(rsize, std::move(record)
+ ).safe_then([this, rsize](auto addr) {
+ return std::make_pair(
+ addr.add_offset(rsize.mdlength),
+ get_journal_seq(addr));
+ });
+ });
+ }
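A hypothetical call site, assuming a populated record_t `rec` (extents and deltas filled in by the caller) and a Journal instance `journal`:

    journal.submit_record(std::move(rec)
    ).safe_then([](auto submitted) {
      auto [block_base, seq] = submitted;
      // block_base: paddr of the record's first data block
      // seq:        journal sequence assigned to this record
    }).handle_error(
      crimson::ct_error::assert_all{ "submit_record failed" });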
+
+ /**
+ * Read deltas and pass to delta_handler
+ *
+ * record_block_base (the argument to delta_handler) is the start of the
+ * first block in the record
+ */
+ using replay_ertr = SegmentManager::read_ertr;
+ using replay_ret = replay_ertr::future<>;
+ using delta_handler_t = std::function<
+ replay_ret(journal_seq_t seq,
+ paddr_t record_block_base,
+ const delta_info_t&)>;
+ replay_ret replay(delta_handler_t &&delta_handler);
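A hypothetical replay invocation; the lambda follows the delta_handler_t signature above and would normally re-apply each delta to the cache:

    journal.replay(
      [](journal_seq_t seq,
         paddr_t record_block_base,
         const delta_info_t &delta) {
        // apply `delta` here; addresses relative to the record are
        // resolved against record_block_base
        return Journal::replay_ertr::now();
      });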
+
+ /**
+ * scan_extents
+ *
+ * Scans records beginning at addr until the first record boundary after
+ * addr + bytes_to_read.
+ *
+ * Returns list<paddr, extent_info>
+ * cursor.is_complete() will be true when no further extents exist in segment.
+ */
+ class scan_valid_records_cursor;
+ using scan_extents_cursor = scan_valid_records_cursor;
+ using scan_extents_ertr = SegmentManager::read_ertr;
+ using scan_extents_ret_bare = std::list<std::pair<paddr_t, extent_info_t>>;
+ using scan_extents_ret = scan_extents_ertr::future<scan_extents_ret_bare>;
+ scan_extents_ret scan_extents(
+ scan_extents_cursor &cursor,
+ extent_len_t bytes_to_read
+ );
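A hypothetical scan over one segment's extents starting at its first block; `segment` and `segment_size` stand in for values the caller already has, and the cursor is kept alive across the scan with seastar::do_with:

    seastar::do_with(
      Journal::scan_extents_cursor(paddr_t{segment, 0}),
      [&journal, segment_size](auto &cursor) {
        return journal.scan_extents(cursor, segment_size
        ).safe_then([](auto &&extents) {
          for (auto &[addr, info] : extents) {
            // addr: physical position of the extent
            // info: its type, laddr and length
          }
        });
      });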
+
+
+private:
+ const extent_len_t block_size;
+ const extent_len_t max_record_length;
+
+ JournalSegmentProvider *segment_provider = nullptr;
+ SegmentManager &segment_manager;
+
+ segment_seq_t next_journal_segment_seq = 0;
+ segment_nonce_t current_segment_nonce = 0;
+
+ SegmentRef current_journal_segment;
+ segment_off_t written_to = 0;
+ segment_off_t committed_to = 0;
+
+ journal_seq_t get_journal_seq(paddr_t addr) {
+ return journal_seq_t{next_journal_segment_seq-1, addr};
+ }
+
+ /// prepare segment for writes, writes out segment header
+ using initialize_segment_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ initialize_segment_ertr::future<segment_seq_t> initialize_segment(
+ Segment &segment);
+
+ struct record_size_t {
+ extent_len_t mdlength = 0;
+ extent_len_t dlength = 0;
+
+ record_size_t(
+ extent_len_t mdlength,
+ extent_len_t dlength)
+ : mdlength(mdlength), dlength(dlength) {}
+ };
+
+ /**
+ * Return <mdlength, dlength> pair denoting length of
+ * metadata and blocks respectively.
+ */
+ record_size_t get_encoded_record_length(
+ const record_t &record) const;
+
+ /// create encoded record bl
+ ceph::bufferlist encode_record(
+ record_size_t rsize,
+ record_t &&record);
+
+ /// validate embedded metadata checksum
+ static bool validate_metadata(const bufferlist &bl);
+
+ /// read and validate data
+ using read_validate_data_ertr = SegmentManager::read_ertr;
+ using read_validate_data_ret = read_validate_data_ertr::future<bool>;
+ read_validate_data_ret read_validate_data(
+ paddr_t record_base,
+ const record_header_t &header ///< caller must ensure lifetime through
+ /// future resolution
+ );
+
+
+ /// do record write
+ using write_record_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using write_record_ret = write_record_ertr::future<paddr_t>;
+ write_record_ret write_record(
+ record_size_t rsize,
+ record_t &&record);
+
+ /// close current segment and initialize next one
+ using roll_journal_segment_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ roll_journal_segment_ertr::future<segment_seq_t> roll_journal_segment();
+
+ /// returns true iff current segment has insufficient space
+ bool needs_roll(segment_off_t length) const;
+
+ using read_segment_header_ertr = crimson::errorator<
+ crimson::ct_error::enoent,
+ crimson::ct_error::enodata,
+ crimson::ct_error::input_output_error
+ >;
+ using read_segment_header_ret = read_segment_header_ertr::future<
+ segment_header_t>;
+ read_segment_header_ret read_segment_header(segment_id_t segment);
+
+ /// return ordered vector of segments to replay
+ using replay_segments_t = std::vector<
+ std::pair<journal_seq_t, segment_header_t>>;
+ using find_replay_segments_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error
+ >;
+ using find_replay_segments_fut = find_replay_segments_ertr::future<
+ replay_segments_t>;
+ find_replay_segments_fut find_replay_segments();
+
+ /// attempts to decode deltas from bl, return nullopt if unsuccessful
+ std::optional<std::vector<delta_info_t>> try_decode_deltas(
+ record_header_t header,
+ const bufferlist &bl);
+
+ /// attempts to decode extent infos from bl, return nullopt if unsuccessful
+ std::optional<std::vector<extent_info_t>> try_decode_extent_infos(
+ record_header_t header,
+ const bufferlist &bl);
+
+ /// read record metadata for record starting at start
+ using read_validate_record_metadata_ertr = replay_ertr;
+ using read_validate_record_metadata_ret =
+ read_validate_record_metadata_ertr::future<
+ std::optional<std::pair<record_header_t, bufferlist>>
+ >;
+ read_validate_record_metadata_ret read_validate_record_metadata(
+ paddr_t start,
+ segment_nonce_t nonce);
+
+public:
+ /// scan segment for end incrementally
+ struct scan_valid_records_cursor {
+ bool last_valid_header_found = false;
+ paddr_t offset;
+ paddr_t last_committed;
+
+ struct found_record_t {
+ paddr_t offset;
+ record_header_t header;
+ bufferlist mdbuffer;
+
+ found_record_t(
+ paddr_t offset,
+ const record_header_t &header,
+ const bufferlist &mdbuffer)
+ : offset(offset), header(header), mdbuffer(mdbuffer) {}
+ };
+ std::deque<found_record_t> pending_records;
+
+ bool is_complete() const {
+ return last_valid_header_found && pending_records.empty();
+ }
+
+ paddr_t get_offset() const {
+ return offset;
+ }
+
+ scan_valid_records_cursor(
+ paddr_t offset)
+ : offset(offset) {}
+ };
+private:
+
+ using scan_valid_records_ertr = SegmentManager::read_ertr;
+ using scan_valid_records_ret = scan_valid_records_ertr::future<
+ size_t>;
+ using found_record_handler_t = std::function<
+ scan_valid_records_ertr::future<>(
+ paddr_t record_block_base,
+ // callee may assume header and bl will remain valid until
+ // returned future resolves
+ const record_header_t &header,
+ const bufferlist &bl)>;
+ scan_valid_records_ret scan_valid_records(
+ scan_valid_records_cursor &cursor, ///< [in, out] cursor, updated during call
+ segment_nonce_t nonce, ///< [in] nonce for segment
+ size_t budget, ///< [in] max budget to use
+ found_record_handler_t &handler ///< [in] handler for records
+ ); ///< @return used budget
+
+ /// replays records starting at start through end of segment
+ replay_ertr::future<>
+ replay_segment(
+ journal_seq_t start, ///< [in] starting addr, seq
+ segment_header_t header, ///< [in] segment header
+ delta_handler_t &delta_handler ///< [in] processes deltas in order
+ );
+
+};
+
+}
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::segment_header_t)
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::record_header_t)
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::extent_info_t)
diff --git a/src/crimson/os/seastore/lba_manager.cc b/src/crimson/os/seastore/lba_manager.cc
new file mode 100644
index 000000000..73411dcf7
--- /dev/null
+++ b/src/crimson/os/seastore/lba_manager.cc
@@ -0,0 +1,17 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/os/seastore/segment_manager.h"
+#include "crimson/os/seastore/cache.h"
+#include "crimson/os/seastore/lba_manager.h"
+#include "crimson/os/seastore/lba_manager/btree/btree_lba_manager.h"
+
+namespace crimson::os::seastore::lba_manager {
+
+LBAManagerRef create_lba_manager(
+ SegmentManager &segment_manager,
+ Cache &cache) {
+ return LBAManagerRef(new btree::BtreeLBAManager(segment_manager, cache));
+}
+
+}
diff --git a/src/crimson/os/seastore/lba_manager.h b/src/crimson/os/seastore/lba_manager.h
new file mode 100644
index 000000000..ad90f4c4f
--- /dev/null
+++ b/src/crimson/os/seastore/lba_manager.h
@@ -0,0 +1,207 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include <seastar/core/future.hh>
+
+#include "include/ceph_assert.h"
+#include "include/buffer_fwd.h"
+#include "include/interval_set.h"
+#include "common/interval_map.h"
+
+#include "crimson/osd/exceptions.h"
+
+#include "crimson/os/seastore/cache.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/segment_manager.h"
+
+namespace crimson::os::seastore {
+
+/**
+ * Abstract interface for managing the logical to physical mapping
+ */
+class LBAManager {
+public:
+ using mkfs_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using mkfs_ret = mkfs_ertr::future<>;
+ virtual mkfs_ret mkfs(
+ Transaction &t
+ ) = 0;
+
+ /**
+ * Fetches mappings for laddr_t in range [offset, offset + len)
+ *
+ * Future will not resolve until all pins have resolved (set_paddr called)
+ */
+ using get_mapping_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using get_mapping_ret = get_mapping_ertr::future<lba_pin_list_t>;
+ virtual get_mapping_ret get_mapping(
+ Transaction &t,
+ laddr_t offset, extent_len_t length) = 0;
+
+ /**
+ * Fetches mappings for each laddr_t range in the passed list
+ *
+ * Future will not resolve until all pins have resolved (set_paddr called)
+ */
+ using get_mappings_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using get_mappings_ret = get_mappings_ertr::future<lba_pin_list_t>;
+ virtual get_mappings_ret get_mappings(
+ Transaction &t,
+ laddr_list_t &&extent_list) = 0;
+
+ /**
+ * Allocates a new mapping referenced by an LBAPinRef
+ *
+ * Offset will be relative to the block offset of the record.
+ * This mapping will block the transaction from submission until set_paddr
+ * is called on the LBAPin.
+ */
+ using alloc_extent_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using alloc_extent_ret = alloc_extent_ertr::future<LBAPinRef>;
+ virtual alloc_extent_ret alloc_extent(
+ Transaction &t,
+ laddr_t hint,
+ extent_len_t len,
+ paddr_t addr) = 0;
+
+ /**
+ * Creates a new absolute mapping.
+ *
+ * off~len must be unreferenced
+ */
+ using set_extent_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg>;
+ using set_extent_ret = set_extent_ertr::future<LBAPinRef>;
+ virtual set_extent_ret set_extent(
+ Transaction &t,
+ laddr_t off, extent_len_t len, paddr_t addr) = 0;
+
+
+ struct ref_update_result_t {
+ unsigned refcount = 0;
+ paddr_t addr;
+ };
+ using ref_ertr = crimson::errorator<
+ crimson::ct_error::enoent,
+ crimson::ct_error::input_output_error>;
+ using ref_ret = ref_ertr::future<ref_update_result_t>;
+
+ /**
+ * Decrements ref count on extent
+ *
+ * @return returns resulting refcount
+ */
+ virtual ref_ret decref_extent(
+ Transaction &t,
+ laddr_t addr) = 0;
+
+ /**
+ * Increments ref count on extent
+ *
+ * @return returns resulting refcount
+ */
+ virtual ref_ret incref_extent(
+ Transaction &t,
+ laddr_t addr) = 0;
+
+ using complete_transaction_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using complete_transaction_ret = complete_transaction_ertr::future<>;
+ virtual complete_transaction_ret complete_transaction(
+ Transaction &t) = 0;
+
+ /**
+ * Should be called after replay on each cached extent.
+ * Implementation must initialize the LBAPin on any
+ * LogicalCachedExtent's and may also read in any dependent
+ * structures, etc.
+ */
+ using init_cached_extent_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using init_cached_extent_ret = init_cached_extent_ertr::future<>;
+ virtual init_cached_extent_ret init_cached_extent(
+ Transaction &t,
+ CachedExtentRef e) = 0;
+
+ /**
+ * Calls f for each mapping in [begin, end)
+ */
+ using scan_mappings_ertr = SegmentManager::read_ertr;
+ using scan_mappings_ret = scan_mappings_ertr::future<>;
+ using scan_mappings_func_t = std::function<
+ void(laddr_t, paddr_t, extent_len_t)>;
+ virtual scan_mappings_ret scan_mappings(
+ Transaction &t,
+ laddr_t begin,
+ laddr_t end,
+ scan_mappings_func_t &&f) = 0;
+
+ /**
+ * Calls f for each mapped space usage in [begin, end)
+ */
+ using scan_mapped_space_ertr = SegmentManager::read_ertr;
+ using scan_mapped_space_ret = scan_mapped_space_ertr::future<>;
+ using scan_mapped_space_func_t = std::function<
+ void(paddr_t, extent_len_t)>;
+ virtual scan_mapped_space_ret scan_mapped_space(
+ Transaction &t,
+ scan_mapped_space_func_t &&f) = 0;
+
+ /**
+ * rewrite_extent
+ *
+ * rewrite extent into passed transaction
+ */
+ using rewrite_extent_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using rewrite_extent_ret = rewrite_extent_ertr::future<>;
+ virtual rewrite_extent_ret rewrite_extent(
+ Transaction &t,
+ CachedExtentRef extent) = 0;
+
+ /**
+ * get_physical_extent_if_live
+ *
+ * Returns extent at addr/laddr if still live (if laddr
+ * still points at addr). Extent must be an internal, physical
+ * extent.
+ *
+ * Returns a null CachedExtentRef if extent is not live.
+ */
+ using get_physical_extent_if_live_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using get_physical_extent_if_live_ret =
+ get_physical_extent_if_live_ertr::future<CachedExtentRef>;
+ virtual get_physical_extent_if_live_ret get_physical_extent_if_live(
+ Transaction &t,
+ extent_types_t type,
+ paddr_t addr,
+ laddr_t laddr,
+ segment_off_t len) = 0;
+
+ virtual void add_pin(LBAPin &pin) = 0;
+
+ virtual ~LBAManager() {}
+};
+using LBAManagerRef = std::unique_ptr<LBAManager>;
+
+class Cache;
+namespace lba_manager {
+LBAManagerRef create_lba_manager(
+ SegmentManager &segment_manager,
+ Cache &cache);
+}
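+
+/* Example wiring (illustrative only; assumes a SegmentManager and Cache
+ * have already been constructed):
+ *
+ *   LBAManagerRef lba_manager =
+ *     lba_manager::create_lba_manager(segment_manager, cache);
+ *
+ * Callers then drive it through the Transaction-based interface above.
+ */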
+
+}
diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc
new file mode 100644
index 000000000..a837ae37e
--- /dev/null
+++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc
@@ -0,0 +1,580 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <sys/mman.h>
+#include <string.h>
+
+#include "crimson/common/log.h"
+
+#include "include/buffer.h"
+#include "crimson/os/seastore/lba_manager/btree/btree_lba_manager.h"
+#include "crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h"
+
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+}
+
+namespace crimson::os::seastore::lba_manager::btree {
+
+BtreeLBAManager::mkfs_ret BtreeLBAManager::mkfs(
+ Transaction &t)
+{
+ logger().debug("BtreeLBAManager::mkfs");
+ return cache.get_root(t).safe_then([this, &t](auto croot) {
+ auto root_leaf = cache.alloc_new_extent<LBALeafNode>(
+ t,
+ LBA_BLOCK_SIZE);
+ root_leaf->set_size(0);
+ lba_node_meta_t meta{0, L_ADDR_MAX, 1};
+ root_leaf->set_meta(meta);
+ root_leaf->pin.set_range(meta);
+ croot->get_root() =
+ root_t{
+ 1,
+ 0,
+ root_leaf->get_paddr(),
+ make_record_relative_paddr(0),
+ L_ADDR_NULL};
+ return mkfs_ertr::now();
+ });
+}
+
+BtreeLBAManager::get_root_ret
+BtreeLBAManager::get_root(Transaction &t)
+{
+ return cache.get_root(t).safe_then([this, &t](auto croot) {
+ logger().debug(
+ "BtreeLBAManager::get_root: reading root at {} depth {}",
+ paddr_t{croot->get_root().lba_root_addr},
+ unsigned(croot->get_root().lba_depth));
+ return get_lba_btree_extent(
+ get_context(t),
+ croot->get_root().lba_depth,
+ croot->get_root().lba_root_addr,
+ paddr_t());
+ });
+}
+
+BtreeLBAManager::get_mapping_ret
+BtreeLBAManager::get_mapping(
+ Transaction &t,
+ laddr_t offset, extent_len_t length)
+{
+ logger().debug("BtreeLBAManager::get_mapping: {}, {}", offset, length);
+ return get_root(
+ t).safe_then([this, &t, offset, length](auto extent) {
+ return extent->lookup_range(
+ get_context(t),
+ offset, length
+ ).safe_then([extent](auto ret) { return ret; });
+ }).safe_then([](auto &&e) {
+ logger().debug("BtreeLBAManager::get_mapping: got mapping {}", e);
+ return get_mapping_ret(
+ get_mapping_ertr::ready_future_marker{},
+ std::move(e));
+ });
+}
+
+
+BtreeLBAManager::get_mappings_ret
+BtreeLBAManager::get_mappings(
+ Transaction &t,
+ laddr_list_t &&list)
+{
+ logger().debug("BtreeLBAManager::get_mappings: {}", list);
+ auto l = std::make_unique<laddr_list_t>(std::move(list));
+ auto retptr = std::make_unique<lba_pin_list_t>();
+ auto &ret = *retptr;
+ return crimson::do_for_each(
+ l->begin(),
+ l->end(),
+ [this, &t, &ret](const auto &p) {
+ return get_mapping(t, p.first, p.second).safe_then(
+ [&ret](auto res) {
+ ret.splice(ret.end(), res, res.begin(), res.end());
+ });
+ }).safe_then([l=std::move(l), retptr=std::move(retptr)]() mutable {
+ return std::move(*retptr);
+ });
+}
+
+BtreeLBAManager::alloc_extent_ret
+BtreeLBAManager::alloc_extent(
+ Transaction &t,
+ laddr_t hint,
+ extent_len_t len,
+ paddr_t addr)
+{
+ // TODO: we can certainly combine the lookup and the insert.
+ return get_root(
+ t).safe_then([this, &t, hint, len](auto extent) {
+ logger().debug(
+ "BtreeLBAManager::alloc_extent: beginning search at {}",
+ *extent);
+ return extent->find_hole(
+ get_context(t),
+ hint,
+ L_ADDR_MAX,
+ len).safe_then([extent](auto ret) {
+ return std::make_pair(ret, extent);
+ });
+ }).safe_then([this, &t, len, addr](auto allocation_pair) {
+ auto &[laddr, extent] = allocation_pair;
+ ceph_assert(laddr != L_ADDR_MAX);
+ return insert_mapping(
+ t,
+ extent,
+ laddr,
+ { len, addr, 1, 0 }
+ ).safe_then([laddr=laddr, addr, len](auto pin) {
+ logger().debug(
+ "BtreeLBAManager::alloc_extent: alloc {}~{} for {}",
+ laddr,
+ len,
+ addr);
+ return alloc_extent_ret(
+ alloc_extent_ertr::ready_future_marker{},
+ LBAPinRef(pin.release()));
+ });
+ });
+}
+
+BtreeLBAManager::set_extent_ret
+BtreeLBAManager::set_extent(
+ Transaction &t,
+ laddr_t off, extent_len_t len, paddr_t addr)
+{
+ return get_root(
+ t).safe_then([this, &t, off, len, addr](auto root) {
+ return insert_mapping(
+ t,
+ root,
+ off,
+ { len, addr, 1, 0 });
+ }).safe_then([](auto ret) {
+ return set_extent_ret(
+ set_extent_ertr::ready_future_marker{},
+ LBAPinRef(ret.release()));
+ });
+}
+
+static bool is_lba_node(extent_types_t type)
+{
+ return type == extent_types_t::LADDR_INTERNAL ||
+ type == extent_types_t::LADDR_LEAF;
+}
+
+static bool is_lba_node(const CachedExtent &e)
+{
+ return is_lba_node(e.get_type());
+}
+
+btree_range_pin_t &BtreeLBAManager::get_pin(CachedExtent &e)
+{
+ if (is_lba_node(e)) {
+ return e.cast<LBANode>()->pin;
+ } else if (e.is_logical()) {
+ return static_cast<BtreeLBAPin &>(
+ e.cast<LogicalCachedExtent>()->get_pin()).pin;
+ } else {
+ ceph_abort_msg("impossible");
+ }
+}
+
+static depth_t get_depth(const CachedExtent &e)
+{
+ if (is_lba_node(e)) {
+ return e.cast<LBANode>()->get_node_meta().depth;
+ } else if (e.is_logical()) {
+ return 0;
+ } else {
+ ceph_assert(0 == "currently impossible");
+ return 0;
+ }
+}
+
+BtreeLBAManager::complete_transaction_ret
+BtreeLBAManager::complete_transaction(
+ Transaction &t)
+{
+ std::vector<CachedExtentRef> to_clear;
+ to_clear.reserve(t.get_retired_set().size());
+ for (auto &e: t.get_retired_set()) {
+ if (e->is_logical() || is_lba_node(*e))
+ to_clear.push_back(e);
+ }
+ // need to call check_parent from leaf->parent
+ std::sort(
+ to_clear.begin(), to_clear.end(),
+ [](auto &l, auto &r) { return get_depth(*l) < get_depth(*r); });
+
+ for (auto &e: to_clear) {
+ auto &pin = get_pin(*e);
+ logger().debug("{}: retiring {}, {}", __func__, *e, pin);
+ pin_set.retire(pin);
+ }
+
+ // ...but add_pin from parent->leaf
+ std::vector<CachedExtentRef> to_link;
+ to_link.reserve(t.get_fresh_block_list().size());
+ for (auto &e: t.get_fresh_block_list()) {
+ if (e->is_valid() && (is_lba_node(*e) || e->is_logical()))
+ to_link.push_back(e);
+ }
+ std::sort(
+ to_link.begin(), to_link.end(),
+ [](auto &l, auto &r) -> bool { return get_depth(*l) > get_depth(*r); });
+
+ for (auto &e : to_link) {
+ logger().debug("{}: linking {}", __func__, *e);
+ pin_set.add_pin(get_pin(*e));
+ }
+
+ for (auto &e: to_clear) {
+ auto &pin = get_pin(*e);
+ logger().debug("{}: checking {}, {}", __func__, *e, pin);
+ pin_set.check_parent(pin);
+ }
+ return complete_transaction_ertr::now();
+}
+
+BtreeLBAManager::init_cached_extent_ret BtreeLBAManager::init_cached_extent(
+ Transaction &t,
+ CachedExtentRef e)
+{
+ logger().debug("{}: {}", __func__, *e);
+ return get_root(t).safe_then(
+ [this, &t, e=std::move(e)](LBANodeRef root) mutable {
+ if (is_lba_node(*e)) {
+ auto lban = e->cast<LBANode>();
+ logger().debug("init_cached_extent: lba node, getting root");
+ return root->lookup(
+ op_context_t{cache, pin_set, t},
+ lban->get_node_meta().begin,
+ lban->get_node_meta().depth
+ ).safe_then([this, e=std::move(e)](LBANodeRef c) {
+ if (c->get_paddr() == e->get_paddr()) {
+ assert(&*c == &*e);
+ logger().debug("init_cached_extent: {} initialized", *e);
+ } else {
+ // e is obsolete
+ logger().debug("init_cached_extent: {} obsolete", *e);
+ cache.drop_from_cache(e);
+ }
+ return init_cached_extent_ertr::now();
+ });
+ } else if (e->is_logical()) {
+ auto logn = e->cast<LogicalCachedExtent>();
+ return root->lookup_range(
+ op_context_t{cache, pin_set, t},
+ logn->get_laddr(),
+ logn->get_length()).safe_then(
+ [this, logn=std::move(logn)](auto pins) {
+ if (pins.size() == 1) {
+ auto pin = std::move(pins.front());
+ pins.pop_front();
+ if (pin->get_paddr() == logn->get_paddr()) {
+ logn->set_pin(std::move(pin));
+ pin_set.add_pin(
+ static_cast<BtreeLBAPin&>(logn->get_pin()).pin);
+ logger().debug("init_cached_extent: {} initialized", *logn);
+ } else {
+ // paddr doesn't match, remapped, obsolete
+ logger().debug("init_cached_extent: {} obsolete", *logn);
+ cache.drop_from_cache(logn);
+ }
+ } else {
+ // set of extents changed, obsolete
+ logger().debug("init_cached_extent: {} obsolete", *logn);
+ cache.drop_from_cache(logn);
+ }
+ return init_cached_extent_ertr::now();
+ });
+ } else {
+ logger().debug("init_cached_extent: {} skipped", *e);
+ return init_cached_extent_ertr::now();
+ }
+ });
+}
+
+BtreeLBAManager::scan_mappings_ret BtreeLBAManager::scan_mappings(
+ Transaction &t,
+ laddr_t begin,
+ laddr_t end,
+ scan_mappings_func_t &&f)
+{
+ return seastar::do_with(
+ std::move(f),
+ LBANodeRef(),
+ [=, &t](auto &f, auto &lbarootref) {
+ return get_root(t).safe_then(
+ [=, &t, &f](LBANodeRef lbaroot) mutable {
+ lbarootref = lbaroot;
+ return lbaroot->scan_mappings(
+ get_context(t),
+ begin,
+ end,
+ f);
+ });
+ });
+}
+
+BtreeLBAManager::scan_mapped_space_ret BtreeLBAManager::scan_mapped_space(
+ Transaction &t,
+ scan_mapped_space_func_t &&f)
+{
+ return seastar::do_with(
+ std::move(f),
+ LBANodeRef(),
+ [=, &t](auto &f, auto &lbarootref) {
+ return get_root(t).safe_then(
+ [=, &t, &f](LBANodeRef lbaroot) mutable {
+ lbarootref = lbaroot;
+ return lbaroot->scan_mapped_space(
+ get_context(t),
+ f);
+ });
+ });
+}
+
+BtreeLBAManager::rewrite_extent_ret BtreeLBAManager::rewrite_extent(
+ Transaction &t,
+ CachedExtentRef extent)
+{
+ if (extent->is_logical()) {
+ auto lextent = extent->cast<LogicalCachedExtent>();
+ cache.retire_extent(t, extent);
+ auto nlextent = cache.alloc_new_extent_by_type(
+ t,
+ lextent->get_type(),
+ lextent->get_length())->cast<LogicalCachedExtent>();
+ lextent->get_bptr().copy_out(
+ 0,
+ lextent->get_length(),
+ nlextent->get_bptr().c_str());
+ nlextent->set_laddr(lextent->get_laddr());
+ nlextent->set_pin(lextent->get_pin().duplicate());
+
+ logger().debug(
+ "{}: rewriting {} into {}",
+ __func__,
+ *lextent,
+ *nlextent);
+
+ return update_mapping(
+ t,
+ lextent->get_laddr(),
+ [prev_addr = lextent->get_paddr(), addr = nlextent->get_paddr()](
+ const lba_map_val_t &in) {
+ lba_map_val_t ret = in;
+ ceph_assert(in.paddr == prev_addr);
+ ret.paddr = addr;
+ return ret;
+ }).safe_then([nlextent](auto e) {}).handle_error(
+ rewrite_extent_ertr::pass_further{},
+ /* ENOENT in particular should be impossible */
+ crimson::ct_error::assert_all{}
+ );
+ } else if (is_lba_node(*extent)) {
+ auto lba_extent = extent->cast<LBANode>();
+ cache.retire_extent(t, extent);
+ auto nlba_extent = cache.alloc_new_extent_by_type(
+ t,
+ lba_extent->get_type(),
+ lba_extent->get_length())->cast<LBANode>();
+ lba_extent->get_bptr().copy_out(
+ 0,
+ lba_extent->get_length(),
+ nlba_extent->get_bptr().c_str());
+ nlba_extent->pin.set_range(nlba_extent->get_node_meta());
+
+ /* This is a bit underhanded. Any relative addrs here must necessarily
+ * be record relative as we are rewriting a dirty extent. Thus, we
+ * are using resolve_relative_addrs with a (likely negative) block
+ * relative offset to correct them to block-relative offsets adjusted
+ * for our new transaction location.
+ *
+ * Upon commit, these now block-relative addresses will be interpreted
+ * against the real final address.
+ */
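+ /* Worked example (illustrative): if this rewritten node sits at
+  * record-relative offset R in the new transaction and a child entry holds
+  * record-relative address A, adding (make_record_relative_paddr(0) - R)
+  * yields A - R, i.e. an address relative to this node's own block, which
+  * is then resolved against the node's final physical address at commit.
+  */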
+ nlba_extent->resolve_relative_addrs(
+ make_record_relative_paddr(0) - nlba_extent->get_paddr());
+
+ return update_internal_mapping(
+ t,
+ nlba_extent->get_node_meta().depth,
+ nlba_extent->get_node_meta().begin,
+ nlba_extent->get_paddr()).safe_then(
+ [](auto) {},
+ rewrite_extent_ertr::pass_further {},
+ crimson::ct_error::assert_all{});
+ } else {
+ return rewrite_extent_ertr::now();
+ }
+}
+
+BtreeLBAManager::get_physical_extent_if_live_ret
+BtreeLBAManager::get_physical_extent_if_live(
+ Transaction &t,
+ extent_types_t type,
+ paddr_t addr,
+ laddr_t laddr,
+ segment_off_t len)
+{
+ ceph_assert(is_lba_node(type));
+ return cache.get_extent_by_type(
+ t,
+ type,
+ addr,
+ laddr,
+ len
+ ).safe_then([=, &t](CachedExtentRef extent) {
+ return get_root(t).safe_then([=, &t](LBANodeRef root) {
+ auto lba_node = extent->cast<LBANode>();
+ return root->lookup(
+ op_context_t{cache, pin_set, t},
+ lba_node->get_node_meta().begin,
+ lba_node->get_node_meta().depth).safe_then([=](LBANodeRef c) {
+ if (c->get_paddr() == lba_node->get_paddr()) {
+ return get_physical_extent_if_live_ret(
+ get_physical_extent_if_live_ertr::ready_future_marker{},
+ lba_node);
+ } else {
+ cache.drop_from_cache(lba_node);
+ return get_physical_extent_if_live_ret(
+ get_physical_extent_if_live_ertr::ready_future_marker{},
+ CachedExtentRef());
+ }
+ });
+ });
+ });
+}
+
+BtreeLBAManager::BtreeLBAManager(
+ SegmentManager &segment_manager,
+ Cache &cache)
+ : segment_manager(segment_manager),
+ cache(cache) {}
+
+BtreeLBAManager::insert_mapping_ret BtreeLBAManager::insert_mapping(
+ Transaction &t,
+ LBANodeRef root,
+ laddr_t laddr,
+ lba_map_val_t val)
+{
+ auto split = insert_mapping_ertr::future<LBANodeRef>(
+ insert_mapping_ertr::ready_future_marker{},
+ root);
+ if (root->at_max_capacity()) {
+ split = cache.get_root(t).safe_then(
+ [this, root, laddr, &t](RootBlockRef croot) {
+ logger().debug(
+ "BtreeLBAManager::insert_mapping: splitting root {}",
+ *croot);
+ {
+ auto mut_croot = cache.duplicate_for_write(t, croot);
+ croot = mut_croot->cast<RootBlock>();
+ }
+ auto nroot = cache.alloc_new_extent<LBAInternalNode>(t, LBA_BLOCK_SIZE);
+ lba_node_meta_t meta{0, L_ADDR_MAX, root->get_node_meta().depth + 1};
+ nroot->set_meta(meta);
+ nroot->pin.set_range(meta);
+ nroot->journal_insert(
+ nroot->begin(),
+ L_ADDR_MIN,
+ root->get_paddr(),
+ nullptr);
+ croot->get_root().lba_root_addr = nroot->get_paddr();
+ croot->get_root().lba_depth = root->get_node_meta().depth + 1;
+ return nroot->split_entry(
+ get_context(t),
+ laddr, nroot->begin(), root);
+ });
+ }
+ return split.safe_then([this, &t, laddr, val](LBANodeRef node) {
+ return node->insert(
+ get_context(t),
+ laddr, val);
+ });
+}
+
+BtreeLBAManager::update_refcount_ret BtreeLBAManager::update_refcount(
+ Transaction &t,
+ laddr_t addr,
+ int delta)
+{
+ return update_mapping(
+ t,
+ addr,
+ [delta](const lba_map_val_t &in) {
+ lba_map_val_t out = in;
+ ceph_assert((int)out.refcount + delta >= 0);
+ out.refcount += delta;
+ return out;
+ }).safe_then([](auto result) {
+ return ref_update_result_t{result.refcount, result.paddr};
+ });
+}
+
+BtreeLBAManager::update_mapping_ret BtreeLBAManager::update_mapping(
+ Transaction &t,
+ laddr_t addr,
+ update_func_t &&f)
+{
+ return get_root(t
+ ).safe_then([this, f=std::move(f), &t, addr](LBANodeRef root) mutable {
+ return root->mutate_mapping(
+ get_context(t),
+ addr,
+ std::move(f));
+ });
+}
+
+BtreeLBAManager::update_internal_mapping_ret
+BtreeLBAManager::update_internal_mapping(
+ Transaction &t,
+ depth_t depth,
+ laddr_t laddr,
+ paddr_t paddr)
+{
+ return cache.get_root(t).safe_then([=, &t](RootBlockRef croot) {
+ if (depth == croot->get_root().lba_depth) {
+ logger().debug(
+ "update_internal_mapping: updating lba root to: {}->{}",
+ laddr,
+ paddr);
+ {
+ auto mut_croot = cache.duplicate_for_write(t, croot);
+ croot = mut_croot->cast<RootBlock>();
+ }
+ ceph_assert(laddr == 0);
+ auto old_paddr = croot->get_root().lba_root_addr;
+ croot->get_root().lba_root_addr = paddr;
+ return update_internal_mapping_ret(
+ update_internal_mapping_ertr::ready_future_marker{},
+ old_paddr);
+ } else {
+ logger().debug(
+ "update_internal_mapping: updating lba node at depth {} to: {}->{}",
+ depth,
+ laddr,
+ paddr);
+ return get_lba_btree_extent(
+ get_context(t),
+ croot->get_root().lba_depth,
+ croot->get_root().lba_root_addr,
+ paddr_t()).safe_then([=, &t](LBANodeRef broot) {
+ return broot->mutate_internal_address(
+ get_context(t),
+ depth,
+ laddr,
+ paddr);
+ });
+ }
+ });
+}
+
+}
diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h
new file mode 100644
index 000000000..640d56734
--- /dev/null
+++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h
@@ -0,0 +1,188 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <seastar/core/future.hh>
+
+#include "include/ceph_assert.h"
+#include "include/buffer_fwd.h"
+#include "include/interval_set.h"
+#include "common/interval_map.h"
+#include "crimson/osd/exceptions.h"
+
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/lba_manager.h"
+#include "crimson/os/seastore/cache.h"
+#include "crimson/os/seastore/segment_manager.h"
+
+#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h"
+
+namespace crimson::os::seastore::lba_manager::btree {
+
+/**
+ * BtreeLBAManager
+ *
+ * Uses a wandering btree to track two things:
+ * 1) lba state including laddr_t -> paddr_t mapping
+ * 2) reverse paddr_t -> laddr_t mapping for gc (TODO)
+ *
+ * Generally, any transaction will involve
+ * 1) deltas against lba tree nodes
+ * 2) new lba tree nodes
+ * - Note, there must necessarily be a delta linking
+ * these new nodes into the tree -- might be a
+ * bootstrap_state_t delta if new root
+ *
+ * get_mappings, alloc_extent_*, etc populate a Transaction
+ * which then gets submitted
+ */
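+/* Typical flow (illustrative sketch only; the transaction creation and
+ * submission helpers shown are assumptions, not part of this class):
+ *
+ *   auto t = make_transaction();
+ *   lba_manager.alloc_extent(*t, hint, len, paddr
+ *   ).safe_then([&](LBAPinRef pin) {
+ *     // link pin to the new LogicalCachedExtent, then submit the record
+ *     return submit_transaction(std::move(t));
+ *   });
+ *
+ * complete_transaction() is expected to run once the record is durable so
+ * the pin set can be updated for retired and freshly written nodes.
+ */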
+class BtreeLBAManager : public LBAManager {
+public:
+ BtreeLBAManager(
+ SegmentManager &segment_manager,
+ Cache &cache);
+
+ mkfs_ret mkfs(
+ Transaction &t) final;
+
+ get_mapping_ret get_mapping(
+ Transaction &t,
+ laddr_t offset, extent_len_t length) final;
+
+ get_mappings_ret get_mappings(
+ Transaction &t,
+ laddr_list_t &&list) final;
+
+ alloc_extent_ret alloc_extent(
+ Transaction &t,
+ laddr_t hint,
+ extent_len_t len,
+ paddr_t addr) final;
+
+ set_extent_ret set_extent(
+ Transaction &t,
+ laddr_t off, extent_len_t len, paddr_t addr) final;
+
+ ref_ret decref_extent(
+ Transaction &t,
+ laddr_t addr) final {
+ return update_refcount(t, addr, -1);
+ }
+
+ ref_ret incref_extent(
+ Transaction &t,
+ laddr_t addr) final {
+ return update_refcount(t, addr, 1);
+ }
+
+ complete_transaction_ret complete_transaction(
+ Transaction &t) final;
+
+ init_cached_extent_ret init_cached_extent(
+ Transaction &t,
+ CachedExtentRef e) final;
+
+ scan_mappings_ret scan_mappings(
+ Transaction &t,
+ laddr_t begin,
+ laddr_t end,
+ scan_mappings_func_t &&f) final;
+
+ scan_mapped_space_ret scan_mapped_space(
+ Transaction &t,
+ scan_mapped_space_func_t &&f) final;
+
+ rewrite_extent_ret rewrite_extent(
+ Transaction &t,
+ CachedExtentRef extent);
+
+ get_physical_extent_if_live_ret get_physical_extent_if_live(
+ Transaction &t,
+ extent_types_t type,
+ paddr_t addr,
+ laddr_t laddr,
+ segment_off_t len) final;
+
+ void add_pin(LBAPin &pin) final {
+ auto *bpin = static_cast<BtreeLBAPin*>(&pin);
+ pin_set.add_pin(bpin->pin);
+ bpin->parent = nullptr;
+ }
+
+private:
+ SegmentManager &segment_manager;
+ Cache &cache;
+
+ btree_pin_set_t pin_set;
+
+ op_context_t get_context(Transaction &t) {
+ return op_context_t{cache, pin_set, t};
+ }
+
+ static btree_range_pin_t &get_pin(CachedExtent &e);
+
+
+ /**
+ * get_root
+ *
+ * Get a reference to the root LBANode.
+ */
+ using get_root_ertr = Cache::get_extent_ertr;
+ using get_root_ret = get_root_ertr::future<LBANodeRef>;
+ get_root_ret get_root(Transaction &);
+
+ /**
+ * insert_mapping
+ *
+ * Insert a lba mapping into the tree
+ */
+ using insert_mapping_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using insert_mapping_ret = insert_mapping_ertr::future<LBAPinRef>;
+ insert_mapping_ret insert_mapping(
+ Transaction &t, ///< [in,out] transaction
+ LBANodeRef root, ///< [in] root node
+ laddr_t laddr, ///< [in] logical addr to insert
+ lba_map_val_t val ///< [in] mapping to insert
+ );
+
+ /**
+ * update_refcount
+ *
+ * Updates refcount, returns resulting refcount
+ */
+ using update_refcount_ret = ref_ret;
+ update_refcount_ret update_refcount(
+ Transaction &t,
+ laddr_t addr,
+ int delta);
+
+ /**
+ * update_mapping
+ *
+ * Updates mapping via f; the mapping is removed if the updated refcount is 0
+ */
+ using update_mapping_ertr = ref_ertr;
+ using update_mapping_ret = ref_ertr::future<lba_map_val_t>;
+ using update_func_t = LBANode::mutate_func_t;
+ update_mapping_ret update_mapping(
+ Transaction &t,
+ laddr_t addr,
+ update_func_t &&f);
+
+ using update_internal_mapping_ertr = LBANode::mutate_internal_address_ertr;
+ using update_internal_mapping_ret = LBANode::mutate_internal_address_ret;
+ update_internal_mapping_ret update_internal_mapping(
+ Transaction &t,
+ depth_t depth,
+ laddr_t laddr,
+ paddr_t paddr);
+};
+using BtreeLBAManagerRef = std::unique_ptr<BtreeLBAManager>;
+
+}
diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.cc b/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.cc
new file mode 100644
index 000000000..a86c3cc57
--- /dev/null
+++ b/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.cc
@@ -0,0 +1,153 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/common/log.h"
+
+#include "crimson/os/seastore/lba_manager/btree/btree_range_pin.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+}
+
+namespace crimson::os::seastore::lba_manager::btree {
+
+void btree_range_pin_t::take_pin(btree_range_pin_t &other)
+{
+ assert(other.extent);
+ assert(other.pins);
+ other.pins->replace_pin(*this, other);
+ pins = other.pins;
+ other.pins = nullptr;
+
+ if (other.has_ref()) {
+ other.drop_ref();
+ acquire_ref();
+ }
+}
+
+btree_range_pin_t::~btree_range_pin_t()
+{
+ assert(!pins == !is_linked());
+ assert(!ref);
+ if (pins) {
+ logger().debug("{}: removing {}", __func__, *this);
+ pins->remove_pin(*this, true);
+ }
+ extent = nullptr;
+}
+
+void btree_pin_set_t::replace_pin(btree_range_pin_t &to, btree_range_pin_t &from)
+{
+ pins.replace_node(pins.iterator_to(from), to);
+}
+
+void btree_pin_set_t::remove_pin(btree_range_pin_t &pin, bool do_check_parent)
+{
+ logger().debug("{}: {}", __func__, pin);
+ assert(pin.is_linked());
+ assert(pin.pins);
+ assert(!pin.ref);
+
+ pins.erase(pin);
+ pin.pins = nullptr;
+
+ if (do_check_parent) {
+ check_parent(pin);
+ }
+}
+
+btree_range_pin_t *btree_pin_set_t::maybe_get_parent(
+ const lba_node_meta_t &meta)
+{
+ auto cmeta = meta;
+ cmeta.depth++;
+ auto iter = pins.upper_bound(cmeta, btree_range_pin_t::meta_cmp_t());
+ if (iter == pins.begin()) {
+ return nullptr;
+ } else {
+ --iter;
+ if (iter->range.is_parent_of(meta)) {
+ return &*iter;
+ } else {
+ return nullptr;
+ }
+ }
+}
+
+const btree_range_pin_t *btree_pin_set_t::maybe_get_first_child(
+ const lba_node_meta_t &meta) const
+{
+ if (meta.depth == 0) {
+ return nullptr;
+ }
+
+ auto cmeta = meta;
+ cmeta.depth--;
+
+ auto iter = pins.lower_bound(cmeta, btree_range_pin_t::meta_cmp_t());
+ if (iter == pins.end()) {
+ return nullptr;
+ } else if (meta.is_parent_of(iter->range)) {
+ return &*iter;
+ } else {
+ return nullptr;
+ }
+}
+
+void btree_pin_set_t::release_if_no_children(btree_range_pin_t &pin)
+{
+ assert(pin.is_linked());
+ if (maybe_get_first_child(pin.range) == nullptr) {
+ pin.drop_ref();
+ }
+}
+
+void btree_pin_set_t::add_pin(btree_range_pin_t &pin)
+{
+ assert(!pin.is_linked());
+ assert(!pin.pins);
+ assert(!pin.ref);
+
+ auto [prev, inserted] = pins.insert(pin);
+ if (!inserted) {
+ logger().error("{}: unable to add {}, found {}", __func__, pin, *prev);
+ assert(0 == "impossible");
+ return;
+ }
+ pin.pins = this;
+ if (!pin.is_root()) {
+ auto *parent = maybe_get_parent(pin.range);
+ assert(parent);
+ if (!parent->has_ref()) {
+ logger().debug("{}: acquiring parent {}", __func__,
+ static_cast<void*>(parent));
+ parent->acquire_ref();
+ } else {
+ logger().debug("{}: parent has ref {}", __func__,
+ static_cast<void*>(parent));
+ }
+ }
+ if (maybe_get_first_child(pin.range) != nullptr) {
+ logger().debug("{}: acquiring self {}", __func__, pin);
+ pin.acquire_ref();
+ }
+}
+
+void btree_pin_set_t::retire(btree_range_pin_t &pin)
+{
+ pin.drop_ref();
+ remove_pin(pin, false);
+}
+
+void btree_pin_set_t::check_parent(btree_range_pin_t &pin)
+{
+ auto parent = maybe_get_parent(pin.range);
+ if (parent) {
+ logger().debug("{}: releasing parent {}", __func__, *parent);
+ release_if_no_children(*parent);
+ }
+}
+
+}
diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.h b/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.h
new file mode 100644
index 000000000..3fa218fc8
--- /dev/null
+++ b/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.h
@@ -0,0 +1,274 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/intrusive/set.hpp>
+
+#include "crimson/os/seastore/cached_extent.h"
+#include "crimson/os/seastore/seastore_types.h"
+
+namespace crimson::os::seastore::lba_manager::btree {
+
+class LBANode;
+using LBANodeRef = TCachedExtentRef<LBANode>;
+
+struct lba_node_meta_t {
+ laddr_t begin = 0;
+ laddr_t end = 0;
+ depth_t depth = 0;
+
+ bool is_parent_of(const lba_node_meta_t &other) const {
+ return (depth == other.depth + 1) &&
+ (begin <= other.begin) &&
+ (end >= other.end);
+ }
+
+ std::pair<lba_node_meta_t, lba_node_meta_t> split_into(laddr_t pivot) const {
+ return std::make_pair(
+ lba_node_meta_t{begin, pivot, depth},
+ lba_node_meta_t{pivot, end, depth});
+ }
+
+ static lba_node_meta_t merge_from(
+ const lba_node_meta_t &lhs, const lba_node_meta_t &rhs) {
+ assert(lhs.depth == rhs.depth);
+ return lba_node_meta_t{lhs.begin, rhs.end, lhs.depth};
+ }
+
+ static std::pair<lba_node_meta_t, lba_node_meta_t>
+ rebalance(const lba_node_meta_t &lhs, const lba_node_meta_t &rhs, laddr_t pivot) {
+ assert(lhs.depth == rhs.depth);
+ return std::make_pair(
+ lba_node_meta_t{lhs.begin, pivot, lhs.depth},
+ lba_node_meta_t{pivot, rhs.end, lhs.depth});
+ }
+
+ bool is_root() const {
+ return begin == 0 && end == L_ADDR_MAX;
+ }
+};
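+
+/* Example (illustrative): a node covering [0, 1000) at depth 1 has meta
+ * {0, 1000, 1}; split_into(500) yields {0, 500, 1} and {500, 1000, 1},
+ * and a parent with meta {0, 1000, 2} is_parent_of() both halves.
+ */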
+
+inline std::ostream &operator<<(
+ std::ostream &lhs,
+ const lba_node_meta_t &rhs)
+{
+ return lhs << "btree_node_meta_t("
+ << "begin=" << rhs.begin
+ << ", end=" << rhs.end
+ << ", depth=" << rhs.depth
+ << ")";
+}
+
+/**
+ * btree_range_pin_t
+ *
+ * Element tracked by btree_pin_set_t below. Encapsulates the intrusive_set
+ * hook, the lba_node_meta_t representing the lba range covered by a node,
+ * and extent and ref members intended to hold a reference when the extent
+ * should be pinned.
+ */
+class btree_pin_set_t;
+class btree_range_pin_t : public boost::intrusive::set_base_hook<> {
+ friend class btree_pin_set_t;
+ lba_node_meta_t range;
+
+ btree_pin_set_t *pins = nullptr;
+
+ // We need to be able to remember extent without holding a reference,
+ // but we can do it more compactly -- TODO
+ CachedExtent *extent = nullptr;
+ CachedExtentRef ref;
+
+ using index_t = boost::intrusive::set<btree_range_pin_t>;
+
+ static auto get_tuple(const lba_node_meta_t &meta) {
+ return std::make_tuple(-meta.depth, meta.begin);
+ }
+
+ void acquire_ref() {
+ ref = CachedExtentRef(extent);
+ }
+
+ void drop_ref() {
+ ref.reset();
+ }
+
+public:
+ btree_range_pin_t() = default;
+ btree_range_pin_t(CachedExtent *extent)
+ : extent(extent) {}
+ btree_range_pin_t(const btree_range_pin_t &rhs, CachedExtent *extent)
+ : range(rhs.range), extent(extent) {}
+
+ bool has_ref() const {
+ return !!ref;
+ }
+
+ bool is_root() const {
+ return range.is_root();
+ }
+
+ void set_range(const lba_node_meta_t &nrange) {
+ range = nrange;
+ }
+ void set_extent(CachedExtent *nextent) {
+ assert(!extent);
+ extent = nextent;
+ }
+
+ void take_pin(btree_range_pin_t &other);
+
+ friend bool operator<(
+ const btree_range_pin_t &lhs, const btree_range_pin_t &rhs) {
+ return get_tuple(lhs.range) < get_tuple(rhs.range);
+ }
+ friend bool operator>(
+ const btree_range_pin_t &lhs, const btree_range_pin_t &rhs) {
+ return get_tuple(lhs.range) > get_tuple(rhs.range);
+ }
+ friend bool operator==(
+ const btree_range_pin_t &lhs, const btree_range_pin_t &rhs) {
+ return get_tuple(lhs.range) == get_tuple(rhs.range);
+ }
+
+ struct meta_cmp_t {
+ bool operator()(
+ const btree_range_pin_t &lhs, const lba_node_meta_t &rhs) const {
+ return get_tuple(lhs.range) < get_tuple(rhs);
+ }
+ bool operator()(
+ const lba_node_meta_t &lhs, const btree_range_pin_t &rhs) const {
+ return get_tuple(lhs) < get_tuple(rhs.range);
+ }
+ };
+
+ friend std::ostream &operator<<(
+ std::ostream &lhs,
+ const btree_range_pin_t &rhs) {
+ return lhs << "btree_range_pin_t("
+ << "begin=" << rhs.range.begin
+ << ", end=" << rhs.range.end
+ << ", depth=" << rhs.range.depth
+ << ", extent=" << rhs.extent
+ << ")";
+ }
+
+ friend class BtreeLBAPin;
+ ~btree_range_pin_t();
+};
+
+/**
+ * btree_pin_set_t
+ *
+ * Ensures that for every cached node, all parent LBANodes required
+ * to map it are present in cache. Relocating these nodes can
+ * therefore be done without further reads or cache space.
+ *
+ * Contains a btree_range_pin_t for every clean or dirty LBANode
+ * or LogicalCachedExtent instance in cache at any point in time.
+ * For any LBANode, the contained btree_range_pin_t will hold
+ * a reference to that node pinning it in cache as long as that
+ * node has children in the set. This invariant can be violated
+ * only by calling retire_extent and is repaired by calling
+ * check_parent synchronously after adding any new extents.
+ */
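+/* Expected call pattern (sketch; see BtreeLBAManager::complete_transaction
+ * for the authoritative sequence):
+ *
+ *   for (auto &pin : retired_sorted_leaf_to_root) pin_set.retire(pin);
+ *   for (auto &pin : fresh_sorted_root_to_leaf)   pin_set.add_pin(pin);
+ *   for (auto &pin : retired_sorted_leaf_to_root) pin_set.check_parent(pin);
+ *
+ * retire() may leave a parent pinned for children that no longer exist;
+ * check_parent() releases that reference once the new extents are linked.
+ */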
+class btree_pin_set_t {
+ friend class btree_range_pin_t;
+ using pins_t = btree_range_pin_t::index_t;
+ pins_t pins;
+
+ pins_t::iterator get_iter(btree_range_pin_t &pin) {
+ return pins_t::s_iterator_to(pin);
+ }
+
+ /// Removes pin from set optionally checking whether parent has other children
+ void remove_pin(btree_range_pin_t &pin, bool check_parent);
+
+ void replace_pin(btree_range_pin_t &to, btree_range_pin_t &from);
+
+ /// Returns parent pin if one exists
+ btree_range_pin_t *maybe_get_parent(const lba_node_meta_t &pin);
+
+ /// Returns earliest child pin if one exists
+ const btree_range_pin_t *maybe_get_first_child(const lba_node_meta_t &pin) const;
+
+ /// Releases pin if it has no children
+ void release_if_no_children(btree_range_pin_t &pin);
+
+public:
+ /// Adds pin to set, assumes set is consistent
+ void add_pin(btree_range_pin_t &pin);
+
+ /**
+ * retire/check_parent
+ *
+ * See BtreeLBAManager::complete_transaction.
+ * retire removes the specified pin from the set, but does not
+ * check parents. After any new extents are added to the set,
+ * the caller is required to call check_parent to restore the
+ * invariant.
+ */
+ void retire(btree_range_pin_t &pin);
+ void check_parent(btree_range_pin_t &pin);
+
+ ~btree_pin_set_t() {
+ assert(pins.empty());
+ }
+};
+
+class BtreeLBAPin : public LBAPin {
+ friend class BtreeLBAManager;
+
+ /**
+ * parent
+ *
+ * populated to keep the parent node cache-resident until add_pin is
+ * called (see link_extent).
+ */
+ CachedExtentRef parent;
+
+ paddr_t paddr;
+ btree_range_pin_t pin;
+
+public:
+ BtreeLBAPin() = default;
+
+ BtreeLBAPin(
+ CachedExtentRef parent,
+ paddr_t paddr,
+ lba_node_meta_t &&meta)
+ : parent(parent), paddr(paddr) {
+ pin.set_range(std::move(meta));
+ }
+
+ void link_extent(LogicalCachedExtent *ref) final {
+ pin.set_extent(ref);
+ }
+
+ extent_len_t get_length() const final {
+ assert(pin.range.end > pin.range.begin);
+ return pin.range.end - pin.range.begin;
+ }
+
+ paddr_t get_paddr() const final {
+ return paddr;
+ }
+
+ laddr_t get_laddr() const final {
+ return pin.range.begin;
+ }
+
+ LBAPinRef duplicate() const final {
+ auto ret = std::unique_ptr<BtreeLBAPin>(new BtreeLBAPin);
+ ret->pin.set_range(pin.range);
+ ret->paddr = paddr;
+ return ret;
+ }
+
+ void take_pin(LBAPin &opin) final {
+ pin.take_pin(static_cast<BtreeLBAPin&>(opin).pin);
+ }
+};
+
+}
diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h
new file mode 100644
index 000000000..b6f33a1ae
--- /dev/null
+++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h
@@ -0,0 +1,269 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <sys/mman.h>
+#include <memory>
+#include <string.h>
+
+#include "crimson/common/log.h"
+#include "crimson/os/seastore/lba_manager/btree/btree_range_pin.h"
+#include "crimson/os/seastore/lba_manager.h"
+
+namespace crimson::os::seastore::lba_manager::btree {
+
+struct op_context_t {
+ Cache &cache;
+ btree_pin_set_t &pins;
+ Transaction &trans;
+};
+
+/**
+ * lba_map_val_t
+ *
+ * struct representing a single lba mapping
+ */
+struct lba_map_val_t {
+ extent_len_t len = 0; ///< length of mapping
+ paddr_t paddr; ///< physical addr of mapping
+ uint32_t refcount = 0; ///< refcount
+ uint32_t checksum = 0; ///< checksum of original block written at paddr (TODO)
+
+ lba_map_val_t(
+ extent_len_t len,
+ paddr_t paddr,
+ uint32_t refcount,
+ uint32_t checksum)
+ : len(len), paddr(paddr), refcount(refcount), checksum(checksum) {}
+};
+
+class BtreeLBAPin;
+using BtreeLBAPinRef = std::unique_ptr<BtreeLBAPin>;
+
+/**
+ * LBANode
+ *
+ * Base class enabling recursive lookup between internal and leaf nodes.
+ */
+struct LBANode : CachedExtent {
+ using LBANodeRef = TCachedExtentRef<LBANode>;
+ using lookup_range_ertr = LBAManager::get_mapping_ertr;
+ using lookup_range_ret = LBAManager::get_mapping_ret;
+
+ btree_range_pin_t pin;
+
+ LBANode(ceph::bufferptr &&ptr) : CachedExtent(std::move(ptr)), pin(this) {}
+ LBANode(const LBANode &rhs)
+ : CachedExtent(rhs), pin(rhs.pin, this) {}
+
+ virtual lba_node_meta_t get_node_meta() const = 0;
+
+ /**
+ * lookup
+ *
+ * Returns the node at the specified depth responsible
+ * for laddr
+ */
+ using lookup_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using lookup_ret = lookup_ertr::future<LBANodeRef>;
+ virtual lookup_ret lookup(
+ op_context_t c,
+ laddr_t addr,
+ depth_t depth) = 0;
+
+ /**
+ * lookup_range
+ *
+ * Returns mappings within range [addr, addr+len)
+ */
+ virtual lookup_range_ret lookup_range(
+ op_context_t c,
+ laddr_t addr,
+ extent_len_t len) = 0;
+
+ /**
+ * insert
+ *
+ * Recursively inserts into subtree rooted at *this. Caller
+ * must already have handled splitting if at_max_capacity().
+ *
+ * Precondition: !at_max_capacity()
+ */
+ using insert_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error
+ >;
+ using insert_ret = insert_ertr::future<LBAPinRef>;
+ virtual insert_ret insert(
+ op_context_t c,
+ laddr_t laddr,
+ lba_map_val_t val) = 0;
+
+ /**
+ * find_hole
+ *
+ * Finds minimum hole of size len in [min, max)
+ *
+ * @return addr of hole, L_ADDR_NULL if none found
+ */
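+ /* Illustrative example: with existing mappings covering [0, 100) and
+  * [300, 400), find_hole(c, 0, 1000, 150) returns 100, since [100, 300)
+  * is the lowest-addressed hole of at least 150.
+  */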
+ using find_hole_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using find_hole_ret = find_hole_ertr::future<laddr_t>;
+ virtual find_hole_ret find_hole(
+ op_context_t c,
+ laddr_t min,
+ laddr_t max,
+ extent_len_t len) = 0;
+
+ /**
+ * scan_mappings
+ *
+ * Call f for all mappings in [begin, end)
+ */
+ using scan_mappings_ertr = LBAManager::scan_mappings_ertr;
+ using scan_mappings_ret = LBAManager::scan_mappings_ret;
+ using scan_mappings_func_t = LBAManager::scan_mappings_func_t;
+ virtual scan_mappings_ret scan_mappings(
+ op_context_t c,
+ laddr_t begin,
+ laddr_t end,
+ scan_mappings_func_t &f) = 0;
+
+ using scan_mapped_space_ertr = LBAManager::scan_mapped_space_ertr;
+ using scan_mapped_space_ret = LBAManager::scan_mapped_space_ret;
+ using scan_mapped_space_func_t = LBAManager::scan_mapped_space_func_t;
+ virtual scan_mapped_space_ret scan_mapped_space(
+ op_context_t c,
+ scan_mapped_space_func_t &f) = 0;
+
+ /**
+ * mutate_mapping
+ *
+ * Looks up laddr and calls f on the value; the value returned by f
+ * replaces the mapping, or removes it if the resulting refcount is 0.
+ * Caller must already have merged if at_min_capacity().
+ *
+ * Recursive calls use mutate_mapping_internal.
+ *
+ * Precondition: !at_min_capacity()
+ */
+ using mutate_mapping_ertr = crimson::errorator<
+ crimson::ct_error::enoent, ///< mapping does not exist
+ crimson::ct_error::input_output_error
+ >;
+ using mutate_mapping_ret = mutate_mapping_ertr::future<
+ lba_map_val_t>;
+ using mutate_func_t = std::function<
+ lba_map_val_t(const lba_map_val_t &v)
+ >;
+ virtual mutate_mapping_ret mutate_mapping(
+ op_context_t c,
+ laddr_t laddr,
+ mutate_func_t &&f) = 0;
+ virtual mutate_mapping_ret mutate_mapping_internal(
+ op_context_t c,
+ laddr_t laddr,
+ bool is_root,
+ mutate_func_t &&f) = 0;
+
+ /**
+ * mutate_internal_address
+ *
+ * Looks up internal node mapping at laddr, depth and
+ * updates the mapping to paddr. Returns previous paddr
+ * (for debugging purposes).
+ */
+ using mutate_internal_address_ertr = crimson::errorator<
+ crimson::ct_error::enoent, ///< mapping does not exist
+ crimson::ct_error::input_output_error
+ >;
+ using mutate_internal_address_ret = mutate_internal_address_ertr::future<
+ paddr_t>;
+ virtual mutate_internal_address_ret mutate_internal_address(
+ op_context_t c,
+ depth_t depth,
+ laddr_t laddr,
+ paddr_t paddr) = 0;
+
+ /**
+ * make_split_children
+ *
+ * Generates appropriately typed left and right nodes formed from the
+ * contents of *this.
+ *
+ * Returns <left, right, pivot> where pivot is the first value of right.
+ */
+ virtual std::tuple<
+ LBANodeRef,
+ LBANodeRef,
+ laddr_t>
+ make_split_children(
+ op_context_t c) = 0;
+
+ /**
+ * make_full_merge
+ *
+ * Returns a single node formed from merging *this and right.
+ * Precondition: at_min_capacity() && right.at_min_capacity()
+ */
+ virtual LBANodeRef make_full_merge(
+ op_context_t c,
+ LBANodeRef &right) = 0;
+
+ /**
+ * make_balanced
+ *
+ * Returns nodes formed by balancing the contents of *this and right.
+ *
+ * Returns <left, right, pivot> where pivot is the first value of right.
+ */
+ virtual std::tuple<
+ LBANodeRef,
+ LBANodeRef,
+ laddr_t>
+ make_balanced(
+ op_context_t c,
+ LBANodeRef &right,
+ bool prefer_left) = 0;
+
+ virtual bool at_max_capacity() const = 0;
+ virtual bool at_min_capacity() const = 0;
+
+ virtual ~LBANode() = default;
+
+ void on_delta_write(paddr_t record_block_offset) final {
+ // All in-memory relative addrs are necessarily record-relative
+ assert(get_prior_instance());
+ pin.take_pin(get_prior_instance()->cast<LBANode>()->pin);
+ resolve_relative_addrs(record_block_offset);
+ }
+
+ void on_initial_write() final {
+ // All in-memory relative addrs are necessarily block-relative
+ resolve_relative_addrs(get_paddr());
+ }
+
+ void on_clean_read() final {
+ // From initial write of block, relative addrs are necessarily block-relative
+ resolve_relative_addrs(get_paddr());
+ }
+
+ virtual void resolve_relative_addrs(paddr_t base) = 0;
+};
+using LBANodeRef = LBANode::LBANodeRef;
+
+/**
+ * get_lba_btree_extent
+ *
+ * Fetches node at depth of the appropriate type.
+ */
+Cache::get_extent_ertr::future<LBANodeRef> get_lba_btree_extent(
+ op_context_t c, ///< [in] context structure
+ depth_t depth, ///< [in] depth of node to fetch
+ paddr_t offset, ///< [in] physical addr of node
+ paddr_t base ///< [in] depending on user, block addr or record addr
+ /// in case offset is relative
+);
+
+}
diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.cc b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.cc
new file mode 100644
index 000000000..5e400803b
--- /dev/null
+++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.cc
@@ -0,0 +1,701 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <sys/mman.h>
+#include <string.h>
+
+#include <memory>
+#include <string.h>
+
+#include "include/buffer.h"
+#include "include/byteorder.h"
+
+#include "crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+}
+
+namespace crimson::os::seastore::lba_manager::btree {
+
+std::ostream &LBAInternalNode::print_detail(std::ostream &out) const
+{
+ return out << ", size=" << get_size()
+ << ", meta=" << get_meta();
+}
+
+LBAInternalNode::lookup_ret LBAInternalNode::lookup(
+ op_context_t c,
+ laddr_t addr,
+ depth_t depth)
+{
+ auto meta = get_meta();
+ if (depth == get_meta().depth) {
+ return lookup_ret(
+ lookup_ertr::ready_future_marker{},
+ this);
+ }
+ assert(meta.begin <= addr);
+ assert(meta.end > addr);
+ auto iter = lower_bound(addr);
+ return get_lba_btree_extent(
+ c,
+ meta.depth - 1,
+ iter->get_val(),
+ get_paddr()).safe_then([c, addr, depth](auto child) {
+ return child->lookup(c, addr, depth);
+ }).finally([ref=LBANodeRef(this)] {});
+}
+
+LBAInternalNode::lookup_range_ret LBAInternalNode::lookup_range(
+ op_context_t c,
+ laddr_t addr,
+ extent_len_t len)
+{
+ auto [begin, end] = bound(addr, addr + len);
+ auto result_up = std::make_unique<lba_pin_list_t>();
+ auto &result = *result_up;
+ return crimson::do_for_each(
+ std::move(begin),
+ std::move(end),
+ [this, c, &result, addr, len](const auto &val) mutable {
+ return get_lba_btree_extent(
+ c,
+ get_meta().depth - 1,
+ val.get_val(),
+ get_paddr()).safe_then(
+ [c, &result, addr, len](auto extent) mutable {
+ return extent->lookup_range(
+ c,
+ addr,
+ len).safe_then(
+ [&result](auto pin_list) mutable {
+ result.splice(result.end(), pin_list,
+ pin_list.begin(), pin_list.end());
+ });
+ });
+ }).safe_then([result=std::move(result_up), ref=LBANodeRef(this)] {
+ return lookup_range_ertr::make_ready_future<lba_pin_list_t>(
+ std::move(*result));
+ });
+}
+
+LBAInternalNode::insert_ret LBAInternalNode::insert(
+ op_context_t c,
+ laddr_t laddr,
+ lba_map_val_t val)
+{
+ auto insertion_pt = get_containing_child(laddr);
+ return get_lba_btree_extent(
+ c,
+ get_meta().depth - 1,
+ insertion_pt->get_val(),
+ get_paddr()).safe_then(
+ [this, insertion_pt, c, laddr, val=std::move(val)](
+ auto extent) mutable {
+ return extent->at_max_capacity() ?
+ split_entry(c, laddr, insertion_pt, extent) :
+ insert_ertr::make_ready_future<LBANodeRef>(std::move(extent));
+ }).safe_then([c, laddr, val=std::move(val)](
+ LBANodeRef extent) mutable {
+ return extent->insert(c, laddr, val);
+ });
+}
+
+LBAInternalNode::mutate_mapping_ret LBAInternalNode::mutate_mapping(
+ op_context_t c,
+ laddr_t laddr,
+ mutate_func_t &&f)
+{
+ return mutate_mapping_internal(c, laddr, true, std::move(f));
+}
+
+LBAInternalNode::mutate_mapping_ret LBAInternalNode::mutate_mapping_internal(
+ op_context_t c,
+ laddr_t laddr,
+ bool is_root,
+ mutate_func_t &&f)
+{
+ auto mutation_pt = get_containing_child(laddr);
+ if (mutation_pt == end()) {
+ assert(0 == "impossible");
+ return crimson::ct_error::enoent::make();
+ }
+ return get_lba_btree_extent(
+ c,
+ get_meta().depth - 1,
+ mutation_pt->get_val(),
+ get_paddr()
+ ).safe_then([=](LBANodeRef extent) {
+ if (extent->at_min_capacity() && get_size() > 1) {
+ return merge_entry(
+ c,
+ laddr,
+ mutation_pt,
+ extent,
+ is_root);
+ } else {
+ return merge_ertr::make_ready_future<LBANodeRef>(
+ std::move(extent));
+ }
+ }).safe_then([c, laddr, f=std::move(f)](LBANodeRef extent) mutable {
+ return extent->mutate_mapping_internal(c, laddr, false, std::move(f));
+ });
+}
+
+LBAInternalNode::mutate_internal_address_ret LBAInternalNode::mutate_internal_address(
+ op_context_t c,
+ depth_t depth,
+ laddr_t laddr,
+ paddr_t paddr)
+{
+ if (get_meta().depth == (depth + 1)) {
+ if (!is_pending()) {
+ return c.cache.duplicate_for_write(c.trans, this)->cast<LBAInternalNode>(
+ )->mutate_internal_address(
+ c,
+ depth,
+ laddr,
+ paddr);
+ }
+ auto iter = get_containing_child(laddr);
+ if (iter->get_key() != laddr) {
+ return crimson::ct_error::enoent::make();
+ }
+
+ auto old_paddr = iter->get_val();
+
+ journal_update(
+ iter,
+ maybe_generate_relative(paddr),
+ maybe_get_delta_buffer());
+
+ return mutate_internal_address_ret(
+ mutate_internal_address_ertr::ready_future_marker{},
+ old_paddr
+ );
+ } else {
+ auto iter = get_containing_child(laddr);
+ return get_lba_btree_extent(
+ c,
+ get_meta().depth - 1,
+ iter->get_val(),
+ get_paddr()
+ ).safe_then([=](auto node) {
+ return node->mutate_internal_address(
+ c,
+ depth,
+ laddr,
+ paddr);
+ });
+ }
+}
+
+LBAInternalNode::find_hole_ret LBAInternalNode::find_hole(
+ op_context_t c,
+ laddr_t min_addr,
+ laddr_t max_addr,
+ extent_len_t len)
+{
+ logger().debug(
+ "LBAInternalNode::find_hole min={}, max={}, len={}, *this={}",
+ min_addr, max_addr, len, *this);
+ auto [begin, end] = bound(min_addr, max_addr);
+ return seastar::repeat_until_value(
+ [i=begin, e=end, c, min_addr, len, this]() mutable {
+ if (i == e) {
+ return seastar::make_ready_future<std::optional<laddr_t>>(
+ std::make_optional<laddr_t>(L_ADDR_NULL));
+ }
+ return get_lba_btree_extent(c,
+ get_meta().depth - 1,
+ i->get_val(),
+ get_paddr()).safe_then(
+ [c, min_addr, len, i](auto extent) mutable {
+ auto lb = std::max(min_addr, i->get_key());
+ auto ub = i->get_next_key_or_max();
+ logger().debug("LBAInternalNode::find_hole extent {} lb {} ub {}",
+ *extent, lb, ub);
+ return extent->find_hole(c, lb, ub, len);
+ }).safe_then([&i](auto addr) mutable -> std::optional<laddr_t> {
+ if (addr == L_ADDR_NULL) {
+ ++i;
+ return {};
+ } else {
+ return addr;
+ }
+ },
+ // TODO: GCC enters an infinite loop if crimson::do_until() is used
+ // or erroratorized future is returned
+ crimson::ct_error::assert_all{ "fix me - APIv6" });
+ });
+}
+
+LBAInternalNode::scan_mappings_ret LBAInternalNode::scan_mappings(
+ op_context_t c,
+ laddr_t begin,
+ laddr_t end,
+ scan_mappings_func_t &f)
+{
+ auto [biter, eiter] = bound(begin, end);
+ return crimson::do_for_each(
+ std::move(biter),
+ std::move(eiter),
+ [=, &f](auto &viter) {
+ return get_lba_btree_extent(
+ c,
+ get_meta().depth - 1,
+ viter->get_val(),
+ get_paddr()).safe_then([=, &f](auto child) {
+ return child->scan_mappings(c, begin, end, f);
+ });
+ }).safe_then([ref=LBANodeRef(this)]{});
+}
+
+LBAInternalNode::scan_mapped_space_ret LBAInternalNode::scan_mapped_space(
+ op_context_t c,
+ scan_mapped_space_func_t &f)
+{
+ f(get_paddr(), get_length());
+ return crimson::do_for_each(
+ begin(), end(),
+ [=, &f](auto &viter) {
+ return get_lba_btree_extent(
+ c,
+ get_meta().depth - 1,
+ viter->get_val(),
+ get_paddr()).safe_then([=, &f](auto child) {
+ return child->scan_mapped_space(c, f);
+ });
+ }).safe_then([ref=LBANodeRef(this)]{});
+}
+
+
+void LBAInternalNode::resolve_relative_addrs(paddr_t base)
+{
+ for (auto i: *this) {
+ if (i->get_val().is_relative()) {
+ auto updated = base.add_relative(i->get_val());
+ logger().debug(
+ "LBAInternalNode::resolve_relative_addrs {} -> {}",
+ i->get_val(),
+ updated);
+ i->set_val(updated);
+ }
+ }
+}
+
+
+LBAInternalNode::split_ret
+LBAInternalNode::split_entry(
+ op_context_t c,
+ laddr_t addr,
+ internal_iterator_t iter, LBANodeRef entry)
+{
+ if (!is_pending()) {
+ auto mut = c.cache.duplicate_for_write(
+ c.trans, this)->cast<LBAInternalNode>();
+ auto mut_iter = mut->iter_idx(iter->get_offset());
+ return mut->split_entry(c, addr, mut_iter, entry);
+ }
+
+ ceph_assert(!at_max_capacity());
+ auto [left, right, pivot] = entry->make_split_children(c);
+
+ journal_update(
+ iter,
+ maybe_generate_relative(left->get_paddr()),
+ maybe_get_delta_buffer());
+ journal_insert(
+ iter + 1,
+ pivot,
+ maybe_generate_relative(right->get_paddr()),
+ maybe_get_delta_buffer());
+
+ c.cache.retire_extent(c.trans, entry);
+
+ logger().debug(
+ "LBAInternalNode::split_entry *this {} entry {} into left {} right {}",
+ *this,
+ *entry,
+ *left,
+ *right);
+
+ return split_ertr::make_ready_future<LBANodeRef>(
+ pivot > addr ? left : right
+ );
+}
+
+LBAInternalNode::merge_ret
+LBAInternalNode::merge_entry(
+ op_context_t c,
+ laddr_t addr,
+ internal_iterator_t iter,
+ LBANodeRef entry,
+ bool is_root)
+{
+ if (!is_pending()) {
+ auto mut = c.cache.duplicate_for_write(c.trans, this)->cast<LBAInternalNode>();
+ auto mut_iter = mut->iter_idx(iter->get_offset());
+ return mut->merge_entry(c, addr, mut_iter, entry, is_root);
+ }
+
+ logger().debug(
+ "LBAInternalNode: merge_entry: {}, {}",
+ *this,
+ *entry);
+ auto donor_is_left = (iter + 1) == end();
+ auto donor_iter = donor_is_left ? iter - 1 : iter + 1;
+ return get_lba_btree_extent(
+ c,
+ get_meta().depth - 1,
+ donor_iter->get_val(),
+ get_paddr()
+ ).safe_then([=](auto donor) mutable {
+ auto [l, r] = donor_is_left ?
+ std::make_pair(donor, entry) : std::make_pair(entry, donor);
+ auto [liter, riter] = donor_is_left ?
+ std::make_pair(donor_iter, iter) : std::make_pair(iter, donor_iter);
+ if (donor->at_min_capacity()) {
+ auto replacement = l->make_full_merge(
+ c,
+ r);
+
+ journal_update(
+ liter,
+ maybe_generate_relative(replacement->get_paddr()),
+ maybe_get_delta_buffer());
+ journal_remove(riter, maybe_get_delta_buffer());
+
+ c.cache.retire_extent(c.trans, l);
+ c.cache.retire_extent(c.trans, r);
+
+ if (is_root && get_size() == 1) {
+ return c.cache.get_root(c.trans).safe_then([=](RootBlockRef croot) {
+ {
+ auto mut_croot = c.cache.duplicate_for_write(c.trans, croot);
+ croot = mut_croot->cast<RootBlock>();
+ }
+ croot->root.lba_root_addr = begin()->get_val();
+ logger().debug(
+ "LBAInternalNode::merge_entry: collapsing root {} to addr {}",
+ *this,
+ begin()->get_val());
+ croot->root.lba_depth = get_meta().depth - 1;
+ c.cache.retire_extent(c.trans, this);
+ return merge_ertr::make_ready_future<LBANodeRef>(replacement);
+ });
+ } else {
+ return merge_ertr::make_ready_future<LBANodeRef>(replacement);
+ }
+ } else {
+ logger().debug(
+ "LBAInternalEntry::merge_entry balanced l {} r {}",
+ *l,
+ *r);
+ auto [replacement_l, replacement_r, pivot] =
+ l->make_balanced(
+ c,
+ r,
+ !donor_is_left);
+
+ journal_update(
+ liter,
+ maybe_generate_relative(replacement_l->get_paddr()),
+ maybe_get_delta_buffer());
+ journal_replace(
+ riter,
+ pivot,
+ maybe_generate_relative(replacement_r->get_paddr()),
+ maybe_get_delta_buffer());
+
+ c.cache.retire_extent(c.trans, l);
+ c.cache.retire_extent(c.trans, r);
+ return merge_ertr::make_ready_future<LBANodeRef>(
+ addr >= pivot ? replacement_r : replacement_l
+ );
+ }
+ });
+}
+
+
+LBAInternalNode::internal_iterator_t
+LBAInternalNode::get_containing_child(laddr_t laddr)
+{
+ // TODO: binary search
+ for (auto i = begin(); i != end(); ++i) {
+ if (i.contains(laddr))
+ return i;
+ }
+ ceph_assert(0 == "invalid");
+ return end();
+}
+
+std::ostream &LBALeafNode::print_detail(std::ostream &out) const
+{
+ return out << ", size=" << get_size()
+ << ", meta=" << get_meta();
+}
+
+LBALeafNode::lookup_range_ret LBALeafNode::lookup_range(
+ op_context_t c,
+ laddr_t addr,
+ extent_len_t len)
+{
+ logger().debug(
+ "LBALeafNode::lookup_range {}~{}",
+ addr,
+ len);
+ auto ret = lba_pin_list_t();
+ auto [i, end] = get_leaf_entries(addr, len);
+ for (; i != end; ++i) {
+ auto val = i->get_val();
+ auto begin = i->get_key();
+ ret.emplace_back(
+ std::make_unique<BtreeLBAPin>(
+ this,
+ val.paddr.maybe_relative_to(get_paddr()),
+ lba_node_meta_t{ begin, begin + val.len, 0}));
+ }
+ return lookup_range_ertr::make_ready_future<lba_pin_list_t>(
+ std::move(ret));
+}
+
+LBALeafNode::insert_ret LBALeafNode::insert(
+ op_context_t c,
+ laddr_t laddr,
+ lba_map_val_t val)
+{
+ ceph_assert(!at_max_capacity());
+
+ if (!is_pending()) {
+ return c.cache.duplicate_for_write(c.trans, this
+ )->cast<LBALeafNode>()->insert(c, laddr, val);
+ }
+
+ val.paddr = maybe_generate_relative(val.paddr);
+ logger().debug(
+ "LBALeafNode::insert: inserting {}~{} -> {}",
+ laddr,
+ val.len,
+ val.paddr);
+
+ auto insert_pt = lower_bound(laddr);
+ journal_insert(insert_pt, laddr, val, maybe_get_delta_buffer());
+
+ logger().debug(
+ "LBALeafNode::insert: inserted {}~{} -> {}",
+ insert_pt.get_key(),
+ insert_pt.get_val().len,
+ insert_pt.get_val().paddr);
+ auto begin = insert_pt.get_key();
+ return insert_ret(
+ insert_ertr::ready_future_marker{},
+ std::make_unique<BtreeLBAPin>(
+ this,
+ val.paddr.maybe_relative_to(get_paddr()),
+ lba_node_meta_t{ begin, begin + val.len, 0}));
+}
+
+LBALeafNode::mutate_mapping_ret LBALeafNode::mutate_mapping(
+ op_context_t c,
+ laddr_t laddr,
+ mutate_func_t &&f)
+{
+ return mutate_mapping_internal(c, laddr, true, std::move(f));
+}
+
+LBALeafNode::mutate_mapping_ret LBALeafNode::mutate_mapping_internal(
+ op_context_t c,
+ laddr_t laddr,
+ bool is_root,
+ mutate_func_t &&f)
+{
+ auto mutation_pt = find(laddr);
+ if (mutation_pt == end()) {
+ return crimson::ct_error::enoent::make();
+ }
+
+ if (!is_pending()) {
+ return c.cache.duplicate_for_write(c.trans, this)->cast<LBALeafNode>(
+ )->mutate_mapping_internal(
+ c,
+ laddr,
+ is_root,
+ std::move(f));
+ }
+
+ auto cur = mutation_pt.get_val();
+ auto mutated = f(cur);
+
+ mutated.paddr = maybe_generate_relative(mutated.paddr);
+
+ logger().debug(
+ "{}: mutate addr {}: {} -> {}",
+ __func__,
+ laddr,
+ cur.paddr,
+ mutated.paddr);
+
+ if (mutated.refcount > 0) {
+ journal_update(mutation_pt, mutated, maybe_get_delta_buffer());
+ return mutate_mapping_ret(
+ mutate_mapping_ertr::ready_future_marker{},
+ mutated);
+ } else {
+ journal_remove(mutation_pt, maybe_get_delta_buffer());
+ return mutate_mapping_ret(
+ mutate_mapping_ertr::ready_future_marker{},
+ mutated);
+ }
+}
+
+LBALeafNode::mutate_internal_address_ret LBALeafNode::mutate_internal_address(
+ op_context_t c,
+ depth_t depth,
+ laddr_t laddr,
+ paddr_t paddr)
+{
+ ceph_assert(0 == "Impossible");
+ return mutate_internal_address_ret(
+ mutate_internal_address_ertr::ready_future_marker{},
+ paddr);
+}
+
+LBALeafNode::find_hole_ret LBALeafNode::find_hole(
+ op_context_t c,
+ laddr_t min,
+ laddr_t max,
+ extent_len_t len)
+{
+ logger().debug(
+ "LBALeafNode::find_hole min={} max={}, len={}, *this={}",
+ min, max, len, *this);
+ auto [liter, uiter] = bound(min, max);
+ for (auto i = liter; i != uiter; ++i) {
+ auto ub = i->get_key();
+ if (min + len <= ub) {
+ return find_hole_ret(
+ find_hole_ertr::ready_future_marker{},
+ min);
+ } else {
+ min = i->get_key() + i->get_val().len;
+ }
+ }
+ if (min + len <= max) {
+ return find_hole_ret(
+ find_hole_ertr::ready_future_marker{},
+ min);
+ } else {
+ return find_hole_ret(
+ find_hole_ertr::ready_future_marker{},
+ L_ADDR_MAX);
+ }
+}
+
+LBALeafNode::scan_mappings_ret LBALeafNode::scan_mappings(
+ op_context_t c,
+ laddr_t begin,
+ laddr_t end,
+ scan_mappings_func_t &f)
+{
+ auto [biter, eiter] = bound(begin, end);
+ for (auto i = biter; i != eiter; ++i) {
+ auto val = i->get_val();
+ f(i->get_key(), val.paddr, val.len);
+ }
+ return scan_mappings_ertr::now();
+}
+
+LBALeafNode::scan_mapped_space_ret LBALeafNode::scan_mapped_space(
+ op_context_t c,
+ scan_mapped_space_func_t &f)
+{
+ f(get_paddr(), get_length());
+ for (auto i = begin(); i != end(); ++i) {
+ auto val = i->get_val();
+ f(val.paddr, val.len);
+ }
+ return scan_mapped_space_ertr::now();
+}
+
+
+void LBALeafNode::resolve_relative_addrs(paddr_t base)
+{
+ for (auto i: *this) {
+ if (i->get_val().paddr.is_relative()) {
+ auto val = i->get_val();
+ val.paddr = base.add_relative(val.paddr);
+ logger().debug(
+ "LBALeafNode::resolve_relative_addrs {} -> {}",
+ i->get_val().paddr,
+ val.paddr);
+ i->set_val(val);
+ }
+ }
+}
+
+std::pair<LBALeafNode::internal_iterator_t, LBALeafNode::internal_iterator_t>
+LBALeafNode::get_leaf_entries(laddr_t addr, extent_len_t len)
+{
+ return bound(addr, addr + len);
+}
+
+Cache::get_extent_ertr::future<LBANodeRef> get_lba_btree_extent(
+ op_context_t c,
+ depth_t depth,
+ paddr_t offset,
+ paddr_t base)
+{
+ offset = offset.maybe_relative_to(base);
+ ceph_assert(depth > 0);
+ if (depth > 1) {
+ logger().debug(
+ "get_lba_btree_extent: reading internal at offset {}, depth {}",
+ offset,
+ depth);
+ return c.cache.get_extent<LBAInternalNode>(
+ c.trans,
+ offset,
+ LBA_BLOCK_SIZE).safe_then([c](auto ret) {
+ auto meta = ret->get_meta();
+ if (ret->get_size()) {
+ ceph_assert(meta.begin <= ret->begin()->get_key());
+ ceph_assert(meta.end > (ret->end() - 1)->get_key());
+ }
+ if (!ret->is_pending() && !ret->pin.is_linked()) {
+ ret->pin.set_range(meta);
+ c.pins.add_pin(ret->pin);
+ }
+ return LBANodeRef(ret.detach(), /* add_ref = */ false);
+ });
+ } else {
+ logger().debug(
+ "get_lba_btree_extent: reading leaf at offset {}, depth {}",
+ offset,
+ depth);
+ return c.cache.get_extent<LBALeafNode>(
+ c.trans,
+ offset,
+ LBA_BLOCK_SIZE).safe_then([offset, c](auto ret) {
+ logger().debug(
+ "get_lba_btree_extent: read leaf at offset {} {}",
+ offset,
+ *ret);
+ auto meta = ret->get_meta();
+ if (ret->get_size()) {
+ ceph_assert(meta.begin <= ret->begin()->get_key());
+ ceph_assert(meta.end > (ret->end() - 1)->get_key());
+ }
+ if (!ret->is_pending() && !ret->pin.is_linked()) {
+ ret->pin.set_range(meta);
+ c.pins.add_pin(ret->pin);
+ }
+ return LBANodeRef(ret.detach(), /* add_ref = */ false);
+ });
+ }
+}
+
+}
diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h
new file mode 100644
index 000000000..230eef682
--- /dev/null
+++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h
@@ -0,0 +1,555 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <sys/mman.h>
+#include <string.h>
+
+#include <memory>
+#include <string.h>
+
+#include "include/buffer.h"
+
+#include "crimson/common/fixed_kv_node_layout.h"
+#include "crimson/common/errorator.h"
+#include "crimson/os/seastore/lba_manager.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/cache.h"
+#include "crimson/os/seastore/cached_extent.h"
+#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h"
+#include "crimson/os/seastore/lba_manager/btree/btree_range_pin.h"
+
+namespace crimson::os::seastore::lba_manager::btree {
+
+constexpr size_t LBA_BLOCK_SIZE = 4096;
+
+/**
+ * lba_node_meta_le_t
+ *
+ * On disk layout for lba_node_meta_t
+ */
+struct lba_node_meta_le_t {
+ laddr_le_t begin = laddr_le_t(0);
+ laddr_le_t end = laddr_le_t(0);
+ depth_le_t depth = init_les32(0);
+
+ lba_node_meta_le_t() = default;
+ lba_node_meta_le_t(const lba_node_meta_le_t &) = default;
+ explicit lba_node_meta_le_t(const lba_node_meta_t &val)
+ : begin(init_le64(val.begin)),
+ end(init_le64(val.end)),
+ depth(init_les32(val.depth)) {}
+
+ operator lba_node_meta_t() const {
+ return lba_node_meta_t{ begin, end, depth };
+ }
+};
+
+
+/**
+ * LBAInternalNode
+ *
+ * Abstracts operations on and layout of internal nodes for the
+ * LBA Tree.
+ *
+ * Layout (4k):
+ * size : uint32_t[1] 4b
+ * (padding) : 4b
+ *   meta       : lba_node_meta_le_t[1]     (1*24)b
+ *   keys       : laddr_t[254]              (254*8)b
+ *   values     : paddr_t[254]              (254*8)b
+ *                                          = 4096
+ *
+ * TODO: make the above capacity calculation part of FixedKVNodeLayout
+ * TODO: the above alignment probably isn't portable without further work
+ */
+constexpr size_t INTERNAL_NODE_CAPACITY = 254;
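+
+// A rough sanity check of the layout arithmetic above, kept as a comment-only
+// sketch: the 32-byte header and the 8-byte key/value sizes are assumptions
+// read off the layout comment, while FixedKVNodeLayout remains the source of
+// truth for the real capacity.
+//
+//   static_assert(4 + 4 + 24 +
+//                 INTERNAL_NODE_CAPACITY * (8 + 8) <= LBA_BLOCK_SIZE,
+//                 "internal node layout must fit in one LBA block");
+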
+struct LBAInternalNode
+ : LBANode,
+ common::FixedKVNodeLayout<
+ INTERNAL_NODE_CAPACITY,
+ lba_node_meta_t, lba_node_meta_le_t,
+ laddr_t, laddr_le_t,
+ paddr_t, paddr_le_t> {
+ using internal_iterator_t = const_iterator;
+ template <typename... T>
+ LBAInternalNode(T&&... t) :
+ LBANode(std::forward<T>(t)...),
+ FixedKVNodeLayout(get_bptr().c_str()) {}
+
+ static constexpr extent_types_t type = extent_types_t::LADDR_INTERNAL;
+
+ lba_node_meta_t get_node_meta() const final { return get_meta(); }
+
+ CachedExtentRef duplicate_for_write() final {
+ assert(delta_buffer.empty());
+ return CachedExtentRef(new LBAInternalNode(*this));
+ };
+
+ delta_buffer_t delta_buffer;
+ delta_buffer_t *maybe_get_delta_buffer() {
+ return is_mutation_pending() ? &delta_buffer : nullptr;
+ }
+
+ lookup_ret lookup(op_context_t c, laddr_t addr, depth_t depth) final;
+
+ lookup_range_ret lookup_range(
+ op_context_t c,
+ laddr_t addr,
+ extent_len_t len) final;
+
+ insert_ret insert(
+ op_context_t c,
+ laddr_t laddr,
+ lba_map_val_t val) final;
+
+ mutate_mapping_ret mutate_mapping(
+ op_context_t c,
+ laddr_t laddr,
+ mutate_func_t &&f) final;
+ mutate_mapping_ret mutate_mapping_internal(
+ op_context_t c,
+ laddr_t laddr,
+ bool is_root,
+ mutate_func_t &&f) final;
+
+ mutate_internal_address_ret mutate_internal_address(
+ op_context_t c,
+ depth_t depth,
+ laddr_t laddr,
+ paddr_t paddr) final;
+
+ find_hole_ret find_hole(
+ op_context_t c,
+ laddr_t min,
+ laddr_t max,
+ extent_len_t len) final;
+
+ scan_mappings_ret scan_mappings(
+ op_context_t c,
+ laddr_t begin,
+ laddr_t end,
+ scan_mappings_func_t &f) final;
+
+ scan_mapped_space_ret scan_mapped_space(
+ op_context_t c,
+ scan_mapped_space_func_t &f) final;
+
+ std::tuple<LBANodeRef, LBANodeRef, laddr_t>
+ make_split_children(op_context_t c) final {
+ auto left = c.cache.alloc_new_extent<LBAInternalNode>(
+ c.trans, LBA_BLOCK_SIZE);
+ auto right = c.cache.alloc_new_extent<LBAInternalNode>(
+ c.trans, LBA_BLOCK_SIZE);
+ auto pivot = split_into(*left, *right);
+ left->pin.set_range(left->get_meta());
+ right->pin.set_range(right->get_meta());
+ return std::make_tuple(
+ left,
+ right,
+ pivot);
+ }
+
+ LBANodeRef make_full_merge(
+ op_context_t c,
+ LBANodeRef &right) final {
+ auto replacement = c.cache.alloc_new_extent<LBAInternalNode>(
+ c.trans, LBA_BLOCK_SIZE);
+ replacement->merge_from(*this, *right->cast<LBAInternalNode>());
+ replacement->pin.set_range(replacement->get_meta());
+ return replacement;
+ }
+
+ std::tuple<LBANodeRef, LBANodeRef, laddr_t>
+ make_balanced(
+ op_context_t c,
+ LBANodeRef &_right,
+ bool prefer_left) final {
+ ceph_assert(_right->get_type() == type);
+ auto &right = *_right->cast<LBAInternalNode>();
+ auto replacement_left = c.cache.alloc_new_extent<LBAInternalNode>(
+ c.trans, LBA_BLOCK_SIZE);
+ auto replacement_right = c.cache.alloc_new_extent<LBAInternalNode>(
+ c.trans, LBA_BLOCK_SIZE);
+
+ auto pivot = balance_into_new_nodes(
+ *this,
+ right,
+ prefer_left,
+ *replacement_left,
+ *replacement_right);
+
+ replacement_left->pin.set_range(replacement_left->get_meta());
+ replacement_right->pin.set_range(replacement_right->get_meta());
+ return std::make_tuple(
+ replacement_left,
+ replacement_right,
+ pivot);
+ }
+
+ /**
+ * Internal relative addresses on read or in memory prior to commit
+   * are either record-relative or block-relative, depending on whether this
+   * physical node is in the is_initial_pending() state or merely is_pending().
+ *
+ * User passes appropriate base depending on lifecycle and
+ * resolve_relative_addrs fixes up relative internal references
+ * based on base.
+ */
+ void resolve_relative_addrs(paddr_t base) final;
+ void node_resolve_vals(iterator from, iterator to) const final {
+ if (is_initial_pending()) {
+ for (auto i = from; i != to; ++i) {
+ if (i->get_val().is_relative()) {
+ assert(i->get_val().is_block_relative());
+ i->set_val(get_paddr().add_relative(i->get_val()));
+ }
+ }
+ }
+ }
+ void node_unresolve_vals(iterator from, iterator to) const final {
+ if (is_initial_pending()) {
+ for (auto i = from; i != to; ++i) {
+ if (i->get_val().is_relative()) {
+ assert(i->get_val().is_record_relative());
+ i->set_val(i->get_val() - get_paddr());
+ }
+ }
+ }
+ }
+
+ extent_types_t get_type() const final {
+ return type;
+ }
+
+ std::ostream &print_detail(std::ostream &out) const final;
+
+ ceph::bufferlist get_delta() final {
+ assert(!delta_buffer.empty());
+ ceph::buffer::ptr bptr(delta_buffer.get_bytes());
+ delta_buffer.copy_out(bptr.c_str(), bptr.length());
+ ceph::bufferlist bl;
+ bl.push_back(bptr);
+ return bl;
+ }
+
+ void apply_delta_and_adjust_crc(
+ paddr_t base, const ceph::bufferlist &_bl) final {
+ assert(_bl.length());
+ ceph::bufferlist bl = _bl;
+ bl.rebuild();
+ delta_buffer_t buffer;
+ buffer.copy_in(bl.front().c_str(), bl.front().length());
+ buffer.replay(*this);
+ set_last_committed_crc(get_crc32c());
+ resolve_relative_addrs(base);
+ }
+
+ bool at_max_capacity() const final {
+ return get_size() == get_capacity();
+ }
+
+ bool at_min_capacity() const {
+ return get_size() == (get_capacity() / 2);
+ }
+
+ /// returns iterators containing [l, r)
+ std::pair<internal_iterator_t, internal_iterator_t> bound(
+ laddr_t l, laddr_t r) {
+ // TODO: inefficient
+ auto retl = begin();
+ for (; retl != end(); ++retl) {
+ if (retl->get_next_key_or_max() > l)
+ break;
+ }
+ auto retr = retl;
+ for (; retr != end(); ++retr) {
+ if (retr->get_key() >= r)
+ break;
+ }
+ return std::make_pair(retl, retr);
+ }
+
+ using split_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error
+ >;
+ using split_ret = split_ertr::future<LBANodeRef>;
+ split_ret split_entry(
+ op_context_t c,
+ laddr_t addr,
+ internal_iterator_t,
+ LBANodeRef entry);
+
+ using merge_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error
+ >;
+ using merge_ret = merge_ertr::future<LBANodeRef>;
+ merge_ret merge_entry(
+ op_context_t c,
+ laddr_t addr,
+ internal_iterator_t,
+ LBANodeRef entry,
+ bool is_root);
+
+ /// returns iterator for subtree containing laddr
+ internal_iterator_t get_containing_child(laddr_t laddr);
+};
+
+/**
+ * LBALeafNode
+ *
+ * Abstracts operations on and layout of leaf nodes for the
+ * LBA Tree.
+ *
+ * Layout (4k):
+ * size : uint32_t[1] 4b
+ * (padding) : 4b
+ *   meta       : lba_node_meta_le_t[1]     (1*24)b
+ *   keys       : laddr_t[145]              (145*8)b
+ *   values     : lba_map_val_t[145]        (145*20)b
+ * = 4092
+ *
+ * TODO: update FixedKVNodeLayout to handle the above calculation
+ * TODO: the above alignment probably isn't portable without further work
+ */
+constexpr size_t LEAF_NODE_CAPACITY = 145;
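+
+// As above, a comment-only sketch of the leaf layout arithmetic; the 32-byte
+// header, 8-byte key and 20-byte value sizes are assumptions taken from the
+// layout comment rather than from the layout code itself.
+//
+//   static_assert(4 + 4 + 24 +
+//                 LEAF_NODE_CAPACITY * (8 + 20) <= LBA_BLOCK_SIZE,
+//                 "leaf node layout must fit in one LBA block");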
+
+/**
+ * lba_map_val_le_t
+ *
+ * On disk layout for lba_map_val_t.
+ */
+struct lba_map_val_le_t {
+ extent_len_le_t len = init_extent_len_le_t(0);
+ paddr_le_t paddr;
+ ceph_le32 refcount = init_le32(0);
+ ceph_le32 checksum = init_le32(0);
+
+ lba_map_val_le_t() = default;
+ lba_map_val_le_t(const lba_map_val_le_t &) = default;
+ explicit lba_map_val_le_t(const lba_map_val_t &val)
+ : len(init_extent_len_le_t(val.len)),
+ paddr(paddr_le_t(val.paddr)),
+ refcount(init_le32(val.refcount)),
+ checksum(init_le32(val.checksum)) {}
+
+ operator lba_map_val_t() const {
+ return lba_map_val_t{ len, paddr, refcount, checksum };
+ }
+};
+
+struct LBALeafNode
+ : LBANode,
+ common::FixedKVNodeLayout<
+ LEAF_NODE_CAPACITY,
+ lba_node_meta_t, lba_node_meta_le_t,
+ laddr_t, laddr_le_t,
+ lba_map_val_t, lba_map_val_le_t> {
+ using internal_iterator_t = const_iterator;
+ template <typename... T>
+ LBALeafNode(T&&... t) :
+ LBANode(std::forward<T>(t)...),
+ FixedKVNodeLayout(get_bptr().c_str()) {}
+
+ static constexpr extent_types_t type = extent_types_t::LADDR_LEAF;
+
+ lba_node_meta_t get_node_meta() const final { return get_meta(); }
+
+ CachedExtentRef duplicate_for_write() final {
+ assert(delta_buffer.empty());
+ return CachedExtentRef(new LBALeafNode(*this));
+ };
+
+ delta_buffer_t delta_buffer;
+ delta_buffer_t *maybe_get_delta_buffer() {
+ return is_mutation_pending() ? &delta_buffer : nullptr;
+ }
+
+ lookup_ret lookup(op_context_t c, laddr_t addr, depth_t depth) final
+ {
+ return lookup_ret(
+ lookup_ertr::ready_future_marker{},
+ this);
+ }
+
+ lookup_range_ret lookup_range(
+ op_context_t c,
+ laddr_t addr,
+ extent_len_t len) final;
+
+ insert_ret insert(
+ op_context_t c,
+ laddr_t laddr,
+ lba_map_val_t val) final;
+
+ mutate_mapping_ret mutate_mapping(
+ op_context_t c,
+ laddr_t laddr,
+ mutate_func_t &&f) final;
+ mutate_mapping_ret mutate_mapping_internal(
+ op_context_t c,
+ laddr_t laddr,
+ bool is_root,
+ mutate_func_t &&f) final;
+
+ mutate_internal_address_ret mutate_internal_address(
+ op_context_t c,
+ depth_t depth,
+ laddr_t laddr,
+ paddr_t paddr) final;
+
+ find_hole_ret find_hole(
+ op_context_t c,
+ laddr_t min,
+ laddr_t max,
+ extent_len_t len) final;
+
+ scan_mappings_ret scan_mappings(
+ op_context_t c,
+ laddr_t begin,
+ laddr_t end,
+ scan_mappings_func_t &f) final;
+
+ scan_mapped_space_ret scan_mapped_space(
+ op_context_t c,
+ scan_mapped_space_func_t &f) final;
+
+ std::tuple<LBANodeRef, LBANodeRef, laddr_t>
+ make_split_children(op_context_t c) final {
+ auto left = c.cache.alloc_new_extent<LBALeafNode>(
+ c.trans, LBA_BLOCK_SIZE);
+ auto right = c.cache.alloc_new_extent<LBALeafNode>(
+ c.trans, LBA_BLOCK_SIZE);
+ auto pivot = split_into(*left, *right);
+ left->pin.set_range(left->get_meta());
+ right->pin.set_range(right->get_meta());
+ return std::make_tuple(
+ left,
+ right,
+ pivot);
+ }
+
+ LBANodeRef make_full_merge(
+ op_context_t c,
+ LBANodeRef &right) final {
+ auto replacement = c.cache.alloc_new_extent<LBALeafNode>(
+ c.trans, LBA_BLOCK_SIZE);
+ replacement->merge_from(*this, *right->cast<LBALeafNode>());
+ replacement->pin.set_range(replacement->get_meta());
+ return replacement;
+ }
+
+ std::tuple<LBANodeRef, LBANodeRef, laddr_t>
+ make_balanced(
+ op_context_t c,
+ LBANodeRef &_right,
+ bool prefer_left) final {
+ ceph_assert(_right->get_type() == type);
+ auto &right = *_right->cast<LBALeafNode>();
+ auto replacement_left = c.cache.alloc_new_extent<LBALeafNode>(
+ c.trans, LBA_BLOCK_SIZE);
+ auto replacement_right = c.cache.alloc_new_extent<LBALeafNode>(
+ c.trans, LBA_BLOCK_SIZE);
+
+ auto pivot = balance_into_new_nodes(
+ *this,
+ right,
+ prefer_left,
+ *replacement_left,
+ *replacement_right);
+
+ replacement_left->pin.set_range(replacement_left->get_meta());
+ replacement_right->pin.set_range(replacement_right->get_meta());
+ return std::make_tuple(
+ replacement_left,
+ replacement_right,
+ pivot);
+ }
+
+ // See LBAInternalNode, same concept
+ void resolve_relative_addrs(paddr_t base) final;
+ void node_resolve_vals(iterator from, iterator to) const final {
+ if (is_initial_pending()) {
+ for (auto i = from; i != to; ++i) {
+ auto val = i->get_val();
+ if (val.paddr.is_relative()) {
+ assert(val.paddr.is_block_relative());
+ val.paddr = get_paddr().add_relative(val.paddr);
+ i->set_val(val);
+ }
+ }
+ }
+ }
+ void node_unresolve_vals(iterator from, iterator to) const final {
+ if (is_initial_pending()) {
+ for (auto i = from; i != to; ++i) {
+ auto val = i->get_val();
+ if (val.paddr.is_relative()) {
+ auto val = i->get_val();
+ assert(val.paddr.is_record_relative());
+ val.paddr = val.paddr - get_paddr();
+ i->set_val(val);
+ }
+ }
+ }
+ }
+
+ ceph::bufferlist get_delta() final {
+ assert(!delta_buffer.empty());
+ ceph::buffer::ptr bptr(delta_buffer.get_bytes());
+ delta_buffer.copy_out(bptr.c_str(), bptr.length());
+ ceph::bufferlist bl;
+ bl.push_back(bptr);
+ return bl;
+ }
+
+ void apply_delta_and_adjust_crc(
+ paddr_t base, const ceph::bufferlist &_bl) final {
+ assert(_bl.length());
+ ceph::bufferlist bl = _bl;
+ bl.rebuild();
+ delta_buffer_t buffer;
+ buffer.copy_in(bl.front().c_str(), bl.front().length());
+ buffer.replay(*this);
+ set_last_committed_crc(get_crc32c());
+ resolve_relative_addrs(base);
+ }
+
+ extent_types_t get_type() const final {
+ return type;
+ }
+
+ std::ostream &print_detail(std::ostream &out) const final;
+
+ bool at_max_capacity() const final {
+ return get_size() == get_capacity();
+ }
+
+ bool at_min_capacity() const final {
+ return get_size() == (get_capacity() / 2);
+ }
+
+ /// returns iterators <lb, ub> containing addresses [l, r)
+ std::pair<internal_iterator_t, internal_iterator_t> bound(
+ laddr_t l, laddr_t r) {
+ // TODO: inefficient
+ auto retl = begin();
+ for (; retl != end(); ++retl) {
+ if (retl->get_key() >= l || (retl->get_key() + retl->get_val().len) > l)
+ break;
+ }
+ auto retr = retl;
+ for (; retr != end(); ++retr) {
+ if (retr->get_key() >= r)
+ break;
+ }
+ return std::make_pair(retl, retr);
+ }
+
+ std::pair<internal_iterator_t, internal_iterator_t>
+ get_leaf_entries(laddr_t addr, extent_len_t len);
+};
+using LBALeafNodeRef = TCachedExtentRef<LBALeafNode>;
+
+}
diff --git a/src/crimson/os/seastore/onode.cc b/src/crimson/os/seastore/onode.cc
new file mode 100644
index 000000000..a8b925b70
--- /dev/null
+++ b/src/crimson/os/seastore/onode.cc
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "onode.h"
+#include "include/encoding.h"
+
+namespace crimson::os::seastore {
+
+size_t Onode::size() const
+{
+ return ceph::encoded_sizeof(*this);
+}
+
+void Onode::encode(void* buffer, size_t len)
+{
+ struct [[gnu::packed]] encoded_t {
+ uint8_t struct_v;
+ uint8_t struct_compat;
+ uint32_t struct_len;
+ uint32_t len;
+ char data[];
+ };
+ auto p = reinterpret_cast<encoded_t*>(buffer);
+ assert(std::numeric_limits<uint16_t>::max() >= size());
+ assert(len >= size());
+ p->struct_v = 1;
+ p->struct_compat = 1;
+ p->struct_len = sizeof(encoded_t) + payload.size();
+ p->len = payload.size();
+ std::memcpy(p->data, payload.data(), payload.size());
+}
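+
+// For illustration only, a reader of the buffer written above could recover
+// the payload like this (a sketch; no such helper exists in this file, and it
+// assumes the same packed encoded_t layout as Onode::encode()):
+//
+//   std::string_view decode_onode_payload(const void* buffer)
+//   {
+//     auto p = reinterpret_cast<const encoded_t*>(buffer);
+//     return std::string_view(p->data, p->len);
+//   }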
+
+bool operator==(const Onode& lhs, const Onode& rhs)
+{
+ return lhs.get() == rhs.get();
+}
+
+std::ostream& operator<<(std::ostream &out, const Onode &rhs)
+{
+ return out << rhs.get();
+}
+
+}
+
diff --git a/src/crimson/os/seastore/onode.h b/src/crimson/os/seastore/onode.h
new file mode 100644
index 000000000..4d7783028
--- /dev/null
+++ b/src/crimson/os/seastore/onode.h
@@ -0,0 +1,48 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+#include <limits>
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include "include/buffer.h"
+#include "include/denc.h"
+
+namespace crimson::os::seastore {
+
+// in-memory onode, in addition to the stuff that should be persisted to disk,
+// it may contain intrusive hooks for LRU, rw locks etc
+class Onode : public boost::intrusive_ref_counter<
+ Onode,
+ boost::thread_unsafe_counter>
+{
+public:
+ Onode(std::string_view s)
+ : payload{s}
+ {}
+ size_t size() const;
+ const std::string& get() const {
+ return payload;
+ }
+ void encode(void* buffer, size_t len);
+ DENC(Onode, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.payload, p);
+ DENC_FINISH(p);
+ }
+
+private:
+ // dummy payload
+ std::string payload;
+};
+
+bool operator==(const Onode& lhs, const Onode& rhs);
+std::ostream& operator<<(std::ostream &out, const Onode &rhs);
+using OnodeRef = boost::intrusive_ptr<Onode>;
+}
+
+WRITE_CLASS_DENC(crimson::os::seastore::Onode)
diff --git a/src/crimson/os/seastore/onode_manager.h b/src/crimson/os/seastore/onode_manager.h
new file mode 100644
index 000000000..0a03b7fdf
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager.h
@@ -0,0 +1,57 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <seastar/core/future.hh>
+
+#include "include/buffer_fwd.h"
+#include "include/ceph_assert.h"
+#include "common/hobject.h"
+
+#include "crimson/os/seastore/onode.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/transaction_manager.h"
+#include "crimson/osd/exceptions.h"
+
+namespace crimson::os::seastore {
+
+class OnodeManager {
+public:
+ using open_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ virtual open_ertr::future<OnodeRef> get_or_create_onode(
+ Transaction &trans,
+ const ghobject_t &hoid) {
+ return open_ertr::make_ready_future<OnodeRef>();
+ }
+ virtual open_ertr::future<std::vector<OnodeRef>> get_or_create_onodes(
+ Transaction &trans,
+ const std::vector<ghobject_t> &hoids) {
+ return open_ertr::make_ready_future<std::vector<OnodeRef>>();
+ }
+
+  using write_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ virtual write_ertr::future<> write_dirty(
+ Transaction &trans,
+ const std::vector<OnodeRef> &onodes) {
+ return write_ertr::now();
+ }
+ virtual ~OnodeManager() {}
+};
+using OnodeManagerRef = std::unique_ptr<OnodeManager>;
+
+namespace onode_manager {
+
+inline OnodeManagerRef create_ephemeral() {
+ return OnodeManagerRef();
+}
+
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.cc b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.cc
new file mode 100644
index 000000000..b05ea76a3
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.cc
@@ -0,0 +1,71 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "onode_block.h"
+
+namespace crimson::os::seastore {
+
+ceph::bufferlist OnodeBlock::get_delta()
+{
+ bufferlist bl;
+ assert(deltas.size() <= std::numeric_limits<uint8_t>::max());
+ uint8_t n_deltas = deltas.size();
+ ceph::encode(n_deltas, bl);
+ for (auto& delta : deltas) {
+ delta->encode(bl);
+ }
+ return bl;
+}
+
+void OnodeBlock::logical_on_delta_write()
+{
+ // journal submitted to disk, now update the memory
+ apply_pending_changes(true);
+}
+
+void OnodeBlock::apply_delta(const ceph::bufferlist &bl)
+{
+ assert(deltas.empty());
+
+ auto p = bl.cbegin();
+ uint8_t n_deltas = 0;
+ ceph::decode(n_deltas, p);
+ for (uint8_t i = 0; i < n_deltas; i++) {
+ delta_t delta;
+ delta.decode(p);
+ mutate(std::move(delta));
+ }
+ apply_pending_changes(true);
+}
+
+void OnodeBlock::mutate(delta_t&& d)
+{
+ if (is_initial_pending()) {
+ char* const p = get_bptr().c_str();
+ mutate_func(p, d);
+ }
+ deltas.push_back(std::make_unique<delta_t>(std::move(d)));
+}
+
+void OnodeBlock::apply_pending_changes(bool do_cleanup)
+{
+ if (!is_mutation_pending()) {
+ return;
+ }
+ if (share_buffer) {
+    // do a deep copy so we can change our own copy
+ get_bptr() = ceph::bufferptr{get_bptr().c_str(),
+ get_bptr().length()};
+ share_buffer = false;
+ }
+ assert(mutate_func);
+ char* const p = get_bptr().c_str();
+ for (auto& delta : deltas) {
+ mutate_func(p, *delta);
+ if (do_cleanup) {
+ delta.reset();
+ }
+ }
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.h b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.h
new file mode 100644
index 000000000..0025d9847
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.h
@@ -0,0 +1,65 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <cstdint>
+#include <boost/container/small_vector.hpp>
+
+#include "crimson/os/seastore/transaction_manager.h"
+#include "onode_delta.h"
+
+namespace crimson::os::seastore {
+
+// TODO s/CachedExtent/LogicalCachedExtent/
+struct OnodeBlock final : LogicalCachedExtent {
+ using Ref = TCachedExtentRef<OnodeBlock>;
+
+ template <typename... T>
+ OnodeBlock(T&&... t) : LogicalCachedExtent(std::forward<T>(t)...) {}
+ OnodeBlock(OnodeBlock&& block) = delete;
+ OnodeBlock(const OnodeBlock& block, CachedExtent::share_buffer_t tag) noexcept
+ : LogicalCachedExtent{block, tag},
+ share_buffer{true}
+ {}
+
+ CachedExtentRef duplicate_for_write() final {
+ return new OnodeBlock{*this, CachedExtent::share_buffer_t{}};
+ }
+
+ // could materialize the pending changes to the underlying buffer here,
+  // but since we write the change to the buffer immediately, let's skip
+ // this for now.
+ void prepare_write() final {}
+
+ // queries
+ static constexpr extent_types_t TYPE = extent_types_t::ONODE_BLOCK;
+ extent_types_t get_type() const final {
+ return TYPE;
+ }
+
+ // have to stash all the changes before on_delta_write() is called,
+ // otherwise we could pollute the extent with pending mutations
+ // before the transaction carrying these mutations is committed to
+ // disk
+ ceph::bufferlist get_delta() final;
+ void logical_on_delta_write() final;
+ void apply_delta(const ceph::bufferlist &bl) final;
+
+ void sync() {
+ apply_pending_changes(false);
+ }
+ void mutate(delta_t&& d);
+ using mutate_func_t = std::function<void (char*, const delta_t&)>;
+ void set_delta_applier(mutate_func_t&& func) {
+ mutate_func = std::move(func);
+ }
+private:
+ // before looking at the extent, we need to make sure the content is up to date
+ void apply_pending_changes(bool do_cleanup);
+  // assuming we don't stash too many deltas in a single block,
+  // otherwise a full-write op is necessary
+ boost::container::small_vector<std::unique_ptr<delta_t>, 2> deltas;
+ mutate_func_t mutate_func;
+ bool share_buffer = false;
+};
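+
+// Typical flow, sketched with hypothetical callers (none of these calls are
+// made in this header): the tree installs its replay function once, records
+// each structural change as a delta, and syncs before reading the buffer.
+//
+//   OnodeBlock::Ref block = ...;             // obtained from the cache
+//   block->set_delta_applier(
+//     [](char* node, const delta_t& d) { /* replay d onto the node buffer */ });
+//   block->mutate(delta_t::remove_from(2));  // stash (and maybe apply) a change
+//   block->sync();                           // make the buffer reflect all deltas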
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.cc b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.cc
new file mode 100644
index 000000000..869685d45
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.cc
@@ -0,0 +1,188 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "onode_delta.h"
+
+delta_t::delta_t(delta_t&& delta)
+{
+ assert(op == op_t::nop);
+ op = delta.op;
+ n = delta.n;
+ oid = std::move(delta.oid);
+ onode = std::move(delta.onode);
+ keys = std::move(delta.keys);
+ cells = std::move(delta.cells);
+ delta.op = op_t::nop;
+}
+
+delta_t& delta_t::operator=(delta_t&& delta)
+{
+ assert(op == op_t::nop);
+ op = delta.op;
+ n = delta.n;
+ oid = std::move(delta.oid);
+ onode = std::move(delta.onode);
+ keys = std::move(delta.keys);
+ cells = std::move(delta.cells);
+ delta.op = op_t::nop;
+ return *this;
+}
+
+delta_t delta_t::nop()
+{
+ return delta_t{op_t::nop};
+}
+
+delta_t delta_t::insert_onode(unsigned slot, const ghobject_t& oid, OnodeRef onode)
+{
+ delta_t delta{op_t::insert_onode};
+ delta.n = slot;
+ delta.oid = oid;
+ delta.onode = onode;
+ return delta;
+}
+
+delta_t delta_t::update_onode(unsigned slot, const ghobject_t& oid, OnodeRef onode)
+{
+ delta_t delta{op_t::update_onode};
+ delta.n = slot;
+ delta.oid = oid;
+ delta.onode = onode;
+ return delta;
+}
+
+delta_t delta_t::insert_child(unsigned slot,
+ const ghobject_t& oid,
+ crimson::os::seastore::laddr_t addr)
+{
+ delta_t delta{op_t::insert_child};
+ delta.n = slot;
+ delta.oid = oid;
+ delta.addr = addr;
+ return delta;
+}
+
+delta_t delta_t::update_key(unsigned slot, const ghobject_t& oid)
+{
+ delta_t delta{op_t::update_key};
+ delta.n = slot;
+ delta.oid = oid;
+ return delta;
+}
+
+delta_t delta_t::shift_left(unsigned n)
+{
+ delta_t delta{op_t::shift_left};
+ delta.n = n;
+ return delta;
+}
+
+delta_t delta_t::trim_right(unsigned n)
+{
+ delta_t delta{op_t::trim_right};
+ delta.n = n;
+ return delta;
+}
+
+delta_t delta_t::insert_front(ceph::buffer::ptr keys,
+ ceph::buffer::ptr cells)
+{
+ delta_t delta{op_t::insert_front};
+ delta.keys = std::move(keys);
+ delta.cells = std::move(cells);
+ return delta;
+}
+
+delta_t delta_t::insert_back(ceph::buffer::ptr keys,
+ ceph::buffer::ptr cells)
+{
+ delta_t delta{op_t::insert_back};
+ delta.keys = std::move(keys);
+ delta.cells = std::move(cells);
+ return delta;
+}
+
+delta_t delta_t::remove_from(unsigned slot)
+{
+ delta_t delta{op_t::remove_from};
+ delta.n = slot;
+ return delta;
+}
+
+void delta_t::encode(ceph::bufferlist& bl)
+{
+  using ceph::encode;
+  // decode() reads the op tag back first, so emit it before the payload
+  encode(op, bl);
+  switch (op) {
+ case op_t::insert_onode:
+ [[fallthrough]];
+ case op_t::update_onode:
+    // the slot # is not encoded, because we can always figure it out
+ // when we have to replay the delta by looking the oid up in the
+ // node block
+ encode(oid, bl);
+ encode(*onode, bl);
+ break;
+  case op_t::insert_child:
+    encode(oid, bl);
+    encode(addr, bl);
+    break;
+  case op_t::update_key:
+ encode(n, bl);
+ encode(oid, bl);
+ break;
+ case op_t::shift_left:
+ encode(n, bl);
+ break;
+ case op_t::trim_right:
+ encode(n, bl);
+ break;
+ case op_t::insert_front:
+ [[fallthrough]];
+ case op_t::insert_back:
+ encode(n, bl);
+ encode(keys, bl);
+ encode(cells, bl);
+ break;
+ case op_t::remove_from:
+ encode(n, bl);
+ break;
+ default:
+ assert(0 == "unknown onode op");
+ }
+}
+
+void delta_t::decode(ceph::bufferlist::const_iterator& p) {
+ using ceph::decode;
+ decode(op, p);
+ switch (op) {
+ case op_t::insert_onode:
+ [[fallthrough]];
+ case op_t::update_onode:
+ decode(oid, p);
+ decode(*onode, p);
+ break;
+  case op_t::insert_child:
+    decode(oid, p);
+    decode(addr, p);
+    break;
+  case op_t::update_key:
+    decode(n, p);
+    decode(oid, p);
+    break;
+ case op_t::shift_left:
+ decode(n, p);
+ break;
+ case op_t::trim_right:
+ decode(n, p);
+ break;
+ case op_t::insert_front:
+ [[fallthrough]];
+ case op_t::insert_back:
+ decode(n, p);
+ decode(keys, p);
+ decode(cells, p);
+ break;
+ case op_t::remove_from:
+ decode(n, p);
+ break;
+ default:
+ assert(0 == "unknown onode op");
+ }
+}
diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.h b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.h
new file mode 100644
index 000000000..3e7e7315e
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.h
@@ -0,0 +1,70 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cstdint>
+
+#include "common/hobject.h"
+#include "include/buffer_fwd.h"
+
+#include "crimson/os/seastore/onode.h"
+#include "crimson/os/seastore/seastore_types.h"
+
+using crimson::os::seastore::OnodeRef;
+
+struct delta_t {
+ enum class op_t : uint8_t {
+ nop,
+ insert_onode,
+ update_onode,
+ insert_child,
+ update_key,
+ shift_left,
+ trim_right,
+ insert_front,
+ insert_back,
+ remove_from,
+ // finer grained op?
+ // - changing the embedded extent map of given oid
+ // - mutating the embedded xattrs of given oid
+ } op = op_t::nop;
+
+ unsigned n = 0;
+ ghobject_t oid;
+ crimson::os::seastore::laddr_t addr = 0;
+ OnodeRef onode;
+ ceph::bufferptr keys;
+ ceph::bufferptr cells;
+
+ delta_t() = default;
+ delta_t(op_t op)
+ : op{op}
+ {}
+ delta_t(delta_t&& delta);
+ delta_t& operator=(delta_t&& delta);
+
+ static delta_t nop();
+ static delta_t insert_onode(unsigned slot, const ghobject_t& oid, OnodeRef onode);
+ static delta_t update_onode(unsigned slot, const ghobject_t& oid, OnodeRef onode);
+ static delta_t insert_child(unsigned slot, const ghobject_t& oid, crimson::os::seastore::laddr_t addr);
+ static delta_t update_key(unsigned slot, const ghobject_t& oid);
+ static delta_t shift_left(unsigned n);
+ static delta_t trim_right(unsigned n);
+ static delta_t insert_front(ceph::buffer::ptr keys,
+ ceph::buffer::ptr cells);
+ static delta_t insert_back(ceph::buffer::ptr keys,
+ ceph::buffer::ptr cells);
+ static delta_t remove_from(unsigned slot);
+
+ // shortcuts
+ static delta_t insert_item(unsigned slot, const ghobject_t& oid, OnodeRef onode) {
+ return insert_onode(slot, oid, onode);
+ }
+ static delta_t insert_item(unsigned slot, const ghobject_t& oid, crimson::os::seastore::laddr_t addr) {
+ return insert_child(slot, oid, addr);
+ }
+
+ void encode(ceph::bufferlist& bl);
+ void decode(ceph::bufferlist::const_iterator& p);
+};
diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.cc b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.cc
new file mode 100644
index 000000000..fdcaa2fcb
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.cc
@@ -0,0 +1,567 @@
+#include "onode_node.h"
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+auto node_t<BlockSize, N, NodeType>::key_at(unsigned slot) const
+ -> std::pair<const key_prefix_t&, const key_suffix_t&>
+{
+ auto& key = keys[slot];
+ if constexpr (item_in_key) {
+ return {key, key_suffix_t{}};
+ } else {
+ auto p = from_end(key.offset);
+ return {key, *reinterpret_cast<const key_suffix_t*>(p)};
+ }
+}
+
+// update an existing oid with the specified item
+template<size_t BlockSize, int N, ntype_t NodeType>
+ghobject_t
+node_t<BlockSize, N, NodeType>::get_oid_at(unsigned slot,
+ const ghobject_t& oid) const
+{
+ auto [prefix, suffix] = key_at(slot);
+ ghobject_t updated = oid;
+ prefix.update_oid(updated);
+ suffix.update_oid(updated);
+ return updated;
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+auto node_t<BlockSize, N, NodeType>::item_at(const key_prefix_t& key) const
+ -> const_item_t
+{
+ if constexpr (item_in_key) {
+ return key.child_addr;
+ } else {
+ assert(key.offset < BlockSize);
+ auto p = from_end(key.offset);
+ auto partial_key = reinterpret_cast<const key_suffix_t*>(p);
+ p += size_of(*partial_key);
+ return *reinterpret_cast<const item_t*>(p);
+ }
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+void node_t<BlockSize, N, NodeType>::dump(std::ostream& os) const
+{
+ for (uint16_t i = 0; i < count; i++) {
+ const auto& [prefix, suffix] = key_at(i);
+ os << " [" << i << '/' << count - 1 << "]\n"
+ << " key1 = (" << prefix << ")\n"
+ << " key2 = (" << suffix << ")\n";
+ const auto& item = item_at(prefix);
+ if (_is_leaf()) {
+ os << " item = " << item << "\n";
+ } else {
+ os << " child = " << std::hex << item << std::dec << "\n";
+ }
+ }
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+char* node_t<BlockSize, N, NodeType>::from_end(uint16_t offset)
+{
+ auto end = reinterpret_cast<char*>(this) + BlockSize;
+ return end - static_cast<int>(offset);
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+const char* node_t<BlockSize, N, NodeType>::from_end(uint16_t offset) const
+{
+ auto end = reinterpret_cast<const char*>(this) + BlockSize;
+ return end - static_cast<int>(offset);
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+uint16_t node_t<BlockSize, N, NodeType>::used_space() const
+{
+ if constexpr (item_in_key) {
+ return count * sizeof(key_prefix_t);
+ } else {
+ if (count) {
+ return keys[count - 1].offset + count * sizeof(key_prefix_t);
+ } else {
+ return 0;
+ }
+ }
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+uint16_t node_t<BlockSize, N, NodeType>::capacity()
+{
+ auto p = reinterpret_cast<node_t*>(0);
+ return BlockSize - (reinterpret_cast<char*>(p->keys) -
+ reinterpret_cast<char*>(p));
+}
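+
+// Equivalently, and only as a sketch (it assumes node_t is standard-layout so
+// offsetof is well-defined here), the same value could be computed without the
+// null-pointer arithmetic above:
+//
+//   return BlockSize - offsetof(node_t, keys);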
+
+// TODO: if it's allowed to update 2 siblings at the same time, we can have
+// B* tree
+template<size_t BlockSize, int N, ntype_t NodeType>
+constexpr uint16_t node_t<BlockSize, N, NodeType>::min_size()
+{
+ return capacity() / 2;
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+constexpr std::pair<int16_t, int16_t>
+node_t<BlockSize, N, NodeType>::bytes_to_add(uint16_t size)
+{
+ assert(size < min_size());
+ return {min_size() - size, capacity() - size};
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+constexpr std::pair<int16_t, int16_t>
+node_t<BlockSize, N, NodeType>::bytes_to_remove(uint16_t size)
+{
+ assert(size > capacity());
+ return {size - capacity(), size - min_size()};
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+size_state_t node_t<BlockSize, N, NodeType>::size_state(uint16_t size) const
+{
+ if (size > capacity()) {
+ return size_state_t::overflow;
+ } else if (size < capacity() / 2) {
+ return size_state_t::underflow;
+ } else {
+ return size_state_t::okay;
+ }
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+bool node_t<BlockSize, N, NodeType>::is_underflow(uint16_t size) const
+{
+ switch (size_state(size)) {
+ case size_state_t::underflow:
+ return true;
+ case size_state_t::okay:
+ return false;
+ default:
+ assert(0);
+ return false;
+ }
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+int16_t node_t<BlockSize, N, NodeType>::size_with_key(unsigned slot,
+ const ghobject_t& oid) const
+{
+ if constexpr (item_in_key) {
+ return capacity();
+ } else {
+ // the size of fixed key does not change
+ [[maybe_unused]] const auto& [prefix, suffix] = key_at(slot);
+ return capacity() + key_suffix_t::size_from(oid) - suffix.size();
+ }
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+ordering_t node_t<BlockSize, N, NodeType>::compare_with_slot(unsigned slot,
+ const ghobject_t& oid) const
+{
+ const auto& [prefix, suffix] = key_at(slot);
+ if (auto result = prefix.compare(oid); result != ordering_t::equivalent) {
+ return result;
+ } else {
+ return suffix.compare(oid);
+ }
+}
+
+/// return the slot number of the first slot that is greater or equal to
+/// key
+template<size_t BlockSize, int N, ntype_t NodeType>
+std::pair<unsigned, bool> node_t<BlockSize, N, NodeType>::lower_bound(const ghobject_t& oid) const
+{
+ unsigned s = 0, e = count;
+ while (s != e) {
+ unsigned mid = (s + e) / 2;
+ switch (compare_with_slot(mid, oid)) {
+ case ordering_t::less:
+ s = ++mid;
+ break;
+ case ordering_t::greater:
+ e = mid;
+ break;
+ case ordering_t::equivalent:
+ assert(mid == 0 || mid < count);
+ return {mid, true};
+ }
+ }
+ return {s, false};
+}
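+
+// For example, with hypothetical keys (not from any test here): if the slots
+// hold {A, C, E}, lower_bound(C) returns {1, true}, lower_bound(D) returns
+// {2, false} (the slot where D would be inserted), and lower_bound(F) returns
+// {3, false}, i.e. one past the last slot.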
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+uint16_t node_t<BlockSize, N, NodeType>::size_of_item(const ghobject_t& oid,
+ const item_t& item)
+{
+ if constexpr (item_in_key) {
+ return sizeof(key_prefix_t);
+ } else {
+ return (sizeof(key_prefix_t) +
+ key_suffix_t::size_from(oid) + size_of(item));
+ }
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+bool node_t<BlockSize, N, NodeType>::is_overflow(const ghobject_t& oid,
+ const item_t& item) const
+{
+ return free_space() < size_of_item(oid, item);
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+bool node_t<BlockSize, N, NodeType>::is_overflow(const ghobject_t& oid,
+ const OnodeRef& item) const
+{
+ return free_space() < (sizeof(key_prefix_t) + key_suffix_t::size_from(oid) + item->size());
+}
+
+// inserts an item into the given slot, pushing all subsequent keys forward
+// @note if the item is not embedded in key, shift the right half as well
+template<size_t BlockSize, int N, ntype_t NodeType>
+void node_t<BlockSize, N, NodeType>::insert_at(unsigned slot,
+ const ghobject_t& oid,
+ const item_t& item)
+{
+ assert(!is_overflow(oid, item));
+ assert(slot <= count);
+ if constexpr (item_in_key) {
+ // shift the keys right
+ key_prefix_t* key = keys + slot;
+ key_prefix_t* last_key = keys + count;
+ std::copy_backward(key, last_key, last_key + 1);
+ key->set(oid, item);
+ } else {
+ const uint16_t size = key_suffix_t::size_from(oid) + size_of(item);
+ uint16_t offset = size;
+ if (slot > 0) {
+ offset += keys[slot - 1].offset;
+ }
+ if (slot < count) {
+ // V
+ // | |... // ...|//////|| |
+ // | |... // ...|//////| | |
+ // shift the partial keys and items left
+ auto first = keys[slot - 1].offset;
+ auto last = keys[count - 1].offset;
+ std::memmove(from_end(last + size), from_end(last), last - first);
+ // shift the keys right and update the pointers
+ for (key_prefix_t* dst = keys + count; dst > keys + slot; dst--) {
+ key_prefix_t* src = dst - 1;
+ *dst = *src;
+ dst->offset += size;
+ }
+ }
+ keys[slot].set(oid, offset);
+ auto p = from_end(offset);
+ auto partial_key = reinterpret_cast<key_suffix_t*>(p);
+ partial_key->set(oid);
+ p += size_of(*partial_key);
+ auto item_ptr = reinterpret_cast<item_t*>(p);
+ *item_ptr = item;
+ }
+ count++;
+ assert(used_space() <= capacity());
+}
+
+// used by InnerNode for updating the keys indexing its children when their
+// lower boundaries are updated
+template<size_t BlockSize, int N, ntype_t NodeType>
+void node_t<BlockSize, N, NodeType>::update_key_at(unsigned slot, const ghobject_t& oid)
+{
+ if constexpr (is_leaf()) {
+ assert(0);
+ } else if constexpr (item_in_key) {
+ keys[slot].update(oid);
+ } else {
+ const auto& [prefix, suffix] = key_at(slot);
+ int16_t delta = key_suffix_t::size_from(oid) - suffix.size();
+ if (delta > 0) {
+ // shift the cells sitting at its left side
+ auto first = keys[slot].offset;
+ auto last = keys[count - 1].offset;
+ std::memmove(from_end(last + delta), from_end(last), last - first);
+ // update the pointers
+ for (key_prefix_t* key = keys + slot; key < keys + count; key++) {
+ key->offset += delta;
+ }
+ }
+ keys[slot].update(oid);
+ auto p = from_end(keys[slot].offset);
+ auto partial_key = reinterpret_cast<key_suffix_t*>(p);
+ partial_key->set(oid);
+ // we don't update item here
+ }
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+std::pair<unsigned, uint16_t>
+node_t<BlockSize, N, NodeType>::calc_grab_front(uint16_t min_grab,
+ uint16_t max_grab) const
+{
+ // TODO: split by likeness
+ uint16_t grabbed = 0;
+ uint16_t used = used_space();
+ int n = 0;
+ for (; n < count; n++) {
+ const auto& [prefix, suffix] = key_at(n);
+ uint16_t to_grab = sizeof(prefix) + size_of(suffix);
+ if constexpr (!item_in_key) {
+ const auto& item = item_at(prefix);
+ to_grab += size_of(item);
+ }
+ if (grabbed + to_grab > max_grab) {
+ break;
+ }
+ grabbed += to_grab;
+ }
+ if (grabbed >= min_grab) {
+ if (n == count) {
+ return {n, grabbed};
+ } else if (!is_underflow(used - grabbed)) {
+ return {n, grabbed};
+ }
+ }
+ return {0, 0};
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+std::pair<unsigned, uint16_t>
+node_t<BlockSize, N, NodeType>::calc_grab_back(uint16_t min_grab,
+ uint16_t max_grab) const
+{
+ // TODO: split by likeness
+ uint16_t grabbed = 0;
+ uint16_t used = used_space();
+ for (int i = count - 1; i >= 0; i--) {
+ const auto& [prefix, suffix] = key_at(i);
+ uint16_t to_grab = sizeof(prefix) + size_of(suffix);
+ if constexpr (!item_in_key) {
+ const auto& item = item_at(prefix);
+ to_grab += size_of(item);
+ }
+ grabbed += to_grab;
+ if (is_underflow(used - grabbed)) {
+ return {0, 0};
+ } else if (grabbed > max_grab) {
+ return {0, 0};
+ } else if (grabbed >= min_grab) {
+ return {i + 1, grabbed};
+ }
+ }
+ return {0, 0};
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+template<int LeftN, class Mover>
+void node_t<BlockSize, N, NodeType>::grab_from_left(node_t<BlockSize, LeftN, NodeType>& left,
+ unsigned n, uint16_t bytes,
+ Mover& mover)
+{
+ // TODO: rebuild keys if moving across different layouts
+ // group by likeness
+ shift_right(n, bytes);
+ mover.move_from(left.count - n, 0, n);
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+template<int RightN, class Mover>
+delta_t node_t<BlockSize, N, NodeType>::acquire_right(node_t<BlockSize, RightN, NodeType>& right,
+ unsigned whoami, Mover& mover)
+{
+ mover.move_from(0, count, right.count);
+ return mover.to_delta();
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+template<int RightN, class Mover>
+void node_t<BlockSize, N, NodeType>::grab_from_right(node_t<BlockSize, RightN, NodeType>& right,
+ unsigned n, uint16_t bytes,
+ Mover& mover)
+{
+ mover.move_from(0, count, n);
+ right.shift_left(n, 0);
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+template<int LeftN, class Mover>
+void node_t<BlockSize, N, NodeType>::push_to_left(node_t<BlockSize, LeftN, NodeType>& left,
+ unsigned n, uint16_t bytes,
+ Mover& mover)
+{
+ left.grab_from_right(*this, n, bytes, mover);
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+template<int RightN, class Mover>
+void node_t<BlockSize, N, NodeType>::push_to_right(node_t<BlockSize, RightN, NodeType>& right,
+ unsigned n, uint16_t bytes,
+ Mover& mover)
+{
+ right.grab_from_left(*this, n, bytes, mover);
+}
+
+// [to, from) are removed, so we need to shift left
+// actually there are only two use cases:
+// - to = 0: for giving elements in bulk
+// - to = from - 1: for removing a single element
+// old: |////|.....| |.....|/|........|
+// new: |.....| |.....||........|
+template<size_t BlockSize, int N, ntype_t NodeType>
+void node_t<BlockSize, N, NodeType>::shift_left(unsigned from, unsigned to)
+{
+ assert(from < count);
+ assert(to < from);
+ if constexpr (item_in_key) {
+ std::copy(keys + from, keys + count, keys + to);
+ } else {
+ const uint16_t cell_hi = keys[count - 1].offset;
+ const uint16_t cell_lo = keys[from - 1].offset;
+ const uint16_t offset_delta = keys[from].offset - keys[to].offset;
+ for (auto src_key = keys + from, dst_key = keys + to;
+ src_key != keys + count;
+ ++src_key, ++dst_key) {
+ // shift the keys left
+ *dst_key = *src_key;
+ // update the pointers
+ dst_key->offset -= offset_delta;
+ }
+ // and cells
+ auto dst = from_end(cell_hi);
+ std::memmove(dst + offset_delta, dst, cell_hi - cell_lo);
+ }
+ count -= (from - to);
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+void node_t<BlockSize, N, NodeType>::insert_front(const ceph::bufferptr& keys_buf,
+ const ceph::bufferptr& cells_buf)
+{
+ unsigned n = keys_buf.length() / sizeof(key_prefix_t);
+ shift_right(n, cells_buf.length());
+ keys_buf.copy_out(0, keys_buf.length(), reinterpret_cast<char*>(keys));
+ if constexpr (item_in_key) {
+ assert(cells_buf.length() == 0);
+ } else {
+ cells_buf.copy_out(0, cells_buf.length(), from_end(keys[n - 1].offset));
+ }
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+void node_t<BlockSize, N, NodeType>::insert_back(const ceph::bufferptr& keys_buf,
+ const ceph::bufferptr& cells_buf)
+{
+ keys_buf.copy_out(0, keys_buf.length(), reinterpret_cast<char*>(keys + count));
+ count += keys_buf.length() / sizeof(key_prefix_t);
+ if constexpr (item_in_key) {
+ assert(cells_buf.length() == 0);
+ } else {
+ cells_buf.copy_out(0, cells_buf.length(), from_end(keys[count - 1].offset));
+ }
+}
+
+// one or more elements are inserted, so we need to shift the elements right
+// actually there are only two use cases:
+// - bytes != 0: for inserting bytes before from
+// - bytes = 0: for inserting a single element before from
+// old: ||.....|
+// new: |/////|.....|
+template<size_t BlockSize, int N, ntype_t NodeType>
+void node_t<BlockSize, N, NodeType>::shift_right(unsigned n, unsigned bytes)
+{
+ assert(bytes + used_space() < capacity());
+  // shift the keys right
+ std::copy_backward(keys, keys + count, keys + count + n);
+ count += n;
+ if constexpr (!item_in_key) {
+ uint16_t cells = keys[count - 1].offset;
+ // copy the partial keys and items
+ std::memmove(from_end(cells + bytes), from_end(cells), cells);
+ // update the pointers
+ for (auto key = keys + n; key < keys + count; ++key) {
+ key->offset += bytes;
+ }
+ }
+}
+
+// shift all keys after slot is removed.
+// @note if the item is not embedded in key, all items sitting at the left
+// side of it will be shifted right
+template<size_t BlockSize, int N, ntype_t NodeType>
+void node_t<BlockSize, N, NodeType>::remove_from(unsigned slot)
+{
+ assert(slot < count);
+ if (unsigned next = slot + 1; next < count) {
+ shift_left(next, slot);
+ } else {
+ // slot is the last one
+ count--;
+ }
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+void node_t<BlockSize, N, NodeType>::trim_right(unsigned n)
+{
+ count = n;
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+void node_t<BlockSize, N, NodeType>::play_delta(const delta_t& delta)
+{
+ switch (delta.op) {
+ case delta_t::op_t::insert_onode:
+ if constexpr (is_leaf()) {
+ auto [slot, found] = lower_bound(delta.oid);
+ assert(!found);
+ assert(delta.onode->size() <= std::numeric_limits<unsigned>::max());
+ ceph::bufferptr buf{static_cast<unsigned>(delta.onode->size())};
+ delta.onode->encode(buf.c_str(), buf.length());
+ auto onode = reinterpret_cast<const onode_t*>(buf.c_str());
+ return insert_at(slot, delta.oid, *onode);
+ } else {
+ throw std::invalid_argument("wrong node type");
+ }
+ case delta_t::op_t::update_onode:
+ // TODO
+ assert(0 == "not implemented");
+ break;
+ case delta_t::op_t::insert_child:
+ if constexpr (is_leaf()) {
+ throw std::invalid_argument("wrong node type");
+ } else {
+ auto [slot, found] = lower_bound(delta.oid);
+ assert(!found);
+      return insert_at(slot, delta.oid, delta.addr);
+ }
+ case delta_t::op_t::update_key:
+ if constexpr (is_leaf()) {
+ throw std::invalid_argument("wrong node type");
+ } else {
+ return update_key_at(delta.n, delta.oid);
+ }
+ case delta_t::op_t::shift_left:
+ return shift_left(delta.n, 0);
+ case delta_t::op_t::trim_right:
+ return trim_right(delta.n);
+ case delta_t::op_t::insert_front:
+ return insert_front(delta.keys, delta.cells);
+ case delta_t::op_t::insert_back:
+ return insert_back(delta.keys, delta.cells);
+ case delta_t::op_t::remove_from:
+ return remove_from(delta.n);
+ default:
+ assert(0 == "unknown onode delta");
+ }
+}
+
+// explicit instantiate the node_t classes used by test_node.cc
+template class node_t<512, 0, ntype_t::inner>;
+template class node_t<512, 0, ntype_t::leaf>;
+template class node_t<512, 1, ntype_t::inner>;
+template class node_t<512, 1, ntype_t::leaf>;
+template class node_t<512, 2, ntype_t::inner>;
+template class node_t<512, 2, ntype_t::leaf>;
+template class node_t<512, 3, ntype_t::inner>;
+template class node_t<512, 3, ntype_t::leaf>;
diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.h b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.h
new file mode 100644
index 000000000..d833a6682
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.h
@@ -0,0 +1,942 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <algorithm>
+#include <cstdint>
+#include <type_traits>
+#include <variant>
+
+#include "common/hobject.h"
+#include "crimson/common/layout.h"
+#include "crimson/os/seastore/onode.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "onode_delta.h"
+
+namespace asci = absl::container_internal;
+
+namespace boost::beast {
+ template<class T>
+ bool operator==(const span<T>& lhs, const span<T>& rhs) {
+ return std::equal(
+ lhs.begin(), lhs.end(),
+ rhs.begin(), rhs.end());
+ }
+}
+
+// on-disk onode
+// it only keeps the bits necessary to rebuild an in-memory onode
+struct [[gnu::packed]] onode_t {
+ onode_t& operator=(const onode_t& onode) {
+ len = onode.len;
+ std::memcpy(data, onode.data, len);
+ return *this;
+ }
+ size_t size() const {
+ return sizeof(*this) + len;
+ }
+ OnodeRef decode() const {
+ return new crimson::os::seastore::Onode(std::string_view{data, len});
+ }
+ uint8_t struct_v = 1;
+ uint8_t struct_compat = 1;
+ // TODO:
+ // - use uint16_t for length, as the size of an onode should be less
+ // than a block (16K for now)
+ // - drop struct_len
+ uint32_t struct_len = 0;
+ uint32_t len;
+ char data[];
+};
+
+static inline std::ostream& operator<<(std::ostream& os, const onode_t& onode) {
+ return os << *onode.decode();
+}
+
+using crimson::os::seastore::laddr_t;
+
+struct [[gnu::packed]] child_addr_t {
+ laddr_t data;
+ child_addr_t(laddr_t data)
+ : data{data}
+ {}
+ child_addr_t& operator=(laddr_t addr) {
+ data = addr;
+ return *this;
+ }
+ laddr_t get() const {
+ return data;
+ }
+ operator laddr_t() const {
+ return data;
+ }
+ size_t size() const {
+ return sizeof(laddr_t);
+ }
+};
+
+// poor man's operator<=>
+enum class ordering_t {
+ less,
+ equivalent,
+ greater,
+};
+
+template<class L, class R>
+ordering_t compare_element(const L& x, const R& y)
+{
+ if constexpr (std::is_arithmetic_v<L>) {
+ static_assert(std::is_arithmetic_v<R>);
+ if (x < y) {
+ return ordering_t::less;
+ } else if (x > y) {
+ return ordering_t::greater;
+ } else {
+ return ordering_t::equivalent;
+ }
+ } else {
+ // string_view::compare(), string::compare(), ...
+ auto result = x.compare(y);
+ if (result < 0) {
+ return ordering_t::less;
+ } else if (result > 0) {
+ return ordering_t::greater;
+ } else {
+ return ordering_t::equivalent;
+ }
+ }
+}
+
+template<typename L, typename R>
+constexpr ordering_t tuple_cmp(const L&, const R&, std::index_sequence<>)
+{
+ return ordering_t::equivalent;
+}
+
+template<typename L, typename R,
+ size_t Head, size_t... Tail>
+constexpr ordering_t tuple_cmp(const L& x, const R& y,
+ std::index_sequence<Head, Tail...>)
+{
+ auto ordering = compare_element(std::get<Head>(x), std::get<Head>(y));
+ if (ordering != ordering_t::equivalent) {
+ return ordering;
+ } else {
+ return tuple_cmp(x, y, std::index_sequence<Tail...>());
+ }
+}
+
+template<typename... Ls, typename... Rs>
+constexpr ordering_t cmp(const std::tuple<Ls...>& x,
+ const std::tuple<Rs...>& y)
+{
+ static_assert(sizeof...(Ls) == sizeof...(Rs));
+ return tuple_cmp(x, y, std::index_sequence_for<Ls...>());
+}
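+
+// Purely illustrative: given int8_t a_shard, b_shard and int64_t a_pool,
+// b_pool, cmp(std::tie(a_shard, a_pool), std::tie(b_shard, b_pool)) compares
+// the shards first and only falls back to the pools when the shards are
+// equivalent, which is how the key prefixes below order objects.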
+
+enum class likes_t {
+ yes,
+ no,
+ maybe,
+};
+
+struct [[gnu::packed]] variable_key_suffix {
+ uint64_t snap;
+ uint64_t gen;
+ uint8_t nspace_len;
+ uint8_t name_len;
+ char data[];
+ struct index_t {
+ enum {
+ nspace_data = 0,
+ name_data = 1,
+ };
+ };
+ using layout_type = asci::Layout<char, char>;
+ layout_type cell_layout() const {
+ return layout_type{nspace_len, name_len};
+ }
+ void set(const ghobject_t& oid) {
+ snap = oid.hobj.snap;
+ gen = oid.generation;
+ nspace_len = oid.hobj.nspace.size();
+ name_len = oid.hobj.oid.name.size();
+ auto layout = cell_layout();
+ std::memcpy(layout.Pointer<index_t::nspace_data>(data),
+ oid.hobj.nspace.data(), oid.hobj.nspace.size());
+ std::memcpy(layout.Pointer<index_t::name_data>(data),
+ oid.hobj.oid.name.data(), oid.hobj.oid.name.size());
+ }
+
+ void update_oid(ghobject_t& oid) const {
+ oid.hobj.snap = snap;
+ oid.generation = gen;
+ oid.hobj.nspace = nspace();
+ oid.hobj.oid.name = name();
+ }
+
+ variable_key_suffix& operator=(const variable_key_suffix& key) {
+ snap = key.snap;
+ gen = key.gen;
+ auto layout = cell_layout();
+ auto nspace = key.nspace();
+ std::copy_n(nspace.data(), nspace.size(),
+ layout.Pointer<index_t::nspace_data>(data));
+ auto name = key.name();
+ std::copy_n(name.data(), name.size(),
+ layout.Pointer<index_t::name_data>(data));
+ return *this;
+ }
+ const std::string_view nspace() const {
+ auto layout = cell_layout();
+ auto nspace = layout.Slice<index_t::nspace_data>(data);
+ return {nspace.data(), nspace.size()};
+ }
+ const std::string_view name() const {
+ auto layout = cell_layout();
+ auto name = layout.Slice<index_t::name_data>(data);
+ return {name.data(), name.size()};
+ }
+ size_t size() const {
+ return sizeof(*this) + nspace_len + name_len;
+ }
+ static size_t size_from(const ghobject_t& oid) {
+ return (sizeof(variable_key_suffix) +
+ oid.hobj.nspace.size() +
+ oid.hobj.oid.name.size());
+ }
+ ordering_t compare(const ghobject_t& oid) const {
+ return cmp(std::tie(nspace(), name(), snap, gen),
+ std::tie(oid.hobj.nspace, oid.hobj.oid.name, oid.hobj.snap.val,
+ oid.generation));
+ }
+ bool likes(const variable_key_suffix& key) const {
+ return nspace() == key.nspace() && name() == key.name();
+ }
+};
+
+static inline std::ostream& operator<<(std::ostream& os, const variable_key_suffix& k) {
+ if (k.snap != CEPH_NOSNAP) {
+ os << "s" << k.snap << ",";
+ }
+ if (k.gen != ghobject_t::NO_GEN) {
+ os << "g" << k.gen << ",";
+ }
+ return os << k.nspace() << "/" << k.name();
+}
+
+// should use [[no_unique_address]] in C++20
+struct empty_key_suffix {
+ static constexpr ordering_t compare(const ghobject_t&) {
+ return ordering_t::equivalent;
+ }
+ static void set(const ghobject_t&) {}
+ static constexpr size_t size() {
+ return 0;
+ }
+ static size_t size_from(const ghobject_t&) {
+ return 0;
+ }
+ static void update_oid(ghobject_t&) {}
+};
+
+static inline std::ostream& operator<<(std::ostream& os, const empty_key_suffix&)
+{
+ return os;
+}
+
+enum class ntype_t : uint8_t {
+ leaf = 0u,
+ inner,
+};
+
+constexpr ntype_t flip_ntype(ntype_t ntype) noexcept
+{
+ if (ntype == ntype_t::leaf) {
+ return ntype_t::inner;
+ } else {
+ return ntype_t::leaf;
+ }
+}
+
+template<int N, ntype_t NodeType>
+struct FixedKeyPrefix {};
+
+template<ntype_t NodeType>
+struct FixedKeyPrefix<0, NodeType>
+{
+ static constexpr bool item_in_key = false;
+ int8_t shard = -1;
+ int64_t pool = -1;
+ uint32_t hash = 0;
+ uint16_t offset = 0;
+
+ FixedKeyPrefix() = default;
+ FixedKeyPrefix(const ghobject_t& oid, uint16_t offset)
+ : shard{oid.shard_id},
+ pool{oid.hobj.pool},
+ hash{oid.hobj.get_hash()},
+ offset{offset}
+ {}
+
+ void set(const ghobject_t& oid, uint16_t new_offset) {
+ shard = oid.shard_id;
+ pool = oid.hobj.pool;
+ hash = oid.hobj.get_hash();
+ offset = new_offset;
+ }
+
+ void set(const FixedKeyPrefix& k, uint16_t new_offset) {
+ shard = k.shard;
+ pool = k.pool;
+ hash = k.hash;
+ offset = new_offset;
+ }
+
+ void update(const ghobject_t& oid) {
+ shard = oid.shard_id;
+ pool = oid.hobj.pool;
+ hash = oid.hobj.get_hash();
+ }
+
+ void update_oid(ghobject_t& oid) const {
+ oid.set_shard(shard_id_t{shard});
+ oid.hobj.pool = pool;
+ oid.hobj.set_hash(hash);
+ }
+
+ ordering_t compare(const ghobject_t& oid) const {
+ // so std::tie() can bind them by reference
+ int8_t rhs_shard = oid.shard_id;
+ uint32_t rhs_hash = oid.hobj.get_hash();
+ return cmp(std::tie(shard, pool, hash),
+ std::tie(rhs_shard, oid.hobj.pool, rhs_hash));
+ }
+  // @return likes_t::yes if this key likes @c k, i.e. entries sharing this
+  // prefix can be pushed down to the next level in the same node
+ likes_t likes(const FixedKeyPrefix& k) const {
+ if (shard == k.shard && pool == k.pool) {
+ return likes_t::yes;
+ } else {
+ return likes_t::no;
+ }
+ }
+};
+
+template<ntype_t NodeType>
+std::ostream& operator<<(std::ostream& os, const FixedKeyPrefix<0, NodeType>& k) {
+ if (k.shard != shard_id_t::NO_SHARD) {
+ os << "s" << k.shard;
+ }
+ return os << "p=" << k.pool << ","
+ << "h=" << std::hex << k.hash << std::dec << ","
+ << ">" << k.offset;
+}
+
+// all elements in this node share the same <shard, pool>
+template<ntype_t NodeType>
+struct FixedKeyPrefix<1, NodeType> {
+ static constexpr bool item_in_key = false;
+ uint32_t hash = 0;
+ uint16_t offset = 0;
+
+ FixedKeyPrefix() = default;
+ FixedKeyPrefix(uint32_t hash, uint16_t offset)
+ : hash{hash},
+ offset{offset}
+ {}
+ FixedKeyPrefix(const ghobject_t& oid, uint16_t offset)
+ : FixedKeyPrefix(oid.hobj.get_hash(), offset)
+ {}
+ void set(const ghobject_t& oid, uint16_t new_offset) {
+ hash = oid.hobj.get_hash();
+ offset = new_offset;
+ }
+ template<int N>
+ void set(const FixedKeyPrefix<N, NodeType>& k, uint16_t new_offset) {
+ static_assert(N < 2, "only N0, N1 have hash");
+ hash = k.hash;
+ offset = new_offset;
+ }
+ void update_oid(ghobject_t& oid) const {
+ oid.hobj.set_hash(hash);
+ }
+ void update(const ghobject_t& oid) {
+ hash = oid.hobj.get_hash();
+ }
+ ordering_t compare(const ghobject_t& oid) const {
+ return compare_element(hash, oid.hobj.get_hash());
+ }
+ likes_t likes(const FixedKeyPrefix& k) const {
+ return hash == k.hash ? likes_t::yes : likes_t::no;
+ }
+};
+
+template<ntype_t NodeType>
+std::ostream& operator<<(std::ostream& os, const FixedKeyPrefix<1, NodeType>& k) {
+ return os << "0x" << std::hex << k.hash << std::dec << ","
+ << ">" << k.offset;
+}
+
+// all elements in this node must share the same <shard, pool, hash>
+template<ntype_t NodeType>
+struct FixedKeyPrefix<2, NodeType> {
+ static constexpr bool item_in_key = false;
+ uint16_t offset = 0;
+
+ FixedKeyPrefix() = default;
+
+ static constexpr ordering_t compare(const ghobject_t& oid) {
+ // need to compare the cell
+ return ordering_t::equivalent;
+ }
+ // always defer to my cell for likeness
+ constexpr likes_t likes(const FixedKeyPrefix&) const {
+ return likes_t::maybe;
+ }
+ void set(const ghobject_t&, uint16_t new_offset) {
+ offset = new_offset;
+ }
+ template<int N>
+ void set(const FixedKeyPrefix<N, NodeType>&, uint16_t new_offset) {
+ offset = new_offset;
+ }
+ void update(const ghobject_t&) {}
+ void update_oid(ghobject_t&) const {}
+};
+
+template<ntype_t NodeType>
+std::ostream& operator<<(std::ostream& os, const FixedKeyPrefix<2, NodeType>& k) {
+ return os << ">" << k.offset;
+}
+
+struct fixed_key_3 {
+ uint64_t snap = 0;
+ uint64_t gen = 0;
+
+ fixed_key_3() = default;
+ fixed_key_3(const ghobject_t& oid)
+ : snap{oid.hobj.snap}, gen{oid.generation}
+ {}
+ ordering_t compare(const ghobject_t& oid) const {
+ return cmp(std::tie(snap, gen),
+ std::tie(oid.hobj.snap.val, oid.generation));
+ }
+  // no two objects like each other at this level
+ constexpr likes_t likes(const fixed_key_3&) const {
+ return likes_t::no;
+ }
+ void update_with_oid(const ghobject_t& oid) {
+ snap = oid.hobj.snap;
+ gen = oid.generation;
+ }
+ void update_oid(ghobject_t& oid) const {
+ oid.hobj.snap = snap;
+ oid.generation = gen;
+ }
+};
+
+static inline std::ostream& operator<<(std::ostream& os, const fixed_key_3& k) {
+ if (k.snap != CEPH_NOSNAP) {
+ os << "s" << k.snap << ",";
+ }
+ if (k.gen != ghobject_t::NO_GEN) {
+ os << "g" << k.gen << ",";
+ }
+ return os;
+}
+
+// all elements in this node must share the same <shard, pool, hash, namespace, oid>
+// but unlike the other FixedKeyPrefix<> layouts, a node with FixedKeyPrefix<3> does
+// not have a variable_key_suffix, so if it is an inner node, we can just embed the
+// child addr right in the key.
+template<>
+struct FixedKeyPrefix<3, ntype_t::inner> : public fixed_key_3 {
+ // the item is embedded in the key
+ static constexpr bool item_in_key = true;
+ laddr_t child_addr = 0;
+
+ FixedKeyPrefix() = default;
+ void set(const ghobject_t& oid, laddr_t new_child_addr) {
+ update_with_oid(oid);
+ child_addr = new_child_addr;
+ }
+  // unlikely to get called, though..
+ void update(const ghobject_t& oid) {}
+ template<int N>
+ std::enable_if_t<N < 3> set(const FixedKeyPrefix<N, ntype_t::inner>&,
+ laddr_t new_child_addr) {
+ child_addr = new_child_addr;
+ }
+ void set(const FixedKeyPrefix& k, laddr_t new_child_addr) {
+ snap = k.snap;
+ gen = k.gen;
+ child_addr = new_child_addr;
+ }
+ void set(const variable_key_suffix& k, laddr_t new_child_addr) {
+ snap = k.snap;
+ gen = k.gen;
+ child_addr = new_child_addr;
+ }
+};
+
+template<>
+struct FixedKeyPrefix<3, ntype_t::leaf> : public fixed_key_3 {
+ static constexpr bool item_in_key = false;
+ uint16_t offset = 0;
+
+ FixedKeyPrefix() = default;
+ void set(const ghobject_t& oid, uint16_t new_offset) {
+ update_with_oid(oid);
+ offset = new_offset;
+ }
+ void set(const FixedKeyPrefix& k, uint16_t new_offset) {
+ snap = k.snap;
+ gen = k.gen;
+ offset = new_offset;
+ }
+ template<int N>
+ std::enable_if_t<N < 3> set(const FixedKeyPrefix<N, ntype_t::leaf>&,
+ uint16_t new_offset) {
+ offset = new_offset;
+ }
+};
+
+struct tag_t {
+ template<int N, ntype_t node_type>
+ static constexpr tag_t create() {
+ static_assert(std::clamp(N, 0, 3) == N);
+ return tag_t{N, static_cast<uint8_t>(node_type)};
+ }
+ bool is_leaf() const {
+ return type() == ntype_t::leaf;
+ }
+ int layout() const {
+ return layout_type;
+ }
+ ntype_t type() const {
+ return ntype_t{node_type};
+ }
+ int layout_type : 4;
+ uint8_t node_type : 4;
+};
+
+static inline std::ostream& operator<<(std::ostream& os, const tag_t& tag) {
+ return os << "n=" << tag.layout() << ", leaf=" << tag.is_leaf();
+}
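+
+// An illustrative example of the tag encoding: tag_t::create<1, ntype_t::leaf>()
+// packs the two 4-bit fields so that layout() == 1 and is_leaf() == true, and
+// the operator<< above would print it as "n=1, leaf=1".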
+
+// for calculating size of variable-sized item/key
+template<class T>
+size_t size_of(const T& t) {
+ using decayed_t = std::decay_t<T>;
+ if constexpr (std::is_scalar_v<decayed_t>) {
+ return sizeof(decayed_t);
+ } else {
+ return t.size();
+ }
+}
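+
+// A quick illustration: size_of(uint32_t{0}) yields sizeof(uint32_t), while for
+// any non-scalar type the call forwards to the object's own size() member, so
+// fixed-size values and variable-sized items can be measured with the same
+// helper.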
+
+enum class size_state_t {
+ okay,
+ underflow,
+ overflow,
+};
+
+// layout of a node of B+ tree
+//
+// it is different from a typical B+ tree in the following ways:
+// - the size of keys is not necessarily fixed, and neither is the size of values.
+// - the max number of elements in a node is determined by the total size of
+//   the keys and values in the node
+// - in internal nodes, each key maps to the logical address of the child
+//   node whose minimum key is greater than or equal to that key.
+template<size_t BlockSize,
+ int N,
+ ntype_t NodeType>
+struct node_t {
+ static_assert(std::clamp(N, 0, 3) == N);
+ constexpr static ntype_t node_type = NodeType;
+ constexpr static int node_n = N;
+
+ using key_prefix_t = FixedKeyPrefix<N, NodeType>;
+ using item_t = std::conditional_t<NodeType == ntype_t::leaf,
+ onode_t,
+ child_addr_t>;
+ using const_item_t = std::conditional_t<NodeType == ntype_t::leaf,
+ const onode_t&,
+ child_addr_t>;
+ static constexpr bool item_in_key = key_prefix_t::item_in_key;
+ using key_suffix_t = std::conditional_t<N < 3,
+ variable_key_suffix,
+ empty_key_suffix>;
+
+ std::pair<const key_prefix_t&, const key_suffix_t&>
+ key_at(unsigned slot) const;
+
+  // return a copy of @c oid updated with the key stored at the given slot
+ ghobject_t get_oid_at(unsigned slot, const ghobject_t& oid) const;
+ const_item_t item_at(const key_prefix_t& key) const;
+ void dump(std::ostream& os) const;
+
+ // for debugging only.
+ static constexpr bool is_leaf() {
+ return node_type == ntype_t::leaf;
+ }
+
+ bool _is_leaf() const {
+ return tag.is_leaf();
+ }
+
+ char* from_end(uint16_t offset);
+ const char* from_end(uint16_t offset) const;
+ uint16_t used_space() const;
+ uint16_t free_space() const {
+ return capacity() - used_space();
+ }
+ static uint16_t capacity();
+ // TODO: if it's allowed to update 2 siblings at the same time, we can have
+  // a B* tree
+ static constexpr uint16_t min_size();
+
+
+  // calculate the allowable bounds on bytes to remove from an overflowed node
+  // of the specified size
+ // @param size the overflowed size
+ // @return <minimum bytes to grab, maximum bytes to grab>
+ static constexpr std::pair<int16_t, int16_t> bytes_to_remove(uint16_t size);
+
+  // calculate the allowable bounds on bytes to add to an underflowed node
+  // of the specified size
+ // @param size the underflowed size
+ // @return <minimum bytes to push, maximum bytes to push>
+ static constexpr std::pair<int16_t, int16_t> bytes_to_add(uint16_t size);
+
+ size_state_t size_state(uint16_t size) const;
+ bool is_underflow(uint16_t size) const;
+ int16_t size_with_key(unsigned slot, const ghobject_t& oid) const;
+ ordering_t compare_with_slot(unsigned slot, const ghobject_t& oid) const;
+  /// return the slot number of the first slot whose key is greater than or
+  /// equal to the given key
+ std::pair<unsigned, bool> lower_bound(const ghobject_t& oid) const;
+ static uint16_t size_of_item(const ghobject_t& oid, const item_t& item);
+ bool is_overflow(const ghobject_t& oid, const item_t& item) const;
+ bool is_overflow(const ghobject_t& oid, const OnodeRef& item) const;
+
+ // inserts an item into the given slot, pushing all subsequent keys forward
+ // @note if the item is not embedded in key, shift the right half as well
+ void insert_at(unsigned slot, const ghobject_t& oid, const item_t& item);
+  // used by the inner node to update the keys indexing its children when their
+  // lower boundaries are updated
+ void update_key_at(unsigned slot, const ghobject_t& oid);
+  // figure out the number of elements and their total size to move from the
+  // front of this node when rebalancing because its left sibling node has
+  // underflowed
+  //
+  // @param min_grab lower bound of the number of bytes to move
+  // @param max_grab upper bound of the number of bytes to move
+  // @return the number of elements to grab and their total size
+  // @note return {0, 0} if the current node would underflow after
+  //   @c min_grab bytes of elements are taken from it
+ std::pair<unsigned, uint16_t> calc_grab_front(uint16_t min_grab, uint16_t max_grab) const;
+  // figure out the number of elements and their total size to move from the
+  // end of this node when rebalancing because its right sibling node has
+  // underflowed
+  //
+  // @param min_grab lower bound of the number of bytes to move
+  // @param max_grab upper bound of the number of bytes to move
+  // @return the number of elements to grab and their total size
+  // @note return {0, 0} if the current node would underflow after
+  //   @c min_grab bytes of elements are taken from it
+ std::pair<unsigned, uint16_t> calc_grab_back(uint16_t min_grab, uint16_t max_grab) const;
+ template<int LeftN, class Mover> void grab_from_left(
+ node_t<BlockSize, LeftN, NodeType>& left,
+ unsigned n, uint16_t bytes,
+ Mover& mover);
+ template<int RightN, class Mover>
+ delta_t acquire_right(node_t<BlockSize, RightN, NodeType>& right,
+ unsigned whoami, Mover& mover);
+  // transfer n elements at the front of the given node to me
+ template<int RightN, class Mover>
+ void grab_from_right(node_t<BlockSize, RightN, NodeType>& right,
+ unsigned n, uint16_t bytes,
+ Mover& mover);
+ template<int LeftN, class Mover>
+ void push_to_left(node_t<BlockSize, LeftN, NodeType>& left,
+ unsigned n, uint16_t bytes,
+ Mover& mover);
+ template<int RightN, class Mover>
+ void push_to_right(node_t<BlockSize, RightN, NodeType>& right,
+ unsigned n, uint16_t bytes,
+ Mover& mover);
+ // [to, from) are removed, so we need to shift left
+ // actually there are only two use cases:
+  // - to = 0: for giving away elements in bulk
+ // - to = from - 1: for removing a single element
+ // old: |////|.....| |.....|/|........|
+ // new: |.....| |.....||........|
+ void shift_left(unsigned from, unsigned to);
+ void insert_front(const ceph::bufferptr& keys_buf, const ceph::bufferptr& cells_buf);
+ void insert_back(const ceph::bufferptr& keys_buf, const ceph::bufferptr& cells_buf);
+ // one or more elements are inserted, so we need to shift the elements right
+ // actually there are only two use cases:
+ // - bytes != 0: for inserting bytes before from
+ // - bytes = 0: for inserting a single element before from
+ // old: ||.....|
+ // new: |/////|.....|
+ void shift_right(unsigned n, unsigned bytes);
+ // shift all keys after slot is removed.
+  // @note if the item is not embedded in the key, all items sitting at the left
+ // side of it will be shifted right
+ void remove_from(unsigned slot);
+ void trim_right(unsigned n);
+ void play_delta(const delta_t& delta);
+ // /-------------------------------|
+ // | V
+ // |header|k0|k1|k2|... | / / |k2'v2|k1'v1|k0'.v0| v_m |
+ // |<-- count -->|
+ tag_t tag = tag_t::create<N, NodeType>();
+ // the count of values in the node
+ uint16_t count = 0;
+ key_prefix_t keys[];
+};
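+
+// A sketch of how a cell would be located under the layout diagrammed above:
+// keys grow forward after the header while cells grow backward from the end of
+// the block, and keys[i].offset is the cumulative distance of slot i's cell
+// from the node end, so roughly (illustrative, not part of the interface):
+//
+//   const char* cell_begin = node.from_end(node.keys[i].offset);
+//   const char* cell_end = node.from_end(i > 0 ? node.keys[i - 1].offset : 0);
+//
+// This is the same arithmetic the EntryMover specializations below rely on when
+// copying cells in bulk.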
+
+template<class parent_t,
+ class from_t,
+ class to_t,
+ typename=void>
+class EntryMover {
+public:
+ // a "trap" mover
+ EntryMover(const parent_t&, from_t&, to_t& dst, unsigned) {
+ assert(0);
+ }
+ void move_from(unsigned, unsigned, unsigned) {
+ assert(0);
+ }
+ delta_t get_delta() {
+ return delta_t::nop();
+ }
+};
+
+// lower the layout, for instance, from L0 to L1, no reference oid is used
+template<class parent_t,
+ class from_t,
+ class to_t>
+class EntryMover<parent_t,
+ from_t,
+ to_t,
+ std::enable_if_t<from_t::node_n < to_t::node_n>>
+{
+public:
+ EntryMover(const parent_t&, from_t& src, to_t& dst, unsigned)
+ : src{src}, dst{dst}
+ {}
+ void move_from(unsigned src_first, unsigned dst_first, unsigned n)
+ {
+ ceph::bufferptr keys_buf{n * sizeof(to_t::key_prefix_t)};
+ ceph::bufferptr cells_buf;
+ auto dst_keys = reinterpret_cast<typename to_t::key_prefix_t*>(keys_buf.c_str());
+ if constexpr (to_t::item_in_key) {
+ for (unsigned i = 0; i < n; i++) {
+ const auto& [prefix, suffix] = src.key_at(src_first + i);
+ dst_keys[i].set(suffix, src.item_at(prefix));
+ }
+ } else {
+ // copy keys
+ uint16_t src_offset = src_first > 0 ? src.keys[src_first - 1].offset : 0;
+ uint16_t dst_offset = dst_first > 0 ? dst.keys[dst_first - 1].offset : 0;
+ for (unsigned i = 0; i < n; i++) {
+ auto& src_key = src.keys[src_first + i];
+ uint16_t offset = src_key.offset - src_offset + dst_offset;
+ dst_keys[i].set(src_key, offset);
+ }
+ // copy cells in bulk, yay!
+ auto src_end = src.keys[src_first + n - 1].offset;
+ uint16_t total_cell_size = src_end - src_offset;
+ cells_buf = ceph::bufferptr{total_cell_size};
+ cells_buf.copy_in(0, total_cell_size, src.from_end(src_end));
+ }
+ if (dst_first == dst.count) {
+ dst_delta = delta_t::insert_back(keys_buf, cells_buf);
+ } else {
+ dst_delta = delta_t::insert_front(keys_buf, cells_buf);
+ }
+ if (src_first > 0 && src_first + n == src.count) {
+ src_delta = delta_t::trim_right(src_first);
+ } else if (src_first == 0 && n < src.count) {
+ src_delta = delta_t::shift_left(n);
+ } else if (src_first == 0 && n == src.count) {
+ // the caller will retire the src extent
+ } else {
+ // grab in the middle?
+ assert(0);
+ }
+ }
+
+ delta_t from_delta() {
+ return std::move(src_delta);
+ }
+ delta_t to_delta() {
+ return std::move(dst_delta);
+ }
+private:
+ const from_t& src;
+ const to_t& dst;
+ delta_t dst_delta;
+ delta_t src_delta;
+};
+
+// lift the layout, for instance, from L2 to L0; a reference oid is needed
+template<class parent_t,
+ class from_t,
+ class to_t>
+class EntryMover<parent_t, from_t, to_t,
+ std::enable_if_t<(from_t::node_n > to_t::node_n)>>
+{
+public:
+ EntryMover(const parent_t& parent, from_t& src, to_t& dst, unsigned from_slot)
+ : src{src}, dst{dst}, ref_oid{parent->get_oid_at(from_slot, {})}
+ {}
+ void move_from(unsigned src_first, unsigned dst_first, unsigned n)
+ {
+ ceph::bufferptr keys_buf{n * sizeof(to_t::key_prefix_t)};
+ ceph::bufferptr cells_buf;
+ auto dst_keys = reinterpret_cast<typename to_t::key_prefix_t*>(keys_buf.c_str());
+ uint16_t in_node_offset = dst_first > 0 ? dst.keys[dst_first - 1].offset : 0;
+ static_assert(!std::is_same_v<typename to_t::key_suffix_t, empty_key_suffix>);
+ // copy keys
+ uint16_t buf_offset = 0;
+ for (unsigned i = 0; i < n; i++) {
+ auto& src_key = src.keys[src_first + i];
+ if constexpr (std::is_same_v<typename from_t::key_suffix_t, empty_key_suffix>) {
+ // heterogeneous partial key, have to rebuild dst partial key from oid
+ src_key.update_oid(ref_oid);
+ const auto& src_item = src.item_at(src_key);
+ size_t key2_size = to_t::key_suffix_t::size_from(ref_oid);
+ buf_offset += key2_size + size_of(src_item);
+ dst_keys[i].set(ref_oid, in_node_offset + buf_offset);
+ auto p = from_end(cells_buf, buf_offset);
+ auto partial_key = reinterpret_cast<typename to_t::key_suffix_t*>(p);
+ partial_key->set(ref_oid);
+ p += key2_size;
+ auto dst_item = reinterpret_cast<typename to_t::item_t*>(p);
+ *dst_item = src_item;
+ } else {
+ // homogeneous partial key, just update the pointers
+ uint16_t src_offset = src_first > 0 ? src.keys[src_first - 1].offset : 0;
+ uint16_t dst_offset = dst_first > 0 ? dst.keys[dst_first - 1].offset : 0;
+ uint16_t offset = src_key.offset - src_offset + dst_offset;
+ dst_keys[i].set(ref_oid, in_node_offset + offset);
+ }
+ }
+ if constexpr (std::is_same_v<typename to_t::key_suffix_t,
+ typename from_t::key_suffix_t>) {
+ // copy cells in bulk, yay!
+ uint16_t src_offset = src_first > 0 ? src.keys[src_first - 1].offset : 0;
+ uint16_t src_end = src.keys[src_first + n - 1].offset;
+ uint16_t total_cell_size = src_end - src_offset;
+ cells_buf.copy_in(0, total_cell_size, src.from_end(src_end));
+ }
+ if (dst_first == dst.count) {
+ dst_delta = delta_t::insert_back(keys_buf, cells_buf);
+ } else {
+ dst_delta = delta_t::insert_front(keys_buf, cells_buf);
+ }
+ if (src_first + n == src.count && src_first > 0) {
+ src_delta = delta_t::trim_right(src_first);
+ } else {
+ // the caller will retire the src extent
+ assert(src_first == 0);
+ }
+ }
+
+ delta_t from_delta() {
+ return std::move(src_delta);
+ }
+ delta_t to_delta() {
+ return std::move(dst_delta);
+ }
+private:
+ char* from_end(ceph::bufferptr& ptr, uint16_t offset) {
+ return ptr.end_c_str() - static_cast<int>(offset);
+ }
+private:
+ const from_t& src;
+ const to_t& dst;
+ delta_t dst_delta;
+ delta_t src_delta;
+ ghobject_t ref_oid;
+};
+
+// identical layout, yay!
+template<class parent_t,
+ class child_t>
+class EntryMover<parent_t, child_t, child_t>
+{
+public:
+ EntryMover(const parent_t&, child_t& src, child_t& dst, unsigned)
+ : src{src}, dst{dst}
+ {}
+
+ void move_from(unsigned src_first, unsigned dst_first, unsigned n)
+ {
+ ceph::bufferptr keys_buf{static_cast<unsigned>(n * sizeof(typename child_t::key_prefix_t))};
+ ceph::bufferptr cells_buf;
+ auto dst_keys = reinterpret_cast<typename child_t::key_prefix_t*>(keys_buf.c_str());
+
+ // copy keys
+ std::copy(src.keys + src_first, src.keys + src_first + n,
+ dst_keys);
+ if constexpr (!child_t::item_in_key) {
+ uint16_t src_offset = src_first > 0 ? src.keys[src_first - 1].offset : 0;
+ uint16_t dst_offset = dst_first > 0 ? dst.keys[dst_first - 1].offset : 0;
+ const int offset_delta = dst_offset - src_offset;
+ // update the pointers
+ for (unsigned i = 0; i < n; i++) {
+ dst_keys[i].offset += offset_delta;
+ }
+ // copy cells in bulk, yay!
+ auto src_end = src.keys[src_first + n - 1].offset;
+ uint16_t total_cell_size = src_end - src_offset;
+ cells_buf = ceph::bufferptr{total_cell_size};
+ cells_buf.copy_in(0, total_cell_size, src.from_end(src_end));
+ }
+ if (dst_first == dst.count) {
+ dst_delta = delta_t::insert_back(std::move(keys_buf), std::move(cells_buf));
+ } else {
+ dst_delta = delta_t::insert_front(std::move(keys_buf), std::move(cells_buf));
+ }
+ if (src_first + n == src.count && src_first > 0) {
+ src_delta = delta_t::trim_right(n);
+ } else if (src_first == 0 && n < src.count) {
+ src_delta = delta_t::shift_left(n);
+ } else if (src_first == 0 && n == src.count) {
+ // the caller will retire the src extent
+ } else {
+ // grab in the middle?
+ assert(0);
+ }
+ }
+
+ delta_t from_delta() {
+ return std::move(src_delta);
+ }
+
+ delta_t to_delta() {
+ return std::move(dst_delta);
+ }
+private:
+ char* from_end(ceph::bufferptr& ptr, uint16_t offset) {
+ return ptr.end_c_str() - static_cast<int>(offset);
+ }
+private:
+ const child_t& src;
+ const child_t& dst;
+ delta_t src_delta;
+ delta_t dst_delta;
+};
+
+template<class parent_t, class from_t, class to_t>
+EntryMover<parent_t, from_t, to_t>
+make_mover(const parent_t& parent, from_t& src, to_t& dst, unsigned from_slot) {
+ return EntryMover<parent_t, from_t, to_t>(parent, src, dst, from_slot);
+}
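+
+// As an illustration of how the specializations above get selected (node names
+// hypothetical): for leaf nodes n1 and n0 with n1.node_n == 1 and n0.node_n == 0,
+// make_mover(parent, n1, n0, slot) instantiates the "lift" EntryMover
+// (from_t::node_n > to_t::node_n), which needs the parent to reconstruct a
+// reference oid; moving between two nodes of the same layout picks the
+// identical-layout specialization, which copies the keys directly and only
+// fixes up their offsets.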
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/fwd.h b/src/crimson/os/seastore/onode_manager/staged-fltree/fwd.h
new file mode 100644
index 000000000..4908c691f
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/fwd.h
@@ -0,0 +1,93 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <algorithm>
+#include <cstring>
+#include <limits>
+#include <memory>
+#include <string>
+
+#include "crimson/common/errorator.h"
+#include "crimson/os/seastore/cached_extent.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/transaction.h"
+
+namespace crimson::os::seastore::onode {
+
+using crimson::os::seastore::Transaction;
+using crimson::os::seastore::TransactionRef;
+using crimson::os::seastore::make_transaction;
+using crimson::os::seastore::laddr_t;
+using crimson::os::seastore::L_ADDR_MIN;
+using crimson::os::seastore::L_ADDR_NULL;
+using crimson::os::seastore::extent_len_t;
+
+class DeltaRecorder;
+class NodeExtent;
+class NodeExtentManager;
+class RootNodeTracker;
+using DeltaRecorderURef = std::unique_ptr<DeltaRecorder>;
+using NodeExtentRef = crimson::os::seastore::TCachedExtentRef<NodeExtent>;
+using NodeExtentManagerURef = std::unique_ptr<NodeExtentManager>;
+using RootNodeTrackerURef = std::unique_ptr<RootNodeTracker>;
+struct context_t {
+ NodeExtentManager& nm;
+ Transaction& t;
+};
+
+class LeafNodeImpl;
+class InternalNodeImpl;
+class NodeImpl;
+using LeafNodeImplURef = std::unique_ptr<LeafNodeImpl>;
+using InternalNodeImplURef = std::unique_ptr<InternalNodeImpl>;
+using NodeImplURef = std::unique_ptr<NodeImpl>;
+
+using level_t = uint8_t;
+// a type only to index within a node, 32 bits should be enough
+using index_t = uint32_t;
+constexpr auto INDEX_END = std::numeric_limits<index_t>::max();
+constexpr auto INDEX_LAST = INDEX_END - 0x4;
+constexpr auto INDEX_UPPER_BOUND = INDEX_END - 0x8;
+inline bool is_valid_index(index_t index) { return index < INDEX_UPPER_BOUND; }
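+// e.g. is_valid_index(0) and is_valid_index(INDEX_UPPER_BOUND - 1) hold, while
+// the sentinel values INDEX_LAST and INDEX_END sit above INDEX_UPPER_BOUND and
+// are therefore never valid indexes.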
+
+// TODO: decide by NODE_BLOCK_SIZE
+using node_offset_t = uint16_t;
+constexpr node_offset_t DISK_BLOCK_SIZE = 1u << 12;
+constexpr node_offset_t NODE_BLOCK_SIZE = DISK_BLOCK_SIZE * 1u;
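+// i.e. with the constants above a node occupies a single 4 KiB (1 << 12) disk
+// block, which comfortably fits within the uint16_t range of node_offset_t.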
+
+enum class MatchKindBS : int8_t { NE = -1, EQ = 0 };
+
+enum class MatchKindCMP : int8_t { LT = -1, EQ = 0, GT };
+inline MatchKindCMP toMatchKindCMP(int value) {
+ if (value > 0) {
+ return MatchKindCMP::GT;
+ } else if (value < 0) {
+ return MatchKindCMP::LT;
+ } else {
+ return MatchKindCMP::EQ;
+ }
+}
+template <typename Type>
+MatchKindCMP toMatchKindCMP(const Type& l, const Type& r) {
+ int match = l - r;
+ return toMatchKindCMP(match);
+}
+
+inline MatchKindCMP toMatchKindCMP(
+ std::string_view l, std::string_view r) {
+ return toMatchKindCMP(l.compare(r));
+}
+
+inline MatchKindCMP reverse(MatchKindCMP cmp) {
+ if (cmp == MatchKindCMP::LT) {
+ return MatchKindCMP::GT;
+ } else if (cmp == MatchKindCMP::GT) {
+ return MatchKindCMP::LT;
+ } else {
+ return cmp;
+ }
+}
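+
+// A couple of illustrative cases: toMatchKindCMP(std::string_view{"a"},
+// std::string_view{"b"}) yields MatchKindCMP::LT because
+// std::string_view::compare() returns a negative value, and
+// reverse(MatchKindCMP::LT) maps it to MatchKindCMP::GT.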
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc
new file mode 100644
index 000000000..3df458f08
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc
@@ -0,0 +1,809 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "node.h"
+
+#include <cassert>
+#include <exception>
+#include <sstream>
+
+#include "common/likely.h"
+#include "crimson/common/log.h"
+#include "node_extent_manager.h"
+#include "node_impl.h"
+#include "stages/node_stage_layout.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+}
+
+namespace crimson::os::seastore::onode {
+
+using node_ertr = Node::node_ertr;
+template <class ValueT=void>
+using node_future = Node::node_future<ValueT>;
+
+/*
+ * tree_cursor_t
+ */
+
+tree_cursor_t::tree_cursor_t(Ref<LeafNode> node, const search_position_t& pos)
+ : leaf_node{node}, position{pos} {
+ assert(!is_end());
+ leaf_node->do_track_cursor<true>(*this);
+}
+
+tree_cursor_t::tree_cursor_t(
+ Ref<LeafNode> node, const search_position_t& pos,
+ const key_view_t& key, const onode_t* _p_value, layout_version_t v)
+ : leaf_node{node}, position{pos} {
+ assert(!is_end());
+ update_kv(key, _p_value, v);
+ leaf_node->do_track_cursor<true>(*this);
+}
+
+tree_cursor_t::tree_cursor_t(Ref<LeafNode> node)
+ : leaf_node{node}, position{search_position_t::end()} {
+ assert(is_end());
+ assert(leaf_node->is_level_tail());
+}
+
+tree_cursor_t::~tree_cursor_t() {
+ if (!is_end()) {
+ leaf_node->do_untrack_cursor(*this);
+ }
+}
+
+const key_view_t& tree_cursor_t::get_key_view() const {
+ ensure_kv();
+ return *key_view;
+}
+
+const onode_t* tree_cursor_t::get_p_value() const {
+ ensure_kv();
+ return p_value;
+}
+
+template <bool VALIDATE>
+void tree_cursor_t::update_track(
+ Ref<LeafNode> node, const search_position_t& pos) {
+ // the cursor must be already untracked
+ // track the new node and new pos
+ assert(!pos.is_end());
+ assert(!is_end());
+ leaf_node = node;
+ position = pos;
+ key_view.reset();
+ p_value = nullptr;
+ leaf_node->do_track_cursor<VALIDATE>(*this);
+}
+template void tree_cursor_t::update_track<true>(Ref<LeafNode>, const search_position_t&);
+template void tree_cursor_t::update_track<false>(Ref<LeafNode>, const search_position_t&);
+
+void tree_cursor_t::update_kv(
+ const key_view_t& key, const onode_t* _p_value, layout_version_t v) const {
+ assert(!is_end());
+ assert(_p_value);
+ assert(std::make_tuple(key, _p_value, v) == leaf_node->get_kv(position));
+ key_view = key;
+ p_value = _p_value;
+ node_version = v;
+}
+
+void tree_cursor_t::ensure_kv() const {
+ assert(!is_end());
+ if (!p_value || node_version != leaf_node->get_layout_version()) {
+ // NOTE: the leaf node is always present when we hold its reference.
+ std::tie(key_view, p_value, node_version) = leaf_node->get_kv(position);
+ }
+ assert(p_value);
+}
+
+/*
+ * Node
+ */
+
+Node::Node(NodeImplURef&& impl) : impl{std::move(impl)} {}
+
+Node::~Node() {
+ // XXX: tolerate failure between allocate() and as_child()
+ if (is_root()) {
+ super->do_untrack_root(*this);
+ } else {
+ _parent_info->ptr->do_untrack_child(*this);
+ }
+}
+
+level_t Node::level() const {
+ return impl->level();
+}
+
+node_future<Node::search_result_t> Node::lower_bound(
+ context_t c, const key_hobj_t& key) {
+ return seastar::do_with(
+ MatchHistory(), [this, c, &key](auto& history) {
+ return lower_bound_tracked(c, key, history);
+ }
+ );
+}
+
+node_future<std::pair<Ref<tree_cursor_t>, bool>> Node::insert(
+ context_t c, const key_hobj_t& key, const onode_t& value) {
+ return seastar::do_with(
+ MatchHistory(), [this, c, &key, &value](auto& history) {
+ return lower_bound_tracked(c, key, history
+ ).safe_then([c, &key, &value, &history](auto result) {
+ if (result.match() == MatchKindBS::EQ) {
+ return node_ertr::make_ready_future<std::pair<Ref<tree_cursor_t>, bool>>(
+ std::make_pair(result.p_cursor, false));
+ } else {
+ auto leaf_node = result.p_cursor->get_leaf_node();
+ return leaf_node->insert_value(
+ c, key, value, result.p_cursor->get_position(), history, result.mstat
+ ).safe_then([](auto p_cursor) {
+ return node_ertr::make_ready_future<std::pair<Ref<tree_cursor_t>, bool>>(
+ std::make_pair(p_cursor, true));
+ });
+ }
+ });
+ }
+ );
+}
+
+node_future<tree_stats_t> Node::get_tree_stats(context_t c) {
+ return seastar::do_with(
+ tree_stats_t(), [this, c](auto& stats) {
+ return do_get_tree_stats(c, stats).safe_then([&stats] {
+ return stats;
+ });
+ }
+ );
+}
+
+std::ostream& Node::dump(std::ostream& os) const {
+ return impl->dump(os);
+}
+
+std::ostream& Node::dump_brief(std::ostream& os) const {
+ return impl->dump_brief(os);
+}
+
+void Node::test_make_destructable(
+ context_t c, NodeExtentMutable& mut, Super::URef&& _super) {
+ impl->test_set_tail(mut);
+ make_root(c, std::move(_super));
+}
+
+node_future<> Node::mkfs(context_t c, RootNodeTracker& root_tracker) {
+ return LeafNode::allocate_root(c, root_tracker
+ ).safe_then([](auto ret) { /* FIXME: discard_result(); */ });
+}
+
+node_future<Ref<Node>> Node::load_root(context_t c, RootNodeTracker& root_tracker) {
+ return c.nm.get_super(c.t, root_tracker
+ ).safe_then([c, &root_tracker](auto&& _super) {
+ auto root_addr = _super->get_root_laddr();
+ assert(root_addr != L_ADDR_NULL);
+ return Node::load(c, root_addr, true
+ ).safe_then([c, _super = std::move(_super),
+ &root_tracker](auto root) mutable {
+ assert(root->impl->field_type() == field_type_t::N0);
+ root->as_root(std::move(_super));
+ std::ignore = c; // as only used in an assert
+ std::ignore = root_tracker;
+ assert(root == root_tracker.get_root(c.t));
+ return node_ertr::make_ready_future<Ref<Node>>(root);
+ });
+ });
+}
+
+void Node::make_root(context_t c, Super::URef&& _super) {
+ _super->write_root_laddr(c, impl->laddr());
+ as_root(std::move(_super));
+}
+
+void Node::as_root(Super::URef&& _super) {
+ assert(!super && !_parent_info);
+ assert(_super->get_root_laddr() == impl->laddr());
+ assert(impl->is_level_tail());
+ super = std::move(_super);
+ super->do_track_root(*this);
+}
+
+node_future<> Node::upgrade_root(context_t c) {
+ assert(is_root());
+ assert(impl->is_level_tail());
+ assert(impl->field_type() == field_type_t::N0);
+ super->do_untrack_root(*this);
+ return InternalNode::allocate_root(c, impl->level(), impl->laddr(), std::move(super)
+ ).safe_then([this](auto new_root) {
+ as_child(search_position_t::end(), new_root);
+ });
+}
+
+template <bool VALIDATE>
+void Node::as_child(const search_position_t& pos, Ref<InternalNode> parent_node) {
+ assert(!super);
+ _parent_info = parent_info_t{pos, parent_node};
+ parent_info().ptr->do_track_child<VALIDATE>(*this);
+}
+template void Node::as_child<true>(const search_position_t&, Ref<InternalNode>);
+template void Node::as_child<false>(const search_position_t&, Ref<InternalNode>);
+
+node_future<> Node::insert_parent(context_t c, Ref<Node> right_node) {
+ assert(!is_root());
+ // TODO(cross-node string dedup)
+ return parent_info().ptr->apply_child_split(
+ c, parent_info().position, this, right_node);
+}
+
+node_future<Ref<Node>> Node::load(
+ context_t c, laddr_t addr, bool expect_is_level_tail) {
+ // NOTE:
+ // *option1: all types of node have the same length;
+ // option2: length is defined by node/field types;
+ // option3: length is totally flexible;
+ return c.nm.read_extent(c.t, addr, NODE_BLOCK_SIZE
+ ).safe_then([expect_is_level_tail](auto extent) {
+ auto [node_type, field_type] = extent->get_types();
+ if (node_type == node_type_t::LEAF) {
+ auto impl = LeafNodeImpl::load(extent, field_type, expect_is_level_tail);
+ return Ref<Node>(new LeafNode(impl.get(), std::move(impl)));
+ } else if (node_type == node_type_t::INTERNAL) {
+ auto impl = InternalNodeImpl::load(extent, field_type, expect_is_level_tail);
+ return Ref<Node>(new InternalNode(impl.get(), std::move(impl)));
+ } else {
+ ceph_abort("impossible path");
+ }
+ });
+}
+
+/*
+ * InternalNode
+ */
+
+InternalNode::InternalNode(InternalNodeImpl* impl, NodeImplURef&& impl_ref)
+ : Node(std::move(impl_ref)), impl{impl} {}
+
+node_future<> InternalNode::apply_child_split(
+ context_t c, const search_position_t& pos,
+ Ref<Node> left_child, Ref<Node> right_child) {
+#ifndef NDEBUG
+ if (pos.is_end()) {
+ assert(impl->is_level_tail());
+ }
+#endif
+ impl->prepare_mutate(c);
+
+ auto left_key = left_child->impl->get_largest_key_view();
+ auto left_child_addr = left_child->impl->laddr();
+ auto left_child_addr_packed = laddr_packed_t{left_child_addr};
+ auto right_key = right_child->impl->get_largest_key_view();
+ auto right_child_addr = right_child->impl->laddr();
+ logger().debug("OTree::Internal::Insert: "
+ "pos({}), left_child({}, {:#x}), right_child({}, {:#x}) ...",
+ pos, left_key, left_child_addr, right_key, right_child_addr);
+ // update pos => left_child to pos => right_child
+ impl->replace_child_addr(pos, right_child_addr, left_child_addr);
+ replace_track(pos, right_child, left_child);
+
+ search_position_t insert_pos = pos;
+ auto [insert_stage, insert_size] = impl->evaluate_insert(
+ left_key, left_child_addr, insert_pos);
+ auto free_size = impl->free_size();
+ if (free_size >= insert_size) {
+ // insert
+ [[maybe_unused]] auto p_value = impl->insert(
+ left_key, left_child_addr_packed, insert_pos, insert_stage, insert_size);
+ assert(impl->free_size() == free_size - insert_size);
+ assert(insert_pos <= pos);
+ assert(p_value->value == left_child_addr);
+ track_insert(insert_pos, insert_stage, left_child, right_child);
+ validate_tracked_children();
+ return node_ertr::now();
+ }
+ // split and insert
+ Ref<InternalNode> this_ref = this;
+ return (is_root() ? upgrade_root(c) : node_ertr::now()
+ ).safe_then([this, c] {
+ return InternalNode::allocate(
+ c, impl->field_type(), impl->is_level_tail(), impl->level());
+ }).safe_then([this_ref, this, c, left_key, left_child, right_child,
+ insert_pos, insert_stage=insert_stage, insert_size=insert_size](auto fresh_right) mutable {
+ auto right_node = fresh_right.node;
+ auto left_child_addr = left_child->impl->laddr();
+ auto left_child_addr_packed = laddr_packed_t{left_child_addr};
+ auto [split_pos, is_insert_left, p_value] = impl->split_insert(
+ fresh_right.mut, *right_node->impl, left_key, left_child_addr_packed,
+ insert_pos, insert_stage, insert_size);
+ assert(p_value->value == left_child_addr);
+ track_split(split_pos, right_node);
+ if (is_insert_left) {
+ track_insert(insert_pos, insert_stage, left_child);
+ } else {
+ right_node->track_insert(insert_pos, insert_stage, left_child);
+ }
+ validate_tracked_children();
+ right_node->validate_tracked_children();
+
+ // propagate index to parent
+ return insert_parent(c, right_node);
+ // TODO (optimize)
+ // try to acquire space from siblings before split... see btrfs
+ });
+}
+
+node_future<Ref<InternalNode>> InternalNode::allocate_root(
+ context_t c, level_t old_root_level,
+ laddr_t old_root_addr, Super::URef&& super) {
+ return InternalNode::allocate(c, field_type_t::N0, true, old_root_level + 1
+ ).safe_then([c, old_root_addr,
+ super = std::move(super)](auto fresh_node) mutable {
+ auto root = fresh_node.node;
+ auto p_value = root->impl->get_p_value(search_position_t::end());
+ fresh_node.mut.copy_in_absolute(
+ const_cast<laddr_packed_t*>(p_value), old_root_addr);
+ root->make_root_from(c, std::move(super), old_root_addr);
+ return root;
+ });
+}
+
+node_future<Ref<tree_cursor_t>>
+InternalNode::lookup_smallest(context_t c) {
+ auto position = search_position_t::begin();
+ laddr_t child_addr = impl->get_p_value(position)->value;
+ return get_or_track_child(c, position, child_addr
+ ).safe_then([c](auto child) {
+ return child->lookup_smallest(c);
+ });
+}
+
+node_future<Ref<tree_cursor_t>>
+InternalNode::lookup_largest(context_t c) {
+ // NOTE: unlike LeafNode::lookup_largest(), this only works for the tail
+ // internal node to return the tail child address.
+ auto position = search_position_t::end();
+ laddr_t child_addr = impl->get_p_value(position)->value;
+ return get_or_track_child(c, position, child_addr).safe_then([c](auto child) {
+ return child->lookup_largest(c);
+ });
+}
+
+node_future<Node::search_result_t>
+InternalNode::lower_bound_tracked(
+ context_t c, const key_hobj_t& key, MatchHistory& history) {
+ auto result = impl->lower_bound(key, history);
+ return get_or_track_child(c, result.position, result.p_value->value
+ ).safe_then([c, &key, &history](auto child) {
+ // XXX(multi-type): pass result.mstat to child
+ return child->lower_bound_tracked(c, key, history);
+ });
+}
+
+node_future<> InternalNode::do_get_tree_stats(
+ context_t c, tree_stats_t& stats) {
+ auto nstats = impl->get_stats();
+ stats.size_persistent_internal += nstats.size_persistent;
+ stats.size_filled_internal += nstats.size_filled;
+ stats.size_logical_internal += nstats.size_logical;
+ stats.size_overhead_internal += nstats.size_overhead;
+ stats.size_value_internal += nstats.size_value;
+ stats.num_kvs_internal += nstats.num_kvs;
+ stats.num_nodes_internal += 1;
+
+ Ref<const InternalNode> this_ref = this;
+ return seastar::do_with(
+ search_position_t(), [this, this_ref, c, &stats](auto& pos) {
+ pos = search_position_t::begin();
+ return crimson::do_until(
+ [this, this_ref, c, &stats, &pos]() -> node_future<bool> {
+ auto child_addr = impl->get_p_value(pos)->value;
+ return get_or_track_child(c, pos, child_addr
+ ).safe_then([c, &stats](auto child) {
+ return child->do_get_tree_stats(c, stats);
+ }).safe_then([this, this_ref, &pos] {
+ if (pos.is_end()) {
+ return node_ertr::make_ready_future<bool>(true);
+ } else {
+ impl->next_position(pos);
+ if (pos.is_end()) {
+ if (impl->is_level_tail()) {
+ return node_ertr::make_ready_future<bool>(false);
+ } else {
+ return node_ertr::make_ready_future<bool>(true);
+ }
+ } else {
+ return node_ertr::make_ready_future<bool>(false);
+ }
+ }
+ });
+ });
+ }
+ );
+}
+
+node_future<> InternalNode::test_clone_root(
+ context_t c_other, RootNodeTracker& tracker_other) const {
+ assert(is_root());
+ assert(impl->is_level_tail());
+ assert(impl->field_type() == field_type_t::N0);
+ Ref<const InternalNode> this_ref = this;
+ return InternalNode::allocate(c_other, field_type_t::N0, true, impl->level()
+ ).safe_then([this, c_other, &tracker_other](auto fresh_other) {
+ impl->test_copy_to(fresh_other.mut);
+ auto cloned_root = fresh_other.node;
+ return c_other.nm.get_super(c_other.t, tracker_other
+ ).safe_then([c_other, cloned_root](auto&& super_other) {
+ cloned_root->make_root_new(c_other, std::move(super_other));
+ return cloned_root;
+ });
+ }).safe_then([this_ref, this, c_other](auto cloned_root) {
+ // clone tracked children
+    // In some unit tests, the children are stubbed out so that they
+    // don't exist in the NodeExtentManager and are only tracked in memory.
+ return crimson::do_for_each(
+ tracked_child_nodes.begin(),
+ tracked_child_nodes.end(),
+ [this_ref, c_other, cloned_root](auto& kv) {
+ assert(kv.first == kv.second->parent_info().position);
+ return kv.second->test_clone_non_root(c_other, cloned_root);
+ }
+ );
+ });
+}
+
+node_future<Ref<Node>> InternalNode::get_or_track_child(
+ context_t c, const search_position_t& position, laddr_t child_addr) {
+ bool level_tail = position.is_end();
+ Ref<Node> child;
+ auto found = tracked_child_nodes.find(position);
+ Ref<InternalNode> this_ref = this;
+ return (found == tracked_child_nodes.end()
+ ? (logger().trace("OTree::Internal: load child untracked at {:#x}, pos({}), level={}",
+ child_addr, position, level() - 1),
+ Node::load(c, child_addr, level_tail
+ ).safe_then([this, position] (auto child) {
+ child->as_child(position, this);
+ return child;
+ }))
+ : (logger().trace("OTree::Internal: load child tracked at {:#x}, pos({}), level={}",
+ child_addr, position, level() - 1),
+ node_ertr::make_ready_future<Ref<Node>>(found->second))
+ ).safe_then([this_ref, this, position, child_addr] (auto child) {
+ assert(child_addr == child->impl->laddr());
+ assert(position == child->parent_info().position);
+ std::ignore = position;
+ std::ignore = child_addr;
+ validate_child(*child);
+ return child;
+ });
+}
+
+void InternalNode::track_insert(
+ const search_position_t& insert_pos, match_stage_t insert_stage,
+ Ref<Node> insert_child, Ref<Node> nxt_child) {
+ // update tracks
+ auto pos_upper_bound = insert_pos;
+ pos_upper_bound.index_by_stage(insert_stage) = INDEX_UPPER_BOUND;
+ auto first = tracked_child_nodes.lower_bound(insert_pos);
+ auto last = tracked_child_nodes.lower_bound(pos_upper_bound);
+ std::vector<Node*> nodes;
+ std::for_each(first, last, [&nodes](auto& kv) {
+ nodes.push_back(kv.second);
+ });
+ tracked_child_nodes.erase(first, last);
+ for (auto& node : nodes) {
+ auto _pos = node->parent_info().position;
+ assert(!_pos.is_end());
+ ++_pos.index_by_stage(insert_stage);
+ node->as_child(_pos, this);
+ }
+ // track insert
+ insert_child->as_child(insert_pos, this);
+
+#ifndef NDEBUG
+ // validate left_child is before right_child
+ if (nxt_child) {
+ auto iter = tracked_child_nodes.find(insert_pos);
+ ++iter;
+ assert(iter->second == nxt_child);
+ }
+#endif
+}
+
+void InternalNode::replace_track(
+ const search_position_t& position, Ref<Node> new_child, Ref<Node> old_child) {
+ assert(tracked_child_nodes[position] == old_child);
+ tracked_child_nodes.erase(position);
+ new_child->as_child(position, this);
+ assert(tracked_child_nodes[position] == new_child);
+}
+
+void InternalNode::track_split(
+ const search_position_t& split_pos, Ref<InternalNode> right_node) {
+ auto first = tracked_child_nodes.lower_bound(split_pos);
+ auto iter = first;
+ while (iter != tracked_child_nodes.end()) {
+ search_position_t new_pos = iter->first;
+ new_pos -= split_pos;
+ iter->second->as_child<false>(new_pos, right_node);
+ ++iter;
+ }
+ tracked_child_nodes.erase(first, tracked_child_nodes.end());
+}
+
+void InternalNode::validate_child(const Node& child) const {
+#ifndef NDEBUG
+ assert(impl->level() - 1 == child.impl->level());
+ assert(this == child.parent_info().ptr);
+ auto& child_pos = child.parent_info().position;
+ assert(impl->get_p_value(child_pos)->value == child.impl->laddr());
+ if (child_pos.is_end()) {
+ assert(impl->is_level_tail());
+ assert(child.impl->is_level_tail());
+ } else {
+ assert(!child.impl->is_level_tail());
+ assert(impl->get_key_view(child_pos) == child.impl->get_largest_key_view());
+ }
+ // XXX(multi-type)
+ assert(impl->field_type() <= child.impl->field_type());
+#endif
+}
+
+node_future<InternalNode::fresh_node_t> InternalNode::allocate(
+ context_t c, field_type_t field_type, bool is_level_tail, level_t level) {
+ return InternalNodeImpl::allocate(c, field_type, is_level_tail, level
+ ).safe_then([](auto&& fresh_impl) {
+ auto node = Ref<InternalNode>(new InternalNode(
+ fresh_impl.impl.get(), std::move(fresh_impl.impl)));
+ return fresh_node_t{node, fresh_impl.mut};
+ });
+}
+
+/*
+ * LeafNode
+ */
+
+LeafNode::LeafNode(LeafNodeImpl* impl, NodeImplURef&& impl_ref)
+ : Node(std::move(impl_ref)), impl{impl} {}
+
+bool LeafNode::is_level_tail() const {
+ return impl->is_level_tail();
+}
+
+std::tuple<key_view_t, const onode_t*, layout_version_t> LeafNode::get_kv(
+ const search_position_t& pos) const {
+ key_view_t key_view;
+ auto p_value = impl->get_p_value(pos, &key_view);
+ return {key_view, p_value, layout_version};
+}
+
+node_future<Ref<tree_cursor_t>>
+LeafNode::lookup_smallest(context_t) {
+ if (unlikely(impl->is_empty())) {
+ assert(is_root());
+ return node_ertr::make_ready_future<Ref<tree_cursor_t>>(
+ new tree_cursor_t(this));
+ }
+ auto pos = search_position_t::begin();
+ key_view_t index_key;
+ auto p_value = impl->get_p_value(pos, &index_key);
+ return node_ertr::make_ready_future<Ref<tree_cursor_t>>(
+ get_or_track_cursor(pos, index_key, p_value));
+}
+
+node_future<Ref<tree_cursor_t>>
+LeafNode::lookup_largest(context_t) {
+ if (unlikely(impl->is_empty())) {
+ assert(is_root());
+ return node_ertr::make_ready_future<Ref<tree_cursor_t>>(
+ new tree_cursor_t(this));
+ }
+ search_position_t pos;
+ const onode_t* p_value = nullptr;
+ key_view_t index_key;
+ impl->get_largest_slot(pos, index_key, &p_value);
+ return node_ertr::make_ready_future<Ref<tree_cursor_t>>(
+ get_or_track_cursor(pos, index_key, p_value));
+}
+
+node_future<Node::search_result_t>
+LeafNode::lower_bound_tracked(
+ context_t c, const key_hobj_t& key, MatchHistory& history) {
+ key_view_t index_key;
+ auto result = impl->lower_bound(key, history, &index_key);
+ Ref<tree_cursor_t> cursor;
+ if (result.position.is_end()) {
+ assert(!result.p_value);
+ cursor = new tree_cursor_t(this);
+ } else {
+ cursor = get_or_track_cursor(result.position, index_key, result.p_value);
+ }
+ return node_ertr::make_ready_future<search_result_t>(
+ search_result_t{cursor, result.mstat});
+}
+
+node_future<> LeafNode::do_get_tree_stats(context_t, tree_stats_t& stats) {
+ auto nstats = impl->get_stats();
+ stats.size_persistent_leaf += nstats.size_persistent;
+ stats.size_filled_leaf += nstats.size_filled;
+ stats.size_logical_leaf += nstats.size_logical;
+ stats.size_overhead_leaf += nstats.size_overhead;
+ stats.size_value_leaf += nstats.size_value;
+ stats.num_kvs_leaf += nstats.num_kvs;
+ stats.num_nodes_leaf += 1;
+ return node_ertr::now();
+}
+
+node_future<> LeafNode::test_clone_root(
+ context_t c_other, RootNodeTracker& tracker_other) const {
+ assert(is_root());
+ assert(impl->is_level_tail());
+ assert(impl->field_type() == field_type_t::N0);
+ Ref<const LeafNode> this_ref = this;
+ return LeafNode::allocate(c_other, field_type_t::N0, true
+ ).safe_then([this, c_other, &tracker_other](auto fresh_other) {
+ impl->test_copy_to(fresh_other.mut);
+ auto cloned_root = fresh_other.node;
+ return c_other.nm.get_super(c_other.t, tracker_other
+ ).safe_then([c_other, cloned_root](auto&& super_other) {
+ cloned_root->make_root_new(c_other, std::move(super_other));
+ });
+ }).safe_then([this_ref]{});
+}
+
+node_future<Ref<tree_cursor_t>> LeafNode::insert_value(
+ context_t c, const key_hobj_t& key, const onode_t& value,
+ const search_position_t& pos, const MatchHistory& history,
+ match_stat_t mstat) {
+#ifndef NDEBUG
+ if (pos.is_end()) {
+ assert(impl->is_level_tail());
+ }
+#endif
+ logger().debug("OTree::Leaf::Insert: "
+ "pos({}), {}, {}, {}, mstat({}) ...",
+ pos, key, value, history, mstat);
+ search_position_t insert_pos = pos;
+ auto [insert_stage, insert_size] = impl->evaluate_insert(
+ key, value, history, mstat, insert_pos);
+ auto free_size = impl->free_size();
+ if (free_size >= insert_size) {
+ // insert
+ on_layout_change();
+ impl->prepare_mutate(c);
+ auto p_value = impl->insert(key, value, insert_pos, insert_stage, insert_size);
+ assert(impl->free_size() == free_size - insert_size);
+ assert(insert_pos <= pos);
+ assert(p_value->size == value.size);
+ auto ret = track_insert(insert_pos, insert_stage, p_value);
+ validate_tracked_cursors();
+ return node_ertr::make_ready_future<Ref<tree_cursor_t>>(ret);
+ }
+ // split and insert
+ Ref<LeafNode> this_ref = this;
+ return (is_root() ? upgrade_root(c) : node_ertr::now()
+ ).safe_then([this, c] {
+ return LeafNode::allocate(c, impl->field_type(), impl->is_level_tail());
+ }).safe_then([this_ref, this, c, &key, &value,
+ insert_pos, insert_stage=insert_stage, insert_size=insert_size](auto fresh_right) mutable {
+ auto right_node = fresh_right.node;
+ // no need to bump version for right node, as it is fresh
+ on_layout_change();
+ impl->prepare_mutate(c);
+ auto [split_pos, is_insert_left, p_value] = impl->split_insert(
+ fresh_right.mut, *right_node->impl, key, value,
+ insert_pos, insert_stage, insert_size);
+ assert(p_value->size == value.size);
+ track_split(split_pos, right_node);
+ Ref<tree_cursor_t> ret;
+ if (is_insert_left) {
+ ret = track_insert(insert_pos, insert_stage, p_value);
+ } else {
+ ret = right_node->track_insert(insert_pos, insert_stage, p_value);
+ }
+ validate_tracked_cursors();
+ right_node->validate_tracked_cursors();
+
+ // propagate insert to parent
+ return insert_parent(c, right_node).safe_then([ret] {
+ return ret;
+ });
+ // TODO (optimize)
+ // try to acquire space from siblings before split... see btrfs
+ });
+}
+
+node_future<Ref<LeafNode>> LeafNode::allocate_root(
+ context_t c, RootNodeTracker& root_tracker) {
+ return LeafNode::allocate(c, field_type_t::N0, true
+ ).safe_then([c, &root_tracker](auto fresh_node) {
+ auto root = fresh_node.node;
+ return c.nm.get_super(c.t, root_tracker
+ ).safe_then([c, root](auto&& super) {
+ root->make_root_new(c, std::move(super));
+ return root;
+ });
+ });
+}
+
+Ref<tree_cursor_t> LeafNode::get_or_track_cursor(
+ const search_position_t& position,
+ const key_view_t& key, const onode_t* p_value) {
+ assert(!position.is_end());
+ assert(p_value);
+ Ref<tree_cursor_t> p_cursor;
+ auto found = tracked_cursors.find(position);
+ if (found == tracked_cursors.end()) {
+ p_cursor = new tree_cursor_t(this, position, key, p_value, layout_version);
+ } else {
+ p_cursor = found->second;
+ assert(p_cursor->get_leaf_node() == this);
+ assert(p_cursor->get_position() == position);
+ p_cursor->update_kv(key, p_value, layout_version);
+ }
+ return p_cursor;
+}
+
+void LeafNode::validate_cursor(tree_cursor_t& cursor) const {
+#ifndef NDEBUG
+ assert(this == cursor.get_leaf_node().get());
+ assert(!cursor.is_end());
+ auto [key, val, ver] = get_kv(cursor.get_position());
+ assert(key == cursor.get_key_view());
+ assert(val == cursor.get_p_value());
+#endif
+}
+
+Ref<tree_cursor_t> LeafNode::track_insert(
+ const search_position_t& insert_pos, match_stage_t insert_stage,
+ const onode_t* p_onode) {
+ // update cursor position
+ auto pos_upper_bound = insert_pos;
+ pos_upper_bound.index_by_stage(insert_stage) = INDEX_UPPER_BOUND;
+ auto first = tracked_cursors.lower_bound(insert_pos);
+ auto last = tracked_cursors.lower_bound(pos_upper_bound);
+ std::vector<tree_cursor_t*> p_cursors;
+ std::for_each(first, last, [&p_cursors](auto& kv) {
+ p_cursors.push_back(kv.second);
+ });
+ tracked_cursors.erase(first, last);
+ for (auto& p_cursor : p_cursors) {
+ search_position_t new_pos = p_cursor->get_position();
+ ++new_pos.index_by_stage(insert_stage);
+ p_cursor->update_track<true>(this, new_pos);
+ }
+
+ // track insert
+ // TODO: getting key_view_t from stage::proceed_insert() and
+  // stage::append_insert() is not supported yet
+ return new tree_cursor_t(this, insert_pos);
+}
+
+void LeafNode::track_split(
+ const search_position_t& split_pos, Ref<LeafNode> right_node) {
+ // update cursor ownership and position
+ auto first = tracked_cursors.lower_bound(split_pos);
+ auto iter = first;
+ while (iter != tracked_cursors.end()) {
+ search_position_t new_pos = iter->first;
+ new_pos -= split_pos;
+ iter->second->update_track<false>(right_node, new_pos);
+ ++iter;
+ }
+ tracked_cursors.erase(first, tracked_cursors.end());
+}
+
+node_future<LeafNode::fresh_node_t> LeafNode::allocate(
+ context_t c, field_type_t field_type, bool is_level_tail) {
+ return LeafNodeImpl::allocate(c, field_type, is_level_tail
+ ).safe_then([](auto&& fresh_impl) {
+ auto node = Ref<LeafNode>(new LeafNode(
+ fresh_impl.impl.get(), std::move(fresh_impl.impl)));
+ return fresh_node_t{node, fresh_impl.mut};
+ });
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node.h
new file mode 100644
index 000000000..d6af489e7
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node.h
@@ -0,0 +1,476 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <ostream>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include "crimson/common/type_helpers.h"
+
+#include "node_extent_mutable.h"
+#include "stages/key_layout.h"
+#include "stages/stage_types.h"
+#include "super.h"
+#include "tree_types.h"
+
+/**
+ * Tree example (2 levels):
+ *
+ * Root node keys: [ 3 7 ]
+ * values: [p1 p2 p3]
+ * / | \
+ * ------- | -------
+ * | | |
+ * V V V
+ * Leaf node keys: [ 1 2 3] [ 4 5 7] [ 9 11 12]
+ * values: [v1 v2 v3] [v4 v5 v6] [v7 v8 v9]
+ *
+ * Tree structure properties:
+ * - As illustrated above, the parent key is strictly equal to its left child's
+ * largest key;
+ * - If a tree is indexing multiple seastore transactions, each transaction
+ * will be mapped to a Super which points to a distinct root node. So the
+ * transactions are isolated at tree level. However, tree nodes from
+ * different transactions can reference the same seastore CachedExtent before
+ * modification;
+ * - The resources of the transactional tree are tracked by the tree_cursor_ts
+ *   held by users. As long as any cursor is alive, the corresponding tree
+ *   hierarchy stays alive and tracked. See the reversed resource management
+ *   sections below;
+ */
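+
+/*
+ * A worked lookup on the example above (illustrative): searching for key 5
+ * descends from the root via the entry keyed 7 (the largest key of the middle
+ * leaf), reaches the leaf [4 5 7], and lower_bound() lands on 5 with
+ * MatchKindBS::EQ; searching for 13 walks down to the rightmost leaf and, as
+ * 13 is larger than every key in the tree, yields an end cursor with
+ * MatchKindBS::NE.
+ */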
+
+namespace crimson::os::seastore::onode {
+
+class LeafNode;
+class InternalNode;
+
+/**
+ * tree_cursor_t
+ *
+ * A cursor points to a position (LeafNode and search_position_t) of the tree
+ * where it can find the corresponding key-value pair. The position is updated
+ * by LeafNode insert/split/delete/merge internally and is kept valid. It also
+ * caches the key-value information for a specific node layout version.
+ *
+ * Exposes public interfaces for Btree::Cursor.
+ */
+using layout_version_t = uint32_t;
+class tree_cursor_t final
+ : public boost::intrusive_ref_counter<
+ tree_cursor_t, boost::thread_unsafe_counter> {
+ public:
+ // public to Btree
+ ~tree_cursor_t();
+ tree_cursor_t(const tree_cursor_t&) = delete;
+ tree_cursor_t(tree_cursor_t&&) = delete;
+ tree_cursor_t& operator=(const tree_cursor_t&) = delete;
+ tree_cursor_t& operator=(tree_cursor_t&&) = delete;
+
+ /**
+ * is_end
+ *
+ * Represents one-past-the-last of all the sorted key-value
+ * pairs in the tree. An end cursor won't contain valid key-value
+ * information.
+ */
+ bool is_end() const { return position.is_end(); }
+
+ /// Returns the key view in tree if it is not an end cursor.
+ const key_view_t& get_key_view() const;
+
+ /// Returns the value pointer in tree if it is not an end cursor.
+ const onode_t* get_p_value() const;
+
+ private:
+ tree_cursor_t(Ref<LeafNode>, const search_position_t&);
+ tree_cursor_t(Ref<LeafNode>, const search_position_t&,
+ const key_view_t& key, const onode_t*, layout_version_t);
+  // lookup reached the end; contains the leaf node for a further insert
+ tree_cursor_t(Ref<LeafNode>);
+ const search_position_t& get_position() const { return position; }
+ Ref<LeafNode> get_leaf_node() { return leaf_node; }
+ template <bool VALIDATE>
+ void update_track(Ref<LeafNode>, const search_position_t&);
+ void update_kv(const key_view_t&, const onode_t*, layout_version_t) const;
+ void ensure_kv() const;
+
+ private:
+ /**
+ * Reversed resource management (tree_cursor_t)
+ *
+   * tree_cursor_t holds a reference to the LeafNode, so the LeafNode will stay
+   * alive as long as any of its cursors is still referenced by a user.
+ */
+ Ref<LeafNode> leaf_node;
+ search_position_t position;
+
+ // cached information
+ mutable std::optional<key_view_t> key_view;
+ mutable const onode_t* p_value;
+ mutable layout_version_t node_version;
+
+ friend class LeafNode;
+ friend class Node; // get_position(), get_leaf_node()
+};
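+
+// A usage sketch (illustrative; `cursor` stands for a Ref<tree_cursor_t>
+// obtained from a Btree/Node lookup):
+//
+//   if (!cursor->is_end()) {
+//     const key_view_t& key = cursor->get_key_view();
+//     const onode_t* value = cursor->get_p_value();
+//     // both remain valid while the cursor keeps its LeafNode tracked
+//   }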
+
+/**
+ * Node
+ *
+ * An abstracted class for both InternalNode and LeafNode.
+ *
+ * Exposes public interfaces for Btree.
+ */
+class Node
+ : public boost::intrusive_ref_counter<
+ Node, boost::thread_unsafe_counter> {
+ public:
+ // public to Btree
+ using node_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg,
+ crimson::ct_error::enoent,
+ crimson::ct_error::erange>;
+ template <class ValueT=void>
+ using node_future = node_ertr::future<ValueT>;
+
+ struct search_result_t {
+ bool is_end() const { return p_cursor->is_end(); }
+ Ref<tree_cursor_t> p_cursor;
+ match_stat_t mstat;
+
+ MatchKindBS match() const {
+ assert(mstat >= MSTAT_MIN && mstat <= MSTAT_MAX);
+ return (mstat == MSTAT_EQ ? MatchKindBS::EQ : MatchKindBS::NE);
+ }
+ };
+
+ virtual ~Node();
+ Node(const Node&) = delete;
+ Node(Node&&) = delete;
+ Node& operator=(const Node&) = delete;
+ Node& operator=(Node&&) = delete;
+
+ /**
+ * level
+ *
+   * The level (or height) of this node in the tree: 0 means LeafNode, and a
+   * positive value means InternalNode.
+ */
+ level_t level() const;
+
+ /**
+ * lookup_smallest
+ *
+ * Returns a cursor pointing to the smallest key in the sub-tree formed by
+ * this node.
+ *
+ * Returns an end cursor if it is an empty root node.
+ */
+ virtual node_future<Ref<tree_cursor_t>> lookup_smallest(context_t) = 0;
+
+ /**
+ * lookup_largest
+ *
+ * Returns a cursor pointing to the largest key in the sub-tree formed by
+ * this node.
+ *
+ * Returns an end cursor if it is an empty root node.
+ */
+ virtual node_future<Ref<tree_cursor_t>> lookup_largest(context_t) = 0;
+
+ /**
+ * lower_bound
+ *
+ * Returns a cursor pointing to the first element in the range [first, last)
+ * of the sub-tree which does not compare less than the input key. The
+ * result also denotes whether the pointed key is equal to the input key.
+ *
+ * Returns an end cursor with MatchKindBS::NE if:
+ * - It is an empty root node;
+ * - Or the input key is larger than all the keys in the sub-tree;
+ */
+ node_future<search_result_t> lower_bound(context_t c, const key_hobj_t& key);
+
+ /**
+ * insert
+ *
+   * Tries to insert a key-value pair into the sub-tree formed by this node.
+   *
+   * Returns a boolean denoting whether the insertion is successful:
+   * - If true, the returned cursor points to the inserted element in the tree;
+   * - If false, the returned cursor points to the conflicting element in the
+   *   tree (see the illustrative snippet after this class).
+ */
+ node_future<std::pair<Ref<tree_cursor_t>, bool>> insert(
+ context_t, const key_hobj_t&, const onode_t&);
+
+ /// Recursively collects the statistics of the sub-tree formed by this node
+ node_future<tree_stats_t> get_tree_stats(context_t);
+
+ /// Returns an ostream containing a dump of all the elements in the node.
+ std::ostream& dump(std::ostream&) const;
+
+  /// Returns an ostream containing a one-line summary of this node.
+ std::ostream& dump_brief(std::ostream&) const;
+
+ /// Initializes the tree by allocating an empty root node.
+ static node_future<> mkfs(context_t, RootNodeTracker&);
+
+ /// Loads the tree root. The tree must be initialized.
+ static node_future<Ref<Node>> load_root(context_t, RootNodeTracker&);
+
+ // Only for unit test purposes.
+ void test_make_destructable(context_t, NodeExtentMutable&, Super::URef&&);
+ virtual node_future<> test_clone_root(context_t, RootNodeTracker&) const = 0;
+
+ protected:
+ virtual node_future<> test_clone_non_root(context_t, Ref<InternalNode>) const {
+ ceph_abort("impossible path");
+ }
+ virtual node_future<search_result_t> lower_bound_tracked(
+ context_t, const key_hobj_t&, MatchHistory&) = 0;
+ virtual node_future<> do_get_tree_stats(context_t, tree_stats_t&) = 0;
+
+ protected:
+ Node(NodeImplURef&&);
+ bool is_root() const {
+ assert((super && !_parent_info.has_value()) ||
+ (!super && _parent_info.has_value()));
+ return !_parent_info.has_value();
+ }
+
+ // as root
+ void make_root(context_t c, Super::URef&& _super);
+ void make_root_new(context_t c, Super::URef&& _super) {
+ assert(_super->get_root_laddr() == L_ADDR_NULL);
+ make_root(c, std::move(_super));
+ }
+ void make_root_from(context_t c, Super::URef&& _super, laddr_t from_addr) {
+ assert(_super->get_root_laddr() == from_addr);
+ make_root(c, std::move(_super));
+ }
+ void as_root(Super::URef&& _super);
+ node_future<> upgrade_root(context_t);
+
+ // as child/non-root
+ template <bool VALIDATE = true>
+ void as_child(const search_position_t&, Ref<InternalNode>);
+ struct parent_info_t {
+ search_position_t position;
+ Ref<InternalNode> ptr;
+ };
+ const parent_info_t& parent_info() const { return *_parent_info; }
+ node_future<> insert_parent(context_t, Ref<Node> right_node);
+
+ private:
+ /**
+ * Reversed resource management (Node)
+ *
+   * The root Node holds a reference to its parent Super class, so its parent
+   * will be alive as long as this root node is alive.
+   *
+   * A non-root Node holds a reference to its parent Node, so its parent will
+   * be alive as long as any of its children is alive.
+ */
+ // as root
+ Super::URef super;
+ // as child/non-root
+ std::optional<parent_info_t> _parent_info;
+
+ private:
+ static node_future<Ref<Node>> load(context_t, laddr_t, bool expect_is_level_tail);
+
+ NodeImplURef impl;
+ friend class InternalNode;
+};
+inline std::ostream& operator<<(std::ostream& os, const Node& node) {
+ return node.dump_brief(os);
+}
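+
+// Illustrative usage of the public Node interface (a sketch under assumed
+// context; the helper name do_insert and the surrounding variables are
+// hypothetical): both lower_bound() and insert() return errorated futures,
+// so callers chain them with safe_then().
+//
+//   Node::node_future<> do_insert(
+//       context_t c, Ref<Node> root,
+//       const key_hobj_t& key, const onode_t& value) {
+//     return root->insert(c, key, value).safe_then([](auto ret) {
+//       auto [cursor, success] = ret;
+//       // success == false means the key already exists and cursor points
+//       // to the conflicting element instead of the inserted one.
+//       std::ignore = cursor;
+//       std::ignore = success;
+//     });
+//   }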
+
+/**
+ * InternalNode
+ *
+ * A concrete implementation of the Node class that represents an internal
+ * tree node. Its level is always positive and its values are the logical
+ * block addresses of its child nodes. An internal node cannot be empty.
+ */
+class InternalNode final : public Node {
+ public:
+ // public to Node
+ InternalNode(InternalNodeImpl*, NodeImplURef&&);
+ ~InternalNode() override { assert(tracked_child_nodes.empty()); }
+ InternalNode(const InternalNode&) = delete;
+ InternalNode(InternalNode&&) = delete;
+ InternalNode& operator=(const InternalNode&) = delete;
+ InternalNode& operator=(InternalNode&&) = delete;
+
+ node_future<> apply_child_split(
+ context_t, const search_position_t&, Ref<Node> left, Ref<Node> right);
+ template <bool VALIDATE>
+ void do_track_child(Node& child) {
+ if constexpr (VALIDATE) {
+ validate_child(child);
+ }
+ auto& child_pos = child.parent_info().position;
+ assert(tracked_child_nodes.find(child_pos) == tracked_child_nodes.end());
+ tracked_child_nodes[child_pos] = &child;
+ }
+ void do_untrack_child(const Node& child) {
+ auto& child_pos = child.parent_info().position;
+ assert(tracked_child_nodes.find(child_pos)->second == &child);
+ [[maybe_unused]] auto removed = tracked_child_nodes.erase(child_pos);
+ assert(removed);
+ }
+
+ static node_future<Ref<InternalNode>> allocate_root(
+ context_t, level_t, laddr_t, Super::URef&&);
+
+ protected:
+ node_future<Ref<tree_cursor_t>> lookup_smallest(context_t) override;
+ node_future<Ref<tree_cursor_t>> lookup_largest(context_t) override;
+ node_future<search_result_t> lower_bound_tracked(
+ context_t, const key_hobj_t&, MatchHistory&) override;
+ node_future<> do_get_tree_stats(context_t, tree_stats_t&) override;
+
+ node_future<> test_clone_root(context_t, RootNodeTracker&) const override;
+
+ private:
+ // XXX: extract a common tracker for InternalNode to track Node,
+ // and LeafNode to track tree_cursor_t.
+ node_future<Ref<Node>> get_or_track_child(context_t, const search_position_t&, laddr_t);
+ void track_insert(
+ const search_position_t&, match_stage_t, Ref<Node>, Ref<Node> nxt_child = nullptr);
+ void replace_track(const search_position_t&, Ref<Node> new_child, Ref<Node> old_child);
+ void track_split(const search_position_t&, Ref<InternalNode>);
+ void validate_tracked_children() const {
+#ifndef NDEBUG
+ for (auto& kv : tracked_child_nodes) {
+ assert(kv.first == kv.second->parent_info().position);
+ validate_child(*kv.second);
+ }
+#endif
+ }
+ void validate_child(const Node& child) const;
+
+ struct fresh_node_t {
+ Ref<InternalNode> node;
+ NodeExtentMutable mut;
+ std::pair<Ref<Node>, NodeExtentMutable> make_pair() {
+ return std::make_pair(Ref<Node>(node), mut);
+ }
+ };
+ static node_future<fresh_node_t> allocate(context_t, field_type_t, bool, level_t);
+
+ private:
+ /**
+ * Reversed resource management (InternalNode)
+ *
+   * InternalNode keeps track of the child nodes that are still alive in
+ * memory, and their positions will be updated throughout
+ * insert/split/delete/merge operations of this node.
+ */
+ // XXX: leverage intrusive data structure to control memory overhead
+ std::map<search_position_t, Node*> tracked_child_nodes;
+ InternalNodeImpl* impl;
+};
+
+/**
+ * LeafNode
+ *
+ * A concrete implementation of the Node class that represents a leaf tree
+ * node. Its level is always 0. A leaf node can only be empty if it is the root.
+ */
+class LeafNode final : public Node {
+ public:
+ // public to tree_cursor_t
+ ~LeafNode() override { assert(tracked_cursors.empty()); }
+ LeafNode(const LeafNode&) = delete;
+ LeafNode(LeafNode&&) = delete;
+ LeafNode& operator=(const LeafNode&) = delete;
+ LeafNode& operator=(LeafNode&&) = delete;
+
+ bool is_level_tail() const;
+ layout_version_t get_layout_version() const { return layout_version; }
+ std::tuple<key_view_t, const onode_t*, layout_version_t> get_kv(
+ const search_position_t&) const;
+ template <bool VALIDATE>
+ void do_track_cursor(tree_cursor_t& cursor) {
+ if constexpr (VALIDATE) {
+ validate_cursor(cursor);
+ }
+ auto& cursor_pos = cursor.get_position();
+ assert(tracked_cursors.find(cursor_pos) == tracked_cursors.end());
+ tracked_cursors[cursor_pos] = &cursor;
+ }
+ void do_untrack_cursor(tree_cursor_t& cursor) {
+ validate_cursor(cursor);
+ auto& cursor_pos = cursor.get_position();
+ assert(tracked_cursors.find(cursor_pos)->second == &cursor);
+ [[maybe_unused]] auto removed = tracked_cursors.erase(cursor_pos);
+ assert(removed);
+ }
+
+ protected:
+ node_future<Ref<tree_cursor_t>> lookup_smallest(context_t) override;
+ node_future<Ref<tree_cursor_t>> lookup_largest(context_t) override;
+ node_future<search_result_t> lower_bound_tracked(
+ context_t, const key_hobj_t&, MatchHistory&) override;
+ node_future<> do_get_tree_stats(context_t, tree_stats_t&) override;
+
+ node_future<> test_clone_root(context_t, RootNodeTracker&) const override;
+
+ private:
+ LeafNode(LeafNodeImpl*, NodeImplURef&&);
+ node_future<Ref<tree_cursor_t>> insert_value(
+ context_t, const key_hobj_t&, const onode_t&,
+ const search_position_t&, const MatchHistory&,
+ match_stat_t mstat);
+ static node_future<Ref<LeafNode>> allocate_root(context_t, RootNodeTracker&);
+ friend class Node;
+
+ private:
+ // XXX: extract a common tracker for InternalNode to track Node,
+ // and LeafNode to track tree_cursor_t.
+ Ref<tree_cursor_t> get_or_track_cursor(
+ const search_position_t&, const key_view_t&, const onode_t*);
+ Ref<tree_cursor_t> track_insert(
+ const search_position_t&, match_stage_t, const onode_t*);
+ void track_split(const search_position_t&, Ref<LeafNode>);
+ void validate_tracked_cursors() const {
+#ifndef NDEBUG
+ for (auto& kv : tracked_cursors) {
+ assert(kv.first == kv.second->get_position());
+ validate_cursor(*kv.second);
+ }
+#endif
+ }
+ void validate_cursor(tree_cursor_t& cursor) const;
+  // invalidates the p_value pointers cached by the tracked tree_cursor_t instances
+ void on_layout_change() { ++layout_version; }
+
+ struct fresh_node_t {
+ Ref<LeafNode> node;
+ NodeExtentMutable mut;
+ std::pair<Ref<Node>, NodeExtentMutable> make_pair() {
+ return std::make_pair(Ref<Node>(node), mut);
+ }
+ };
+ static node_future<fresh_node_t> allocate(context_t, field_type_t, bool);
+
+ private:
+ /**
+ * Reversed resource management (LeafNode)
+ *
+ * LeafNode keeps track of the referencing cursors which are still alive in
+ * memory, and their positions will be updated throughout
+ * insert/split/delete/merge operations of this node.
+ */
+ // XXX: leverage intrusive data structure to control memory overhead
+ std::map<search_position_t, tree_cursor_t*> tracked_cursors;
+ LeafNodeImpl* impl;
+ layout_version_t layout_version = 0;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h
new file mode 100644
index 000000000..d08a99015
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "include/buffer.h"
+#include "node_types.h"
+
+namespace crimson::os::seastore::onode {
+
+/**
+ * DeltaRecorder
+ *
+ * An abstract class that encapsulates the different implementations of
+ * applying deltas to a specific node layout.
+ */
+class DeltaRecorder {
+ public:
+ virtual ~DeltaRecorder() {
+ assert(is_empty());
+ }
+
+ bool is_empty() const {
+ return encoded.length() == 0;
+ }
+
+ ceph::bufferlist get_delta() {
+ assert(!is_empty());
+ return std::move(encoded);
+ }
+
+ virtual node_type_t node_type() const = 0;
+ virtual field_type_t field_type() const = 0;
+ virtual void apply_delta(ceph::bufferlist::const_iterator&,
+ NodeExtentMutable&) = 0;
+
+ protected:
+ DeltaRecorder() = default;
+ ceph::bufferlist encoded;
+};
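+
+// A note on the intended contract (illustrative, not authoritative): a
+// concrete recorder accumulates encoded mutations into `encoded`, the delta
+// is consumed once via get_delta(), and the recorder must be empty again
+// before destruction.
+//
+//   DeltaRecorderURef recorder = /* created by a concrete DeltaRecorderT */;
+//   assert(recorder->is_empty());
+//   /* ... encode one or more mutations ... */
+//   ceph::bufferlist delta = recorder->get_delta();  // moves `encoded` out
+//   assert(recorder->is_empty());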
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h
new file mode 100644
index 000000000..94782f50d
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h
@@ -0,0 +1,413 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/common/log.h"
+#include "node_extent_manager.h"
+#include "node_delta_recorder.h"
+#include "node_layout_replayable.h"
+
+#ifndef NDEBUG
+#include "node_extent_manager/test_replay.h"
+#endif
+
+namespace crimson::os::seastore::onode {
+
+/**
+ * DeltaRecorderT
+ *
+ * Responsible for encoding, decoding and applying deltas for a specific node
+ * layout.
+ */
+template <typename FieldType, node_type_t NODE_TYPE>
+class DeltaRecorderT final: public DeltaRecorder {
+ enum class op_t : uint8_t {
+ INSERT,
+ SPLIT,
+ SPLIT_INSERT,
+ UPDATE_CHILD_ADDR,
+ };
+
+ public:
+ using layout_t = NodeLayoutReplayableT<FieldType, NODE_TYPE>;
+ using node_stage_t = typename layout_t::node_stage_t;
+ using position_t = typename layout_t::position_t;
+ using StagedIterator = typename layout_t::StagedIterator;
+ using value_t = typename layout_t::value_t;
+ static constexpr auto FIELD_TYPE = layout_t::FIELD_TYPE;
+
+ ~DeltaRecorderT() override = default;
+
+ template <KeyT KT>
+ void encode_insert(
+ const full_key_t<KT>& key,
+ const value_t& value,
+ const position_t& insert_pos,
+ const match_stage_t& insert_stage,
+ const node_offset_t& insert_size) {
+ ceph::encode(op_t::INSERT, encoded);
+ encode_key<KT>(key, encoded);
+ encode_value(value, encoded);
+ insert_pos.encode(encoded);
+ ceph::encode(insert_stage, encoded);
+ ceph::encode(insert_size, encoded);
+ }
+
+ void encode_split(
+ const StagedIterator& split_at,
+ const char* p_node_start) {
+ ceph::encode(op_t::SPLIT, encoded);
+ split_at.encode(p_node_start, encoded);
+ }
+
+ template <KeyT KT>
+ void encode_split_insert(
+ const StagedIterator& split_at,
+ const full_key_t<KT>& key,
+ const value_t& value,
+ const position_t& insert_pos,
+ const match_stage_t& insert_stage,
+ const node_offset_t& insert_size,
+ const char* p_node_start) {
+ ceph::encode(op_t::SPLIT_INSERT, encoded);
+ split_at.encode(p_node_start, encoded);
+ encode_key<KT>(key, encoded);
+ encode_value(value, encoded);
+ insert_pos.encode(encoded);
+ ceph::encode(insert_stage, encoded);
+ ceph::encode(insert_size, encoded);
+ }
+
+ void encode_update_child_addr(
+ const laddr_t new_addr,
+ const laddr_packed_t* p_addr,
+ const char* p_node_start) {
+ ceph::encode(op_t::UPDATE_CHILD_ADDR, encoded);
+ ceph::encode(new_addr, encoded);
+ int node_offset = reinterpret_cast<const char*>(p_addr) - p_node_start;
+ assert(node_offset > 0 && node_offset <= NODE_BLOCK_SIZE);
+ ceph::encode(static_cast<node_offset_t>(node_offset), encoded);
+ }
+
+ static DeltaRecorderURef create() {
+ return std::unique_ptr<DeltaRecorder>(new DeltaRecorderT());
+ }
+
+ protected:
+ DeltaRecorderT() = default;
+ node_type_t node_type() const override { return NODE_TYPE; }
+ field_type_t field_type() const override { return FIELD_TYPE; }
+ void apply_delta(ceph::bufferlist::const_iterator& delta,
+ NodeExtentMutable& node) override {
+ assert(is_empty());
+ node_stage_t stage(reinterpret_cast<const FieldType*>(node.get_read()));
+ op_t op;
+ try {
+ ceph::decode(op, delta);
+ switch (op) {
+ case op_t::INSERT: {
+ logger().debug("OTree::Extent::Replay: decoding INSERT ...");
+ auto key = key_hobj_t::decode(delta);
+
+ std::unique_ptr<char[]> value_storage_heap;
+ value_t value_storage_stack;
+ auto p_value = decode_value(delta, value_storage_heap, value_storage_stack);
+
+ auto insert_pos = position_t::decode(delta);
+ match_stage_t insert_stage;
+ ceph::decode(insert_stage, delta);
+ node_offset_t insert_size;
+ ceph::decode(insert_size, delta);
+ logger().debug("OTree::Extent::Replay: apply {}, {}, "
+ "insert_pos({}), insert_stage={}, insert_size={}B ...",
+ key, *p_value, insert_pos, insert_stage, insert_size);
+ layout_t::template insert<KeyT::HOBJ>(
+ node, stage, key, *p_value, insert_pos, insert_stage, insert_size);
+ break;
+ }
+ case op_t::SPLIT: {
+ logger().debug("OTree::Extent::Replay: decoding SPLIT ...");
+ auto split_at = StagedIterator::decode(stage.p_start(), delta);
+ logger().debug("OTree::Extent::Replay: apply split_at={} ...", split_at);
+ layout_t::split(node, stage, split_at);
+ break;
+ }
+ case op_t::SPLIT_INSERT: {
+ logger().debug("OTree::Extent::Replay: decoding SPLIT_INSERT ...");
+ auto split_at = StagedIterator::decode(stage.p_start(), delta);
+ auto key = key_hobj_t::decode(delta);
+
+ std::unique_ptr<char[]> value_storage_heap;
+ value_t value_storage_stack;
+ auto p_value = decode_value(delta, value_storage_heap, value_storage_stack);
+
+ auto insert_pos = position_t::decode(delta);
+ match_stage_t insert_stage;
+ ceph::decode(insert_stage, delta);
+ node_offset_t insert_size;
+ ceph::decode(insert_size, delta);
+ logger().debug("OTree::Extent::Replay: apply split_at={}, {}, {}, "
+ "insert_pos({}), insert_stage={}, insert_size={}B ...",
+ split_at, key, *p_value, insert_pos, insert_stage, insert_size);
+ layout_t::template split_insert<KeyT::HOBJ>(
+ node, stage, split_at, key, *p_value, insert_pos, insert_stage, insert_size);
+ break;
+ }
+ case op_t::UPDATE_CHILD_ADDR: {
+ logger().debug("OTree::Extent::Replay: decoding UPDATE_CHILD_ADDR ...");
+ laddr_t new_addr;
+ ceph::decode(new_addr, delta);
+ node_offset_t update_offset;
+ ceph::decode(update_offset, delta);
+ auto p_addr = reinterpret_cast<laddr_packed_t*>(
+ node.get_write() + update_offset);
+ logger().debug("OTree::Extent::Replay: apply {:#x} to offset {:#x} ...",
+ new_addr, update_offset);
+ layout_t::update_child_addr(node, new_addr, p_addr);
+ break;
+ }
+ default:
+ logger().error("OTree::Extent::Replay: got unknown op {} when replay {:#x}",
+ op, node.get_laddr());
+ ceph_abort();
+ }
+ } catch (buffer::error& e) {
+ logger().error("OTree::Extent::Replay: got decode error {} when replay {:#x}",
+ e, node.get_laddr());
+ ceph_abort();
+ }
+ }
+
+ private:
+ static void encode_value(const value_t& value, ceph::bufferlist& encoded) {
+ if constexpr (std::is_same_v<value_t, laddr_packed_t>) {
+ // NODE_TYPE == node_type_t::INTERNAL
+ ceph::encode(value.value, encoded);
+ } else if constexpr (std::is_same_v<value_t, onode_t>) {
+ // NODE_TYPE == node_type_t::LEAF
+ value.encode(encoded);
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+
+ static value_t* decode_value(ceph::bufferlist::const_iterator& delta,
+ std::unique_ptr<char[]>& value_storage_heap,
+ value_t& value_storage_stack) {
+ if constexpr (std::is_same_v<value_t, laddr_packed_t>) {
+ // NODE_TYPE == node_type_t::INTERNAL
+ laddr_t value;
+ ceph::decode(value, delta);
+ value_storage_stack.value = value;
+ return &value_storage_stack;
+ } else if constexpr (std::is_same_v<value_t, onode_t>) {
+ // NODE_TYPE == node_type_t::LEAF
+ auto value_config = onode_t::decode(delta);
+ value_storage_heap = onode_t::allocate(value_config);
+ return reinterpret_cast<onode_t*>(value_storage_heap.get());
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+
+ static seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+};
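+
+// Illustrative round trip (a sketch; new_addr, p_addr, mut and replay_mut are
+// assumed to exist): a recorder of a matching layout encodes a mutation, and
+// the resulting delta can be replayed onto another extent that holds an
+// identical copy of the pre-mutation node, as TestReplayExtent does.
+//
+//   using recorder_t = DeltaRecorderT<node_fields_0_t, node_type_t::INTERNAL>;
+//   DeltaRecorderURef recorder = recorder_t::create();
+//   auto* typed = static_cast<recorder_t*>(recorder.get());
+//   typed->encode_update_child_addr(new_addr, p_addr, mut.get_read());
+//   ceph::bufferlist delta = recorder->get_delta();
+//   auto p = delta.cbegin();
+//   recorder->apply_delta(p, replay_mut);  // replays the same mutation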
+
+/**
+ * NodeExtentAccessorT
+ *
+ * This component is responsible for referencing and mutating the underlying
+ * NodeExtent, recording mutation parameters when needed, and applying the
+ * recorded modifications for a specific node layout.
+ */
+template <typename FieldType, node_type_t NODE_TYPE>
+class NodeExtentAccessorT {
+ public:
+ using layout_t = NodeLayoutReplayableT<FieldType, NODE_TYPE>;
+ using node_stage_t = typename layout_t::node_stage_t;
+ using position_t = typename layout_t::position_t;
+ using recorder_t = DeltaRecorderT<FieldType, NODE_TYPE>;
+ using StagedIterator = typename layout_t::StagedIterator;
+ using value_t = typename layout_t::value_t;
+ static constexpr auto FIELD_TYPE = layout_t::FIELD_TYPE;
+
+ NodeExtentAccessorT(NodeExtentRef extent)
+ : extent{extent},
+ node_stage{reinterpret_cast<const FieldType*>(extent->get_read())} {
+ if (no_recording()) {
+ mut.emplace(extent->get_mutable());
+ assert(extent->get_recorder() == nullptr);
+ recorder = nullptr;
+ } else if (needs_recording()) {
+ mut.emplace(extent->get_mutable());
+ auto p_recorder = extent->get_recorder();
+ assert(p_recorder != nullptr);
+ assert(p_recorder->node_type() == NODE_TYPE);
+ assert(p_recorder->field_type() == FIELD_TYPE);
+ recorder = static_cast<recorder_t*>(p_recorder);
+ } else if (needs_mutate()) {
+ // mut is empty
+ assert(extent->get_recorder() == nullptr ||
+ extent->get_recorder()->is_empty());
+ recorder = nullptr;
+ } else {
+ ceph_abort("impossible path");
+ }
+#ifndef NDEBUG
+ auto ref_recorder = recorder_t::create();
+ test_recorder = static_cast<recorder_t*>(ref_recorder.get());
+ test_extent = TestReplayExtent::create(
+ extent->get_length(), std::move(ref_recorder));
+#endif
+ }
+ ~NodeExtentAccessorT() = default;
+ NodeExtentAccessorT(const NodeExtentAccessorT&) = delete;
+ NodeExtentAccessorT(NodeExtentAccessorT&&) = delete;
+ NodeExtentAccessorT& operator=(const NodeExtentAccessorT&) = delete;
+ NodeExtentAccessorT& operator=(NodeExtentAccessorT&&) = delete;
+
+ const node_stage_t& read() const { return node_stage; }
+ laddr_t get_laddr() const { return extent->get_laddr(); }
+
+  // Must be called before any mutate attempt.
+  // For the safety of mixed read and mutate, call it before read() as well.
+ void prepare_mutate(context_t c) {
+ if (needs_mutate()) {
+ auto ref_recorder = recorder_t::create();
+ recorder = static_cast<recorder_t*>(ref_recorder.get());
+ extent = extent->mutate(c, std::move(ref_recorder));
+ assert(needs_recording());
+ node_stage = node_stage_t(
+ reinterpret_cast<const FieldType*>(extent->get_read()));
+ assert(recorder == static_cast<recorder_t*>(extent->get_recorder()));
+ mut.emplace(extent->get_mutable());
+ }
+ }
+
+ template <KeyT KT>
+ const value_t* insert_replayable(
+ const full_key_t<KT>& key,
+ const value_t& value,
+ position_t& insert_pos,
+ match_stage_t& insert_stage,
+ node_offset_t& insert_size) {
+ assert(!needs_mutate());
+ if (needs_recording()) {
+ recorder->template encode_insert<KT>(
+ key, value, insert_pos, insert_stage, insert_size);
+ }
+#ifndef NDEBUG
+ test_extent->prepare_replay(extent);
+ test_recorder->template encode_insert<KT>(
+ key, value, insert_pos, insert_stage, insert_size);
+#endif
+ auto ret = layout_t::template insert<KT>(
+ *mut, read(), key, value,
+ insert_pos, insert_stage, insert_size);
+#ifndef NDEBUG
+ test_extent->replay_and_verify(extent);
+#endif
+ return ret;
+ }
+
+ void split_replayable(StagedIterator& split_at) {
+ assert(!needs_mutate());
+ if (needs_recording()) {
+ recorder->encode_split(split_at, read().p_start());
+ }
+#ifndef NDEBUG
+ test_extent->prepare_replay(extent);
+    test_recorder->encode_split(split_at, read().p_start());
+#endif
+ layout_t::split(*mut, read(), split_at);
+#ifndef NDEBUG
+ test_extent->replay_and_verify(extent);
+#endif
+ }
+
+ template <KeyT KT>
+ const value_t* split_insert_replayable(
+ StagedIterator& split_at,
+ const full_key_t<KT>& key,
+ const value_t& value,
+ position_t& insert_pos,
+ match_stage_t& insert_stage,
+ node_offset_t& insert_size) {
+ assert(!needs_mutate());
+ if (needs_recording()) {
+ recorder->template encode_split_insert<KT>(
+ split_at, key, value, insert_pos, insert_stage, insert_size,
+ read().p_start());
+ }
+#ifndef NDEBUG
+ test_extent->prepare_replay(extent);
+ test_recorder->template encode_split_insert<KT>(
+ split_at, key, value, insert_pos, insert_stage, insert_size,
+ read().p_start());
+#endif
+ auto ret = layout_t::template split_insert<KT>(
+ *mut, read(), split_at, key, value,
+ insert_pos, insert_stage, insert_size);
+#ifndef NDEBUG
+ test_extent->replay_and_verify(extent);
+#endif
+ return ret;
+ }
+
+ void update_child_addr_replayable(
+ const laddr_t new_addr, laddr_packed_t* p_addr) {
+ assert(!needs_mutate());
+ if (needs_recording()) {
+ recorder->encode_update_child_addr(new_addr, p_addr, read().p_start());
+ }
+#ifndef NDEBUG
+ test_extent->prepare_replay(extent);
+ test_recorder->encode_update_child_addr(new_addr, p_addr, read().p_start());
+#endif
+ layout_t::update_child_addr(*mut, new_addr, p_addr);
+#ifndef NDEBUG
+ test_extent->replay_and_verify(extent);
+#endif
+ }
+
+ void test_copy_to(NodeExtentMutable& to) const {
+ assert(extent->get_length() == to.get_length());
+ std::memcpy(to.get_write(), extent->get_read(), extent->get_length());
+ }
+
+ private:
+ /**
+ * Possible states with CachedExtent::extent_state_t:
+ * INITIAL_WRITE_PENDING -- can mutate, no recording
+ * MUTATION_PENDING -- can mutate, needs recording
+ * CLEAN/DIRTY -- pending mutate
+ * INVALID -- impossible
+ */
+ bool no_recording() const {
+ return extent->is_initial_pending();
+ }
+ bool needs_recording() const {
+ return extent->is_mutation_pending();
+ }
+ bool needs_mutate() const {
+ assert(extent->is_valid());
+ return !extent->is_pending();
+ }
+
+ NodeExtentRef extent;
+ node_stage_t node_stage;
+ std::optional<NodeExtentMutable> mut;
+ // owned by extent
+ recorder_t* recorder;
+
+#ifndef NDEBUG
+ // verify record replay using a different memory block
+ TestReplayExtent::Ref test_extent;
+ recorder_t* test_recorder;
+#endif
+};
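+
+// Illustrative mutation protocol (a sketch; extent, c, key, value, pos, stage
+// and size are assumed to exist with the appropriate types): prepare_mutate()
+// must run before any *_replayable call so the accessor switches to a pending
+// extent and, when necessary, starts recording deltas.
+//
+//   NodeExtentAccessorT<node_fields_0_t, node_type_t::LEAF> accessor(extent);
+//   accessor.prepare_mutate(c);
+//   const auto* p_value = accessor.insert_replayable<KeyT::HOBJ>(
+//       key, value, pos, stage, size);  // points into the mutable extent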
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.cc
new file mode 100644
index 000000000..bd22d4b67
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.cc
@@ -0,0 +1,35 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "node_extent_manager.h"
+
+#include "node_extent_manager/dummy.h"
+#include "node_extent_manager/seastore.h"
+#include "stages/node_stage_layout.h"
+
+namespace crimson::os::seastore::onode {
+
+std::pair<node_type_t, field_type_t> NodeExtent::get_types() const {
+ const auto header = reinterpret_cast<const node_header_t*>(get_read());
+ auto node_type = header->get_node_type();
+ auto field_type = header->get_field_type();
+ if (!field_type.has_value()) {
+ throw std::runtime_error("load failed: bad field type");
+ }
+ return {node_type, *field_type};
+}
+
+NodeExtentManagerURef NodeExtentManager::create_dummy(bool is_sync) {
+ if (is_sync) {
+ return NodeExtentManagerURef(new DummyNodeExtentManager<true>());
+ } else {
+ return NodeExtentManagerURef(new DummyNodeExtentManager<false>());
+ }
+}
+
+NodeExtentManagerURef NodeExtentManager::create_seastore(
+ TransactionManager& tm, laddr_t min_laddr) {
+ return NodeExtentManagerURef(new SeastoreNodeExtentManager(tm, min_laddr));
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h
new file mode 100644
index 000000000..77b230e03
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h
@@ -0,0 +1,86 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/common/type_helpers.h"
+#include "crimson/os/seastore/cached_extent.h"
+#include "crimson/os/seastore/transaction_manager.h"
+
+#include "fwd.h"
+#include "super.h"
+#include "node_extent_mutable.h"
+#include "node_types.h"
+
+/**
+ * node_extent_manager.h
+ *
+ * Contains general interfaces for different backends (Dummy and Seastore).
+ */
+
+namespace crimson::os::seastore::onode {
+
+using crimson::os::seastore::LogicalCachedExtent;
+class NodeExtent : public LogicalCachedExtent {
+ public:
+ virtual ~NodeExtent() = default;
+ std::pair<node_type_t, field_type_t> get_types() const;
+ const char* get_read() const {
+ return get_bptr().c_str();
+ }
+ NodeExtentMutable get_mutable() {
+ assert(is_pending());
+ return do_get_mutable();
+ }
+
+ virtual DeltaRecorder* get_recorder() const = 0;
+ virtual NodeExtentRef mutate(context_t, DeltaRecorderURef&&) = 0;
+
+ protected:
+ template <typename... T>
+ NodeExtent(T&&... t) : LogicalCachedExtent(std::forward<T>(t)...) {}
+
+ NodeExtentMutable do_get_mutable() {
+ return NodeExtentMutable(*this);
+ }
+
+ /**
+ * Abstracted interfaces to implement:
+   * - CachedExtent::duplicate_for_write() -> CachedExtentRef
+   * - CachedExtent::get_type() -> extent_types_t
+   * - CachedExtent::get_delta() -> ceph::bufferlist
+ * - LogicalCachedExtent::apply_delta(const ceph::bufferlist) -> void
+ */
+
+ private:
+ friend class NodeExtentMutable;
+};
+
+using crimson::os::seastore::TransactionManager;
+class NodeExtentManager {
+ public:
+ virtual ~NodeExtentManager() = default;
+ using tm_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg,
+ crimson::ct_error::enoent,
+ crimson::ct_error::erange>;
+ template <class ValueT=void>
+ using tm_future = tm_ertr::future<ValueT>;
+
+ virtual bool is_read_isolated() const = 0;
+ virtual tm_future<NodeExtentRef> read_extent(
+ Transaction&, laddr_t, extent_len_t) = 0;
+ virtual tm_future<NodeExtentRef> alloc_extent(Transaction&, extent_len_t) = 0;
+ virtual tm_future<Super::URef> get_super(Transaction&, RootNodeTracker&) = 0;
+ virtual std::ostream& print(std::ostream& os) const = 0;
+
+ static NodeExtentManagerURef create_dummy(bool is_sync);
+ static NodeExtentManagerURef create_seastore(
+ TransactionManager& tm, laddr_t min_laddr = L_ADDR_MIN);
+};
+inline std::ostream& operator<<(std::ostream& os, const NodeExtentManager& nm) {
+ return nm.print(os);
+}
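+
+// Illustrative backend usage (a sketch; the transaction t is assumed to
+// exist): tests use the dummy in-memory backend via create_dummy(), while
+// production code uses create_seastore() with a TransactionManager.
+//
+//   NodeExtentManagerURef nm = NodeExtentManager::create_dummy(true /* sync */);
+//   auto fut = nm->alloc_extent(t, 4096).safe_then([](NodeExtentRef extent) {
+//     NodeExtentMutable mut = extent->get_mutable();  // writable while pending
+//     std::ignore = mut;
+//   });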
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h
new file mode 100644
index 000000000..830ea4a7d
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h
@@ -0,0 +1,156 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <chrono>
+#include <seastar/core/sleep.hh>
+
+#include "include/buffer_raw.h"
+
+#include "crimson/common/log.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h"
+
+/**
+ * dummy.h
+ *
+ * Dummy backend implementations for test purposes.
+ */
+
+namespace crimson::os::seastore::onode {
+
+class DummySuper final: public Super {
+ public:
+ DummySuper(Transaction& t, RootNodeTracker& tracker, laddr_t* p_root_laddr)
+ : Super(t, tracker), p_root_laddr{p_root_laddr} {}
+ ~DummySuper() override = default;
+ protected:
+ laddr_t get_root_laddr() const override { return *p_root_laddr; }
+ void write_root_laddr(context_t, laddr_t addr) override {
+ logger().info("OTree::Dummy: update root {:#x} ...", addr);
+ *p_root_laddr = addr;
+ }
+ private:
+ static seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+ laddr_t* p_root_laddr;
+};
+
+class DummyNodeExtent final: public NodeExtent {
+ public:
+ DummyNodeExtent(ceph::bufferptr &&ptr) : NodeExtent(std::move(ptr)) {
+ state = extent_state_t::INITIAL_WRITE_PENDING;
+ }
+ ~DummyNodeExtent() override = default;
+ protected:
+ NodeExtentRef mutate(context_t, DeltaRecorderURef&&) override {
+ ceph_abort("impossible path"); }
+ DeltaRecorder* get_recorder() const override {
+ return nullptr; }
+ CachedExtentRef duplicate_for_write() override {
+ ceph_abort("impossible path"); }
+ extent_types_t get_type() const override {
+ return extent_types_t::TEST_BLOCK; }
+ ceph::bufferlist get_delta() override {
+ ceph_abort("impossible path"); }
+ void apply_delta(const ceph::bufferlist&) override {
+ ceph_abort("impossible path"); }
+};
+
+template <bool SYNC>
+class DummyNodeExtentManager final: public NodeExtentManager {
+ static constexpr size_t ALIGNMENT = 4096;
+ public:
+ ~DummyNodeExtentManager() override = default;
+ protected:
+ bool is_read_isolated() const override { return false; }
+
+ tm_future<NodeExtentRef> read_extent(
+ Transaction& t, laddr_t addr, extent_len_t len) override {
+ logger().trace("OTree::Dummy: reading {}B at {:#x} ...", len, addr);
+ if constexpr (SYNC) {
+ return read_extent_sync(t, addr, len);
+ } else {
+ using namespace std::chrono_literals;
+ return seastar::sleep(1us).then([this, &t, addr, len] {
+ return read_extent_sync(t, addr, len);
+ });
+ }
+ }
+
+ tm_future<NodeExtentRef> alloc_extent(
+ Transaction& t, extent_len_t len) override {
+ logger().trace("OTree::Dummy: allocating {}B ...", len);
+ if constexpr (SYNC) {
+ return alloc_extent_sync(t, len);
+ } else {
+ using namespace std::chrono_literals;
+ return seastar::sleep(1us).then([this, &t, len] {
+ return alloc_extent_sync(t, len);
+ });
+ }
+ }
+
+ tm_future<Super::URef> get_super(
+ Transaction& t, RootNodeTracker& tracker) override {
+ logger().trace("OTree::Dummy: get root ...");
+ if constexpr (SYNC) {
+ return get_super_sync(t, tracker);
+ } else {
+ using namespace std::chrono_literals;
+ return seastar::sleep(1us).then([this, &t, &tracker] {
+ return get_super_sync(t, tracker);
+ });
+ }
+ }
+
+ std::ostream& print(std::ostream& os) const override {
+ return os << "DummyNodeExtentManager(sync=" << SYNC << ")";
+ }
+
+ private:
+ tm_future<NodeExtentRef> read_extent_sync(
+ Transaction& t, laddr_t addr, extent_len_t len) {
+ auto iter = allocate_map.find(addr);
+ assert(iter != allocate_map.end());
+ auto extent = iter->second;
+ logger().trace("OTree::Dummy: read {}B at {:#x}",
+ extent->get_length(), extent->get_laddr());
+ assert(extent->get_laddr() == addr);
+ assert(extent->get_length() == len);
+ return tm_ertr::make_ready_future<NodeExtentRef>(extent);
+ }
+
+ tm_future<NodeExtentRef> alloc_extent_sync(
+ Transaction& t, extent_len_t len) {
+ assert(len % ALIGNMENT == 0);
+ auto r = ceph::buffer::create_aligned(len, ALIGNMENT);
+ auto addr = reinterpret_cast<laddr_t>(r->get_data());
+ auto bp = ceph::bufferptr(std::move(r));
+ auto extent = Ref<DummyNodeExtent>(new DummyNodeExtent(std::move(bp)));
+ extent->set_laddr(addr);
+ assert(allocate_map.find(extent->get_laddr()) == allocate_map.end());
+ allocate_map.insert({extent->get_laddr(), extent});
+ logger().debug("OTree::Dummy: allocated {}B at {:#x}",
+ extent->get_length(), extent->get_laddr());
+ assert(extent->get_length() == len);
+ return tm_ertr::make_ready_future<NodeExtentRef>(extent);
+ }
+
+ tm_future<Super::URef> get_super_sync(
+ Transaction& t, RootNodeTracker& tracker) {
+ logger().debug("OTree::Dummy: got root {:#x}", root_laddr);
+ return tm_ertr::make_ready_future<Super::URef>(
+ Super::URef(new DummySuper(t, tracker, &root_laddr)));
+ }
+
+ static seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+
+ std::map<laddr_t, Ref<DummyNodeExtent>> allocate_map;
+ laddr_t root_laddr = L_ADDR_NULL;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.cc
new file mode 100644
index 000000000..8d88485bf
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.cc
@@ -0,0 +1,88 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "seastore.h"
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h"
+
+namespace {
+
+seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+}
+
+}
+
+namespace crimson::os::seastore::onode {
+
+static DeltaRecorderURef create_recorder(
+ node_type_t node_type, field_type_t field_type) {
+ if (node_type == node_type_t::LEAF) {
+ if (field_type == field_type_t::N0) {
+ return DeltaRecorderT<node_fields_0_t, node_type_t::LEAF>::create();
+ } else if (field_type == field_type_t::N1) {
+ return DeltaRecorderT<node_fields_1_t, node_type_t::LEAF>::create();
+ } else if (field_type == field_type_t::N2) {
+ return DeltaRecorderT<node_fields_2_t, node_type_t::LEAF>::create();
+ } else if (field_type == field_type_t::N3) {
+ return DeltaRecorderT<leaf_fields_3_t, node_type_t::LEAF>::create();
+ } else {
+ ceph_abort("impossible path");
+ }
+ } else if (node_type == node_type_t::INTERNAL) {
+ if (field_type == field_type_t::N0) {
+ return DeltaRecorderT<node_fields_0_t, node_type_t::INTERNAL>::create();
+ } else if (field_type == field_type_t::N1) {
+ return DeltaRecorderT<node_fields_1_t, node_type_t::INTERNAL>::create();
+ } else if (field_type == field_type_t::N2) {
+ return DeltaRecorderT<node_fields_2_t, node_type_t::INTERNAL>::create();
+ } else if (field_type == field_type_t::N3) {
+ return DeltaRecorderT<internal_fields_3_t, node_type_t::INTERNAL>::create();
+ } else {
+ ceph_abort("impossible path");
+ }
+ } else {
+ ceph_abort("impossible path");
+ }
+}
+
+void SeastoreSuper::write_root_laddr(context_t c, laddr_t addr) {
+ logger().info("OTree::Seastore: update root {:#x} ...", addr);
+ root_addr = addr;
+ auto nm = static_cast<SeastoreNodeExtentManager*>(&c.nm);
+ nm->get_tm().write_onode_root(c.t, addr);
+}
+
+NodeExtentRef SeastoreNodeExtent::mutate(
+ context_t c, DeltaRecorderURef&& _recorder) {
+ logger().debug("OTree::Seastore: mutate {:#x} ...", get_laddr());
+ auto nm = static_cast<SeastoreNodeExtentManager*>(&c.nm);
+ auto extent = nm->get_tm().get_mutable_extent(c.t, this);
+ auto ret = extent->cast<SeastoreNodeExtent>();
+ assert(!ret->recorder || ret->recorder->is_empty());
+ ret->recorder = std::move(_recorder);
+ return ret;
+}
+
+void SeastoreNodeExtent::apply_delta(const ceph::bufferlist& bl) {
+ logger().debug("OTree::Seastore: replay {:#x} ...", get_laddr());
+ if (!recorder) {
+ auto [node_type, field_type] = get_types();
+ recorder = create_recorder(node_type, field_type);
+ } else {
+#ifndef NDEBUG
+ auto [node_type, field_type] = get_types();
+ assert(recorder->node_type() == node_type);
+ assert(recorder->field_type() == field_type);
+#endif
+ }
+ assert(is_clean());
+ auto node = do_get_mutable();
+ auto p = bl.cbegin();
+ while (p != bl.end()) {
+ recorder->apply_delta(p, node);
+ }
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h
new file mode 100644
index 000000000..f80b99fab
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h
@@ -0,0 +1,126 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/common/log.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h"
+
+/**
+ * seastore.h
+ *
+ * Seastore backend implementations.
+ */
+
+namespace crimson::os::seastore::onode {
+
+class SeastoreSuper final: public Super {
+ public:
+ SeastoreSuper(Transaction& t, RootNodeTracker& tracker,
+ laddr_t root_addr, TransactionManager& tm)
+ : Super(t, tracker), root_addr{root_addr}, tm{tm} {}
+ ~SeastoreSuper() override = default;
+ protected:
+ laddr_t get_root_laddr() const override {
+ return root_addr;
+ }
+ void write_root_laddr(context_t c, laddr_t addr) override;
+ private:
+ static seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+ laddr_t root_addr;
+ TransactionManager& tm;
+};
+
+class SeastoreNodeExtent final: public NodeExtent {
+ public:
+ SeastoreNodeExtent(ceph::bufferptr &&ptr)
+ : NodeExtent(std::move(ptr)) {}
+ SeastoreNodeExtent(const SeastoreNodeExtent& other)
+ : NodeExtent(other) {}
+ ~SeastoreNodeExtent() override = default;
+ protected:
+ NodeExtentRef mutate(context_t, DeltaRecorderURef&&) override;
+
+ DeltaRecorder* get_recorder() const override {
+ return recorder.get();
+ }
+
+ CachedExtentRef duplicate_for_write() override {
+ return CachedExtentRef(new SeastoreNodeExtent(*this));
+ }
+ extent_types_t get_type() const override {
+ return extent_types_t::ONODE_BLOCK_STAGED;
+ }
+ ceph::bufferlist get_delta() override {
+ assert(recorder);
+ return recorder->get_delta();
+ }
+ void apply_delta(const ceph::bufferlist&) override;
+ private:
+ DeltaRecorderURef recorder;
+};
+
+class SeastoreNodeExtentManager final: public NodeExtentManager {
+ public:
+ SeastoreNodeExtentManager(TransactionManager& tm, laddr_t min)
+ : tm{tm}, addr_min{min} {};
+ ~SeastoreNodeExtentManager() override = default;
+ TransactionManager& get_tm() { return tm; }
+ protected:
+ bool is_read_isolated() const override { return true; }
+
+ tm_future<NodeExtentRef> read_extent(
+ Transaction& t, laddr_t addr, extent_len_t len) override {
+ logger().debug("OTree::Seastore: reading {}B at {:#x} ...", len, addr);
+ return tm.read_extents<SeastoreNodeExtent>(t, addr, len
+ ).safe_then([addr, len](auto&& extents) {
+ assert(extents.size() == 1);
+ [[maybe_unused]] auto [laddr, e] = extents.front();
+ logger().trace("OTree::Seastore: read {}B at {:#x}",
+ e->get_length(), e->get_laddr());
+ assert(e->get_laddr() == addr);
+ assert(e->get_length() == len);
+ std::ignore = addr;
+ std::ignore = len;
+ return NodeExtentRef(e);
+ });
+ }
+
+ tm_future<NodeExtentRef> alloc_extent(
+ Transaction& t, extent_len_t len) override {
+ logger().debug("OTree::Seastore: allocating {}B ...", len);
+ return tm.alloc_extent<SeastoreNodeExtent>(t, addr_min, len
+ ).safe_then([len](auto extent) {
+ logger().debug("OTree::Seastore: allocated {}B at {:#x}",
+ extent->get_length(), extent->get_laddr());
+ assert(extent->get_length() == len);
+ std::ignore = len;
+ return NodeExtentRef(extent);
+ });
+ }
+
+ tm_future<Super::URef> get_super(
+ Transaction& t, RootNodeTracker& tracker) override {
+ logger().trace("OTree::Seastore: get root ...");
+ return tm.read_onode_root(t).safe_then([this, &t, &tracker](auto root_addr) {
+ logger().debug("OTree::Seastore: got root {:#x}", root_addr);
+ return Super::URef(new SeastoreSuper(t, tracker, root_addr, tm));
+ });
+ }
+
+ std::ostream& print(std::ostream& os) const override {
+ return os << "SeastoreNodeExtentManager";
+ }
+
+ private:
+ static seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+ TransactionManager& tm;
+ const laddr_t addr_min;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h
new file mode 100644
index 000000000..240c88932
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h
@@ -0,0 +1,67 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h"
+
+/** test_replay.h
+ *
+ * A special version of NodeExtent that helps verify delta encoding, decoding
+ * and replay in recorder_t under debug builds.
+ */
+
+namespace crimson::os::seastore::onode {
+
+class TestReplayExtent final: public NodeExtent {
+ public:
+ using Ref = crimson::os::seastore::TCachedExtentRef<TestReplayExtent>;
+
+ void prepare_replay(NodeExtentRef from_extent) {
+ assert(get_length() == from_extent->get_length());
+ auto mut = do_get_mutable();
+ std::memcpy(mut.get_write(), from_extent->get_read(), get_length());
+ }
+
+ void replay_and_verify(NodeExtentRef replayed_extent) {
+ assert(get_length() == replayed_extent->get_length());
+ auto mut = do_get_mutable();
+ auto bl = recorder->get_delta();
+ assert(bl.length());
+ auto p = bl.cbegin();
+ recorder->apply_delta(p, mut);
+ assert(p == bl.end());
+ auto cmp = std::memcmp(get_read(), replayed_extent->get_read(), get_length());
+ ceph_assert(cmp == 0 && "replay mismatch!");
+ }
+
+ static Ref create(extent_len_t length, DeltaRecorderURef&& recorder) {
+ auto r = ceph::buffer::create_aligned(length, 4096);
+ auto bp = ceph::bufferptr(std::move(r));
+ return new TestReplayExtent(std::move(bp), std::move(recorder));
+ }
+
+ protected:
+ NodeExtentRef mutate(context_t, DeltaRecorderURef&&) override {
+ ceph_abort("impossible path"); }
+ DeltaRecorder* get_recorder() const override {
+ ceph_abort("impossible path"); }
+ CachedExtentRef duplicate_for_write() override {
+ ceph_abort("impossible path"); }
+ extent_types_t get_type() const override {
+ return extent_types_t::TEST_BLOCK; }
+ ceph::bufferlist get_delta() override {
+ ceph_abort("impossible path"); }
+ void apply_delta(const ceph::bufferlist&) override {
+ ceph_abort("impossible path"); }
+
+ private:
+ TestReplayExtent(ceph::bufferptr&& ptr, DeltaRecorderURef&& recorder)
+ : NodeExtent(std::move(ptr)), recorder(std::move(recorder)) {
+ state = extent_state_t::MUTATION_PENDING;
+ }
+ DeltaRecorderURef recorder;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.cc
new file mode 100644
index 000000000..048c4000d
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.cc
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "node_extent_mutable.h"
+#include "node_extent_manager.h"
+
+namespace crimson::os::seastore::onode {
+
+NodeExtentMutable::NodeExtentMutable(NodeExtent& extent)
+ : extent{extent} {
+ assert(extent.is_pending() || // during mutation
+ extent.is_clean()); // during replay
+}
+
+const char* NodeExtentMutable::get_read() const {
+ assert(extent.is_pending() || // during mutation
+ extent.is_clean()); // during replay
+ return extent.get_bptr().c_str();
+}
+
+char* NodeExtentMutable::get_write() {
+ assert(extent.is_pending() || // during mutation
+ extent.is_clean()); // during replay
+ return extent.get_bptr().c_str();
+}
+
+extent_len_t NodeExtentMutable::get_length() const {
+ return extent.get_length();
+}
+
+laddr_t NodeExtentMutable::get_laddr() const {
+ return extent.get_laddr();
+}
+
+const char* NodeExtentMutable::buf_upper_bound() const {
+ return get_read() + get_length();
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h
new file mode 100644
index 000000000..52f10a013
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h
@@ -0,0 +1,80 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <cstring>
+
+#include "fwd.h"
+
+#pragma once
+
+namespace crimson::os::seastore::onode {
+
+class NodeExtent;
+
+/**
+ * NodeExtentMutable
+ *
+ * A thin wrapper around NodeExtent that makes sure only a newly allocated or
+ * duplicated NodeExtent is mutable, and that memory modifications stay within
+ * the extent range.
+ */
+class NodeExtentMutable {
+ public:
+ void copy_in_absolute(void* dst, const void* src, extent_len_t len) {
+ assert((char*)dst >= get_write());
+ assert((char*)dst + len <= buf_upper_bound());
+ std::memcpy(dst, src, len);
+ }
+ template <typename T>
+ void copy_in_absolute(void* dst, const T& src) {
+ copy_in_absolute(dst, &src, sizeof(T));
+ }
+
+ const void* copy_in_relative(
+ extent_len_t dst_offset, const void* src, extent_len_t len) {
+ auto dst = get_write() + dst_offset;
+ copy_in_absolute(dst, src, len);
+ return dst;
+ }
+ template <typename T>
+ const T* copy_in_relative(
+ extent_len_t dst_offset, const T& src) {
+ auto dst = copy_in_relative(dst_offset, &src, sizeof(T));
+ return static_cast<const T*>(dst);
+ }
+
+ void shift_absolute(const void* src, extent_len_t len, int offset) {
+ assert((const char*)src >= get_write());
+ assert((const char*)src + len <= buf_upper_bound());
+ char* to = (char*)src + offset;
+ assert(to >= get_write());
+ assert(to + len <= buf_upper_bound());
+ if (len != 0) {
+ std::memmove(to, src, len);
+ }
+ }
+ void shift_relative(extent_len_t src_offset, extent_len_t len, int offset) {
+ shift_absolute(get_write() + src_offset, len, offset);
+ }
+
+ template <typename T>
+ void validate_inplace_update(const T& updated) {
+ assert((const char*)&updated >= get_write());
+ assert((const char*)&updated + sizeof(T) <= buf_upper_bound());
+ }
+
+ const char* get_read() const;
+ char* get_write();
+ extent_len_t get_length() const;
+ laddr_t get_laddr() const;
+
+ private:
+ explicit NodeExtentMutable(NodeExtent&);
+ const char* buf_upper_bound() const;
+
+ NodeExtent& extent;
+
+ friend class NodeExtent;
+};
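+
+// Illustrative usage (a sketch; mut is an assumed NodeExtentMutable obtained
+// from a pending NodeExtent): the "absolute" variants take a pointer into the
+// extent buffer, the "relative" variants take an offset from get_write(), and
+// both assert that the access stays inside [get_write(), get_write() + get_length()).
+//
+//   uint32_t magic = 0x1234;
+//   mut.copy_in_relative(0, magic);                    // write at offset 0
+//   mut.copy_in_absolute(mut.get_write() + 8, magic);  // write at offset 8
+//   mut.shift_relative(0, sizeof(magic), 16);          // move the 4 bytes at offset 0 to offset 16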
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.cc
new file mode 100644
index 000000000..59d792b1a
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.cc
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "node_impl.h"
+#include "node_layout.h"
+
+namespace crimson::os::seastore::onode {
+
+#ifdef UNIT_TESTS_BUILT
+last_split_info_t last_split = {};
+#endif
+
+// XXX: branchless allocation
+InternalNodeImpl::alloc_ertr::future<InternalNodeImpl::fresh_impl_t>
+InternalNodeImpl::allocate(
+ context_t c, field_type_t type, bool is_level_tail, level_t level) {
+ if (type == field_type_t::N0) {
+ return InternalNode0::allocate(c, is_level_tail, level);
+ } else if (type == field_type_t::N1) {
+ return InternalNode1::allocate(c, is_level_tail, level);
+ } else if (type == field_type_t::N2) {
+ return InternalNode2::allocate(c, is_level_tail, level);
+ } else if (type == field_type_t::N3) {
+ return InternalNode3::allocate(c, is_level_tail, level);
+ } else {
+ ceph_abort("impossible path");
+ }
+}
+
+LeafNodeImpl::alloc_ertr::future<LeafNodeImpl::fresh_impl_t>
+LeafNodeImpl::allocate(
+ context_t c, field_type_t type, bool is_level_tail) {
+ if (type == field_type_t::N0) {
+ return LeafNode0::allocate(c, is_level_tail, 0);
+ } else if (type == field_type_t::N1) {
+ return LeafNode1::allocate(c, is_level_tail, 0);
+ } else if (type == field_type_t::N2) {
+ return LeafNode2::allocate(c, is_level_tail, 0);
+ } else if (type == field_type_t::N3) {
+ return LeafNode3::allocate(c, is_level_tail, 0);
+ } else {
+ ceph_abort("impossible path");
+ }
+}
+
+InternalNodeImplURef InternalNodeImpl::load(
+ NodeExtentRef extent, field_type_t type, bool expect_is_level_tail) {
+ if (type == field_type_t::N0) {
+ return InternalNode0::load(extent, expect_is_level_tail);
+ } else if (type == field_type_t::N1) {
+ return InternalNode1::load(extent, expect_is_level_tail);
+ } else if (type == field_type_t::N2) {
+ return InternalNode2::load(extent, expect_is_level_tail);
+ } else if (type == field_type_t::N3) {
+ return InternalNode3::load(extent, expect_is_level_tail);
+ } else {
+ ceph_abort("impossible path");
+ }
+}
+
+LeafNodeImplURef LeafNodeImpl::load(
+ NodeExtentRef extent, field_type_t type, bool expect_is_level_tail) {
+ if (type == field_type_t::N0) {
+ return LeafNode0::load(extent, expect_is_level_tail);
+ } else if (type == field_type_t::N1) {
+ return LeafNode1::load(extent, expect_is_level_tail);
+ } else if (type == field_type_t::N2) {
+ return LeafNode2::load(extent, expect_is_level_tail);
+ } else if (type == field_type_t::N3) {
+ return LeafNode3::load(extent, expect_is_level_tail);
+ } else {
+ ceph_abort("impossible path");
+ }
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h
new file mode 100644
index 000000000..3267cda2b
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h
@@ -0,0 +1,197 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <ostream>
+
+#include "node_extent_mutable.h"
+#include "node_types.h"
+#include "stages/stage_types.h"
+
+namespace crimson::os::seastore::onode {
+
+#ifdef UNIT_TESTS_BUILT
+enum class InsertType { BEGIN, LAST, MID };
+struct split_expectation_t {
+ match_stage_t split_stage;
+ match_stage_t insert_stage;
+ bool is_insert_left;
+ InsertType insert_type;
+};
+struct last_split_info_t {
+ search_position_t split_pos;
+ match_stage_t insert_stage;
+ bool is_insert_left;
+ InsertType insert_type;
+ bool match(const split_expectation_t& e) const {
+ match_stage_t split_stage;
+ if (split_pos.nxt.nxt.index == 0) {
+ if (split_pos.nxt.index == 0) {
+ split_stage = 2;
+ } else {
+ split_stage = 1;
+ }
+ } else {
+ split_stage = 0;
+ }
+ return split_stage == e.split_stage &&
+ insert_stage == e.insert_stage &&
+ is_insert_left == e.is_insert_left &&
+ insert_type == e.insert_type;
+ }
+ bool match_split_pos(const search_position_t& pos) const {
+ return split_pos == pos;
+ }
+};
+extern last_split_info_t last_split;
+#endif
+
+struct key_hobj_t;
+struct key_view_t;
+class NodeExtentMutable;
+
+/**
+ * NodeImpl
+ *
+ * Hides the type-specific node layout implementations for Node.
+ */
+class NodeImpl {
+ public:
+ using alloc_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg,
+ crimson::ct_error::enoent,
+ crimson::ct_error::erange>;
+ virtual ~NodeImpl() = default;
+
+ virtual field_type_t field_type() const = 0;
+ virtual laddr_t laddr() const = 0;
+ virtual void prepare_mutate(context_t) = 0;
+ virtual bool is_level_tail() const = 0;
+ virtual bool is_empty() const = 0;
+ virtual level_t level() const = 0;
+ virtual node_offset_t free_size() const = 0;
+ virtual key_view_t get_key_view(const search_position_t&) const = 0;
+ virtual key_view_t get_largest_key_view() const = 0;
+ virtual void next_position(search_position_t&) const = 0;
+
+ virtual node_stats_t get_stats() const = 0;
+ virtual std::ostream& dump(std::ostream&) const = 0;
+ virtual std::ostream& dump_brief(std::ostream&) const = 0;
+ virtual void validate_layout() const = 0;
+
+ virtual void test_copy_to(NodeExtentMutable&) const = 0;
+ virtual void test_set_tail(NodeExtentMutable&) = 0;
+
+ protected:
+ NodeImpl() = default;
+};
+
+/**
+ * InternalNodeImpl
+ *
+ * Hides the type-specific node layout implementations for InternalNode.
+ */
+class InternalNodeImpl : public NodeImpl {
+ public:
+ struct internal_marker_t {};
+ virtual ~InternalNodeImpl() = default;
+
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual const laddr_packed_t* get_p_value(
+ const search_position_t&,
+ key_view_t* = nullptr, internal_marker_t = {}) const {
+ ceph_abort("impossible path");
+ }
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual lookup_result_t<node_type_t::INTERNAL> lower_bound(
+ const key_hobj_t&, MatchHistory&,
+ key_view_t* = nullptr, internal_marker_t = {}) const {
+ ceph_abort("impossible path");
+ }
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual const laddr_packed_t* insert(
+ const key_view_t&, const laddr_packed_t&, search_position_t&, match_stage_t&, node_offset_t&) {
+ ceph_abort("impossible path");
+ }
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual std::tuple<search_position_t, bool, const laddr_packed_t*> split_insert(
+ NodeExtentMutable&, NodeImpl&, const key_view_t&, const laddr_packed_t&,
+ search_position_t&, match_stage_t&, node_offset_t&) {
+ ceph_abort("impossible path");
+ }
+
+ virtual void replace_child_addr(const search_position_t&, laddr_t dst, laddr_t src) = 0;
+ virtual std::tuple<match_stage_t, node_offset_t> evaluate_insert(
+ const key_view_t&, const laddr_t&, search_position_t&) const = 0;
+
+ struct fresh_impl_t {
+ InternalNodeImplURef impl;
+ NodeExtentMutable mut;
+ std::pair<NodeImplURef, NodeExtentMutable> make_pair() {
+ return {std::move(impl), mut};
+ }
+ };
+ static alloc_ertr::future<fresh_impl_t> allocate(context_t, field_type_t, bool, level_t);
+ static InternalNodeImplURef load(NodeExtentRef, field_type_t, bool);
+
+ protected:
+ InternalNodeImpl() = default;
+};
+
+/**
+ * LeafNodeImpl
+ *
+ * Hides the type-specific node layout implementations for LeafNode.
+ */
+class LeafNodeImpl : public NodeImpl {
+ public:
+ struct leaf_marker_t {};
+ virtual ~LeafNodeImpl() = default;
+
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual const onode_t* get_p_value(
+ const search_position_t&,
+ key_view_t* = nullptr, leaf_marker_t={}) const {
+ ceph_abort("impossible path");
+ }
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual lookup_result_t<node_type_t::LEAF> lower_bound(
+ const key_hobj_t&, MatchHistory&,
+ key_view_t* = nullptr, leaf_marker_t = {}) const {
+ ceph_abort("impossible path");
+ }
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual const onode_t* insert(
+ const key_hobj_t&, const onode_t&, search_position_t&, match_stage_t&, node_offset_t&) {
+ ceph_abort("impossible path");
+ }
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual std::tuple<search_position_t, bool, const onode_t*> split_insert(
+ NodeExtentMutable&, NodeImpl&, const key_hobj_t&, const onode_t&,
+ search_position_t&, match_stage_t&, node_offset_t&) {
+ ceph_abort("impossible path");
+ }
+
+ virtual void get_largest_slot(
+ search_position_t&, key_view_t&, const onode_t**) const = 0;
+ virtual std::tuple<match_stage_t, node_offset_t> evaluate_insert(
+ const key_hobj_t&, const onode_t&,
+ const MatchHistory&, match_stat_t, search_position_t&) const = 0;
+
+ struct fresh_impl_t {
+ LeafNodeImplURef impl;
+ NodeExtentMutable mut;
+ std::pair<NodeImplURef, NodeExtentMutable> make_pair() {
+ return {std::move(impl), mut};
+ }
+ };
+ static alloc_ertr::future<fresh_impl_t> allocate(context_t, field_type_t, bool);
+ static LeafNodeImplURef load(NodeExtentRef, field_type_t, bool);
+
+ protected:
+ LeafNodeImpl() = default;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h
new file mode 100644
index 000000000..916d17424
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h
@@ -0,0 +1,613 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <ostream>
+#include <sstream>
+
+#include "common/likely.h"
+#include "crimson/common/log.h"
+#include "node_extent_accessor.h"
+#include "node_impl.h"
+#include "stages/node_stage_layout.h"
+
+namespace crimson::os::seastore::onode {
+
+template <node_type_t NODE_TYPE> struct insert_key_type;
+template <> struct insert_key_type<node_type_t::INTERNAL> {
+ static constexpr auto type = KeyT::VIEW; };
+template <> struct insert_key_type<node_type_t::LEAF> {
+ static constexpr auto type = KeyT::HOBJ; };
+
+template <node_type_t NODE_TYPE> struct node_impl_type;
+template <> struct node_impl_type<node_type_t::INTERNAL> {
+ using type = InternalNodeImpl; };
+template <> struct node_impl_type<node_type_t::LEAF> {
+ using type = LeafNodeImpl; };
+
+template <node_type_t NODE_TYPE> struct node_marker_type;
+template <> struct node_marker_type<node_type_t::INTERNAL> {
+ using type = InternalNodeImpl::internal_marker_t; };
+template <> struct node_marker_type<node_type_t::LEAF> {
+ using type = LeafNodeImpl::leaf_marker_t; };
+
+/**
+ * NodeLayoutT
+ *
+ * Contains templated and concrete implementations for both InternalNodeImpl
+ * and LeafNodeImpl under a specific node layout.
+ */
+template <typename FieldType, node_type_t NODE_TYPE>
+class NodeLayoutT final : public InternalNodeImpl, public LeafNodeImpl {
+ public:
+ using URef = std::unique_ptr<NodeLayoutT>;
+ using extent_t = NodeExtentAccessorT<FieldType, NODE_TYPE>;
+ using parent_t = typename node_impl_type<NODE_TYPE>::type;
+ using marker_t = typename node_marker_type<NODE_TYPE>::type;
+ using node_stage_t = typename extent_t::node_stage_t;
+ using position_t = typename extent_t::position_t;
+ using value_t = typename extent_t::value_t;
+ static constexpr auto FIELD_TYPE = extent_t::FIELD_TYPE;
+ static constexpr auto KEY_TYPE = insert_key_type<NODE_TYPE>::type;
+ static constexpr auto STAGE = STAGE_T::STAGE;
+
+ NodeLayoutT(const NodeLayoutT&) = delete;
+ NodeLayoutT(NodeLayoutT&&) = delete;
+ NodeLayoutT& operator=(const NodeLayoutT&) = delete;
+ NodeLayoutT& operator=(NodeLayoutT&&) = delete;
+ ~NodeLayoutT() override = default;
+
+ static URef load(NodeExtentRef extent, bool expect_is_level_tail) {
+ std::unique_ptr<NodeLayoutT> ret(new NodeLayoutT(extent));
+ assert(ret->is_level_tail() == expect_is_level_tail);
+ return ret;
+ }
+
+ using alloc_ertr = NodeExtentManager::tm_ertr;
+ static alloc_ertr::future<typename parent_t::fresh_impl_t> allocate(
+ context_t c, bool is_level_tail, level_t level) {
+ // NOTE: Currently, all the node types have the same size for simplicity.
+ // But depending on the requirement, we may need to make node size
+ // configurable by field_type_t and node_type_t, or totally flexible.
+ return c.nm.alloc_extent(c.t, node_stage_t::EXTENT_SIZE
+ ).safe_then([is_level_tail, level](auto extent) {
+ assert(extent->is_initial_pending());
+ auto mut = extent->get_mutable();
+ node_stage_t::bootstrap_extent(
+ mut, FIELD_TYPE, NODE_TYPE, is_level_tail, level);
+ return typename parent_t::fresh_impl_t{
+ std::unique_ptr<parent_t>(new NodeLayoutT(extent)), mut};
+ });
+ }
+
+ protected:
+ /*
+ * NodeImpl
+ */
+ field_type_t field_type() const override { return FIELD_TYPE; }
+ laddr_t laddr() const override { return extent.get_laddr(); }
+ void prepare_mutate(context_t c) override { return extent.prepare_mutate(c); }
+ bool is_level_tail() const override { return extent.read().is_level_tail(); }
+ bool is_empty() const override { return extent.read().keys() == 0; }
+ level_t level() const override { return extent.read().level(); }
+ node_offset_t free_size() const override { return extent.read().free_size(); }
+
+ key_view_t get_key_view(const search_position_t& position) const override {
+ key_view_t ret;
+ STAGE_T::get_key_view(extent.read(), cast_down<STAGE>(position), ret);
+ return ret;
+ }
+
+ key_view_t get_largest_key_view() const override {
+ key_view_t index_key;
+ STAGE_T::template lookup_largest_slot<false, true, false>(
+ extent.read(), nullptr, &index_key, nullptr);
+ return index_key;
+ }
+
+ void next_position(search_position_t& pos) const override {
+ assert(!pos.is_end());
+ bool find_next = STAGE_T::next_position(extent.read(), cast_down<STAGE>(pos));
+ if (find_next) {
+ pos = search_position_t::end();
+ }
+ }
+
+ node_stats_t get_stats() const override {
+ node_stats_t stats;
+ auto& node_stage = extent.read();
+ key_view_t index_key;
+ if (node_stage.keys()) {
+ STAGE_T::get_stats(node_stage, stats, index_key);
+ }
+ stats.size_persistent = node_stage_t::EXTENT_SIZE;
+ stats.size_filled = filled_size();
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ if (is_level_tail()) {
+ stats.size_logical += sizeof(value_t);
+ stats.size_value += sizeof(value_t);
+ stats.num_kvs += 1;
+ }
+ }
+ return stats;
+ }
+
+ std::ostream& dump(std::ostream& os) const override {
+ auto& node_stage = extent.read();
+ auto p_start = node_stage.p_start();
+ dump_brief(os);
+ auto stats = get_stats();
+ os << " num_kvs=" << stats.num_kvs
+ << ", logical=" << stats.size_logical
+ << "B, overhead=" << stats.size_overhead
+ << "B, value=" << stats.size_value << "B";
+ os << ":\n header: " << node_stage_t::header_size() << "B";
+ size_t size = 0u;
+ if (node_stage.keys()) {
+ STAGE_T::dump(node_stage, os, " ", size, p_start);
+ } else {
+ size += node_stage_t::header_size();
+ if (NODE_TYPE == node_type_t::LEAF || !node_stage.is_level_tail()) {
+ os << " empty!";
+ }
+ }
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ if (node_stage.is_level_tail()) {
+ size += sizeof(laddr_t);
+ auto value_ptr = node_stage.get_end_p_laddr();
+ int offset = reinterpret_cast<const char*>(value_ptr) - p_start;
+ os << "\n tail value: 0x"
+ << std::hex << value_ptr->value << std::dec
+ << " " << size << "B"
+ << " @" << offset << "B";
+ }
+ }
+ assert(size == filled_size());
+ return os;
+ }
+
+ std::ostream& dump_brief(std::ostream& os) const override {
+ auto& node_stage = extent.read();
+ os << "Node" << NODE_TYPE << FIELD_TYPE
+ << "@0x" << std::hex << extent.get_laddr()
+ << "+" << node_stage_t::EXTENT_SIZE << std::dec
+ << (node_stage.is_level_tail() ? "$" : "")
+ << "(level=" << (unsigned)node_stage.level()
+ << ", filled=" << filled_size() << "B"
+ << ", free=" << node_stage.free_size() << "B"
+ << ")";
+ return os;
+ }
+
+ void validate_layout() const override {
+#ifndef NDEBUG
+ STAGE_T::validate(extent.read());
+#endif
+ }
+
+ void test_copy_to(NodeExtentMutable& to) const override {
+ extent.test_copy_to(to);
+ }
+
+ void test_set_tail(NodeExtentMutable& mut) override {
+ node_stage_t::update_is_level_tail(mut, extent.read(), true);
+ }
+
+ /*
+ * Common
+ */
+ const value_t* get_p_value(const search_position_t& position,
+ key_view_t* index_key=nullptr, marker_t={}) const override {
+ auto& node_stage = extent.read();
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ assert(!index_key);
+ if (position.is_end()) {
+ assert(is_level_tail());
+ return node_stage.get_end_p_laddr();
+ }
+ } else {
+ assert(!position.is_end());
+ }
+ if (index_key) {
+ return STAGE_T::template get_p_value<true>(
+ node_stage, cast_down<STAGE>(position), index_key);
+ } else {
+ return STAGE_T::get_p_value(node_stage, cast_down<STAGE>(position));
+ }
+ }
+
+ lookup_result_t<NODE_TYPE> lower_bound(
+ const key_hobj_t& key, MatchHistory& history,
+ key_view_t* index_key=nullptr, marker_t={}) const override {
+ auto& node_stage = extent.read();
+ if constexpr (NODE_TYPE == node_type_t::LEAF) {
+ if (unlikely(node_stage.keys() == 0)) {
+ history.set<STAGE_LEFT>(MatchKindCMP::LT);
+ return lookup_result_t<NODE_TYPE>::end();
+ }
+ }
+
+ typename STAGE_T::result_t result_raw;
+ if (index_key) {
+ result_raw = STAGE_T::template lower_bound<true>(
+ node_stage, key, history, index_key);
+#ifndef NDEBUG
+ if (!result_raw.is_end()) {
+ full_key_t<KeyT::VIEW> index;
+ STAGE_T::get_key_view(node_stage, result_raw.position, index);
+ assert(index == *index_key);
+ }
+#endif
+ } else {
+ result_raw = STAGE_T::lower_bound(node_stage, key, history);
+ }
+#ifndef NDEBUG
+ if (result_raw.is_end()) {
+ assert(result_raw.mstat == MSTAT_END);
+ } else {
+ full_key_t<KeyT::VIEW> index;
+ STAGE_T::get_key_view(node_stage, result_raw.position, index);
+ assert_mstat(key, index, result_raw.mstat);
+ }
+#endif
+
+ // calculate MSTAT_LT3
+ if constexpr (FIELD_TYPE == field_type_t::N0) {
+ // currently only internal node checks mstat
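+ // If the coarse result is MSTAT_LT2, check whether the key already differs
+ // from the indexed shard/pool; if so, the match is really MSTAT_LT3
+ // (key < index at pool/shard).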
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ if (result_raw.mstat == MSTAT_LT2) {
+ auto cmp = compare_to<KeyT::HOBJ>(
+ key, node_stage[result_raw.position.index].shard_pool);
+ assert(cmp != MatchKindCMP::GT);
+ if (cmp != MatchKindCMP::EQ) {
+ result_raw.mstat = MSTAT_LT3;
+ }
+ }
+ }
+ }
+
+ auto result = normalize(std::move(result_raw));
+ if (result.is_end()) {
+ assert(node_stage.is_level_tail());
+ assert(result.p_value == nullptr);
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ result.p_value = node_stage.get_end_p_laddr();
+ }
+ } else {
+ assert(result.p_value != nullptr);
+ }
+ return result;
+ }
+
+ const value_t* insert(
+ const full_key_t<KEY_TYPE>& key, const value_t& value,
+ search_position_t& insert_pos, match_stage_t& insert_stage,
+ node_offset_t& insert_size) override {
+ logger().debug("OTree::Layout::Insert: begin at "
+ "insert_pos({}), insert_stage={}, insert_size={}B ...",
+ insert_pos, insert_stage, insert_size);
+ if (unlikely(logger().is_enabled(seastar::log_level::trace))) {
+ std::ostringstream sos;
+ dump(sos);
+ logger().trace("OTree::Layout::Insert: -- dump\n{}", sos.str());
+ }
+ auto ret = extent.template insert_replayable<KEY_TYPE>(
+ key, value, cast_down<STAGE>(insert_pos), insert_stage, insert_size);
+ logger().debug("OTree::Layout::Insert: done at "
+ "insert_pos({}), insert_stage={}, insert_size={}B",
+ insert_pos, insert_stage, insert_size);
+ if (unlikely(logger().is_enabled(seastar::log_level::trace))) {
+ std::ostringstream sos;
+ dump(sos);
+ logger().trace("OTree::Layout::Insert: -- dump\n{}", sos.str());
+ }
+ validate_layout();
+ assert(get_key_view(insert_pos) == key);
+ return ret;
+ }
+
+ std::tuple<search_position_t, bool, const value_t*> split_insert(
+ NodeExtentMutable& right_mut, NodeImpl& right_impl,
+ const full_key_t<KEY_TYPE>& key, const value_t& value,
+ search_position_t& _insert_pos, match_stage_t& insert_stage,
+ node_offset_t& insert_size) override {
+ logger().info("OTree::Layout::Split: begin at "
+ "insert_pos({}), insert_stage={}, insert_size={}B, "
+ "{:#x}=>{:#x} ...",
+ _insert_pos, insert_stage, insert_size,
+ laddr(), right_impl.laddr());
+ if (unlikely(logger().is_enabled(seastar::log_level::debug))) {
+ std::ostringstream sos;
+ dump(sos);
+ logger().debug("OTree::Layout::Split: -- dump\n{}", sos.str());
+ }
+#ifdef UNIT_TESTS_BUILT
+ auto insert_stage_pre = insert_stage;
+#endif
+
+ auto& insert_pos = cast_down<STAGE>(_insert_pos);
+ auto& node_stage = extent.read();
+ typename STAGE_T::StagedIterator split_at;
+ bool is_insert_left;
+ size_t split_size;
+ size_t target_split_size;
+ {
+ size_t empty_size = node_stage.size_before(0);
+ size_t filled_kv_size = filled_size() - empty_size;
+ /** NODE_BLOCK_SIZE considerations
+ *
+ * Generally,
+ * target_split_size = (filled_size + insert_size) / 2
+ * We can have two locate_split() strategies:
+ * A. the simpler one is to locate the largest split position where
+ * the estimated left_node_size <= target_split_size;
+ * B. the fair one takes a further step to calculate the next slot of
+ * P KiB, and if left_node_size + P/2 < target_split_size, compensate
+ * the split position to include the next slot; (TODO)
+ *
+ * Say that the node_block_size = N KiB, the largest allowed
+ * insert_size = 1/I * N KiB (I > 1). We want to identify the minimal 'I'
+ * that won't lead to the "double split" effect, meaning that after a split,
+ * the right node size is still larger than N KiB and needs to split
+ * again. "Double split" makes splitting much more complicated, and
+ * we can no longer identify whether the node is safe under concurrent
+ * operations.
+ *
+ * We need to evaluate the worst case in order to identify 'I'. This means:
+ * - filled_size ~= N KiB
+ * - insert_size == N/I KiB
+ * - target_split_size ~= (I+1)/2I * N KiB
+ * To simplify the below calculations, node_block_size is normalized to 1.
+ *
+ * With strategy A, the worst case is when left_node_size cannot include
+ * the next slot that will just overflow the target_split_size:
+ * - left_node_size + 1/I ~= (I+1)/2I
+ * - left_node_size ~= (I-1)/2I
+ * - right_node_size ~= 1 + 1/I - left_node_size ~= (I+3)/2I
+ * The right_node_size cannot be larger than the node_block_size in the
+ * worst case, which means (I+3)/2I < 1, so I > 3, meaning the largest
+ * possible insert_size must be smaller than 1/3 of the node_block_size.
+ *
+ * With strategy B, the worst case is when left_node_size cannot include
+ * the next slot that will just overflow the threshold
+ * target_split_size - 1/2I, thus:
+ * - left_node_size ~= (I+1)/2I - 1/2I ~= 1/2
+ * - right_node_size ~= 1 + 1/I - 1/2 ~= (I+2)/2I < node_block_size(1)
+ * - I > 2
+ * This means the largest possible insert_size must be smaller than 1/2 of
+ * the node_block_size, which is better than strategy A.
+ *
+ * In order to avoid "double split", there is another side-effect we need
+ * to take into consideration: if the split happens within the snap-gen
+ * indexes, the corresponding ns-oid string needs to be copied to the right
+ * node. That is
+ * to say: right_node_size + string_size < node_block_size.
+ *
+ * Say that the largest allowed string size is 1/S of the largest allowed
+ * insert_size N/I KiB. If we go with strategy B, the equation should be
+ * changed to:
+ * - right_node_size ~= (I+2)/2I + 1/(I*S) < 1
+ * - I > 2 + 2/S (S > 1)
+ *
+ * Now back to the NODE_BLOCK_SIZE calculation: if we limit the ns-oid string
+ * to at most X KiB and the onode_t to at most Y KiB in this BTree, then:
+ * - largest_insert_size ~= X+Y KiB
+ * - 1/S == X/(X+Y)
+ * - I > (4X+2Y)/(X+Y)
+ * - node_block_size(N) == I * insert_size > 4X+2Y KiB
+ *
+ * In conclusion,
+ * (TODO) the current node block size (4 KiB) is too small to
+ * store an entire 2 KiB ns-oid string. We need to consider a larger
+ * node_block_size.
+ *
+ * We are setting X = Y = 640 B in order not to break the current
+ * implementation with the 4 KiB node size.
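+ *
+ * As a sanity check with these limits: 1/S = X/(X+Y) = 1/2, so I > 2 + 2/S
+ * = 3, the largest allowed insert_size is X+Y = 1280 B, and node_block_size
+ * must exceed 4X+2Y = 3840 B, which the current 4 KiB extent satisfies.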
+ *
+ * (TODO) Implement smarter logic to check when "double split" happens.
+ */
+ target_split_size = empty_size + (filled_kv_size + insert_size) / 2;
+ assert(insert_size < (node_stage.total_size() - empty_size) / 2);
+
+ std::optional<bool> _is_insert_left;
+ split_at.set(node_stage);
+ split_size = 0;
+ bool locate_nxt = STAGE_T::recursively_locate_split_inserted(
+ split_size, 0, target_split_size, insert_pos,
+ insert_stage, insert_size, _is_insert_left, split_at);
+ is_insert_left = *_is_insert_left;
+ logger().debug("OTree::Layout::Split: -- located "
+ "split_at({}), insert_pos({}), is_insert_left={}, "
+ "split_size={}B(target={}B, current={}B)",
+ split_at, insert_pos, is_insert_left,
+ split_size, target_split_size, filled_size());
+ // split_size can be larger than target_split_size in strategy B
+ // assert(split_size <= target_split_size);
+ if (locate_nxt) {
+ assert(insert_stage == STAGE);
+ assert(split_at.get().is_last());
+ split_at.set_end();
+ assert(insert_pos.index == split_at.index());
+ }
+ }
+
+ auto append_at = split_at;
+ // TODO(cross-node string dedup)
+ typename STAGE_T::template StagedAppender<KEY_TYPE> right_appender;
+ right_appender.init(&right_mut, right_mut.get_write());
+ const value_t* p_value = nullptr;
+ if (!is_insert_left) {
+ // right node: append [start(append_at), insert_pos)
+ STAGE_T::template append_until<KEY_TYPE>(
+ append_at, right_appender, insert_pos, insert_stage);
+ logger().debug("OTree::Layout::Split: -- right appended until "
+ "insert_pos({}), insert_stage={}, insert/append the rest ...",
+ insert_pos, insert_stage);
+ // right node: append [insert_pos(key, value)]
+ bool is_front_insert = (insert_pos == position_t::begin());
+ [[maybe_unused]] bool is_end = STAGE_T::template append_insert<KEY_TYPE>(
+ key, value, append_at, right_appender,
+ is_front_insert, insert_stage, p_value);
+ assert(append_at.is_end() == is_end);
+ } else {
+ logger().debug("OTree::Layout::Split: -- right appending ...");
+ }
+
+ // right node: append (insert_pos, end)
+ auto pos_end = position_t::end();
+ STAGE_T::template append_until<KEY_TYPE>(
+ append_at, right_appender, pos_end, STAGE);
+ assert(append_at.is_end());
+ right_appender.wrap();
+ if (unlikely(logger().is_enabled(seastar::log_level::debug))) {
+ std::ostringstream sos;
+ right_impl.dump(sos);
+ logger().debug("OTree::Layout::Split: -- right node dump\n{}", sos.str());
+ }
+ right_impl.validate_layout();
+
+ // mutate left node
+ if (is_insert_left) {
+ logger().debug("OTree::Layout::Split: -- left trim/insert at "
+ "insert_pos({}), insert_stage={} ...",
+ insert_pos, insert_stage);
+ p_value = extent.template split_insert_replayable<KEY_TYPE>(
+ split_at, key, value, insert_pos, insert_stage, insert_size);
+ assert(get_key_view(_insert_pos) == key);
+ } else {
+ logger().debug("OTree::Layout::Split: -- left trim ...");
+ assert(right_impl.get_key_view(_insert_pos) == key);
+ extent.split_replayable(split_at);
+ }
+ if (unlikely(logger().is_enabled(seastar::log_level::debug))) {
+ std::ostringstream sos;
+ dump(sos);
+ logger().debug("OTree::Layout::Split: -- left node dump\n{}", sos.str());
+ }
+ validate_layout();
+ assert(p_value);
+
+ auto split_pos = normalize(split_at.get_pos());
+ logger().info("OTree::Layout::Split: done at "
+ "insert_pos({}), insert_stage={}, insert_size={}B, split_at({}), "
+ "is_insert_left={}, split_size={}B(target={}B)",
+ _insert_pos, insert_stage, insert_size, split_pos,
+ is_insert_left, split_size, target_split_size);
+ assert(split_size == filled_size());
+
+#ifdef UNIT_TESTS_BUILT
+ InsertType insert_type;
+ search_position_t last_pos;
+ if (is_insert_left) {
+ STAGE_T::template lookup_largest_slot<true, false, false>(
+ extent.read(), &cast_down_fill_0<STAGE>(last_pos), nullptr, nullptr);
+ } else {
+ node_stage_t right_stage{reinterpret_cast<FieldType*>(right_mut.get_write())};
+ STAGE_T::template lookup_largest_slot<true, false, false>(
+ right_stage, &cast_down_fill_0<STAGE>(last_pos), nullptr, nullptr);
+ }
+ if (_insert_pos == search_position_t::begin()) {
+ insert_type = InsertType::BEGIN;
+ } else if (_insert_pos == last_pos) {
+ insert_type = InsertType::LAST;
+ } else {
+ insert_type = InsertType::MID;
+ }
+ last_split = {split_pos, insert_stage_pre, is_insert_left, insert_type};
+#endif
+ return {split_pos, is_insert_left, p_value};
+ }
+
+ /*
+ * InternalNodeImpl
+ */
+ void replace_child_addr(
+ const search_position_t& pos, laddr_t dst, laddr_t src) override {
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ const laddr_packed_t* p_value = get_p_value(pos);
+ assert(p_value->value == src);
+ extent.update_child_addr_replayable(dst, const_cast<laddr_packed_t*>(p_value));
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+
+ std::tuple<match_stage_t, node_offset_t> evaluate_insert(
+ const key_view_t& key, const laddr_t& value,
+ search_position_t& insert_pos) const override {
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ auto packed_value = laddr_packed_t{value};
+ auto& node_stage = extent.read();
+ match_stage_t insert_stage;
+ node_offset_t insert_size;
+ if (unlikely(!node_stage.keys())) {
+ assert(insert_pos.is_end());
+ insert_stage = STAGE;
+ insert_size = STAGE_T::template insert_size<KeyT::VIEW>(key, packed_value);
+ } else {
+ std::tie(insert_stage, insert_size) = STAGE_T::evaluate_insert(
+ node_stage, key, packed_value, cast_down<STAGE>(insert_pos), false);
+ }
+ return {insert_stage, insert_size};
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+
+ /*
+ * LeafNodeImpl
+ */
+ void get_largest_slot(search_position_t& pos,
+ key_view_t& index_key, const onode_t** pp_value) const override {
+ if constexpr (NODE_TYPE == node_type_t::LEAF) {
+ STAGE_T::template lookup_largest_slot<true, true, true>(
+ extent.read(), &cast_down_fill_0<STAGE>(pos), &index_key, pp_value);
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+
+ std::tuple<match_stage_t, node_offset_t> evaluate_insert(
+ const key_hobj_t& key, const onode_t& value,
+ const MatchHistory& history, match_stat_t mstat,
+ search_position_t& insert_pos) const override {
+ if constexpr (NODE_TYPE == node_type_t::LEAF) {
+ if (unlikely(is_empty())) {
+ assert(insert_pos.is_end());
+ return {STAGE, STAGE_T::template insert_size<KeyT::HOBJ>(key, value)};
+ } else {
+ return STAGE_T::evaluate_insert(
+ key, value, history, mstat, cast_down<STAGE>(insert_pos));
+ }
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+
+ private:
+ NodeLayoutT(NodeExtentRef extent) : extent{extent} {}
+
+ node_offset_t filled_size() const {
+ auto& node_stage = extent.read();
+ auto ret = node_stage.size_before(node_stage.keys());
+ assert(ret == node_stage.total_size() - node_stage.free_size());
+ return ret;
+ }
+
+ static seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+
+ extent_t extent;
+};
+
+using InternalNode0 = NodeLayoutT<node_fields_0_t, node_type_t::INTERNAL>;
+using InternalNode1 = NodeLayoutT<node_fields_1_t, node_type_t::INTERNAL>;
+using InternalNode2 = NodeLayoutT<node_fields_2_t, node_type_t::INTERNAL>;
+using InternalNode3 = NodeLayoutT<internal_fields_3_t, node_type_t::INTERNAL>;
+using LeafNode0 = NodeLayoutT<node_fields_0_t, node_type_t::LEAF>;
+using LeafNode1 = NodeLayoutT<node_fields_1_t, node_type_t::LEAF>;
+using LeafNode2 = NodeLayoutT<node_fields_2_t, node_type_t::LEAF>;
+using LeafNode3 = NodeLayoutT<leaf_fields_3_t, node_type_t::LEAF>;
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout_replayable.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout_replayable.h
new file mode 100644
index 000000000..c1499d609
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout_replayable.h
@@ -0,0 +1,75 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "node_extent_mutable.h"
+#include "stages/node_stage.h"
+#include "stages/stage.h"
+
+#define STAGE_T node_to_stage_t<node_stage_t>
+
+namespace crimson::os::seastore::onode {
+
+/**
+ * NodeLayoutReplayableT
+ *
+ * Contains templated logic to modify the layout of a NodeExtent in ways that
+ * are also replayable. Used by NodeExtentAccessorT at runtime and by
+ * DeltaRecorderT during replay.
+ */
+template <typename FieldType, node_type_t NODE_TYPE>
+struct NodeLayoutReplayableT {
+ using node_stage_t = node_extent_t<FieldType, NODE_TYPE>;
+ using position_t = typename STAGE_T::position_t;
+ using StagedIterator = typename STAGE_T::StagedIterator;
+ using value_t = value_type_t<NODE_TYPE>;
+ static constexpr auto FIELD_TYPE = FieldType::FIELD_TYPE;
+
+ template <KeyT KT>
+ static const value_t* insert(
+ NodeExtentMutable& mut,
+ const node_stage_t& node_stage,
+ const full_key_t<KT>& key,
+ const value_t& value,
+ position_t& insert_pos,
+ match_stage_t& insert_stage,
+ node_offset_t& insert_size) {
+ auto p_value = STAGE_T::template proceed_insert<KT, false>(
+ mut, node_stage, key, value, insert_pos, insert_stage, insert_size);
+ return p_value;
+ }
+
+ static void split(
+ NodeExtentMutable& mut,
+ const node_stage_t& node_stage,
+ StagedIterator& split_at) {
+ node_stage_t::update_is_level_tail(mut, node_stage, false);
+ STAGE_T::trim(mut, split_at);
+ }
+
+ template <KeyT KT>
+ static const value_t* split_insert(
+ NodeExtentMutable& mut,
+ const node_stage_t& node_stage,
+ StagedIterator& split_at,
+ const full_key_t<KT>& key,
+ const value_t& value,
+ position_t& insert_pos,
+ match_stage_t& insert_stage,
+ node_offset_t& insert_size) {
+ node_stage_t::update_is_level_tail(mut, node_stage, false);
+ STAGE_T::trim(mut, split_at);
+ auto p_value = STAGE_T::template proceed_insert<KT, true>(
+ mut, node_stage, key, value, insert_pos, insert_stage, insert_size);
+ return p_value;
+ }
+
+ static void update_child_addr(
+ NodeExtentMutable& mut, const laddr_t new_addr, laddr_packed_t* p_addr) {
+ assert(NODE_TYPE == node_type_t::INTERNAL);
+ mut.copy_in_absolute(p_addr, new_addr);
+ }
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h
new file mode 100644
index 000000000..6774544c7
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cassert>
+#include <ostream>
+
+#include "fwd.h"
+
+namespace crimson::os::seastore::onode {
+
+constexpr uint8_t FIELD_TYPE_MAGIC = 0x25;
+enum class field_type_t : uint8_t {
+ N0 = FIELD_TYPE_MAGIC,
+ N1,
+ N2,
+ N3,
+ _MAX
+};
+inline uint8_t to_unsigned(field_type_t type) {
+ auto value = static_cast<uint8_t>(type);
+ assert(value >= FIELD_TYPE_MAGIC);
+ assert(value < static_cast<uint8_t>(field_type_t::_MAX));
+ return value - FIELD_TYPE_MAGIC;
+}
+inline std::ostream& operator<<(std::ostream &os, field_type_t type) {
+ const char* const names[] = {"0", "1", "2", "3"};
+ auto index = to_unsigned(type);
+ os << names[index];
+ return os;
+}
+
+enum class node_type_t : uint8_t {
+ LEAF = 0,
+ INTERNAL
+};
+inline std::ostream& operator<<(std::ostream &os, const node_type_t& type) {
+ const char* const names[] = {"L", "I"};
+ auto index = static_cast<uint8_t>(type);
+ assert(index <= 1u);
+ os << names[index];
+ return os;
+}
+
+struct laddr_packed_t {
+ laddr_t value;
+} __attribute__((packed));
+inline std::ostream& operator<<(std::ostream& os, const laddr_packed_t& laddr) {
+ return os << "laddr_packed(0x" << std::hex << laddr.value << std::dec << ")";
+}
+
+using match_stat_t = int8_t;
+constexpr match_stat_t MSTAT_END = -2; // index is search_position_t::end()
+constexpr match_stat_t MSTAT_EQ = -1; // key == index
+constexpr match_stat_t MSTAT_LT0 = 0; // key == index [pool/shard crush ns/oid]; key < index [snap/gen]
+constexpr match_stat_t MSTAT_LT1 = 1; // key == index [pool/shard crush]; key < index [ns/oid]
+constexpr match_stat_t MSTAT_LT2 = 2; // key < index [pool/shard crush ns/oid] ||
+ // key == index [pool/shard]; key < index [crush]
+constexpr match_stat_t MSTAT_LT3 = 3; // key < index [pool/shard]
+constexpr match_stat_t MSTAT_MIN = MSTAT_END;
+constexpr match_stat_t MSTAT_MAX = MSTAT_LT3;
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc
new file mode 100644
index 000000000..443c6cabd
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc
@@ -0,0 +1,165 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "item_iterator_stage.h"
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h"
+
+namespace crimson::os::seastore::onode {
+
+#define ITER_T item_iterator_t<NODE_TYPE>
+#define ITER_INST(NT) item_iterator_t<NT>
+
+template <node_type_t NODE_TYPE>
+template <KeyT KT>
+memory_range_t ITER_T::insert_prefix(
+ NodeExtentMutable& mut, const ITER_T& iter, const full_key_t<KT>& key,
+ bool is_end, node_offset_t size, const char* p_left_bound) {
+ // 1. insert range
+ char* p_insert;
+ if (is_end) {
+ assert(!iter.has_next());
+ p_insert = const_cast<char*>(iter.p_start());
+ } else {
+ p_insert = const_cast<char*>(iter.p_end());
+ }
+ char* p_insert_front = p_insert - size;
+
+ // 2. shift memory
+ const char* p_shift_start = p_left_bound;
+ const char* p_shift_end = p_insert;
+ mut.shift_absolute(p_shift_start,
+ p_shift_end - p_shift_start,
+ -(int)size);
+
+ // 3. append header
+ p_insert -= sizeof(node_offset_t);
+ node_offset_t back_offset = (p_insert - p_insert_front);
+ mut.copy_in_absolute(p_insert, back_offset);
+ ns_oid_view_t::append<KT>(mut, key, p_insert);
+
+ return {p_insert_front, p_insert};
+}
+#define IP_TEMPLATE(NT, KT) \
+ template memory_range_t ITER_INST(NT)::insert_prefix<KT>( \
+ NodeExtentMutable&, const ITER_INST(NT)&, const full_key_t<KT>&, \
+ bool, node_offset_t, const char*)
+IP_TEMPLATE(node_type_t::LEAF, KeyT::VIEW);
+IP_TEMPLATE(node_type_t::INTERNAL, KeyT::VIEW);
+IP_TEMPLATE(node_type_t::LEAF, KeyT::HOBJ);
+IP_TEMPLATE(node_type_t::INTERNAL, KeyT::HOBJ);
+
+template <node_type_t NODE_TYPE>
+void ITER_T::update_size(
+ NodeExtentMutable& mut, const ITER_T& iter, int change) {
+ node_offset_t offset = iter.get_back_offset();
+ int new_size = change + offset;
+ assert(new_size > 0 && new_size < NODE_BLOCK_SIZE);
+ mut.copy_in_absolute(
+ (void*)iter.get_item_range().p_end, node_offset_t(new_size));
+}
+
+template <node_type_t NODE_TYPE>
+node_offset_t ITER_T::trim_until(NodeExtentMutable&, const ITER_T& iter) {
+ assert(iter.index() != 0);
+ size_t ret = iter.p_end() - iter.p_items_start;
+ assert(ret < NODE_BLOCK_SIZE);
+ return ret;
+}
+
+template <node_type_t NODE_TYPE>
+node_offset_t ITER_T::trim_at(
+ NodeExtentMutable& mut, const ITER_T& iter, node_offset_t trimmed) {
+ size_t trim_size = iter.p_start() - iter.p_items_start + trimmed;
+ assert(trim_size < NODE_BLOCK_SIZE);
+ assert(iter.get_back_offset() > trimmed);
+ node_offset_t new_offset = iter.get_back_offset() - trimmed;
+ mut.copy_in_absolute((void*)iter.item_range.p_end, new_offset);
+ return trim_size;
+}
+
+#define ITER_TEMPLATE(NT) template class ITER_INST(NT)
+ITER_TEMPLATE(node_type_t::LEAF);
+ITER_TEMPLATE(node_type_t::INTERNAL);
+
+#define APPEND_T ITER_T::Appender<KT>
+
+template <node_type_t NODE_TYPE>
+template <KeyT KT>
+bool APPEND_T::append(const ITER_T& src, index_t& items) {
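+ // 'items' is either an explicit count of source items to append starting
+ // from the item src currently references, or one of the special values:
+ // INDEX_END appends everything remaining including the last item, while
+ // INDEX_LAST appends everything up to but excluding the last item. On
+ // return, 'items' holds the number of items actually appended, and the
+ // returned bool indicates whether the source was consumed to its end.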
+ auto p_end = src.p_end();
+ bool append_till_end = false;
+ if (is_valid_index(items)) {
+ for (auto i = 1u; i <= items; ++i) {
+ if (!src.has_next()) {
+ assert(i == items);
+ append_till_end = true;
+ break;
+ }
+ ++src;
+ }
+ } else {
+ if (items == INDEX_END) {
+ append_till_end = true;
+ } else {
+ assert(items == INDEX_LAST);
+ }
+ items = 0;
+ while (src.has_next()) {
+ ++src;
+ ++items;
+ }
+ if (append_till_end) {
+ ++items;
+ }
+ }
+
+ const char* p_start;
+ if (append_till_end) {
+ p_start = src.p_start();
+ } else {
+ p_start = src.p_end();
+ }
+ assert(p_end >= p_start);
+ size_t append_size = p_end - p_start;
+ p_append -= append_size;
+ p_mut->copy_in_absolute(p_append, p_start, append_size);
+ return append_till_end;
+}
+
+template <node_type_t NODE_TYPE>
+template <KeyT KT>
+std::tuple<NodeExtentMutable*, char*>
+APPEND_T::open_nxt(const key_get_type& partial_key) {
+ p_append -= sizeof(node_offset_t);
+ p_offset_while_open = p_append;
+ ns_oid_view_t::append(*p_mut, partial_key, p_append);
+ return {p_mut, p_append};
+}
+
+template <node_type_t NODE_TYPE>
+template <KeyT KT>
+std::tuple<NodeExtentMutable*, char*>
+APPEND_T::open_nxt(const full_key_t<KT>& key) {
+ p_append -= sizeof(node_offset_t);
+ p_offset_while_open = p_append;
+ ns_oid_view_t::append<KT>(*p_mut, key, p_append);
+ return {p_mut, p_append};
+}
+
+template <node_type_t NODE_TYPE>
+template <KeyT KT>
+void APPEND_T::wrap_nxt(char* _p_append) {
+ assert(_p_append < p_append);
+ p_mut->copy_in_absolute(
+ p_offset_while_open, node_offset_t(p_offset_while_open - _p_append));
+ p_append = _p_append;
+}
+
+#define APPEND_TEMPLATE(NT, KT) template class ITER_INST(NT)::Appender<KT>
+APPEND_TEMPLATE(node_type_t::LEAF, KeyT::VIEW);
+APPEND_TEMPLATE(node_type_t::INTERNAL, KeyT::VIEW);
+APPEND_TEMPLATE(node_type_t::LEAF, KeyT::HOBJ);
+APPEND_TEMPLATE(node_type_t::INTERNAL, KeyT::HOBJ);
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h
new file mode 100644
index 000000000..bb68eec8f
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h
@@ -0,0 +1,180 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h"
+#include "key_layout.h"
+#include "stage_types.h"
+
+namespace crimson::os::seastore::onode {
+
+class NodeExtentMutable;
+
+/**
+ * item_iterator_t
+ *
+ * The STAGE_STRING implementation for nodes N0/N1. It implements the staged
+ * contract as an iterative container to resolve crush hash conflicts.
+ *
+ * The layout of the container to index ns, oid strings storing n items:
+ *
+ * # <--------- container range ---------> #
+ * #<~># items [i+1, n) #
+ * # # items [0, i) #<~>#
+ * # # <------ item i -------------> # #
+ * # # <--- item_range ---> | # #
+ * # # | # #
+ * # # next-stage | ns-oid | back_ # #
+ * # # container | strings | offset # #
+ * #...# range | | #...#
+ * ^ ^ | ^
+ * | | | |
+ * | +---------------------------+ |
+ * + p_items_start p_items_end +
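+ *
+ * Note that item index 0 occupies the highest addresses (next to
+ * p_items_end); operator++() recomputes item_range towards p_items_start,
+ * so increasing indexes move towards the start of the container range.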
+ */
+template <node_type_t NODE_TYPE>
+class item_iterator_t {
+ using value_t = value_type_t<NODE_TYPE>;
+ public:
+ item_iterator_t(const memory_range_t& range)
+ : p_items_start(range.p_start), p_items_end(range.p_end) {
+ assert(p_items_start < p_items_end);
+ next_item_range(p_items_end);
+ }
+
+ const char* p_start() const { return item_range.p_start; }
+ const char* p_end() const { return item_range.p_end + sizeof(node_offset_t); }
+ const memory_range_t& get_item_range() const { return item_range; }
+ node_offset_t get_back_offset() const { return back_offset; }
+
+ // container type system
+ using key_get_type = const ns_oid_view_t&;
+ static constexpr auto CONTAINER_TYPE = ContainerType::ITERATIVE;
+ index_t index() const { return _index; }
+ key_get_type get_key() const {
+ if (!key.has_value()) {
+ key = ns_oid_view_t(item_range.p_end);
+ assert(item_range.p_start < (*key).p_start());
+ }
+ return *key;
+ }
+ node_offset_t size() const {
+ size_t ret = item_range.p_end - item_range.p_start + sizeof(node_offset_t);
+ assert(ret < NODE_BLOCK_SIZE);
+ return ret;
+ };
+ node_offset_t size_to_nxt() const {
+ size_t ret = get_key().size() + sizeof(node_offset_t);
+ assert(ret < NODE_BLOCK_SIZE);
+ return ret;
+ }
+ node_offset_t size_overhead() const {
+ return sizeof(node_offset_t) + get_key().size_overhead();
+ }
+ memory_range_t get_nxt_container() const {
+ return {item_range.p_start, get_key().p_start()};
+ }
+ bool has_next() const {
+ assert(p_items_start <= item_range.p_start);
+ return p_items_start < item_range.p_start;
+ }
+ const item_iterator_t<NODE_TYPE>& operator++() const {
+ assert(has_next());
+ next_item_range(item_range.p_start);
+ key.reset();
+ ++_index;
+ return *this;
+ }
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const {
+ int start_offset = p_items_start - p_node_start;
+ int end_offset = p_items_end - p_node_start;
+ assert(start_offset > 0 && start_offset < NODE_BLOCK_SIZE);
+ assert(end_offset > 0 && end_offset <= NODE_BLOCK_SIZE);
+ ceph::encode(static_cast<node_offset_t>(start_offset), encoded);
+ ceph::encode(static_cast<node_offset_t>(end_offset), encoded);
+ ceph::encode(_index, encoded);
+ }
+
+ static item_iterator_t decode(const char* p_node_start,
+ ceph::bufferlist::const_iterator& delta) {
+ node_offset_t start_offset;
+ ceph::decode(start_offset, delta);
+ node_offset_t end_offset;
+ ceph::decode(end_offset, delta);
+ assert(start_offset < end_offset);
+ assert(end_offset <= NODE_BLOCK_SIZE);
+ index_t index;
+ ceph::decode(index, delta);
+
+ item_iterator_t ret({p_node_start + start_offset,
+ p_node_start + end_offset});
+ while (index > 0) {
+ ++ret;
+ --index;
+ }
+ return ret;
+ }
+
+ static node_offset_t header_size() { return 0u; }
+
+ template <KeyT KT>
+ static node_offset_t estimate_insert(
+ const full_key_t<KT>& key, const value_t&) {
+ return ns_oid_view_t::estimate_size<KT>(key) + sizeof(node_offset_t);
+ }
+
+ template <KeyT KT>
+ static memory_range_t insert_prefix(
+ NodeExtentMutable& mut, const item_iterator_t<NODE_TYPE>& iter,
+ const full_key_t<KT>& key, bool is_end,
+ node_offset_t size, const char* p_left_bound);
+
+ static void update_size(
+ NodeExtentMutable& mut, const item_iterator_t<NODE_TYPE>& iter, int change);
+
+ static node_offset_t trim_until(NodeExtentMutable&, const item_iterator_t<NODE_TYPE>&);
+ static node_offset_t trim_at(
+ NodeExtentMutable&, const item_iterator_t<NODE_TYPE>&, node_offset_t trimmed);
+
+ template <KeyT KT>
+ class Appender;
+
+ private:
+ void next_item_range(const char* p_end) const {
+ auto p_item_end = p_end - sizeof(node_offset_t);
+ assert(p_items_start < p_item_end);
+ back_offset = reinterpret_cast<const node_offset_packed_t*>(p_item_end)->value;
+ assert(back_offset);
+ const char* p_item_start = p_item_end - back_offset;
+ assert(p_items_start <= p_item_start);
+ item_range = {p_item_start, p_item_end};
+ }
+
+ const char* p_items_start;
+ const char* p_items_end;
+ mutable memory_range_t item_range;
+ mutable node_offset_t back_offset;
+ mutable std::optional<ns_oid_view_t> key;
+ mutable index_t _index = 0u;
+};
+
+template <node_type_t NODE_TYPE>
+template <KeyT KT>
+class item_iterator_t<NODE_TYPE>::Appender {
+ public:
+ Appender(NodeExtentMutable* p_mut, char* p_append)
+ : p_mut{p_mut}, p_append{p_append} {}
+ bool append(const item_iterator_t<NODE_TYPE>& src, index_t& items);
+ char* wrap() { return p_append; }
+ std::tuple<NodeExtentMutable*, char*> open_nxt(const key_get_type&);
+ std::tuple<NodeExtentMutable*, char*> open_nxt(const full_key_t<KT>&);
+ void wrap_nxt(char* _p_append);
+
+ private:
+ NodeExtentMutable* p_mut;
+ char* p_append;
+ char* p_offset_while_open;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc
new file mode 100644
index 000000000..d60bb8d09
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc
@@ -0,0 +1,32 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "key_layout.h"
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h"
+
+namespace crimson::os::seastore::onode {
+
+void string_key_view_t::append_str(
+ NodeExtentMutable& mut, std::string_view str, char*& p_append) {
+ assert(is_valid_size(str.length()));
+ p_append -= sizeof(string_size_t);
+ string_size_t len = str.length();
+ mut.copy_in_absolute(p_append, len);
+ p_append -= len;
+ mut.copy_in_absolute(p_append, str.data(), len);
+}
+
+void string_key_view_t::append_dedup(
+ NodeExtentMutable& mut, const Type& dedup_type, char*& p_append) {
+ p_append -= sizeof(string_size_t);
+ if (dedup_type == Type::MIN) {
+ mut.copy_in_absolute(p_append, MIN);
+ } else if (dedup_type == Type::MAX) {
+ mut.copy_in_absolute(p_append, MAX);
+ } else {
+ ceph_abort("impossible path");
+ }
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h
new file mode 100644
index 000000000..cc1f546c1
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h
@@ -0,0 +1,846 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cassert>
+#include <limits>
+#include <optional>
+#include <ostream>
+
+#include "common/hobject.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/fwd.h"
+
+namespace crimson::os::seastore::onode {
+
+using shard_t = int8_t;
+using pool_t = int64_t;
+using crush_hash_t = uint32_t;
+using snap_t = uint64_t;
+using gen_t = uint64_t;
+static_assert(sizeof(shard_t) == sizeof(ghobject_t().shard_id.id));
+static_assert(sizeof(pool_t) == sizeof(ghobject_t().hobj.pool));
+static_assert(sizeof(crush_hash_t) == sizeof(ghobject_t().hobj.get_hash()));
+static_assert(sizeof(snap_t) == sizeof(ghobject_t().hobj.snap.val));
+static_assert(sizeof(gen_t) == sizeof(ghobject_t().generation));
+
+class NodeExtentMutable;
+class key_view_t;
+class key_hobj_t;
+enum class KeyT { VIEW, HOBJ };
+template <KeyT> struct _full_key_type;
+template<> struct _full_key_type<KeyT::VIEW> { using type = key_view_t; };
+template<> struct _full_key_type<KeyT::HOBJ> { using type = key_hobj_t; };
+template <KeyT type>
+using full_key_t = typename _full_key_type<type>::type;
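+// i.e. full_key_t<KeyT::HOBJ> is key_hobj_t (a key provided by the user) and
+// full_key_t<KeyT::VIEW> is key_view_t (a key located inside a tree node).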
+
+struct node_offset_packed_t {
+ node_offset_t value;
+} __attribute__((packed));
+
+// TODO: consider alignments
+struct shard_pool_t {
+ bool operator==(const shard_pool_t& x) const {
+ return (shard == x.shard && pool == x.pool);
+ }
+ bool operator!=(const shard_pool_t& x) const { return !(*this == x); }
+
+ template <KeyT KT>
+ static shard_pool_t from_key(const full_key_t<KT>& key);
+
+ shard_t shard;
+ pool_t pool;
+} __attribute__((packed));
+inline std::ostream& operator<<(std::ostream& os, const shard_pool_t& sp) {
+ return os << (unsigned)sp.shard << "," << sp.pool;
+}
+inline MatchKindCMP compare_to(const shard_pool_t& l, const shard_pool_t& r) {
+ auto ret = toMatchKindCMP(l.shard, r.shard);
+ if (ret != MatchKindCMP::EQ)
+ return ret;
+ return toMatchKindCMP(l.pool, r.pool);
+}
+
+struct crush_t {
+ bool operator==(const crush_t& x) const { return crush == x.crush; }
+ bool operator!=(const crush_t& x) const { return !(*this == x); }
+
+ template <KeyT KT>
+ static crush_t from_key(const full_key_t<KT>& key);
+
+ crush_hash_t crush;
+} __attribute__((packed));
+inline std::ostream& operator<<(std::ostream& os, const crush_t& c) {
+ return os << c.crush;
+}
+inline MatchKindCMP compare_to(const crush_t& l, const crush_t& r) {
+ return toMatchKindCMP(l.crush, r.crush);
+}
+
+struct shard_pool_crush_t {
+ bool operator==(const shard_pool_crush_t& x) const {
+ return (shard_pool == x.shard_pool && crush == x.crush);
+ }
+ bool operator!=(const shard_pool_crush_t& x) const { return !(*this == x); }
+
+ template <KeyT KT>
+ static shard_pool_crush_t from_key(const full_key_t<KT>& key);
+
+ shard_pool_t shard_pool;
+ crush_t crush;
+} __attribute__((packed));
+inline std::ostream& operator<<(std::ostream& os, const shard_pool_crush_t& spc) {
+ return os << spc.shard_pool << "," << spc.crush;
+}
+inline MatchKindCMP compare_to(
+ const shard_pool_crush_t& l, const shard_pool_crush_t& r) {
+ auto ret = compare_to(l.shard_pool, r.shard_pool);
+ if (ret != MatchKindCMP::EQ)
+ return ret;
+ return compare_to(l.crush, r.crush);
+}
+
+struct snap_gen_t {
+ bool operator==(const snap_gen_t& x) const {
+ return (snap == x.snap && gen == x.gen);
+ }
+ bool operator!=(const snap_gen_t& x) const { return !(*this == x); }
+
+ template <KeyT KT>
+ static snap_gen_t from_key(const full_key_t<KT>& key);
+
+ snap_t snap;
+ gen_t gen;
+} __attribute__((packed));
+inline std::ostream& operator<<(std::ostream& os, const snap_gen_t& sg) {
+ return os << sg.snap << "," << sg.gen;
+}
+inline MatchKindCMP compare_to(const snap_gen_t& l, const snap_gen_t& r) {
+ auto ret = toMatchKindCMP(l.snap, r.snap);
+ if (ret != MatchKindCMP::EQ)
+ return ret;
+ return toMatchKindCMP(l.gen, r.gen);
+}
+
+/**
+ * string_key_view_t
+ *
+ * The layout to store a char array as an oid or ns string, which may be
+ * compressed.
+ *
+ * If compressed, the physical block only stores an unsigned int of
+ * string_size_t, with value 0 denoting Type::MIN, and value max() denoting
+ * Type::MAX.
+ *
+ * If not compressed (Type::STR), the physical block stores the char array and
+ * a valid string_size_t value.
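+ *
+ * For example, appending the oid string "obj" as Type::STR lays out the 3
+ * chars followed by a string_size_t of value 3, so the constructor reads the
+ * length just below p_end and finds the chars right below the length field.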
+ */
+struct string_key_view_t {
+ enum class Type {MIN, STR, MAX};
+ // presumably the maximum string length is 2KiB
+ using string_size_t = uint16_t;
+ static constexpr auto MAX = std::numeric_limits<string_size_t>::max();
+ static constexpr auto MIN = string_size_t(0u);
+ static auto is_valid_size(size_t size) {
+ return (size > MIN && size < MAX);
+ }
+
+ string_key_view_t(const char* p_end) {
+ p_length = p_end - sizeof(string_size_t);
+ std::memcpy(&length, p_length, sizeof(string_size_t));
+ if (is_valid_size(length)) {
+ auto _p_key = p_length - length;
+ p_key = static_cast<const char*>(_p_key);
+ } else {
+ assert(length == MAX || length == MIN);
+ p_key = nullptr;
+ }
+ }
+ Type type() const {
+ if (length == MIN) {
+ return Type::MIN;
+ } else if (length == MAX) {
+ return Type::MAX;
+ } else {
+ assert(is_valid_size(length));
+ return Type::STR;
+ }
+ }
+ const char* p_start() const {
+ if (p_key) {
+ return p_key;
+ } else {
+ return p_length;
+ }
+ }
+ const char* p_next_end() const {
+ if (p_key) {
+ return p_start();
+ } else {
+ return p_length + sizeof(string_size_t);
+ }
+ }
+ node_offset_t size() const {
+ size_t ret = length + sizeof(string_size_t);
+ assert(ret < NODE_BLOCK_SIZE);
+ return ret;
+ }
+ node_offset_t size_logical() const {
+ assert(type() == Type::STR);
+ assert(is_valid_size(length));
+ return length;
+ }
+ node_offset_t size_overhead() const {
+ assert(type() == Type::STR);
+ return sizeof(string_size_t);
+ }
+
+ std::string_view to_string_view() const {
+ assert(type() == Type::STR);
+ assert(is_valid_size(length));
+ return {p_key, length};
+ }
+ bool operator==(const string_key_view_t& x) const {
+ if (type() == x.type() && type() != Type::STR)
+ return true;
+ if (type() != x.type())
+ return false;
+ if (length != x.length)
+ return false;
+ return (memcmp(p_key, x.p_key, length) == 0);
+ }
+ bool operator!=(const string_key_view_t& x) const { return !(*this == x); }
+
+ static void append_str(
+ NodeExtentMutable&, std::string_view, char*& p_append);
+
+ static void test_append_str(std::string_view str, char*& p_append) {
+ assert(is_valid_size(str.length()));
+ p_append -= sizeof(string_size_t);
+ string_size_t len = str.length();
+ std::memcpy(p_append, &len, sizeof(string_size_t));
+ p_append -= len;
+ std::memcpy(p_append, str.data(), len);
+ }
+
+ static void append_dedup(
+ NodeExtentMutable&, const Type& dedup_type, char*& p_append);
+
+ static void test_append_dedup(const Type& dedup_type, char*& p_append) {
+ p_append -= sizeof(string_size_t);
+ string_size_t len;
+ if (dedup_type == Type::MIN) {
+ len = MIN;
+ } else if (dedup_type == Type::MAX) {
+ len = MAX;
+ } else {
+ ceph_abort("impossible path");
+ }
+ std::memcpy(p_append, &len, sizeof(string_size_t));
+ }
+
+ const char* p_key;
+ const char* p_length;
+ // TODO: remove if p_length is aligned
+ string_size_t length;
+};
+
+/**
+ * string_view_masked_t
+ *
+ * A common class that hides the underlying string implementation, whether it
+ * is a (possibly compressed) string_key_view_t, a string/string_view, or a
+ * compressed string, and provides consistent compare, print, convert and
+ * append operations.
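+ *
+ * For instance, given some ns_oid_view_t ns_oid, its (possibly compressed)
+ * nspace can be compared against a user-provided string through the same
+ * wrapper via the compare_to() overloads defined below:
+ *   compare_to(string_view_masked_t{ns_oid.nspace}, std::string_view{"ns"})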
+ */
+class string_view_masked_t {
+ public:
+ using string_size_t = string_key_view_t::string_size_t;
+ using Type = string_key_view_t::Type;
+ explicit string_view_masked_t(const string_key_view_t& index)
+ : type{index.type()} {
+ if (type == Type::STR) {
+ view = index.to_string_view();
+ }
+ }
+ explicit string_view_masked_t(std::string_view str)
+ : type{Type::STR}, view{str} {
+ assert(string_key_view_t::is_valid_size(view.size()));
+ }
+
+ Type get_type() const { return type; }
+ std::string_view to_string_view() const {
+ assert(get_type() == Type::STR);
+ return view;
+ }
+ string_size_t size() const {
+ assert(get_type() == Type::STR);
+ assert(string_key_view_t::is_valid_size(view.size()));
+ return view.size();
+ }
+ bool operator==(const string_view_masked_t& x) const {
+ if (get_type() == x.get_type() && get_type() != Type::STR)
+ return true;
+ if (get_type() != x.get_type())
+ return false;
+ if (size() != x.size())
+ return false;
+ return (memcmp(view.data(), x.view.data(), size()) == 0);
+ }
+ bool operator!=(const string_view_masked_t& x) const { return !(*this == x); }
+ void encode(ceph::bufferlist& bl) const {
+ if (get_type() == Type::MIN) {
+ ceph::encode(string_key_view_t::MIN, bl);
+ } else if (get_type() == Type::MAX) {
+ ceph::encode(string_key_view_t::MAX, bl);
+ } else {
+ ceph::encode(size(), bl);
+ ceph::encode_nohead(view, bl);
+ }
+ }
+ static auto min() { return string_view_masked_t{Type::MIN}; }
+ static auto max() { return string_view_masked_t{Type::MAX}; }
+ static string_view_masked_t decode(
+ std::string& str_storage, ceph::bufferlist::const_iterator& delta) {
+ string_size_t size;
+ ceph::decode(size, delta);
+ if (size == string_key_view_t::MIN) {
+ return min();
+ } else if (size == string_key_view_t::MAX) {
+ return max();
+ } else {
+ ceph::decode_nohead(size, str_storage, delta);
+ return string_view_masked_t(str_storage);
+ }
+ }
+
+ private:
+ explicit string_view_masked_t(Type type)
+ : type{type} {}
+
+ Type type;
+ std::string_view view;
+};
+inline MatchKindCMP compare_to(const string_view_masked_t& l, const string_view_masked_t& r) {
+ using Type = string_view_masked_t::Type;
+ auto l_type = l.get_type();
+ auto r_type = r.get_type();
+ if (l_type == Type::STR && r_type == Type::STR) {
+ assert(l.size() && r.size());
+ return toMatchKindCMP(l.to_string_view(), r.to_string_view());
+ } else if (l_type == r_type) {
+ return MatchKindCMP::EQ;
+ } else if (l_type == Type::MIN || r_type == Type::MAX) {
+ return MatchKindCMP::LT;
+ } else { // l_type == Type::MAX || r_type == Type::MIN
+ return MatchKindCMP::GT;
+ }
+}
+inline MatchKindCMP compare_to(std::string_view l, const string_view_masked_t& r) {
+ using Type = string_view_masked_t::Type;
+ assert(l.length());
+ auto r_type = r.get_type();
+ if (r_type == Type::MIN) {
+ return MatchKindCMP::GT;
+ } else if (r_type == Type::MAX) {
+ return MatchKindCMP::LT;
+ } else { // r_type == Type::STR
+ assert(r.size());
+ return toMatchKindCMP(l, r.to_string_view());
+ }
+}
+inline MatchKindCMP compare_to(const string_view_masked_t& l, std::string_view r) {
+ return reverse(compare_to(r, l));
+}
+inline std::ostream& operator<<(std::ostream& os, const string_view_masked_t& masked) {
+ using Type = string_view_masked_t::Type;
+ auto type = masked.get_type();
+ if (type == Type::MIN) {
+ return os << "MIN";
+ } else if (type == Type::MAX) {
+ return os << "MAX";
+ } else { // type == Type::STR
+ auto view = masked.to_string_view();
+ if (view.length() <= 12) {
+ os << "\"" << view << "\"";
+ } else {
+ os << "\"" << std::string_view(view.data(), 4) << ".."
+ << std::string_view(view.data() + view.length() - 2, 2)
+ << "/" << view.length() << "B\"";
+ }
+ return os;
+ }
+}
+
+struct ns_oid_view_t {
+ using string_size_t = string_key_view_t::string_size_t;
+ using Type = string_key_view_t::Type;
+
+ ns_oid_view_t(const char* p_end) : nspace(p_end), oid(nspace.p_next_end()) {}
+ Type type() const { return oid.type(); }
+ const char* p_start() const { return oid.p_start(); }
+ node_offset_t size() const {
+ if (type() == Type::STR) {
+ size_t ret = nspace.size() + oid.size();
+ assert(ret < NODE_BLOCK_SIZE);
+ return ret;
+ } else {
+ return sizeof(string_size_t);
+ }
+ }
+ node_offset_t size_logical() const {
+ assert(type() == Type::STR);
+ return nspace.size_logical() + oid.size_logical();
+ }
+ node_offset_t size_overhead() const {
+ assert(type() == Type::STR);
+ return nspace.size_overhead() + oid.size_overhead();
+ }
+ bool operator==(const ns_oid_view_t& x) const {
+ return (string_view_masked_t{nspace} == string_view_masked_t{x.nspace} &&
+ string_view_masked_t{oid} == string_view_masked_t{x.oid});
+ }
+ bool operator!=(const ns_oid_view_t& x) const { return !(*this == x); }
+
+ template <KeyT KT>
+ static node_offset_t estimate_size(const full_key_t<KT>& key);
+
+ template <KeyT KT>
+ static void append(NodeExtentMutable&,
+ const full_key_t<KT>& key,
+ char*& p_append);
+
+ static void append(NodeExtentMutable& mut,
+ const ns_oid_view_t& view,
+ char*& p_append) {
+ if (view.type() == Type::STR) {
+ string_key_view_t::append_str(mut, view.nspace.to_string_view(), p_append);
+ string_key_view_t::append_str(mut, view.oid.to_string_view(), p_append);
+ } else {
+ string_key_view_t::append_dedup(mut, view.type(), p_append);
+ }
+ }
+
+ template <KeyT KT>
+ static void test_append(const full_key_t<KT>& key, char*& p_append);
+
+ string_key_view_t nspace;
+ string_key_view_t oid;
+};
+inline std::ostream& operator<<(std::ostream& os, const ns_oid_view_t& ns_oid) {
+ return os << string_view_masked_t{ns_oid.nspace} << ","
+ << string_view_masked_t{ns_oid.oid};
+}
+inline MatchKindCMP compare_to(const ns_oid_view_t& l, const ns_oid_view_t& r) {
+ auto ret = compare_to(string_view_masked_t{l.nspace},
+ string_view_masked_t{r.nspace});
+ if (ret != MatchKindCMP::EQ)
+ return ret;
+ return compare_to(string_view_masked_t{l.oid},
+ string_view_masked_t{r.oid});
+}
+
+/**
+ * key_hobj_t
+ *
+ * A specialized implementation of a full_key_t storing a ghobject_t passed
+ * in by the user.
+ */
+class key_hobj_t {
+ public:
+ explicit key_hobj_t(const ghobject_t& ghobj) : ghobj{ghobj} {}
+ /*
+ * common interfaces as a full_key_t
+ */
+ shard_t shard() const {
+ return ghobj.shard_id;
+ }
+ pool_t pool() const {
+ return ghobj.hobj.pool;
+ }
+ crush_hash_t crush() const {
+ return ghobj.hobj.get_hash();
+ }
+ std::string_view nspace() const {
+ // TODO(cross-node string dedup)
+ return ghobj.hobj.nspace;
+ }
+ string_view_masked_t nspace_masked() const {
+ // TODO(cross-node string dedup)
+ return string_view_masked_t{nspace()};
+ }
+ std::string_view oid() const {
+ // TODO(cross-node string dedup)
+ return ghobj.hobj.oid.name;
+ }
+ string_view_masked_t oid_masked() const {
+ // TODO(cross-node string dedup)
+ return string_view_masked_t{oid()};
+ }
+ ns_oid_view_t::Type dedup_type() const {
+ return _dedup_type;
+ }
+ snap_t snap() const {
+ return ghobj.hobj.snap;
+ }
+ gen_t gen() const {
+ return ghobj.generation;
+ }
+
+ bool operator==(const full_key_t<KeyT::VIEW>& o) const;
+ bool operator==(const full_key_t<KeyT::HOBJ>& o) const;
+ bool operator!=(const full_key_t<KeyT::VIEW>& o) const {
+ return !operator==(o);
+ }
+ bool operator!=(const full_key_t<KeyT::HOBJ>& o) const {
+ return !operator==(o);
+ }
+
+ std::ostream& dump(std::ostream& os) const {
+ os << "key_hobj(" << (unsigned)shard() << ","
+ << pool() << "," << crush() << "; "
+ << string_view_masked_t{nspace()} << ","
+ << string_view_masked_t{oid()} << "; "
+ << snap() << "," << gen() << ")";
+ return os;
+ }
+
+ static key_hobj_t decode(ceph::bufferlist::const_iterator& delta) {
+ shard_t shard;
+ ceph::decode(shard, delta);
+ pool_t pool;
+ ceph::decode(pool, delta);
+ crush_hash_t crush;
+ ceph::decode(crush, delta);
+ std::string nspace;
+ auto nspace_masked = string_view_masked_t::decode(nspace, delta);
+ // TODO(cross-node string dedup)
+ assert(nspace_masked.get_type() == string_view_masked_t::Type::STR);
+ std::string oid;
+ auto oid_masked = string_view_masked_t::decode(oid, delta);
+ // TODO(cross-node string dedup)
+ assert(oid_masked.get_type() == string_view_masked_t::Type::STR);
+ snap_t snap;
+ ceph::decode(snap, delta);
+ gen_t gen;
+ ceph::decode(gen, delta);
+ return key_hobj_t(ghobject_t(
+ shard_id_t(shard), pool, crush, nspace, oid, snap, gen));
+ }
+
+ private:
+ ns_oid_view_t::Type _dedup_type = ns_oid_view_t::Type::STR;
+ ghobject_t ghobj;
+};
+inline std::ostream& operator<<(std::ostream& os, const key_hobj_t& key) {
+ return key.dump(os);
+}
+
+/**
+ * key_view_t
+ *
+ * A specialized implementation of a full_key_t pointing to the locations
+ * storing the full key in a tree node.
+ */
+class key_view_t {
+ public:
+ /**
+ * common interfaces as a full_key_t
+ */
+ shard_t shard() const {
+ return shard_pool_packed().shard;
+ }
+ pool_t pool() const {
+ return shard_pool_packed().pool;
+ }
+ crush_hash_t crush() const {
+ return crush_packed().crush;
+ }
+ std::string_view nspace() const {
+ // TODO(cross-node string dedup)
+ return ns_oid_view().nspace.to_string_view();
+ }
+ string_view_masked_t nspace_masked() const {
+ // TODO(cross-node string dedup)
+ return string_view_masked_t{ns_oid_view().nspace};
+ }
+ std::string_view oid() const {
+ // TODO(cross-node string dedup)
+ return ns_oid_view().oid.to_string_view();
+ }
+ string_view_masked_t oid_masked() const {
+ // TODO(cross-node string dedup)
+ return string_view_masked_t{ns_oid_view().oid};
+ }
+ ns_oid_view_t::Type dedup_type() const {
+ return ns_oid_view().type();
+ }
+ snap_t snap() const {
+ return snap_gen_packed().snap;
+ }
+ gen_t gen() const {
+ return snap_gen_packed().gen;
+ }
+
+ bool operator==(const full_key_t<KeyT::VIEW>& o) const;
+ bool operator==(const full_key_t<KeyT::HOBJ>& o) const;
+ bool operator!=(const full_key_t<KeyT::VIEW>& o) const {
+ return !operator==(o);
+ }
+ bool operator!=(const full_key_t<KeyT::HOBJ>& o) const {
+ return !operator==(o);
+ }
+
+ /**
+ * key_view_t specific interfaces
+ */
+ bool has_shard_pool() const {
+ return p_shard_pool != nullptr;
+ }
+ bool has_crush() const {
+ return p_crush != nullptr;
+ }
+ bool has_ns_oid() const {
+ return p_ns_oid.has_value();
+ }
+ bool has_snap_gen() const {
+ return p_snap_gen != nullptr;
+ }
+
+ const shard_pool_t& shard_pool_packed() const {
+ assert(has_shard_pool());
+ return *p_shard_pool;
+ }
+ const crush_t& crush_packed() const {
+ assert(has_crush());
+ return *p_crush;
+ }
+ const ns_oid_view_t& ns_oid_view() const {
+ assert(has_ns_oid());
+ return *p_ns_oid;
+ }
+ const snap_gen_t& snap_gen_packed() const {
+ assert(has_snap_gen());
+ return *p_snap_gen;
+ }
+
+ size_t size_logical() const {
+ return sizeof(shard_t) + sizeof(pool_t) + sizeof(crush_hash_t) +
+ sizeof(snap_t) + sizeof(gen_t) + ns_oid_view().size_logical();
+ }
+
+ ghobject_t to_ghobj() const {
+ return ghobject_t(
+ shard_id_t(shard()), pool(), crush(),
+ std::string(nspace()), std::string(oid()), snap(), gen());
+ }
+
+ void replace(const crush_t& key) { p_crush = &key; }
+ void set(const crush_t& key) {
+ assert(!has_crush());
+ replace(key);
+ }
+ void replace(const shard_pool_crush_t& key) { p_shard_pool = &key.shard_pool; }
+ void set(const shard_pool_crush_t& key) {
+ set(key.crush);
+ assert(!has_shard_pool());
+ replace(key);
+ }
+ void replace(const ns_oid_view_t& key) { p_ns_oid = key; }
+ void set(const ns_oid_view_t& key) {
+ assert(!has_ns_oid());
+ replace(key);
+ }
+ void replace(const snap_gen_t& key) { p_snap_gen = &key; }
+ void set(const snap_gen_t& key) {
+ assert(!has_snap_gen());
+ replace(key);
+ }
+
+ std::ostream& dump(std::ostream& os) const {
+ os << "key_view(";
+ if (has_shard_pool()) {
+ os << (unsigned)shard() << "," << pool() << ",";
+ } else {
+ os << "X,X,";
+ }
+ if (has_crush()) {
+ os << crush() << "; ";
+ } else {
+ os << "X; ";
+ }
+ if (has_ns_oid()) {
+ os << ns_oid_view() << "; ";
+ } else {
+ os << "X,X; ";
+ }
+ if (has_snap_gen()) {
+ os << snap() << "," << gen() << ")";
+ } else {
+ os << "X,X)";
+ }
+ return os;
+ }
+
+ private:
+ const shard_pool_t* p_shard_pool = nullptr;
+ const crush_t* p_crush = nullptr;
+ std::optional<ns_oid_view_t> p_ns_oid;
+ const snap_gen_t* p_snap_gen = nullptr;
+};
+
+template <KeyT KT>
+void encode_key(const full_key_t<KT>& key, ceph::bufferlist& bl) {
+ ceph::encode(key.shard(), bl);
+ ceph::encode(key.pool(), bl);
+ ceph::encode(key.crush(), bl);
+ key.nspace_masked().encode(bl);
+ key.oid_masked().encode(bl);
+ ceph::encode(key.snap(), bl);
+ ceph::encode(key.gen(), bl);
+}
+
+inline MatchKindCMP compare_to(std::string_view l, std::string_view r) {
+ return toMatchKindCMP(l, r);
+}
+template <KeyT TypeL, KeyT TypeR>
+bool compare_full_key(const full_key_t<TypeL>& l, const full_key_t<TypeR>& r) {
+ if (l.shard() != r.shard())
+ return false;
+ if (l.pool() != r.pool())
+ return false;
+ if (l.crush() != r.crush())
+ return false;
+ if (compare_to(l.nspace(), r.nspace()) != MatchKindCMP::EQ)
+ return false;
+ if (compare_to(l.oid(), r.oid()) != MatchKindCMP::EQ)
+ return false;
+ if (l.snap() != r.snap())
+ return false;
+ if (l.gen() != r.gen())
+ return false;
+ return true;
+}
+
+inline bool key_hobj_t::operator==(const full_key_t<KeyT::VIEW>& o) const {
+ return compare_full_key<KeyT::HOBJ, KeyT::VIEW>(*this, o);
+}
+inline bool key_hobj_t::operator==(const full_key_t<KeyT::HOBJ>& o) const {
+ return compare_full_key<KeyT::HOBJ, KeyT::HOBJ>(*this, o);
+}
+inline bool key_view_t::operator==(const full_key_t<KeyT::VIEW>& o) const {
+ return compare_full_key<KeyT::VIEW, KeyT::VIEW>(*this, o);
+}
+inline bool key_view_t::operator==(const full_key_t<KeyT::HOBJ>& o) const {
+ return compare_full_key<KeyT::VIEW, KeyT::HOBJ>(*this, o);
+}
+
+inline std::ostream& operator<<(std::ostream& os, const key_view_t& key) {
+ return key.dump(os);
+}
+
+template <KeyT Type>
+MatchKindCMP compare_to(const full_key_t<Type>& key, const shard_pool_t& target) {
+ auto ret = toMatchKindCMP(key.shard(), target.shard);
+ if (ret != MatchKindCMP::EQ)
+ return ret;
+ return toMatchKindCMP(key.pool(), target.pool);
+}
+
+template <KeyT Type>
+MatchKindCMP compare_to(const full_key_t<Type>& key, const crush_t& target) {
+ return toMatchKindCMP(key.crush(), target.crush);
+}
+
+template <KeyT Type>
+MatchKindCMP compare_to(const full_key_t<Type>& key, const shard_pool_crush_t& target) {
+ auto ret = compare_to<Type>(key, target.shard_pool);
+ if (ret != MatchKindCMP::EQ)
+ return ret;
+ return compare_to<Type>(key, target.crush);
+}
+
+template <KeyT Type>
+MatchKindCMP compare_to(const full_key_t<Type>& key, const ns_oid_view_t& target) {
+ auto ret = compare_to(key.nspace(), string_view_masked_t{target.nspace});
+ if (ret != MatchKindCMP::EQ)
+ return ret;
+ return compare_to(key.oid(), string_view_masked_t{target.oid});
+}
+
+template <KeyT Type>
+MatchKindCMP compare_to(const full_key_t<Type>& key, const snap_gen_t& target) {
+ auto ret = toMatchKindCMP(key.snap(), target.snap);
+ if (ret != MatchKindCMP::EQ)
+ return ret;
+ return toMatchKindCMP(key.gen(), target.gen);
+}
+
+template <KeyT KT>
+shard_pool_t shard_pool_t::from_key(const full_key_t<KT>& key) {
+ if constexpr (KT == KeyT::VIEW) {
+ return key.shard_pool_packed();
+ } else {
+ return {key.shard(), key.pool()};
+ }
+}
+
+template <KeyT KT>
+crush_t crush_t::from_key(const full_key_t<KT>& key) {
+ if constexpr (KT == KeyT::VIEW) {
+ return key.crush_packed();
+ } else {
+ return {key.crush()};
+ }
+}
+
+template <KeyT KT>
+shard_pool_crush_t shard_pool_crush_t::from_key(const full_key_t<KT>& key) {
+ return {shard_pool_t::from_key<KT>(key), crush_t::from_key<KT>(key)};
+}
+
+template <KeyT KT>
+snap_gen_t snap_gen_t::from_key(const full_key_t<KT>& key) {
+ if constexpr (KT == KeyT::VIEW) {
+ return key.snap_gen_packed();
+ } else {
+ return {key.snap(), key.gen()};
+ }
+}
+
+template <KeyT KT>
+node_offset_t ns_oid_view_t::estimate_size(const full_key_t<KT>& key) {
+ if constexpr (KT == KeyT::VIEW) {
+ return key.ns_oid_view().size();
+ } else {
+ if (key.dedup_type() != Type::STR) {
+ // size after deduplication
+ return sizeof(string_size_t);
+ } else {
+ return 2 * sizeof(string_size_t) + key.nspace().size() + key.oid().size();
+ }
+ }
+}
+
+template <KeyT KT>
+void ns_oid_view_t::append(
+ NodeExtentMutable& mut, const full_key_t<KT>& key, char*& p_append) {
+ if (key.dedup_type() == Type::STR) {
+ string_key_view_t::append_str(mut, key.nspace(), p_append);
+ string_key_view_t::append_str(mut, key.oid(), p_append);
+ } else {
+ string_key_view_t::append_dedup(mut, key.dedup_type(), p_append);
+ }
+}
+
+template <KeyT KT>
+void ns_oid_view_t::test_append(const full_key_t<KT>& key, char*& p_append) {
+ if (key.dedup_type() == Type::STR) {
+ string_key_view_t::test_append_str(key.nspace(), p_append);
+ string_key_view_t::test_append_str(key.oid(), p_append);
+ } else {
+ string_key_view_t::test_append_dedup(key.dedup_type(), p_append);
+ }
+}
+
+}
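
For orientation, the sketch below models the shared accessor interface (shard/pool/crush/nspace/oid/snap/gen) that lets compare_full_key<> above be written once and applied to any mix of key implementations. The types are illustrative stand-ins, not the real key_hobj_t/key_view_t.

#include <cstdint>
#include <iostream>
#include <string>
#include <string_view>

// Stand-in for key_hobj_t: owns the user-supplied strings.
struct ToyUserKey {
  uint8_t shard() const { return shard_id; }
  int64_t pool() const { return pool_id; }
  uint32_t crush() const { return crush_hash; }
  std::string_view nspace() const { return ns; }
  std::string_view oid() const { return name; }
  uint64_t snap() const { return snap_id; }
  uint64_t gen() const { return gen_id; }

  uint8_t shard_id = 1;
  int64_t pool_id = 2;
  uint32_t crush_hash = 0xabcd;
  std::string ns = "ns";
  std::string name = "obj";
  uint64_t snap_id = 0;
  uint64_t gen_id = 0;
};

// Stand-in for key_view_t: only points at strings stored elsewhere (e.g. in
// a node extent), but exposes the same accessors.
struct ToyIndexedKey {
  uint8_t shard() const { return 1; }
  int64_t pool() const { return 2; }
  uint32_t crush() const { return 0xabcd; }
  std::string_view nspace() const { return p_ns; }
  std::string_view oid() const { return p_oid; }
  uint64_t snap() const { return 0; }
  uint64_t gen() const { return 0; }

  std::string_view p_ns = "ns";
  std::string_view p_oid = "obj";
};

// Same shape as compare_full_key<TypeL, TypeR>: one generic, field-by-field
// comparison over the shared accessor interface.
template <typename L, typename R>
bool keys_equal(const L& l, const R& r) {
  return l.shard() == r.shard() && l.pool() == r.pool() &&
         l.crush() == r.crush() && l.nspace() == r.nspace() &&
         l.oid() == r.oid() && l.snap() == r.snap() && l.gen() == r.gen();
}

int main() {
  std::cout << std::boolalpha
            << keys_equal(ToyUserKey{}, ToyIndexedKey{}) << '\n';  // true
}
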
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc
new file mode 100644
index 000000000..4a5988185
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc
@@ -0,0 +1,318 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "node_stage.h"
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h"
+#include "node_stage_layout.h"
+
+namespace crimson::os::seastore::onode {
+
+#define NODE_T node_extent_t<FieldType, NODE_TYPE>
+#define NODE_INST(FT, NT) node_extent_t<FT, NT>
+
+template <typename FieldType, node_type_t NODE_TYPE>
+const char* NODE_T::p_left_bound() const {
+ if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) {
+ // N3 internal node doesn't have the right part
+ return nullptr;
+ } else {
+ auto ret = p_start() + fields().get_item_end_offset(keys());
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ if (is_level_tail()) {
+ ret -= sizeof(laddr_t);
+ }
+ }
+ return ret;
+ }
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+node_offset_t NODE_T::size_to_nxt_at(index_t index) const {
+ assert(index < keys());
+ if constexpr (FIELD_TYPE == field_type_t::N0 ||
+ FIELD_TYPE == field_type_t::N1) {
+ return FieldType::estimate_insert_one();
+ } else if constexpr (FIELD_TYPE == field_type_t::N2) {
+ auto p_end = p_start() + p_fields->get_item_end_offset(index);
+ return FieldType::estimate_insert_one() + ns_oid_view_t(p_end).size();
+ } else {
+ ceph_abort("N3 node is not nested");
+ }
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+memory_range_t NODE_T::get_nxt_container(index_t index) const {
+ if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) {
+ ceph_abort("N3 internal node doesn't have the right part");
+ } else {
+ node_offset_t item_start_offset = p_fields->get_item_start_offset(index);
+ node_offset_t item_end_offset = p_fields->get_item_end_offset(index);
+ assert(item_start_offset < item_end_offset);
+ auto item_p_start = p_start() + item_start_offset;
+ auto item_p_end = p_start() + item_end_offset;
+ if constexpr (FIELD_TYPE == field_type_t::N2) {
+ // range for sub_items_t<NODE_TYPE>
+ item_p_end = ns_oid_view_t(item_p_end).p_start();
+ assert(item_p_start < item_p_end);
+ } else {
+ // range for item_iterator_t<NODE_TYPE>
+ }
+ return {item_p_start, item_p_end};
+ }
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+void NODE_T::bootstrap_extent(
+ NodeExtentMutable& mut,
+ field_type_t field_type, node_type_t node_type,
+ bool is_level_tail, level_t level) {
+ node_header_t::bootstrap_extent(
+ mut, field_type, node_type, is_level_tail, level);
+ mut.copy_in_relative(
+ sizeof(node_header_t), typename FieldType::num_keys_t(0u));
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+void NODE_T::update_is_level_tail(
+ NodeExtentMutable& mut, const node_extent_t& extent, bool value) {
+ node_header_t::update_is_level_tail(mut, extent.p_fields->header, value);
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+memory_range_t NODE_T::insert_prefix_at(
+ NodeExtentMutable& mut, const node_extent_t& node, const full_key_t<KT>& key,
+ index_t index, node_offset_t size, const char* p_left_bound) {
+ if constexpr (FIELD_TYPE == field_type_t::N0 ||
+ FIELD_TYPE == field_type_t::N1) {
+ assert(index <= node.keys());
+ assert(p_left_bound == node.p_left_bound());
+ assert(size > FieldType::estimate_insert_one());
+ auto size_right = size - FieldType::estimate_insert_one();
+ const char* p_insert = node.p_start() + node.fields().get_item_end_offset(index);
+ const char* p_insert_front = p_insert - size_right;
+ FieldType::template insert_at<KT>(mut, key, node.fields(), index, size_right);
+ mut.shift_absolute(p_left_bound,
+ p_insert - p_left_bound,
+ -(int)size_right);
+ return {p_insert_front, p_insert};
+ } else if constexpr (FIELD_TYPE == field_type_t::N2) {
+ ceph_abort("not implemented");
+ } else {
+ ceph_abort("impossible");
+ }
+}
+#define IPA_TEMPLATE(FT, NT, KT) \
+ template memory_range_t NODE_INST(FT, NT)::insert_prefix_at<KT>( \
+ NodeExtentMutable&, const node_extent_t&, const full_key_t<KT>&, \
+ index_t, node_offset_t, const char*)
+IPA_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::VIEW);
+IPA_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::VIEW);
+IPA_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::VIEW);
+IPA_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::VIEW);
+IPA_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::VIEW);
+IPA_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::VIEW);
+IPA_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::HOBJ);
+IPA_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::HOBJ);
+IPA_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::HOBJ);
+IPA_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::HOBJ);
+IPA_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::HOBJ);
+IPA_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::HOBJ);
+
+template <typename FieldType, node_type_t NODE_TYPE>
+void NODE_T::update_size_at(
+ NodeExtentMutable& mut, const node_extent_t& node, index_t index, int change) {
+ assert(index < node.keys());
+ FieldType::update_size_at(mut, node.fields(), index, change);
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+node_offset_t NODE_T::trim_until(
+ NodeExtentMutable& mut, const node_extent_t& node, index_t index) {
+ assert(!node.is_level_tail());
+ auto keys = node.keys();
+ assert(index <= keys);
+ if (index == keys) {
+ return 0;
+ }
+ if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) {
+ ceph_abort("not implemented");
+ } else {
+ mut.copy_in_absolute(
+ (void*)&node.p_fields->num_keys, num_keys_t(index));
+ }
+ // no need to calculate trim size for node
+ return 0;
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+node_offset_t NODE_T::trim_at(
+ NodeExtentMutable& mut, const node_extent_t& node,
+ index_t index, node_offset_t trimmed) {
+ assert(!node.is_level_tail());
+ assert(index < node.keys());
+ if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) {
+ ceph_abort("not implemented");
+ } else {
+ node_offset_t offset = node.p_fields->get_item_start_offset(index);
+ size_t new_offset = offset + trimmed;
+ assert(new_offset < node.p_fields->get_item_end_offset(index));
+ mut.copy_in_absolute(const_cast<void*>(node.p_fields->p_offset(index)),
+ node_offset_t(new_offset));
+ mut.copy_in_absolute(
+ (void*)&node.p_fields->num_keys, num_keys_t(index + 1));
+ }
+ // no need to calculate trim size for node
+ return 0;
+}
+
+#define NODE_TEMPLATE(FT, NT) template class NODE_INST(FT, NT)
+NODE_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL);
+NODE_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL);
+NODE_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL);
+NODE_TEMPLATE(internal_fields_3_t, node_type_t::INTERNAL);
+NODE_TEMPLATE(node_fields_0_t, node_type_t::LEAF);
+NODE_TEMPLATE(node_fields_1_t, node_type_t::LEAF);
+NODE_TEMPLATE(node_fields_2_t, node_type_t::LEAF);
+NODE_TEMPLATE(leaf_fields_3_t, node_type_t::LEAF);
+
+#define APPEND_T node_extent_t<FieldType, NODE_TYPE>::Appender<KT>
+
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+void APPEND_T::append(const node_extent_t& src, index_t from, index_t items) {
+ assert(from <= src.keys());
+ if (p_src == nullptr) {
+ p_src = &src;
+ } else {
+ assert(p_src == &src);
+ }
+ if (items == 0) {
+ return;
+ }
+ assert(from < src.keys());
+ assert(from + items <= src.keys());
+ num_keys += items;
+ if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) {
+ ceph_abort("impossible path");
+ } else {
+ // append left part forwards
+ node_offset_t offset_left_start = src.fields().get_key_start_offset(from);
+ node_offset_t offset_left_end = src.fields().get_key_start_offset(from + items);
+ node_offset_t left_size = offset_left_end - offset_left_start;
+ if (num_keys == 0) {
+ // no need to adjust offset
+ assert(from == 0);
+ assert(p_start + offset_left_start == p_append_left);
+ p_mut->copy_in_absolute(p_append_left,
+ src.p_start() + offset_left_start, left_size);
+ } else {
+ node_offset_t step_size = FieldType::estimate_insert_one();
+ node_offset_t offset_base = src.fields().get_item_end_offset(from);
+ int offset_change = p_append_right - p_start - offset_base;
+ auto p_offset_dst = p_append_left;
+ if constexpr (FIELD_TYPE != field_type_t::N2) {
+ // copy keys
+ p_mut->copy_in_absolute(p_append_left,
+ src.p_start() + offset_left_start, left_size);
+ // point to offset for update
+ p_offset_dst += sizeof(typename FieldType::key_t);
+ }
+ for (auto i = from; i < from + items; ++i) {
+ p_mut->copy_in_absolute(p_offset_dst,
+ node_offset_t(src.fields().get_item_start_offset(i) + offset_change));
+ p_offset_dst += step_size;
+ }
+ assert(p_append_left + left_size + sizeof(typename FieldType::key_t) ==
+ p_offset_dst);
+ }
+ p_append_left += left_size;
+
+ // append right part backwards
+ node_offset_t offset_right_start = src.fields().get_item_end_offset(from + items);
+ node_offset_t offset_right_end = src.fields().get_item_end_offset(from);
+ node_offset_t right_size = offset_right_end - offset_right_start;
+ p_append_right -= right_size;
+ p_mut->copy_in_absolute(p_append_right,
+ src.p_start() + offset_right_start, right_size);
+ }
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+void APPEND_T::append(
+ const full_key_t<KT>& key, const value_t& value, const value_t*& p_value) {
+ if constexpr (FIELD_TYPE == field_type_t::N3) {
+ ceph_abort("not implemented");
+ } else {
+ ceph_abort("should not happen");
+ }
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+std::tuple<NodeExtentMutable*, char*>
+APPEND_T::open_nxt(const key_get_type& partial_key) {
+ if constexpr (FIELD_TYPE == field_type_t::N0 ||
+ FIELD_TYPE == field_type_t::N1) {
+ FieldType::append_key(*p_mut, partial_key, p_append_left);
+ } else if constexpr (FIELD_TYPE == field_type_t::N2) {
+ FieldType::append_key(*p_mut, partial_key, p_append_right);
+ } else {
+ ceph_abort("impossible path");
+ }
+ return {p_mut, p_append_right};
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+std::tuple<NodeExtentMutable*, char*>
+APPEND_T::open_nxt(const full_key_t<KT>& key) {
+ if constexpr (FIELD_TYPE == field_type_t::N0 ||
+ FIELD_TYPE == field_type_t::N1) {
+ FieldType::template append_key<KT>(*p_mut, key, p_append_left);
+ } else if constexpr (FIELD_TYPE == field_type_t::N2) {
+ FieldType::template append_key<KT>(*p_mut, key, p_append_right);
+ } else {
+ ceph_abort("impossible path");
+ }
+ return {p_mut, p_append_right};
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+char* APPEND_T::wrap() {
+ assert(p_append_left <= p_append_right);
+ assert(p_src);
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ if (p_src->is_level_tail()) {
+ laddr_t tail_value = p_src->get_end_p_laddr()->value;
+ p_append_right -= sizeof(laddr_t);
+ assert(p_append_left <= p_append_right);
+ p_mut->copy_in_absolute(p_append_right, tail_value);
+ }
+ }
+ p_mut->copy_in_absolute(p_start + offsetof(FieldType, num_keys), num_keys);
+ return p_append_left;
+}
+
+#define APPEND_TEMPLATE(FT, NT, KT) template class node_extent_t<FT, NT>::Appender<KT>
+APPEND_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::VIEW);
+APPEND_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::VIEW);
+APPEND_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::VIEW);
+APPEND_TEMPLATE(internal_fields_3_t, node_type_t::INTERNAL, KeyT::VIEW);
+APPEND_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::VIEW);
+APPEND_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::VIEW);
+APPEND_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::VIEW);
+APPEND_TEMPLATE(leaf_fields_3_t, node_type_t::LEAF, KeyT::VIEW);
+APPEND_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::HOBJ);
+APPEND_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::HOBJ);
+APPEND_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::HOBJ);
+APPEND_TEMPLATE(internal_fields_3_t, node_type_t::INTERNAL, KeyT::HOBJ);
+APPEND_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::HOBJ);
+APPEND_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::HOBJ);
+APPEND_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::HOBJ);
+APPEND_TEMPLATE(leaf_fields_3_t, node_type_t::LEAF, KeyT::HOBJ);
+
+}
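
The IPA_TEMPLATE/NODE_TEMPLATE/APPEND_TEMPLATE macros above are plain explicit template instantiations: the template member definitions stay in this .cc file and a macro stamps out each supported (FieldType, node_type, KeyT) combination so other translation units can link against them. Below is a minimal self-contained sketch of the same pattern, with toy types in place of the real node fields.

#include <iostream>

enum class node_type { INTERNAL, LEAF };

template <typename FieldType, node_type NT>
struct node_extent {
  int keys() const;  // declared here, defined out of line below
};

// Out-of-line definition, normally hidden in the .cc file.
template <typename FieldType, node_type NT>
int node_extent<FieldType, NT>::keys() const { return 0; }

struct fields_0 {};
struct fields_1 {};

// One explicit instantiation per supported combination, as NODE_TEMPLATE does.
#define TOY_NODE_TEMPLATE(FT, NT) template class node_extent<FT, NT>
TOY_NODE_TEMPLATE(fields_0, node_type::INTERNAL);
TOY_NODE_TEMPLATE(fields_0, node_type::LEAF);
TOY_NODE_TEMPLATE(fields_1, node_type::INTERNAL);
TOY_NODE_TEMPLATE(fields_1, node_type::LEAF);

int main() {
  node_extent<fields_0, node_type::LEAF> n;
  std::cout << n.keys() << '\n';
}
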
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h
new file mode 100644
index 000000000..cf0ca463c
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h
@@ -0,0 +1,226 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h"
+#include "key_layout.h"
+#include "stage_types.h"
+
+namespace crimson::os::seastore::onode {
+
+class NodeExtentMutable;
+
+/**
+ * node_extent_t
+ *
+ * The top indexing stage implementation for nodes N0/N1/N2/N3. It implements
+ * the staged contract as an indexable container and provides access to the
+ * node header.
+ *
+ * The specific field layouts are defined by FieldType, which is one of
+ * node_fields_0_t, node_fields_1_t, node_fields_2_t, internal_fields_3_t and
+ * leaf_fields_3_t. See node_stage_layout.h for the layout diagrams.
+ */
+template <typename FieldType, node_type_t _NODE_TYPE>
+class node_extent_t {
+ public:
+ using value_t = value_type_t<_NODE_TYPE>;
+ using num_keys_t = typename FieldType::num_keys_t;
+ static constexpr node_type_t NODE_TYPE = _NODE_TYPE;
+ static constexpr field_type_t FIELD_TYPE = FieldType::FIELD_TYPE;
+ static constexpr node_offset_t EXTENT_SIZE =
+ (FieldType::SIZE + DISK_BLOCK_SIZE - 1u) / DISK_BLOCK_SIZE * DISK_BLOCK_SIZE;
+
+ // TODO: remove
+ node_extent_t() = default;
+
+ node_extent_t(const FieldType* p_fields) : p_fields{p_fields} {
+ validate(*p_fields);
+ }
+
+ const char* p_start() const { return fields_start(*p_fields); }
+
+ const char* off_to_ptr(node_offset_t off) const {
+ assert(off <= FieldType::SIZE);
+ return p_start() + off;
+ }
+
+ node_offset_t ptr_to_off(const void* ptr) const {
+ auto _ptr = static_cast<const char*>(ptr);
+ assert(_ptr >= p_start());
+ auto off = _ptr - p_start();
+ assert(off <= FieldType::SIZE);
+ return off;
+ }
+
+ bool is_level_tail() const { return p_fields->is_level_tail(); }
+ level_t level() const { return p_fields->header.level; }
+ node_offset_t free_size() const {
+ return p_fields->template free_size_before<NODE_TYPE>(keys());
+ }
+ node_offset_t total_size() const { return p_fields->total_size(); }
+ const char* p_left_bound() const;
+ template <node_type_t T = NODE_TYPE>
+ std::enable_if_t<T == node_type_t::INTERNAL, const laddr_packed_t*>
+ get_end_p_laddr() const {
+ assert(is_level_tail());
+ if constexpr (FIELD_TYPE == field_type_t::N3) {
+ return &p_fields->child_addrs[keys()];
+ } else {
+ auto offset_start = p_fields->get_item_end_offset(keys());
+ assert(offset_start <= FieldType::SIZE);
+ offset_start -= sizeof(laddr_packed_t);
+ auto p_addr = p_start() + offset_start;
+ return reinterpret_cast<const laddr_packed_t*>(p_addr);
+ }
+ }
+
+ // container type system
+ using key_get_type = typename FieldType::key_get_type;
+ static constexpr auto CONTAINER_TYPE = ContainerType::INDEXABLE;
+ index_t keys() const { return p_fields->num_keys; }
+ key_get_type operator[] (index_t index) const { return p_fields->get_key(index); }
+ node_offset_t size_before(index_t index) const {
+ auto free_size = p_fields->template free_size_before<NODE_TYPE>(index);
+ assert(total_size() >= free_size);
+ return total_size() - free_size;
+ }
+ node_offset_t size_to_nxt_at(index_t index) const;
+ node_offset_t size_overhead_at(index_t index) const {
+ return FieldType::ITEM_OVERHEAD; }
+ memory_range_t get_nxt_container(index_t index) const;
+
+ template <typename T = FieldType>
+ std::enable_if_t<T::FIELD_TYPE == field_type_t::N3, const value_t*>
+ get_p_value(index_t index) const {
+ assert(index < keys());
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ return &p_fields->child_addrs[index];
+ } else {
+ auto range = get_nxt_container(index);
+ auto ret = reinterpret_cast<const onode_t*>(range.p_start);
+ assert(range.p_start + ret->size == range.p_end);
+ return ret;
+ }
+ }
+
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const {
+ assert(p_node_start == p_start());
+ // nothing to encode as the container range is the entire extent
+ }
+
+ static node_extent_t decode(const char* p_node_start,
+ ceph::bufferlist::const_iterator& delta) {
+ // nothing to decode
+ return node_extent_t(reinterpret_cast<const FieldType*>(p_node_start));
+ }
+
+ static void validate(const FieldType& fields) {
+#ifndef NDEBUG
+ assert(fields.header.get_node_type() == NODE_TYPE);
+ assert(fields.header.get_field_type() == FieldType::FIELD_TYPE);
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ assert(fields.header.level > 0u);
+ } else {
+ assert(fields.header.level == 0u);
+ }
+#endif
+ }
+
+ static void bootstrap_extent(
+ NodeExtentMutable&, field_type_t, node_type_t, bool, level_t);
+
+ static void update_is_level_tail(NodeExtentMutable&, const node_extent_t&, bool);
+
+ static node_offset_t header_size() { return FieldType::HEADER_SIZE; }
+
+ template <KeyT KT>
+ static node_offset_t estimate_insert(
+ const full_key_t<KT>& key, const value_t& value) {
+ auto size = FieldType::estimate_insert_one();
+ if constexpr (FIELD_TYPE == field_type_t::N2) {
+ size += ns_oid_view_t::estimate_size<KT>(key);
+ } else if constexpr (FIELD_TYPE == field_type_t::N3 &&
+ NODE_TYPE == node_type_t::LEAF) {
+ size += value.size;
+ }
+ return size;
+ }
+
+ template <KeyT KT>
+ static const value_t* insert_at(
+ NodeExtentMutable& mut, const node_extent_t&,
+ const full_key_t<KT>& key, const value_t& value,
+ index_t index, node_offset_t size, const char* p_left_bound) {
+ if constexpr (FIELD_TYPE == field_type_t::N3) {
+ ceph_abort("not implemented");
+ } else {
+ ceph_abort("impossible");
+ }
+ }
+
+ template <KeyT KT>
+ static memory_range_t insert_prefix_at(
+ NodeExtentMutable&, const node_extent_t&,
+ const full_key_t<KT>& key,
+ index_t index, node_offset_t size, const char* p_left_bound);
+
+ static void update_size_at(
+ NodeExtentMutable&, const node_extent_t&, index_t index, int change);
+
+ static node_offset_t trim_until(
+ NodeExtentMutable&, const node_extent_t&, index_t index);
+ static node_offset_t trim_at(NodeExtentMutable&, const node_extent_t&,
+ index_t index, node_offset_t trimmed);
+
+ template <KeyT KT>
+ class Appender;
+
+ private:
+ const FieldType& fields() const { return *p_fields; }
+ const FieldType* p_fields;
+};
+
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+class node_extent_t<FieldType, NODE_TYPE>::Appender {
+ public:
+ Appender(NodeExtentMutable* p_mut, char* p_append)
+ : p_mut{p_mut}, p_start{p_append} {
+#ifndef NDEBUG
+ auto p_fields = reinterpret_cast<const FieldType*>(p_append);
+ assert(*(p_fields->header.get_field_type()) == FIELD_TYPE);
+ assert(p_fields->header.get_node_type() == NODE_TYPE);
+ assert(p_fields->num_keys == 0);
+#endif
+ p_append_left = p_start + FieldType::HEADER_SIZE;
+ p_append_right = p_start + FieldType::SIZE;
+ }
+ void append(const node_extent_t& src, index_t from, index_t items);
+ void append(const full_key_t<KT>&, const value_t&, const value_t*&);
+ char* wrap();
+ std::tuple<NodeExtentMutable*, char*> open_nxt(const key_get_type&);
+ std::tuple<NodeExtentMutable*, char*> open_nxt(const full_key_t<KT>&);
+ void wrap_nxt(char* p_append) {
+ if constexpr (FIELD_TYPE != field_type_t::N3) {
+ assert(p_append < p_append_right);
+ assert(p_append_left < p_append);
+ p_append_right = p_append;
+ FieldType::append_offset(*p_mut, p_append - p_start, p_append_left);
+ ++num_keys;
+ } else {
+ ceph_abort("not implemented");
+ }
+ }
+
+ private:
+ const node_extent_t* p_src = nullptr;
+ NodeExtentMutable* p_mut;
+ char* p_start;
+ char* p_append_left;
+ char* p_append_right;
+ num_keys_t num_keys = 0;
+};
+
+}
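
The Appender above builds a node by writing from both ends of the extent: fixed-size key/offset slots grow forward from the header via p_append_left, variable-size items grow backward from the block end via p_append_right, and wrap() finally records num_keys. The following rough standalone sketch shows that two-ended append pattern, with a byte buffer standing in for NodeExtentMutable and made-up slot/header sizes.

#include <cassert>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <string_view>
#include <vector>

class ToyAppender {
 public:
  explicit ToyAppender(size_t block_size)
      : buf(block_size), left(sizeof(uint8_t)), right(block_size) {}

  void append(uint32_t key, std::string_view payload) {
    // variable-size payload goes to the right end, growing backwards
    right -= payload.size();
    assert(left + sizeof(key) + sizeof(uint16_t) <= right);
    std::memcpy(buf.data() + right, payload.data(), payload.size());
    // fixed-size (key, offset) slot goes to the left end, growing forwards
    std::memcpy(buf.data() + left, &key, sizeof(key));
    left += sizeof(key);
    uint16_t off = static_cast<uint16_t>(right);
    std::memcpy(buf.data() + left, &off, sizeof(off));
    left += sizeof(off);
    ++num_keys;
  }

  size_t wrap() {                         // finalize: store num_keys in the header
    buf[0] = static_cast<char>(num_keys);
    return left;                          // analogous to returning p_append_left
  }

 private:
  std::vector<char> buf;
  size_t left, right;
  uint8_t num_keys = 0;
};

int main() {
  ToyAppender appender(4096);
  appender.append(1, "first-value");
  appender.append(2, "second-value");
  std::cout << "left bound after wrap: " << appender.wrap() << '\n';
}
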
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc
new file mode 100644
index 000000000..81bfac72a
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc
@@ -0,0 +1,96 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "node_stage_layout.h"
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h"
+
+namespace crimson::os::seastore::onode {
+
+void node_header_t::bootstrap_extent(
+ NodeExtentMutable& mut,
+ field_type_t field_type, node_type_t node_type,
+ bool is_level_tail, level_t level) {
+ node_header_t header;
+ header.set_field_type(field_type);
+ header.set_node_type(node_type);
+ header.set_is_level_tail(is_level_tail);
+ header.level = level;
+ mut.copy_in_relative(0, header);
+}
+
+void node_header_t::update_is_level_tail(
+ NodeExtentMutable& mut, const node_header_t& header, bool value) {
+ auto& _header = const_cast<node_header_t&>(header);
+ _header.set_is_level_tail(value);
+ mut.validate_inplace_update(_header);
+}
+
+#define F013_T _node_fields_013_t<SlotType>
+#define F013_INST(ST) _node_fields_013_t<ST>
+
+template <typename SlotType>
+void F013_T::update_size_at(
+ NodeExtentMutable& mut, const me_t& node, index_t index, int change) {
+ assert(index <= node.num_keys);
+ for (const auto* p_slot = &node.slots[index];
+ p_slot < &node.slots[node.num_keys];
+ ++p_slot) {
+ node_offset_t offset = p_slot->right_offset;
+ mut.copy_in_absolute(
+ (void*)&(p_slot->right_offset),
+ node_offset_t(offset - change));
+ }
+}
+
+template <typename SlotType>
+void F013_T::append_key(
+ NodeExtentMutable& mut, const key_t& key, char*& p_append) {
+ mut.copy_in_absolute(p_append, key);
+ p_append += sizeof(key_t);
+}
+
+template <typename SlotType>
+void F013_T::append_offset(
+ NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append) {
+ mut.copy_in_absolute(p_append, offset_to_right);
+ p_append += sizeof(node_offset_t);
+}
+
+template <typename SlotType>
+template <KeyT KT>
+void F013_T::insert_at(
+ NodeExtentMutable& mut, const full_key_t<KT>& key,
+ const me_t& node, index_t index, node_offset_t size_right) {
+ assert(index <= node.num_keys);
+ update_size_at(mut, node, index, size_right);
+ auto p_insert = const_cast<char*>(fields_start(node)) +
+ node.get_key_start_offset(index);
+ auto p_shift_end = fields_start(node) + node.get_key_start_offset(node.num_keys);
+ mut.shift_absolute(p_insert, p_shift_end - p_insert, estimate_insert_one());
+ mut.copy_in_absolute((void*)&node.num_keys, num_keys_t(node.num_keys + 1));
+ append_key(mut, key_t::template from_key<KT>(key), p_insert);
+ append_offset(mut, node.get_item_end_offset(index) - size_right, p_insert);
+}
+#define IA_TEMPLATE(ST, KT) template void F013_INST(ST):: \
+ insert_at<KT>(NodeExtentMutable&, const full_key_t<KT>&, \
+ const F013_INST(ST)&, index_t, node_offset_t)
+IA_TEMPLATE(slot_0_t, KeyT::VIEW);
+IA_TEMPLATE(slot_1_t, KeyT::VIEW);
+IA_TEMPLATE(slot_3_t, KeyT::VIEW);
+IA_TEMPLATE(slot_0_t, KeyT::HOBJ);
+IA_TEMPLATE(slot_1_t, KeyT::HOBJ);
+IA_TEMPLATE(slot_3_t, KeyT::HOBJ);
+
+#define F013_TEMPLATE(ST) template struct F013_INST(ST)
+F013_TEMPLATE(slot_0_t);
+F013_TEMPLATE(slot_1_t);
+F013_TEMPLATE(slot_3_t);
+
+void node_fields_2_t::append_offset(
+ NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append) {
+ mut.copy_in_absolute(p_append, offset_to_right);
+ p_append += sizeof(node_offset_t);
+}
+
+}
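
insert_at above makes room in the slot array with shift_absolute, fixes up the right_offset of every slot at or after the insertion point via update_size_at, and then writes the new key and offset in place. Below is a simplified in-memory model of just that slot bookkeeping (the item bytes themselves are not moved here), using a plain array and memmove instead of the NodeExtentMutable API.

#include <cassert>
#include <cstdint>
#include <cstring>
#include <iostream>

struct Slot { uint32_t key; uint16_t right_offset; };

constexpr int MAX_SLOTS = 8;

void insert_slot(Slot* slots, uint8_t& num_keys,
                 int index, uint32_t key,
                 uint16_t item_end_offset, uint16_t size_right) {
  assert(num_keys < MAX_SLOTS);
  // items of slots at and after `index` move left by size_right bytes,
  // so their recorded offsets shrink (as update_size_at does)
  for (int i = index; i < num_keys; ++i) {
    slots[i].right_offset -= size_right;
  }
  // shift the slot array right by one entry (like mut.shift_absolute)
  std::memmove(slots + index + 1, slots + index,
               sizeof(Slot) * (num_keys - index));
  // the new item starts size_right bytes to the left of its end offset
  slots[index] = Slot{key, uint16_t(item_end_offset - size_right)};
  ++num_keys;
}

int main() {
  Slot slots[MAX_SLOTS] = {{10, 4000}, {30, 3900}};
  uint8_t num_keys = 2;
  insert_slot(slots, num_keys, /*index=*/1, /*key=*/20,
              /*item_end_offset=*/4000, /*size_right=*/50);
  for (int i = 0; i < num_keys; ++i) {
    std::cout << slots[i].key << " -> " << slots[i].right_offset << '\n';
  }
}
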
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h
new file mode 100644
index 000000000..14ba95bf4
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h
@@ -0,0 +1,366 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "key_layout.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h"
+
+namespace crimson::os::seastore::onode {
+
+class NodeExtentMutable;
+
+struct node_header_t {
+ static constexpr unsigned FIELD_TYPE_BITS = 6u;
+ static_assert(static_cast<uint8_t>(field_type_t::_MAX) <= 1u << FIELD_TYPE_BITS);
+ static constexpr unsigned NODE_TYPE_BITS = 1u;
+ static constexpr unsigned B_LEVEL_TAIL_BITS = 1u;
+ using bits_t = uint8_t;
+
+ node_header_t() {}
+ std::optional<field_type_t> get_field_type() const {
+ if (field_type >= FIELD_TYPE_MAGIC &&
+ field_type < static_cast<uint8_t>(field_type_t::_MAX)) {
+ return static_cast<field_type_t>(field_type);
+ } else {
+ return std::nullopt;
+ }
+ }
+ node_type_t get_node_type() const {
+ return static_cast<node_type_t>(node_type);
+ }
+ bool get_is_level_tail() const {
+ return is_level_tail;
+ }
+
+ static void bootstrap_extent(
+ NodeExtentMutable&, field_type_t, node_type_t, bool, level_t);
+
+ static void update_is_level_tail(NodeExtentMutable&, const node_header_t&, bool);
+
+ bits_t field_type : FIELD_TYPE_BITS;
+ bits_t node_type : NODE_TYPE_BITS;
+ bits_t is_level_tail : B_LEVEL_TAIL_BITS;
+ static_assert(sizeof(bits_t) * 8 ==
+ FIELD_TYPE_BITS + NODE_TYPE_BITS + B_LEVEL_TAIL_BITS);
+ level_t level;
+
+ private:
+ void set_field_type(field_type_t type) {
+ field_type = static_cast<uint8_t>(type);
+ }
+ void set_node_type(node_type_t type) {
+ node_type = static_cast<uint8_t>(type);
+ }
+ void set_is_level_tail(bool value) {
+ is_level_tail = static_cast<uint8_t>(value);
+ }
+} __attribute__((packed));
+
+template <typename FixedKeyType, field_type_t _FIELD_TYPE>
+struct _slot_t {
+ using key_t = FixedKeyType;
+ static constexpr field_type_t FIELD_TYPE = _FIELD_TYPE;
+ static constexpr node_offset_t OVERHEAD = sizeof(_slot_t) - sizeof(key_t);
+
+ key_t key;
+ node_offset_t right_offset;
+} __attribute__((packed));
+using slot_0_t = _slot_t<shard_pool_crush_t, field_type_t::N0>;
+using slot_1_t = _slot_t<crush_t, field_type_t::N1>;
+using slot_3_t = _slot_t<snap_gen_t, field_type_t::N3>;
+
+struct node_range_t {
+ node_offset_t start;
+ node_offset_t end;
+};
+
+template <typename FieldType>
+const char* fields_start(const FieldType& node) {
+ return reinterpret_cast<const char*>(&node);
+}
+
+template <node_type_t NODE_TYPE, typename FieldType>
+node_range_t fields_free_range_before(
+ const FieldType& node, index_t index) {
+ assert(index <= node.num_keys);
+ node_offset_t offset_start = node.get_key_start_offset(index);
+ node_offset_t offset_end =
+ (index == 0 ? FieldType::SIZE
+ : node.get_item_start_offset(index - 1));
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ if (node.is_level_tail() && index == node.num_keys) {
+ offset_end -= sizeof(laddr_t);
+ }
+ }
+ assert(offset_start <= offset_end);
+ assert(offset_end - offset_start < FieldType::SIZE);
+ return {offset_start, offset_end};
+}
+
+/**
+ * _node_fields_013_t (node_fields_0_t, node_fields_1_t, leaf_fields_3_t)
+ *
+ * The STAGE_LEFT layout implementation for node N0/N1, or the STAGE_RIGHT
+ * layout implementation for leaf node N3.
+ *
+ * The node layout storing n slots:
+ *
+ * # <----------------------------- node range --------------------------------------> #
+ * # #<~># free space #
+ * # <----- left part -----------------------------> # <~# <----- right slots -------> #
+ * # # <---- left slots -------------> #~> # #
+ * # # slots [2, n) |<~># #<~>| right slots [2, n) #
+ * # # <- slot 0 -> | <- slot 1 -> | # # | <-- s1 --> | <-- s0 --> #
+ * # # | | # # | | #
+ * # | num_ # | right | | right | # # | next-stage | next-stage #
+ * # header | keys # key | offset | key | offset | # # | container | container #
+ * # | # 0 | 0 | 1 | 1 |...#...#...| or onode 1 | or onode 0 #
+ * | | ^ ^
+ * | | | |
+ * | +----------------+ |
+ * +--------------------------------------------+
+ */
+template <typename SlotType>
+struct _node_fields_013_t {
+ // TODO: decide by NODE_BLOCK_SIZE, sizeof(SlotType), sizeof(laddr_t)
+ // and the minimal size of variable_key.
+ using num_keys_t = uint8_t;
+ using key_t = typename SlotType::key_t;
+ using key_get_type = const key_t&;
+ using me_t = _node_fields_013_t<SlotType>;
+ static constexpr field_type_t FIELD_TYPE = SlotType::FIELD_TYPE;
+ static constexpr node_offset_t SIZE = NODE_BLOCK_SIZE;
+ static constexpr node_offset_t HEADER_SIZE =
+ sizeof(node_header_t) + sizeof(num_keys_t);
+ static constexpr node_offset_t ITEM_OVERHEAD = SlotType::OVERHEAD;
+
+ bool is_level_tail() const { return header.get_is_level_tail(); }
+ node_offset_t total_size() const { return SIZE; }
+ key_get_type get_key(index_t index) const {
+ assert(index < num_keys);
+ return slots[index].key;
+ }
+ node_offset_t get_key_start_offset(index_t index) const {
+ assert(index <= num_keys);
+ auto offset = HEADER_SIZE + sizeof(SlotType) * index;
+ assert(offset < SIZE);
+ return offset;
+ }
+ node_offset_t get_item_start_offset(index_t index) const {
+ assert(index < num_keys);
+ auto offset = slots[index].right_offset;
+ assert(offset <= SIZE);
+ return offset;
+ }
+ const void* p_offset(index_t index) const {
+ assert(index < num_keys);
+ return &slots[index].right_offset;
+ }
+ node_offset_t get_item_end_offset(index_t index) const {
+ return index == 0 ? SIZE : get_item_start_offset(index - 1);
+ }
+ template <node_type_t NODE_TYPE>
+ node_offset_t free_size_before(index_t index) const {
+ auto range = fields_free_range_before<NODE_TYPE>(*this, index);
+ return range.end - range.start;
+ }
+
+ static node_offset_t estimate_insert_one() { return sizeof(SlotType); }
+ template <KeyT KT>
+ static void insert_at(
+ NodeExtentMutable&, const full_key_t<KT>& key,
+ const me_t& node, index_t index, node_offset_t size_right);
+ static void update_size_at(
+ NodeExtentMutable&, const me_t& node, index_t index, int change);
+ static void append_key(
+ NodeExtentMutable&, const key_t& key, char*& p_append);
+ template <KeyT KT>
+ static void append_key(
+ NodeExtentMutable& mut, const full_key_t<KT>& key, char*& p_append) {
+ append_key(mut, key_t::template from_key<KT>(key), p_append);
+ }
+ static void append_offset(
+ NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append);
+
+ node_header_t header;
+ num_keys_t num_keys = 0u;
+ SlotType slots[];
+} __attribute__((packed));
+using node_fields_0_t = _node_fields_013_t<slot_0_t>;
+using node_fields_1_t = _node_fields_013_t<slot_1_t>;
+
+/**
+ * node_fields_2_t
+ *
+ * The STAGE_STRING layout implementation for node N2.
+ *
+ * The node layout storing n slots:
+ *
+ * # <--------------------------------- node range ----------------------------------------> #
+ * # #<~># free space #
+ * # <------- left part ---------------> # <~# <--------- right slots ---------------------> #
+ * # # <---- offsets ----> #~> #<~>| slots [2, n) #
+ * # # offsets [2, n) |<~># # | <----- slot 1 ----> | <----- slot 0 ----> #
+ * # # | # # | | #
+ * # | num_ # offset | offset | # # | next-stage | ns-oid | next-stage | ns-oid #
+ * # header | keys # 0 | 1 |...#...#...| container1 | 1 | container0 | 0 #
+ * | | ^ ^
+ * | | | |
+ * | +----------------+ |
+ * +-----------------------------------------------+
+ */
+struct node_fields_2_t {
+ // TODO: decide by NODE_BLOCK_SIZE, sizeof(node_off_t), sizeof(laddr_t)
+ // and the minimal size of variable_key.
+ using num_keys_t = uint8_t;
+ using key_t = ns_oid_view_t;
+ using key_get_type = key_t;
+ static constexpr field_type_t FIELD_TYPE = field_type_t::N2;
+ static constexpr node_offset_t SIZE = NODE_BLOCK_SIZE;
+ static constexpr node_offset_t HEADER_SIZE =
+ sizeof(node_header_t) + sizeof(num_keys_t);
+ static constexpr node_offset_t ITEM_OVERHEAD = sizeof(node_offset_t);
+
+ bool is_level_tail() const { return header.get_is_level_tail(); }
+ node_offset_t total_size() const { return SIZE; }
+ key_get_type get_key(index_t index) const {
+ assert(index < num_keys);
+ node_offset_t item_end_offset =
+ (index == 0 ? SIZE : offsets[index - 1]);
+ assert(item_end_offset <= SIZE);
+ const char* p_start = fields_start(*this);
+ return key_t(p_start + item_end_offset);
+ }
+ node_offset_t get_key_start_offset(index_t index) const {
+ assert(index <= num_keys);
+ auto offset = HEADER_SIZE + sizeof(node_offset_t) * num_keys;
+ assert(offset <= SIZE);
+ return offset;
+ }
+ node_offset_t get_item_start_offset(index_t index) const {
+ assert(index < num_keys);
+ auto offset = offsets[index];
+ assert(offset <= SIZE);
+ return offset;
+ }
+ const void* p_offset(index_t index) const {
+ assert(index < num_keys);
+ return &offsets[index];
+ }
+ node_offset_t get_item_end_offset(index_t index) const {
+ return index == 0 ? SIZE : get_item_start_offset(index - 1);
+ }
+ template <node_type_t NODE_TYPE>
+ node_offset_t free_size_before(index_t index) const {
+ auto range = fields_free_range_before<NODE_TYPE>(*this, index);
+ return range.end - range.start;
+ }
+
+ static node_offset_t estimate_insert_one() { return sizeof(node_offset_t); }
+ template <KeyT KT>
+ static void insert_at(
+ NodeExtentMutable& mut, const full_key_t<KT>& key,
+ const node_fields_2_t& node, index_t index, node_offset_t size_right) {
+ ceph_abort("not implemented");
+ }
+ static void update_size_at(
+ NodeExtentMutable& mut, const node_fields_2_t& node, index_t index, int change) {
+ ceph_abort("not implemented");
+ }
+ static void append_key(
+ NodeExtentMutable& mut, const key_t& key, char*& p_append) {
+ ns_oid_view_t::append(mut, key, p_append);
+ }
+ template <KeyT KT>
+ static void append_key(
+ NodeExtentMutable& mut, const full_key_t<KT>& key, char*& p_append) {
+ ns_oid_view_t::append<KT>(mut, key, p_append);
+ }
+ static void append_offset(
+ NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append);
+
+ node_header_t header;
+ num_keys_t num_keys = 0u;
+ node_offset_t offsets[];
+} __attribute__((packed));
+
+/**
+ * internal_fields_3_t
+ *
+ * The STAGE_RIGHT layout implementation for internal node N3.
+ *
+ * The node layout storing 3 children:
+ *
+ * # <---------------- node range ---------------------------> #
+ * # # <-- keys ---> # <---- laddrs -----------> #
+ * # free space: # |<~># |<~>#
+ * # # | # | #
+ * # | num_ # key | key | # laddr | laddr | laddr | #
+ * # header | keys # 0 | 1 |...# 0 | 1 | 2 |...#
+ */
+// TODO: decide by NODE_BLOCK_SIZE, sizeof(snap_gen_t), sizeof(laddr_t)
+static constexpr unsigned MAX_NUM_KEYS_I3 = 170u;
+template <unsigned MAX_NUM_KEYS>
+struct _internal_fields_3_t {
+ using key_get_type = const snap_gen_t&;
+ using me_t = _internal_fields_3_t<MAX_NUM_KEYS>;
+ // TODO: decide by NODE_BLOCK_SIZE, sizeof(snap_gen_t), sizeof(laddr_t)
+ using num_keys_t = uint8_t;
+ static constexpr field_type_t FIELD_TYPE = field_type_t::N3;
+ static constexpr node_offset_t SIZE = sizeof(me_t);
+ static constexpr node_offset_t HEADER_SIZE =
+ sizeof(node_header_t) + sizeof(num_keys_t);
+ static constexpr node_offset_t ITEM_OVERHEAD = 0u;
+
+ bool is_level_tail() const { return header.get_is_level_tail(); }
+ node_offset_t total_size() const {
+ if (is_level_tail()) {
+ return SIZE - sizeof(snap_gen_t);
+ } else {
+ return SIZE;
+ }
+ }
+ key_get_type get_key(index_t index) const {
+ assert(index < num_keys);
+ return keys[index];
+ }
+ template <node_type_t NODE_TYPE>
+ std::enable_if_t<NODE_TYPE == node_type_t::INTERNAL, node_offset_t>
+ free_size_before(index_t index) const {
+ assert(index <= num_keys);
+ assert(num_keys <= (is_level_tail() ? MAX_NUM_KEYS - 1 : MAX_NUM_KEYS));
+ auto free = (MAX_NUM_KEYS - index) * (sizeof(snap_gen_t) + sizeof(laddr_t));
+ if (is_level_tail() && index == num_keys) {
+ free -= (sizeof(snap_gen_t) + sizeof(laddr_t));
+ }
+ assert(free < SIZE);
+ return free;
+ }
+
+ static node_offset_t estimate_insert_one() {
+ return sizeof(snap_gen_t) + sizeof(laddr_t);
+ }
+ template <KeyT KT>
+ static void insert_at(
+ NodeExtentMutable& mut, const full_key_t<KT>& key,
+ const me_t& node, index_t index, node_offset_t size_right) {
+ ceph_abort("not implemented");
+ }
+ static void update_size_at(
+ NodeExtentMutable& mut, const me_t& node, index_t index, int change) {
+ ceph_abort("not implemented");
+ }
+
+ node_header_t header;
+ num_keys_t num_keys = 0u;
+ snap_gen_t keys[MAX_NUM_KEYS];
+ laddr_packed_t child_addrs[MAX_NUM_KEYS];
+} __attribute__((packed));
+static_assert(_internal_fields_3_t<MAX_NUM_KEYS_I3>::SIZE <= NODE_BLOCK_SIZE &&
+ _internal_fields_3_t<MAX_NUM_KEYS_I3 + 1>::SIZE > NODE_BLOCK_SIZE);
+using internal_fields_3_t = _internal_fields_3_t<MAX_NUM_KEYS_I3>;
+
+using leaf_fields_3_t = _node_fields_013_t<slot_3_t>;
+
+}
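
Two layout tricks recur in this header: the packed one-byte bitfield node_header_t, and choosing MAX_NUM_KEYS_I3 as the largest key count whose packed struct still fits NODE_BLOCK_SIZE (checked by the static_assert above). The sketch below illustrates both, with made-up field widths and block size rather than the real NODE_BLOCK_SIZE.

#include <cstdint>
#include <iostream>

// Packed bitfield header, same spirit as node_header_t: 6+1+1 bits plus a level byte.
struct toy_header_t {
  uint8_t field_type : 6;
  uint8_t node_type  : 1;
  uint8_t is_level_tail : 1;
  uint8_t level;
} __attribute__((packed));
static_assert(sizeof(toy_header_t) == 2);

constexpr unsigned TOY_BLOCK_SIZE = 4096;

template <unsigned MAX_KEYS>
struct toy_fields_3_t {
  toy_header_t header;
  uint8_t num_keys;
  uint64_t keys[MAX_KEYS];         // stands in for snap_gen_t
  uint64_t child_addrs[MAX_KEYS];  // stands in for laddr_packed_t
} __attribute__((packed));

// Largest key count whose layout still fits in one block; adding one more
// key must overflow, mirroring the MAX_NUM_KEYS_I3 static_assert above.
constexpr unsigned TOY_MAX_KEYS = (TOY_BLOCK_SIZE - 3) / (2 * sizeof(uint64_t));
static_assert(sizeof(toy_fields_3_t<TOY_MAX_KEYS>) <= TOY_BLOCK_SIZE &&
              sizeof(toy_fields_3_t<TOY_MAX_KEYS + 1>) > TOY_BLOCK_SIZE);

int main() {
  std::cout << "keys per block: " << TOY_MAX_KEYS << '\n';
}
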
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h
new file mode 100644
index 000000000..cac167a98
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h
@@ -0,0 +1,2186 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cassert>
+#include <optional>
+#include <ostream>
+#include <sstream>
+#include <type_traits>
+
+#include "common/likely.h"
+
+#include "sub_items_stage.h"
+#include "item_iterator_stage.h"
+
+namespace crimson::os::seastore::onode {
+
+struct search_result_bs_t {
+ index_t index;
+ MatchKindBS match;
+};
+template <typename FGetKey>
+search_result_bs_t binary_search(
+ const full_key_t<KeyT::HOBJ>& key,
+ index_t begin, index_t end, FGetKey&& f_get_key) {
+ assert(begin <= end);
+ while (begin < end) {
+ auto total = begin + end;
+ auto mid = total >> 1;
+ // do not copy if return value is reference
+ decltype(f_get_key(mid)) target = f_get_key(mid);
+ auto match = compare_to<KeyT::HOBJ>(key, target);
+ if (match == MatchKindCMP::LT) {
+ end = mid;
+ } else if (match == MatchKindCMP::GT) {
+ begin = mid + 1;
+ } else {
+ return {mid, MatchKindBS::EQ};
+ }
+ }
+ return {begin, MatchKindBS::NE};
+}
+
+template <typename PivotType, typename FGet>
+search_result_bs_t binary_search_r(
+ index_t rend, index_t rbegin, FGet&& f_get, const PivotType& key) {
+ assert(rend <= rbegin);
+ while (rend < rbegin) {
+ auto total = rend + rbegin + 1;
+ auto mid = total >> 1;
+ // do not copy if return value is reference
+ decltype(f_get(mid)) target = f_get(mid);
+ int match = target - key;
+ if (match < 0) {
+ rend = mid;
+ } else if (match > 0) {
+ rbegin = mid - 1;
+ } else {
+ return {mid, MatchKindBS::EQ};
+ }
+ }
+ return {rbegin, MatchKindBS::NE};
+}
+
+inline bool matchable(field_type_t type, match_stat_t mstat) {
+ assert(mstat >= MSTAT_MIN && mstat <= MSTAT_MAX);
+ /*
+ * compressed prefix by field type:
+ * N0: NONE
+ * N1: pool/shard
+ * N2: pool/shard crush
+ * N3: pool/shard crush ns/oid
+ *
+ * if key matches the node's compressed prefix, return true
+ * else, return false
+ */
+#ifndef NDEBUG
+ if (mstat == MSTAT_END) {
+ assert(type == field_type_t::N0);
+ }
+#endif
+ return mstat + to_unsigned(type) < 4;
+}
+
+inline void assert_mstat(
+ const full_key_t<KeyT::HOBJ>& key,
+ const full_key_t<KeyT::VIEW>& index,
+ match_stat_t mstat) {
+ assert(mstat >= MSTAT_MIN && mstat <= MSTAT_LT2);
+ // key < index ...
+ switch (mstat) {
+ case MSTAT_EQ:
+ break;
+ case MSTAT_LT0:
+ assert(compare_to<KeyT::HOBJ>(key, index.snap_gen_packed()) == MatchKindCMP::LT);
+ break;
+ case MSTAT_LT1:
+ assert(compare_to<KeyT::HOBJ>(key, index.ns_oid_view()) == MatchKindCMP::LT);
+ break;
+ case MSTAT_LT2:
+ if (index.has_shard_pool()) {
+ assert(compare_to<KeyT::HOBJ>(key, shard_pool_crush_t{
+ index.shard_pool_packed(), index.crush_packed()}) == MatchKindCMP::LT);
+ } else {
+ assert(compare_to<KeyT::HOBJ>(key, index.crush_packed()) == MatchKindCMP::LT);
+ }
+ break;
+ default:
+ ceph_abort("impossible path");
+ }
+ // key == index ...
+ switch (mstat) {
+ case MSTAT_EQ:
+ assert(compare_to<KeyT::HOBJ>(key, index.snap_gen_packed()) == MatchKindCMP::EQ);
+ case MSTAT_LT0:
+ if (!index.has_ns_oid())
+ break;
+ assert(index.ns_oid_view().type() == ns_oid_view_t::Type::MAX ||
+ compare_to<KeyT::HOBJ>(key, index.ns_oid_view()) == MatchKindCMP::EQ);
+ case MSTAT_LT1:
+ if (!index.has_crush())
+ break;
+ assert(compare_to<KeyT::HOBJ>(key, index.crush_packed()) == MatchKindCMP::EQ);
+ if (!index.has_shard_pool())
+ break;
+ assert(compare_to<KeyT::HOBJ>(key, index.shard_pool_packed()) == MatchKindCMP::EQ);
+ default:
+ break;
+ }
+}
+
+#define NXT_STAGE_T staged<next_param_t>
+
+enum class TrimType { BEFORE, AFTER, AT };
+
+/**
+ * staged
+ *
+ * Implements recursive logic that modifies or reads the node layout
+ * (N0/N1/N2/N3 * LEAF/INTERNAL) with the multi-stage design. The specific
+ * stage implementation is flexible. So the implementations for different
+ * stages can be assembled independently, as long as they follow the
+ * definitions of container interfaces.
+ *
+ * Multi-stage is designed to index different portions of onode keys
+ * stage-by-stage. There are at most 3 stages for a node:
+ * - STAGE_LEFT: index shard-pool-crush for N0, or index crush for N1 node;
+ * - STAGE_STRING: index ns-oid for N0/N1/N2 nodes;
+ * - STAGE_RIGHT: index snap-gen for N0/N1/N2/N3 nodes;
+ *
+ * The intention is to consolidate the high-level indexing implementations at
+ * the level of stage, so we don't need to write them repeatedly for every
+ * stage and for every node type.
+ */
+template <typename Params>
+struct staged {
+ static_assert(Params::STAGE >= STAGE_BOTTOM);
+ static_assert(Params::STAGE <= STAGE_TOP);
+ using container_t = typename Params::container_t;
+ using key_get_type = typename container_t::key_get_type;
+ using next_param_t = typename Params::next_param_t;
+ using position_t = staged_position_t<Params::STAGE>;
+ using result_t = staged_result_t<Params::NODE_TYPE, Params::STAGE>;
+ using value_t = value_type_t<Params::NODE_TYPE>;
+ static constexpr auto CONTAINER_TYPE = container_t::CONTAINER_TYPE;
+ static constexpr bool IS_BOTTOM = (Params::STAGE == STAGE_BOTTOM);
+ static constexpr auto NODE_TYPE = Params::NODE_TYPE;
+ static constexpr auto STAGE = Params::STAGE;
+
+ template <bool is_exclusive>
+ static void _left_or_right(index_t& split_index, index_t insert_index,
+ std::optional<bool>& is_insert_left) {
+ assert(!is_insert_left.has_value());
+ assert(is_valid_index(split_index));
+ if constexpr (is_exclusive) {
+ if (split_index <= insert_index) {
+ // ...[s_index-1] |!| (i_index) [s_index]...
+ // offset i_position to right
+ is_insert_left = false;
+ } else {
+ // ...[s_index-1] (i_index)) |?[s_index]| ...
+ // ...(i_index)...[s_index-1] |?[s_index]| ...
+ is_insert_left = true;
+ --split_index;
+ }
+ } else {
+ if (split_index < insert_index) {
+ // ...[s_index-1] |?[s_index]| ...[(i_index)[s_index_k]...
+ is_insert_left = false;
+ } else if (split_index > insert_index) {
+ // ...[(i_index)s_index-1] |?[s_index]| ...
+ // ...[(i_index)s_index_k]...[s_index-1] |?[s_index]| ...
+ is_insert_left = true;
+ } else {
+ // ...[s_index-1] |?[(i_index)s_index]| ...
+ // i_to_left = std::nullopt;
+ }
+ }
+ }
+
+ template <ContainerType CTYPE, typename Enable = void> class _iterator_t;
+ template <ContainerType CTYPE>
+ class _iterator_t<CTYPE, std::enable_if_t<CTYPE == ContainerType::INDEXABLE>> {
+ /*
+ * indexable container type system:
+ * CONTAINER_TYPE = ContainerType::INDEXABLE
+ * keys() const -> index_t
+ * operator[](index_t) const -> key_get_type
+ * size_before(index_t) const -> node_offset_t
+ * size_overhead_at(index_t) const -> node_offset_t
+ * (IS_BOTTOM) get_p_value(index_t) const -> const value_t*
+ * (!IS_BOTTOM) size_to_nxt_at(index_t) const -> node_offset_t
+ * (!IS_BOTTOM) get_nxt_container(index_t) const
+ * encode(p_node_start, encoded)
+ * decode(p_node_start, delta) -> container_t
+ * static:
+ * header_size() -> node_offset_t
+ * estimate_insert(key, value) -> node_offset_t
+ * (IS_BOTTOM) insert_at(mut, src, key, value,
+ * index, size, p_left_bound) -> const value_t*
+ * (!IS_BOTTOM) insert_prefix_at(mut, src, key,
+ * index, size, p_left_bound) -> memory_range_t
+ * (!IS_BOTTOM) update_size_at(mut, src, index, size)
+ * trim_until(mut, container, index) -> trim_size
+ * (!IS_BOTTOM) trim_at(mut, container, index, trimmed) -> trim_size
+ *
+ * Appender::append(const container_t& src, from, items)
+ */
+ public:
+ using me_t = _iterator_t<CTYPE>;
+
+ _iterator_t(const container_t& container) : container{container} {
+ assert(container.keys());
+ }
+
+ index_t index() const {
+ return _index;
+ }
+ key_get_type get_key() const {
+ assert(!is_end());
+ return container[_index];
+ }
+ node_offset_t size_to_nxt() const {
+ assert(!is_end());
+ return container.size_to_nxt_at(_index);
+ }
+ template <typename T = typename NXT_STAGE_T::container_t>
+ std::enable_if_t<!IS_BOTTOM, T> get_nxt_container() const {
+ assert(!is_end());
+ return container.get_nxt_container(_index);
+ }
+ template <typename T = value_t>
+ std::enable_if_t<IS_BOTTOM, const T*> get_p_value() const {
+ assert(!is_end());
+ return container.get_p_value(_index);
+ }
+ bool is_last() const {
+ return _index + 1 == container.keys();
+ }
+ bool is_end() const { return _index == container.keys(); }
+ node_offset_t size() const {
+ assert(!is_end());
+ assert(header_size() == container.size_before(0));
+ assert(container.size_before(_index + 1) > container.size_before(_index));
+ return container.size_before(_index + 1) -
+ container.size_before(_index);
+ }
+ node_offset_t size_overhead() const {
+ assert(!is_end());
+ return container.size_overhead_at(_index);
+ }
+
+ me_t& operator++() {
+ assert(!is_end());
+ assert(!is_last());
+ ++_index;
+ return *this;
+ }
+ void seek_at(index_t index) {
+ assert(index < container.keys());
+ seek_till_end(index);
+ }
+ void seek_till_end(index_t index) {
+ assert(!is_end());
+ assert(this->index() == 0);
+ assert(index <= container.keys());
+ _index = index;
+ }
+ void seek_last() {
+ assert(!is_end());
+ assert(index() == 0);
+ _index = container.keys() - 1;
+ }
+ void set_end() {
+ assert(!is_end());
+ assert(is_last());
+ ++_index;
+ }
+ // Note: possible to return an end iterator
+ MatchKindBS seek(const full_key_t<KeyT::HOBJ>& key, bool exclude_last) {
+ assert(!is_end());
+ assert(index() == 0);
+ index_t end_index = container.keys();
+ if (exclude_last) {
+ assert(end_index);
+ --end_index;
+ assert(compare_to<KeyT::HOBJ>(key, container[end_index]) == MatchKindCMP::LT);
+ }
+ auto ret = binary_search(key, _index, end_index,
+ [this] (index_t index) { return container[index]; });
+ _index = ret.index;
+ return ret.match;
+ }
+
+ template <KeyT KT, typename T = value_t>
+ std::enable_if_t<IS_BOTTOM, const T*> insert(
+ NodeExtentMutable& mut, const full_key_t<KT>& key,
+ const value_t& value, node_offset_t insert_size, const char* p_left_bound) {
+ return container_t::template insert_at<KT>(
+ mut, container, key, value, _index, insert_size, p_left_bound);
+ }
+
+ template <KeyT KT, typename T = memory_range_t>
+ std::enable_if_t<!IS_BOTTOM, T> insert_prefix(
+ NodeExtentMutable& mut, const full_key_t<KT>& key,
+ node_offset_t size, const char* p_left_bound) {
+ return container_t::template insert_prefix_at<KT>(
+ mut, container, key, _index, size, p_left_bound);
+ }
+
+ template <typename T = void>
+ std::enable_if_t<!IS_BOTTOM, T>
+ update_size(NodeExtentMutable& mut, node_offset_t insert_size) {
+ assert(!is_end());
+ container_t::update_size_at(mut, container, _index, insert_size);
+ }
+
+ // Note: possible to return an end iterator when is_exclusive is true
+ template <bool is_exclusive>
+ size_t seek_split_inserted(
+ size_t start_size, size_t extra_size, size_t target_size,
+ index_t& insert_index, size_t insert_size,
+ std::optional<bool>& is_insert_left) {
+ assert(!is_end());
+ assert(index() == 0);
+ // replace insert_index placeholder
+ if constexpr (!is_exclusive) {
+ if (insert_index == INDEX_LAST) {
+ insert_index = container.keys() - 1;
+ }
+ } else {
+ if (insert_index == INDEX_END) {
+ insert_index = container.keys();
+ }
+ }
+ assert(insert_index <= container.keys());
+
+ auto start_size_1 = start_size + extra_size;
+ auto f_get_used_size = [this, start_size, start_size_1,
+ insert_index, insert_size] (index_t index) {
+ size_t current_size;
+ if (unlikely(index == 0)) {
+ current_size = start_size;
+ } else {
+ current_size = start_size_1;
+ if (index > insert_index) {
+ current_size += insert_size;
+ if constexpr (is_exclusive) {
+ --index;
+ }
+ }
+ // already includes header size
+ current_size += container.size_before(index);
+ }
+ return current_size;
+ };
+ index_t s_end;
+ if constexpr (is_exclusive) {
+ s_end = container.keys();
+ } else {
+ s_end = container.keys() - 1;
+ }
+ _index = binary_search_r(0, s_end, f_get_used_size, target_size).index;
+ size_t current_size = f_get_used_size(_index);
+ assert(current_size <= target_size);
+
+ _left_or_right<is_exclusive>(_index, insert_index, is_insert_left);
+ return current_size;
+ }
+
+ size_t seek_split(size_t start_size, size_t extra_size, size_t target_size) {
+ assert(!is_end());
+ assert(index() == 0);
+ auto start_size_1 = start_size + extra_size;
+ auto f_get_used_size = [this, start_size, start_size_1] (index_t index) {
+ size_t current_size;
+ if (unlikely(index == 0)) {
+ current_size = start_size;
+ } else {
+ // already includes header size
+ current_size = start_size_1 + container.size_before(index);
+ }
+ return current_size;
+ };
+ _index = binary_search_r(
+ 0, container.keys() - 1, f_get_used_size, target_size).index;
+ size_t current_size = f_get_used_size(_index);
+ assert(current_size <= target_size);
+ return current_size;
+ }
+
+  // Note: possible to return an end iterator if to_index == INDEX_END
+ template <KeyT KT>
+ void copy_out_until(
+ typename container_t::template Appender<KT>& appender, index_t& to_index) {
+ auto num_keys = container.keys();
+ index_t items;
+ if (to_index == INDEX_END) {
+ items = num_keys - _index;
+ appender.append(container, _index, items);
+ _index = num_keys;
+ to_index = _index;
+ } else if (to_index == INDEX_LAST) {
+ assert(!is_end());
+ items = num_keys - 1 - _index;
+ appender.append(container, _index, items);
+ _index = num_keys - 1;
+ to_index = _index;
+ } else {
+ assert(_index <= to_index);
+ assert(to_index <= num_keys);
+ items = to_index - _index;
+ appender.append(container, _index, items);
+ _index = to_index;
+ }
+ }
+
+ node_offset_t trim_until(NodeExtentMutable& mut) {
+ return container_t::trim_until(mut, container, _index);
+ }
+
+ template <typename T = node_offset_t>
+ std::enable_if_t<!IS_BOTTOM, T>
+ trim_at(NodeExtentMutable& mut, node_offset_t trimmed) {
+ return container_t::trim_at(mut, container, _index, trimmed);
+ }
+
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const {
+ container.encode(p_node_start, encoded);
+ ceph::encode(_index, encoded);
+ }
+
+ static me_t decode(const char* p_node_start,
+ ceph::bufferlist::const_iterator& delta) {
+ auto container = container_t::decode(p_node_start, delta);
+ auto ret = me_t(container);
+ index_t index;
+ ceph::decode(index, delta);
+ ret.seek_till_end(index);
+ return ret;
+ }
+
+ static node_offset_t header_size() {
+ return container_t::header_size();
+ }
+
+ template <KeyT KT>
+ static node_offset_t estimate_insert(
+ const full_key_t<KT>& key, const value_t& value) {
+ return container_t::template estimate_insert<KT>(key, value);
+ }
+
+ private:
+ container_t container;
+ index_t _index = 0;
+ };
+
+ template <ContainerType CTYPE>
+ class _iterator_t<CTYPE, std::enable_if_t<CTYPE == ContainerType::ITERATIVE>> {
+ /*
+ * iterative container type system (!IS_BOTTOM):
+ * CONTAINER_TYPE = ContainerType::ITERATIVE
+ * index() const -> index_t
+ * get_key() const -> key_get_type
+ * size() const -> node_offset_t
+ * size_to_nxt() const -> node_offset_t
+ * size_overhead() const -> node_offset_t
+ * get_nxt_container() const
+ * has_next() const -> bool
+ * encode(p_node_start, encoded)
+ * decode(p_node_start, delta) -> container_t
+ * operator++()
+ * static:
+ * header_size() -> node_offset_t
+ * estimate_insert(key, value) -> node_offset_t
+ * insert_prefix(mut, src, key, is_end, size, p_left_bound) -> memory_range_t
+ * update_size(mut, src, size)
+ * trim_until(mut, container) -> trim_size
+ * trim_at(mut, container, trimmed) -> trim_size
+ */
+ // currently the iterative iterator is only implemented with STAGE_STRING
+ // for in-node space efficiency
+ static_assert(STAGE == STAGE_STRING);
+ public:
+ using me_t = _iterator_t<CTYPE>;
+
+ _iterator_t(const container_t& container) : container{container} {}
+
+ index_t index() const {
+ if (is_end()) {
+ return container.index() + 1;
+ } else {
+ return container.index();
+ }
+ }
+ key_get_type get_key() const {
+ assert(!is_end());
+ return container.get_key();
+ }
+ node_offset_t size_to_nxt() const {
+ assert(!is_end());
+ return container.size_to_nxt();
+ }
+ const typename NXT_STAGE_T::container_t get_nxt_container() const {
+ assert(!is_end());
+ return container.get_nxt_container();
+ }
+ bool is_last() const {
+ assert(!is_end());
+ return !container.has_next();
+ }
+ bool is_end() const {
+#ifndef NDEBUG
+ if (_is_end) {
+ assert(!container.has_next());
+ }
+#endif
+ return _is_end;
+ }
+ node_offset_t size() const {
+ assert(!is_end());
+ return container.size();
+ }
+ node_offset_t size_overhead() const {
+ assert(!is_end());
+ return container.size_overhead();
+ }
+
+ me_t& operator++() {
+ assert(!is_end());
+ assert(!is_last());
+ ++container;
+ return *this;
+ }
+ void seek_at(index_t index) {
+ assert(!is_end());
+ assert(this->index() == 0);
+ while (index > 0) {
+ assert(container.has_next());
+ ++container;
+ --index;
+ }
+ }
+ void seek_till_end(index_t index) {
+ assert(!is_end());
+ assert(this->index() == 0);
+ while (index > 0) {
+ if (!container.has_next()) {
+ assert(index == 1);
+ set_end();
+ break;
+ }
+ ++container;
+ --index;
+ }
+ }
+ void seek_last() {
+ assert(!is_end());
+ assert(index() == 0);
+ while (container.has_next()) {
+ ++container;
+ }
+ }
+ void set_end() {
+ assert(!is_end());
+ assert(is_last());
+ _is_end = true;
+ }
+ // Note: possible to return an end iterator
+ MatchKindBS seek(const full_key_t<KeyT::HOBJ>& key, bool exclude_last) {
+ assert(!is_end());
+ assert(index() == 0);
+ do {
+ if (exclude_last && is_last()) {
+ assert(compare_to<KeyT::HOBJ>(key, get_key()) == MatchKindCMP::LT);
+ return MatchKindBS::NE;
+ }
+ auto match = compare_to<KeyT::HOBJ>(key, get_key());
+ if (match == MatchKindCMP::LT) {
+ return MatchKindBS::NE;
+ } else if (match == MatchKindCMP::EQ) {
+ return MatchKindBS::EQ;
+ } else {
+ if (container.has_next()) {
+ ++container;
+ } else {
+ // end
+ break;
+ }
+ }
+ } while (true);
+ assert(!exclude_last);
+ set_end();
+ return MatchKindBS::NE;
+ }
+
+ template <KeyT KT>
+ memory_range_t insert_prefix(
+ NodeExtentMutable& mut, const full_key_t<KT>& key,
+ node_offset_t size, const char* p_left_bound) {
+ return container_t::template insert_prefix<KT>(
+ mut, container, key, is_end(), size, p_left_bound);
+ }
+
+ void update_size(NodeExtentMutable& mut, node_offset_t insert_size) {
+ assert(!is_end());
+ container_t::update_size(mut, container, insert_size);
+ }
+
+ // Note: possible to return an end iterator when is_exclusive is true
+ // insert_index can still be INDEX_LAST or INDEX_END
+ template <bool is_exclusive>
+ size_t seek_split_inserted(
+ size_t start_size, size_t extra_size, size_t target_size,
+ index_t& insert_index, size_t insert_size,
+ std::optional<bool>& is_insert_left) {
+ assert(!is_end());
+ assert(index() == 0);
+ size_t current_size = start_size;
+ index_t split_index = 0;
+ extra_size += header_size();
+ do {
+ if constexpr (!is_exclusive) {
+ if (is_last()) {
+ assert(split_index == index());
+ if (insert_index == INDEX_LAST) {
+ insert_index = index();
+ }
+ assert(insert_index <= index());
+ break;
+ }
+ }
+
+ size_t nxt_size = current_size;
+ if (split_index == 0) {
+ nxt_size += extra_size;
+ }
+ if (split_index == insert_index) {
+ nxt_size += insert_size;
+ if constexpr (is_exclusive) {
+ if (nxt_size > target_size) {
+ break;
+ }
+ current_size = nxt_size;
+ ++split_index;
+ }
+ }
+ nxt_size += size();
+ if (nxt_size > target_size) {
+ break;
+ }
+ current_size = nxt_size;
+
+ if constexpr (is_exclusive) {
+ if (is_last()) {
+ assert(split_index == index());
+ set_end();
+ split_index = index();
+ if (insert_index == INDEX_END) {
+ insert_index = index();
+ }
+ assert(insert_index == index());
+ break;
+ } else {
+ ++(*this);
+ ++split_index;
+ }
+ } else {
+ ++(*this);
+ ++split_index;
+ }
+ } while (true);
+ assert(current_size <= target_size);
+
+ _left_or_right<is_exclusive>(split_index, insert_index, is_insert_left);
+ assert(split_index == index());
+ return current_size;
+ }
+
+ size_t seek_split(size_t start_size, size_t extra_size, size_t target_size) {
+ assert(!is_end());
+ assert(index() == 0);
+ size_t current_size = start_size;
+ do {
+ if (is_last()) {
+ break;
+ }
+
+ size_t nxt_size = current_size;
+ if (index() == 0) {
+ nxt_size += extra_size;
+ }
+ nxt_size += size();
+ if (nxt_size > target_size) {
+ break;
+ }
+ current_size = nxt_size;
+ ++(*this);
+ } while (true);
+ assert(current_size <= target_size);
+ return current_size;
+ }
+
+  // Note: possible to return an end iterator if to_index == INDEX_END
+ template <KeyT KT>
+ void copy_out_until(
+ typename container_t::template Appender<KT>& appender, index_t& to_index) {
+ if (is_end()) {
+ assert(!container.has_next());
+ if (to_index == INDEX_END) {
+ to_index = index();
+ }
+ assert(to_index == index());
+ return;
+ }
+ index_t items;
+ if (to_index == INDEX_END || to_index == INDEX_LAST) {
+ items = to_index;
+ } else {
+ assert(is_valid_index(to_index));
+ assert(index() <= to_index);
+ items = to_index - index();
+ }
+ if (appender.append(container, items)) {
+ set_end();
+ }
+ to_index = index();
+ }
+
+ node_offset_t trim_until(NodeExtentMutable& mut) {
+ if (is_end()) {
+ return 0;
+ }
+ return container_t::trim_until(mut, container);
+ }
+
+ node_offset_t trim_at(NodeExtentMutable& mut, node_offset_t trimmed) {
+ assert(!is_end());
+ return container_t::trim_at(mut, container, trimmed);
+ }
+
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const {
+ container.encode(p_node_start, encoded);
+ uint8_t is_end = _is_end;
+ ceph::encode(is_end, encoded);
+ }
+
+ static me_t decode(const char* p_node_start,
+ ceph::bufferlist::const_iterator& delta) {
+ auto container = container_t::decode(p_node_start, delta);
+ auto ret = me_t(container);
+ uint8_t is_end;
+ ceph::decode(is_end, delta);
+ if (is_end) {
+ ret.set_end();
+ }
+ return ret;
+ }
+
+ static node_offset_t header_size() {
+ return container_t::header_size();
+ }
+
+ template <KeyT KT>
+ static node_offset_t estimate_insert(const full_key_t<KT>& key, const value_t& value) {
+ return container_t::template estimate_insert<KT>(key, value);
+ }
+
+ private:
+ container_t container;
+ bool _is_end = false;
+ };
+
+ /*
+ * iterator_t encapsulates both indexable and iterative implementations
+ * from a *non-empty* container.
+ * cstr(const container_t&)
+ * access:
+ * index() -> index_t
+ * get_key() -> key_get_type (const reference or value type)
+ * is_last() -> bool
+ * is_end() -> bool
+ * size() -> node_offset_t
+ * size_overhead() -> node_offset_t
+ * (IS_BOTTOM) get_p_value() -> const value_t*
+ * (!IS_BOTTOM) get_nxt_container() -> nxt_stage::container_t
+ * (!IS_BOTTOM) size_to_nxt() -> node_offset_t
+ * seek:
+ * operator++() -> iterator_t&
+ * seek_at(index)
+ * seek_till_end(index)
+ * seek_last()
+ * set_end()
+ * seek(key, exclude_last) -> MatchKindBS
+ * insert:
+ * (IS_BOTTOM) insert(mut, key, value, size, p_left_bound) -> p_value
+ * (!IS_BOTTOM) insert_prefix(mut, key, size, p_left_bound) -> memory_range_t
+ * (!IS_BOTTOM) update_size(mut, size)
+ * split:
+ * seek_split_inserted<bool is_exclusive>(
+ * start_size, extra_size, target_size, insert_index, insert_size,
+ * std::optional<bool>& is_insert_left)
+ * -> insert to left/right/unknown (!exclusive)
+ * -> insert to left/right (exclusive, can be end)
+ * -> split_size
+ * seek_split(start_size, extra_size, target_size) -> split_size
+ * copy_out_until(appender, to_index) (can be end)
+ * trim_until(mut) -> trim_size
+ * (!IS_BOTTOM) trim_at(mut, trimmed) -> trim_size
+ * denc:
+ * encode(p_node_start, encoded)
+ * decode(p_node_start, delta) -> iterator_t
+ * static:
+ * header_size() -> node_offset_t
+ * estimate_insert(key, value) -> node_offset_t
+ */
+ using iterator_t = _iterator_t<CONTAINER_TYPE>;
+  /* TODO: detailed comments
+   * - trim_until(mut) -> trim_size
+   *   * keep the entries at [0, index) and remove the rest; return the size trimmed.
+   *   * if this is the end iterator, do nothing and return 0.
+   *   * if this is the start iterator, the caller normally needs to go to the
+   *     higher stage to trim the entire container.
+   * - trim_at(mut, trimmed) -> trim_size
+   *   * the trim happens inside the current iterator, reducing its size by
+   *     <trimmed>; return the total size trimmed.
+   */
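+  /*
+   * Illustrative sketch (the `container` and `visit()` below are hypothetical,
+   * assuming a conforming, non-empty container_t): a linear scan over one
+   * stage via iterator_t could look like:
+   *
+   *   auto iter = iterator_t(container);
+   *   do {
+   *     visit(iter.get_key(), iter.size());
+   *     if (iter.is_last()) {
+   *       break;
+   *     }
+   *     ++iter;
+   *   } while (true);
+   *
+   * This mirrors the traversal pattern used by dump(), validate() and
+   * get_stats() later in this struct.
+   */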
+
+ /*
+ * Lookup internals (hide?)
+ */
+
+ template <bool GET_KEY>
+ static result_t smallest_result(
+ const iterator_t& iter, full_key_t<KeyT::VIEW>* index_key) {
+ static_assert(!IS_BOTTOM);
+ assert(!iter.is_end());
+ auto pos_smallest = NXT_STAGE_T::position_t::begin();
+ auto nxt_container = iter.get_nxt_container();
+ auto value_ptr = NXT_STAGE_T::template get_p_value<GET_KEY>(
+ nxt_container, pos_smallest, index_key);
+ if constexpr (GET_KEY) {
+ index_key->set(iter.get_key());
+ }
+ return result_t{{iter.index(), pos_smallest}, value_ptr, STAGE};
+ }
+
+ template <bool GET_KEY>
+ static result_t nxt_lower_bound(
+ const full_key_t<KeyT::HOBJ>& key, iterator_t& iter,
+ MatchHistory& history, full_key_t<KeyT::VIEW>* index_key) {
+ static_assert(!IS_BOTTOM);
+ assert(!iter.is_end());
+ auto nxt_container = iter.get_nxt_container();
+ auto nxt_result = NXT_STAGE_T::template lower_bound<GET_KEY>(
+ nxt_container, key, history, index_key);
+ if (nxt_result.is_end()) {
+ if (iter.is_last()) {
+ return result_t::end();
+ } else {
+ return smallest_result<GET_KEY>(++iter, index_key);
+ }
+ } else {
+ if constexpr (GET_KEY) {
+ index_key->set(iter.get_key());
+ }
+ return result_t::from_nxt(iter.index(), nxt_result);
+ }
+ }
+
+ template <bool GET_POS, bool GET_KEY, bool GET_VAL>
+ static void lookup_largest_slot(
+ const container_t& container, position_t* p_position,
+ full_key_t<KeyT::VIEW>* p_index_key, const value_t** pp_value) {
+ auto iter = iterator_t(container);
+ iter.seek_last();
+ if constexpr (GET_KEY) {
+ assert(p_index_key);
+ p_index_key->set(iter.get_key());
+ }
+ if constexpr (GET_POS) {
+ assert(p_position);
+ p_position->index = iter.index();
+ }
+ if constexpr (IS_BOTTOM) {
+ if constexpr (GET_VAL) {
+ assert(pp_value);
+ *pp_value = iter.get_p_value();
+ }
+ } else {
+ auto nxt_container = iter.get_nxt_container();
+ if constexpr (GET_POS) {
+ NXT_STAGE_T::template lookup_largest_slot<true, GET_KEY, GET_VAL>(
+ nxt_container, &p_position->nxt, p_index_key, pp_value);
+ } else {
+ NXT_STAGE_T::template lookup_largest_slot<false, GET_KEY, GET_VAL>(
+ nxt_container, nullptr, p_index_key, pp_value);
+ }
+ }
+ }
+
+ template <bool GET_KEY = false>
+ static const value_t* get_p_value(
+ const container_t& container, const position_t& position,
+ full_key_t<KeyT::VIEW>* index_key = nullptr) {
+ auto iter = iterator_t(container);
+ iter.seek_at(position.index);
+ if constexpr (GET_KEY) {
+ index_key->set(iter.get_key());
+ }
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_container = iter.get_nxt_container();
+ return NXT_STAGE_T::template get_p_value<GET_KEY>(
+ nxt_container, position.nxt, index_key);
+ } else {
+ return iter.get_p_value();
+ }
+ }
+
+ static void get_key_view(
+ const container_t& container,
+ const position_t& position,
+ full_key_t<KeyT::VIEW>& index_key) {
+ auto iter = iterator_t(container);
+ iter.seek_at(position.index);
+ index_key.set(iter.get_key());
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_container = iter.get_nxt_container();
+ return NXT_STAGE_T::get_key_view(nxt_container, position.nxt, index_key);
+ }
+ }
+
+ template <bool GET_KEY = false>
+ static result_t lower_bound(
+ const container_t& container,
+ const full_key_t<KeyT::HOBJ>& key,
+ MatchHistory& history,
+ full_key_t<KeyT::VIEW>* index_key = nullptr) {
+ bool exclude_last = false;
+ if (history.get<STAGE>().has_value()) {
+ if (*history.get<STAGE>() == MatchKindCMP::EQ) {
+ // lookup is short-circuited
+ if constexpr (!IS_BOTTOM) {
+ assert(history.get<STAGE - 1>().has_value());
+ if (history.is_GT<STAGE - 1>()) {
+ auto iter = iterator_t(container);
+ bool test_key_equal;
+ if constexpr (STAGE == STAGE_STRING) {
+ // TODO(cross-node string dedup)
+ // test_key_equal = (iter.get_key().type() == ns_oid_view_t::Type::MIN);
+ auto cmp = compare_to<KeyT::HOBJ>(key, iter.get_key());
+ assert(cmp != MatchKindCMP::GT);
+ test_key_equal = (cmp == MatchKindCMP::EQ);
+ } else {
+ auto cmp = compare_to<KeyT::HOBJ>(key, iter.get_key());
+ // From history, key[stage] == parent[stage][index - 1]
+ // which should be the smallest possible value for all
+ // index[stage][*]
+ assert(cmp != MatchKindCMP::GT);
+ test_key_equal = (cmp == MatchKindCMP::EQ);
+ }
+ if (test_key_equal) {
+ return nxt_lower_bound<GET_KEY>(key, iter, history, index_key);
+ } else {
+ // key[stage] < index[stage][left-most]
+ return smallest_result<GET_KEY>(iter, index_key);
+ }
+ }
+ }
+ // IS_BOTTOM || !history.is_GT<STAGE - 1>()
+ auto iter = iterator_t(container);
+ iter.seek_last();
+ if constexpr (STAGE == STAGE_STRING) {
+ // TODO(cross-node string dedup)
+ // assert(iter.get_key().type() == ns_oid_view_t::Type::MAX);
+ assert(compare_to<KeyT::HOBJ>(key, iter.get_key()) == MatchKindCMP::EQ);
+ } else {
+ assert(compare_to<KeyT::HOBJ>(key, iter.get_key()) == MatchKindCMP::EQ);
+ }
+ if constexpr (GET_KEY) {
+ index_key->set(iter.get_key());
+ }
+ if constexpr (IS_BOTTOM) {
+ auto value_ptr = iter.get_p_value();
+ return result_t{{iter.index()}, value_ptr, MSTAT_EQ};
+ } else {
+ auto nxt_container = iter.get_nxt_container();
+ auto nxt_result = NXT_STAGE_T::template lower_bound<GET_KEY>(
+ nxt_container, key, history, index_key);
+ // !history.is_GT<STAGE - 1>() means
+ // key[stage+1 ...] <= index[stage+1 ...][*]
+ assert(!nxt_result.is_end());
+ return result_t::from_nxt(iter.index(), nxt_result);
+ }
+ } else if (*history.get<STAGE>() == MatchKindCMP::LT) {
+ exclude_last = true;
+ }
+ }
+ auto iter = iterator_t(container);
+ auto bs_match = iter.seek(key, exclude_last);
+ if (iter.is_end()) {
+ assert(!exclude_last);
+ assert(bs_match == MatchKindBS::NE);
+ history.set<STAGE>(MatchKindCMP::GT);
+ return result_t::end();
+ }
+ history.set<STAGE>(bs_match == MatchKindBS::EQ ?
+ MatchKindCMP::EQ : MatchKindCMP::LT);
+ if constexpr (IS_BOTTOM) {
+ if constexpr (GET_KEY) {
+ index_key->set(iter.get_key());
+ }
+ auto value_ptr = iter.get_p_value();
+ return result_t{{iter.index()}, value_ptr,
+ (bs_match == MatchKindBS::EQ ? MSTAT_EQ : MSTAT_LT0)};
+ } else {
+ if (bs_match == MatchKindBS::EQ) {
+ return nxt_lower_bound<GET_KEY>(key, iter, history, index_key);
+ } else {
+ return smallest_result<GET_KEY>(iter, index_key);
+ }
+ }
+ }
+
+ template <KeyT KT>
+ static node_offset_t insert_size(const full_key_t<KT>& key, const value_t& value) {
+ if constexpr (IS_BOTTOM) {
+ return iterator_t::template estimate_insert<KT>(key, value);
+ } else {
+ return iterator_t::template estimate_insert<KT>(key, value) +
+ NXT_STAGE_T::iterator_t::header_size() +
+ NXT_STAGE_T::template insert_size<KT>(key, value);
+ }
+ }
+
+ template <KeyT KT>
+ static node_offset_t insert_size_at(
+ match_stage_t stage, const full_key_t<KeyT::HOBJ>& key, const value_t& value) {
+ if (stage == STAGE) {
+ return insert_size<KT>(key, value);
+ } else {
+ assert(stage < STAGE);
+ return NXT_STAGE_T::template insert_size_at<KT>(stage, key, value);
+ }
+ }
+
+ template <typename T = std::tuple<match_stage_t, node_offset_t>>
+ static std::enable_if_t<NODE_TYPE == node_type_t::INTERNAL, T> evaluate_insert(
+ const container_t& container, const full_key_t<KeyT::VIEW>& key,
+ const value_t& value, position_t& position, bool evaluate_last) {
+ auto iter = iterator_t(container);
+ auto& index = position.index;
+ if (evaluate_last || index == INDEX_END) {
+ iter.seek_last();
+ index = iter.index();
+ // evaluate the previous index
+ } else {
+ assert(is_valid_index(index));
+ // evaluate the current index
+ iter.seek_at(index);
+ auto match = compare_to<KeyT::VIEW>(key, iter.get_key());
+ if (match == MatchKindCMP::EQ) {
+ if constexpr (IS_BOTTOM) {
+ ceph_abort("insert conflict at current index!");
+ } else {
+ // insert into the current index
+ auto nxt_container = iter.get_nxt_container();
+ return NXT_STAGE_T::evaluate_insert(
+ nxt_container, key, value, position.nxt, false);
+ }
+ } else {
+ assert(match == MatchKindCMP::LT);
+ if (index == 0) {
+ // already the first index, so insert at the current index
+ return {STAGE, insert_size<KeyT::VIEW>(key, value)};
+ }
+ --index;
+ iter = iterator_t(container);
+ iter.seek_at(index);
+ // proceed to evaluate the previous index
+ }
+ }
+
+ // XXX(multi-type): when key is from a different type of node
+ auto match = compare_to<KeyT::VIEW>(key, iter.get_key());
+ if (match == MatchKindCMP::GT) {
+ // key doesn't match both indexes, so insert at the current index
+ ++index;
+ return {STAGE, insert_size<KeyT::VIEW>(key, value)};
+ } else {
+ assert(match == MatchKindCMP::EQ);
+ if constexpr (IS_BOTTOM) {
+ // ceph_abort?
+ ceph_abort("insert conflict at the previous index!");
+ } else {
+ // insert into the previous index
+ auto nxt_container = iter.get_nxt_container();
+ return NXT_STAGE_T::evaluate_insert(
+ nxt_container, key, value, position.nxt, true);
+ }
+ }
+ }
+
+ template <typename T = bool>
+ static std::enable_if_t<NODE_TYPE == node_type_t::LEAF, T>
+ compensate_insert_position_at(match_stage_t stage, position_t& position) {
+ auto& index = position.index;
+ if (stage == STAGE) {
+ assert(index == 0);
+ // insert at the end of the current stage
+ index = INDEX_END;
+ return true;
+ } else {
+ if constexpr (IS_BOTTOM) {
+ ceph_abort("impossible path");
+ } else {
+ assert(stage < STAGE);
+ bool compensate = NXT_STAGE_T::
+ compensate_insert_position_at(stage, position.nxt);
+ if (compensate) {
+ assert(is_valid_index(index));
+ if (index == 0) {
+ // insert into the *last* index of the current stage
+ index = INDEX_LAST;
+ return true;
+ } else {
+ --index;
+ return false;
+ }
+ } else {
+ return false;
+ }
+ }
+ }
+ }
+
+ static void patch_insert_end(position_t& insert_pos, match_stage_t insert_stage) {
+ assert(insert_stage <= STAGE);
+ if (insert_stage == STAGE) {
+ insert_pos.index = INDEX_END;
+ } else if constexpr (!IS_BOTTOM) {
+ insert_pos.index = INDEX_LAST;
+ NXT_STAGE_T::patch_insert_end(insert_pos.nxt, insert_stage);
+ }
+ }
+
+ template <typename T = std::tuple<match_stage_t, node_offset_t>>
+ static std::enable_if_t<NODE_TYPE == node_type_t::LEAF, T> evaluate_insert(
+ const full_key_t<KeyT::HOBJ>& key, const onode_t& value,
+ const MatchHistory& history, match_stat_t mstat, position_t& position) {
+ match_stage_t insert_stage = STAGE_TOP;
+ while (*history.get_by_stage(insert_stage) == MatchKindCMP::EQ) {
+ assert(insert_stage != STAGE_BOTTOM && "insert conflict!");
+ --insert_stage;
+ }
+
+ if (history.is_GT()) {
+ if (position.is_end()) {
+ // no need to compensate insert position
+ assert(insert_stage <= STAGE && "impossible insert stage");
+ } else if (position == position_t::begin()) {
+ // I must be short-circuited by staged::smallest_result()
+ // in staged::lower_bound(), so we need to rely on mstat instead
+ assert(mstat >= MSTAT_LT0 && mstat <= MSTAT_LT3);
+ if (mstat == MSTAT_LT0) {
+ insert_stage = STAGE_RIGHT;
+ } else if (mstat == MSTAT_LT1) {
+ insert_stage = STAGE_STRING;
+ } else {
+ insert_stage = STAGE_LEFT;
+ }
+ // XXX(multi-type): need to upgrade node type before inserting an
+ // incompatible index at front.
+ assert(insert_stage <= STAGE && "incompatible insert");
+ } else {
+ assert(insert_stage <= STAGE && "impossible insert stage");
+ [[maybe_unused]] bool ret = compensate_insert_position_at(insert_stage, position);
+ assert(!ret);
+ }
+ }
+
+ if (position.is_end()) {
+ patch_insert_end(position, insert_stage);
+ }
+
+ node_offset_t insert_size = insert_size_at<KeyT::HOBJ>(insert_stage, key, value);
+
+ return {insert_stage, insert_size};
+ }
+
+ template <KeyT KT>
+ static const value_t* insert_new(
+ NodeExtentMutable& mut, const memory_range_t& range,
+ const full_key_t<KT>& key, const value_t& value) {
+ char* p_insert = const_cast<char*>(range.p_end);
+ const value_t* p_value = nullptr;
+ StagedAppender<KT> appender;
+ appender.init(&mut, p_insert);
+ appender.append(key, value, p_value);
+ [[maybe_unused]] const char* p_insert_front = appender.wrap();
+ assert(p_insert_front == range.p_start);
+ return p_value;
+ }
+
+ template <KeyT KT, bool SPLIT>
+ static const value_t* proceed_insert_recursively(
+ NodeExtentMutable& mut, const container_t& container,
+ const full_key_t<KT>& key, const value_t& value,
+ position_t& position, match_stage_t& stage,
+ node_offset_t& _insert_size, const char* p_left_bound) {
+ // proceed insert from right to left
+ assert(stage <= STAGE);
+ auto iter = iterator_t(container);
+ auto& index = position.index;
+
+ bool do_insert = false;
+ if (stage == STAGE) {
+ if (index == INDEX_END) {
+ iter.seek_last();
+ iter.set_end();
+ index = iter.index();
+ } else {
+ assert(is_valid_index(index));
+ iter.seek_till_end(index);
+ }
+ do_insert = true;
+ } else { // stage < STAGE
+ if (index == INDEX_LAST) {
+ iter.seek_last();
+ index = iter.index();
+ } else {
+ assert(is_valid_index(index));
+ iter.seek_till_end(index);
+ }
+ if constexpr (SPLIT) {
+ if (iter.is_end()) {
+ // insert at the higher stage due to split
+ do_insert = true;
+ _insert_size = insert_size<KT>(key, value);
+ stage = STAGE;
+ }
+ } else {
+ assert(!iter.is_end());
+ }
+ }
+
+ if (do_insert) {
+ if constexpr (!IS_BOTTOM) {
+ position.nxt = position_t::nxt_t::begin();
+ }
+ assert(_insert_size == insert_size<KT>(key, value));
+ if constexpr (IS_BOTTOM) {
+ return iter.template insert<KT>(
+ mut, key, value, _insert_size, p_left_bound);
+ } else {
+ auto range = iter.template insert_prefix<KT>(
+ mut, key, _insert_size, p_left_bound);
+ return NXT_STAGE_T::template insert_new<KT>(mut, range, key, value);
+ }
+ } else {
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_container = iter.get_nxt_container();
+ auto p_value = NXT_STAGE_T::template proceed_insert_recursively<KT, SPLIT>(
+ mut, nxt_container, key, value,
+ position.nxt, stage, _insert_size, p_left_bound);
+ iter.update_size(mut, _insert_size);
+ return p_value;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ }
+
+ template <KeyT KT, bool SPLIT>
+ static const value_t* proceed_insert(
+ NodeExtentMutable& mut, const container_t& container,
+ const full_key_t<KT>& key, const value_t& value,
+ position_t& position, match_stage_t& stage, node_offset_t& _insert_size) {
+ auto p_left_bound = container.p_left_bound();
+ if (unlikely(!container.keys())) {
+ if (position.is_end()) {
+ position = position_t::begin();
+ assert(stage == STAGE);
+ assert(_insert_size == insert_size<KT>(key, value));
+ } else if (position == position_t::begin()) {
+      // when inserting into a trimmed and empty left node
+ stage = STAGE;
+ _insert_size = insert_size<KT>(key, value);
+ } else {
+ ceph_abort("impossible path");
+ }
+ if constexpr (IS_BOTTOM) {
+ return container_t::template insert_at<KT>(
+ mut, container, key, value, 0, _insert_size, p_left_bound);
+ } else {
+ auto range = container_t::template insert_prefix_at<KT>(
+ mut, container, key, 0, _insert_size, p_left_bound);
+ return NXT_STAGE_T::template insert_new<KT>(mut, range, key, value);
+ }
+ } else {
+ return proceed_insert_recursively<KT, SPLIT>(
+ mut, container, key, value,
+ position, stage, _insert_size, p_left_bound);
+ }
+ }
+
+ static std::ostream& dump(const container_t& container,
+ std::ostream& os,
+ const std::string& prefix,
+ size_t& size,
+ const char* p_start) {
+ auto iter = iterator_t(container);
+ assert(!iter.is_end());
+ std::string prefix_blank(prefix.size(), ' ');
+ const std::string* p_prefix = &prefix;
+ size += iterator_t::header_size();
+ do {
+ std::ostringstream sos;
+ sos << *p_prefix << iter.get_key() << ": ";
+ std::string i_prefix = sos.str();
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_container = iter.get_nxt_container();
+ size += iter.size_to_nxt();
+ NXT_STAGE_T::dump(nxt_container, os, i_prefix, size, p_start);
+ } else {
+ auto value_ptr = iter.get_p_value();
+ int offset = reinterpret_cast<const char*>(value_ptr) - p_start;
+ size += iter.size();
+ os << "\n" << i_prefix;
+ if constexpr (NODE_TYPE == node_type_t::LEAF) {
+ os << *value_ptr;
+ } else {
+ os << "0x" << std::hex << value_ptr->value << std::dec;
+ }
+ os << " " << size << "B"
+ << " @" << offset << "B";
+ }
+ if (iter.is_last()) {
+ break;
+ } else {
+ ++iter;
+ p_prefix = &prefix_blank;
+ }
+ } while (true);
+ return os;
+ }
+
+ static void validate(const container_t& container) {
+ auto iter = iterator_t(container);
+ assert(!iter.is_end());
+ auto key = iter.get_key();
+ do {
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_container = iter.get_nxt_container();
+ NXT_STAGE_T::validate(nxt_container);
+ }
+ if (iter.is_last()) {
+ break;
+ } else {
+ ++iter;
+ assert(compare_to(key, iter.get_key()) == MatchKindCMP::LT);
+ key = iter.get_key();
+ }
+ } while (true);
+ }
+
+ static void get_stats(const container_t& container, node_stats_t& stats,
+ full_key_t<KeyT::VIEW>& index_key) {
+ auto iter = iterator_t(container);
+ assert(!iter.is_end());
+ stats.size_overhead += iterator_t::header_size();
+ do {
+ index_key.replace(iter.get_key());
+ stats.size_overhead += iter.size_overhead();
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_container = iter.get_nxt_container();
+ NXT_STAGE_T::get_stats(nxt_container, stats, index_key);
+ } else {
+ ++stats.num_kvs;
+ size_t kv_logical_size = index_key.size_logical();
+ size_t value_size;
+ if constexpr (NODE_TYPE == node_type_t::LEAF) {
+ value_size = iter.get_p_value()->size;
+ } else {
+ value_size = sizeof(value_t);
+ }
+ stats.size_value += value_size;
+ kv_logical_size += value_size;
+ stats.size_logical += kv_logical_size;
+ }
+ if (iter.is_last()) {
+ break;
+ } else {
+ ++iter;
+ }
+ } while (true);
+ }
+
+ static bool next_position(const container_t& container, position_t& pos) {
+ auto iter = iterator_t(container);
+ assert(!iter.is_end());
+ iter.seek_at(pos.index);
+ bool find_next;
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_container = iter.get_nxt_container();
+ find_next = NXT_STAGE_T::next_position(nxt_container, pos.nxt);
+ } else {
+ find_next = true;
+ }
+ if (find_next) {
+ if (iter.is_last()) {
+ return true;
+ } else {
+ pos.index = iter.index() + 1;
+ if constexpr (!IS_BOTTOM) {
+ pos.nxt = NXT_STAGE_T::position_t::begin();
+ }
+ return false;
+ }
+ } else {
+ return false;
+ }
+ }
+
+ struct _BaseEmpty {};
+ class _BaseWithNxtIterator {
+ protected:
+ typename NXT_STAGE_T::StagedIterator _nxt;
+ };
+ class StagedIterator
+ : std::conditional_t<IS_BOTTOM, _BaseEmpty, _BaseWithNxtIterator> {
+ public:
+ StagedIterator() = default;
+ bool valid() const { return iter.has_value(); }
+ index_t index() const {
+ return iter->index();
+ }
+ bool is_end() const { return iter->is_end(); }
+ bool in_progress() const {
+ assert(valid());
+ if constexpr (!IS_BOTTOM) {
+ if (this->_nxt.valid()) {
+ if (this->_nxt.index() == 0) {
+ return this->_nxt.in_progress();
+ } else {
+ return true;
+ }
+ } else {
+ return false;
+ }
+ } else {
+ return false;
+ }
+ }
+ key_get_type get_key() const { return iter->get_key(); }
+
+ iterator_t& get() { return *iter; }
+ void set(const container_t& container) {
+ assert(!valid());
+ iter = iterator_t(container);
+ }
+ void set_end() { iter->set_end(); }
+ typename NXT_STAGE_T::StagedIterator& nxt() {
+ if constexpr (!IS_BOTTOM) {
+ if (!this->_nxt.valid()) {
+ auto nxt_container = iter->get_nxt_container();
+ this->_nxt.set(nxt_container);
+ }
+ return this->_nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ typename NXT_STAGE_T::StagedIterator& get_nxt() {
+ if constexpr (!IS_BOTTOM) {
+ return this->_nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ StagedIterator& operator++() {
+ if (iter->is_last()) {
+ iter->set_end();
+ } else {
+ ++(*iter);
+ }
+ if constexpr (!IS_BOTTOM) {
+ this->_nxt.reset();
+ }
+ return *this;
+ }
+ void reset() {
+ if (valid()) {
+ iter.reset();
+ if constexpr (!IS_BOTTOM) {
+ this->_nxt.reset();
+ }
+ }
+ }
+ std::ostream& print(std::ostream& os, bool is_top) const {
+ if (valid()) {
+ if (iter->is_end()) {
+ return os << "END";
+ } else {
+ os << index();
+ }
+ } else {
+ if (is_top) {
+ return os << "invalid StagedIterator!";
+ } else {
+ os << "0!";
+ }
+ }
+ if constexpr (!IS_BOTTOM) {
+ os << ", ";
+ return this->_nxt.print(os, false);
+ } else {
+ return os;
+ }
+ }
+ position_t get_pos() const {
+ if (valid()) {
+ if constexpr (IS_BOTTOM) {
+ return position_t{index()};
+ } else {
+ return position_t{index(), this->_nxt.get_pos()};
+ }
+ } else {
+ return position_t::begin();
+ }
+ }
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const {
+ uint8_t present = static_cast<bool>(iter);
+ ceph::encode(present, encoded);
+ if (iter.has_value()) {
+ iter->encode(p_node_start, encoded);
+ if constexpr (!IS_BOTTOM) {
+ this->_nxt.encode(p_node_start, encoded);
+ }
+ }
+ }
+ static StagedIterator decode(const char* p_node_start,
+ ceph::bufferlist::const_iterator& delta) {
+ StagedIterator ret;
+ uint8_t present;
+ ceph::decode(present, delta);
+ if (present) {
+ ret.iter = iterator_t::decode(p_node_start, delta);
+ if constexpr (!IS_BOTTOM) {
+ ret._nxt = NXT_STAGE_T::StagedIterator::decode(p_node_start, delta);
+ }
+ }
+ return ret;
+ }
+ friend std::ostream& operator<<(std::ostream& os, const StagedIterator& iter) {
+ return iter.print(os, true);
+ }
+ private:
+ std::optional<iterator_t> iter;
+ };
+
+ static bool recursively_locate_split(
+ size_t& current_size, size_t extra_size,
+ size_t target_size, StagedIterator& split_at) {
+ assert(current_size <= target_size);
+ iterator_t& split_iter = split_at.get();
+ current_size = split_iter.seek_split(current_size, extra_size, target_size);
+ assert(current_size <= target_size);
+ assert(!split_iter.is_end());
+ if (split_iter.index() == 0) {
+ extra_size += iterator_t::header_size();
+ } else {
+ extra_size = 0;
+ }
+ bool locate_nxt;
+ if constexpr (!IS_BOTTOM) {
+ locate_nxt = NXT_STAGE_T::recursively_locate_split(
+ current_size, extra_size + split_iter.size_to_nxt(),
+ target_size, split_at.nxt());
+ } else { // IS_BOTTOM
+ // located upper_bound, fair split strategy
+ size_t nxt_size = split_iter.size() + extra_size;
+ assert(current_size + nxt_size > target_size);
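+      // e.g. (illustrative numbers): with target_size=100, current_size=90
+      // and nxt_size=30, 90 + 30/2 = 105 > 100, so the next slot is excluded;
+      // with nxt_size=16, 90 + 16/2 = 98 < 100, so it is included even though
+      // the resulting 106 slightly overshoots target_size.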
+ if (current_size + nxt_size/2 < target_size) {
+ // include next
+ current_size += nxt_size;
+ locate_nxt = true;
+ } else {
+ // exclude next
+ locate_nxt = false;
+ }
+ }
+ if (locate_nxt) {
+ if (split_iter.is_last()) {
+ return true;
+ } else {
+ ++split_at;
+ return false;
+ }
+ } else {
+ return false;
+ }
+ }
+
+ static bool recursively_locate_split_inserted(
+ size_t& current_size, size_t extra_size, size_t target_size,
+ position_t& insert_pos, match_stage_t insert_stage, size_t insert_size,
+ std::optional<bool>& is_insert_left, StagedIterator& split_at) {
+ assert(current_size <= target_size);
+ assert(!is_insert_left.has_value());
+ iterator_t& split_iter = split_at.get();
+ auto& insert_index = insert_pos.index;
+ if (insert_stage == STAGE) {
+ current_size = split_iter.template seek_split_inserted<true>(
+ current_size, extra_size, target_size,
+ insert_index, insert_size, is_insert_left);
+ assert(is_insert_left.has_value());
+ assert(current_size <= target_size);
+ if (split_iter.index() == 0) {
+ if (insert_index == 0) {
+ if (*is_insert_left == false) {
+ extra_size += iterator_t::header_size();
+ } else {
+ extra_size = 0;
+ }
+ } else {
+ extra_size += iterator_t::header_size();
+ }
+ } else {
+ extra_size = 0;
+ }
+ if (*is_insert_left == false && split_iter.index() == insert_index) {
+ // split_iter can be end
+ // found the lower-bound of target_size
+ // ...[s_index-1] |!| (i_index) [s_index]...
+
+ // located upper-bound, fair split strategy
+ // look at the next slot (the insert item)
+ size_t nxt_size = insert_size + extra_size;
+ assert(current_size + nxt_size > target_size);
+ if (current_size + nxt_size/2 < target_size) {
+ // include next
+ *is_insert_left = true;
+ current_size += nxt_size;
+ if (split_iter.is_end()) {
+ // ...[s_index-1] (i_index) |!|
+ return true;
+ } else {
+ return false;
+ }
+ } else {
+ // exclude next
+ return false;
+ }
+ } else {
+ // Already considered insert effect in the current stage.
+ // Look into the next stage to identify the target_size lower-bound w/o
+ // insert effect.
+ assert(!split_iter.is_end());
+ bool locate_nxt;
+ if constexpr (!IS_BOTTOM) {
+ locate_nxt = NXT_STAGE_T::recursively_locate_split(
+ current_size, extra_size + split_iter.size_to_nxt(),
+ target_size, split_at.nxt());
+ } else { // IS_BOTTOM
+ // located upper-bound, fair split strategy
+ // look at the next slot
+ size_t nxt_size = split_iter.size() + extra_size;
+ assert(current_size + nxt_size > target_size);
+ if (current_size + nxt_size/2 < target_size) {
+ // include next
+ current_size += nxt_size;
+ locate_nxt = true;
+ } else {
+ // exclude next
+ locate_nxt = false;
+ }
+ }
+ if (locate_nxt) {
+ if (split_iter.is_last()) {
+ auto end_index = split_iter.index() + 1;
+ if (insert_index == INDEX_END) {
+ insert_index = end_index;
+ }
+ assert(insert_index <= end_index);
+ if (insert_index == end_index) {
+ assert(*is_insert_left == false);
+ split_iter.set_end();
+ // ...[s_index-1] |!| (i_index)
+ return false;
+ } else {
+ assert(*is_insert_left == true);
+ return true;
+ }
+ } else {
+ ++split_at;
+ return false;
+ }
+ } else {
+ return false;
+ }
+ }
+ } else {
+ if constexpr (!IS_BOTTOM) {
+ assert(insert_stage < STAGE);
+ current_size = split_iter.template seek_split_inserted<false>(
+ current_size, extra_size, target_size,
+ insert_index, insert_size, is_insert_left);
+ assert(!split_iter.is_end());
+ assert(current_size <= target_size);
+ if (split_iter.index() == 0) {
+ extra_size += iterator_t::header_size();
+ } else {
+ extra_size = 0;
+ }
+ bool locate_nxt;
+ if (!is_insert_left.has_value()) {
+ // Considered insert effect in the current stage, and insert happens
+ // in the lower stage.
+ // Look into the next stage to identify the target_size lower-bound w/
+ // insert effect.
+ assert(split_iter.index() == insert_index);
+ locate_nxt = NXT_STAGE_T::recursively_locate_split_inserted(
+ current_size, extra_size + split_iter.size_to_nxt(), target_size,
+ insert_pos.nxt, insert_stage, insert_size,
+ is_insert_left, split_at.nxt());
+ assert(is_insert_left.has_value());
+#ifndef NDEBUG
+ if (locate_nxt) {
+ assert(*is_insert_left == true);
+ }
+#endif
+ } else {
+ // is_insert_left.has_value() == true
+ // Insert will *not* happen in the lower stage.
+ // Need to look into the next stage to identify the target_size
+ // lower-bound w/ insert effect
+ assert(split_iter.index() != insert_index);
+ locate_nxt = NXT_STAGE_T::recursively_locate_split(
+ current_size, extra_size + split_iter.size_to_nxt(),
+ target_size, split_at.nxt());
+#ifndef NDEBUG
+ if (split_iter.index() < insert_index) {
+ assert(*is_insert_left == false);
+ } else {
+ assert(*is_insert_left == true);
+ }
+#endif
+ }
+ if (locate_nxt) {
+ if (split_iter.is_last()) {
+ return true;
+ } else {
+ ++split_at;
+ return false;
+ }
+ } else {
+ return false;
+ }
+ } else {
+ ceph_abort("impossible path");
+        return false;
+ }
+ }
+ }
+
+ /*
+ * container appender type system
+ * container_t::Appender(NodeExtentMutable& mut, char* p_append)
+ * append(const container_t& src, index_t from, index_t items)
+ * wrap() -> char*
+ * IF !IS_BOTTOM:
+ * open_nxt(const key_get_type&)
+ * open_nxt(const full_key_t&)
+ * -> std::tuple<NodeExtentMutable&, char*>
+ * wrap_nxt(char* p_append)
+ * ELSE
+ * append(const full_key_t& key, const value_t& value)
+ */
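+  /*
+   * Illustrative sketch (assuming `mut`, `p_start`, `src_iter`, `key` and
+   * `value` are already set up; they are hypothetical here): copying a range
+   * and then appending one new item at the bottom stage could look like:
+   *
+   *   StagedAppender<KT> appender;
+   *   appender.init(&mut, p_start);
+   *   index_t to_index = INDEX_END;
+   *   appender.append_until(src_iter, to_index);  // copy out remaining items
+   *   const value_t* p_value = nullptr;
+   *   appender.append(key, value, p_value);       // then append the new item
+   *   const char* p_append_front = appender.wrap();
+   *
+   * insert_new() below uses the init/append/wrap part of this; the split
+   * paths combine it with append_until()/append_insert().
+   */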
+ template <KeyT KT>
+ struct _BaseWithNxtAppender {
+ typename NXT_STAGE_T::template StagedAppender<KT> _nxt;
+ };
+ template <KeyT KT>
+ class StagedAppender
+ : std::conditional_t<IS_BOTTOM, _BaseEmpty, _BaseWithNxtAppender<KT>> {
+ public:
+ StagedAppender() = default;
+ ~StagedAppender() {
+ assert(!require_wrap_nxt);
+ assert(!valid());
+ }
+ bool valid() const { return appender.has_value(); }
+ index_t index() const {
+ assert(valid());
+ return _index;
+ }
+ bool in_progress() const { return require_wrap_nxt; }
+ // TODO: pass by reference
+ void init(NodeExtentMutable* p_mut, char* p_start) {
+ assert(!valid());
+ appender = typename container_t::template Appender<KT>(p_mut, p_start);
+ _index = 0;
+ }
+ // possible to make src_iter end if to_index == INDEX_END
+ void append_until(StagedIterator& src_iter, index_t& to_index) {
+ assert(!require_wrap_nxt);
+ auto s_index = src_iter.index();
+ src_iter.get().template copy_out_until<KT>(*appender, to_index);
+ assert(src_iter.index() == to_index);
+ assert(to_index >= s_index);
+ auto increment = (to_index - s_index);
+ if (increment) {
+ _index += increment;
+ if constexpr (!IS_BOTTOM) {
+ src_iter.get_nxt().reset();
+ }
+ }
+ }
+ void append(const full_key_t<KT>& key,
+ const value_t& value, const value_t*& p_value) {
+ assert(!require_wrap_nxt);
+ if constexpr (!IS_BOTTOM) {
+ auto& nxt = open_nxt(key);
+ nxt.append(key, value, p_value);
+ wrap_nxt();
+ } else {
+ appender->append(key, value, p_value);
+ ++_index;
+ }
+ }
+ char* wrap() {
+ assert(valid());
+ assert(_index > 0);
+ if constexpr (!IS_BOTTOM) {
+ if (require_wrap_nxt) {
+ wrap_nxt();
+ }
+ }
+ auto ret = appender->wrap();
+ appender.reset();
+ return ret;
+ }
+ typename NXT_STAGE_T::template StagedAppender<KT>&
+    open_nxt(key_get_type partial_key) {
+ assert(!require_wrap_nxt);
+ if constexpr (!IS_BOTTOM) {
+ require_wrap_nxt = true;
+        auto [p_mut, p_append] = appender->open_nxt(partial_key);
+ this->_nxt.init(p_mut, p_append);
+ return this->_nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ typename NXT_STAGE_T::template StagedAppender<KT>&
+ open_nxt(const full_key_t<KT>& key) {
+ assert(!require_wrap_nxt);
+ if constexpr (!IS_BOTTOM) {
+ require_wrap_nxt = true;
+ auto [p_mut, p_append] = appender->open_nxt(key);
+ this->_nxt.init(p_mut, p_append);
+ return this->_nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ typename NXT_STAGE_T::template StagedAppender<KT>& get_nxt() {
+ if constexpr (!IS_BOTTOM) {
+ assert(require_wrap_nxt);
+ return this->_nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ void wrap_nxt() {
+ if constexpr (!IS_BOTTOM) {
+ assert(require_wrap_nxt);
+ require_wrap_nxt = false;
+ auto p_append = this->_nxt.wrap();
+ appender->wrap_nxt(p_append);
+ ++_index;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ private:
+ std::optional<typename container_t::template Appender<KT>> appender;
+ index_t _index;
+ bool require_wrap_nxt = false;
+ };
+
+ template <KeyT KT>
+ static void _append_range(
+ StagedIterator& src_iter, StagedAppender<KT>& appender, index_t& to_index) {
+ if (src_iter.is_end()) {
+ // append done
+ assert(to_index == INDEX_END);
+ to_index = src_iter.index();
+ } else if constexpr (!IS_BOTTOM) {
+ if (appender.in_progress()) {
+ // appender has appended something at the current item,
+ // cannot append the current item as-a-whole
+ index_t to_index_nxt = INDEX_END;
+ NXT_STAGE_T::template _append_range<KT>(
+ src_iter.nxt(), appender.get_nxt(), to_index_nxt);
+ ++src_iter;
+ appender.wrap_nxt();
+ } else if (src_iter.in_progress()) {
+ // src_iter is not at the beginning of the current item,
+ // cannot append the current item as-a-whole
+ index_t to_index_nxt = INDEX_END;
+ NXT_STAGE_T::template _append_range<KT>(
+ src_iter.nxt(), appender.open_nxt(src_iter.get_key()), to_index_nxt);
+ ++src_iter;
+ appender.wrap_nxt();
+ } else {
+ // we can safely append the current item as-a-whole
+ }
+ }
+ appender.append_until(src_iter, to_index);
+ }
+
+ template <KeyT KT>
+ static void _append_into(StagedIterator& src_iter, StagedAppender<KT>& appender,
+ position_t& position, match_stage_t stage) {
+ assert(position.index == src_iter.index());
+ // reaches the last item
+ if (stage == STAGE) {
+ // done, end recursion
+ if constexpr (!IS_BOTTOM) {
+ position.nxt = position_t::nxt_t::begin();
+ }
+ } else {
+ assert(stage < STAGE);
+ // proceed append in the next stage
+ NXT_STAGE_T::template append_until<KT>(
+ src_iter.nxt(), appender.open_nxt(src_iter.get_key()),
+ position.nxt, stage);
+ }
+ }
+
+ template <KeyT KT>
+ static void append_until(StagedIterator& src_iter, StagedAppender<KT>& appender,
+ position_t& position, match_stage_t stage) {
+ index_t from_index = src_iter.index();
+ index_t& to_index = position.index;
+ assert(from_index <= to_index);
+ if constexpr (IS_BOTTOM) {
+ assert(stage == STAGE);
+ appender.append_until(src_iter, to_index);
+ } else {
+ assert(stage <= STAGE);
+ if (src_iter.index() == to_index) {
+ _append_into<KT>(src_iter, appender, position, stage);
+ } else {
+ if (to_index == INDEX_END) {
+ assert(stage == STAGE);
+ } else if (to_index == INDEX_LAST) {
+ assert(stage < STAGE);
+ }
+ _append_range<KT>(src_iter, appender, to_index);
+ _append_into<KT>(src_iter, appender, position, stage);
+ }
+ }
+ to_index -= from_index;
+ }
+
+ template <KeyT KT>
+ static bool append_insert(
+ const full_key_t<KT>& key, const value_t& value,
+ StagedIterator& src_iter, StagedAppender<KT>& appender,
+ bool is_front_insert, match_stage_t& stage, const value_t*& p_value) {
+ assert(src_iter.valid());
+ if (stage == STAGE) {
+ appender.append(key, value, p_value);
+ if (src_iter.is_end()) {
+ return true;
+ } else {
+ return false;
+ }
+ } else {
+ assert(stage < STAGE);
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_is_end = NXT_STAGE_T::template append_insert<KT>(
+ key, value, src_iter.get_nxt(), appender.get_nxt(),
+ is_front_insert, stage, p_value);
+ if (nxt_is_end) {
+ appender.wrap_nxt();
+ ++src_iter;
+ if (is_front_insert) {
+ stage = STAGE;
+ }
+ if (src_iter.is_end()) {
+ return true;
+ }
+ }
+ return false;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ }
+
+  /* TrimType:
+   * BEFORE: remove the entire container, which normally means the
+   *   corresponding higher-stage iterator needs to be trimmed as a whole.
+   * AFTER: retain the entire container, which normally means the trim should
+   *   start from the next iterator at the higher stage.
+   * AT: the trim happens inside the current container, and the corresponding
+   *   higher-stage iterator needs to be adjusted by the trimmed size.
+   */
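+  // For example, at the bottom stage an iterator at index 0 reports
+  // TrimType::BEFORE (the whole container goes away and the containing
+  // higher-stage iterator is trimmed instead), while an iterator at index 2
+  // keeps items 0-1, removes the rest, and reports TrimType::AT together
+  // with the trimmed size.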
+ static std::tuple<TrimType, node_offset_t>
+ recursively_trim(NodeExtentMutable& mut, StagedIterator& trim_at) {
+ if (!trim_at.valid()) {
+ return {TrimType::BEFORE, 0u};
+ }
+ if (trim_at.is_end()) {
+ return {TrimType::AFTER, 0u};
+ }
+
+ auto& iter = trim_at.get();
+ if constexpr (!IS_BOTTOM) {
+ auto [type, trimmed] = NXT_STAGE_T::recursively_trim(
+ mut, trim_at.get_nxt());
+ node_offset_t trim_size;
+ if (type == TrimType::AFTER) {
+ if (iter.is_last()) {
+ return {TrimType::AFTER, 0u};
+ }
+ ++trim_at;
+ trim_size = iter.trim_until(mut);
+ } else if (type == TrimType::BEFORE) {
+ if (iter.index() == 0) {
+ return {TrimType::BEFORE, 0u};
+ }
+ trim_size = iter.trim_until(mut);
+ } else {
+ trim_size = iter.trim_at(mut, trimmed);
+ }
+ return {TrimType::AT, trim_size};
+ } else {
+ if (iter.index() == 0) {
+ return {TrimType::BEFORE, 0u};
+ } else {
+ auto trimmed = iter.trim_until(mut);
+ return {TrimType::AT, trimmed};
+ }
+ }
+ }
+
+ static void trim(NodeExtentMutable& mut, StagedIterator& trim_at) {
+ auto [type, trimmed] = recursively_trim(mut, trim_at);
+ if (type == TrimType::BEFORE) {
+ assert(trim_at.valid());
+ auto& iter = trim_at.get();
+ iter.trim_until(mut);
+ }
+ }
+};
+
+/**
+ * Configurations for struct staged
+ *
+ * staged_params_* assembles different container_t implementations (defined by
+ * staged::_iterator_t) by STAGE, and constructs the final multi-stage
+ * implementations for different node layouts defined by
+ * node_extent_t<FieldType, NODE_TYPE>.
+ *
+ * The specialized implementations for different layouts are accessible through
+ * the helper type node_to_stage_t<node_extent_t<FieldType, NODE_TYPE>>.
+ *
+ * Specifically, the settings of 8 layouts are:
+ *
+ * The layout (N0, LEAF/INTERNAL) has 3 stages:
+ * - STAGE_LEFT: node_extent_t<node_fields_0_t, LEAF/INTERNAL>
+ * - STAGE_STRING: item_iterator_t<LEAF/INTERNAL>
+ * - STAGE_RIGHT: sub_items_t<LEAF/INTERNAL>
+ *
+ * The layout (N1, LEAF/INTERNAL) has 3 stages:
+ * - STAGE_LEFT: node_extent_t<node_fields_1_t, LEAF/INTERNAL>
+ * - STAGE_STRING: item_iterator_t<LEAF/INTERNAL>
+ * - STAGE_RIGHT: sub_items_t<LEAF/INTERNAL>
+ *
+ * The layout (N2, LEAF/INTERNAL) has 2 stages:
+ * - STAGE_STRING: node_extent_t<node_fields_2_t, LEAF/INTERNAL>
+ * - STAGE_RIGHT: sub_items_t<LEAF/INTERNAL>
+ *
+ * The layout (N3, LEAF) has 1 stage:
+ * - STAGE_RIGHT: node_extent_t<leaf_fields_3_t, LEAF>
+ *
+ * The layout (N3, INTERNAL) has 1 stage:
+ * - STAGE_RIGHT: node_extent_t<internal_fields_3_t, INTERNAL>
+ */
+
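+// For example (per the table above), node_to_stage_t over a
+// node_extent_t<node_fields_0_t, node_type_t::LEAF> resolves to
+// staged<staged_params_node_01<...>>, whose chained next_param_t types walk
+// item_iterator_t (STAGE_STRING) and then sub_items_t (STAGE_RIGHT).
+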
+template <node_type_t _NODE_TYPE>
+struct staged_params_subitems {
+ using container_t = sub_items_t<_NODE_TYPE>;
+ static constexpr auto NODE_TYPE = _NODE_TYPE;
+ static constexpr auto STAGE = STAGE_RIGHT;
+
+ // dummy type in order to make our type system work
+ // any better solution to get rid of this?
+ using next_param_t = staged_params_subitems<NODE_TYPE>;
+};
+
+template <node_type_t _NODE_TYPE>
+struct staged_params_item_iterator {
+ using container_t = item_iterator_t<_NODE_TYPE>;
+ static constexpr auto NODE_TYPE = _NODE_TYPE;
+ static constexpr auto STAGE = STAGE_STRING;
+
+ using next_param_t = staged_params_subitems<NODE_TYPE>;
+};
+
+template <typename NodeType>
+struct staged_params_node_01 {
+ using container_t = NodeType;
+ static constexpr auto NODE_TYPE = NodeType::NODE_TYPE;
+ static constexpr auto STAGE = STAGE_LEFT;
+
+ using next_param_t = staged_params_item_iterator<NODE_TYPE>;
+};
+
+template <typename NodeType>
+struct staged_params_node_2 {
+ using container_t = NodeType;
+ static constexpr auto NODE_TYPE = NodeType::NODE_TYPE;
+ static constexpr auto STAGE = STAGE_STRING;
+
+ using next_param_t = staged_params_subitems<NODE_TYPE>;
+};
+
+template <typename NodeType>
+struct staged_params_node_3 {
+ using container_t = NodeType;
+ static constexpr auto NODE_TYPE = NodeType::NODE_TYPE;
+ static constexpr auto STAGE = STAGE_RIGHT;
+
+ // dummy type in order to make our type system work
+ // any better solution to get rid of this?
+ using next_param_t = staged_params_node_3<NodeType>;
+};
+
+template <typename NodeType, typename Enable = void> struct _node_to_stage_t;
+template <typename NodeType>
+struct _node_to_stage_t<NodeType,
+ std::enable_if_t<NodeType::FIELD_TYPE == field_type_t::N0 ||
+ NodeType::FIELD_TYPE == field_type_t::N1>> {
+ using type = staged<staged_params_node_01<NodeType>>;
+};
+template <typename NodeType>
+struct _node_to_stage_t<NodeType,
+ std::enable_if_t<NodeType::FIELD_TYPE == field_type_t::N2>> {
+ using type = staged<staged_params_node_2<NodeType>>;
+};
+template <typename NodeType>
+struct _node_to_stage_t<NodeType,
+ std::enable_if_t<NodeType::FIELD_TYPE == field_type_t::N3>> {
+ using type = staged<staged_params_node_3<NodeType>>;
+};
+template <typename NodeType>
+using node_to_stage_t = typename _node_to_stage_t<NodeType>::type;
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h
new file mode 100644
index 000000000..a9d5cef3b
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h
@@ -0,0 +1,411 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <optional>
+#include <ostream>
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/fwd.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/tree_types.h"
+
+namespace crimson::os::seastore::onode {
+
+using match_stage_t = int8_t;
+constexpr match_stage_t STAGE_LEFT = 2; // shard/pool/crush
+constexpr match_stage_t STAGE_STRING = 1; // nspace/oid
+constexpr match_stage_t STAGE_RIGHT = 0; // snap/gen
+constexpr auto STAGE_TOP = STAGE_LEFT;
+constexpr auto STAGE_BOTTOM = STAGE_RIGHT;
+constexpr bool is_valid_stage(match_stage_t stage) {
+ return std::clamp(stage, STAGE_BOTTOM, STAGE_TOP) == stage;
+}
+// TODO: replace by
+// using match_history_t = int8_t;
+// left_m, str_m, right_m
+// 3: GT,
+// 2: EQ, GT,
+// 1: EQ, EQ, GT
+// 0: EQ, EQ, EQ
+// -1: EQ, EQ, LT
+// -2: EQ, LT,
+// -3: LT,
+
+struct MatchHistory {
+ template <match_stage_t STAGE>
+ const std::optional<MatchKindCMP>& get() const {
+ static_assert(is_valid_stage(STAGE));
+ if constexpr (STAGE == STAGE_RIGHT) {
+ return right_match;
+ } else if (STAGE == STAGE_STRING) {
+ return string_match;
+ } else {
+ return left_match;
+ }
+ }
+
+ const std::optional<MatchKindCMP>&
+ get_by_stage(match_stage_t stage) const {
+ assert(is_valid_stage(stage));
+ if (stage == STAGE_RIGHT) {
+ return right_match;
+ } else if (stage == STAGE_STRING) {
+ return string_match;
+ } else {
+ return left_match;
+ }
+ }
+
+ template <match_stage_t STAGE = STAGE_TOP>
+ const bool is_GT() const;
+
+ template <match_stage_t STAGE>
+ void set(MatchKindCMP match) {
+ static_assert(is_valid_stage(STAGE));
+ if constexpr (STAGE < STAGE_TOP) {
+ assert(*get<STAGE + 1>() == MatchKindCMP::EQ);
+ }
+ assert(!get<STAGE>().has_value() || *get<STAGE>() != MatchKindCMP::EQ);
+ const_cast<std::optional<MatchKindCMP>&>(get<STAGE>()) = match;
+ }
+
+ std::ostream& dump(std::ostream& os) const {
+ os << "history(";
+ dump_each(os, left_match) << ", ";
+ dump_each(os, string_match) << ", ";
+ dump_each(os, right_match) << ")";
+ return os;
+ }
+
+ std::ostream& dump_each(
+ std::ostream& os, const std::optional<MatchKindCMP>& match) const {
+ if (!match.has_value()) {
+ return os << "--";
+ } else if (*match == MatchKindCMP::LT) {
+ return os << "LT";
+ } else if (*match == MatchKindCMP::EQ) {
+ return os << "EQ";
+ } else if (*match == MatchKindCMP::GT) {
+ return os << "GT";
+ } else {
+ ceph_abort("impossble path");
+ }
+ }
+
+ std::optional<MatchKindCMP> left_match;
+ std::optional<MatchKindCMP> string_match;
+ std::optional<MatchKindCMP> right_match;
+};
+inline std::ostream& operator<<(std::ostream& os, const MatchHistory& pos) {
+ return pos.dump(os);
+}
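+// For example, a history where the left stage compared EQ and the string
+// stage compared GT (right stage never reached) dumps as
+// "history(EQ, GT, --)", and is_GT() evaluates to true for it.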
+
+template <match_stage_t STAGE>
+struct _check_GT_t {
+ static bool eval(const MatchHistory* history) {
+ return history->get<STAGE>() &&
+ (*history->get<STAGE>() == MatchKindCMP::GT ||
+ (*history->get<STAGE>() == MatchKindCMP::EQ &&
+ _check_GT_t<STAGE - 1>::eval(history)));
+ }
+};
+template <>
+struct _check_GT_t<STAGE_RIGHT> {
+ static bool eval(const MatchHistory* history) {
+ return history->get<STAGE_RIGHT>() &&
+ *history->get<STAGE_RIGHT>() == MatchKindCMP::GT;
+ }
+};
+template <match_stage_t STAGE>
+const bool MatchHistory::is_GT() const {
+ static_assert(is_valid_stage(STAGE));
+ if constexpr (STAGE < STAGE_TOP) {
+ assert(get<STAGE + 1>() == MatchKindCMP::EQ);
+ }
+ return _check_GT_t<STAGE>::eval(this);
+}
+
+template <match_stage_t STAGE>
+struct staged_position_t {
+ static_assert(is_valid_stage(STAGE));
+ using me_t = staged_position_t<STAGE>;
+ using nxt_t = staged_position_t<STAGE - 1>;
+ bool is_end() const {
+ if (index == INDEX_END) {
+ return true;
+ } else {
+ assert(is_valid_index(index));
+ return false;
+ }
+ }
+ index_t& index_by_stage(match_stage_t stage) {
+ assert(stage <= STAGE);
+ if (STAGE == stage) {
+ return index;
+ } else {
+ return nxt.index_by_stage(stage);
+ }
+ }
+
+ int cmp(const me_t& o) const {
+ if (index > o.index) {
+ return 1;
+ } else if (index < o.index) {
+ return -1;
+ } else {
+ return nxt.cmp(o.nxt);
+ }
+ }
+ bool operator>(const me_t& o) const { return cmp(o) > 0; }
+ bool operator>=(const me_t& o) const { return cmp(o) >= 0; }
+ bool operator<(const me_t& o) const { return cmp(o) < 0; }
+ bool operator<=(const me_t& o) const { return cmp(o) <= 0; }
+ bool operator==(const me_t& o) const { return cmp(o) == 0; }
+ bool operator!=(const me_t& o) const { return cmp(o) != 0; }
+
+ me_t& operator-=(const me_t& o) {
+ assert(is_valid_index(o.index));
+ assert(index >= o.index);
+ if (index != INDEX_END) {
+ assert(is_valid_index(index));
+ index -= o.index;
+ if (index == 0) {
+ nxt -= o.nxt;
+ }
+ }
+ return *this;
+ }
+
+ void encode(ceph::bufferlist& encoded) const {
+ ceph::encode(index, encoded);
+ nxt.encode(encoded);
+ }
+
+ static me_t decode(ceph::bufferlist::const_iterator& delta) {
+ me_t ret;
+ ceph::decode(ret.index, delta);
+ ret.nxt = nxt_t::decode(delta);
+ return ret;
+ }
+
+ static me_t begin() { return {0u, nxt_t::begin()}; }
+ static me_t end() {
+ return {INDEX_END, nxt_t::end()};
+ }
+
+ index_t index;
+ nxt_t nxt;
+};
+template <match_stage_t STAGE>
+std::ostream& operator<<(std::ostream& os, const staged_position_t<STAGE>& pos) {
+ if (pos.index == INDEX_END) {
+ os << "END";
+ } else if (pos.index == INDEX_LAST) {
+ os << "LAST";
+ } else {
+ os << pos.index;
+ assert(is_valid_index(pos.index));
+ }
+ return os << ", " << pos.nxt;
+}
+
+template <>
+struct staged_position_t<STAGE_BOTTOM> {
+ using me_t = staged_position_t<STAGE_BOTTOM>;
+ bool is_end() const {
+ if (index == INDEX_END) {
+ return true;
+ } else {
+ assert(is_valid_index(index));
+ return false;
+ }
+ }
+ index_t& index_by_stage(match_stage_t stage) {
+ assert(stage == STAGE_BOTTOM);
+ return index;
+ }
+
+ int cmp(const staged_position_t<STAGE_BOTTOM>& o) const {
+ if (index > o.index) {
+ return 1;
+ } else if (index < o.index) {
+ return -1;
+ } else {
+ return 0;
+ }
+ }
+ bool operator>(const me_t& o) const { return cmp(o) > 0; }
+ bool operator>=(const me_t& o) const { return cmp(o) >= 0; }
+ bool operator<(const me_t& o) const { return cmp(o) < 0; }
+ bool operator<=(const me_t& o) const { return cmp(o) <= 0; }
+ bool operator==(const me_t& o) const { return cmp(o) == 0; }
+ bool operator!=(const me_t& o) const { return cmp(o) != 0; }
+
+ me_t& operator-=(const me_t& o) {
+ assert(is_valid_index(o.index));
+ assert(index >= o.index);
+ if (index != INDEX_END) {
+ assert(is_valid_index(index));
+ index -= o.index;
+ }
+ return *this;
+ }
+
+ void encode(ceph::bufferlist& encoded) const {
+ ceph::encode(index, encoded);
+ }
+
+ static me_t decode(ceph::bufferlist::const_iterator& delta) {
+ me_t ret;
+ ceph::decode(ret.index, delta);
+ return ret;
+ }
+
+ static me_t begin() { return {0u}; }
+ static me_t end() { return {INDEX_END}; }
+
+ index_t index;
+};
+template <>
+inline std::ostream& operator<<(std::ostream& os, const staged_position_t<STAGE_BOTTOM>& pos) {
+ if (pos.index == INDEX_END) {
+ os << "END";
+ } else if (pos.index == INDEX_LAST) {
+ os << "LAST";
+ } else {
+ os << pos.index;
+ assert(is_valid_index(pos.index));
+ }
+ return os;
+}
+
+using search_position_t = staged_position_t<STAGE_TOP>;
+
+template <match_stage_t STAGE>
+const staged_position_t<STAGE>& cast_down(const search_position_t& pos) {
+ if constexpr (STAGE == STAGE_LEFT) {
+ return pos;
+ } else if constexpr (STAGE == STAGE_STRING) {
+#ifndef NDEBUG
+ if (pos.is_end()) {
+ assert(pos.nxt.is_end());
+ } else {
+ assert(pos.index == 0u);
+ }
+#endif
+ return pos.nxt;
+ } else if constexpr (STAGE == STAGE_RIGHT) {
+#ifndef NDEBUG
+ if (pos.is_end()) {
+ assert(pos.nxt.nxt.is_end());
+ } else {
+ assert(pos.index == 0u);
+ assert(pos.nxt.index == 0u);
+ }
+#endif
+ return pos.nxt.nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+}
+
+template <match_stage_t STAGE>
+staged_position_t<STAGE>& cast_down(search_position_t& pos) {
+ const search_position_t& _pos = pos;
+ return const_cast<staged_position_t<STAGE>&>(cast_down<STAGE>(_pos));
+}
+
+template <match_stage_t STAGE>
+staged_position_t<STAGE>& cast_down_fill_0(search_position_t& pos) {
+ if constexpr (STAGE == STAGE_LEFT) {
+ return pos;
+  } else if constexpr (STAGE == STAGE_STRING) {
+ pos.index = 0;
+ return pos.nxt;
+ } else if constexpr (STAGE == STAGE_RIGHT) {
+ pos.index = 0;
+ pos.nxt.index = 0;
+ return pos.nxt.nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+}
+
+inline search_position_t&& normalize(search_position_t&& pos) { return std::move(pos); }
+
+template <match_stage_t STAGE, typename = std::enable_if_t<STAGE != STAGE_TOP>>
+search_position_t normalize(staged_position_t<STAGE>&& pos) {
+ if (pos.is_end()) {
+ return search_position_t::end();
+ }
+ if constexpr (STAGE == STAGE_STRING) {
+ return {0u, std::move(pos)};
+  } else if constexpr (STAGE == STAGE_RIGHT) {
+ return {0u, {0u, std::move(pos)}};
+ } else {
+ ceph_abort("impossible path");
+ }
+}
+
+struct memory_range_t {
+ const char* p_start;
+ const char* p_end;
+};
+
+enum class ContainerType { ITERATIVE, INDEXABLE };
+
+template <node_type_t> struct value_type;
+template<> struct value_type<node_type_t::INTERNAL> { using type = laddr_packed_t; };
+template<> struct value_type<node_type_t::LEAF> { using type = onode_t; };
+template <node_type_t NODE_TYPE>
+using value_type_t = typename value_type<NODE_TYPE>::type;
+
+template <node_type_t NODE_TYPE, match_stage_t STAGE>
+struct staged_result_t {
+ using me_t = staged_result_t<NODE_TYPE, STAGE>;
+ bool is_end() const { return position.is_end(); }
+
+ static me_t end() {
+ return {staged_position_t<STAGE>::end(), nullptr, MSTAT_END};
+ }
+ template <typename T = me_t>
+ static std::enable_if_t<STAGE != STAGE_BOTTOM, T> from_nxt(
+ index_t index, const staged_result_t<NODE_TYPE, STAGE - 1>& nxt_stage_result) {
+ return {{index, nxt_stage_result.position},
+ nxt_stage_result.p_value,
+ nxt_stage_result.mstat};
+ }
+
+ staged_position_t<STAGE> position;
+ const value_type_t<NODE_TYPE>* p_value;
+ match_stat_t mstat;
+};
+
+template <node_type_t NODE_TYPE>
+using lookup_result_t = staged_result_t<NODE_TYPE, STAGE_TOP>;
+
+template <node_type_t NODE_TYPE>
+lookup_result_t<NODE_TYPE>&& normalize(
+ lookup_result_t<NODE_TYPE>&& result) { return std::move(result); }
+
+template <node_type_t NODE_TYPE, match_stage_t STAGE,
+ typename = std::enable_if_t<STAGE != STAGE_TOP>>
+lookup_result_t<NODE_TYPE> normalize(
+ staged_result_t<NODE_TYPE, STAGE>&& result) {
+ // FIXME: assert result.mstat correct
+ return {normalize(std::move(result.position)), result.p_value, result.mstat};
+}
+
+struct node_stats_t {
+ size_t size_persistent = 0;
+ size_t size_filled = 0;
+ // filled by staged::get_stats()
+ size_t size_logical = 0;
+ size_t size_overhead = 0;
+ size_t size_value = 0;
+ unsigned num_kvs = 0;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc
new file mode 100644
index 000000000..aaca6c3c6
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc
@@ -0,0 +1,208 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "sub_items_stage.h"
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h"
+
+namespace crimson::os::seastore::onode {
+
+template <KeyT KT>
+const laddr_packed_t* internal_sub_items_t::insert_at(
+ NodeExtentMutable& mut, const internal_sub_items_t& sub_items,
+ const full_key_t<KT>& key, const laddr_packed_t& value,
+ index_t index, node_offset_t size, const char* p_left_bound) {
+ assert(index <= sub_items.keys());
+ assert(size == estimate_insert<KT>(key, value));
+ const char* p_shift_start = p_left_bound;
+ const char* p_shift_end = reinterpret_cast<const char*>(
+ sub_items.p_first_item + 1 - index);
+ mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, -(int)size);
+
+ auto p_insert = const_cast<char*>(p_shift_end) - size;
+ auto item = internal_sub_item_t{snap_gen_t::from_key<KT>(key), value};
+ mut.copy_in_absolute(p_insert, item);
+ return &reinterpret_cast<internal_sub_item_t*>(p_insert)->value;
+}
+#define IA_TEMPLATE(KT) \
+ template const laddr_packed_t* internal_sub_items_t::insert_at<KT>( \
+ NodeExtentMutable&, const internal_sub_items_t&, const full_key_t<KT>&, \
+ const laddr_packed_t&, index_t, node_offset_t, const char*)
+IA_TEMPLATE(KeyT::VIEW);
+IA_TEMPLATE(KeyT::HOBJ);
+
+node_offset_t internal_sub_items_t::trim_until(
+ NodeExtentMutable&, internal_sub_items_t& items, index_t index) {
+ assert(index != 0);
+ auto keys = items.keys();
+ assert(index <= keys);
+ size_t ret = sizeof(internal_sub_item_t) * (keys - index);
+ assert(ret < NODE_BLOCK_SIZE);
+ return ret;
+}
+
+template <KeyT KT>
+void internal_sub_items_t::Appender<KT>::append(
+ const internal_sub_items_t& src, index_t from, index_t items) {
+ assert(from <= src.keys());
+ if (items == 0) {
+ return;
+ }
+ assert(from < src.keys());
+ assert(from + items <= src.keys());
+ node_offset_t size = sizeof(internal_sub_item_t) * items;
+ p_append -= size;
+ p_mut->copy_in_absolute(p_append, src.p_first_item + 1 - from - items, size);
+}
+
+template <KeyT KT>
+void internal_sub_items_t::Appender<KT>::append(
+ const full_key_t<KT>& key, const laddr_packed_t& value,
+ const laddr_packed_t*& p_value) {
+ p_append -= sizeof(internal_sub_item_t);
+ auto item = internal_sub_item_t{snap_gen_t::from_key<KT>(key), value};
+ p_mut->copy_in_absolute(p_append, item);
+ p_value = &reinterpret_cast<internal_sub_item_t*>(p_append)->value;
+}
+
+template <KeyT KT>
+const onode_t* leaf_sub_items_t::insert_at(
+ NodeExtentMutable& mut, const leaf_sub_items_t& sub_items,
+ const full_key_t<KT>& key, const onode_t& value,
+ index_t index, node_offset_t size, const char* p_left_bound) {
+ assert(index <= sub_items.keys());
+ assert(size == estimate_insert<KT>(key, value));
+ // a. [... item(index)] << size
+ const char* p_shift_start = p_left_bound;
+ const char* p_shift_end = sub_items.get_item_end(index);
+ mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, -(int)size);
+
+ // b. insert item
+ auto p_insert = const_cast<char*>(p_shift_end - size);
+ auto p_value = reinterpret_cast<const onode_t*>(p_insert);
+ mut.copy_in_absolute(p_insert, &value, value.size);
+ p_insert += value.size;
+ mut.copy_in_absolute(p_insert, snap_gen_t::template from_key<KT>(key));
+ assert(p_insert + sizeof(snap_gen_t) + sizeof(node_offset_t) == p_shift_end);
+
+ // c. compensate affected offsets
+ auto item_size = value.size + sizeof(snap_gen_t);
+ for (auto i = index; i < sub_items.keys(); ++i) {
+ const node_offset_packed_t& offset_i = sub_items.get_offset(i);
+ mut.copy_in_absolute((void*)&offset_i, node_offset_t(offset_i.value + item_size));
+ }
+
+ // d. [item(index-1) ... item(0) ... offset(index)] <<< sizeof(node_offset_t)
+ const char* p_offset = (index == 0 ?
+ (const char*)&sub_items.get_offset(0) + sizeof(node_offset_t) :
+ (const char*)&sub_items.get_offset(index - 1));
+ p_shift_start = p_shift_end;
+ p_shift_end = p_offset;
+ mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, -(int)sizeof(node_offset_t));
+
+ // e. insert offset
+ node_offset_t offset_to_item_start = item_size + sub_items.get_offset_to_end(index);
+ mut.copy_in_absolute(
+ const_cast<char*>(p_shift_end) - sizeof(node_offset_t), offset_to_item_start);
+
+ // f. update num_sub_keys
+ mut.copy_in_absolute((void*)sub_items.p_num_keys, num_keys_t(sub_items.keys() + 1));
+
+ return p_value;
+}
+template const onode_t* leaf_sub_items_t::insert_at<KeyT::HOBJ>(
+ NodeExtentMutable&, const leaf_sub_items_t&, const full_key_t<KeyT::HOBJ>&,
+ const onode_t&, index_t, node_offset_t, const char*);
+
+node_offset_t leaf_sub_items_t::trim_until(
+ NodeExtentMutable& mut, leaf_sub_items_t& items, index_t index) {
+ assert(index != 0);
+ auto keys = items.keys();
+ assert(index <= keys);
+ if (index == keys) {
+ return 0;
+ }
+ index_t trim_items = keys - index;
+ const char* p_items_start = items.p_start();
+ const char* p_shift_start = items.get_item_end(index);
+ const char* p_shift_end = items.get_item_end(0);
+ size_t size_trim_offsets = sizeof(node_offset_t) * trim_items;
+ mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start,
+ size_trim_offsets);
+ mut.copy_in_absolute((void*)items.p_num_keys, num_keys_t(index));
+ size_t ret = size_trim_offsets + (p_shift_start - p_items_start);
+ assert(ret < NODE_BLOCK_SIZE);
+ return ret;
+}
+
+template class internal_sub_items_t::Appender<KeyT::VIEW>;
+template class internal_sub_items_t::Appender<KeyT::HOBJ>;
+
+// helper type for the visitor
+template<class... Ts> struct overloaded : Ts... { using Ts::operator()...; };
+// explicit deduction guide
+template<class... Ts> overloaded(Ts...) -> overloaded<Ts...>;
+
+template <KeyT KT>
+char* leaf_sub_items_t::Appender<KT>::wrap() {
+ auto p_cur = p_append;
+ num_keys_t num_keys = 0;
+ for (auto i = 0u; i < cnt; ++i) {
+ auto& a = appends[i];
+ std::visit(overloaded {
+ [&] (const range_items_t& arg) { num_keys += arg.items; },
+ [&] (const kv_item_t& arg) { ++num_keys; }
+ }, a);
+ }
+ assert(num_keys);
+ p_cur -= sizeof(num_keys_t);
+ p_mut->copy_in_absolute(p_cur, num_keys);
+
+ node_offset_t last_offset = 0;
+ for (auto i = 0u; i < cnt; ++i) {
+ auto& a = appends[i];
+ std::visit(overloaded {
+ [&] (const range_items_t& arg) {
+ int compensate = (last_offset - op_src->get_offset_to_end(arg.from));
+ node_offset_t offset;
+ for (auto i = arg.from; i < arg.from + arg.items; ++i) {
+ offset = op_src->get_offset(i).value + compensate;
+ p_cur -= sizeof(node_offset_t);
+ p_mut->copy_in_absolute(p_cur, offset);
+ }
+ last_offset = offset;
+ },
+ [&] (const kv_item_t& arg) {
+ last_offset += sizeof(snap_gen_t) + arg.p_value->size;
+ p_cur -= sizeof(node_offset_t);
+ p_mut->copy_in_absolute(p_cur, last_offset);
+ }
+ }, a);
+ }
+
+ for (auto i = 0u; i < cnt; ++i) {
+ auto& a = appends[i];
+ std::visit(overloaded {
+ [&] (const range_items_t& arg) {
+ auto _p_start = op_src->get_item_end(arg.from + arg.items);
+ size_t _len = op_src->get_item_end(arg.from) - _p_start;
+ p_cur -= _len;
+ p_mut->copy_in_absolute(p_cur, _p_start, _len);
+ },
+ [&] (const kv_item_t& arg) {
+ assert(pp_value);
+ p_cur -= sizeof(snap_gen_t);
+ p_mut->copy_in_absolute(p_cur, snap_gen_t::template from_key<KT>(*arg.p_key));
+ p_cur -= arg.p_value->size;
+ p_mut->copy_in_absolute(p_cur, arg.p_value, arg.p_value->size);
+ *pp_value = reinterpret_cast<const onode_t*>(p_cur);
+ }
+ }, a);
+ }
+ return p_cur;
+}
+
+template class leaf_sub_items_t::Appender<KeyT::VIEW>;
+template class leaf_sub_items_t::Appender<KeyT::HOBJ>;
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h
new file mode 100644
index 000000000..8ef5f7472
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h
@@ -0,0 +1,341 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <variant>
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h"
+#include "key_layout.h"
+#include "stage_types.h"
+
+namespace crimson::os::seastore::onode {
+
+class NodeExtentMutable;
+
+struct internal_sub_item_t {
+ const snap_gen_t& get_key() const { return key; }
+ const laddr_packed_t* get_p_value() const { return &value; }
+
+ snap_gen_t key;
+ laddr_packed_t value;
+} __attribute__((packed));
+
+/**
+ * internal_sub_items_t
+ *
+ * The STAGE_RIGHT implementation for internal node N0/N1/N2. It implements
+ * the staged contract as an indexable container that maps snap-gen to child
+ * node addresses.
+ *
+ * The layout of the container storing n sub-items:
+ *
+ * # <--------- container range -----------> #
+ * #<~># sub-items [2, n) #
+ * # # <- sub-item 1 -> # <- sub-item 0 -> #
+ * #...# snap-gen | laddr # snap-gen | laddr #
+ * ^
+ * |
+ * p_first_item +
+ */
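+// Illustrative read path (a minimal sketch; the real callers are the staged
+// lookup/insert algorithms, and `range` is a hypothetical memory_range_t):
+//
+//   internal_sub_items_t sub_items(range);
+//   for (index_t i = 0; i < sub_items.keys(); ++i) {
+//     const snap_gen_t& key = sub_items[i];                      // key of sub-item i
+//     const laddr_packed_t* p_child = sub_items.get_p_value(i);  // child address
+//   }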
+class internal_sub_items_t {
+ public:
+ using num_keys_t = index_t;
+
+ internal_sub_items_t(const memory_range_t& range) {
+ assert(range.p_start < range.p_end);
+ assert((range.p_end - range.p_start) % sizeof(internal_sub_item_t) == 0);
+ num_items = (range.p_end - range.p_start) / sizeof(internal_sub_item_t);
+ assert(num_items > 0);
+ auto _p_first_item = range.p_end - sizeof(internal_sub_item_t);
+ p_first_item = reinterpret_cast<const internal_sub_item_t*>(_p_first_item);
+ }
+
+ // container type system
+ using key_get_type = const snap_gen_t&;
+ static constexpr auto CONTAINER_TYPE = ContainerType::INDEXABLE;
+ num_keys_t keys() const { return num_items; }
+ key_get_type operator[](index_t index) const {
+ assert(index < num_items);
+ return (p_first_item - index)->get_key();
+ }
+ node_offset_t size_before(index_t index) const {
+ size_t ret = index * sizeof(internal_sub_item_t);
+ assert(ret < NODE_BLOCK_SIZE);
+ return ret;
+ }
+ const laddr_packed_t* get_p_value(index_t index) const {
+ assert(index < num_items);
+ return (p_first_item - index)->get_p_value();
+ }
+ node_offset_t size_overhead_at(index_t index) const { return 0u; }
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const {
+ auto p_end = reinterpret_cast<const char*>(p_first_item) +
+ sizeof(internal_sub_item_t);
+ auto p_start = p_end - num_items * sizeof(internal_sub_item_t);
+ int start_offset = p_start - p_node_start;
+ int end_offset = p_end - p_node_start;
+ assert(start_offset > 0 &&
+ start_offset < end_offset &&
+ end_offset < NODE_BLOCK_SIZE);
+ ceph::encode(static_cast<node_offset_t>(start_offset), encoded);
+ ceph::encode(static_cast<node_offset_t>(end_offset), encoded);
+ }
+
+ static internal_sub_items_t decode(
+ const char* p_node_start, ceph::bufferlist::const_iterator& delta) {
+ node_offset_t start_offset;
+ ceph::decode(start_offset, delta);
+ node_offset_t end_offset;
+ ceph::decode(end_offset, delta);
+ assert(start_offset < end_offset);
+ assert(end_offset <= NODE_BLOCK_SIZE);
+ return internal_sub_items_t({p_node_start + start_offset,
+ p_node_start + end_offset});
+ }
+
+ static node_offset_t header_size() { return 0u; }
+
+ template <KeyT KT>
+ static node_offset_t estimate_insert(
+ const full_key_t<KT>&, const laddr_packed_t&) {
+ return sizeof(internal_sub_item_t);
+ }
+
+ template <KeyT KT>
+ static const laddr_packed_t* insert_at(
+ NodeExtentMutable&, const internal_sub_items_t&,
+ const full_key_t<KT>&, const laddr_packed_t&,
+ index_t index, node_offset_t size, const char* p_left_bound);
+
+ static node_offset_t trim_until(NodeExtentMutable&, internal_sub_items_t&, index_t);
+
+ template <KeyT KT>
+ class Appender;
+
+ private:
+ index_t num_items;
+ const internal_sub_item_t* p_first_item;
+};
+
+template <KeyT KT>
+class internal_sub_items_t::Appender {
+ public:
+ Appender(NodeExtentMutable* p_mut, char* p_append)
+ : p_mut{p_mut}, p_append{p_append} {}
+ void append(const internal_sub_items_t& src, index_t from, index_t items);
+ void append(const full_key_t<KT>&, const laddr_packed_t&, const laddr_packed_t*&);
+ char* wrap() { return p_append; }
+ private:
+ NodeExtentMutable* p_mut;
+ char* p_append;
+};
+
+/**
+ * leaf_sub_items_t
+ *
+ * The STAGE_RIGHT implementation for leaf node N0/N1/N2. It implements the
+ * staged contract as an indexable container that maps snap-gen to onode_t.
+ *
+ * The layout of the container storing n sub-items:
+ *
+ * # <------------------------ container range -------------------------------> #
+ * # <---------- sub-items ----------------> # <--- offsets ---------# #
+ * #<~># sub-items [2, n) #<~>| offsets [2, n) # #
+ * # # <- sub-item 1 -> # <- sub-item 0 -> # | # #
+ * #...# snap-gen | onode # snap-gen | onode #...| offset1 | offset0 # num_keys #
+ * ^ ^ ^
+ * | | |
+ * p_items_end + p_offsets + |
+ * p_num_keys +
+ */
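+// Offset arithmetic implied by the layout above (a sketch):
+//
+//   get_offset(i).value == accumulated size of sub-items [0, i], measured
+//                          backwards from p_items_end;
+//   get_item_start(i)   == p_items_end - get_offset(i).value;
+//   get_item_end(i)     == p_items_end - get_offset_to_end(i),
+//                          which equals get_item_start(i - 1) when i > 0.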
+class leaf_sub_items_t {
+ public:
+ // TODO: decide by NODE_BLOCK_SIZE, sizeof(snap_gen_t),
+ // and the minimal size of onode_t
+ using num_keys_t = uint8_t;
+
+ leaf_sub_items_t(const memory_range_t& range) {
+ assert(range.p_start < range.p_end);
+ auto _p_num_keys = range.p_end - sizeof(num_keys_t);
+ assert(range.p_start < _p_num_keys);
+ p_num_keys = reinterpret_cast<const num_keys_t*>(_p_num_keys);
+ assert(keys());
+ auto _p_offsets = _p_num_keys - sizeof(node_offset_t);
+ assert(range.p_start < _p_offsets);
+ p_offsets = reinterpret_cast<const node_offset_packed_t*>(_p_offsets);
+ p_items_end = reinterpret_cast<const char*>(&get_offset(keys() - 1));
+ assert(range.p_start < p_items_end);
+ assert(range.p_start == p_start());
+ }
+
+ bool operator==(const leaf_sub_items_t& x) {
+ return (p_num_keys == x.p_num_keys &&
+ p_offsets == x.p_offsets &&
+ p_items_end == x.p_items_end);
+ }
+
+ const char* p_start() const { return get_item_end(keys()); }
+
+ const node_offset_packed_t& get_offset(index_t index) const {
+ assert(index < keys());
+ return *(p_offsets - index);
+ }
+
+  node_offset_t get_offset_to_end(index_t index) const {
+ assert(index <= keys());
+ return index == 0 ? 0 : get_offset(index - 1).value;
+ }
+
+ const char* get_item_start(index_t index) const {
+ return p_items_end - get_offset(index).value;
+ }
+
+ const char* get_item_end(index_t index) const {
+ return p_items_end - get_offset_to_end(index);
+ }
+
+ // container type system
+ using key_get_type = const snap_gen_t&;
+ static constexpr auto CONTAINER_TYPE = ContainerType::INDEXABLE;
+ num_keys_t keys() const { return *p_num_keys; }
+ key_get_type operator[](index_t index) const {
+ assert(index < keys());
+ auto pointer = get_item_end(index);
+ assert(get_item_start(index) < pointer);
+ pointer -= sizeof(snap_gen_t);
+ assert(get_item_start(index) < pointer);
+ return *reinterpret_cast<const snap_gen_t*>(pointer);
+ }
+ node_offset_t size_before(index_t index) const {
+ assert(index <= keys());
+ size_t ret;
+ if (index == 0) {
+ ret = sizeof(num_keys_t);
+ } else {
+ --index;
+ ret = sizeof(num_keys_t) +
+ (index + 1) * sizeof(node_offset_t) +
+ get_offset(index).value;
+ }
+ assert(ret < NODE_BLOCK_SIZE);
+ return ret;
+ }
+ node_offset_t size_overhead_at(index_t index) const { return sizeof(node_offset_t); }
+ const onode_t* get_p_value(index_t index) const {
+ assert(index < keys());
+ auto pointer = get_item_start(index);
+ auto value = reinterpret_cast<const onode_t*>(pointer);
+ assert(pointer + value->size + sizeof(snap_gen_t) == get_item_end(index));
+ return value;
+ }
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const {
+ auto p_end = reinterpret_cast<const char*>(p_num_keys) +
+ sizeof(num_keys_t);
+ int start_offset = p_start() - p_node_start;
+ int end_offset = p_end - p_node_start;
+ assert(start_offset > 0 &&
+ start_offset < end_offset &&
+ end_offset < NODE_BLOCK_SIZE);
+ ceph::encode(static_cast<node_offset_t>(start_offset), encoded);
+ ceph::encode(static_cast<node_offset_t>(end_offset), encoded);
+ }
+
+ static leaf_sub_items_t decode(
+ const char* p_node_start, ceph::bufferlist::const_iterator& delta) {
+ node_offset_t start_offset;
+ ceph::decode(start_offset, delta);
+ node_offset_t end_offset;
+ ceph::decode(end_offset, delta);
+ assert(start_offset < end_offset);
+ assert(end_offset <= NODE_BLOCK_SIZE);
+ return leaf_sub_items_t({p_node_start + start_offset,
+ p_node_start + end_offset});
+ }
+
+ static node_offset_t header_size() { return sizeof(num_keys_t); }
+
+ template <KeyT KT>
+ static node_offset_t estimate_insert(const full_key_t<KT>&, const onode_t& value) {
+ return value.size + sizeof(snap_gen_t) + sizeof(node_offset_t);
+ }
+
+ template <KeyT KT>
+ static const onode_t* insert_at(
+ NodeExtentMutable&, const leaf_sub_items_t&,
+ const full_key_t<KT>&, const onode_t&,
+ index_t index, node_offset_t size, const char* p_left_bound);
+
+ static node_offset_t trim_until(NodeExtentMutable&, leaf_sub_items_t&, index_t index);
+
+ template <KeyT KT>
+ class Appender;
+
+ private:
+ // TODO: support unaligned access
+ const num_keys_t* p_num_keys;
+ const node_offset_packed_t* p_offsets;
+ const char* p_items_end;
+};
+
+constexpr index_t APPENDER_LIMIT = 3u;
+
+template <KeyT KT>
+class leaf_sub_items_t::Appender {
+ struct range_items_t {
+ index_t from;
+ index_t items;
+ };
+ struct kv_item_t {
+ const full_key_t<KT>* p_key;
+ const onode_t* p_value;
+ };
+ using var_t = std::variant<range_items_t, kv_item_t>;
+
+ public:
+ Appender(NodeExtentMutable* p_mut, char* p_append)
+ : p_mut{p_mut}, p_append{p_append} {
+ }
+
+ void append(const leaf_sub_items_t& src, index_t from, index_t items) {
+ assert(cnt <= APPENDER_LIMIT);
+ assert(from <= src.keys());
+ if (items == 0) {
+ return;
+ }
+ if (op_src) {
+ assert(*op_src == src);
+ } else {
+ op_src = src;
+ }
+ assert(from < src.keys());
+ assert(from + items <= src.keys());
+ appends[cnt] = range_items_t{from, items};
+ ++cnt;
+ }
+ void append(const full_key_t<KT>& key,
+ const onode_t& value, const onode_t*& p_value) {
+ assert(pp_value == nullptr);
+ assert(cnt <= APPENDER_LIMIT);
+ appends[cnt] = kv_item_t{&key, &value};
+ ++cnt;
+ pp_value = &p_value;
+ }
+ char* wrap();
+
+ private:
+ std::optional<leaf_sub_items_t> op_src;
+ const onode_t** pp_value = nullptr;
+ NodeExtentMutable* p_mut;
+ char* p_append;
+ var_t appends[APPENDER_LIMIT];
+ index_t cnt = 0;
+};
+
+template <node_type_t> struct _sub_items_t;
+template<> struct _sub_items_t<node_type_t::INTERNAL> { using type = internal_sub_items_t; };
+template<> struct _sub_items_t<node_type_t::LEAF> { using type = leaf_sub_items_t; };
+template <node_type_t NODE_TYPE>
+using sub_items_t = typename _sub_items_t<NODE_TYPE>::type;
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/super.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/super.cc
new file mode 100644
index 000000000..5a28f5097
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/super.cc
@@ -0,0 +1,26 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "super.h"
+#include "node.h"
+
+namespace crimson::os::seastore::onode {
+
+Ref<Node> RootNodeTrackerIsolated::get_root(Transaction& t) const {
+ auto iter = tracked_supers.find(&t);
+ if (iter == tracked_supers.end()) {
+ return nullptr;
+ } else {
+ return iter->second->get_p_root();
+ }
+}
+
+Ref<Node> RootNodeTrackerShared::get_root(Transaction&) const {
+ if (is_clean()) {
+ return nullptr;
+ } else {
+ return tracked_super->get_p_root();
+ }
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/super.h b/src/crimson/os/seastore/onode_manager/staged-fltree/super.h
new file mode 100644
index 000000000..5eefee9ff
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/super.h
@@ -0,0 +1,143 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <memory>
+
+#include "crimson/common/type_helpers.h"
+
+#include "fwd.h"
+
+namespace crimson::os::seastore::onode {
+
+class Node;
+class Super;
+
+/**
+ * RootNodeTracker
+ *
+ * An abstract tracker to look up the root node by Transaction.
+ */
+class RootNodeTracker {
+ public:
+ virtual ~RootNodeTracker() = default;
+ virtual bool is_clean() const = 0;
+ virtual Ref<Node> get_root(Transaction&) const = 0;
+ static RootNodeTrackerURef create(bool read_isolated);
+ protected:
+ RootNodeTracker() = default;
+ RootNodeTracker(const RootNodeTracker&) = delete;
+ RootNodeTracker(RootNodeTracker&&) = delete;
+ RootNodeTracker& operator=(const RootNodeTracker&) = delete;
+ RootNodeTracker& operator=(RootNodeTracker&&) = delete;
+ virtual void do_track_super(Transaction&, Super&) = 0;
+ virtual void do_untrack_super(Transaction&, Super&) = 0;
+ friend class Super;
+};
+
+/**
+ * Super
+ *
+ * The parent of the root node. It tracks the relationship between a
+ * Transaction and a root node address.
+ */
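+// Tracking lifecycle (a sketch of the relationships below): a Super registers
+// itself with the RootNodeTracker of its Transaction on construction and
+// unregisters on destruction, while the current root Node attaches to and
+// detaches from its Super via do_track_root()/do_untrack_root().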
+class Super {
+ public:
+ using URef = std::unique_ptr<Super>;
+ Super(const Super&) = delete;
+ Super(Super&&) = delete;
+ Super& operator=(const Super&) = delete;
+ Super& operator=(Super&&) = delete;
+ virtual ~Super() {
+ assert(tracked_root_node == nullptr);
+ tracker.do_untrack_super(t, *this);
+ }
+
+ virtual laddr_t get_root_laddr() const = 0;
+ virtual void write_root_laddr(context_t, laddr_t) = 0;
+
+ void do_track_root(Node& root) {
+ assert(tracked_root_node == nullptr);
+ tracked_root_node = &root;
+ }
+ void do_untrack_root(Node& root) {
+ assert(tracked_root_node == &root);
+ tracked_root_node = nullptr;
+ }
+ Node* get_p_root() const {
+ assert(tracked_root_node != nullptr);
+ return tracked_root_node;
+ }
+
+ protected:
+ Super(Transaction& t, RootNodeTracker& tracker)
+ : t{t}, tracker{tracker} {
+ tracker.do_track_super(t, *this);
+ }
+
+ private:
+ Transaction& t;
+ RootNodeTracker& tracker;
+ Node* tracked_root_node = nullptr;
+};
+
+/**
+ * RootNodeTrackerIsolated
+ *
+ * A concrete RootNodeTracker implementation which provides root node isolation
+ * between Transactions for the Seastore backend.
+ */
+class RootNodeTrackerIsolated final : public RootNodeTracker {
+ public:
+ ~RootNodeTrackerIsolated() override { assert(is_clean()); }
+ protected:
+ bool is_clean() const override {
+ return tracked_supers.empty();
+ }
+ void do_track_super(Transaction& t, Super& super) override {
+ assert(tracked_supers.find(&t) == tracked_supers.end());
+ tracked_supers[&t] = &super;
+ }
+ void do_untrack_super(Transaction& t, Super& super) override {
+ [[maybe_unused]] auto removed = tracked_supers.erase(&t);
+ assert(removed);
+ }
+ ::Ref<Node> get_root(Transaction& t) const override;
+ std::map<Transaction*, Super*> tracked_supers;
+};
+
+/**
+ * RootNodeTrackerShared
+ *
+ * A concrete RootNodeTracker implementation which has no isolation between
+ * Transactions for the Dummy backend.
+ */
+class RootNodeTrackerShared final : public RootNodeTracker {
+ public:
+ ~RootNodeTrackerShared() override { assert(is_clean()); }
+ protected:
+ bool is_clean() const override {
+ return tracked_super == nullptr;
+ }
+ void do_track_super(Transaction&, Super& super) override {
+ assert(is_clean());
+ tracked_super = &super;
+ }
+ void do_untrack_super(Transaction&, Super& super) override {
+ assert(tracked_super == &super);
+ tracked_super = nullptr;
+ }
+ ::Ref<Node> get_root(Transaction&) const override;
+ Super* tracked_super = nullptr;
+};
+
+inline RootNodeTrackerURef RootNodeTracker::create(bool read_isolated) {
+ if (read_isolated) {
+ return RootNodeTrackerURef(new RootNodeTrackerIsolated());
+ } else {
+ return RootNodeTrackerURef(new RootNodeTrackerShared());
+ }
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/tree.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/tree.cc
new file mode 100644
index 000000000..2c8c21652
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/tree.cc
@@ -0,0 +1,235 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tree.h"
+
+#include "node.h"
+#include "node_extent_manager.h"
+#include "stages/key_layout.h"
+#include "super.h"
+
+namespace crimson::os::seastore::onode {
+
+using btree_ertr = Btree::btree_ertr;
+template <class ValueT=void>
+using btree_future = Btree::btree_future<ValueT>;
+using Cursor = Btree::Cursor;
+
+Cursor::Cursor(Btree* p_tree, Ref<tree_cursor_t> _p_cursor)
+ : p_tree(p_tree) {
+ if (_p_cursor->is_end()) {
+ // no need to hold the leaf node
+ } else {
+ p_cursor = _p_cursor;
+ }
+}
+Cursor::Cursor(Btree* p_tree) : p_tree{p_tree} {}
+Cursor::Cursor(const Cursor&) = default;
+Cursor::Cursor(Cursor&&) noexcept = default;
+Cursor& Cursor::operator=(const Cursor&) = default;
+Cursor& Cursor::operator=(Cursor&&) = default;
+Cursor::~Cursor() = default;
+
+bool Cursor::is_end() const {
+ if (p_cursor) {
+ assert(!p_cursor->is_end());
+ return false;
+ } else {
+ return true;
+ }
+}
+
+ghobject_t Cursor::get_ghobj() const {
+ return p_cursor->get_key_view().to_ghobj();
+}
+
+const onode_t* Cursor::value() const {
+ return p_cursor->get_p_value();
+}
+
+bool Cursor::operator==(const Cursor& x) const {
+ return p_cursor == x.p_cursor;
+}
+
+Cursor& Cursor::operator++() {
+ // TODO
+ return *this;
+}
+
+Cursor Cursor::operator++(int) {
+ Cursor tmp = *this;
+ ++*this;
+ return tmp;
+}
+
+Cursor Cursor::make_end(Btree* p_tree) {
+ return {p_tree};
+}
+
+Btree::Btree(NodeExtentManagerURef&& _nm)
+ : nm{std::move(_nm)},
+ root_tracker{RootNodeTracker::create(nm->is_read_isolated())} {}
+
+Btree::~Btree() { assert(root_tracker->is_clean()); }
+
+btree_future<> Btree::mkfs(Transaction& t) {
+ return Node::mkfs(get_context(t), *root_tracker);
+}
+
+btree_future<Cursor> Btree::begin(Transaction& t) {
+ return get_root(t).safe_then([this, &t](auto root) {
+ return root->lookup_smallest(get_context(t));
+ }).safe_then([this](auto cursor) {
+ return Cursor{this, cursor};
+ });
+}
+
+btree_future<Cursor> Btree::last(Transaction& t) {
+ return get_root(t).safe_then([this, &t](auto root) {
+ return root->lookup_largest(get_context(t));
+ }).safe_then([this](auto cursor) {
+ return Cursor(this, cursor);
+ });
+}
+
+Cursor Btree::end() {
+ return Cursor::make_end(this);
+}
+
+btree_future<bool>
+Btree::contains(Transaction& t, const ghobject_t& obj) {
+ return seastar::do_with(
+ full_key_t<KeyT::HOBJ>(obj),
+ [this, &t](auto& key) -> btree_future<bool> {
+ return get_root(t).safe_then([this, &t, &key](auto root) {
+ // TODO: improve lower_bound()
+ return root->lower_bound(get_context(t), key);
+ }).safe_then([](auto result) {
+ return MatchKindBS::EQ == result.match();
+ });
+ }
+ );
+}
+
+btree_future<Cursor>
+Btree::find(Transaction& t, const ghobject_t& obj) {
+ return seastar::do_with(
+ full_key_t<KeyT::HOBJ>(obj),
+ [this, &t](auto& key) -> btree_future<Cursor> {
+ return get_root(t).safe_then([this, &t, &key](auto root) {
+ // TODO: improve lower_bound()
+ return root->lower_bound(get_context(t), key);
+ }).safe_then([this](auto result) {
+ if (result.match() == MatchKindBS::EQ) {
+ return Cursor(this, result.p_cursor);
+ } else {
+ return Cursor::make_end(this);
+ }
+ });
+ }
+ );
+}
+
+btree_future<Cursor>
+Btree::lower_bound(Transaction& t, const ghobject_t& obj) {
+ return seastar::do_with(
+ full_key_t<KeyT::HOBJ>(obj),
+ [this, &t](auto& key) -> btree_future<Cursor> {
+ return get_root(t).safe_then([this, &t, &key](auto root) {
+ return root->lower_bound(get_context(t), key);
+ }).safe_then([this](auto result) {
+ return Cursor(this, result.p_cursor);
+ });
+ }
+ );
+}
+
+btree_future<std::pair<Cursor, bool>>
+Btree::insert(Transaction& t, const ghobject_t& obj, const onode_t& value) {
+ return seastar::do_with(
+ full_key_t<KeyT::HOBJ>(obj),
+ [this, &t, &value](auto& key) -> btree_future<std::pair<Cursor, bool>> {
+ return get_root(t).safe_then([this, &t, &key, &value](auto root) {
+ return root->insert(get_context(t), key, value);
+ }).safe_then([this](auto ret) {
+ auto& [cursor, success] = ret;
+ return std::make_pair(Cursor(this, cursor), success);
+ });
+ }
+ );
+}
+
+btree_future<size_t> Btree::erase(Transaction& t, const ghobject_t& obj) {
+ // TODO
+ return btree_ertr::make_ready_future<size_t>(0u);
+}
+
+btree_future<Cursor> Btree::erase(Cursor& pos) {
+ // TODO
+ return btree_ertr::make_ready_future<Cursor>(
+ Cursor::make_end(this));
+}
+
+btree_future<Cursor>
+Btree::erase(Cursor& first, Cursor& last) {
+ // TODO
+ return btree_ertr::make_ready_future<Cursor>(
+ Cursor::make_end(this));
+}
+
+btree_future<size_t> Btree::height(Transaction& t) {
+ return get_root(t).safe_then([](auto root) {
+ return size_t(root->level() + 1);
+ });
+}
+
+btree_future<tree_stats_t> Btree::get_stats_slow(Transaction& t) {
+ return get_root(t).safe_then([this, &t](auto root) {
+ unsigned height = root->level() + 1;
+ return root->get_tree_stats(get_context(t)
+ ).safe_then([height](auto stats) {
+ stats.height = height;
+ return btree_ertr::make_ready_future<tree_stats_t>(stats);
+ });
+ });
+}
+
+std::ostream& Btree::dump(Transaction& t, std::ostream& os) {
+ auto root = root_tracker->get_root(t);
+ if (root) {
+ root->dump(os);
+ } else {
+ os << "empty tree!";
+ }
+ return os;
+}
+
+std::ostream& Btree::print(std::ostream& os) const {
+ return os << "BTree-" << *nm;
+}
+
+btree_future<Ref<Node>> Btree::get_root(Transaction& t) {
+ auto root = root_tracker->get_root(t);
+ if (root) {
+ return btree_ertr::make_ready_future<Ref<Node>>(root);
+ } else {
+ return Node::load_root(get_context(t), *root_tracker);
+ }
+}
+
+bool Btree::test_is_clean() const {
+ return root_tracker->is_clean();
+}
+
+btree_future<> Btree::test_clone_from(
+ Transaction& t, Transaction& t_from, Btree& from) {
+ // Note: assume the tree to clone is tracked correctly in memory.
+  // In some unit tests, parts of the tree are stubbed out, so they
+  // must not be loaded from NodeExtentManager.
+ return from.get_root(t_from
+ ).safe_then([this, &t](auto root_from) {
+ return root_from->test_clone_root(get_context(t), *root_tracker);
+ });
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/tree.h b/src/crimson/os/seastore/onode_manager/staged-fltree/tree.h
new file mode 100644
index 000000000..7ee618cb3
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/tree.h
@@ -0,0 +1,119 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <ostream>
+
+#include "common/hobject.h"
+#include "crimson/common/type_helpers.h"
+
+#include "fwd.h"
+#include "tree_types.h"
+
+/**
+ * tree.h
+ *
+ * An example implementation to expose tree interfaces to users. The current
+ * interface design is based on:
+ * - ceph::os::Transaction::create/touch/remove()
+ * - ceph::ObjectStore::collection_list()
+ * - ceph::BlueStore::get_onode()
+ * - db->get_iterator(PREFIX_OBJ) by ceph::BlueStore::fsck()
+ *
+ * TODO: Redesign the interfaces based on real onode manager requirements.
+ */
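+
+// Illustrative usage (a sketch; `tree`, `t`, `obj` and `value` are assumed to
+// outlive the returned futures, and errorator failure handling is omitted):
+//
+//   return tree.insert(t, obj, value
+//   ).safe_then([&tree, &t, &obj](auto ret) {
+//     auto& [cursor, inserted] = ret;     // Cursor to the value + insert flag
+//     return tree.find(t, obj);           // look the key up again
+//   }).safe_then([](auto cursor) {
+//     assert(!cursor.is_end());           // the key is found
+//   });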
+
+namespace crimson::os::seastore::onode {
+
+class Node;
+class Btree {
+ public:
+ using btree_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg,
+ crimson::ct_error::enoent,
+ crimson::ct_error::erange>;
+ template <class ValueT=void>
+ using btree_future = btree_ertr::future<ValueT>;
+
+ Btree(NodeExtentManagerURef&& nm);
+ Btree(const Btree&) = delete;
+ Btree(Btree&&) = delete;
+ Btree& operator=(const Btree&) = delete;
+ Btree& operator=(Btree&&) = delete;
+ ~Btree();
+
+ btree_future<> mkfs(Transaction&);
+
+ class Cursor;
+ // lookup
+ btree_future<Cursor> begin(Transaction&);
+ btree_future<Cursor> last(Transaction&);
+ Cursor end();
+ btree_future<bool> contains(Transaction&, const ghobject_t&);
+ btree_future<Cursor> find(Transaction&, const ghobject_t&);
+ btree_future<Cursor> lower_bound(Transaction&, const ghobject_t&);
+
+ // modifiers
+ // TODO: replace onode_t
+ btree_future<std::pair<Cursor, bool>>
+ insert(Transaction&, const ghobject_t&, const onode_t&);
+ btree_future<size_t> erase(Transaction&, const ghobject_t& key);
+ btree_future<Cursor> erase(Cursor& pos);
+ btree_future<Cursor> erase(Cursor& first, Cursor& last);
+
+ // stats
+ btree_future<size_t> height(Transaction&);
+ btree_future<tree_stats_t> get_stats_slow(Transaction&);
+ std::ostream& dump(Transaction&, std::ostream&);
+ std::ostream& print(std::ostream& os) const;
+
+ // test_only
+ bool test_is_clean() const;
+ btree_future<> test_clone_from(Transaction& t, Transaction& t_from, Btree& from);
+
+ private:
+ context_t get_context(Transaction& t) { return {*nm, t}; }
+ btree_future<Ref<Node>> get_root(Transaction& t);
+
+ NodeExtentManagerURef nm;
+ RootNodeTrackerURef root_tracker;
+
+ friend class DummyChildPool;
+};
+inline std::ostream& operator<<(std::ostream& os, const Btree& tree) {
+ return tree.print(os);
+}
+
+class tree_cursor_t;
+class Btree::Cursor {
+ public:
+ Cursor(const Cursor&);
+ Cursor(Cursor&&) noexcept;
+ Cursor& operator=(const Cursor&);
+ Cursor& operator=(Cursor&&);
+ ~Cursor();
+
+ bool is_end() const;
+  // XXX: return key_view_t to avoid unnecessary ghobject_t constructions
+ ghobject_t get_ghobj() const;
+ const onode_t* value() const;
+ bool operator==(const Cursor& x) const;
+ bool operator!=(const Cursor& x) const { return !(*this == x); }
+ Cursor& operator++();
+ Cursor operator++(int);
+
+ private:
+ Cursor(Btree*, Ref<tree_cursor_t>);
+ Cursor(Btree*);
+
+ static Cursor make_end(Btree*);
+
+ Btree* p_tree;
+ Ref<tree_cursor_t> p_cursor;
+
+ friend class Btree;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/tree_types.h b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_types.h
new file mode 100644
index 000000000..0bb345e0a
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_types.h
@@ -0,0 +1,125 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <ostream>
+
+namespace crimson::os::seastore::onode {
+
+// TODO: Redesign according to real requirement from onode manager
+struct onode_t {
+ // onode should be smaller than a node
+ uint16_t size; // address up to 64 KiB sized node
+ uint16_t id;
+ // omap, extent_map, inline data
+
+ bool operator==(const onode_t& o) const { return size == o.size && id == o.id; }
+ bool operator!=(const onode_t& o) const { return !(*this == o); }
+
+ void encode(ceph::bufferlist& encoded) const {
+ ceph::encode(size, encoded);
+ ceph::encode(id, encoded);
+ }
+ static onode_t decode(ceph::bufferlist::const_iterator& delta) {
+ uint16_t size;
+ ceph::decode(size, delta);
+ uint16_t id;
+ ceph::decode(id, delta);
+ onode_t ret{size, id};
+ return ret;
+ }
+ static void validate_tail_magic(const onode_t& onode) {
+ auto p_target = (const char*)&onode + onode.size - sizeof(uint32_t);
+ uint32_t target;
+ std::memcpy(&target, p_target, sizeof(uint32_t));
+ ceph_assert(target == onode.size * 137);
+ }
+ static std::unique_ptr<char[]> allocate(const onode_t& config) {
+ ceph_assert(config.size >= sizeof(onode_t) + sizeof(uint32_t));
+
+ auto ret = std::make_unique<char[]>(config.size);
+ char* p_mem = ret.get();
+ auto p_onode = reinterpret_cast<onode_t*>(p_mem);
+ *p_onode = config;
+
+ uint32_t tail_magic = config.size * 137;
+ p_mem += (config.size - sizeof(uint32_t));
+ std::memcpy(p_mem, &tail_magic, sizeof(uint32_t));
+ validate_tail_magic(*p_onode);
+
+ return ret;
+ }
+} __attribute__((packed));
+inline std::ostream& operator<<(std::ostream& os, const onode_t& node) {
+ return os << "onode(" << node.id << ", " << node.size << "B)";
+}
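+// Illustrative allocation of a test onode (a sketch; Onodes::create() in
+// tree_utils.h follows the same pattern):
+//
+//   onode_t config{128, 1};                      // 128-byte payload, id 1
+//   auto buffer = onode_t::allocate(config);     // owns the backing memory
+//   auto& onode = *reinterpret_cast<onode_t*>(buffer.get());
+//   onode_t::validate_tail_magic(onode);         // tail magic == size * 137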
+
+struct tree_stats_t {
+ size_t size_persistent_leaf = 0;
+ size_t size_persistent_internal = 0;
+ size_t size_filled_leaf = 0;
+ size_t size_filled_internal = 0;
+ size_t size_logical_leaf = 0;
+ size_t size_logical_internal = 0;
+ size_t size_overhead_leaf = 0;
+ size_t size_overhead_internal = 0;
+ size_t size_value_leaf = 0;
+ size_t size_value_internal = 0;
+ unsigned num_kvs_leaf = 0;
+ unsigned num_kvs_internal = 0;
+ unsigned num_nodes_leaf = 0;
+ unsigned num_nodes_internal = 0;
+ unsigned height = 0;
+
+ size_t size_persistent() const {
+ return size_persistent_leaf + size_persistent_internal; }
+ size_t size_filled() const {
+ return size_filled_leaf + size_filled_internal; }
+ size_t size_logical() const {
+ return size_logical_leaf + size_logical_internal; }
+ size_t size_overhead() const {
+ return size_overhead_leaf + size_overhead_internal; }
+ size_t size_value() const {
+ return size_value_leaf + size_value_internal; }
+ unsigned num_kvs() const {
+ return num_kvs_leaf + num_kvs_internal; }
+ unsigned num_nodes() const {
+ return num_nodes_leaf + num_nodes_internal; }
+
+ double ratio_fullness() const {
+ return (double)size_filled() / size_persistent(); }
+ double ratio_key_compression() const {
+ return (double)(size_filled() - size_value()) / (size_logical() - size_value()); }
+ double ratio_overhead() const {
+ return (double)size_overhead() / size_filled(); }
+ double ratio_keys_leaf() const {
+ return (double)num_kvs_leaf / num_kvs(); }
+ double ratio_nodes_leaf() const {
+ return (double)num_nodes_leaf / num_nodes(); }
+ double ratio_filled_leaf() const {
+ return (double)size_filled_leaf / size_filled(); }
+};
+inline std::ostream& operator<<(std::ostream& os, const tree_stats_t& stats) {
+ os << "Tree stats:"
+ << "\n height = " << stats.height
+ << "\n num values = " << stats.num_kvs_leaf
+ << "\n num nodes = " << stats.num_nodes()
+ << " (leaf=" << stats.num_nodes_leaf
+ << ", internal=" << stats.num_nodes_internal << ")"
+ << "\n size persistent = " << stats.size_persistent() << "B"
+ << "\n size filled = " << stats.size_filled() << "B"
+ << " (value=" << stats.size_value_leaf << "B"
+ << ", rest=" << stats.size_filled() - stats.size_value_leaf << "B)"
+ << "\n size logical = " << stats.size_logical() << "B"
+ << "\n size overhead = " << stats.size_overhead() << "B"
+ << "\n ratio fullness = " << stats.ratio_fullness()
+ << "\n ratio keys leaf = " << stats.ratio_keys_leaf()
+ << "\n ratio nodes leaf = " << stats.ratio_nodes_leaf()
+ << "\n ratio filled leaf = " << stats.ratio_filled_leaf()
+ << "\n ratio key compression = " << stats.ratio_key_compression();
+ assert(stats.num_kvs_internal + 1 == stats.num_nodes());
+ return os;
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h
new file mode 100644
index 000000000..536052003
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h
@@ -0,0 +1,333 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <cstring>
+#include <random>
+#include <string>
+#include <sstream>
+#include <utility>
+#include <vector>
+
+#include "crimson/common/log.h"
+#include "stages/key_layout.h"
+#include "tree.h"
+
+/**
+ * tree_utils.h
+ *
+ * Contains shared logic for unit tests and perf tool.
+ */
+
+namespace crimson::os::seastore::onode {
+
+class Onodes {
+ public:
+ Onodes(size_t n) {
+ for (size_t i = 1; i <= n; ++i) {
+ auto p_onode = &create(i * 8);
+ onodes.push_back(p_onode);
+ }
+ }
+
+ Onodes(std::vector<size_t> sizes) {
+ for (auto& size : sizes) {
+ auto p_onode = &create(size);
+ onodes.push_back(p_onode);
+ }
+ }
+
+ ~Onodes() = default;
+
+ const onode_t& create(size_t size) {
+ ceph_assert(size <= std::numeric_limits<uint16_t>::max());
+ onode_t config{static_cast<uint16_t>(size), id++};
+ auto onode = onode_t::allocate(config);
+ auto p_onode = onode.get();
+ tracked_onodes.push_back(std::move(onode));
+ return *reinterpret_cast<onode_t*>(p_onode);
+ }
+
+ const onode_t& pick() const {
+ auto index = rd() % onodes.size();
+ return *onodes[index];
+ }
+
+ const onode_t& pick_largest() const {
+ return *onodes[onodes.size() - 1];
+ }
+
+ static void validate_cursor(
+ const Btree::Cursor& cursor, const ghobject_t& key, const onode_t& onode) {
+ ceph_assert(!cursor.is_end());
+ ceph_assert(cursor.get_ghobj() == key);
+ ceph_assert(cursor.value());
+ ceph_assert(cursor.value() != &onode);
+ ceph_assert(*cursor.value() == onode);
+ onode_t::validate_tail_magic(*cursor.value());
+ }
+
+ private:
+ uint16_t id = 0;
+ mutable std::random_device rd;
+ std::vector<const onode_t*> onodes;
+ std::vector<std::unique_ptr<char[]>> tracked_onodes;
+};
+
+class KVPool {
+ struct kv_conf_t {
+ unsigned index2;
+ unsigned index1;
+ unsigned index0;
+ size_t ns_size;
+ size_t oid_size;
+ const onode_t* p_value;
+
+ ghobject_t get_ghobj() const {
+ assert(index1 < 10);
+ std::ostringstream os_ns;
+ os_ns << "ns" << index1;
+ unsigned current_size = (unsigned)os_ns.tellp();
+ assert(ns_size >= current_size);
+ os_ns << std::string(ns_size - current_size, '_');
+
+ std::ostringstream os_oid;
+ os_oid << "oid" << index1;
+ current_size = (unsigned)os_oid.tellp();
+ assert(oid_size >= current_size);
+ os_oid << std::string(oid_size - current_size, '_');
+
+ return ghobject_t(shard_id_t(index2), index2, index2,
+ os_ns.str(), os_oid.str(), index0, index0);
+ }
+ };
+ using kv_vector_t = std::vector<kv_conf_t>;
+
+ public:
+ using kv_t = std::pair<ghobject_t, const onode_t*>;
+
+ KVPool(const std::vector<size_t>& str_sizes,
+ const std::vector<size_t>& onode_sizes,
+ const std::pair<unsigned, unsigned>& range2,
+ const std::pair<unsigned, unsigned>& range1,
+ const std::pair<unsigned, unsigned>& range0)
+ : str_sizes{str_sizes}, onodes{onode_sizes} {
+ ceph_assert(range2.first < range2.second);
+ ceph_assert(range2.second - 1 <= (unsigned)std::numeric_limits<shard_t>::max());
+ ceph_assert(range2.second - 1 <= std::numeric_limits<crush_hash_t>::max());
+ ceph_assert(range1.first < range1.second);
+ ceph_assert(range1.second - 1 <= 9);
+ ceph_assert(range0.first < range0.second);
+ std::random_device rd;
+ for (unsigned i = range2.first; i < range2.second; ++i) {
+ for (unsigned j = range1.first; j < range1.second; ++j) {
+ auto ns_size = (unsigned)str_sizes[rd() % str_sizes.size()];
+ auto oid_size = (unsigned)str_sizes[rd() % str_sizes.size()];
+ for (unsigned k = range0.first; k < range0.second; ++k) {
+ kvs.emplace_back(kv_conf_t{i, j, k, ns_size, oid_size, &onodes.pick()});
+ }
+ }
+ }
+ random_kvs = kvs;
+    std::shuffle(random_kvs.begin(), random_kvs.end(), std::default_random_engine(rd()));
+ }
+
+ class iterator_t {
+ public:
+ iterator_t() = default;
+ iterator_t(const iterator_t&) = default;
+ iterator_t(iterator_t&&) = default;
+ iterator_t& operator=(const iterator_t&) = default;
+ iterator_t& operator=(iterator_t&&) = default;
+
+ kv_t get_kv() const {
+ assert(!is_end());
+ auto& conf = (*p_kvs)[i];
+ return std::make_pair(conf.get_ghobj(), conf.p_value);
+ }
+ bool is_end() const { return !p_kvs || i >= p_kvs->size(); }
+ size_t index() const { return i; }
+
+ iterator_t& operator++() {
+ assert(!is_end());
+ ++i;
+ return *this;
+ }
+
+ iterator_t operator++(int) {
+ iterator_t tmp = *this;
+ ++*this;
+ return tmp;
+ }
+
+ private:
+ iterator_t(const kv_vector_t& kvs) : p_kvs{&kvs} {}
+
+ const kv_vector_t* p_kvs = nullptr;
+ size_t i = 0;
+ friend class KVPool;
+ };
+
+ iterator_t begin() const {
+ return iterator_t(kvs);
+ }
+
+ iterator_t random_begin() const {
+ return iterator_t(random_kvs);
+ }
+
+ size_t size() const {
+ return kvs.size();
+ }
+
+ private:
+ std::vector<size_t> str_sizes;
+ Onodes onodes;
+ kv_vector_t kvs;
+ kv_vector_t random_kvs;
+};
+
+template <bool TRACK>
+class TreeBuilder {
+ public:
+ using ertr = Btree::btree_ertr;
+ template <class ValueT=void>
+ using future = ertr::future<ValueT>;
+
+ TreeBuilder(KVPool& kvs, NodeExtentManagerURef&& nm)
+ : kvs{kvs} {
+ tree.emplace(std::move(nm));
+ }
+
+ future<> bootstrap(Transaction& t) {
+ std::ostringstream oss;
+#ifndef NDEBUG
+ oss << "debug=on, ";
+#else
+ oss << "debug=off, ";
+#endif
+#ifdef UNIT_TESTS_BUILT
+ oss << "UNIT_TEST_BUILT=on, ";
+#else
+ oss << "UNIT_TEST_BUILT=off, ";
+#endif
+ if constexpr (TRACK) {
+ oss << "track=on, ";
+ } else {
+ oss << "track=off, ";
+ }
+ oss << *tree;
+ logger().warn("TreeBuilder: {}, bootstrapping ...", oss.str());
+ return tree->mkfs(t);
+ }
+
+ future<> insert(Transaction& t) {
+ kv_iter = kvs.random_begin();
+ auto cursors = seastar::make_lw_shared<std::vector<Btree::Cursor>>();
+ logger().warn("start inserting {} kvs ...", kvs.size());
+ auto start_time = mono_clock::now();
+ return crimson::do_until([&t, this, cursors]() -> future<bool> {
+ if (kv_iter.is_end()) {
+ return ertr::make_ready_future<bool>(true);
+ }
+ auto [key, p_value] = kv_iter.get_kv();
+ logger().debug("[{}] {} -> {}", kv_iter.index(), key_hobj_t{key}, *p_value);
+ return tree->insert(t, key, *p_value
+ ).safe_then([&t, this, cursors](auto ret) {
+ auto& [cursor, success] = ret;
+ assert(success == true);
+ if constexpr (TRACK) {
+ cursors->emplace_back(cursor);
+ }
+#ifndef NDEBUG
+ auto [key, p_value] = kv_iter.get_kv();
+ Onodes::validate_cursor(cursor, key, *p_value);
+ return tree->lower_bound(t, key).safe_then([this, cursor](auto cursor_) {
+ auto [key, p_value] = kv_iter.get_kv();
+ ceph_assert(cursor_.get_ghobj() == key);
+ ceph_assert(cursor_.value() == cursor.value());
+ ++kv_iter;
+ return ertr::make_ready_future<bool>(false);
+ });
+#else
+ ++kv_iter;
+ return ertr::make_ready_future<bool>(false);
+#endif
+ });
+ }).safe_then([&t, this, start_time, cursors] {
+ std::chrono::duration<double> duration = mono_clock::now() - start_time;
+ logger().warn("Insert done! {}s", duration.count());
+ if (!cursors->empty()) {
+        logger().info("Verifying tracked cursors ...");
+ kv_iter = kvs.random_begin();
+ return seastar::do_with(
+ cursors->begin(), [&t, this, cursors](auto& c_iter) {
+ return crimson::do_until([&t, this, &c_iter, cursors]() -> future<bool> {
+ if (kv_iter.is_end()) {
+ logger().info("Verify done!");
+ return ertr::make_ready_future<bool>(true);
+ }
+ assert(c_iter != cursors->end());
+ auto [k, v] = kv_iter.get_kv();
+            // validate that the values stored in the tree stay intact
+ return tree->lower_bound(t, k).safe_then([this, &c_iter](auto cursor) {
+ auto [k, v] = kv_iter.get_kv();
+ Onodes::validate_cursor(cursor, k, *v);
+              // validate that the values seen through the cursors stay intact
+ Onodes::validate_cursor(*c_iter, k, *v);
+ ++kv_iter;
+ ++c_iter;
+ return ertr::make_ready_future<bool>(false);
+ });
+ });
+ });
+ } else {
+ return ertr::now();
+ }
+ });
+ }
+
+ future<> get_stats(Transaction& t) {
+ return tree->get_stats_slow(t
+ ).safe_then([this](auto stats) {
+ logger().warn("{}", stats);
+ });
+ }
+
+ void reload(NodeExtentManagerURef&& nm) {
+ tree.emplace(std::move(nm));
+ }
+
+ future<> validate(Transaction& t) {
+    logger().info("Verifying insertion ...");
+ return seastar::do_with(
+ kvs.begin(), [&t, this] (auto& kvs_iter) {
+ return crimson::do_until([&t, this, &kvs_iter]() -> future<bool> {
+ if (kvs_iter.is_end()) {
+ logger().info("Verify done!");
+ return ertr::make_ready_future<bool>(true);
+ }
+ auto [k, v] = kvs_iter.get_kv();
+ return tree->lower_bound(t, k
+ ).safe_then([&kvs_iter, k=k, v=v] (auto cursor) {
+ Onodes::validate_cursor(cursor, k, *v);
+ ++kvs_iter;
+ return ertr::make_ready_future<bool>(false);
+ });
+ });
+ });
+ }
+
+ private:
+ static seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+
+ KVPool& kvs;
+ std::optional<Btree> tree;
+ KVPool::iterator_t kv_iter;
+};
+
+}
diff --git a/src/crimson/os/seastore/root_block.h b/src/crimson/os/seastore/root_block.h
new file mode 100644
index 000000000..4a5024caa
--- /dev/null
+++ b/src/crimson/os/seastore/root_block.h
@@ -0,0 +1,109 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/os/seastore/cached_extent.h"
+
+namespace crimson::os::seastore {
+
+/**
+ * root_t
+ *
+ * Contains information required to find metadata roots.
+ * TODO: generalize this to permit more than one lba_manager implementation
+ */
+struct __attribute__((aligned(8), packed)) root_t {
+ depth_t lba_depth = 0;
+ depth_t segment_depth = 0;
+ paddr_t lba_root_addr;
+ paddr_t segment_root;
+ laddr_t onode_root = L_ADDR_NULL;
+
+ void adjust_addrs_from_base(paddr_t base) {
+ if (lba_root_addr.is_relative()) {
+ lba_root_addr = base.add_record_relative(lba_root_addr);
+ }
+ }
+};
+
+/**
+ * RootBlock
+ *
+ * Holds the physical addresses of all metadata roots.
+ * In-memory values may be
+ * - absolute: reference to block which predates the current transaction
+ * - record_relative: reference to block updated in this transaction
+ * if !pending()
+ *
+ * Journal replay only considers deltas and must always discover the most
+ * recent value for the RootBlock. Because the contents of root_t above are
+ * very small, it's simplest to stash the entire root_t value into the delta
+ * and never actually write the RootBlock to a physical location (safe since
+ * nothing references the location of the RootBlock).
+ *
+ * As a result, Cache treats the root differently in a few ways including:
+ * - state will only ever be DIRTY or MUTATION_PENDING
+ * - RootBlocks never show up in the transaction fresh or dirty lists --
+ * there's a special Transaction::root member for when the root needs to
+ * be mutated.
+ *
+ * TODO: Journal trimming will need to be aware of the most recent RootBlock
+ * delta location, or, even easier, just always write one out with the
+ * mutation which changes the journal trim bound.
+ */
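+// Delta round-trip implied by the above (a sketch; `root_block`, `replayed`
+// and `base` are hypothetical): get_delta() serializes the whole root_t, and
+// replay feeds the same buffer back, so the latest delta alone reconstructs
+// the root:
+//
+//   ceph::bufferlist bl = root_block.get_delta();
+//   replayed.apply_delta_and_adjust_crc(base, bl);   // overwrite + rebase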
+struct RootBlock : CachedExtent {
+ constexpr static segment_off_t SIZE = 4<<10;
+ using Ref = TCachedExtentRef<RootBlock>;
+
+ root_t root;
+
+ RootBlock() : CachedExtent(0) {}
+
+ RootBlock(const RootBlock &rhs) = default;
+
+ CachedExtentRef duplicate_for_write() final {
+ return CachedExtentRef(new RootBlock(*this));
+ };
+
+ static constexpr extent_types_t TYPE = extent_types_t::ROOT;
+ extent_types_t get_type() const final {
+ return extent_types_t::ROOT;
+ }
+
+ /// dumps root as delta
+ ceph::bufferlist get_delta() final {
+ ceph::bufferlist bl;
+ ceph::buffer::ptr bptr(sizeof(root_t));
+ *reinterpret_cast<root_t*>(bptr.c_str()) = root;
+ bl.append(bptr);
+ return bl;
+ }
+
+ /// overwrites root
+ void apply_delta_and_adjust_crc(paddr_t base, const ceph::bufferlist &_bl) final {
+ assert(_bl.length() == sizeof(root_t));
+ ceph::bufferlist bl = _bl;
+ bl.rebuild();
+ root = *reinterpret_cast<const root_t*>(bl.front().c_str());
+ root.adjust_addrs_from_base(base);
+ }
+
+ /// Patches relative addrs in memory based on record commit addr
+ void on_delta_write(paddr_t record_block_offset) final {
+ root.adjust_addrs_from_base(record_block_offset);
+ }
+
+ complete_load_ertr::future<> complete_load() final {
+ ceph_abort_msg("Root is only written via deltas");
+ }
+
+ void on_initial_write() final {
+ ceph_abort_msg("Root is only written via deltas");
+ }
+
+ root_t &get_root() { return root; }
+};
+using RootBlockRef = RootBlock::Ref;
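+
+/*
+ * A minimal sketch (illustrative only; variable names and values are assumed)
+ * of the intended delta round-trip for the root, using the members declared
+ * above: on commit get_delta() stashes the whole root_t, and on replay
+ * apply_delta_and_adjust_crc() overwrites it and rebases record-relative
+ * addresses against the delta's commit address.
+ *
+ *   RootBlockRef root = ...;                        // e.g. via Transaction::root
+ *   root->get_root().onode_root = new_onode_root;   // mutate some root field
+ *   ceph::bufferlist delta = root->get_delta();     // whole root_t as the delta
+ *   // ... journal commits the delta at some paddr_t base ...
+ *   root->apply_delta_and_adjust_crc(base, delta);  // replay path
+ */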
+
+}
diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc
new file mode 100644
index 000000000..50c148cea
--- /dev/null
+++ b/src/crimson/os/seastore/seastore.cc
@@ -0,0 +1,532 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "seastore.h"
+
+#include <boost/algorithm/string/trim.hpp>
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+
+#include "common/safe_io.h"
+#include "os/Transaction.h"
+
+#include "crimson/common/buffer_io.h"
+#include "crimson/common/config_proxy.h"
+
+#include "crimson/os/futurized_collection.h"
+
+#include "crimson/os/seastore/segment_manager/ephemeral.h"
+#include "crimson/os/seastore/transaction_manager.h"
+#include "crimson/os/seastore/onode_manager.h"
+#include "crimson/os/seastore/cache.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+}
+
+using crimson::common::local_conf;
+
+namespace crimson::os::seastore {
+
+struct SeastoreCollection final : public FuturizedCollection {
+ template <typename... T>
+ SeastoreCollection(T&&... args) :
+ FuturizedCollection(std::forward<T>(args)...) {}
+};
+
+SeaStore::SeaStore(const std::string& path)
+ : segment_manager(segment_manager::create_test_ephemeral() /* TODO */),
+ segment_cleaner(
+ std::make_unique<SegmentCleaner>(
+ SegmentCleaner::config_t::default_from_segment_manager(
+ *segment_manager))),
+ cache(std::make_unique<Cache>(*segment_manager)),
+ journal(new Journal(*segment_manager)),
+ lba_manager(
+ lba_manager::create_lba_manager(*segment_manager, *cache)),
+ transaction_manager(
+ new TransactionManager(
+ *segment_manager,
+ *segment_cleaner,
+ *journal,
+ *cache,
+ *lba_manager)),
+ onode_manager(onode_manager::create_ephemeral())
+{
+ journal->set_segment_provider(&*segment_cleaner);
+ segment_cleaner->set_extent_callback(&*transaction_manager);
+}
+
+SeaStore::~SeaStore() = default;
+
+seastar::future<> SeaStore::stop()
+{
+ return seastar::now();
+}
+
+seastar::future<> SeaStore::mount()
+{
+ return seastar::now();
+}
+
+seastar::future<> SeaStore::umount()
+{
+ return seastar::now();
+}
+
+seastar::future<> SeaStore::mkfs(uuid_d new_osd_fsid)
+{
+ return seastar::now();
+}
+
+seastar::future<store_statfs_t> SeaStore::stat() const
+{
+ logger().debug("{}", __func__);
+ store_statfs_t st;
+ return seastar::make_ready_future<store_statfs_t>(st);
+}
+
+seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>>
+SeaStore::list_objects(CollectionRef ch,
+ const ghobject_t& start,
+ const ghobject_t& end,
+ uint64_t limit) const
+{
+ return seastar::make_ready_future<std::tuple<std::vector<ghobject_t>, ghobject_t>>(
+ std::make_tuple(std::vector<ghobject_t>(), end));
+}
+
+seastar::future<CollectionRef> SeaStore::create_new_collection(const coll_t& cid)
+{
+ auto c = _get_collection(cid);
+ return seastar::make_ready_future<CollectionRef>(c);
+}
+
+seastar::future<CollectionRef> SeaStore::open_collection(const coll_t& cid)
+{
+ return seastar::make_ready_future<CollectionRef>(_get_collection(cid));
+}
+
+seastar::future<std::vector<coll_t>> SeaStore::list_collections()
+{
+ return seastar::make_ready_future<std::vector<coll_t>>();
+}
+
+SeaStore::read_errorator::future<ceph::bufferlist> SeaStore::read(
+ CollectionRef ch,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ uint32_t op_flags)
+{
+ return read_errorator::make_ready_future<ceph::bufferlist>();
+}
+
+SeaStore::read_errorator::future<ceph::bufferlist> SeaStore::readv(
+ CollectionRef ch,
+ const ghobject_t& oid,
+ interval_set<uint64_t>& m,
+ uint32_t op_flags)
+{
+ return read_errorator::make_ready_future<ceph::bufferlist>();
+}
+
+SeaStore::get_attr_errorator::future<ceph::bufferptr> SeaStore::get_attr(
+ CollectionRef ch,
+ const ghobject_t& oid,
+ std::string_view name) const
+{
+ auto c = static_cast<SeastoreCollection*>(ch.get());
+ logger().debug("{} {} {}",
+ __func__, c->get_cid(), oid);
+ return crimson::ct_error::enoent::make();
+}
+
+SeaStore::get_attrs_ertr::future<SeaStore::attrs_t> SeaStore::get_attrs(
+ CollectionRef ch,
+ const ghobject_t& oid)
+{
+ auto c = static_cast<SeastoreCollection*>(ch.get());
+ logger().debug("{} {} {}",
+ __func__, c->get_cid(), oid);
+ return crimson::ct_error::enoent::make();
+}
+
+seastar::future<struct stat> SeaStore::stat(
+ CollectionRef c,
+ const ghobject_t& oid)
+{
+ struct stat st;
+ return seastar::make_ready_future<struct stat>(st);
+}
+
+auto
+SeaStore::omap_get_header(
+ CollectionRef c,
+ const ghobject_t& oid)
+ -> read_errorator::future<bufferlist>
+{
+ return seastar::make_ready_future<bufferlist>();
+}
+
+auto
+SeaStore::omap_get_values(
+ CollectionRef ch,
+ const ghobject_t& oid,
+ const omap_keys_t& keys)
+ -> read_errorator::future<omap_values_t>
+{
+ auto c = static_cast<SeastoreCollection*>(ch.get());
+ logger().debug("{} {} {}",
+ __func__, c->get_cid(), oid);
+ return seastar::make_ready_future<omap_values_t>();
+}
+
+auto
+SeaStore::omap_get_values(
+ CollectionRef ch,
+ const ghobject_t &oid,
+ const std::optional<std::string> &start)
+ -> read_errorator::future<std::tuple<bool, SeaStore::omap_values_t>>
+{
+ auto c = static_cast<SeastoreCollection*>(ch.get());
+ logger().debug(
+ "{} {} {}",
+ __func__, c->get_cid(), oid);
+ return seastar::make_ready_future<std::tuple<bool, omap_values_t>>(
+ std::make_tuple(false, omap_values_t()));
+}
+
+seastar::future<FuturizedStore::OmapIteratorRef> SeaStore::get_omap_iterator(
+ CollectionRef ch,
+ const ghobject_t& oid)
+{
+ return seastar::make_ready_future<FuturizedStore::OmapIteratorRef>();
+}
+
+seastar::future<std::map<uint64_t, uint64_t>> SeaStore::fiemap(
+ CollectionRef ch,
+ const ghobject_t& oid,
+ uint64_t off,
+ uint64_t len)
+{
+ return seastar::make_ready_future<std::map<uint64_t, uint64_t>>();
+}
+
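+// Overall flow of do_transaction (as implemented below): start a
+// TransactionManager transaction, resolve or create the onodes named by the
+// ceph::os::Transaction, apply each op via _do_transaction_step and submit
+// through the transaction manager, write back dirty onodes, then fire the
+// on_applied/on_commit/on_applied_sync completions. Error handling is still a
+// stub: failures dump the transaction as JSON and abort.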
+seastar::future<> SeaStore::do_transaction(
+ CollectionRef _ch,
+ ceph::os::Transaction&& _t)
+{
+ return seastar::do_with(
+ _t.begin(),
+ transaction_manager->create_transaction(),
+ std::vector<OnodeRef>(),
+ std::move(_t),
+ std::move(_ch),
+ [this](auto &iter, auto &trans, auto &onodes, auto &t, auto &ch) {
+ return onode_manager->get_or_create_onodes(
+ *trans, iter.get_objects()).safe_then(
+ [this, &iter, &trans, &onodes, &t, &ch](auto &&read_onodes) {
+ onodes = std::move(read_onodes);
+ return seastar::do_until(
+ [&iter]() { return !iter.have_op(); }, // stop once every op has been applied
+ [this, &iter, &trans, &onodes, &t, &ch]() {
+ return _do_transaction_step(trans, ch, onodes, iter).safe_then(
+ [this, &trans] {
+ return transaction_manager->submit_transaction(std::move(trans));
+ }).handle_error(
+ // TODO: add errorator::do_until
+ crimson::ct_error::eagain::handle([]() {
+ // TODO retry
+ }),
+ write_ertr::all_same_way([&t](auto e) {
+ logger().error(" transaction dump:\n");
+ JSONFormatter f(true);
+ f.open_object_section("transaction");
+ t.dump(&f);
+ f.close_section();
+ std::stringstream str;
+ f.flush(str);
+ logger().error("{}", str.str());
+ abort();
+ }));
+ });
+ }).safe_then([this, &trans, &onodes]() {
+ return onode_manager->write_dirty(*trans, onodes);
+ }).safe_then([]() {
+ // TODO: complete transaction!
+ return;
+ }).handle_error(
+ write_ertr::all_same_way([&t](auto e) {
+ logger().error(" transaction dump:\n");
+ JSONFormatter f(true);
+ f.open_object_section("transaction");
+ t.dump(&f);
+ f.close_section();
+ std::stringstream str;
+ f.flush(str);
+ logger().error("{}", str.str());
+ abort();
+ })).then([&t]() {
+ for (auto i : {
+ t.get_on_applied(),
+ t.get_on_commit(),
+ t.get_on_applied_sync()}) {
+ if (i) {
+ i->complete(0);
+ }
+ }
+ });
+ });
+}
+
+SeaStore::write_ertr::future<> SeaStore::_do_transaction_step(
+ TransactionRef &trans,
+ CollectionRef &col,
+ std::vector<OnodeRef> &onodes,
+ ceph::os::Transaction::iterator &i)
+{
+ auto get_onode = [&onodes](size_t i) -> OnodeRef& {
+ ceph_assert(i < onodes.size());
+ return onodes[i];
+ };
+
+ using ceph::os::Transaction;
+ try {
+ switch (auto op = i.decode_op(); op->op) {
+ case Transaction::OP_NOP:
+ return write_ertr::now();
+ case Transaction::OP_REMOVE:
+ {
+ return _remove(trans, get_onode(op->oid));
+ }
+ break;
+ case Transaction::OP_TOUCH:
+ {
+ return _touch(trans, get_onode(op->oid));
+ }
+ break;
+ case Transaction::OP_WRITE:
+ {
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ uint32_t fadvise_flags = i.get_fadvise_flags();
+ ceph::bufferlist bl;
+ i.decode_bl(bl);
+ return _write(trans, get_onode(op->oid), off, len, bl, fadvise_flags);
+ }
+ break;
+ case Transaction::OP_TRUNCATE:
+ {
+ uint64_t off = op->off;
+ return _truncate(trans, get_onode(op->oid), off);
+ }
+ break;
+ case Transaction::OP_SETATTR:
+ {
+ std::string name = i.decode_string();
+ ceph::bufferlist bl;
+ i.decode_bl(bl);
+ std::map<std::string, bufferptr> to_set;
+ to_set[name] = bufferptr(bl.c_str(), bl.length());
+ return _setattrs(trans, get_onode(op->oid), to_set);
+ }
+ break;
+ case Transaction::OP_MKCOLL:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ return _create_collection(trans, cid, op->split_bits);
+ }
+ break;
+ case Transaction::OP_OMAP_SETKEYS:
+ {
+ std::map<std::string, ceph::bufferlist> aset;
+ i.decode_attrset(aset);
+ return _omap_set_values(trans, get_onode(op->oid), std::move(aset));
+ }
+ break;
+ case Transaction::OP_OMAP_SETHEADER:
+ {
+ ceph::bufferlist bl;
+ i.decode_bl(bl);
+ return _omap_set_header(trans, get_onode(op->oid), bl);
+ }
+ break;
+ case Transaction::OP_OMAP_RMKEYS:
+ {
+ omap_keys_t keys;
+ i.decode_keyset(keys);
+ return _omap_rmkeys(trans, get_onode(op->oid), keys);
+ }
+ break;
+ case Transaction::OP_OMAP_RMKEYRANGE:
+ {
+ std::string first, last;
+ first = i.decode_string();
+ last = i.decode_string();
+ return _omap_rmkeyrange(trans, get_onode(op->oid), first, last);
+ }
+ break;
+ case Transaction::OP_COLL_HINT:
+ {
+ ceph::bufferlist hint;
+ i.decode_bl(hint);
+ return write_ertr::now();
+ }
+ default:
+ logger().error("bad op {}", static_cast<unsigned>(op->op));
+ return crimson::ct_error::input_output_error::make();
+ }
+ } catch (std::exception &e) {
+ logger().error("{} got exception {}", __func__, e.what());
+ return crimson::ct_error::input_output_error::make();
+ }
+}
+
+SeaStore::write_ertr::future<> SeaStore::_remove(
+ TransactionRef &trans,
+ OnodeRef &onode)
+{
+ logger().debug("{} onode={}",
+ __func__, *onode);
+ return write_ertr::now();
+}
+
+SeaStore::write_ertr::future<> SeaStore::_touch(
+ TransactionRef &trans,
+ OnodeRef &onode)
+{
+ logger().debug("{} onode={}",
+ __func__, *onode);
+ return write_ertr::now();
+}
+
+SeaStore::write_ertr::future<> SeaStore::_write(
+ TransactionRef &trans,
+ OnodeRef &onode,
+ uint64_t offset, size_t len, const ceph::bufferlist& bl,
+ uint32_t fadvise_flags)
+{
+ logger().debug("{}: {} {} ~ {}",
+ __func__, *onode, offset, len);
+ assert(len == bl.length());
+
+/*
+ return onode_manager->get_or_create_onode(cid, oid).safe_then([=, &bl](auto ref) {
+ return;
+ }).handle_error(
+ crimson::ct_error::enoent::handle([]() {
+ return;
+ }),
+ OnodeManager::open_ertr::pass_further{}
+ );
+ */
+ return write_ertr::now();
+}
+
+SeaStore::write_ertr::future<> SeaStore::_omap_set_values(
+ TransactionRef &trans,
+ OnodeRef &onode,
+ std::map<std::string, ceph::bufferlist> &&aset)
+{
+ logger().debug(
+ "{}: {} {} keys",
+ __func__, *onode, aset.size());
+
+ return write_ertr::now();
+}
+
+SeaStore::write_ertr::future<> SeaStore::_omap_set_header(
+ TransactionRef &trans,
+ OnodeRef &onode,
+ const ceph::bufferlist &header)
+{
+ logger().debug(
+ "{}: {} {} bytes",
+ __func__, *onode, header.length());
+ return write_ertr::now();
+}
+
+SeaStore::write_ertr::future<> SeaStore::_omap_rmkeys(
+ TransactionRef &trans,
+ OnodeRef &onode,
+ const omap_keys_t& aset)
+{
+ logger().debug(
+ "{} {} {} keys",
+ __func__, *onode, aset.size());
+ return write_ertr::now();
+}
+
+SeaStore::write_ertr::future<> SeaStore::_omap_rmkeyrange(
+ TransactionRef &trans,
+ OnodeRef &onode,
+ const std::string &first,
+ const std::string &last)
+{
+ logger().debug(
+ "{} {} first={} last={}",
+ __func__, *onode, first, last);
+ return write_ertr::now();
+}
+
+SeaStore::write_ertr::future<> SeaStore::_truncate(
+ TransactionRef &trans,
+ OnodeRef &onode,
+ uint64_t size)
+{
+ logger().debug("{} onode={} size={}",
+ __func__, *onode, size);
+ return write_ertr::now();
+}
+
+SeaStore::write_ertr::future<> SeaStore::_setattrs(
+ TransactionRef &trans,
+ OnodeRef &onode,
+ std::map<std::string,bufferptr>& aset)
+{
+ logger().debug("{} onode={}",
+ __func__, *onode);
+ return write_ertr::now();
+}
+
+SeaStore::write_ertr::future<> SeaStore::_create_collection(
+ TransactionRef &trans,
+ const coll_t& cid, int bits)
+{
+ return write_ertr::now();
+}
+
+boost::intrusive_ptr<SeastoreCollection> SeaStore::_get_collection(const coll_t& cid)
+{
+ return new SeastoreCollection{cid};
+}
+
+seastar::future<> SeaStore::write_meta(const std::string& key,
+ const std::string& value)
+{
+ return seastar::make_ready_future<>();
+}
+
+seastar::future<std::tuple<int, std::string>> SeaStore::read_meta(const std::string& key)
+{
+ return seastar::make_ready_future<std::tuple<int, std::string>>(
+ std::make_tuple(0, std::string{}));
+}
+
+uuid_d SeaStore::get_fsid() const
+{
+ return osd_fsid;
+}
+
+}
diff --git a/src/crimson/os/seastore/seastore.h b/src/crimson/os/seastore/seastore.h
new file mode 100644
index 000000000..798442c34
--- /dev/null
+++ b/src/crimson/os/seastore/seastore.h
@@ -0,0 +1,181 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include <map>
+#include <typeinfo>
+#include <vector>
+
+#include <optional>
+#include <seastar/core/future.hh>
+
+#include "osd/osd_types.h"
+#include "include/uuid.h"
+
+#include "os/Transaction.h"
+#include "crimson/os/seastore/segment_cleaner.h"
+#include "crimson/os/futurized_store.h"
+#include "transaction.h"
+
+namespace crimson::os::seastore {
+
+class SeastoreCollection;
+class SegmentManager;
+class OnodeManager;
+class Onode;
+using OnodeRef = boost::intrusive_ptr<Onode>;
+class Journal;
+class LBAManager;
+class TransactionManager;
+class Cache;
+
+class SeaStore final : public FuturizedStore {
+ uuid_d osd_fsid;
+
+public:
+
+ SeaStore(const std::string& path);
+ ~SeaStore() final;
+
+ seastar::future<> stop() final;
+ seastar::future<> mount() final;
+ seastar::future<> umount() final;
+
+ seastar::future<> mkfs(uuid_d new_osd_fsid) final;
+ seastar::future<store_statfs_t> stat() const final;
+
+ read_errorator::future<ceph::bufferlist> read(
+ CollectionRef c,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ uint32_t op_flags = 0) final;
+ read_errorator::future<ceph::bufferlist> readv(
+ CollectionRef c,
+ const ghobject_t& oid,
+ interval_set<uint64_t>& m,
+ uint32_t op_flags = 0) final;
+ get_attr_errorator::future<ceph::bufferptr> get_attr(
+ CollectionRef c,
+ const ghobject_t& oid,
+ std::string_view name) const final;
+ get_attrs_ertr::future<attrs_t> get_attrs(
+ CollectionRef c,
+ const ghobject_t& oid) final;
+
+ seastar::future<struct stat> stat(
+ CollectionRef c,
+ const ghobject_t& oid) final;
+
+ read_errorator::future<omap_values_t> omap_get_values(
+ CollectionRef c,
+ const ghobject_t& oid,
+ const omap_keys_t& keys) final;
+
+ /// Retrieves paged set of values > start (if present)
+ read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values(
+ CollectionRef c, ///< [in] collection
+ const ghobject_t &oid, ///< [in] oid
+ const std::optional<std::string> &start ///< [in] start, empty for begin
+ ) final; ///< @return <done, values> values.empty() iff done
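+ // Usage sketch for the paged variant above (hypothetical caller; awaiting and
+ // error handling elided): page through all omap values by feeding the last
+ // returned key back in as the next start.
+ //   std::optional<std::string> start;            // empty => begin
+ //   bool done = false;
+ //   while (!done) {
+ //     std::tie(done, values) = /* await */ omap_get_values(c, oid, start);
+ //     if (!values.empty())
+ //       start = values.rbegin()->first;          // resume after the last key
+ //   }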
+
+ read_errorator::future<bufferlist> omap_get_header(
+ CollectionRef c,
+ const ghobject_t& oid) final;
+
+ seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects(
+ CollectionRef c,
+ const ghobject_t& start,
+ const ghobject_t& end,
+ uint64_t limit) const final;
+
+ seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final;
+ seastar::future<CollectionRef> open_collection(const coll_t& cid) final;
+ seastar::future<std::vector<coll_t>> list_collections() final;
+
+ seastar::future<> do_transaction(
+ CollectionRef ch,
+ ceph::os::Transaction&& txn) final;
+
+ seastar::future<OmapIteratorRef> get_omap_iterator(
+ CollectionRef ch,
+ const ghobject_t& oid) final;
+ seastar::future<std::map<uint64_t, uint64_t>> fiemap(
+ CollectionRef ch,
+ const ghobject_t& oid,
+ uint64_t off,
+ uint64_t len) final;
+
+ seastar::future<> write_meta(const std::string& key,
+ const std::string& value) final;
+ seastar::future<std::tuple<int, std::string>> read_meta(const std::string& key) final;
+ uuid_d get_fsid() const final;
+
+ unsigned get_max_attr_name_length() const final {
+ return 256;
+ }
+
+private:
+ std::unique_ptr<SegmentManager> segment_manager;
+ std::unique_ptr<SegmentCleaner> segment_cleaner;
+ std::unique_ptr<Cache> cache;
+ std::unique_ptr<Journal> journal;
+ std::unique_ptr<LBAManager> lba_manager;
+ std::unique_ptr<TransactionManager> transaction_manager;
+ std::unique_ptr<OnodeManager> onode_manager;
+
+
+ using write_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ write_ertr::future<> _do_transaction_step(
+ TransactionRef &trans,
+ CollectionRef &col,
+ std::vector<OnodeRef> &onodes,
+ ceph::os::Transaction::iterator &i);
+
+ write_ertr::future<> _remove(
+ TransactionRef &trans,
+ OnodeRef &onode);
+ write_ertr::future<> _touch(
+ TransactionRef &trans,
+ OnodeRef &onode);
+ write_ertr::future<> _write(
+ TransactionRef &trans,
+ OnodeRef &onode,
+ uint64_t offset, size_t len, const ceph::bufferlist& bl,
+ uint32_t fadvise_flags);
+ write_ertr::future<> _omap_set_values(
+ TransactionRef &trans,
+ OnodeRef &onode,
+ std::map<std::string, ceph::bufferlist> &&aset);
+ write_ertr::future<> _omap_set_header(
+ TransactionRef &trans,
+ OnodeRef &onode,
+ const ceph::bufferlist &header);
+ write_ertr::future<> _omap_rmkeys(
+ TransactionRef &trans,
+ OnodeRef &onode,
+ const omap_keys_t& aset);
+ write_ertr::future<> _omap_rmkeyrange(
+ TransactionRef &trans,
+ OnodeRef &onode,
+ const std::string &first,
+ const std::string &last);
+ write_ertr::future<> _truncate(
+ TransactionRef &trans,
+ OnodeRef &onode, uint64_t size);
+ write_ertr::future<> _setattrs(
+ TransactionRef &trans,
+ OnodeRef &onode,
+ std::map<std::string,bufferptr>& aset);
+ write_ertr::future<> _create_collection(
+ TransactionRef &trans,
+ const coll_t& cid, int bits);
+
+ boost::intrusive_ptr<SeastoreCollection> _get_collection(const coll_t& cid);
+};
+
+}
diff --git a/src/crimson/os/seastore/seastore_types.cc b/src/crimson/os/seastore/seastore_types.cc
new file mode 100644
index 000000000..ff43b1e51
--- /dev/null
+++ b/src/crimson/os/seastore/seastore_types.cc
@@ -0,0 +1,105 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/os/seastore/seastore_types.h"
+
+namespace crimson::os::seastore {
+
+std::ostream &segment_to_stream(std::ostream &out, const segment_id_t &t)
+{
+ if (t == NULL_SEG_ID)
+ return out << "NULL_SEG";
+ else if (t == BLOCK_REL_SEG_ID)
+ return out << "BLOCK_REL_SEG";
+ else if (t == RECORD_REL_SEG_ID)
+ return out << "RECORD_REL_SEG";
+ else if (t == FAKE_SEG_ID)
+ return out << "FAKE_SEG";
+ else
+ return out << t;
+}
+
+std::ostream &offset_to_stream(std::ostream &out, const segment_off_t &t)
+{
+ if (t == NULL_SEG_OFF)
+ return out << "NULL_OFF";
+ else
+ return out << t;
+}
+
+std::ostream &operator<<(std::ostream &out, const paddr_t &rhs)
+{
+ out << "paddr_t<";
+ segment_to_stream(out, rhs.segment);
+ out << ", ";
+ offset_to_stream(out, rhs.offset);
+ return out << ">";
+}
+
+std::ostream &operator<<(std::ostream &out, const journal_seq_t &seq)
+{
+ return out << "journal_seq_t(segment_seq="
+ << seq.segment_seq << ", offset="
+ << seq.offset
+ << ")";
+}
+
+std::ostream &operator<<(std::ostream &out, extent_types_t t)
+{
+ switch (t) {
+ case extent_types_t::ROOT:
+ return out << "ROOT";
+ case extent_types_t::LADDR_INTERNAL:
+ return out << "LADDR_INTERNAL";
+ case extent_types_t::LADDR_LEAF:
+ return out << "LADDR_LEAF";
+ case extent_types_t::EXTMAP_INNER:
+ return out << "EXTMAP_INNER";
+ case extent_types_t::EXTMAP_LEAF:
+ return out << "EXTMAP_LEAF";
+ case extent_types_t::ONODE_BLOCK_STAGED:
+ return out << "ONODE_BLOCK_STAGED";
+ case extent_types_t::TEST_BLOCK:
+ return out << "TEST_BLOCK";
+ case extent_types_t::TEST_BLOCK_PHYSICAL:
+ return out << "TEST_BLOCK_PHYSICAL";
+ case extent_types_t::NONE:
+ return out << "NONE";
+ default:
+ return out << "UNKNOWN";
+ }
+}
+
+std::ostream &operator<<(std::ostream &out, const laddr_list_t &rhs)
+{
+ bool first = false;
+ for (auto &i: rhs) {
+ out << (first ? ',' : '[') << '(' << i.first << ',' << i.second << ')';
+ first = true;
+ }
+ return out << ']';
+}
+std::ostream &operator<<(std::ostream &out, const paddr_list_t &rhs)
+{
+ bool first = false;
+ for (auto &i: rhs) {
+ out << (first ? ',' : '[') << '(' << i.first << ',' << i.second << ')';
+ first = true;
+ }
+ return out << ']';
+}
+
+std::ostream &operator<<(std::ostream &lhs, const delta_info_t &rhs)
+{
+ return lhs << "delta_info_t("
+ << "type: " << rhs.type
+ << ", paddr: " << rhs.paddr
+ << ", laddr: " << rhs.laddr
+ << ", prev_crc: " << rhs.prev_crc
+ << ", final_crc: " << rhs.final_crc
+ << ", length: " << rhs.length
+ << ", pversion: " << rhs.pversion
+ << ")";
+}
+
+}
diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h
new file mode 100644
index 000000000..cb8480268
--- /dev/null
+++ b/src/crimson/os/seastore/seastore_types.h
@@ -0,0 +1,369 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <limits>
+#include <iostream>
+
+#include "include/byteorder.h"
+#include "include/denc.h"
+#include "include/buffer.h"
+#include "include/cmp.h"
+#include "include/uuid.h"
+
+namespace crimson::os::seastore {
+
+using depth_t = int32_t;
+using depth_le_t = ceph_les32;
+
+using checksum_t = uint32_t;
+
+// Immutable metadata for seastore to set at mkfs time
+struct seastore_meta_t {
+ uuid_d seastore_id;
+
+ DENC(seastore_meta_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.seastore_id, p);
+ DENC_FINISH(p);
+ }
+};
+
+// Identifies segment location on disk, see SegmentManager,
+using segment_id_t = uint32_t;
+constexpr segment_id_t NULL_SEG_ID =
+ std::numeric_limits<segment_id_t>::max() - 1;
+/* Used to denote relative paddr_t */
+constexpr segment_id_t RECORD_REL_SEG_ID =
+ std::numeric_limits<segment_id_t>::max() - 2;
+constexpr segment_id_t BLOCK_REL_SEG_ID =
+ std::numeric_limits<segment_id_t>::max() - 3;
+
+// for tests which generate fake paddrs
+constexpr segment_id_t FAKE_SEG_ID =
+ std::numeric_limits<segment_id_t>::max() - 4;
+
+std::ostream &segment_to_stream(std::ostream &, const segment_id_t &t);
+
+// Offset within a segment on disk, see SegmentManager
+// may be negative for relative offsets
+using segment_off_t = int32_t;
+constexpr segment_off_t NULL_SEG_OFF =
+ std::numeric_limits<segment_off_t>::max();
+
+std::ostream &offset_to_stream(std::ostream &, const segment_off_t &t);
+
+/* Monotonically increasing segment seq, uniquely identifies
+ * the incarnation of a segment */
+using segment_seq_t = uint32_t;
+static constexpr segment_seq_t NULL_SEG_SEQ =
+ std::numeric_limits<segment_seq_t>::max();
+
+// Offset of delta within a record
+using record_delta_idx_t = uint32_t;
+constexpr record_delta_idx_t NULL_DELTA_IDX =
+ std::numeric_limits<record_delta_idx_t>::max();
+
+/**
+ * paddr_t
+ *
+ * <segment, offset> offset on disk, see SegmentManager
+ *
+ * May be absolute, record_relative, or block_relative.
+ *
+ * Blocks get read independently of the surrounding record,
+ * so paddrs embedded directly within a block need to refer
+ * to other blocks within the same record by a block_relative
+ * addr relative to the block's own offset. By contrast,
+ * deltas to existing blocks need to use record_relative
+ * addrs relative to the first block of the record.
+ *
+ * Fresh extents during a transaction are referred to by
+ * record_relative paddrs.
+ */
+struct paddr_t {
+ segment_id_t segment = NULL_SEG_ID;
+ segment_off_t offset = NULL_SEG_OFF;
+
+ bool is_relative() const {
+ return segment == RECORD_REL_SEG_ID ||
+ segment == BLOCK_REL_SEG_ID;
+ }
+
+ bool is_record_relative() const {
+ return segment == RECORD_REL_SEG_ID;
+ }
+
+ bool is_block_relative() const {
+ return segment == BLOCK_REL_SEG_ID;
+ }
+
+ paddr_t add_offset(segment_off_t o) const {
+ return paddr_t{segment, offset + o};
+ }
+
+ paddr_t add_relative(paddr_t o) const {
+ assert(o.is_relative());
+ return paddr_t{segment, offset + o.offset};
+ }
+
+ paddr_t add_block_relative(paddr_t o) const {
+ // special version mainly for documentation purposes
+ assert(o.is_block_relative());
+ return add_relative(o);
+ }
+
+ paddr_t add_record_relative(paddr_t o) const {
+ // special version mainly for documentation purposes
+ assert(o.is_record_relative());
+ return add_relative(o);
+ }
+
+ /**
+ * paddr_t::operator-
+ *
+ * Only defined for record_relative paddr_ts. Yields a
+ * block_relative address.
+ */
+ paddr_t operator-(paddr_t rhs) const {
+ assert(rhs.is_relative() && is_relative());
+ assert(rhs.segment == segment);
+ return paddr_t{
+ BLOCK_REL_SEG_ID,
+ offset - rhs.offset
+ };
+ }
+
+ /**
+ * maybe_relative_to
+ *
+ * Helper for the case where an in-memory paddr_t may be
+ * either block_relative or absolute (not record_relative).
+ *
+ * base must be either absolute or record_relative.
+ */
+ paddr_t maybe_relative_to(paddr_t base) const {
+ assert(!base.is_block_relative());
+ if (is_block_relative())
+ return base.add_block_relative(*this);
+ else
+ return *this;
+ }
+
+ DENC(paddr_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.segment, p);
+ denc(v.offset, p);
+ DENC_FINISH(p);
+ }
+};
+WRITE_CMP_OPERATORS_2(paddr_t, segment, offset)
+WRITE_EQ_OPERATORS_2(paddr_t, segment, offset)
+constexpr paddr_t P_ADDR_NULL = paddr_t{};
+constexpr paddr_t P_ADDR_MIN = paddr_t{0, 0};
+constexpr paddr_t make_record_relative_paddr(segment_off_t off) {
+ return paddr_t{RECORD_REL_SEG_ID, off};
+}
+constexpr paddr_t make_block_relative_paddr(segment_off_t off) {
+ return paddr_t{BLOCK_REL_SEG_ID, off};
+}
+constexpr paddr_t make_fake_paddr(segment_off_t off) {
+ return paddr_t{FAKE_SEG_ID, off};
+}
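+
+/*
+ * A small worked example of the relative-address arithmetic above (numbers
+ * are made up for illustration). Suppose a record is committed at absolute
+ * paddr_t{1, 4096} and contains a fresh block at record-relative offset 8192:
+ *
+ *   paddr_t base  = paddr_t{1, 4096};                  // absolute
+ *   paddr_t fresh = make_record_relative_paddr(8192);  // {RECORD_REL_SEG_ID, 8192}
+ *   paddr_t abs   = base.add_record_relative(fresh);   // {1, 12288}
+ *
+ * A pointer embedded in a block refers to a sibling block of the same record
+ * by a block-relative offset, resolved against the block's own absolute
+ * address with maybe_relative_to():
+ *
+ *   paddr_t sibling  = make_block_relative_paddr(4096);
+ *   paddr_t resolved = sibling.maybe_relative_to(abs);  // {1, 16384}
+ */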
+
+struct paddr_le_t {
+ ceph_le32 segment = init_le32(NULL_SEG_ID);
+ ceph_les32 offset = init_les32(NULL_SEG_OFF);
+
+ paddr_le_t() = default;
+ paddr_le_t(ceph_le32 segment, ceph_les32 offset)
+ : segment(segment), offset(offset) {}
+ paddr_le_t(segment_id_t segment, segment_off_t offset)
+ : segment(init_le32(segment)), offset(init_les32(offset)) {}
+ paddr_le_t(const paddr_t &addr) : paddr_le_t(addr.segment, addr.offset) {}
+
+ operator paddr_t() const {
+ return paddr_t{segment, offset};
+ }
+};
+
+std::ostream &operator<<(std::ostream &out, const paddr_t &rhs);
+
+using objaddr_t = uint32_t;
+constexpr objaddr_t OBJ_ADDR_MIN = std::numeric_limits<objaddr_t>::min();
+
+/* Monotonically increasing identifier for the location of a
+ * journal_record.
+ */
+struct journal_seq_t {
+ segment_seq_t segment_seq = 0;
+ paddr_t offset;
+
+ DENC(journal_seq_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.segment_seq, p);
+ denc(v.offset, p);
+ DENC_FINISH(p);
+ }
+};
+WRITE_CMP_OPERATORS_2(journal_seq_t, segment_seq, offset)
+WRITE_EQ_OPERATORS_2(journal_seq_t, segment_seq, offset)
+
+std::ostream &operator<<(std::ostream &out, const journal_seq_t &seq);
+
+static constexpr journal_seq_t NO_DELTAS = journal_seq_t{
+ NULL_SEG_SEQ,
+ P_ADDR_NULL
+};
+
+// logical addr, see LBAManager, TransactionManager
+using laddr_t = uint64_t;
+constexpr laddr_t L_ADDR_MIN = std::numeric_limits<laddr_t>::min();
+constexpr laddr_t L_ADDR_MAX = std::numeric_limits<laddr_t>::max();
+constexpr laddr_t L_ADDR_NULL = std::numeric_limits<laddr_t>::max();
+constexpr laddr_t L_ADDR_ROOT = std::numeric_limits<laddr_t>::max() - 1;
+constexpr laddr_t L_ADDR_LBAT = std::numeric_limits<laddr_t>::max() - 2;
+
+struct laddr_le_t {
+ ceph_le64 laddr = init_le64(L_ADDR_NULL);
+
+ laddr_le_t() = default;
+ laddr_le_t(const laddr_le_t &) = default;
+ explicit laddr_le_t(const laddr_t &addr)
+ : laddr(init_le64(addr)) {}
+
+ operator laddr_t() const {
+ return laddr_t(laddr);
+ }
+ laddr_le_t& operator=(laddr_t addr) {
+ ceph_le64 val;
+ val = addr;
+ laddr = val;
+ return *this;
+ }
+};
+
+// logical offset, see LBAManager, TransactionManager
+using extent_len_t = uint32_t;
+constexpr extent_len_t EXTENT_LEN_MAX =
+ std::numeric_limits<extent_len_t>::max();
+
+using extent_len_le_t = ceph_le32;
+inline extent_len_le_t init_extent_len_le_t(extent_len_t len) {
+ return init_le32(len);
+}
+
+struct laddr_list_t : std::list<std::pair<laddr_t, extent_len_t>> {
+ template <typename... T>
+ laddr_list_t(T&&... args)
+ : std::list<std::pair<laddr_t, extent_len_t>>(std::forward<T>(args)...) {}
+};
+struct paddr_list_t : std::list<std::pair<paddr_t, extent_len_t>> {
+ template <typename... T>
+ paddr_list_t(T&&... args)
+ : std::list<std::pair<paddr_t, extent_len_t>>(std::forward<T>(args)...) {}
+};
+
+std::ostream &operator<<(std::ostream &out, const laddr_list_t &rhs);
+std::ostream &operator<<(std::ostream &out, const paddr_list_t &rhs);
+
+/* identifies type of extent, used for interpreting deltas, managing
+ * writeback.
+ *
+ * Note that any new extent type needs to be added to
+ * Cache::get_extent_by_type in cache.cc
+ */
+enum class extent_types_t : uint8_t {
+ ROOT = 0,
+ LADDR_INTERNAL = 1,
+ LADDR_LEAF = 2,
+ ONODE_BLOCK = 3,
+ EXTMAP_INNER = 4,
+ EXTMAP_LEAF = 5,
+ ONODE_BLOCK_STAGED = 6,
+
+ // Test Block Types
+ TEST_BLOCK = 0xF0,
+ TEST_BLOCK_PHYSICAL = 0xF1,
+
+ // None
+ NONE = 0xFF
+};
+
+inline bool is_logical_type(extent_types_t type) {
+ switch (type) {
+ case extent_types_t::ROOT:
+ case extent_types_t::LADDR_INTERNAL:
+ case extent_types_t::LADDR_LEAF:
+ return false;
+ default:
+ return true;
+ }
+}
+
+std::ostream &operator<<(std::ostream &out, extent_types_t t);
+
+/* description of a new physical extent */
+struct extent_t {
+ extent_types_t type; ///< type of extent
+ laddr_t addr; ///< laddr of extent (L_ADDR_NULL for non-logical)
+ ceph::bufferlist bl; ///< payload, bl.length() == length, aligned
+};
+
+using extent_version_t = uint32_t;
+constexpr extent_version_t EXTENT_VERSION_NULL = 0;
+
+/* description of a mutation to a physical extent */
+struct delta_info_t {
+ extent_types_t type = extent_types_t::NONE; ///< delta type
+ paddr_t paddr; ///< physical address
+ laddr_t laddr = L_ADDR_NULL; ///< logical address
+ uint32_t prev_crc = 0;
+ uint32_t final_crc = 0;
+ segment_off_t length = NULL_SEG_OFF; ///< extent length
+ extent_version_t pversion; ///< prior version
+ ceph::bufferlist bl; ///< payload
+
+ DENC(delta_info_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.type, p);
+ denc(v.paddr, p);
+ denc(v.laddr, p);
+ denc(v.prev_crc, p);
+ denc(v.final_crc, p);
+ denc(v.length, p);
+ denc(v.pversion, p);
+ denc(v.bl, p);
+ DENC_FINISH(p);
+ }
+
+ bool operator==(const delta_info_t &rhs) const {
+ return (
+ type == rhs.type &&
+ paddr == rhs.paddr &&
+ laddr == rhs.laddr &&
+ prev_crc == rhs.prev_crc &&
+ final_crc == rhs.final_crc &&
+ length == rhs.length &&
+ pversion == rhs.pversion &&
+ bl == rhs.bl
+ );
+ }
+
+ friend std::ostream &operator<<(std::ostream &lhs, const delta_info_t &rhs);
+};
+
+std::ostream &operator<<(std::ostream &lhs, const delta_info_t &rhs);
+
+struct record_t {
+ std::vector<extent_t> extents;
+ std::vector<delta_info_t> deltas;
+};
+
+}
+
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::seastore_meta_t)
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::paddr_t)
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::journal_seq_t)
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::delta_info_t)
diff --git a/src/crimson/os/seastore/segment_cleaner.cc b/src/crimson/os/seastore/segment_cleaner.cc
new file mode 100644
index 000000000..3597c21df
--- /dev/null
+++ b/src/crimson/os/seastore/segment_cleaner.cc
@@ -0,0 +1,340 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/common/log.h"
+
+#include "crimson/os/seastore/transaction.h"
+#include "crimson/os/seastore/segment_cleaner.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+}
+
+namespace crimson::os::seastore {
+
+bool SpaceTrackerSimple::equals(const SpaceTrackerI &_other) const
+{
+ const auto &other = static_cast<const SpaceTrackerSimple&>(_other);
+
+ if (other.live_bytes_by_segment.size() != live_bytes_by_segment.size()) {
+ logger().error("{}: different segment counts, bug in test", __func__);
+ assert(0 == "segment counts should match");
+ return false;
+ }
+
+ bool all_match = true;
+ for (segment_id_t i = 0; i < live_bytes_by_segment.size(); ++i) {
+ if (other.live_bytes_by_segment[i] != live_bytes_by_segment[i]) {
+ all_match = false;
+ logger().debug(
+ "{}: segment_id {} live bytes mismatch *this: {}, other: {}",
+ __func__,
+ i,
+ live_bytes_by_segment[i],
+ other.live_bytes_by_segment[i]);
+ }
+ }
+ return all_match;
+}
+
+int64_t SpaceTrackerDetailed::SegmentMap::allocate(
+ segment_id_t segment,
+ segment_off_t offset,
+ extent_len_t len,
+ const extent_len_t block_size)
+{
+ assert(offset % block_size == 0);
+ assert(len % block_size == 0);
+
+ const auto b = (offset / block_size);
+ const auto e = (offset + len) / block_size;
+
+ bool error = false;
+ for (auto i = b; i < e; ++i) {
+ if (bitmap[i]) {
+ if (!error) {
+ logger().error(
+ "SegmentMap::allocate found allocated in {}, {} ~ {}",
+ segment,
+ offset,
+ len);
+ error = true;
+ }
+ logger().debug(
+ "SegmentMap::allocate block {} allocated",
+ i * block_size);
+ }
+ bitmap[i] = true;
+ }
+ return update_usage(len);
+}
+
+int64_t SpaceTrackerDetailed::SegmentMap::release(
+ segment_id_t segment,
+ segment_off_t offset,
+ extent_len_t len,
+ const extent_len_t block_size)
+{
+ assert(offset % block_size == 0);
+ assert(len % block_size == 0);
+
+ const auto b = (offset / block_size);
+ const auto e = (offset + len) / block_size;
+
+ bool error = false;
+ for (auto i = b; i < e; ++i) {
+ if (!bitmap[i]) {
+ if (!error) {
+ logger().error(
+ "SegmentMap::release found unallocated in {}, {} ~ {}",
+ segment,
+ offset,
+ len);
+ error = true;
+ }
+ logger().debug(
+ "SegmentMap::release block {} unallocated",
+ i * block_size);
+ }
+ bitmap[i] = false;
+ }
+ return update_usage(-(int64_t)len);
+}
+
+bool SpaceTrackerDetailed::equals(const SpaceTrackerI &_other) const
+{
+ const auto &other = static_cast<const SpaceTrackerDetailed&>(_other);
+
+ if (other.segment_usage.size() != segment_usage.size()) {
+ logger().error("{}: different segment counts, bug in test", __func__);
+ assert(0 == "segment counts should match");
+ return false;
+ }
+
+ bool all_match = true;
+ for (segment_id_t i = 0; i < segment_usage.size(); ++i) {
+ if (other.segment_usage[i].get_usage() != segment_usage[i].get_usage()) {
+ all_match = false;
+ logger().error(
+ "{}: segment_id {} live bytes mismatch *this: {}, other: {}",
+ __func__,
+ i,
+ segment_usage[i].get_usage(),
+ other.segment_usage[i].get_usage());
+ }
+ }
+ return all_match;
+}
+
+void SpaceTrackerDetailed::SegmentMap::dump_usage(extent_len_t block_size) const
+{
+ for (unsigned i = 0; i < bitmap.size(); ++i) {
+ if (bitmap[i]) {
+ logger().debug(" {} still live", i * block_size);
+ }
+ }
+}
+
+void SpaceTrackerDetailed::dump_usage(segment_id_t id) const
+{
+ logger().debug("SpaceTrackerDetailed::dump_usage {}", id);
+ segment_usage[id].dump_usage(block_size);
+}
+
+SegmentCleaner::get_segment_ret SegmentCleaner::get_segment()
+{
+ for (size_t i = 0; i < segments.size(); ++i) {
+ if (segments[i].is_empty()) {
+ mark_open(i);
+ logger().debug("{}: returning segment {}", __func__, i);
+ return get_segment_ret(
+ get_segment_ertr::ready_future_marker{},
+ i);
+ }
+ }
+ assert(0 == "out of space handling todo");
+ return get_segment_ret(
+ get_segment_ertr::ready_future_marker{},
+ 0);
+}
+
+void SegmentCleaner::update_journal_tail_target(journal_seq_t target)
+{
+ logger().debug(
+ "{}: {}",
+ __func__,
+ target);
+ assert(journal_tail_target == journal_seq_t() || target >= journal_tail_target);
+ if (journal_tail_target == journal_seq_t() || target > journal_tail_target) {
+ journal_tail_target = target;
+ }
+}
+
+void SegmentCleaner::update_journal_tail_committed(journal_seq_t committed)
+{
+ if (journal_tail_committed == journal_seq_t() ||
+ committed > journal_tail_committed) {
+ logger().debug(
+ "{}: update journal_tail_committed {}",
+ __func__,
+ committed);
+ journal_tail_committed = committed;
+ }
+ if (journal_tail_target == journal_seq_t() ||
+ committed > journal_tail_target) {
+ logger().debug(
+ "{}: update journal_tail_target {}",
+ __func__,
+ committed);
+ journal_tail_target = committed;
+ }
+}
+
+void SegmentCleaner::close_segment(segment_id_t segment)
+{
+ mark_closed(segment);
+}
+
+SegmentCleaner::do_immediate_work_ret SegmentCleaner::do_immediate_work(
+ Transaction &t)
+{
+ auto next_target = get_dirty_tail_limit();
+ logger().debug(
+ "{}: journal_tail_target={} get_dirty_tail_limit()={}",
+ __func__,
+ journal_tail_target,
+ next_target);
+
+ logger().debug(
+ "SegmentCleaner::do_immediate_work gc total {}, available {}, unavailable {}, used {} available_ratio {}, reclaim_ratio {}, bytes_to_gc_for_available {}, bytes_to_gc_for_reclaim {}",
+ get_total_bytes(),
+ get_available_bytes(),
+ get_unavailable_bytes(),
+ get_used_bytes(),
+ get_available_ratio(),
+ get_reclaim_ratio(),
+ get_immediate_bytes_to_gc_for_available(),
+ get_immediate_bytes_to_gc_for_reclaim());
+
+ auto dirty_fut = do_immediate_work_ertr::now();
+ if (journal_tail_target < next_target) {
+ dirty_fut = rewrite_dirty(t, next_target);
+ }
+ return dirty_fut.safe_then([=, &t] {
+ return do_gc(t, get_immediate_bytes_to_gc());
+ }).handle_error(
+ do_immediate_work_ertr::pass_further{},
+ crimson::ct_error::assert_all{}
+ );
+}
+
+SegmentCleaner::do_deferred_work_ret SegmentCleaner::do_deferred_work(
+ Transaction &t)
+{
+ return do_deferred_work_ret(
+ do_deferred_work_ertr::ready_future_marker{},
+ ceph::timespan());
+}
+
+SegmentCleaner::rewrite_dirty_ret SegmentCleaner::rewrite_dirty(
+ Transaction &t,
+ journal_seq_t limit)
+{
+ return ecb->get_next_dirty_extents(
+ limit
+ ).then([=, &t](auto dirty_list) {
+ if (dirty_list.empty()) {
+ return do_immediate_work_ertr::now();
+ } else {
+ update_journal_tail_target(dirty_list.front()->get_dirty_from());
+ }
+ return seastar::do_with(
+ std::move(dirty_list),
+ [this, &t](auto &dirty_list) {
+ return crimson::do_for_each(
+ dirty_list,
+ [this, &t](auto &e) {
+ logger().debug(
+ "SegmentCleaner::do_immediate_work cleaning {}",
+ *e);
+ return ecb->rewrite_extent(t, e);
+ });
+ });
+ });
+}
+
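+// GC pass (as implemented below): pick the closed, non-journal segment with
+// the fewest live bytes (get_next_gc_target), scan up to `bytes` worth of
+// extents from it starting at the persistent scan cursor, rewrite each extent
+// that is still live according to get_extent_if_live, and once the cursor
+// reaches the end of the segment mark it for release on transaction commit.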
+SegmentCleaner::do_gc_ret SegmentCleaner::do_gc(
+ Transaction &t,
+ size_t bytes)
+{
+ if (bytes == 0) {
+ return do_gc_ertr::now();
+ }
+
+ if (!scan_cursor) {
+ paddr_t next = P_ADDR_NULL;
+ next.segment = get_next_gc_target();
+ if (next == P_ADDR_NULL) {
+ logger().debug(
+ "SegmentCleaner::do_gc: no segments to gc");
+ return do_gc_ertr::now();
+ }
+ next.offset = 0;
+ scan_cursor =
+ std::make_unique<ExtentCallbackInterface::scan_extents_cursor>(
+ next);
+ logger().debug(
+ "SegmentCleaner::do_gc: starting gc on segment {}",
+ scan_cursor->get_offset().segment);
+ }
+
+ return ecb->scan_extents(
+ *scan_cursor,
+ bytes
+ ).safe_then([=, &t](auto addrs) {
+ return seastar::do_with(
+ std::move(addrs),
+ [=, &t](auto &addr_list) {
+ return crimson::do_for_each(
+ addr_list,
+ [=, &t](auto &addr_pair) {
+ auto &[addr, info] = addr_pair;
+ logger().debug(
+ "SegmentCleaner::do_gc: checking addr {}",
+ addr);
+ return ecb->get_extent_if_live(
+ t,
+ info.type,
+ addr,
+ info.addr,
+ info.len
+ ).safe_then([addr=addr, &t, this](CachedExtentRef ext) {
+ if (!ext) {
+ logger().debug(
+ "SegmentCleaner::do_gc: addr {} dead, skipping",
+ addr);
+ return ExtentCallbackInterface::rewrite_extent_ertr::now();
+ } else {
+ logger().debug(
+ "SegmentCleaner::do_gc: addr {} alive, gc'ing {}",
+ addr,
+ *ext);
+ }
+ return ecb->rewrite_extent(
+ t,
+ ext);
+ });
+ }).safe_then([&t, this] {
+ if (scan_cursor->is_complete()) {
+ t.mark_segment_to_release(scan_cursor->get_offset().segment);
+ scan_cursor.reset();
+ }
+ return ExtentCallbackInterface::release_segment_ertr::now();
+ });
+ });
+ });
+}
+
+}
diff --git a/src/crimson/os/seastore/segment_cleaner.h b/src/crimson/os/seastore/segment_cleaner.h
new file mode 100644
index 000000000..38ebd05bc
--- /dev/null
+++ b/src/crimson/os/seastore/segment_cleaner.h
@@ -0,0 +1,691 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/intrusive/set.hpp>
+
+#include "common/ceph_time.h"
+
+#include "crimson/os/seastore/cached_extent.h"
+#include "crimson/os/seastore/journal.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/segment_manager.h"
+
+namespace crimson::os::seastore {
+class Transaction;
+
+struct segment_info_t {
+ Segment::segment_state_t state = Segment::segment_state_t::EMPTY;
+
+ // Will be set (!= NULL_SEG_SEQ) for any segment in the current journal
+ segment_seq_t journal_segment_seq = NULL_SEG_SEQ;
+
+
+ bool is_in_journal(journal_seq_t tail_committed) const {
+ return journal_segment_seq != NULL_SEG_SEQ &&
+ tail_committed.segment_seq <= journal_segment_seq;
+ }
+
+ bool is_empty() const {
+ return state == Segment::segment_state_t::EMPTY;
+ }
+
+ bool is_closed() const {
+ return state == Segment::segment_state_t::CLOSED;
+ }
+
+ bool is_open() const {
+ return state == Segment::segment_state_t::OPEN;
+ }
+};
+
+class SpaceTrackerI {
+public:
+ virtual int64_t allocate(
+ segment_id_t segment,
+ segment_off_t offset,
+ extent_len_t len) = 0;
+
+ virtual int64_t release(
+ segment_id_t segment,
+ segment_off_t offset,
+ extent_len_t len) = 0;
+
+ virtual int64_t get_usage(
+ segment_id_t segment) const = 0;
+
+ virtual bool equals(const SpaceTrackerI &other) const = 0;
+
+ virtual std::unique_ptr<SpaceTrackerI> make_empty() const = 0;
+
+ virtual void dump_usage(segment_id_t) const = 0;
+
+ virtual void reset() = 0;
+
+ virtual ~SpaceTrackerI() = default;
+};
+using SpaceTrackerIRef = std::unique_ptr<SpaceTrackerI>;
+
+class SpaceTrackerSimple : public SpaceTrackerI {
+ // Tracks live space for each segment
+ std::vector<int64_t> live_bytes_by_segment;
+
+ int64_t update_usage(segment_id_t segment, int64_t delta) {
+ assert(segment < live_bytes_by_segment.size());
+ live_bytes_by_segment[segment] += delta;
+ assert(live_bytes_by_segment[segment] >= 0);
+ return live_bytes_by_segment[segment];
+ }
+public:
+ SpaceTrackerSimple(size_t num_segments)
+ : live_bytes_by_segment(num_segments, 0) {}
+
+ int64_t allocate(
+ segment_id_t segment,
+ segment_off_t offset,
+ extent_len_t len) final {
+ return update_usage(segment, len);
+ }
+
+ int64_t release(
+ segment_id_t segment,
+ segment_off_t offset,
+ extent_len_t len) final {
+ return update_usage(segment, -len);
+ }
+
+ int64_t get_usage(segment_id_t segment) const final {
+ assert(segment < live_bytes_by_segment.size());
+ return live_bytes_by_segment[segment];
+ }
+
+ void dump_usage(segment_id_t) const final {}
+
+ void reset() final {
+ for (auto &i: live_bytes_by_segment)
+ i = 0;
+ }
+
+ SpaceTrackerIRef make_empty() const final {
+ return SpaceTrackerIRef(
+ new SpaceTrackerSimple(live_bytes_by_segment.size()));
+ }
+
+ bool equals(const SpaceTrackerI &other) const;
+};
+
+class SpaceTrackerDetailed : public SpaceTrackerI {
+ class SegmentMap {
+ int64_t used = 0;
+ std::vector<bool> bitmap;
+
+ public:
+ SegmentMap(size_t blocks) : bitmap(blocks, false) {}
+
+ int64_t update_usage(int64_t delta) {
+ used += delta;
+ return used;
+ }
+
+ int64_t allocate(
+ segment_id_t segment,
+ segment_off_t offset,
+ extent_len_t len,
+ const extent_len_t block_size);
+
+ int64_t release(
+ segment_id_t segment,
+ segment_off_t offset,
+ extent_len_t len,
+ const extent_len_t block_size);
+
+ int64_t get_usage() const {
+ return used;
+ }
+
+ void dump_usage(extent_len_t block_size) const;
+
+ void reset() {
+ used = 0;
+ for (auto &&i: bitmap) {
+ i = false;
+ }
+ }
+ };
+ const size_t block_size;
+ const size_t segment_size;
+
+ // Tracks live space for each segment
+ std::vector<SegmentMap> segment_usage;
+
+public:
+ SpaceTrackerDetailed(size_t num_segments, size_t segment_size, size_t block_size)
+ : block_size(block_size),
+ segment_size(segment_size),
+ segment_usage(num_segments, segment_size / block_size) {}
+
+ int64_t allocate(
+ segment_id_t segment,
+ segment_off_t offset,
+ extent_len_t len) final {
+ assert(segment < segment_usage.size());
+ return segment_usage[segment].allocate(segment, offset, len, block_size);
+ }
+
+ int64_t release(
+ segment_id_t segment,
+ segment_off_t offset,
+ extent_len_t len) final {
+ assert(segment < segment_usage.size());
+ return segment_usage[segment].release(segment, offset, len, block_size);
+ }
+
+ int64_t get_usage(segment_id_t segment) const final {
+ assert(segment < segment_usage.size());
+ return segment_usage[segment].get_usage();
+ }
+
+ void dump_usage(segment_id_t seg) const final;
+
+ void reset() final {
+ for (auto &i: segment_usage)
+ i.reset();
+ }
+
+ SpaceTrackerIRef make_empty() const final {
+ return SpaceTrackerIRef(
+ new SpaceTrackerDetailed(
+ segment_usage.size(),
+ segment_size,
+ block_size));
+ }
+
+ bool equals(const SpaceTrackerI &other) const;
+};
+
+
+class SegmentCleaner : public JournalSegmentProvider {
+public:
+ /// Config
+ struct config_t {
+ size_t num_segments = 0;
+ size_t segment_size = 0;
+ size_t block_size = 0;
+ size_t target_journal_segments = 0;
+ size_t max_journal_segments = 0;
+
+ double reclaim_ratio_hard_limit = 0;
+ // don't apply reclaim ratio with available space below this
+ double reclaim_ratio_usage_min = 0;
+
+ double available_ratio_hard_limit = 0;
+
+ static config_t default_from_segment_manager(
+ SegmentManager &manager) {
+ return config_t{
+ manager.get_num_segments(),                      // num_segments
+ static_cast<size_t>(manager.get_segment_size()), // segment_size
+ static_cast<size_t>(manager.get_block_size()),   // block_size
+ 2,   // target_journal_segments
+ 4,   // max_journal_segments
+ .5,  // reclaim_ratio_hard_limit
+ .95, // reclaim_ratio_usage_min
+ .2   // available_ratio_hard_limit
+ };
+ }
+ };
+
+ /// Callback interface for querying and operating on segments
+ class ExtentCallbackInterface {
+ public:
+ virtual ~ExtentCallbackInterface() = default;
+ /**
+ * get_next_dirty_extent
+ *
+ * returns all extents with dirty_from < bound
+ */
+ using get_next_dirty_extents_ertr = crimson::errorator<>;
+ using get_next_dirty_extents_ret = get_next_dirty_extents_ertr::future<
+ std::vector<CachedExtentRef>>;
+ virtual get_next_dirty_extents_ret get_next_dirty_extents(
+ journal_seq_t bound ///< [in] return extents with dirty_from < bound
+ ) = 0;
+
+ /**
+ * rewrite_extent
+ *
+ * Updates t with operations moving the passed extents to a new
+ * segment. extent may be invalid, implementation must correctly
+ * handle finding the current instance if it is still alive and
+ * otherwise ignore it.
+ */
+ using rewrite_extent_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using rewrite_extent_ret = rewrite_extent_ertr::future<>;
+ virtual rewrite_extent_ret rewrite_extent(
+ Transaction &t,
+ CachedExtentRef extent) = 0;
+
+ /**
+ * get_extent_if_live
+ *
+ * Returns extent at specified location if still referenced by
+ * lba_manager and not removed by t.
+ *
+ * See TransactionManager::get_extent_if_live and
+ * LBAManager::get_physical_extent_if_live.
+ */
+ using get_extent_if_live_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using get_extent_if_live_ret = get_extent_if_live_ertr::future<
+ CachedExtentRef>;
+ virtual get_extent_if_live_ret get_extent_if_live(
+ Transaction &t,
+ extent_types_t type,
+ paddr_t addr,
+ laddr_t laddr,
+ segment_off_t len) = 0;
+
+ /**
+ * scan_extents
+ *
+ * Interface shim for Journal::scan_extents
+ */
+ using scan_extents_cursor = Journal::scan_valid_records_cursor;
+ using scan_extents_ertr = Journal::scan_extents_ertr;
+ using scan_extents_ret = Journal::scan_extents_ret;
+ virtual scan_extents_ret scan_extents(
+ scan_extents_cursor &cursor,
+ extent_len_t bytes_to_read) = 0;
+
+ /**
+ * release_segment
+ *
+ * Release segment.
+ */
+ using release_segment_ertr = SegmentManager::release_ertr;
+ using release_segment_ret = release_segment_ertr::future<>;
+ virtual release_segment_ret release_segment(
+ segment_id_t id) = 0;
+ };
+
+private:
+ const config_t config;
+
+ SpaceTrackerIRef space_tracker;
+ std::vector<segment_info_t> segments;
+ size_t empty_segments;
+ int64_t used_bytes = 0;
+ bool init_complete = false;
+
+ journal_seq_t journal_tail_target;
+ journal_seq_t journal_tail_committed;
+ journal_seq_t journal_head;
+
+ ExtentCallbackInterface *ecb = nullptr;
+
+public:
+ SegmentCleaner(config_t config, bool detailed = false)
+ : config(config),
+ space_tracker(
+ detailed ?
+ (SpaceTrackerI*)new SpaceTrackerDetailed(
+ config.num_segments,
+ config.segment_size,
+ config.block_size) :
+ (SpaceTrackerI*)new SpaceTrackerSimple(
+ config.num_segments)),
+ segments(config.num_segments),
+ empty_segments(config.num_segments) {}
+
+ get_segment_ret get_segment() final;
+
+ void close_segment(segment_id_t segment) final;
+
+ void set_journal_segment(
+ segment_id_t segment, segment_seq_t seq) final {
+ assert(segment < segments.size());
+ segments[segment].journal_segment_seq = seq;
+ assert(segments[segment].is_open());
+ }
+
+ journal_seq_t get_journal_tail_target() const final {
+ return journal_tail_target;
+ }
+
+ void update_journal_tail_committed(journal_seq_t committed) final;
+
+ void update_journal_tail_target(journal_seq_t target);
+
+ void init_journal_tail(journal_seq_t tail) {
+ journal_tail_target = journal_tail_committed = tail;
+ }
+
+ void set_journal_head(journal_seq_t head) {
+ assert(journal_head == journal_seq_t() || head >= journal_head);
+ journal_head = head;
+ }
+
+ void init_mark_segment_closed(segment_id_t segment, segment_seq_t seq) final {
+ crimson::get_logger(ceph_subsys_filestore).debug(
+ "SegmentCleaner::init_mark_segment_closed: segment {}, seq {}",
+ segment,
+ seq);
+ mark_closed(segment);
+ segments[segment].journal_segment_seq = seq;
+ }
+
+ segment_seq_t get_seq(segment_id_t id) final {
+ return segments[id].journal_segment_seq;
+ }
+
+ void mark_segment_released(segment_id_t segment) {
+ return mark_empty(segment);
+ }
+
+ void mark_space_used(
+ paddr_t addr,
+ extent_len_t len,
+ bool init_scan = false) {
+ assert(addr.segment < segments.size());
+
+ if (!init_scan && !init_complete)
+ return;
+
+ if (!init_scan) {
+ assert(segments[addr.segment].state == Segment::segment_state_t::OPEN);
+ }
+
+ used_bytes += len;
+ [[maybe_unused]] auto ret = space_tracker->allocate(
+ addr.segment,
+ addr.offset,
+ len);
+ assert(ret > 0);
+ }
+
+ void mark_space_free(
+ paddr_t addr,
+ extent_len_t len) {
+ if (!init_complete)
+ return;
+
+ used_bytes -= len;
+ assert(addr.segment < segments.size());
+
+ [[maybe_unused]] auto ret = space_tracker->release(
+ addr.segment,
+ addr.offset,
+ len);
+ assert(ret >= 0);
+ }
+
+ segment_id_t get_next_gc_target() const {
+ segment_id_t ret = NULL_SEG_ID;
+ int64_t least_live_bytes = std::numeric_limits<int64_t>::max();
+ for (segment_id_t i = 0; i < segments.size(); ++i) {
+ if (segments[i].is_closed() &&
+ !segments[i].is_in_journal(journal_tail_committed) &&
+ space_tracker->get_usage(i) < least_live_bytes) {
+ ret = i;
+ least_live_bytes = space_tracker->get_usage(i);
+ }
+ }
+ if (ret != NULL_SEG_ID) {
+ crimson::get_logger(ceph_subsys_filestore).debug(
+ "SegmentCleaner::get_next_gc_target: segment {} seq {}",
+ ret,
+ segments[ret].journal_segment_seq);
+ }
+ return ret;
+ }
+
+ SpaceTrackerIRef get_empty_space_tracker() const {
+ return space_tracker->make_empty();
+ }
+
+ void complete_init() { init_complete = true; }
+
+ void set_extent_callback(ExtentCallbackInterface *cb) {
+ ecb = cb;
+ }
+
+ bool debug_check_space(const SpaceTrackerI &tracker) {
+ return space_tracker->equals(tracker);
+ }
+
+ /**
+ * do_immediate_work
+ *
+ * Should be invoked prior to submission of any transaction,
+ * will piggy-back work required to maintain deferred work
+ * constraints.
+ */
+ using do_immediate_work_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using do_immediate_work_ret = do_immediate_work_ertr::future<>;
+ do_immediate_work_ret do_immediate_work(
+ Transaction &t);
+
+
+ /**
+ * do_deferred_work
+ *
+ * Should be called at idle times -- will perform background
+ * operations based on deferred work constraints.
+ *
+ * If the returned timespan is non-zero, the caller should wait until that
+ * timespan has elapsed, or a foreground operation occurs, before calling
+ * do_deferred_work again.
+ */
+ using do_deferred_work_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using do_deferred_work_ret = do_deferred_work_ertr::future<
+ ceph::timespan
+ >;
+ do_deferred_work_ret do_deferred_work(
+ Transaction &t);
+
+private:
+
+ // journal status helpers
+
+ /**
+ * rewrite_dirty
+ *
+ * Writes out dirty blocks dirtied earlier than limit.
+ */
+ using rewrite_dirty_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using rewrite_dirty_ret = rewrite_dirty_ertr::future<>;
+ rewrite_dirty_ret rewrite_dirty(
+ Transaction &t,
+ journal_seq_t limit);
+
+ journal_seq_t get_dirty_tail() const {
+ auto ret = journal_head;
+ ret.segment_seq -= std::min(
+ static_cast<size_t>(ret.segment_seq),
+ config.target_journal_segments);
+ return ret;
+ }
+
+ journal_seq_t get_dirty_tail_limit() const {
+ auto ret = journal_head;
+ ret.segment_seq -= std::min(
+ static_cast<size_t>(ret.segment_seq),
+ config.max_journal_segments);
+ return ret;
+ }
+
+ // GC status helpers
+ std::unique_ptr<ExtentCallbackInterface::scan_extents_cursor> scan_cursor;
+
+ /**
+ * do_gc
+ *
+ * Performs bytes worth of gc work on t.
+ */
+ using do_gc_ertr = SegmentManager::read_ertr;
+ using do_gc_ret = do_gc_ertr::future<>;
+ do_gc_ret do_gc(
+ Transaction &t,
+ size_t bytes);
+
+ size_t get_bytes_used_current_segment() const {
+ assert(journal_head != journal_seq_t());
+ return journal_head.offset.offset;
+ }
+
+ size_t get_bytes_available_current_segment() const {
+ return config.segment_size - get_bytes_used_current_segment();
+ }
+
+ /**
+ * get_bytes_scanned_current_segment
+ *
+ * Returns the number of bytes from the current gc segment that
+ * have been scanned.
+ */
+ size_t get_bytes_scanned_current_segment() const {
+ if (!scan_cursor)
+ return 0;
+
+ return scan_cursor->get_offset().offset;
+ }
+
+ size_t get_available_bytes() const {
+ return (empty_segments * config.segment_size) +
+ get_bytes_available_current_segment() +
+ get_bytes_scanned_current_segment();
+ }
+
+ size_t get_total_bytes() const {
+ return config.segment_size * config.num_segments;
+ }
+
+ size_t get_unavailable_bytes() const {
+ return get_total_bytes() - get_available_bytes();
+ }
+
+ /// Returns bytes currently occupied by live extents (not journal)
+ size_t get_used_bytes() const {
+ return used_bytes;
+ }
+
+ /// Returns the number of bytes in unavailable segments that are not live
+ size_t get_reclaimable_bytes() const {
+ return get_unavailable_bytes() - get_used_bytes();
+ }
+
+ /**
+ * get_reclaim_ratio
+ *
+ * Returns the ratio of unavailable space that is not currently used.
+ */
+ double get_reclaim_ratio() const {
+ if (get_unavailable_bytes() == 0) return 0;
+ return (double)get_reclaimable_bytes() / (double)get_unavailable_bytes();
+ }
+
+ /**
+ * get_available_ratio
+ *
+ * Returns ratio of available space to write to total space
+ */
+ double get_available_ratio() const {
+ return (double)get_available_bytes() / (double)get_total_bytes();
+ }
+
+ /**
+ * get_immediate_bytes_to_gc_for_reclaim
+ *
+ * Returns the number of bytes to gc in order to bring the
+ * reclaim ratio back below reclaim_ratio_hard_limit.
+ */
+ size_t get_immediate_bytes_to_gc_for_reclaim() const {
+ if (get_reclaim_ratio() < config.reclaim_ratio_hard_limit)
+ return 0;
+
+ const size_t unavailable_target = std::max(
+ get_used_bytes() / (1.0 - config.reclaim_ratio_hard_limit),
+ (1 - config.reclaim_ratio_usage_min) * get_total_bytes());
+
+ if (unavailable_target > get_unavailable_bytes())
+ return 0;
+
+ return (get_unavailable_bytes() - unavailable_target) / get_reclaim_ratio();
+ }
+
+ /**
+ * get_immediate_bytes_to_gc_for_available
+ *
+ * Returns the number of bytes to gc in order to bring the
+   * ratio of available disk space to total disk space above
+ * available_ratio_hard_limit.
+ */
+ size_t get_immediate_bytes_to_gc_for_available() const {
+ if (get_available_ratio() > config.available_ratio_hard_limit) {
+ return 0;
+ }
+
+ const double ratio_to_make_available = config.available_ratio_hard_limit -
+ get_available_ratio();
+ return ratio_to_make_available * (double)get_total_bytes()
+ / get_reclaim_ratio();
+ }
+
+ /**
+ * get_immediate_bytes_to_gc
+ *
+ * Returns number of bytes to gc in order to restore any strict
+ * limits.
+ */
+ size_t get_immediate_bytes_to_gc() const {
+ // number of bytes to gc in order to correct reclaim ratio
+ size_t for_reclaim = get_immediate_bytes_to_gc_for_reclaim();
+
+ // number of bytes to gc in order to correct available_ratio
+ size_t for_available = get_immediate_bytes_to_gc_for_available();
+
+ return std::max(for_reclaim, for_available);
+ }
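+
+  // Continuing the example above with a hypothetical config of
+  // reclaim_ratio_hard_limit == 0.4 and reclaim_ratio_usage_min == 0.6:
+  // get_immediate_bytes_to_gc_for_reclaim() computes unavailable_target ==
+  // max(400MiB / 0.6, 0.4 * 1024MiB) ~= 667MiB and returns
+  // (736MiB - 667MiB) / 0.46 ~= 151MiB of gc work; get_immediate_bytes_to_gc()
+  // then takes the max of that and the available-ratio correction.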
+
+ void mark_closed(segment_id_t segment) {
+ assert(segments.size() > segment);
+ if (init_complete) {
+ assert(segments[segment].is_open());
+ } else {
+ assert(segments[segment].is_empty());
+ assert(empty_segments > 0);
+ --empty_segments;
+ }
+ crimson::get_logger(ceph_subsys_filestore).debug(
+ "mark_closed: empty_segments: {}",
+ empty_segments);
+ segments[segment].state = Segment::segment_state_t::CLOSED;
+ }
+
+ void mark_empty(segment_id_t segment) {
+ assert(segments.size() > segment);
+ assert(segments[segment].is_closed());
+ assert(segments.size() > empty_segments);
+ ++empty_segments;
+ if (space_tracker->get_usage(segment) != 0) {
+ space_tracker->dump_usage(segment);
+ assert(space_tracker->get_usage(segment) == 0);
+ }
+ segments[segment].state = Segment::segment_state_t::EMPTY;
+ }
+
+ void mark_open(segment_id_t segment) {
+ assert(segments.size() > segment);
+ assert(segments[segment].is_empty());
+ assert(empty_segments > 0);
+ --empty_segments;
+ segments[segment].state = Segment::segment_state_t::OPEN;
+ }
+};
+
+}
diff --git a/src/crimson/os/seastore/segment_manager.h b/src/crimson/os/seastore/segment_manager.h
new file mode 100644
index 000000000..61c6509d1
--- /dev/null
+++ b/src/crimson/os/seastore/segment_manager.h
@@ -0,0 +1,128 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iosfwd>
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <seastar/core/future.hh>
+
+#include "include/ceph_assert.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "include/buffer_fwd.h"
+#include "crimson/osd/exceptions.h"
+
+namespace crimson::os::seastore {
+
+class Segment : public boost::intrusive_ref_counter<
+ Segment,
+ boost::thread_unsafe_counter>{
+public:
+
+ enum class segment_state_t : uint8_t {
+ EMPTY = 0,
+ OPEN = 1,
+ CLOSED = 2
+ };
+
+ /**
+ * get_segment_id
+ */
+ virtual segment_id_t get_segment_id() const = 0;
+
+ /**
+ * min next write location
+ */
+ virtual segment_off_t get_write_ptr() const = 0;
+
+ /**
+ * max capacity
+ */
+ virtual segment_off_t get_write_capacity() const = 0;
+
+ /**
+ * close
+ *
+ * Closes segment for writes. Won't complete until
+ * outstanding writes to this segment are complete.
+ */
+ using close_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg,
+ crimson::ct_error::enoent>;
+ virtual close_ertr::future<> close() = 0;
+
+
+ /**
+ * write
+ *
+ * @param offset offset of write, must be aligned to <> and >= write pointer, advances
+ * write pointer
+ * @param bl buffer to write, will be padded if not aligned
+ */
+ using write_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error, // media error or corruption
+ crimson::ct_error::invarg, // if offset is < write pointer or misaligned
+ crimson::ct_error::ebadf, // segment closed
+ crimson::ct_error::enospc // write exceeds segment size
+ >;
+ virtual write_ertr::future<> write(
+ segment_off_t offset, ceph::bufferlist bl) = 0;
+
+ virtual ~Segment() {}
+};
+using SegmentRef = boost::intrusive_ptr<Segment>;
+
+constexpr size_t PADDR_SIZE = sizeof(paddr_t);
+
+class SegmentManager {
+public:
+ using open_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg,
+ crimson::ct_error::enoent>;
+ virtual open_ertr::future<SegmentRef> open(segment_id_t id) = 0;
+
+ using release_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg,
+ crimson::ct_error::enoent>;
+ virtual release_ertr::future<> release(segment_id_t id) = 0;
+
+ using read_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg,
+ crimson::ct_error::enoent,
+ crimson::ct_error::erange>;
+ virtual read_ertr::future<> read(
+ paddr_t addr,
+ size_t len,
+ ceph::bufferptr &out) = 0;
+ read_ertr::future<ceph::bufferptr> read(
+ paddr_t addr,
+ size_t len) {
+ auto ptrref = std::make_unique<ceph::bufferptr>(
+ buffer::create_page_aligned(len));
+ return read(addr, len, *ptrref).safe_then(
+ [ptrref=std::move(ptrref)]() mutable {
+ return read_ertr::make_ready_future<bufferptr>(std::move(*ptrref));
+ });
+ }
+
+ /* Methods for discovering device geometry, segmentid set, etc */
+ virtual size_t get_size() const = 0;
+ virtual segment_off_t get_block_size() const = 0;
+ virtual segment_off_t get_segment_size() const = 0;
+ virtual segment_id_t get_num_segments() const {
+ ceph_assert(get_size() % get_segment_size() == 0);
+ return ((segment_id_t)(get_size() / get_segment_size()));
+ }
+ virtual const seastore_meta_t &get_meta() const = 0;
+
+ virtual ~SegmentManager() {}
+};
+using SegmentManagerRef = std::unique_ptr<SegmentManager>;
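+
+// Illustrative (non-normative) usage of the interfaces above, assuming an
+// already-mounted SegmentManager `sm` and a block-aligned bufferlist `bl`:
+//
+//   sm.open(id).safe_then([bl=std::move(bl)](SegmentRef seg) mutable {
+//     return seg->write(seg->get_write_ptr(), std::move(bl)
+//     ).safe_then([seg] { return seg->close(); });
+//   });
+//
+// Failures surface through the errorator futures (e.g. invarg, enospc) and
+// must be handled or forwarded by the caller.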
+
+}
diff --git a/src/crimson/os/seastore/segment_manager/block.cc b/src/crimson/os/seastore/segment_manager/block.cc
new file mode 100644
index 000000000..6a4991d42
--- /dev/null
+++ b/src/crimson/os/seastore/segment_manager/block.cc
@@ -0,0 +1,402 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <sys/mman.h>
+#include <string.h>
+
+#include "crimson/common/log.h"
+
+#include "include/buffer.h"
+#include "crimson/os/seastore/segment_manager/block.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+}
+
+
+namespace crimson::os::seastore::segment_manager::block {
+
+static write_ertr::future<> do_write(
+ seastar::file &device,
+ uint64_t offset,
+ bufferptr &bptr)
+{
+ logger().debug(
+ "block: do_write offset {} len {}",
+ offset,
+ bptr.length());
+ return device.dma_write(
+ offset,
+ bptr.c_str(),
+ bptr.length()
+ ).handle_exception([](auto e) -> write_ertr::future<size_t> {
+ logger().error(
+ "do_write: dma_write got error {}",
+ e);
+ return crimson::ct_error::input_output_error::make();
+ }).then([length=bptr.length()](auto result)
+ -> write_ertr::future<> {
+ if (result != length) {
+ return crimson::ct_error::input_output_error::make();
+ }
+ return write_ertr::now();
+ });
+}
+
+static read_ertr::future<> do_read(
+ seastar::file &device,
+ uint64_t offset,
+ bufferptr &bptr)
+{
+ logger().debug(
+ "block: do_read offset {} len {}",
+ offset,
+ bptr.length());
+ return device.dma_read(
+ offset,
+ bptr.c_str(),
+ bptr.length()
+ ).handle_exception([](auto e) -> read_ertr::future<size_t> {
+ logger().error(
+ "do_read: dma_read got error {}",
+ e);
+ return crimson::ct_error::input_output_error::make();
+ }).then([length=bptr.length()](auto result) -> read_ertr::future<> {
+ if (result != length) {
+ return crimson::ct_error::input_output_error::make();
+ }
+ return read_ertr::now();
+ });
+}
+
+write_ertr::future<>
+SegmentStateTracker::write_out(
+ seastar::file &device,
+ uint64_t offset)
+{
+ return do_write(device, offset, bptr);
+}
+
+write_ertr::future<>
+SegmentStateTracker::read_in(
+ seastar::file &device,
+ uint64_t offset)
+{
+ return do_read(
+ device,
+ offset,
+ bptr);
+}
+
+static
+block_sm_superblock_t make_superblock(
+ const BlockSegmentManager::mkfs_config_t &config,
+ const seastar::stat_data &data)
+{
+ logger().debug(
+ "{}: size {}, block_size {}, allocated_size {}, configured_size {}",
+ __func__,
+ data.size,
+ data.block_size,
+ data.allocated_size,
+ config.total_size);
+ size_t size = (data.size == 0) ? config.total_size : data.size;
+ size_t raw_segments = size / config.segment_size;
+ size_t tracker_size = SegmentStateTracker::get_raw_size(
+ raw_segments,
+ data.block_size);
+ size_t segments = (size - tracker_size - data.block_size)
+ / config.segment_size;
+ return block_sm_superblock_t{
+ size,
+ config.segment_size,
+ data.block_size,
+ segments,
+ data.block_size,
+ tracker_size + data.block_size,
+ config.meta
+ };
+}
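+
+// Worked example of the layout computed above (hypothetical geometry): for a
+// 1GiB device with 64MiB segments and a 4KiB block size, raw_segments == 16,
+// tracker_size == p2roundup(16, 4096) == 4KiB, usable segments ==
+// (1GiB - 4KiB - 4KiB) / 64MiB == 15, and first_segment_offset == 8KiB
+// (the superblock block at offset 0 followed by the 4KiB tracker region).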
+
+using open_device_ret =
+ BlockSegmentManager::access_ertr::future<
+ std::pair<seastar::file, seastar::stat_data>
+ >;
+static
+open_device_ret open_device(const std::string &in_path, seastar::open_flags mode)
+{
+ return seastar::do_with(
+ in_path,
+ [mode](auto &path) {
+ return seastar::file_stat(path, seastar::follow_symlink::yes
+ ).then([mode, &path](auto stat) mutable {
+ return seastar::open_file_dma(path, mode).then([=](auto file) {
+ logger().debug("open_device: open successful");
+ return std::make_pair(file, stat);
+ });
+ }).handle_exception([](auto e) -> open_device_ret {
+ logger().error(
+ "open_device: got error {}",
+ e);
+ return crimson::ct_error::input_output_error::make();
+ });
+ });
+}
+
+
+static
+BlockSegmentManager::access_ertr::future<>
+write_superblock(seastar::file &device, block_sm_superblock_t sb)
+{
+ assert(ceph::encoded_sizeof_bounded<block_sm_superblock_t>() <
+ sb.block_size);
+ return seastar::do_with(
+ bufferptr(ceph::buffer::create_page_aligned(sb.block_size)),
+ [=, &device](auto &bp) {
+ bufferlist bl;
+ encode(sb, bl);
+ auto iter = bl.begin();
+ assert(bl.length() < sb.block_size);
+ iter.copy(bl.length(), bp.c_str());
+ logger().debug("write_superblock: doing writeout");
+ return do_write(device, 0, bp);
+ });
+}
+
+static
+BlockSegmentManager::access_ertr::future<block_sm_superblock_t>
+read_superblock(seastar::file &device, seastar::stat_data sd)
+{
+ assert(ceph::encoded_sizeof_bounded<block_sm_superblock_t>() <
+ sd.block_size);
+ return seastar::do_with(
+ bufferptr(ceph::buffer::create_page_aligned(sd.block_size)),
+ [=, &device](auto &bp) {
+ return do_read(
+ device,
+ 0,
+ bp
+ ).safe_then([=, &bp] {
+ bufferlist bl;
+ bl.push_back(bp);
+ block_sm_superblock_t ret;
+ auto bliter = bl.cbegin();
+ decode(ret, bliter);
+ return BlockSegmentManager::access_ertr::future<block_sm_superblock_t>(
+ BlockSegmentManager::access_ertr::ready_future_marker{},
+ ret);
+ });
+ });
+}
+
+BlockSegment::BlockSegment(
+ BlockSegmentManager &manager, segment_id_t id)
+ : manager(manager), id(id) {}
+
+segment_off_t BlockSegment::get_write_capacity() const
+{
+ return manager.get_segment_size();
+}
+
+Segment::close_ertr::future<> BlockSegment::close()
+{
+ manager.segment_close(id);
+ return close_ertr::now();
+}
+
+Segment::write_ertr::future<> BlockSegment::write(
+ segment_off_t offset, ceph::bufferlist bl)
+{
+ if (offset < write_pointer || offset % manager.superblock.block_size != 0)
+ return crimson::ct_error::invarg::make();
+
+ if (offset + bl.length() > manager.superblock.segment_size)
+ return crimson::ct_error::enospc::make();
+
+ write_pointer = offset + bl.length();
+ return manager.segment_write({id, offset}, bl);
+}
+
+Segment::close_ertr::future<> BlockSegmentManager::segment_close(segment_id_t id)
+{
+ assert(tracker);
+ tracker->set(id, segment_state_t::CLOSED);
+ return tracker->write_out(device, superblock.tracker_offset);
+}
+
+Segment::write_ertr::future<> BlockSegmentManager::segment_write(
+ paddr_t addr,
+ ceph::bufferlist bl,
+ bool ignore_check)
+{
+ assert((bl.length() % superblock.block_size) == 0);
+ logger().debug(
+ "segment_write to segment {} at offset {}, physical offset {}, len {}",
+ addr.segment,
+ addr.offset,
+ get_offset(addr),
+ bl.length());
+
+
+ // TODO send an iovec and avoid the copy -- bl should have aligned
+ // constituent buffers and they will remain unmodified until the write
+ // completes
+ return seastar::do_with(
+ bufferptr(ceph::buffer::create_page_aligned(bl.length())),
+ [&](auto &bp) {
+ auto iter = bl.cbegin();
+ iter.copy(bl.length(), bp.c_str());
+ return do_write(device, get_offset(addr), bp);
+ });
+}
+
+BlockSegmentManager::~BlockSegmentManager()
+{
+}
+
+BlockSegmentManager::mount_ret BlockSegmentManager::mount(mount_config_t config)
+{
+ return open_device(
+ config.path, seastar::open_flags::rw | seastar::open_flags::dsync
+ ).safe_then([=](auto p) {
+ device = std::move(p.first);
+ auto sd = p.second;
+ return read_superblock(device, sd);
+ }).safe_then([=](auto sb) {
+ superblock = sb;
+ tracker = std::make_unique<SegmentStateTracker>(
+ superblock.segments,
+ superblock.block_size);
+ return tracker->read_in(
+ device,
+ superblock.tracker_offset
+ ).safe_then([this] {
+ for (segment_id_t i = 0; i < tracker->get_capacity(); ++i) {
+ if (tracker->get(i) == segment_state_t::OPEN) {
+ tracker->set(i, segment_state_t::CLOSED);
+ }
+ }
+ return tracker->write_out(device, superblock.tracker_offset);
+ });
+ });
+}
+
+BlockSegmentManager::mkfs_ret BlockSegmentManager::mkfs(mkfs_config_t config)
+{
+ return seastar::do_with(
+ seastar::file{},
+ seastar::stat_data{},
+ block_sm_superblock_t{},
+ std::unique_ptr<SegmentStateTracker>(),
+ [=](auto &device, auto &stat, auto &sb, auto &tracker) {
+ return open_device(
+ config.path, seastar::open_flags::rw
+ ).safe_then([&, config](auto p) {
+ device = p.first;
+ stat = p.second;
+ sb = make_superblock(config, stat);
+ return write_superblock(device, sb);
+ }).safe_then([&] {
+ logger().debug("BlockSegmentManager::mkfs: superblock written");
+ tracker.reset(new SegmentStateTracker(sb.segments, sb.block_size));
+ return tracker->write_out(device, sb.tracker_offset);
+ }).finally([&] {
+ return device.close();
+ }).safe_then([] {
+ logger().debug("BlockSegmentManager::mkfs: complete");
+ return mkfs_ertr::now();
+ });
+ });
+}
+
+BlockSegmentManager::close_ertr::future<> BlockSegmentManager::close()
+{
+ return device.close();
+}
+
+SegmentManager::open_ertr::future<SegmentRef> BlockSegmentManager::open(
+ segment_id_t id)
+{
+ if (id >= get_num_segments()) {
+ logger().error("BlockSegmentManager::open: invalid segment {}", id);
+ return crimson::ct_error::invarg::make();
+ }
+
+ if (tracker->get(id) != segment_state_t::EMPTY) {
+ logger().error(
+ "BlockSegmentManager::open: invalid segment {} state {}",
+ id,
+ tracker->get(id));
+ return crimson::ct_error::invarg::make();
+ }
+
+ tracker->set(id, segment_state_t::OPEN);
+ return tracker->write_out(device, superblock.tracker_offset
+ ).safe_then([this, id] {
+ return open_ertr::future<SegmentRef>(
+ open_ertr::ready_future_marker{},
+ SegmentRef(new BlockSegment(*this, id)));
+ });
+}
+
+SegmentManager::release_ertr::future<> BlockSegmentManager::release(
+ segment_id_t id)
+{
+ logger().debug("BlockSegmentManager::release: {}", id);
+
+ if (id >= get_num_segments()) {
+ logger().error(
+ "BlockSegmentManager::release: invalid segment {}",
+ id);
+ return crimson::ct_error::invarg::make();
+ }
+
+ if (tracker->get(id) != segment_state_t::CLOSED) {
+ logger().error(
+ "BlockSegmentManager::release: invalid segment {} state {}",
+ id,
+ tracker->get(id));
+ return crimson::ct_error::invarg::make();
+ }
+
+ tracker->set(id, segment_state_t::EMPTY);
+ return tracker->write_out(device, superblock.tracker_offset);
+}
+
+SegmentManager::read_ertr::future<> BlockSegmentManager::read(
+ paddr_t addr,
+ size_t len,
+ ceph::bufferptr &out)
+{
+ if (addr.segment >= get_num_segments()) {
+ logger().error(
+ "BlockSegmentManager::read: invalid segment {}",
+ addr);
+ return crimson::ct_error::invarg::make();
+ }
+
+ if (addr.offset + len > superblock.segment_size) {
+ logger().error(
+ "BlockSegmentManager::read: invalid offset {}~{}!",
+ addr,
+ len);
+ return crimson::ct_error::invarg::make();
+ }
+
+ if (tracker->get(addr.segment) == segment_state_t::EMPTY) {
+ logger().error(
+ "BlockSegmentManager::read: read on invalid segment {} state {}",
+ addr.segment,
+ tracker->get(addr.segment));
+ return crimson::ct_error::enoent::make();
+ }
+
+ return do_read(
+ device,
+ get_offset(addr),
+ out);
+}
+
+}
diff --git a/src/crimson/os/seastore/segment_manager/block.h b/src/crimson/os/seastore/segment_manager/block.h
new file mode 100644
index 000000000..927b13e4e
--- /dev/null
+++ b/src/crimson/os/seastore/segment_manager/block.h
@@ -0,0 +1,222 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include <seastar/core/file.hh>
+#include <seastar/core/future.hh>
+#include <seastar/core/reactor.hh>
+
+#include "crimson/common/layout.h"
+
+#include "crimson/os/seastore/segment_manager.h"
+
+namespace crimson::os::seastore::segment_manager::block {
+
+struct block_sm_superblock_t {
+ size_t size = 0;
+ size_t segment_size = 0;
+ size_t block_size = 0;
+
+ size_t segments = 0;
+ uint64_t tracker_offset = 0;
+ uint64_t first_segment_offset = 0;
+
+ seastore_meta_t meta;
+
+ DENC(block_sm_superblock_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.size, p);
+ denc(v.segment_size, p);
+ denc(v.block_size, p);
+ denc(v.segments, p);
+ denc(v.tracker_offset, p);
+ denc(v.first_segment_offset, p);
+ denc(v.meta, p);
+ DENC_FINISH(p);
+ }
+};
+
+using write_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+using read_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+
+/**
+ * SegmentStateTracker
+ *
+ * Tracks lifecycle state of each segment using space at the beginning
+ * of the drive.
+ */
+class SegmentStateTracker {
+ using segment_state_t = Segment::segment_state_t;
+
+ bufferptr bptr;
+
+ using L = absl::container_internal::Layout<uint8_t>;
+ const L layout;
+
+public:
+ static size_t get_raw_size(size_t segments, size_t block_size) {
+ return p2roundup(segments, block_size);
+ }
+
+ SegmentStateTracker(size_t segments, size_t block_size)
+ : bptr(ceph::buffer::create_page_aligned(
+ get_raw_size(segments, block_size))),
+ layout(bptr.length())
+ {
+ ::memset(
+ bptr.c_str(),
+ static_cast<char>(segment_state_t::EMPTY),
+ bptr.length());
+ }
+
+ size_t get_size() const {
+ return bptr.length();
+ }
+
+ size_t get_capacity() const {
+ return bptr.length();
+ }
+
+ segment_state_t get(segment_id_t offset) const {
+ assert(offset < get_capacity());
+ return static_cast<segment_state_t>(
+ layout.template Pointer<0>(
+ bptr.c_str())[offset]);
+ }
+
+ void set(segment_id_t offset, segment_state_t state) {
+ assert(offset < get_capacity());
+ layout.template Pointer<0>(bptr.c_str())[offset] =
+ static_cast<uint8_t>(state);
+ }
+
+ write_ertr::future<> write_out(
+ seastar::file &device,
+ uint64_t offset);
+
+ read_ertr::future<> read_in(
+ seastar::file &device,
+ uint64_t offset);
+};
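+
+// Each segment's state is stored as a single byte of bptr, so a tracker for
+// e.g. 1000 segments with a 4KiB block size occupies
+// p2roundup(1000, 4096) == 4KiB on disk (values chosen for illustration only).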
+
+class BlockSegmentManager;
+class BlockSegment final : public Segment {
+ friend class BlockSegmentManager;
+ BlockSegmentManager &manager;
+ const segment_id_t id;
+ segment_off_t write_pointer = 0;
+public:
+ BlockSegment(BlockSegmentManager &manager, segment_id_t id);
+
+ segment_id_t get_segment_id() const final { return id; }
+ segment_off_t get_write_capacity() const final;
+ segment_off_t get_write_ptr() const final { return write_pointer; }
+ close_ertr::future<> close() final;
+ write_ertr::future<> write(segment_off_t offset, ceph::bufferlist bl) final;
+
+ ~BlockSegment() {}
+};
+
+/**
+ * BlockSegmentManager
+ *
+ * Implements SegmentManager on a conventional block device.
+ * SegmentStateTracker uses space at the start of the device to store
+ * state analogous to that of the segments of a ZNS device.
+ */
+class BlockSegmentManager final : public SegmentManager {
+public:
+ using access_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::permission_denied,
+ crimson::ct_error::enoent>;
+
+
+ struct mount_config_t {
+ std::string path;
+ };
+ using mount_ertr = access_ertr;
+ using mount_ret = access_ertr::future<>;
+ mount_ret mount(mount_config_t);
+
+ struct mkfs_config_t {
+ std::string path;
+ size_t segment_size = 0;
+ size_t total_size = 0;
+ seastore_meta_t meta;
+ };
+ using mkfs_ertr = access_ertr;
+ using mkfs_ret = mkfs_ertr::future<>;
+ static mkfs_ret mkfs(mkfs_config_t);
+
+ using close_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error
+ >;
+ close_ertr::future<> close();
+
+ BlockSegmentManager() = default;
+ ~BlockSegmentManager();
+
+ open_ertr::future<SegmentRef> open(segment_id_t id) final;
+
+ release_ertr::future<> release(segment_id_t id) final;
+
+ read_ertr::future<> read(
+ paddr_t addr,
+ size_t len,
+ ceph::bufferptr &out) final;
+
+ size_t get_size() const final {
+ return superblock.size;
+ }
+ segment_off_t get_block_size() const {
+ return superblock.block_size;
+ }
+ segment_off_t get_segment_size() const {
+ return superblock.segment_size;
+ }
+
+ // public so tests can bypass segment interface when simpler
+ Segment::write_ertr::future<> segment_write(
+ paddr_t addr,
+ ceph::bufferlist bl,
+ bool ignore_check=false);
+
+private:
+ friend class BlockSegment;
+ using segment_state_t = Segment::segment_state_t;
+
+
+ std::unique_ptr<SegmentStateTracker> tracker;
+ block_sm_superblock_t superblock;
+ seastar::file device;
+
+ size_t get_offset(paddr_t addr) {
+ return superblock.first_segment_offset +
+ (addr.segment * superblock.segment_size) +
+ addr.offset;
+ }
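+
+  // E.g. (hypothetical values): with first_segment_offset == 8KiB,
+  // segment_size == 64MiB and addr == {2, 4096}, the physical offset is
+  // 8KiB + 2*64MiB + 4KiB == 134230016 bytes.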
+
+ const seastore_meta_t &get_meta() const {
+ return superblock.meta;
+ }
+
+ std::vector<segment_state_t> segment_state;
+
+ char *buffer = nullptr;
+
+ Segment::close_ertr::future<> segment_close(segment_id_t id);
+};
+
+}
+
+WRITE_CLASS_DENC_BOUNDED(
+ crimson::os::seastore::segment_manager::block::block_sm_superblock_t
+)
+
diff --git a/src/crimson/os/seastore/segment_manager/ephemeral.cc b/src/crimson/os/seastore/segment_manager/ephemeral.cc
new file mode 100644
index 000000000..3250303ad
--- /dev/null
+++ b/src/crimson/os/seastore/segment_manager/ephemeral.cc
@@ -0,0 +1,226 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <sys/mman.h>
+#include <string.h>
+
+#include "seastar/core/sleep.hh"
+
+#include "crimson/common/log.h"
+
+#include "include/buffer.h"
+#include "crimson/os/seastore/segment_manager/ephemeral.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+}
+
+namespace crimson::os::seastore::segment_manager {
+
+std::ostream &operator<<(std::ostream &lhs, const ephemeral_config_t &c) {
+ return lhs << "ephemeral_config_t(size=" << c.size << ", block_size=" << c.block_size
+ << ", segment_size=" << c.segment_size << ")";
+}
+
+EphemeralSegmentManagerRef create_test_ephemeral() {
+ return EphemeralSegmentManagerRef(
+ new EphemeralSegmentManager(DEFAULT_TEST_EPHEMERAL));
+}
+
+EphemeralSegment::EphemeralSegment(
+ EphemeralSegmentManager &manager, segment_id_t id)
+ : manager(manager), id(id) {}
+
+segment_off_t EphemeralSegment::get_write_capacity() const
+{
+ return manager.get_segment_size();
+}
+
+Segment::close_ertr::future<> EphemeralSegment::close()
+{
+ manager.segment_close(id);
+ return close_ertr::now().safe_then([] {
+ return seastar::sleep(std::chrono::milliseconds(1));
+ });
+}
+
+Segment::write_ertr::future<> EphemeralSegment::write(
+ segment_off_t offset, ceph::bufferlist bl)
+{
+ if (offset < write_pointer || offset % manager.config.block_size != 0)
+ return crimson::ct_error::invarg::make();
+
+ if (offset + bl.length() > (size_t)manager.get_segment_size())
+ return crimson::ct_error::enospc::make();
+
+ return manager.segment_write({id, offset}, bl);
+}
+
+Segment::close_ertr::future<> EphemeralSegmentManager::segment_close(segment_id_t id)
+{
+ if (segment_state[id] != segment_state_t::OPEN)
+ return crimson::ct_error::invarg::make();
+
+ segment_state[id] = segment_state_t::CLOSED;
+ return Segment::close_ertr::now().safe_then([] {
+ return seastar::sleep(std::chrono::milliseconds(1));
+ });
+}
+
+Segment::write_ertr::future<> EphemeralSegmentManager::segment_write(
+ paddr_t addr,
+ ceph::bufferlist bl,
+ bool ignore_check)
+{
+ logger().debug(
+ "segment_write to segment {} at offset {}, physical offset {}, len {}, crc {}",
+ addr.segment,
+ addr.offset,
+ get_offset(addr),
+ bl.length(),
+ bl.crc32c(1));
+ if (!ignore_check && segment_state[addr.segment] != segment_state_t::OPEN)
+ return crimson::ct_error::invarg::make();
+
+ bl.begin().copy(bl.length(), buffer + get_offset(addr));
+ return Segment::write_ertr::now().safe_then([] {
+ return seastar::sleep(std::chrono::milliseconds(1));
+ });
+}
+
+EphemeralSegmentManager::init_ertr::future<> EphemeralSegmentManager::init()
+{
+ logger().debug(
+    "Initializing ephemeral segment manager with config {}",
+ config);
+
+ meta = seastore_meta_t{};
+
+ if (config.block_size % (4<<10) != 0) {
+ return crimson::ct_error::invarg::make();
+ }
+ if (config.segment_size % config.block_size != 0) {
+ return crimson::ct_error::invarg::make();
+ }
+ if (config.size % config.segment_size != 0) {
+ return crimson::ct_error::invarg::make();
+ }
+
+ auto addr = ::mmap(
+ nullptr,
+ config.size,
+ PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
+ -1,
+ 0);
+
+ segment_state.resize(config.size / config.segment_size, segment_state_t::EMPTY);
+
+ if (addr == MAP_FAILED)
+ return crimson::ct_error::enospc::make();
+
+ buffer = (char*)addr;
+
+ ::memset(buffer, 0, config.size);
+ return init_ertr::now().safe_then([] {
+ return seastar::sleep(std::chrono::milliseconds(1));
+ });
+}
+
+EphemeralSegmentManager::~EphemeralSegmentManager()
+{
+ if (buffer) {
+ ::munmap(buffer, config.size);
+ }
+}
+
+void EphemeralSegmentManager::remount()
+{
+ for (auto &i : segment_state) {
+ if (i == Segment::segment_state_t::OPEN)
+ i = Segment::segment_state_t::CLOSED;
+ }
+}
+
+SegmentManager::open_ertr::future<SegmentRef> EphemeralSegmentManager::open(
+ segment_id_t id)
+{
+ if (id >= get_num_segments()) {
+ logger().error("EphemeralSegmentManager::open: invalid segment {}", id);
+ return crimson::ct_error::invarg::make();
+ }
+
+ if (segment_state[id] != segment_state_t::EMPTY) {
+ logger().error("EphemeralSegmentManager::open: segment {} not empty", id);
+ return crimson::ct_error::invarg::make();
+ }
+
+ segment_state[id] = segment_state_t::OPEN;
+ return open_ertr::make_ready_future<SegmentRef>(new EphemeralSegment(*this, id));
+}
+
+SegmentManager::release_ertr::future<> EphemeralSegmentManager::release(
+ segment_id_t id)
+{
+ logger().debug("EphemeralSegmentManager::release: {}", id);
+
+ if (id >= get_num_segments()) {
+ logger().error(
+ "EphemeralSegmentManager::release: invalid segment {}",
+ id);
+ return crimson::ct_error::invarg::make();
+ }
+
+ if (segment_state[id] != segment_state_t::CLOSED) {
+ logger().error(
+ "EphemeralSegmentManager::release: segment id {} not closed",
+ id);
+ return crimson::ct_error::invarg::make();
+ }
+
+ ::memset(buffer + get_offset({id, 0}), 0, config.segment_size);
+ segment_state[id] = segment_state_t::EMPTY;
+ return release_ertr::now().safe_then([] {
+ return seastar::sleep(std::chrono::milliseconds(1));
+ });
+}
+
+SegmentManager::read_ertr::future<> EphemeralSegmentManager::read(
+ paddr_t addr,
+ size_t len,
+ ceph::bufferptr &out)
+{
+ if (addr.segment >= get_num_segments()) {
+ logger().error(
+ "EphemeralSegmentManager::read: invalid segment {}",
+ addr);
+ return crimson::ct_error::invarg::make();
+ }
+
+ if (addr.offset + len > config.segment_size) {
+ logger().error(
+ "EphemeralSegmentManager::read: invalid offset {}~{}!",
+ addr,
+ len);
+ return crimson::ct_error::invarg::make();
+ }
+
+ out.copy_in(0, len, buffer + get_offset(addr));
+
+ bufferlist bl;
+ bl.push_back(out);
+ logger().debug(
+ "segment_read to segment {} at offset {}, physical offset {}, length {}, crc {}",
+ addr.segment,
+ addr.offset,
+ get_offset(addr),
+ len,
+ bl.begin().crc32c(len, 1));
+
+ return read_ertr::now().safe_then([] {
+ return seastar::sleep(std::chrono::milliseconds(1));
+ });
+}
+
+}
diff --git a/src/crimson/os/seastore/segment_manager/ephemeral.h b/src/crimson/os/seastore/segment_manager/ephemeral.h
new file mode 100644
index 000000000..9f19cb4d0
--- /dev/null
+++ b/src/crimson/os/seastore/segment_manager/ephemeral.h
@@ -0,0 +1,111 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <seastar/core/future.hh>
+
+#include "crimson/os/seastore/segment_manager.h"
+
+#include "crimson/os/seastore/segment_manager/ephemeral.h"
+
+namespace crimson::os::seastore::segment_manager {
+
+class EphemeralSegmentManager;
+using EphemeralSegmentManagerRef = std::unique_ptr<EphemeralSegmentManager>;
+
+struct ephemeral_config_t {
+ size_t size = 0;
+ size_t block_size = 0;
+ size_t segment_size = 0;
+};
+
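+// 1GiB of backing memory, 4KiB blocks, 8MiB segments (128 segments in total).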
+constexpr ephemeral_config_t DEFAULT_TEST_EPHEMERAL = {
+ 1 << 30,
+ 4 << 10,
+ 8 << 20
+};
+
+std::ostream &operator<<(std::ostream &, const ephemeral_config_t &);
+EphemeralSegmentManagerRef create_test_ephemeral();
+
+class EphemeralSegment final : public Segment {
+ friend class EphemeralSegmentManager;
+ EphemeralSegmentManager &manager;
+ const segment_id_t id;
+ segment_off_t write_pointer = 0;
+public:
+ EphemeralSegment(EphemeralSegmentManager &manager, segment_id_t id);
+
+ segment_id_t get_segment_id() const final { return id; }
+ segment_off_t get_write_capacity() const final;
+ segment_off_t get_write_ptr() const final { return write_pointer; }
+ close_ertr::future<> close() final;
+ write_ertr::future<> write(segment_off_t offset, ceph::bufferlist bl) final;
+
+ ~EphemeralSegment() {}
+};
+
+class EphemeralSegmentManager final : public SegmentManager {
+ friend class EphemeralSegment;
+ using segment_state_t = Segment::segment_state_t;
+
+ const ephemeral_config_t config;
+ std::optional<seastore_meta_t> meta;
+
+ size_t get_offset(paddr_t addr) {
+ return (addr.segment * config.segment_size) + addr.offset;
+ }
+
+ std::vector<segment_state_t> segment_state;
+
+ char *buffer = nullptr;
+
+ Segment::close_ertr::future<> segment_close(segment_id_t id);
+
+public:
+ EphemeralSegmentManager(ephemeral_config_t config) : config(config) {}
+ ~EphemeralSegmentManager();
+
+ using init_ertr = crimson::errorator<
+ crimson::ct_error::enospc,
+ crimson::ct_error::invarg,
+ crimson::ct_error::erange>;
+ init_ertr::future<> init();
+
+ open_ertr::future<SegmentRef> open(segment_id_t id) final;
+
+ release_ertr::future<> release(segment_id_t id) final;
+
+ read_ertr::future<> read(
+ paddr_t addr,
+ size_t len,
+ ceph::bufferptr &out) final;
+
+ size_t get_size() const final {
+ return config.size;
+ }
+ segment_off_t get_block_size() const final {
+ return config.block_size;
+ }
+ segment_off_t get_segment_size() const final {
+ return config.segment_size;
+ }
+
+ const seastore_meta_t &get_meta() const final {
+ assert(meta);
+ return *meta;
+ }
+
+ void remount();
+
+ // public so tests can bypass segment interface when simpler
+ Segment::write_ertr::future<> segment_write(
+ paddr_t addr,
+ ceph::bufferlist bl,
+ bool ignore_check=false);
+};
+
+}
diff --git a/src/crimson/os/seastore/transaction.h b/src/crimson/os/seastore/transaction.h
new file mode 100644
index 000000000..e189d1d32
--- /dev/null
+++ b/src/crimson/os/seastore/transaction.h
@@ -0,0 +1,145 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/cached_extent.h"
+#include "crimson/os/seastore/root_block.h"
+
+namespace crimson::os::seastore {
+
+/**
+ * Transaction
+ *
+ * Representation of in-progress mutation. Used exclusively through Cache methods.
+ */
+class Transaction {
+public:
+ using Ref = std::unique_ptr<Transaction>;
+ enum class get_extent_ret {
+ PRESENT,
+ ABSENT,
+ RETIRED
+ };
+ get_extent_ret get_extent(paddr_t addr, CachedExtentRef *out) {
+ if (retired_set.count(addr)) {
+ return get_extent_ret::RETIRED;
+ } else if (auto iter = write_set.find_offset(addr);
+ iter != write_set.end()) {
+ if (out)
+ *out = CachedExtentRef(&*iter);
+ return get_extent_ret::PRESENT;
+ } else if (
+ auto iter = read_set.find(addr);
+ iter != read_set.end()) {
+ if (out)
+ *out = CachedExtentRef(*iter);
+ return get_extent_ret::PRESENT;
+ } else {
+ return get_extent_ret::ABSENT;
+ }
+ }
+
+ void add_to_retired_set(CachedExtentRef ref) {
+ ceph_assert(!is_weak());
+ if (!ref->is_initial_pending()) {
+ // && retired_set.count(ref->get_paddr()) == 0
+ // If it's already in the set, insert here will be a noop,
+ // which is what we want.
+ retired_set.insert(ref);
+ } else {
+ ref->state = CachedExtent::extent_state_t::INVALID;
+ }
+ if (ref->is_pending()) {
+ write_set.erase(*ref);
+ }
+ }
+
+ void add_to_read_set(CachedExtentRef ref) {
+ if (is_weak()) return;
+
+ ceph_assert(read_set.count(ref) == 0);
+ read_set.insert(ref);
+ }
+
+ void add_fresh_extent(CachedExtentRef ref) {
+ ceph_assert(!is_weak());
+ fresh_block_list.push_back(ref);
+ ref->set_paddr(make_record_relative_paddr(offset));
+ offset += ref->get_length();
+ write_set.insert(*ref);
+ }
+
+ void add_mutated_extent(CachedExtentRef ref) {
+ ceph_assert(!is_weak());
+ mutated_block_list.push_back(ref);
+ write_set.insert(*ref);
+ }
+
+ void mark_segment_to_release(segment_id_t segment) {
+ assert(to_release == NULL_SEG_ID);
+ to_release = segment;
+ }
+
+ segment_id_t get_segment_to_release() const {
+ return to_release;
+ }
+
+ const auto &get_fresh_block_list() {
+ return fresh_block_list;
+ }
+
+ const auto &get_mutated_block_list() {
+ return mutated_block_list;
+ }
+
+ const auto &get_retired_set() {
+ return retired_set;
+ }
+
+ bool is_weak() const {
+ return weak;
+ }
+
+private:
+ friend class Cache;
+ friend Ref make_transaction();
+ friend Ref make_weak_transaction();
+
+ /**
+ * If set, *this may not be used to perform writes and will not provide
+   * consistency, allowing operations that use it to avoid maintaining a read_set.
+ */
+ const bool weak;
+
+ RootBlockRef root; ///< ref to root if read or written by transaction
+
+ segment_off_t offset = 0; ///< relative offset of next block
+
+ pextent_set_t read_set; ///< set of extents read by paddr
+ ExtentIndex write_set; ///< set of extents written by paddr
+
+ std::list<CachedExtentRef> fresh_block_list; ///< list of fresh blocks
+ std::list<CachedExtentRef> mutated_block_list; ///< list of mutated blocks
+
+  pextent_set_t retired_set;           ///< set of extents retired by this transaction
+
+ ///< if != NULL_SEG_ID, release this segment after completion
+ segment_id_t to_release = NULL_SEG_ID;
+
+ Transaction(bool weak) : weak(weak) {}
+};
+using TransactionRef = Transaction::Ref;
+
+inline TransactionRef make_transaction() {
+ return std::unique_ptr<Transaction>(new Transaction(false));
+}
+
+inline TransactionRef make_weak_transaction() {
+ return std::unique_ptr<Transaction>(new Transaction(true));
+}
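+
+// Illustrative lifecycle (sketch only): transactions are normally obtained
+// and submitted through TransactionManager rather than used directly, e.g.
+//
+//   auto t = make_transaction();
+//   // ... stage reads/mutations against *t via Cache/LBAManager ...
+//   // ... then TransactionManager::submit_transaction(std::move(t)) ...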
+
+}
diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc
new file mode 100644
index 000000000..7b86631e2
--- /dev/null
+++ b/src/crimson/os/seastore/transaction_manager.cc
@@ -0,0 +1,306 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/denc.h"
+#include "include/intarith.h"
+
+#include "crimson/common/log.h"
+
+#include "crimson/os/seastore/transaction_manager.h"
+#include "crimson/os/seastore/segment_manager.h"
+#include "crimson/os/seastore/journal.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+}
+
+namespace crimson::os::seastore {
+
+TransactionManager::TransactionManager(
+ SegmentManager &segment_manager,
+ SegmentCleaner &segment_cleaner,
+ Journal &journal,
+ Cache &cache,
+ LBAManager &lba_manager)
+ : segment_manager(segment_manager),
+ segment_cleaner(segment_cleaner),
+ cache(cache),
+ lba_manager(lba_manager),
+ journal(journal)
+{}
+
+TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs()
+{
+ return journal.open_for_write().safe_then([this](auto addr) {
+ logger().debug("TransactionManager::mkfs: about to do_with");
+ segment_cleaner.set_journal_head(addr);
+ return seastar::do_with(
+ create_transaction(),
+ [this](auto &transaction) {
+ logger().debug("TransactionManager::mkfs: about to cache.mkfs");
+ cache.init();
+ return cache.mkfs(*transaction
+ ).safe_then([this, &transaction] {
+ return lba_manager.mkfs(*transaction);
+ }).safe_then([this, &transaction] {
+ logger().debug("TransactionManager::mkfs: about to submit_transaction");
+ return submit_transaction(std::move(transaction)).handle_error(
+ crimson::ct_error::eagain::handle([] {
+ ceph_assert(0 == "eagain impossible");
+ return mkfs_ertr::now();
+ }),
+ mkfs_ertr::pass_further{}
+ );
+ });
+ });
+ }).safe_then([this] {
+ return journal.close();
+ });
+}
+
+TransactionManager::mount_ertr::future<> TransactionManager::mount()
+{
+ cache.init();
+ return journal.replay([this](auto seq, auto paddr, const auto &e) {
+ return cache.replay_delta(seq, paddr, e);
+ }).safe_then([this] {
+ return journal.open_for_write();
+ }).safe_then([this](auto addr) {
+ segment_cleaner.set_journal_head(addr);
+ return seastar::do_with(
+ make_weak_transaction(),
+ [this](auto &t) {
+ return cache.init_cached_extents(*t, [this](auto &t, auto &e) {
+ return lba_manager.init_cached_extent(t, e);
+ }).safe_then([this, &t] {
+ assert(segment_cleaner.debug_check_space(
+ *segment_cleaner.get_empty_space_tracker()));
+ return lba_manager.scan_mapped_space(
+ *t,
+ [this](paddr_t addr, extent_len_t len) {
+ logger().debug("TransactionManager::mount: marking {}~{} used",
+ addr,
+ len);
+ segment_cleaner.mark_space_used(
+ addr,
+            len,
+ /* init_scan = */ true);
+ });
+ });
+ });
+ }).safe_then([this] {
+ segment_cleaner.complete_init();
+ }).handle_error(
+ mount_ertr::pass_further{},
+ crimson::ct_error::all_same_way([] {
+ ceph_assert(0 == "unhandled error");
+ return mount_ertr::now();
+ }));
+}
+
+TransactionManager::close_ertr::future<> TransactionManager::close() {
+ return cache.close(
+ ).safe_then([this] {
+ return journal.close();
+ });
+}
+
+TransactionManager::ref_ret TransactionManager::inc_ref(
+ Transaction &t,
+ LogicalCachedExtentRef &ref)
+{
+ return lba_manager.incref_extent(t, ref->get_laddr()).safe_then([](auto r) {
+ return r.refcount;
+ }).handle_error(
+ ref_ertr::pass_further{},
+ ct_error::all_same_way([](auto e) {
+ ceph_assert(0 == "unhandled error, TODO");
+ }));
+}
+
+TransactionManager::ref_ret TransactionManager::inc_ref(
+ Transaction &t,
+ laddr_t offset)
+{
+ return lba_manager.incref_extent(t, offset).safe_then([](auto result) {
+ return result.refcount;
+ });
+}
+
+TransactionManager::ref_ret TransactionManager::dec_ref(
+ Transaction &t,
+ LogicalCachedExtentRef &ref)
+{
+ return lba_manager.decref_extent(t, ref->get_laddr()
+ ).safe_then([this, &t, ref](auto ret) {
+ if (ret.refcount == 0) {
+ logger().debug(
+ "TransactionManager::dec_ref: extent {} refcount 0",
+ *ref);
+ cache.retire_extent(t, ref);
+ }
+ return ret.refcount;
+ });
+}
+
+TransactionManager::ref_ret TransactionManager::dec_ref(
+ Transaction &t,
+ laddr_t offset)
+{
+ return lba_manager.decref_extent(t, offset
+ ).safe_then([this, offset, &t](auto result) -> ref_ret {
+ if (result.refcount == 0) {
+ logger().debug(
+ "TransactionManager::dec_ref: offset {} refcount 0",
+ offset);
+ return cache.retire_extent_if_cached(t, result.addr).safe_then([] {
+ return ref_ret(
+ ref_ertr::ready_future_marker{},
+ 0);
+ });
+ } else {
+ return ref_ret(
+ ref_ertr::ready_future_marker{},
+ result.refcount);
+ }
+ });
+}
+
+TransactionManager::submit_transaction_ertr::future<>
+TransactionManager::submit_transaction(
+ TransactionRef t)
+{
+ logger().debug("TransactionManager::submit_transaction");
+ return segment_cleaner.do_immediate_work(*t
+ ).safe_then([this, t=std::move(t)]() mutable -> submit_transaction_ertr::future<> {
+ auto record = cache.try_construct_record(*t);
+ if (!record) {
+ return crimson::ct_error::eagain::make();
+ }
+
+ return journal.submit_record(std::move(*record)
+ ).safe_then([this, t=std::move(t)](auto p) mutable {
+ auto [addr, journal_seq] = p;
+ segment_cleaner.set_journal_head(journal_seq);
+ cache.complete_commit(*t, addr, journal_seq, &segment_cleaner);
+ lba_manager.complete_transaction(*t);
+ auto to_release = t->get_segment_to_release();
+ if (to_release != NULL_SEG_ID) {
+ segment_cleaner.mark_segment_released(to_release);
+ return segment_manager.release(to_release);
+ } else {
+ return SegmentManager::release_ertr::now();
+ }
+ }).handle_error(
+ submit_transaction_ertr::pass_further{},
+ crimson::ct_error::all_same_way([](auto e) {
+ ceph_assert(0 == "Hit error submitting to journal");
+ }));
+ });
+}
+
+TransactionManager::get_next_dirty_extents_ret
+TransactionManager::get_next_dirty_extents(journal_seq_t seq)
+{
+ return cache.get_next_dirty_extents(seq);
+}
+
+TransactionManager::rewrite_extent_ret TransactionManager::rewrite_extent(
+ Transaction &t,
+ CachedExtentRef extent)
+{
+ {
+ auto updated = cache.update_extent_from_transaction(t, extent);
+ if (!updated) {
+ logger().debug(
+ "{}: {} is already retired, skipping",
+ __func__,
+ *extent);
+ return rewrite_extent_ertr::now();
+ }
+ extent = updated;
+ }
+
+ if (extent->get_type() == extent_types_t::ROOT) {
+ logger().debug(
+ "{}: marking root {} for rewrite",
+ __func__,
+ *extent);
+ cache.duplicate_for_write(t, extent);
+ return rewrite_extent_ertr::now();
+ }
+ return lba_manager.rewrite_extent(t, extent);
+}
+
+TransactionManager::get_extent_if_live_ret TransactionManager::get_extent_if_live(
+ Transaction &t,
+ extent_types_t type,
+ paddr_t addr,
+ laddr_t laddr,
+ segment_off_t len)
+{
+ CachedExtentRef ret;
+ auto status = cache.get_extent_if_cached(t, addr, &ret);
+ if (status != Transaction::get_extent_ret::ABSENT) {
+ return get_extent_if_live_ret(
+ get_extent_if_live_ertr::ready_future_marker{},
+ ret);
+ }
+
+ if (is_logical_type(type)) {
+ return lba_manager.get_mapping(
+ t,
+ laddr,
+ len).safe_then([=, &t](lba_pin_list_t pins) {
+ ceph_assert(pins.size() <= 1);
+ if (pins.empty()) {
+ return get_extent_if_live_ret(
+ get_extent_if_live_ertr::ready_future_marker{},
+ CachedExtentRef());
+ }
+
+ auto pin = std::move(pins.front());
+ pins.pop_front();
+ ceph_assert(pin->get_laddr() == laddr);
+ ceph_assert(pin->get_length() == (extent_len_t)len);
+ if (pin->get_paddr() == addr) {
+ return cache.get_extent_by_type(
+ t,
+ type,
+ addr,
+ laddr,
+ len).safe_then(
+ [this, pin=std::move(pin)](CachedExtentRef ret) mutable {
+ auto lref = ret->cast<LogicalCachedExtent>();
+ if (!lref->has_pin()) {
+ lref->set_pin(std::move(pin));
+ lba_manager.add_pin(lref->get_pin());
+ }
+ return get_extent_if_live_ret(
+ get_extent_if_live_ertr::ready_future_marker{},
+ ret);
+ });
+ } else {
+ return get_extent_if_live_ret(
+ get_extent_if_live_ertr::ready_future_marker{},
+ CachedExtentRef());
+ }
+ });
+ } else {
+ logger().debug(
+ "TransactionManager::get_extent_if_live: non-logical extent {}",
+ addr);
+ return lba_manager.get_physical_extent_if_live(
+ t,
+ type,
+ addr,
+ laddr,
+ len);
+ }
+}
+
+TransactionManager::~TransactionManager() {}
+
+}
diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h
new file mode 100644
index 000000000..d28fd0b87
--- /dev/null
+++ b/src/crimson/os/seastore/transaction_manager.h
@@ -0,0 +1,296 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+#include <optional>
+#include <vector>
+#include <utility>
+#include <functional>
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include <seastar/core/future.hh>
+
+#include "include/ceph_assert.h"
+#include "include/buffer.h"
+
+#include "crimson/osd/exceptions.h"
+
+#include "crimson/os/seastore/segment_cleaner.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/cache.h"
+#include "crimson/os/seastore/segment_manager.h"
+#include "crimson/os/seastore/lba_manager.h"
+#include "crimson/os/seastore/journal.h"
+
+namespace crimson::os::seastore {
+class Journal;
+
+/**
+ * TransactionManager
+ *
+ * Abstraction hiding reading from and writing to persistent storage.
+ * Exposes transaction based interface with read isolation.
+ */
+class TransactionManager : public SegmentCleaner::ExtentCallbackInterface {
+public:
+ TransactionManager(
+ SegmentManager &segment_manager,
+ SegmentCleaner &segment_cleaner,
+ Journal &journal,
+ Cache &cache,
+ LBAManager &lba_manager);
+
+ /// Writes initial metadata to disk
+ using mkfs_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error
+ >;
+ mkfs_ertr::future<> mkfs();
+
+ /// Reads initial metadata from disk
+ using mount_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error
+ >;
+ mount_ertr::future<> mount();
+
+ /// Closes transaction_manager
+ using close_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error
+ >;
+ close_ertr::future<> close();
+
+ /// Creates empty transaction
+ TransactionRef create_transaction() {
+ return make_transaction();
+ }
+
+ /// Creates weak transaction
+ TransactionRef create_weak_transaction() {
+ return make_weak_transaction();
+ }
+
+ /**
+ * Read extents corresponding to specified lba range
+ */
+ using read_extent_ertr = SegmentManager::read_ertr;
+ template <typename T>
+ using read_extent_ret = read_extent_ertr::future<lextent_list_t<T>>;
+ template <typename T>
+ read_extent_ret<T> read_extents(
+ Transaction &t,
+ laddr_t offset,
+ extent_len_t length)
+ {
+ std::unique_ptr<lextent_list_t<T>> ret =
+ std::make_unique<lextent_list_t<T>>();
+ auto &ret_ref = *ret;
+ std::unique_ptr<lba_pin_list_t> pin_list =
+ std::make_unique<lba_pin_list_t>();
+ auto &pin_list_ref = *pin_list;
+ return lba_manager.get_mapping(
+ t, offset, length
+ ).safe_then([this, &t, &pin_list_ref, &ret_ref](auto pins) {
+ crimson::get_logger(ceph_subsys_filestore).debug(
+ "read_extents: mappings {}",
+ pins);
+ pins.swap(pin_list_ref);
+ return crimson::do_for_each(
+ pin_list_ref.begin(),
+ pin_list_ref.end(),
+ [this, &t, &ret_ref](auto &pin) {
+ crimson::get_logger(ceph_subsys_filestore).debug(
+ "read_extents: get_extent {}~{}",
+ pin->get_paddr(),
+ pin->get_length());
+ return cache.get_extent<T>(
+ t,
+ pin->get_paddr(),
+ pin->get_length()
+ ).safe_then([this, &pin, &ret_ref](auto ref) mutable {
+ if (!ref->has_pin()) {
+ ref->set_pin(std::move(pin));
+ lba_manager.add_pin(ref->get_pin());
+ }
+ ret_ref.push_back(std::make_pair(ref->get_laddr(), ref));
+ crimson::get_logger(ceph_subsys_filestore).debug(
+ "read_extents: got extent {}",
+ *ref);
+ return read_extent_ertr::now();
+ });
+ });
+ }).safe_then([ret=std::move(ret), pin_list=std::move(pin_list)]() mutable {
+ return read_extent_ret<T>(
+ read_extent_ertr::ready_future_marker{},
+ std::move(*ret));
+ });
+ }
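+
+  // Hypothetical usage sketch (ExtentT, laddr and len are illustrative only):
+  //
+  //   tm.read_extents<ExtentT>(t, laddr, len).safe_then([](auto extents) {
+  //     for (auto &[laddr, ext] : extents) { /* consume each extent */ }
+  //   });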
+
+ /// Obtain mutable copy of extent
+ LogicalCachedExtentRef get_mutable_extent(Transaction &t, LogicalCachedExtentRef ref) {
+ auto &logger = crimson::get_logger(ceph_subsys_filestore);
+ auto ret = cache.duplicate_for_write(
+ t,
+ ref)->cast<LogicalCachedExtent>();
+ if (!ret->has_pin()) {
+ logger.debug(
+ "{}: duplicating {} for write: {}",
+ __func__,
+ *ref,
+ *ret);
+ ret->set_pin(ref->get_pin().duplicate());
+ } else {
+ logger.debug(
+ "{}: {} already pending",
+ __func__,
+ *ref);
+ assert(ref->is_pending());
+ assert(&*ref == &*ret);
+ }
+ return ret;
+ }
+
+
+ using ref_ertr = LBAManager::ref_ertr;
+ using ref_ret = ref_ertr::future<unsigned>;
+
+ /// Add refcount for ref
+ ref_ret inc_ref(
+ Transaction &t,
+ LogicalCachedExtentRef &ref);
+
+ /// Add refcount for offset
+ ref_ret inc_ref(
+ Transaction &t,
+ laddr_t offset);
+
+ /// Remove refcount for ref
+ ref_ret dec_ref(
+ Transaction &t,
+ LogicalCachedExtentRef &ref);
+
+ /// Remove refcount for offset
+ ref_ret dec_ref(
+ Transaction &t,
+ laddr_t offset);
+
+ /**
+ * alloc_extent
+ *
+   * Allocates a new block of type T mapped to the lowest laddr range of
+   * size len greater than hint.
+ */
+ using alloc_extent_ertr = SegmentManager::read_ertr;
+ template <typename T>
+ using alloc_extent_ret = alloc_extent_ertr::future<TCachedExtentRef<T>>;
+ template <typename T>
+ alloc_extent_ret<T> alloc_extent(
+ Transaction &t,
+ laddr_t hint,
+ extent_len_t len) {
+ auto ext = cache.alloc_new_extent<T>(
+ t,
+ len);
+ return lba_manager.alloc_extent(
+ t,
+ hint,
+ len,
+ ext->get_paddr()
+ ).safe_then([ext=std::move(ext)](auto &&ref) mutable {
+ ext->set_pin(std::move(ref));
+ return alloc_extent_ertr::make_ready_future<TCachedExtentRef<T>>(
+ std::move(ext));
+ });
+ }
+
+ /**
+ * submit_transaction
+ *
+ * Atomically submits transaction to persistence
+ */
+ using submit_transaction_ertr = crimson::errorator<
+ crimson::ct_error::eagain, // Caller should retry transaction from beginning
+ crimson::ct_error::input_output_error // Media error
+ >;
+ submit_transaction_ertr::future<> submit_transaction(TransactionRef);
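+
+  // On eagain the caller is expected to build a fresh transaction and redo
+  // its reads and mutations before submitting again, rather than resubmitting
+  // the same TransactionRef.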
+
+ /// SegmentCleaner::ExtentCallbackInterface
+
+ using SegmentCleaner::ExtentCallbackInterface::get_next_dirty_extents_ret;
+ get_next_dirty_extents_ret get_next_dirty_extents(
+ journal_seq_t seq) final;
+
+ using SegmentCleaner::ExtentCallbackInterface::rewrite_extent_ret;
+ rewrite_extent_ret rewrite_extent(
+ Transaction &t,
+ CachedExtentRef extent) final;
+
+ using SegmentCleaner::ExtentCallbackInterface::get_extent_if_live_ret;
+ get_extent_if_live_ret get_extent_if_live(
+ Transaction &t,
+ extent_types_t type,
+ paddr_t addr,
+ laddr_t laddr,
+ segment_off_t len) final;
+
+ using scan_extents_cursor =
+ SegmentCleaner::ExtentCallbackInterface::scan_extents_cursor;
+ using scan_extents_ertr =
+ SegmentCleaner::ExtentCallbackInterface::scan_extents_ertr;
+ using scan_extents_ret =
+ SegmentCleaner::ExtentCallbackInterface::scan_extents_ret;
+ scan_extents_ret scan_extents(
+ scan_extents_cursor &cursor,
+ extent_len_t bytes_to_read) final {
+ return journal.scan_extents(cursor, bytes_to_read);
+ }
+
+ using release_segment_ret =
+ SegmentCleaner::ExtentCallbackInterface::release_segment_ret;
+ release_segment_ret release_segment(
+ segment_id_t id) final {
+ return segment_manager.release(id);
+ }
+
+ /**
+ * read_onode_root
+ *
+ * Get onode-tree root logical address
+ */
+ using read_onode_root_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error
+ >;
+ using read_onode_root_ret = read_onode_root_ertr::future<laddr_t>;
+ read_onode_root_ret read_onode_root(Transaction &t) {
+ return cache.get_root(t).safe_then([](auto croot) {
+ return croot->get_root().onode_root;
+ });
+ }
+
+ /**
+ * write_onode_root
+ *
+ * Write onode-tree root logical address, must be called after read.
+ */
+ void write_onode_root(Transaction &t, laddr_t addr) {
+ auto croot = cache.get_root_fast(t);
+ croot = cache.duplicate_for_write(t, croot)->cast<RootBlock>();
+ croot->get_root().onode_root = addr;
+ }
+
+ ~TransactionManager();
+
+private:
+ friend class Transaction;
+
+ SegmentManager &segment_manager;
+ SegmentCleaner &segment_cleaner;
+ Cache &cache;
+ LBAManager &lba_manager;
+ Journal &journal;
+};
+using TransactionManagerRef = std::unique_ptr<TransactionManager>;
+
+}
diff --git a/src/crimson/osd/CMakeLists.txt b/src/crimson/osd/CMakeLists.txt
new file mode 100644
index 000000000..898f70c42
--- /dev/null
+++ b/src/crimson/osd/CMakeLists.txt
@@ -0,0 +1,57 @@
+add_executable(crimson-osd
+ backfill_state.cc
+ ec_backend.cc
+ heartbeat.cc
+ main.cc
+ osd.cc
+ osd_meta.cc
+ pg.cc
+ pg_backend.cc
+ pg_meta.cc
+ replicated_backend.cc
+ shard_services.cc
+ object_context.cc
+ ops_executer.cc
+ osd_operation.cc
+ osd_operations/client_request.cc
+ osd_operations/compound_peering_request.cc
+ osd_operations/peering_event.cc
+ osd_operations/pg_advance_map.cc
+ osd_operations/replicated_request.cc
+ osd_operations/background_recovery.cc
+ osd_operations/recovery_subrequest.cc
+ pg_recovery.cc
+ recovery_backend.cc
+ replicated_recovery_backend.cc
+ scheduler/scheduler.cc
+ scheduler/mclock_scheduler.cc
+ osdmap_gate.cc
+ pg_map.cc
+ objclass.cc
+ ${PROJECT_SOURCE_DIR}/src/objclass/class_api.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/ClassHandler.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/osd_op_util.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/OSDCap.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/PeeringState.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/PGPeeringEvent.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/PGStateUtils.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/MissingLoc.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/PGLog.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/recovery_types.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/osd_perf_counters.cc
+ watch.cc
+ )
+target_link_libraries(crimson-osd
+ crimson-admin
+ crimson-common
+ crimson-os
+ crimson
+ fmt::fmt
+ Boost::MPL
+ dmclock::dmclock)
+set_target_properties(crimson-osd PROPERTIES
+ POSITION_INDEPENDENT_CODE ${EXE_LINKER_USE_PIE})
+install(TARGETS crimson-osd DESTINATION bin)
+if(WITH_TESTS)
+ add_dependencies(tests crimson-osd)
+endif()
diff --git a/src/crimson/osd/acked_peers.h b/src/crimson/osd/acked_peers.h
new file mode 100644
index 000000000..b2f2562c0
--- /dev/null
+++ b/src/crimson/osd/acked_peers.h
@@ -0,0 +1,14 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <vector>
+
+namespace crimson::osd {
+ struct peer_shard_t {
+ pg_shard_t shard;
+ eversion_t last_complete_ondisk;
+ };
+ using acked_peers_t = std::vector<peer_shard_t>;
+}
diff --git a/src/crimson/osd/backfill_facades.h b/src/crimson/osd/backfill_facades.h
new file mode 100644
index 000000000..683dc6ea6
--- /dev/null
+++ b/src/crimson/osd/backfill_facades.h
@@ -0,0 +1,73 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/osd/backfill_state.h"
+#include "crimson/osd/pg.h"
+#include "osd/PeeringState.h"
+
+namespace crimson::osd {
+
+// PeeringFacade -- main implementation of the BackfillState::PeeringFacade
+// interface. The abstraction decouples BackfillState from PeeringState and
+// thus cuts dependencies in unit testing. The second implementation
+// is BackfillFixture::PeeringFacade and sits in test_backfill.cc.
+struct PeeringFacade final : BackfillState::PeeringFacade {
+ PeeringState& peering_state;
+
+ hobject_t earliest_backfill() const override {
+ return peering_state.earliest_backfill();
+ }
+
+ const std::set<pg_shard_t>& get_backfill_targets() const override {
+ return peering_state.get_backfill_targets();
+ }
+
+ const hobject_t& get_peer_last_backfill(pg_shard_t peer) const override {
+ return peering_state.get_peer_info(peer).last_backfill;
+ }
+
+ const eversion_t& get_last_update() const override {
+ return peering_state.get_info().last_update;
+ }
+
+ const eversion_t& get_log_tail() const override {
+ return peering_state.get_info().log_tail;
+ }
+
+ void scan_log_after(eversion_t v, scan_log_func_t f) const override {
+ peering_state.get_pg_log().get_log().scan_log_after(v, std::move(f));
+ }
+
+ bool is_backfill_target(pg_shard_t peer) const override {
+ return peering_state.is_backfill_target(peer);
+ }
+ void update_complete_backfill_object_stats(const hobject_t &hoid,
+ const pg_stat_t &stats) override {
+ peering_state.update_complete_backfill_object_stats(hoid, stats);
+ }
+
+ bool is_backfilling() const override {
+ return peering_state.is_backfilling();
+ }
+
+ PeeringFacade(PeeringState& peering_state)
+ : peering_state(peering_state) {
+ }
+};
+
+// PGFacade -- a facade (in the GoF-defined meaning) simplifying the huge
+// interface of crimson's PG class. The motivation is to have an inventory
+// of behaviour that must be provided by a unit test's mock.
+struct PGFacade final : BackfillState::PGFacade {
+ PG& pg;
+
+ const eversion_t& get_projected_last_update() const override {
+ return pg.projected_last_update;
+ }
+
+ PGFacade(PG& pg) : pg(pg) {}
+};
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/backfill_state.cc b/src/crimson/osd/backfill_state.cc
new file mode 100644
index 000000000..57f845f92
--- /dev/null
+++ b/src/crimson/osd/backfill_state.cc
@@ -0,0 +1,556 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <algorithm>
+#include <boost/type_index.hpp>
+
+#include "crimson/osd/backfill_state.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+BackfillState::BackfillState(
+ BackfillState::BackfillListener& backfill_listener,
+ std::unique_ptr<BackfillState::PeeringFacade> peering_state,
+ std::unique_ptr<BackfillState::PGFacade> pg)
+ : backfill_machine(*this,
+ backfill_listener,
+ std::move(peering_state),
+ std::move(pg)),
+ progress_tracker(
+ std::make_unique<BackfillState::ProgressTracker>(backfill_machine))
+{
+ logger().debug("{}:{}", __func__, __LINE__);
+ backfill_machine.initiate();
+}
+
+template <class S>
+BackfillState::StateHelper<S>::StateHelper()
+{
+ logger().debug("enter {}",
+ boost::typeindex::type_id<S>().pretty_name());
+}
+
+template <class S>
+BackfillState::StateHelper<S>::~StateHelper()
+{
+ logger().debug("exit {}",
+ boost::typeindex::type_id<S>().pretty_name());
+}
+
+BackfillState::~BackfillState() = default;
+
+BackfillState::BackfillMachine::BackfillMachine(
+ BackfillState& backfill_state,
+ BackfillState::BackfillListener& backfill_listener,
+ std::unique_ptr<BackfillState::PeeringFacade> peering_state,
+ std::unique_ptr<BackfillState::PGFacade> pg)
+ : backfill_state(backfill_state),
+ backfill_listener(backfill_listener),
+ peering_state(std::move(peering_state)),
+ pg(std::move(pg))
+{}
+
+BackfillState::BackfillMachine::~BackfillMachine() = default;
+
+BackfillState::Initial::Initial(my_context ctx)
+ : my_base(ctx)
+{
+ backfill_state().last_backfill_started = peering_state().earliest_backfill();
+ logger().debug("{}: bft={} from {}",
+ __func__, peering_state().get_backfill_targets(),
+ backfill_state().last_backfill_started);
+ for (const auto& bt : peering_state().get_backfill_targets()) {
+ logger().debug("{}: target shard {} from {}",
+ __func__, bt, peering_state().get_peer_last_backfill(bt));
+ }
+ ceph_assert(peering_state().get_backfill_targets().size());
+ ceph_assert(!backfill_state().last_backfill_started.is_max());
+}
+
+boost::statechart::result
+BackfillState::Initial::react(const BackfillState::Triggered& evt)
+{
+ logger().debug("{}: backfill triggered", __func__);
+ ceph_assert(backfill_state().last_backfill_started == \
+ peering_state().earliest_backfill());
+ ceph_assert(peering_state().is_backfilling());
+ // initialize BackfillIntervals
+ for (const auto& bt : peering_state().get_backfill_targets()) {
+ backfill_state().peer_backfill_info[bt].reset(
+ peering_state().get_peer_last_backfill(bt));
+ }
+ backfill_state().backfill_info.reset(backfill_state().last_backfill_started);
+ if (Enqueuing::all_enqueued(peering_state(),
+ backfill_state().backfill_info,
+ backfill_state().peer_backfill_info)) {
+ logger().debug("{}: switching to Done state", __func__);
+ return transit<BackfillState::Done>();
+ } else {
+ logger().debug("{}: switching to Enqueuing state", __func__);
+ return transit<BackfillState::Enqueuing>();
+ }
+}
+
+
+// -- Enqueuing
+void BackfillState::Enqueuing::maybe_update_range()
+{
+ if (auto& primary_bi = backfill_state().backfill_info;
+ primary_bi.version >= pg().get_projected_last_update()) {
+ logger().info("{}: bi is current", __func__);
+ ceph_assert(primary_bi.version == pg().get_projected_last_update());
+ } else if (primary_bi.version >= peering_state().get_log_tail()) {
+#if 0
+ if (peering_state().get_pg_log().get_log().empty() &&
+ pg().get_projected_log().empty()) {
+ /* Because we don't move log_tail on split, the log might be
+ * empty even if log_tail != last_update. However, the only
+ * way to get here with an empty log is if log_tail is actually
+ * eversion_t(), because otherwise the entry which changed
+ * last_update since the last scan would have to be present.
+ */
+ ceph_assert(primary_bi.version == eversion_t());
+ return;
+ }
+#endif
+ logger().debug("{}: bi is old, ({}) can be updated with log to {}",
+ __func__,
+ primary_bi.version,
+ pg().get_projected_last_update());
+ logger().debug("{}: scanning pg log first", __func__);
+ peering_state().scan_log_after(primary_bi.version,
+ [&](const pg_log_entry_t& e) {
+ logger().debug("maybe_update_range(lambda): updating from version {}",
+ e.version);
+ if (e.soid >= primary_bi.begin && e.soid < primary_bi.end) {
+ if (e.is_update()) {
+ logger().debug("maybe_update_range(lambda): {} updated to ver {}",
+ e.soid, e.version);
+ primary_bi.objects.erase(e.soid);
+ primary_bi.objects.insert(std::make_pair(e.soid,
+ e.version));
+ } else if (e.is_delete()) {
+ logger().debug("maybe_update_range(lambda): {} removed",
+ e.soid);
+ primary_bi.objects.erase(e.soid);
+ }
+ }
+ });
+ primary_bi.version = pg().get_projected_last_update();
+ } else {
+ ceph_abort_msg(
+ "scan_range should have raised primary_bi.version past log_tail");
+ }
+}
+
+void BackfillState::Enqueuing::trim_backfill_infos()
+{
+ for (const auto& bt : peering_state().get_backfill_targets()) {
+ backfill_state().peer_backfill_info[bt].trim_to(
+ std::max(peering_state().get_peer_last_backfill(bt),
+ backfill_state().last_backfill_started));
+ }
+ backfill_state().backfill_info.trim_to(
+ backfill_state().last_backfill_started);
+}
+
+/* static */ bool BackfillState::Enqueuing::all_enqueued(
+ const PeeringFacade& peering_state,
+ const BackfillInterval& backfill_info,
+ const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info)
+{
+ const bool all_local_enqueued = \
+ backfill_info.extends_to_end() && backfill_info.empty();
+ const bool all_peer_enqueued = std::all_of(
+ std::begin(peer_backfill_info),
+ std::end(peer_backfill_info),
+ [] (const auto& kv) {
+ [[maybe_unused]] const auto& [ shard, peer_backfill_info ] = kv;
+ return peer_backfill_info.extends_to_end() && peer_backfill_info.empty();
+ });
+ return all_local_enqueued && all_peer_enqueued;
+}
+
+hobject_t BackfillState::Enqueuing::earliest_peer_backfill(
+ const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const
+{
+ hobject_t e = hobject_t::get_max();
+ for (const pg_shard_t& bt : peering_state().get_backfill_targets()) {
+ const auto iter = peer_backfill_info.find(bt);
+ ceph_assert(iter != peer_backfill_info.end());
+ e = std::min(e, iter->second.begin);
+ }
+ return e;
+}
+
+bool BackfillState::Enqueuing::should_rescan_replicas(
+ const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info,
+ const BackfillInterval& backfill_info) const
+{
+ const auto& targets = peering_state().get_backfill_targets();
+ return std::any_of(std::begin(targets), std::end(targets),
+ [&] (const auto& bt) {
+ return ReplicasScanning::replica_needs_scan(peer_backfill_info.at(bt),
+ backfill_info);
+ });
+}
+
+bool BackfillState::Enqueuing::should_rescan_primary(
+ const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info,
+ const BackfillInterval& backfill_info) const
+{
+ return backfill_info.begin <= earliest_peer_backfill(peer_backfill_info) &&
+ !backfill_info.extends_to_end();
+}
+
+void BackfillState::Enqueuing::trim_backfilled_object_from_intervals(
+ BackfillState::Enqueuing::result_t&& result,
+ hobject_t& last_backfill_started,
+ std::map<pg_shard_t, BackfillInterval>& peer_backfill_info)
+{
+ std::for_each(std::begin(result.pbi_targets), std::end(result.pbi_targets),
+ [&peer_backfill_info] (const auto& bt) {
+ peer_backfill_info.at(bt).pop_front();
+ });
+ last_backfill_started = std::move(result.new_last_backfill_started);
+}
+
+BackfillState::Enqueuing::result_t
+BackfillState::Enqueuing::remove_on_peers(const hobject_t& check)
+{
+ // set `new_last_backfill_started` to `check`
+ result_t result { {}, check };
+ for (const auto& bt : peering_state().get_backfill_targets()) {
+ const auto& pbi = backfill_state().peer_backfill_info.at(bt);
+ if (pbi.begin == check) {
+ result.pbi_targets.insert(bt);
+ const auto& version = pbi.objects.begin()->second;
+ backfill_state().progress_tracker->enqueue_drop(pbi.begin);
+ backfill_listener().enqueue_drop(bt, pbi.begin, version);
+ }
+ }
+ logger().debug("{}: BACKFILL removing {} from peers {}",
+ __func__, check, result.pbi_targets);
+ ceph_assert(!result.pbi_targets.empty());
+ return result;
+}
+
+BackfillState::Enqueuing::result_t
+BackfillState::Enqueuing::update_on_peers(const hobject_t& check)
+{
+ logger().debug("{}: check={}", __func__, check);
+ const auto& primary_bi = backfill_state().backfill_info;
+ result_t result { {}, primary_bi.begin };
+
+ for (const auto& bt : peering_state().get_backfill_targets()) {
+ const auto& peer_bi = backfill_state().peer_backfill_info.at(bt);
+
+ // Find all check peers that have the wrong version
+ if (const eversion_t& obj_v = primary_bi.objects.begin()->second;
+ check == primary_bi.begin && check == peer_bi.begin) {
+ if(peer_bi.objects.begin()->second != obj_v &&
+ backfill_state().progress_tracker->enqueue_push(primary_bi.begin)) {
+ backfill_listener().enqueue_push(primary_bi.begin, obj_v);
+ } else {
+        // either the object is already in sync, or it is already being recovered
+ }
+ result.pbi_targets.insert(bt);
+ } else {
+      // Only include peers whose backfill line we have caught up to;
+      // otherwise they only appear to be missing this object
+      // because their peer_bi.begin > backfill_info.begin.
+ if (primary_bi.begin > peering_state().get_peer_last_backfill(bt) &&
+ backfill_state().progress_tracker->enqueue_push(primary_bi.begin)) {
+ backfill_listener().enqueue_push(primary_bi.begin, obj_v);
+ }
+ }
+ }
+ return result;
+}
+
+bool BackfillState::Enqueuing::all_emptied(
+ const BackfillInterval& local_backfill_info,
+ const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const
+{
+ const auto& targets = peering_state().get_backfill_targets();
+ const auto replicas_emptied =
+ std::all_of(std::begin(targets), std::end(targets),
+ [&] (const auto& bt) {
+ return peer_backfill_info.at(bt).empty();
+ });
+ return local_backfill_info.empty() && replicas_emptied;
+}
+
+BackfillState::Enqueuing::Enqueuing(my_context ctx)
+ : my_base(ctx)
+{
+ auto& primary_bi = backfill_state().backfill_info;
+
+ // update our local interval to cope with recent changes
+ primary_bi.begin = backfill_state().last_backfill_started;
+ if (primary_bi.version < peering_state().get_log_tail()) {
+    // it might be that the OSD is so flooded with modifying operations
+    // that backfill will keep spinning here over and over. For the sake
+    // of performance and complexity we don't synchronize with the entire PG.
+    // The same can happen in the classical OSD.
+ logger().warn("{}: bi is old, rescanning of local backfill_info",
+ __func__);
+ post_event(RequestPrimaryScanning{});
+ return;
+ } else {
+ maybe_update_range();
+ }
+ trim_backfill_infos();
+
+ while (!all_emptied(primary_bi, backfill_state().peer_backfill_info)) {
+ if (!backfill_listener().budget_available()) {
+ post_event(RequestWaiting{});
+ return;
+ } else if (should_rescan_replicas(backfill_state().peer_backfill_info,
+ primary_bi)) {
+ // Count simultaneous scans as a single op and let those complete
+ post_event(RequestReplicasScanning{});
+ return;
+ }
+    // Get the object within the set of peers to operate on, and the set of
+    // targets to which that object applies.
+ if (const hobject_t check = \
+ earliest_peer_backfill(backfill_state().peer_backfill_info);
+ check < primary_bi.begin) {
+      // Don't increment ops here because deletions
+      // are cheap and not replied to, unlike real recovery_ops,
+      // and we can't increment ops without requeueing ourselves
+      // for recovery.
+ auto result = remove_on_peers(check);
+ trim_backfilled_object_from_intervals(std::move(result),
+ backfill_state().last_backfill_started,
+ backfill_state().peer_backfill_info);
+ } else {
+ auto result = update_on_peers(check);
+ trim_backfilled_object_from_intervals(std::move(result),
+ backfill_state().last_backfill_started,
+ backfill_state().peer_backfill_info);
+ primary_bi.pop_front();
+ }
+ backfill_listener().maybe_flush();
+ }
+
+ if (should_rescan_primary(backfill_state().peer_backfill_info,
+ primary_bi)) {
+    // need to grab another chunk of the object namespace and restart
+    // the queueing.
+ logger().debug("{}: reached end for current local chunk",
+ __func__);
+ post_event(RequestPrimaryScanning{});
+ } else if (backfill_state().progress_tracker->tracked_objects_completed()) {
+ post_event(RequestDone{});
+ } else {
+ logger().debug("{}: reached end for both local and all peers "
+ "but still has in-flight operations", __func__);
+ post_event(RequestWaiting{});
+ }
+}
+
+// -- PrimaryScanning
+BackfillState::PrimaryScanning::PrimaryScanning(my_context ctx)
+ : my_base(ctx)
+{
+ backfill_state().backfill_info.version = peering_state().get_last_update();
+ backfill_listener().request_primary_scan(
+ backfill_state().backfill_info.begin);
+}
+
+boost::statechart::result
+BackfillState::PrimaryScanning::react(PrimaryScanned evt)
+{
+ logger().debug("{}", __func__);
+ backfill_state().backfill_info = std::move(evt.result);
+ return transit<Enqueuing>();
+}
+
+boost::statechart::result
+BackfillState::PrimaryScanning::react(ObjectPushed evt)
+{
+ logger().debug("PrimaryScanning::react() on ObjectPushed; evt.object={}",
+ evt.object);
+ backfill_state().progress_tracker->complete_to(evt.object, evt.stat);
+ return discard_event();
+}
+
+// -- ReplicasScanning
+bool BackfillState::ReplicasScanning::replica_needs_scan(
+ const BackfillInterval& replica_backfill_info,
+ const BackfillInterval& local_backfill_info)
+{
+ return replica_backfill_info.empty() && \
+ replica_backfill_info.begin <= local_backfill_info.begin && \
+ !replica_backfill_info.extends_to_end();
+}
+
+BackfillState::ReplicasScanning::ReplicasScanning(my_context ctx)
+ : my_base(ctx)
+{
+ for (const auto& bt : peering_state().get_backfill_targets()) {
+ if (const auto& pbi = backfill_state().peer_backfill_info.at(bt);
+ replica_needs_scan(pbi, backfill_state().backfill_info)) {
+ logger().debug("{}: scanning peer osd.{} from {}",
+ __func__, bt, pbi.end);
+ backfill_listener().request_replica_scan(bt, pbi.end, hobject_t{});
+
+ ceph_assert(waiting_on_backfill.find(bt) == \
+ waiting_on_backfill.end());
+ waiting_on_backfill.insert(bt);
+ }
+ }
+ ceph_assert(!waiting_on_backfill.empty());
+ // TODO: start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
+}
+
+#if 0
+BackfillState::ReplicasScanning::~ReplicasScanning()
+{
+ // TODO: finish_recovery_op(hobject_t::get_max());
+}
+#endif
+
+boost::statechart::result
+BackfillState::ReplicasScanning::react(ReplicaScanned evt)
+{
+ logger().debug("{}: got scan result from osd={}, result={}",
+ __func__, evt.from, evt.result);
+ // TODO: maybe we'll be able to move waiting_on_backfill from
+ // the machine to the state.
+ ceph_assert(peering_state().is_backfill_target(evt.from));
+ if (waiting_on_backfill.erase(evt.from)) {
+ backfill_state().peer_backfill_info[evt.from] = std::move(evt.result);
+ if (waiting_on_backfill.empty()) {
+ ceph_assert(backfill_state().peer_backfill_info.size() == \
+ peering_state().get_backfill_targets().size());
+ return transit<Enqueuing>();
+ }
+ } else {
+    // we canceled backfill for a while due to a too-full condition, and this
+    // is an extra response from a non-too-full peer
+ logger().debug("{}: canceled backfill (too full?)", __func__);
+ }
+ return discard_event();
+}
+
+boost::statechart::result
+BackfillState::ReplicasScanning::react(ObjectPushed evt)
+{
+ logger().debug("ReplicasScanning::react() on ObjectPushed; evt.object={}",
+ evt.object);
+ backfill_state().progress_tracker->complete_to(evt.object, evt.stat);
+ return discard_event();
+}
+
+
+// -- Waiting
+BackfillState::Waiting::Waiting(my_context ctx)
+ : my_base(ctx)
+{
+}
+
+boost::statechart::result
+BackfillState::Waiting::react(ObjectPushed evt)
+{
+ logger().debug("Waiting::react() on ObjectPushed; evt.object={}",
+ evt.object);
+ backfill_state().progress_tracker->complete_to(evt.object, evt.stat);
+ if (!Enqueuing::all_enqueued(peering_state(),
+ backfill_state().backfill_info,
+ backfill_state().peer_backfill_info)) {
+ return transit<Enqueuing>();
+ } else if (backfill_state().progress_tracker->tracked_objects_completed()) {
+ return transit<Done>();
+ } else {
+ // we still have something to wait on
+ logger().debug("Waiting::react() on ObjectPushed; still waiting");
+ return discard_event();
+ }
+}
+
+// -- Done
+BackfillState::Done::Done(my_context ctx)
+ : my_base(ctx)
+{
+ logger().info("{}: backfill is done", __func__);
+ backfill_listener().backfilled();
+}
+
+// -- Crashed
+BackfillState::Crashed::Crashed()
+{
+ ceph_abort_msg("{}: this should not happen");
+}
+
+// ProgressTracker is an intermediary between the BackfillListener and
+// BackfillMachine + its states. All requests to push or drop an object
+// are directed through it. The same happens with notifications about
+// completing given operations, which are generated by BackfillListener
+// and dispatched as e.g. ObjectPushed events.
+// This allows ProgressTracker to track the list of in-flight operations,
+// which is essential for deciding whether the entire machine should
+// switch from Waiting to Done or stay in Waiting.
+// ProgressTracker also coordinates .last_backfill_started and stats
+// updates.
+bool BackfillState::ProgressTracker::tracked_objects_completed() const
+{
+ return registry.empty();
+}
+
+bool BackfillState::ProgressTracker::enqueue_push(const hobject_t& obj)
+{
+ [[maybe_unused]] const auto [it, first_seen] = registry.try_emplace(
+ obj, registry_item_t{op_stage_t::enqueued_push, std::nullopt});
+ return first_seen;
+}
+
+void BackfillState::ProgressTracker::enqueue_drop(const hobject_t& obj)
+{
+ registry.try_emplace(
+ obj, registry_item_t{op_stage_t::enqueued_drop, pg_stat_t{}});
+}
+
+void BackfillState::ProgressTracker::complete_to(
+ const hobject_t& obj,
+ const pg_stat_t& stats)
+{
+ logger().debug("{}: obj={}",
+ __func__, obj);
+ if (auto completion_iter = registry.find(obj);
+ completion_iter != std::end(registry)) {
+ completion_iter->second = \
+ registry_item_t{ op_stage_t::completed_push, stats };
+ } else {
+ ceph_abort_msg("completing untracked object shall not happen");
+ }
+ for (auto it = std::begin(registry);
+ it != std::end(registry) &&
+ it->second.stage != op_stage_t::enqueued_push;
+ it = registry.erase(it)) {
+ auto& [soid, item] = *it;
+ assert(item.stats);
+ peering_state().update_complete_backfill_object_stats(
+ soid,
+ *item.stats);
+ }
+ if (Enqueuing::all_enqueued(peering_state(),
+ backfill_state().backfill_info,
+ backfill_state().peer_backfill_info) &&
+ tracked_objects_completed()) {
+ backfill_state().last_backfill_started = hobject_t::get_max();
+ backfill_listener().update_peers_last_backfill(hobject_t::get_max());
+ } else {
+ backfill_listener().update_peers_last_backfill(obj);
+ }
+}
+
+} // namespace crimson::osd
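Editorial note: the file above is essentially one boost::statechart machine -- states post internal events (RequestPrimaryScanning, RequestWaiting, ...) and the reactions declared in backfill_state.h decide the next state. The following self-contained sketch mirrors that pattern in miniature (it uses sc::simple_state instead of sc::state for brevity; all names are illustrative only):

    // hypothetical sketch -- not part of the patch
    #include <iostream>
    #include <boost/mpl/list.hpp>
    #include <boost/statechart/event.hpp>
    #include <boost/statechart/state_machine.hpp>
    #include <boost/statechart/simple_state.hpp>
    #include <boost/statechart/custom_reaction.hpp>
    #include <boost/statechart/transition.hpp>

    namespace sc = boost::statechart;

    // events
    struct Triggered : sc::event<Triggered> {};
    struct Scanned   : sc::event<Scanned> {};

    // machine with Idle as the initial state
    struct Idle;
    struct Machine : sc::state_machine<Machine, Idle> {};

    struct Done : sc::simple_state<Done, Machine> {};

    struct Scanning : sc::simple_state<Scanning, Machine> {
      using reactions = boost::mpl::list<sc::transition<Scanned, Done>>;
    };

    struct Idle : sc::simple_state<Idle, Machine> {
      using reactions = boost::mpl::list<sc::custom_reaction<Triggered>>;
      sc::result react(const Triggered&) {
        std::cout << "Triggered: Idle -> Scanning\n";
        return transit<Scanning>();
      }
    };

    int main() {
      Machine machine;
      machine.initiate();                 // enters Idle
      machine.process_event(Triggered{}); // custom reaction -> Scanning
      machine.process_event(Scanned{});   // declared transition -> Done
    }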
diff --git a/src/crimson/osd/backfill_state.h b/src/crimson/osd/backfill_state.h
new file mode 100644
index 000000000..4bd2991fb
--- /dev/null
+++ b/src/crimson/osd/backfill_state.h
@@ -0,0 +1,382 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <optional>
+
+#include <boost/statechart/custom_reaction.hpp>
+#include <boost/statechart/event.hpp>
+#include <boost/statechart/event_base.hpp>
+#include <boost/statechart/simple_state.hpp>
+#include <boost/statechart/state.hpp>
+#include <boost/statechart/state_machine.hpp>
+#include <boost/statechart/transition.hpp>
+
+#include "osd/recovery_types.h"
+
+namespace crimson::osd {
+
+namespace sc = boost::statechart;
+
+struct BackfillState {
+ struct BackfillListener;
+ struct PeeringFacade;
+ struct PGFacade;
+
+  // events come first
+ struct PrimaryScanned : sc::event<PrimaryScanned> {
+ BackfillInterval result;
+ PrimaryScanned(BackfillInterval&& result)
+ : result(std::move(result)) {
+ }
+ };
+
+ struct ReplicaScanned : sc::event<ReplicaScanned> {
+ pg_shard_t from;
+ BackfillInterval result;
+ ReplicaScanned(pg_shard_t from, BackfillInterval&& result)
+ : from(std::move(from)),
+ result(std::move(result)) {
+ }
+ };
+
+ struct ObjectPushed : sc::event<ObjectPushed> {
+ // TODO: implement replica management; I don't want to follow
+    // the current convention where the backend layer is responsible
+ // for tracking replicas.
+ hobject_t object;
+ pg_stat_t stat;
+ ObjectPushed(hobject_t object)
+ : object(std::move(object)) {
+ }
+ };
+
+ struct Triggered : sc::event<Triggered> {
+ };
+
+private:
+ // internal events
+ struct RequestPrimaryScanning : sc::event<RequestPrimaryScanning> {
+ };
+
+ struct RequestReplicasScanning : sc::event<RequestReplicasScanning> {
+ };
+
+ struct RequestWaiting : sc::event<RequestWaiting> {
+ };
+
+ struct RequestDone : sc::event<RequestDone> {
+ };
+
+ class ProgressTracker;
+
+public:
+
+ struct Initial;
+ struct Enqueuing;
+ struct PrimaryScanning;
+ struct ReplicasScanning;
+ struct Waiting;
+ struct Done;
+
+ struct BackfillMachine : sc::state_machine<BackfillMachine, Initial> {
+ BackfillMachine(BackfillState& backfill_state,
+ BackfillListener& backfill_listener,
+ std::unique_ptr<PeeringFacade> peering_state,
+ std::unique_ptr<PGFacade> pg);
+ ~BackfillMachine();
+ BackfillState& backfill_state;
+ BackfillListener& backfill_listener;
+ std::unique_ptr<PeeringFacade> peering_state;
+ std::unique_ptr<PGFacade> pg;
+ };
+
+private:
+ template <class S>
+ struct StateHelper {
+ StateHelper();
+ ~StateHelper();
+
+ BackfillState& backfill_state() {
+ return static_cast<S*>(this) \
+ ->template context<BackfillMachine>().backfill_state;
+ }
+ BackfillListener& backfill_listener() {
+ return static_cast<S*>(this) \
+ ->template context<BackfillMachine>().backfill_listener;
+ }
+ PeeringFacade& peering_state() {
+ return *static_cast<S*>(this) \
+ ->template context<BackfillMachine>().peering_state;
+ }
+ PGFacade& pg() {
+ return *static_cast<S*>(this)->template context<BackfillMachine>().pg;
+ }
+
+ const PeeringFacade& peering_state() const {
+ return *static_cast<const S*>(this) \
+ ->template context<BackfillMachine>().peering_state;
+ }
+ const BackfillState& backfill_state() const {
+ return static_cast<const S*>(this) \
+ ->template context<BackfillMachine>().backfill_state;
+ }
+ };
+
+public:
+
+ // states
+ struct Crashed : sc::simple_state<Crashed, BackfillMachine>,
+ StateHelper<Crashed> {
+ explicit Crashed();
+ };
+
+ struct Initial : sc::state<Initial, BackfillMachine>,
+ StateHelper<Initial> {
+ using reactions = boost::mpl::list<
+ sc::custom_reaction<Triggered>,
+ sc::transition<sc::event_base, Crashed>>;
+ explicit Initial(my_context);
+    // initialized after backfill is triggered by on_activate_complete();
+    // transits to Enqueuing.
+ sc::result react(const Triggered&);
+ };
+
+ struct Enqueuing : sc::state<Enqueuing, BackfillMachine>,
+ StateHelper<Enqueuing> {
+ using reactions = boost::mpl::list<
+ sc::transition<RequestPrimaryScanning, PrimaryScanning>,
+ sc::transition<RequestReplicasScanning, ReplicasScanning>,
+ sc::transition<RequestWaiting, Waiting>,
+ sc::transition<RequestDone, Done>,
+ sc::transition<sc::event_base, Crashed>>;
+ explicit Enqueuing(my_context);
+
+    // indicates whether there is any remaining work to do when it comes
+    // to comparing the hobject_t namespace between primary and replicas.
+    // true doesn't necessarily mean backfill is done -- there could be
+    // in-flight pushes or drops which have been enqueued but aren't
+    // completed yet.
+ static bool all_enqueued(
+ const PeeringFacade& peering_state,
+ const BackfillInterval& backfill_info,
+ const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info);
+
+ private:
+ void maybe_update_range();
+ void trim_backfill_infos();
+
+ // these methods take BackfillIntervals instead of extracting them from
+ // the state to emphasize the relationships across the main loop.
+ bool all_emptied(
+ const BackfillInterval& local_backfill_info,
+ const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const;
+ hobject_t earliest_peer_backfill(
+ const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const;
+ bool should_rescan_replicas(
+ const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info,
+ const BackfillInterval& backfill_info) const;
+    // indicates whether the acting primary needs to be scanned again
+    // to process the next piece of the hobject_t namespace.
+    // the logic is analogous to replica_needs_scan(); see the comments there.
+ bool should_rescan_primary(
+ const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info,
+ const BackfillInterval& backfill_info) const;
+
+    // the result_t is an intermediary between {remove,update}_on_peers() and
+ // updating BackfillIntervals in trim_backfilled_object_from_intervals.
+ // This step is important because it affects the main loop's condition,
+ // and thus deserves to be exposed instead of being called deeply from
+ // {remove,update}_on_peers().
+ struct [[nodiscard]] result_t {
+ std::set<pg_shard_t> pbi_targets;
+ hobject_t new_last_backfill_started;
+ };
+ void trim_backfilled_object_from_intervals(
+ result_t&&,
+ hobject_t& last_backfill_started,
+ std::map<pg_shard_t, BackfillInterval>& peer_backfill_info);
+ result_t remove_on_peers(const hobject_t& check);
+ result_t update_on_peers(const hobject_t& check);
+ };
+
+ struct PrimaryScanning : sc::state<PrimaryScanning, BackfillMachine>,
+ StateHelper<PrimaryScanning> {
+ using reactions = boost::mpl::list<
+ sc::custom_reaction<ObjectPushed>,
+ sc::custom_reaction<PrimaryScanned>,
+ sc::transition<sc::event_base, Crashed>>;
+ explicit PrimaryScanning(my_context);
+ sc::result react(ObjectPushed);
+ // collect scanning result and transit to Enqueuing.
+ sc::result react(PrimaryScanned);
+ };
+
+ struct ReplicasScanning : sc::state<ReplicasScanning, BackfillMachine>,
+ StateHelper<ReplicasScanning> {
+ using reactions = boost::mpl::list<
+ sc::custom_reaction<ObjectPushed>,
+ sc::custom_reaction<ReplicaScanned>,
+ sc::transition<sc::event_base, Crashed>>;
+ explicit ReplicasScanning(my_context);
+ // collect scanning result; if all results are collected, transition
+ // to Enqueuing will happen.
+ sc::result react(ObjectPushed);
+ sc::result react(ReplicaScanned);
+
+    // indicates whether a particular peer should be scanned to retrieve
+    // the BackfillInterval for a new range of the hobject_t namespace.
+    // true when the replica's bi.objects is exhausted, its end is not MAX,
+    // and the primary's bi.begin is further than the replica's.
+ static bool replica_needs_scan(
+ const BackfillInterval& replica_backfill_info,
+ const BackfillInterval& local_backfill_info);
+
+ private:
+ std::set<pg_shard_t> waiting_on_backfill;
+ };
+
+ struct Waiting : sc::state<Waiting, BackfillMachine>,
+ StateHelper<Waiting> {
+ using reactions = boost::mpl::list<
+ sc::custom_reaction<ObjectPushed>,
+ sc::transition<sc::event_base, Crashed>>;
+ explicit Waiting(my_context);
+ sc::result react(ObjectPushed);
+ };
+
+ struct Done : sc::state<Done, BackfillMachine>,
+ StateHelper<Done> {
+ using reactions = boost::mpl::list<
+ sc::transition<sc::event_base, Crashed>>;
+ explicit Done(my_context);
+ };
+
+ BackfillState(BackfillListener& backfill_listener,
+ std::unique_ptr<PeeringFacade> peering_state,
+ std::unique_ptr<PGFacade> pg);
+ ~BackfillState();
+
+ void process_event(
+ boost::intrusive_ptr<const sc::event_base> evt) {
+ backfill_machine.process_event(*std::move(evt));
+ }
+
+ hobject_t get_last_backfill_started() const {
+ return last_backfill_started;
+ }
+private:
+ hobject_t last_backfill_started;
+ BackfillInterval backfill_info;
+ std::map<pg_shard_t, BackfillInterval> peer_backfill_info;
+ BackfillMachine backfill_machine;
+ std::unique_ptr<ProgressTracker> progress_tracker;
+};
+
+// BackfillListener -- an interface used by the backfill FSM to request
+// low-level services like issuing `MOSDPGPush` or `MOSDPGBackfillRemove`.
+// The goals behind the interface are: 1) unit-testability; 2) the possibility
+// of retrofitting the classical OSD with BackfillState. For the second reason
+// we never use `seastar::future` -- instead responses to the requests are
+// conveyed as events; see ObjectPushed as an example.
+struct BackfillState::BackfillListener {
+ virtual void request_replica_scan(
+ const pg_shard_t& target,
+ const hobject_t& begin,
+ const hobject_t& end) = 0;
+
+ virtual void request_primary_scan(
+ const hobject_t& begin) = 0;
+
+ virtual void enqueue_push(
+ const hobject_t& obj,
+ const eversion_t& v) = 0;
+
+ virtual void enqueue_drop(
+ const pg_shard_t& target,
+ const hobject_t& obj,
+ const eversion_t& v) = 0;
+
+ virtual void maybe_flush() = 0;
+
+ virtual void update_peers_last_backfill(
+ const hobject_t& new_last_backfill) = 0;
+
+ virtual bool budget_available() const = 0;
+
+ virtual void backfilled() = 0;
+
+ virtual ~BackfillListener() = default;
+};
+
+// PeeringFacade -- a facade (in the GoF-defined meaning) simplifying
+// the interface of PeeringState. The motivation is to have an inventory
+// of behaviour that must be provided by a unit test's mock.
+struct BackfillState::PeeringFacade {
+ virtual hobject_t earliest_backfill() const = 0;
+ virtual const std::set<pg_shard_t>& get_backfill_targets() const = 0;
+ virtual const hobject_t& get_peer_last_backfill(pg_shard_t peer) const = 0;
+ virtual const eversion_t& get_last_update() const = 0;
+ virtual const eversion_t& get_log_tail() const = 0;
+
+  // the performance impact of `std::function` has not been considered yet.
+  // If there is any evidence (e.g. from profiling) of its significance, we
+  // can switch back to the template variant.
+ using scan_log_func_t = std::function<void(const pg_log_entry_t&)>;
+ virtual void scan_log_after(eversion_t, scan_log_func_t) const = 0;
+
+ virtual bool is_backfill_target(pg_shard_t peer) const = 0;
+ virtual void update_complete_backfill_object_stats(const hobject_t &hoid,
+ const pg_stat_t &stats) = 0;
+ virtual bool is_backfilling() const = 0;
+ virtual ~PeeringFacade() {}
+};
+
+// PGFacade -- a facade (in the GoF-defined meaning) simplifying the huge
+// interface of crimson's PG class. The motivation is to have an inventory
+// of behaviour that must be provided by a unit test's mock.
+struct BackfillState::PGFacade {
+ virtual const eversion_t& get_projected_last_update() const = 0;
+ virtual ~PGFacade() {}
+};
+
+class BackfillState::ProgressTracker {
+ // TODO: apply_stat,
+ enum class op_stage_t {
+ enqueued_push,
+ enqueued_drop,
+ completed_push,
+ };
+
+ struct registry_item_t {
+ op_stage_t stage;
+ std::optional<pg_stat_t> stats;
+ };
+
+ BackfillMachine& backfill_machine;
+ std::map<hobject_t, registry_item_t> registry;
+
+ BackfillState& backfill_state() {
+ return backfill_machine.backfill_state;
+ }
+ PeeringFacade& peering_state() {
+ return *backfill_machine.peering_state;
+ }
+ BackfillListener& backfill_listener() {
+ return backfill_machine.backfill_listener;
+ }
+
+public:
+ ProgressTracker(BackfillMachine& backfill_machine)
+ : backfill_machine(backfill_machine) {
+ }
+
+ bool tracked_objects_completed() const;
+
+ bool enqueue_push(const hobject_t&);
+ void enqueue_drop(const hobject_t&);
+ void complete_to(const hobject_t&, const pg_stat_t&);
+};
+
+} // namespace crimson::osd
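Editorial note: BackfillState::BackfillListener above is the seam used for unit testing. A do-nothing implementation, built only from the signatures declared in this header, would look roughly like this (assuming it is compiled inside the Ceph tree so the usual OSD types are available):

    // hypothetical sketch -- not part of the patch
    #include "crimson/osd/backfill_state.h"

    struct NullBackfillListener final
      : crimson::osd::BackfillState::BackfillListener {
      void request_replica_scan(const pg_shard_t&,
                                const hobject_t&,
                                const hobject_t&) override {}
      void request_primary_scan(const hobject_t&) override {}
      void enqueue_push(const hobject_t&, const eversion_t&) override {}
      void enqueue_drop(const pg_shard_t&,
                        const hobject_t&,
                        const eversion_t&) override {}
      void maybe_flush() override {}
      void update_peers_last_backfill(const hobject_t&) override {}
      bool budget_available() const override { return true; }
      void backfilled() override {}
    };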
diff --git a/src/crimson/osd/ec_backend.cc b/src/crimson/osd/ec_backend.cc
new file mode 100644
index 000000000..c6516d50a
--- /dev/null
+++ b/src/crimson/osd/ec_backend.cc
@@ -0,0 +1,35 @@
+#include "ec_backend.h"
+
+#include "crimson/osd/shard_services.h"
+
+ECBackend::ECBackend(shard_id_t shard,
+ ECBackend::CollectionRef coll,
+ crimson::osd::ShardServices& shard_services,
+ const ec_profile_t&,
+ uint64_t)
+ : PGBackend{shard, coll, &shard_services.get_store()}
+{
+ // todo
+}
+
+ECBackend::ll_read_errorator::future<ceph::bufferlist>
+ECBackend::_read(const hobject_t& hoid,
+ const uint64_t off,
+ const uint64_t len,
+ const uint32_t flags)
+{
+ // todo
+ return seastar::make_ready_future<bufferlist>();
+}
+
+seastar::future<crimson::osd::acked_peers_t>
+ECBackend::_submit_transaction(std::set<pg_shard_t>&& pg_shards,
+ const hobject_t& hoid,
+ ceph::os::Transaction&& txn,
+ const osd_op_params_t& osd_op_p,
+ epoch_t min_epoch, epoch_t max_epoch,
+ std::vector<pg_log_entry_t>&& log_entries)
+{
+ // todo
+ return seastar::make_ready_future<crimson::osd::acked_peers_t>();
+}
diff --git a/src/crimson/osd/ec_backend.h b/src/crimson/osd/ec_backend.h
new file mode 100644
index 000000000..e15b19970
--- /dev/null
+++ b/src/crimson/osd/ec_backend.h
@@ -0,0 +1,38 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/intrusive_ptr.hpp>
+#include <seastar/core/future.hh>
+#include "include/buffer_fwd.h"
+#include "osd/osd_types.h"
+#include "pg_backend.h"
+
+class ECBackend : public PGBackend
+{
+public:
+ ECBackend(shard_id_t shard,
+ CollectionRef coll,
+ crimson::osd::ShardServices& shard_services,
+ const ec_profile_t& ec_profile,
+ uint64_t stripe_width);
+ seastar::future<> stop() final {
+ return seastar::now();
+ }
+ void on_actingset_changed(peering_info_t pi) final {}
+private:
+ ll_read_errorator::future<ceph::bufferlist> _read(const hobject_t& hoid,
+ uint64_t off,
+ uint64_t len,
+ uint32_t flags) override;
+ seastar::future<crimson::osd::acked_peers_t>
+ _submit_transaction(std::set<pg_shard_t>&& pg_shards,
+ const hobject_t& hoid,
+ ceph::os::Transaction&& txn,
+ const osd_op_params_t& req,
+ epoch_t min_epoch, epoch_t max_epoch,
+ std::vector<pg_log_entry_t>&& log_entries) final;
+ CollectionRef coll;
+ crimson::os::FuturizedStore* store;
+};
diff --git a/src/crimson/osd/exceptions.h b/src/crimson/osd/exceptions.h
new file mode 100644
index 000000000..2783ed252
--- /dev/null
+++ b/src/crimson/osd/exceptions.h
@@ -0,0 +1,46 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <exception>
+#include <system_error>
+
+#include "crimson/common/errorator.h"
+
+namespace crimson::osd {
+class error : private std::system_error {
+public:
+ error(const std::errc ec)
+ : system_error(std::make_error_code(ec)) {
+ }
+
+ using system_error::code;
+ using system_error::what;
+
+ friend error make_error(int ret);
+
+private:
+ error(const int ret) noexcept
+ : system_error(ret, std::system_category()) {
+ }
+};
+
+inline error make_error(const int ret) {
+ return error{ret};
+}
+
+struct object_not_found : public error {
+ object_not_found() : error(std::errc::no_such_file_or_directory) {}
+};
+
+struct invalid_argument : public error {
+ invalid_argument() : error(std::errc::invalid_argument) {}
+};
+
+// FIXME: error handling
+struct permission_denied : public error {
+ permission_denied() : error(std::errc::operation_not_permitted) {}
+};
+
+} // namespace crimson::osd
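Editorial note: the error hierarchy above wraps std::system_error, so call sites can throw the named types and handlers can match on the underlying error code. A minimal sketch (assuming it is compiled inside the Ceph tree; the helper names are illustrative only):

    // hypothetical sketch -- not part of the patch
    #include <system_error>
    #include "crimson/osd/exceptions.h"

    void require_object(bool found)
    {
      if (!found) {
        throw crimson::osd::object_not_found{};
      }
    }

    void example()
    {
      try {
        require_object(false);
      } catch (const crimson::osd::error& e) {
        if (e.code() == std::errc::no_such_file_or_directory) {
          // the object is missing; e.g. map this to -ENOENT for the client
        }
      }
    }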
diff --git a/src/crimson/osd/heartbeat.cc b/src/crimson/osd/heartbeat.cc
new file mode 100644
index 000000000..81ec06ecd
--- /dev/null
+++ b/src/crimson/osd/heartbeat.cc
@@ -0,0 +1,680 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "heartbeat.h"
+
+#include <boost/range/join.hpp>
+
+#include "messages/MOSDPing.h"
+#include "messages/MOSDFailure.h"
+
+#include "crimson/common/config_proxy.h"
+#include "crimson/common/formatter.h"
+#include "crimson/net/Connection.h"
+#include "crimson/net/Messenger.h"
+#include "crimson/osd/shard_services.h"
+#include "crimson/mon/MonClient.h"
+
+#include "osd/OSDMap.h"
+
+using crimson::common::local_conf;
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+Heartbeat::Heartbeat(osd_id_t whoami,
+ const crimson::osd::ShardServices& service,
+ crimson::mon::Client& monc,
+ crimson::net::MessengerRef front_msgr,
+ crimson::net::MessengerRef back_msgr)
+ : whoami{whoami},
+ service{service},
+ monc{monc},
+ front_msgr{front_msgr},
+ back_msgr{back_msgr},
+ // do this in background
+ timer{[this] {
+ heartbeat_check();
+ (void)send_heartbeats();
+ }},
+ failing_peers{*this}
+{}
+
+seastar::future<> Heartbeat::start(entity_addrvec_t front_addrs,
+ entity_addrvec_t back_addrs)
+{
+ logger().info("heartbeat: start");
+  // we only care about the address, so any unused port will do
+ for (auto& addr : boost::join(front_addrs.v, back_addrs.v)) {
+ addr.set_port(0);
+ }
+
+ using crimson::net::SocketPolicy;
+ front_msgr->set_policy(entity_name_t::TYPE_OSD,
+ SocketPolicy::lossy_client(0));
+ back_msgr->set_policy(entity_name_t::TYPE_OSD,
+ SocketPolicy::lossy_client(0));
+ return seastar::when_all_succeed(start_messenger(*front_msgr,
+ front_addrs),
+ start_messenger(*back_msgr,
+ back_addrs))
+ .then_unpack([this] {
+ timer.arm_periodic(
+ std::chrono::seconds(local_conf()->osd_heartbeat_interval));
+ });
+}
+
+seastar::future<>
+Heartbeat::start_messenger(crimson::net::Messenger& msgr,
+ const entity_addrvec_t& addrs)
+{
+ return msgr.try_bind(addrs,
+ local_conf()->ms_bind_port_min,
+ local_conf()->ms_bind_port_max)
+ .safe_then([this, &msgr]() mutable {
+ return msgr.start({this});
+ }, crimson::net::Messenger::bind_ertr::all_same_way(
+ [] (const std::error_code& e) {
+ logger().error("heartbeat messenger try_bind(): address range is unavailable.");
+ ceph_abort();
+ }));
+}
+
+seastar::future<> Heartbeat::stop()
+{
+ logger().info("{}", __func__);
+ timer.cancel();
+ front_msgr->stop();
+ back_msgr->stop();
+ return gate.close().then([this] {
+ return seastar::when_all_succeed(front_msgr->shutdown(),
+ back_msgr->shutdown());
+ }).then_unpack([] {
+ return seastar::now();
+ });
+}
+
+const entity_addrvec_t& Heartbeat::get_front_addrs() const
+{
+ return front_msgr->get_myaddrs();
+}
+
+const entity_addrvec_t& Heartbeat::get_back_addrs() const
+{
+ return back_msgr->get_myaddrs();
+}
+
+void Heartbeat::set_require_authorizer(bool require_authorizer)
+{
+ if (front_msgr->get_require_authorizer() != require_authorizer) {
+ front_msgr->set_require_authorizer(require_authorizer);
+ back_msgr->set_require_authorizer(require_authorizer);
+ }
+}
+
+void Heartbeat::add_peer(osd_id_t _peer, epoch_t epoch)
+{
+ assert(whoami != _peer);
+ auto [iter, added] = peers.try_emplace(_peer, *this, _peer);
+ auto& peer = iter->second;
+ peer.set_epoch(epoch);
+}
+
+Heartbeat::osds_t Heartbeat::remove_down_peers()
+{
+ osds_t old_osds; // osds not added in this epoch
+ for (auto i = peers.begin(); i != peers.end(); ) {
+ auto osdmap = service.get_osdmap_service().get_map();
+ const auto& [osd, peer] = *i;
+ if (!osdmap->is_up(osd)) {
+ i = peers.erase(i);
+ } else {
+ if (peer.get_epoch() < osdmap->get_epoch()) {
+ old_osds.push_back(osd);
+ }
+ ++i;
+ }
+ }
+ return old_osds;
+}
+
+void Heartbeat::add_reporter_peers(int whoami)
+{
+ auto osdmap = service.get_osdmap_service().get_map();
+ // include next and previous up osds to ensure we have a fully-connected set
+ set<int> want;
+ if (auto next = osdmap->get_next_up_osd_after(whoami); next >= 0) {
+ want.insert(next);
+ }
+ if (auto prev = osdmap->get_previous_up_osd_before(whoami); prev >= 0) {
+ want.insert(prev);
+ }
+  // make sure we have at least **min_down** osds coming from different
+  // subtree levels (e.g., hosts) for fast failure detection.
+ auto min_down = local_conf().get_val<uint64_t>("mon_osd_min_down_reporters");
+ auto subtree = local_conf().get_val<string>("mon_osd_reporter_subtree_level");
+ osdmap->get_random_up_osds_by_subtree(
+ whoami, subtree, min_down, want, &want);
+ auto epoch = osdmap->get_epoch();
+ for (int osd : want) {
+ add_peer(osd, epoch);
+ };
+}
+
+void Heartbeat::update_peers(int whoami)
+{
+ const auto min_peers = static_cast<size_t>(
+ local_conf().get_val<int64_t>("osd_heartbeat_min_peers"));
+ add_reporter_peers(whoami);
+ auto extra = remove_down_peers();
+ // too many?
+ for (auto& osd : extra) {
+ if (peers.size() <= min_peers) {
+ break;
+ }
+ remove_peer(osd);
+ }
+ // or too few?
+ auto osdmap = service.get_osdmap_service().get_map();
+ auto epoch = osdmap->get_epoch();
+ for (auto next = osdmap->get_next_up_osd_after(whoami);
+ peers.size() < min_peers && next >= 0 && next != whoami;
+ next = osdmap->get_next_up_osd_after(next)) {
+ add_peer(next, epoch);
+ }
+}
+
+Heartbeat::osds_t Heartbeat::get_peers() const
+{
+ osds_t osds;
+ osds.reserve(peers.size());
+ for (auto& peer : peers) {
+ osds.push_back(peer.first);
+ }
+ return osds;
+}
+
+void Heartbeat::remove_peer(osd_id_t peer)
+{
+ assert(peers.count(peer) == 1);
+ peers.erase(peer);
+}
+
+std::optional<seastar::future<>>
+Heartbeat::ms_dispatch(crimson::net::ConnectionRef conn, MessageRef m)
+{
+ bool dispatched = true;
+ gate.dispatch_in_background(__func__, *this, [this, conn, &m, &dispatched] {
+ switch (m->get_type()) {
+ case MSG_OSD_PING:
+ return handle_osd_ping(conn, boost::static_pointer_cast<MOSDPing>(m));
+ default:
+ dispatched = false;
+ return seastar::now();
+ }
+ });
+ return (dispatched ? std::make_optional(seastar::now()) : std::nullopt);
+}
+
+void Heartbeat::ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace)
+{
+ auto peer = conn->get_peer_id();
+ if (conn->get_peer_type() != entity_name_t::TYPE_OSD ||
+ peer == entity_name_t::NEW) {
+ return;
+ }
+ if (auto found = peers.find(peer);
+ found != peers.end()) {
+ found->second.handle_reset(conn, is_replace);
+ }
+}
+
+void Heartbeat::ms_handle_connect(crimson::net::ConnectionRef conn)
+{
+ auto peer = conn->get_peer_id();
+ if (conn->get_peer_type() != entity_name_t::TYPE_OSD ||
+ peer == entity_name_t::NEW) {
+ return;
+ }
+ if (auto found = peers.find(peer);
+ found != peers.end()) {
+ found->second.handle_connect(conn);
+ }
+}
+
+void Heartbeat::ms_handle_accept(crimson::net::ConnectionRef conn)
+{
+ auto peer = conn->get_peer_id();
+ if (conn->get_peer_type() != entity_name_t::TYPE_OSD ||
+ peer == entity_name_t::NEW) {
+ return;
+ }
+ if (auto found = peers.find(peer);
+ found != peers.end()) {
+ found->second.handle_accept(conn);
+ }
+}
+
+seastar::future<> Heartbeat::handle_osd_ping(crimson::net::ConnectionRef conn,
+ Ref<MOSDPing> m)
+{
+ switch (m->op) {
+ case MOSDPing::PING:
+ return handle_ping(conn, m);
+ case MOSDPing::PING_REPLY:
+ return handle_reply(conn, m);
+ case MOSDPing::YOU_DIED:
+ return handle_you_died();
+ default:
+ return seastar::now();
+ }
+}
+
+seastar::future<> Heartbeat::handle_ping(crimson::net::ConnectionRef conn,
+ Ref<MOSDPing> m)
+{
+ auto min_message = static_cast<uint32_t>(
+ local_conf()->osd_heartbeat_min_size);
+ auto reply =
+ make_message<MOSDPing>(
+ m->fsid,
+ service.get_osdmap_service().get_map()->get_epoch(),
+ MOSDPing::PING_REPLY,
+ m->ping_stamp,
+ m->mono_ping_stamp,
+ service.get_mnow(),
+ service.get_osdmap_service().get_up_epoch(),
+ min_message);
+ return conn->send(reply);
+}
+
+seastar::future<> Heartbeat::handle_reply(crimson::net::ConnectionRef conn,
+ Ref<MOSDPing> m)
+{
+ const osd_id_t from = m->get_source().num();
+ auto found = peers.find(from);
+ if (found == peers.end()) {
+ // stale reply
+ return seastar::now();
+ }
+ auto& peer = found->second;
+ return peer.handle_reply(conn, m);
+}
+
+seastar::future<> Heartbeat::handle_you_died()
+{
+ // TODO: ask for newer osdmap
+ return seastar::now();
+}
+
+void Heartbeat::heartbeat_check()
+{
+ failure_queue_t failure_queue;
+ const auto now = clock::now();
+ for (const auto& [osd, peer] : peers) {
+ auto failed_since = peer.failed_since(now);
+ if (!clock::is_zero(failed_since)) {
+ failure_queue.emplace(osd, failed_since);
+ }
+ }
+ if (!failure_queue.empty()) {
+    // send_failures can run in the background because:
+    // 1. After send_failures returns, the message may not actually have
+    //    been sent yet, i.e. the sending operation is not done. At first
+    //    sight this looks risky: if the OSD shuts down, the remaining part
+    //    of the sending operation could reference OSD and Heartbeat
+    //    instances that have already been deleted. However, the remaining
+    //    work of that sending operation holds no references back to the
+    //    OSD or Heartbeat instances, so that risk does not apply here.
+    // 2. Messages are sent in order, so if later checks find previously
+    //    "failed" peers to be healthy again, the "still alive" messages
+    //    are sent after the earlier "osd failure" messages, which is safe.
+ (void)send_failures(std::move(failure_queue));
+ }
+}
+
+seastar::future<> Heartbeat::send_heartbeats()
+{
+ const auto mnow = service.get_mnow();
+ const auto now = clock::now();
+
+ std::vector<seastar::future<>> futures;
+ for (auto& [osd, peer] : peers) {
+ peer.send_heartbeat(now, mnow, futures);
+ }
+ return seastar::when_all_succeed(futures.begin(), futures.end());
+}
+
+seastar::future<> Heartbeat::send_failures(failure_queue_t&& failure_queue)
+{
+ std::vector<seastar::future<>> futures;
+ const auto now = clock::now();
+ for (auto [osd, failed_since] : failure_queue) {
+ failing_peers.add_pending(osd, failed_since, now, futures);
+ }
+
+ return seastar::when_all_succeed(futures.begin(), futures.end());
+}
+
+void Heartbeat::print(std::ostream& out) const
+{
+ out << "heartbeat";
+}
+
+Heartbeat::Connection::~Connection()
+{
+ if (conn) {
+ conn->mark_down();
+ }
+}
+
+bool Heartbeat::Connection::matches(crimson::net::ConnectionRef _conn) const
+{
+ return (conn && conn == _conn);
+}
+
+void Heartbeat::Connection::accepted(crimson::net::ConnectionRef accepted_conn)
+{
+ if (!conn) {
+ if (accepted_conn->get_peer_addr() == listener.get_peer_addr(type)) {
+ logger().info("Heartbeat::Connection::accepted(): "
+ "{} racing resolved", *this);
+ conn = accepted_conn;
+ set_connected();
+ }
+ } else if (conn == accepted_conn) {
+ set_connected();
+ }
+}
+
+void Heartbeat::Connection::replaced()
+{
+ assert(!is_connected);
+ auto replaced_conn = conn;
+ // set the racing connection, will be handled by handle_accept()
+ conn = msgr.connect(replaced_conn->get_peer_addr(),
+ replaced_conn->get_peer_name());
+ racing_detected = true;
+ logger().warn("Heartbeat::Connection::replaced(): {} racing", *this);
+ assert(conn != replaced_conn);
+ assert(conn->is_connected());
+}
+
+void Heartbeat::Connection::reset()
+{
+ conn = nullptr;
+ if (is_connected) {
+ is_connected = false;
+ listener.decrease_connected();
+ }
+ if (!racing_detected || is_winner_side) {
+ connect();
+ } else {
+ logger().info("Heartbeat::Connection::reset(): "
+                  "{} racing detected and lost, "
+                  "waiting for the peer to connect to me", *this);
+ }
+}
+
+seastar::future<> Heartbeat::Connection::send(MessageRef msg)
+{
+ assert(is_connected);
+ return conn->send(msg);
+}
+
+void Heartbeat::Connection::validate()
+{
+ assert(is_connected);
+ auto peer_addr = listener.get_peer_addr(type);
+ if (conn->get_peer_addr() != peer_addr) {
+ logger().info("Heartbeat::Connection::validate(): "
+ "{} has new address {} over {}, reset",
+ *this, peer_addr, conn->get_peer_addr());
+ conn->mark_down();
+ racing_detected = false;
+ reset();
+ }
+}
+
+void Heartbeat::Connection::retry()
+{
+ racing_detected = false;
+ if (!is_connected) {
+ if (conn) {
+ conn->mark_down();
+ reset();
+ } else {
+ connect();
+ }
+ }
+}
+
+void Heartbeat::Connection::set_connected()
+{
+ assert(!is_connected);
+ is_connected = true;
+ listener.increase_connected();
+}
+
+void Heartbeat::Connection::connect()
+{
+ assert(!conn);
+ auto addr = listener.get_peer_addr(type);
+ conn = msgr.connect(addr, entity_name_t(CEPH_ENTITY_TYPE_OSD, peer));
+ if (conn->is_connected()) {
+ set_connected();
+ }
+}
+
+Heartbeat::clock::time_point
+Heartbeat::Session::failed_since(Heartbeat::clock::time_point now) const
+{
+ if (do_health_screen(now) == health_state::UNHEALTHY) {
+ auto oldest_deadline = ping_history.begin()->second.deadline;
+ auto failed_since = std::min(last_rx_back, last_rx_front);
+ if (clock::is_zero(failed_since)) {
+ logger().error("Heartbeat::Session::failed_since(): no reply from osd.{} "
+ "ever on either front or back, first ping sent {} "
+ "(oldest deadline {})",
+ peer, first_tx, oldest_deadline);
+ failed_since = first_tx;
+ } else {
+ logger().error("Heartbeat::Session::failed_since(): no reply from osd.{} "
+ "since back {} front {} (oldest deadline {})",
+ peer, last_rx_back, last_rx_front, oldest_deadline);
+ }
+ return failed_since;
+ } else {
+ return clock::zero();
+ }
+}
+
+void Heartbeat::Session::set_inactive_history(clock::time_point now)
+{
+ assert(!connected);
+ if (ping_history.empty()) {
+ const utime_t sent_stamp{now};
+ const auto deadline =
+ now + std::chrono::seconds(local_conf()->osd_heartbeat_grace);
+ ping_history.emplace(sent_stamp, reply_t{deadline, 0});
+ } else { // the entry is already added
+ assert(ping_history.size() == 1);
+ }
+}
+
+Heartbeat::Peer::Peer(Heartbeat& heartbeat, osd_id_t peer)
+ : ConnectionListener(2), heartbeat{heartbeat}, peer{peer}, session{peer},
+ con_front(peer, heartbeat.whoami > peer, Connection::type_t::front,
+ *heartbeat.front_msgr, *this),
+ con_back(peer, heartbeat.whoami > peer, Connection::type_t::back,
+ *heartbeat.back_msgr, *this)
+{
+ logger().info("Heartbeat::Peer: osd.{} added", peer);
+}
+
+Heartbeat::Peer::~Peer()
+{
+ logger().info("Heartbeat::Peer: osd.{} removed", peer);
+}
+
+void Heartbeat::Peer::send_heartbeat(
+ clock::time_point now, ceph::signedspan mnow,
+ std::vector<seastar::future<>>& futures)
+{
+ session.set_tx(now);
+ if (session.is_started()) {
+ do_send_heartbeat(now, mnow, &futures);
+ for_each_conn([] (auto& conn) {
+ conn.validate();
+ });
+ } else {
+ // we should send MOSDPing but still cannot at this moment
+ if (pending_send) {
+      // we have already been pending for an entire heartbeat interval
+ logger().warn("Heartbeat::Peer::send_heartbeat(): "
+ "heartbeat to osd.{} is still pending...", peer);
+ for_each_conn([] (auto& conn) {
+ conn.retry();
+ });
+ } else {
+ logger().info("Heartbeat::Peer::send_heartbeat(): "
+ "heartbeat to osd.{} is pending send...", peer);
+ session.set_inactive_history(now);
+ pending_send = true;
+ }
+ }
+}
+
+seastar::future<> Heartbeat::Peer::handle_reply(
+ crimson::net::ConnectionRef conn, Ref<MOSDPing> m)
+{
+ if (!session.is_started()) {
+ // we haven't sent any ping yet
+ return seastar::now();
+ }
+ type_t type;
+ if (con_front.matches(conn)) {
+ type = type_t::front;
+ } else if (con_back.matches(conn)) {
+ type = type_t::back;
+ } else {
+ return seastar::now();
+ }
+ const auto now = clock::now();
+ if (session.on_pong(m->ping_stamp, type, now)) {
+ if (session.do_health_screen(now) == Session::health_state::HEALTHY) {
+ return heartbeat.failing_peers.cancel_one(peer);
+ }
+ }
+ return seastar::now();
+}
+
+entity_addr_t Heartbeat::Peer::get_peer_addr(type_t type)
+{
+ const auto osdmap = heartbeat.service.get_osdmap_service().get_map();
+ if (type == type_t::front) {
+ return osdmap->get_hb_front_addrs(peer).front();
+ } else {
+ return osdmap->get_hb_back_addrs(peer).front();
+ }
+}
+
+void Heartbeat::Peer::on_connected()
+{
+ logger().info("Heartbeat::Peer: osd.{} connected (send={})",
+ peer, pending_send);
+ session.on_connected();
+ if (pending_send) {
+ pending_send = false;
+ do_send_heartbeat(clock::now(), heartbeat.service.get_mnow(), nullptr);
+ }
+}
+
+void Heartbeat::Peer::on_disconnected()
+{
+ logger().info("Heartbeat::Peer: osd.{} disconnected", peer);
+ session.on_disconnected();
+}
+
+void Heartbeat::Peer::do_send_heartbeat(
+ Heartbeat::clock::time_point now,
+ ceph::signedspan mnow,
+ std::vector<seastar::future<>>* futures)
+{
+ const utime_t sent_stamp{now};
+ const auto deadline =
+ now + std::chrono::seconds(local_conf()->osd_heartbeat_grace);
+ session.on_ping(sent_stamp, deadline);
+ for_each_conn([&, this] (auto& conn) {
+ auto min_message = static_cast<uint32_t>(
+ local_conf()->osd_heartbeat_min_size);
+ auto ping = make_message<MOSDPing>(
+ heartbeat.monc.get_fsid(),
+ heartbeat.service.get_osdmap_service().get_map()->get_epoch(),
+ MOSDPing::PING,
+ sent_stamp,
+ mnow,
+ mnow,
+ heartbeat.service.get_osdmap_service().get_up_epoch(),
+ min_message);
+ if (futures) {
+ futures->push_back(conn.send(std::move(ping)));
+ }
+ });
+}
+
+bool Heartbeat::FailingPeers::add_pending(
+ osd_id_t peer,
+ clock::time_point failed_since,
+ clock::time_point now,
+ std::vector<seastar::future<>>& futures)
+{
+ if (failure_pending.count(peer)) {
+ return false;
+ }
+ auto failed_for = chrono::duration_cast<chrono::seconds>(
+ now - failed_since).count();
+ auto osdmap = heartbeat.service.get_osdmap_service().get_map();
+ auto failure_report =
+ make_message<MOSDFailure>(heartbeat.monc.get_fsid(),
+ peer,
+ osdmap->get_addrs(peer),
+ static_cast<int>(failed_for),
+ osdmap->get_epoch());
+ failure_pending.emplace(peer, failure_info_t{failed_since,
+ osdmap->get_addrs(peer)});
+ futures.push_back(heartbeat.monc.send_message(failure_report));
+ logger().info("{}: osd.{} failed for {}", __func__, peer, failed_for);
+ return true;
+}
+
+seastar::future<> Heartbeat::FailingPeers::cancel_one(osd_id_t peer)
+{
+ if (auto pending = failure_pending.find(peer);
+ pending != failure_pending.end()) {
+ auto fut = send_still_alive(peer, pending->second.addrs);
+ failure_pending.erase(peer);
+ return fut;
+ }
+ return seastar::now();
+}
+
+seastar::future<>
+Heartbeat::FailingPeers::send_still_alive(
+ osd_id_t osd, const entity_addrvec_t& addrs)
+{
+ auto still_alive = make_message<MOSDFailure>(
+ heartbeat.monc.get_fsid(),
+ osd,
+ addrs,
+ 0,
+ heartbeat.service.get_osdmap_service().get_map()->get_epoch(),
+ MOSDFailure::FLAG_ALIVE);
+ logger().info("{}: osd.{}", __func__, osd);
+ return heartbeat.monc.send_message(still_alive);
+}
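Editorial note: a sketch of the expected lifecycle of the Heartbeat service, using only the member functions shown in this diff. `hb`, `front_addrs`, `back_addrs`, `whoami`, `peer_osd` and `epoch` are placeholders owned by the OSD; construction of the messengers, monitor client and shard services is not shown here:

    // hypothetical sketch -- not part of the patch
    seastar::future<> run_heartbeat(Heartbeat& hb,
                                    entity_addrvec_t front_addrs,
                                    entity_addrvec_t back_addrs,
                                    int whoami, int peer_osd, epoch_t epoch)
    {
      return hb.start(std::move(front_addrs), std::move(back_addrs))
        .then([&hb, whoami, peer_osd, epoch] {
          hb.add_peer(peer_osd, epoch);  // track one peer explicitly...
          hb.update_peers(whoami);       // ...or let the osdmap drive the peer set
          return hb.stop();              // cancel the timer, shut the messengers down
        });
    }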
diff --git a/src/crimson/osd/heartbeat.h b/src/crimson/osd/heartbeat.h
new file mode 100644
index 000000000..4947e871f
--- /dev/null
+++ b/src/crimson/osd/heartbeat.h
@@ -0,0 +1,455 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cstdint>
+#include <seastar/core/future.hh>
+#include "common/ceph_time.h"
+#include "crimson/common/gated.h"
+#include "crimson/net/Dispatcher.h"
+#include "crimson/net/Fwd.h"
+
+class MOSDPing;
+
+namespace crimson::osd {
+ class ShardServices;
+}
+
+namespace crimson::mon {
+ class Client;
+}
+
+template<typename Message> using Ref = boost::intrusive_ptr<Message>;
+
+class Heartbeat : public crimson::net::Dispatcher {
+public:
+ using osd_id_t = int;
+
+ Heartbeat(osd_id_t whoami,
+ const crimson::osd::ShardServices& service,
+ crimson::mon::Client& monc,
+ crimson::net::MessengerRef front_msgr,
+ crimson::net::MessengerRef back_msgr);
+
+ seastar::future<> start(entity_addrvec_t front,
+ entity_addrvec_t back);
+ seastar::future<> stop();
+
+ using osds_t = std::vector<osd_id_t>;
+ void add_peer(osd_id_t peer, epoch_t epoch);
+ void update_peers(int whoami);
+ void remove_peer(osd_id_t peer);
+ osds_t get_peers() const;
+
+ const entity_addrvec_t& get_front_addrs() const;
+ const entity_addrvec_t& get_back_addrs() const;
+
+ void set_require_authorizer(bool);
+
+ // Dispatcher methods
+ std::optional<seastar::future<>> ms_dispatch(
+ crimson::net::ConnectionRef conn, MessageRef m) override;
+ void ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace) override;
+ void ms_handle_connect(crimson::net::ConnectionRef conn) override;
+ void ms_handle_accept(crimson::net::ConnectionRef conn) override;
+
+ void print(std::ostream&) const;
+private:
+ seastar::future<> handle_osd_ping(crimson::net::ConnectionRef conn,
+ Ref<MOSDPing> m);
+ seastar::future<> handle_ping(crimson::net::ConnectionRef conn,
+ Ref<MOSDPing> m);
+ seastar::future<> handle_reply(crimson::net::ConnectionRef conn,
+ Ref<MOSDPing> m);
+ seastar::future<> handle_you_died();
+
+ /// remove down OSDs
+ /// @return peers not added in this epoch
+ osds_t remove_down_peers();
+ /// add enough reporters for fast failure detection
+ void add_reporter_peers(int whoami);
+
+ seastar::future<> start_messenger(crimson::net::Messenger& msgr,
+ const entity_addrvec_t& addrs);
+private:
+ const osd_id_t whoami;
+ const crimson::osd::ShardServices& service;
+ crimson::mon::Client& monc;
+ crimson::net::MessengerRef front_msgr;
+ crimson::net::MessengerRef back_msgr;
+
+ seastar::timer<seastar::lowres_clock> timer;
+ // use real_clock so it can be converted to utime_t
+ using clock = ceph::coarse_real_clock;
+
+ class ConnectionListener;
+ class Connection;
+ class Session;
+ class Peer;
+ using peers_map_t = std::map<osd_id_t, Peer>;
+ peers_map_t peers;
+
+  // osds which are considered failed
+  // osd_id => the last time that both front and back pings were acked
+  //           (or sent).
+  // used for calculating how long the OSD has been unresponsive
+ using failure_queue_t = std::map<osd_id_t, clock::time_point>;
+ seastar::future<> send_failures(failure_queue_t&& failure_queue);
+ seastar::future<> send_heartbeats();
+ void heartbeat_check();
+
+ // osds we've reported to the monitor as failed, but which are not yet
+ // marked down
+ crimson::common::Gated gate;
+
+ class FailingPeers {
+ public:
+ FailingPeers(Heartbeat& heartbeat) : heartbeat(heartbeat) {}
+ bool add_pending(osd_id_t peer,
+ clock::time_point failed_since,
+ clock::time_point now,
+ std::vector<seastar::future<>>& futures);
+ seastar::future<> cancel_one(osd_id_t peer);
+
+ private:
+ seastar::future<> send_still_alive(osd_id_t, const entity_addrvec_t&);
+
+ Heartbeat& heartbeat;
+
+ struct failure_info_t {
+ clock::time_point failed_since;
+ entity_addrvec_t addrs;
+ };
+ std::map<osd_id_t, failure_info_t> failure_pending;
+ } failing_peers;
+};
+
+inline std::ostream& operator<<(std::ostream& out, const Heartbeat& hb) {
+ hb.print(out);
+ return out;
+}
+
+/*
+ * Event-driven interface for Heartbeat::Peer to be notified when both hb_front
+ * and hb_back are connected, or when a connection is lost.
+ */
+class Heartbeat::ConnectionListener {
+ public:
+ ConnectionListener(size_t connections) : connections{connections} {}
+
+ void increase_connected() {
+ assert(connected < connections);
+ ++connected;
+ if (connected == connections) {
+ on_connected();
+ }
+ }
+ void decrease_connected() {
+ assert(connected > 0);
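+ // only fire on_disconnected() when we are leaving the fully-connected state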
+ if (connected == connections) {
+ on_disconnected();
+ }
+ --connected;
+ }
+ enum class type_t { front, back };
+ virtual entity_addr_t get_peer_addr(type_t) = 0;
+
+ protected:
+ virtual void on_connected() = 0;
+ virtual void on_disconnected() = 0;
+
+ private:
+ const size_t connections;
+ size_t connected = 0;
+};
+
+class Heartbeat::Connection {
+ public:
+ using type_t = ConnectionListener::type_t;
+ Connection(osd_id_t peer, bool is_winner_side, type_t type,
+ crimson::net::Messenger& msgr,
+ ConnectionListener& listener)
+ : peer{peer}, type{type},
+ msgr{msgr}, listener{listener},
+ is_winner_side{is_winner_side} {
+ connect();
+ }
+ Connection(const Connection&) = delete;
+ Connection(Connection&&) = delete;
+ Connection& operator=(const Connection&) = delete;
+ Connection& operator=(Connection&&) = delete;
+
+ ~Connection();
+
+ bool matches(crimson::net::ConnectionRef _conn) const;
+ void connected() {
+ set_connected();
+ }
+ void accepted(crimson::net::ConnectionRef);
+ void replaced();
+ void reset();
+ seastar::future<> send(MessageRef msg);
+ void validate();
+ // retry connection if still pending
+ void retry();
+
+ private:
+ void set_connected();
+ void connect();
+
+ const osd_id_t peer;
+ const type_t type;
+ crimson::net::Messenger& msgr;
+ ConnectionListener& listener;
+
+/*
+ * Resolve the following race when both we and the peer are trying to connect to
+ * each other symmetrically, under SocketPolicy::lossy_client:
+ *
+ * OSD.A OSD.B
+ * - -
+ * |-[1]----> <----[2]-|
+ * \ /
+ * \ /
+ * delay.. X delay..
+ * / \
+ * |-[1]x> / \ <x[2]-|
+ * |<-[2]--- ---[1]->|
+ * |(reset#1) (reset#2)|
+ * |(reconnectB) (reconnectA)|
+ * |-[2]---> <---[1]-|
+ * delay.. delay..
+ * (remote close propagated)
+ * |-[2]x> <x[1]-|
+ * |(reset#2) (reset#1)|
+ * | ... ... |
+ * (dead loop!)
+ *
+ * Our solution is to remember if such racing has happened recently, and
+ * establish connection asymmetrically only from the winner side whose osd-id
+ * is larger.
+ */
+ const bool is_winner_side;
+ bool racing_detected = false;
+
+ crimson::net::ConnectionRef conn;
+ bool is_connected = false;
+
+ friend std::ostream& operator<<(std::ostream& os, const Connection& c) {
+ if (c.type == type_t::front) {
+ return os << "con_front(osd." << c.peer << ")";
+ } else {
+ return os << "con_back(osd." << c.peer << ")";
+ }
+ }
+};
+
+/*
+ * Track the ping history and ping replies (the pongs) within the same session;
+ * clean up the history once hb_front or hb_back loses its connection, and
+ * restart the session once both connections are connected again.
+ *
+ * We cannot simply remove the entire Heartbeat::Peer once hb_front or hb_back
+ * loses its connection, because we would end up with the following dead loop:
+ *
+ * OSD.A OSD.B
+ * - -
+ * hb_front reset <--(network)--- hb_front close
+ * | ^
+ * | |
+ * remove Peer B (dead loop!) remove Peer A
+ * | |
+ * V |
+ * hb_back close ----(network)---> hb_back reset
+ */
+class Heartbeat::Session {
+ public:
+ Session(osd_id_t peer) : peer{peer} {}
+
+ void set_epoch(epoch_t epoch_) { epoch = epoch_; }
+ epoch_t get_epoch() const { return epoch; }
+ bool is_started() const { return connected; }
+ bool pinged() const {
+ if (clock::is_zero(first_tx)) {
+ // we can never receive a pong without having sent a ping message first.
+ assert(clock::is_zero(last_rx_front) &&
+ clock::is_zero(last_rx_back));
+ return false;
+ } else {
+ return true;
+ }
+ }
+
+ enum class health_state {
+ UNKNOWN,
+ UNHEALTHY,
+ HEALTHY,
+ };
+ health_state do_health_screen(clock::time_point now) const {
+ if (!pinged()) {
+ // we are neither healthy nor unhealthy because we haven't sent anything yet
+ return health_state::UNKNOWN;
+ } else if (!ping_history.empty() && ping_history.begin()->second.deadline < now) {
+ return health_state::UNHEALTHY;
+ } else if (!clock::is_zero(last_rx_front) &&
+ !clock::is_zero(last_rx_back)) {
+ // only declare ourselves healthy once we have received the first
+ // replies from both the front and back connections
+ return health_state::HEALTHY;
+ } else {
+ return health_state::UNKNOWN;
+ }
+ }
+
+ clock::time_point failed_since(clock::time_point now) const;
+
+ void set_tx(clock::time_point now) {
+ if (!pinged()) {
+ first_tx = now;
+ }
+ last_tx = now;
+ }
+
+ void on_connected() {
+ assert(!connected);
+ connected = true;
+ ping_history.clear();
+ }
+
+ void on_ping(const utime_t& sent_stamp,
+ const clock::time_point& deadline) {
+ assert(connected);
+ [[maybe_unused]] auto [reply, added] =
+ ping_history.emplace(sent_stamp, reply_t{deadline, 2});
+ }
+
+ bool on_pong(const utime_t& ping_stamp,
+ Connection::type_t type,
+ clock::time_point now) {
+ assert(connected);
+ auto ping = ping_history.find(ping_stamp);
+ if (ping == ping_history.end()) {
+ // old replies, deprecated by newly sent pings.
+ return false;
+ }
+ auto& unacked = ping->second.unacknowledged;
+ assert(unacked);
+ if (type == Connection::type_t::front) {
+ last_rx_front = now;
+ unacked--;
+ } else {
+ last_rx_back = now;
+ unacked--;
+ }
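+ // once both the front and back replies have arrived, this ping and all
+ // older ones can be pruned from the history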
+ if (unacked == 0) {
+ ping_history.erase(ping_history.begin(), ++ping);
+ }
+ return true;
+ }
+
+ void on_disconnected() {
+ assert(connected);
+ connected = false;
+ if (!ping_history.empty()) {
+ // we lost our ping_history of the last session, but still need to keep
+ // the oldest deadline for unhealthy check.
+ auto oldest = ping_history.begin();
+ auto sent_stamp = oldest->first;
+ auto deadline = oldest->second.deadline;
+ ping_history.clear();
+ ping_history.emplace(sent_stamp, reply_t{deadline, 0});
+ }
+ }
+
+ // maintain an entry in ping_history for unhealthy check
+ void set_inactive_history(clock::time_point);
+
+ private:
+ const osd_id_t peer;
+ bool connected = false;
+ // time we sent our first ping request
+ clock::time_point first_tx;
+ // last time we sent a ping request
+ clock::time_point last_tx;
+ // last time we got a ping reply on the front side
+ clock::time_point last_rx_front;
+ // last time we got a ping reply on the back side
+ clock::time_point last_rx_back;
+ // most recent epoch we wanted this peer
+ epoch_t epoch;
+
+ struct reply_t {
+ clock::time_point deadline;
+ // one sent over front conn, another sent over back conn
+ uint8_t unacknowledged = 0;
+ };
+ // history of inflight pings, arranging by timestamp we sent
+ std::map<utime_t, reply_t> ping_history;
+};
+
+class Heartbeat::Peer final : private Heartbeat::ConnectionListener {
+ public:
+ Peer(Heartbeat&, osd_id_t);
+ ~Peer();
+ Peer(Peer&&) = delete;
+ Peer(const Peer&) = delete;
+ Peer& operator=(Peer&&) = delete;
+ Peer& operator=(const Peer&) = delete;
+
+ void set_epoch(epoch_t epoch) { session.set_epoch(epoch); }
+ epoch_t get_epoch() const { return session.get_epoch(); }
+
+ // if failure, return time_point since last active
+ // else, return clock::zero()
+ clock::time_point failed_since(clock::time_point now) const {
+ return session.failed_since(now);
+ }
+ void send_heartbeat(
+ clock::time_point, ceph::signedspan, std::vector<seastar::future<>>&);
+ seastar::future<> handle_reply(crimson::net::ConnectionRef, Ref<MOSDPing>);
+ void handle_reset(crimson::net::ConnectionRef conn, bool is_replace) {
+ for_each_conn([&] (auto& _conn) {
+ if (_conn.matches(conn)) {
+ if (is_replace) {
+ _conn.replaced();
+ } else {
+ _conn.reset();
+ }
+ }
+ });
+ }
+ void handle_connect(crimson::net::ConnectionRef conn) {
+ for_each_conn([&] (auto& _conn) {
+ if (_conn.matches(conn)) {
+ _conn.connected();
+ }
+ });
+ }
+ void handle_accept(crimson::net::ConnectionRef conn) {
+ for_each_conn([&] (auto& _conn) {
+ _conn.accepted(conn);
+ });
+ }
+
+ private:
+ entity_addr_t get_peer_addr(type_t type) override;
+ void on_connected() override;
+ void on_disconnected() override;
+ void do_send_heartbeat(
+ clock::time_point, ceph::signedspan, std::vector<seastar::future<>>*);
+
+ template <typename Func>
+ void for_each_conn(Func&& f) {
+ f(con_front);
+ f(con_back);
+ }
+
+ Heartbeat& heartbeat;
+ const osd_id_t peer;
+ Session session;
+ // whether we need to send a heartbeat once the session is connected
+ bool pending_send = false;
+ Connection con_front;
+ Connection con_back;
+};
diff --git a/src/crimson/osd/main.cc b/src/crimson/osd/main.cc
new file mode 100644
index 000000000..a90903e72
--- /dev/null
+++ b/src/crimson/osd/main.cc
@@ -0,0 +1,230 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <iostream>
+#include <random>
+
+#include <seastar/core/app-template.hh>
+#include <seastar/core/print.hh>
+#include <seastar/core/thread.hh>
+#include <seastar/util/std-compat.hh>
+
+#include "auth/KeyRing.h"
+#include "common/ceph_argparse.h"
+#include "crimson/common/buffer_io.h"
+#include "crimson/common/config_proxy.h"
+#include "crimson/net/Messenger.h"
+#include "global/pidfile.h"
+
+#include "osd.h"
+
+using config_t = crimson::common::ConfigProxy;
+
+void usage(const char* prog) {
+ std::cout << "usage: " << prog << " -i <ID>\n"
+ << " --help-seastar show Seastar help messages\n";
+ generic_server_usage();
+}
+
+auto partition_args(seastar::app_template& app, char** argv_begin, char** argv_end)
+{
+ namespace bpo = boost::program_options;
+ // collect all options consumed by seastar::app_template
+ auto parsed = bpo::command_line_parser(std::distance(argv_begin, argv_end),
+ argv_begin)
+ .options(app.get_options_description()).allow_unregistered().run();
+ auto unknown_args = bpo::collect_unrecognized(parsed.options,
+ bpo::include_positional);
+ std::vector<const char*> ceph_args, app_args;
+ // ceph_argparse_early_args() and
+ // seastar::smp::get_options_description() use "-c" for different
+ // options, and ceph wins
+ auto consume_conf_arg = [&](char** argv) {
+ if (std::strcmp(*argv, "-c") == 0) {
+ ceph_args.push_back(*argv++);
+ if (argv != argv_end) {
+ ceph_args.push_back(*argv++);
+ }
+ }
+ return argv;
+ };
+ auto unknown = unknown_args.begin();
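+ // arguments seastar did not recognize are forwarded to ceph, except for
+ // "--help-seastar", which is translated into seastar's own "--help"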
+ auto consume_unknown_arg = [&](char** argv) {
+ for (; unknown != unknown_args.end() &&
+ argv != argv_end &&
+ *unknown == *argv; ++argv, ++unknown) {
+ if (std::strcmp(*argv, "--help-seastar") == 0) {
+ app_args.push_back("--help");
+ } else {
+ ceph_args.push_back(*argv);
+ }
+ }
+ return argv;
+ };
+ for (auto argv = argv_begin; argv != argv_end;) {
+ if (auto next_arg = consume_conf_arg(argv); next_arg != argv) {
+ argv = next_arg;
+ } else if (auto next_arg = consume_unknown_arg(argv); next_arg != argv) {
+ argv = next_arg;
+ } else {
+ app_args.push_back(*argv++);
+ }
+ }
+ return make_pair(std::move(ceph_args), std::move(app_args));
+}
+
+using crimson::common::local_conf;
+
+seastar::future<> make_keyring()
+{
+ const auto path = local_conf().get_val<string>("keyring");
+ return seastar::file_exists(path).then([path](bool exists) {
+ KeyRing keyring;
+ EntityName name{local_conf()->name};
+ EntityAuth auth;
+ if (exists &&
+ keyring.load(nullptr, path) == 0 &&
+ keyring.get_auth(name, auth)) {
+ seastar::fprint(std::cerr, "already have key in keyring: %s\n", path);
+ return seastar::now();
+ } else {
+ auth.key.create(std::make_unique<CephContext>().get(), CEPH_CRYPTO_AES);
+ keyring.add(name, auth);
+ bufferlist bl;
+ keyring.encode_plaintext(bl);
+ const auto permissions = (seastar::file_permissions::user_read |
+ seastar::file_permissions::user_write);
+ return crimson::write_file(std::move(bl), path, permissions);
+ }
+ }).handle_exception_type([path](const std::filesystem::filesystem_error& e) {
+ seastar::fprint(std::cerr, "FATAL: writing new keyring to %s: %s\n", path, e.what());
+ throw e;
+ });
+}
+
+uint64_t get_nonce()
+{
+ if (auto pid = getpid(); pid != 1) {
+ return pid;
+ } else {
+ // we're running in a container; use a random number instead!
+ std::random_device rd;
+ std::default_random_engine rng{rd()};
+ return std::uniform_int_distribution<uint64_t>{}(rng);
+ }
+}
+
+int main(int argc, char* argv[])
+{
+ seastar::app_template app;
+ app.add_options()
+ ("mkkey", "generate a new secret key. "
+ "This is normally used in combination with --mkfs")
+ ("mkfs", "create a [new] data directory")
+ ("debug", "enable debug output on all loggers");
+
+ auto [ceph_args, app_args] = partition_args(app, argv, argv + argc);
+ if (ceph_argparse_need_usage(ceph_args) &&
+ std::find(app_args.begin(), app_args.end(), "--help") == app_args.end()) {
+ usage(argv[0]);
+ return EXIT_SUCCESS;
+ }
+ std::string cluster_name{"ceph"};
+ std::string conf_file_list;
+ // ceph_argparse_early_args() could _exit(), while local_conf() won't be ready
+ // until it's started. So do the boilerplate-settings parsing here.
+ auto init_params = ceph_argparse_early_args(ceph_args,
+ CEPH_ENTITY_TYPE_OSD,
+ &cluster_name,
+ &conf_file_list);
+ seastar::sharded<crimson::osd::OSD> osd;
+ using crimson::common::sharded_conf;
+ using crimson::common::sharded_perf_coll;
+ try {
+ return app.run_deprecated(app_args.size(), const_cast<char**>(app_args.data()),
+ [&, &ceph_args=ceph_args] {
+ auto& config = app.configuration();
+ return seastar::async([&] {
+ if (config.count("debug")) {
+ seastar::global_logger_registry().set_all_loggers_level(
+ seastar::log_level::debug
+ );
+ }
+ sharded_conf().start(init_params.name, cluster_name).get();
+ seastar::engine().at_exit([] {
+ return sharded_conf().stop();
+ });
+ sharded_perf_coll().start().get();
+ seastar::engine().at_exit([] {
+ return sharded_perf_coll().stop();
+ });
+ local_conf().parse_config_files(conf_file_list).get();
+ local_conf().parse_argv(ceph_args).get();
+ if (const auto ret = pidfile_write(local_conf()->pid_file);
+ ret == -EACCES || ret == -EAGAIN) {
+ ceph_abort_msg(
+ "likely there is another crimson-osd instance with the same id");
+ } else if (ret < 0) {
+ ceph_abort_msg(fmt::format("pidfile_write failed with {} {}",
+ ret, cpp_strerror(-ret)));
+ }
+ // just ignore SIGHUP, we don't reread settings
+ seastar::engine().handle_signal(SIGHUP, [] {});
+ const int whoami = std::stoi(local_conf()->name.get_id());
+ const auto nonce = get_nonce();
+ crimson::net::MessengerRef cluster_msgr, client_msgr;
+ crimson::net::MessengerRef hb_front_msgr, hb_back_msgr;
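+ // create the four OSD messengers (cluster, client, hb_front and hb_back)
+ // sharing the same nonce and crc settings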
+ for (auto [msgr, name] : {make_pair(std::ref(cluster_msgr), "cluster"s),
+ make_pair(std::ref(client_msgr), "client"s),
+ make_pair(std::ref(hb_front_msgr), "hb_front"s),
+ make_pair(std::ref(hb_back_msgr), "hb_back"s)}) {
+ msgr = crimson::net::Messenger::create(entity_name_t::OSD(whoami), name,
+ nonce);
+ if (local_conf()->ms_crc_data) {
+ msgr->set_crc_data();
+ }
+ if (local_conf()->ms_crc_header) {
+ msgr->set_crc_header();
+ }
+ }
+ osd.start_single(whoami, nonce,
+ cluster_msgr, client_msgr,
+ hb_front_msgr, hb_back_msgr).get();
+ if (config.count("mkkey")) {
+ make_keyring().handle_exception([](std::exception_ptr) {
+ seastar::engine().exit(1);
+ }).get();
+ }
+ if (config.count("mkfs")) {
+ osd.invoke_on(
+ 0,
+ &crimson::osd::OSD::mkfs,
+ local_conf().get_val<uuid_d>("osd_uuid"),
+ local_conf().get_val<uuid_d>("fsid")).get();
+ }
+ seastar::engine().at_exit([&] {
+ return osd.stop();
+ });
+ if (config.count("mkkey") || config.count("mkfs")) {
+ seastar::engine().exit(0);
+ } else {
+ osd.invoke_on(0, &crimson::osd::OSD::start).get();
+ }
+ });
+ });
+ } catch (...) {
+ seastar::fprint(std::cerr, "FATAL: Exception during startup, aborting: %s\n", std::current_exception());
+ return EXIT_FAILURE;
+ }
+}
+
+/*
+ * Local Variables:
+ * compile-command: "make -j4 \
+ * -C ../../../build \
+ * crimson-osd"
+ * End:
+ */
diff --git a/src/crimson/osd/objclass.cc b/src/crimson/osd/objclass.cc
new file mode 100644
index 000000000..bc3284e26
--- /dev/null
+++ b/src/crimson/osd/objclass.cc
@@ -0,0 +1,484 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <cstdarg>
+#include <cstring>
+#include "common/ceph_context.h"
+#include "common/ceph_releases.h"
+#include "common/config.h"
+#include "common/debug.h"
+
+#include "crimson/osd/exceptions.h"
+#include "crimson/osd/ops_executer.h"
+#include "crimson/osd/pg_backend.h"
+
+#include "objclass/objclass.h"
+#include "osd/ClassHandler.h"
+
+#include "auth/Crypto.h"
+#include "common/armor.h"
+
+static inline int execute_osd_op(cls_method_context_t hctx, OSDOp& op)
+{
+ // we can expect the memory under `ret` will still be fine after
+ // executing the osd op as we're running inside `seastar::thread`
+ // created for us by `seastar::async` in `::do_op_call()`.
+ int ret = 0;
+ using osd_op_errorator = crimson::osd::OpsExecuter::osd_op_errorator;
+ reinterpret_cast<crimson::osd::OpsExecuter*>(hctx)->execute_op(op).handle_error(
+ osd_op_errorator::all_same_way([&ret] (const std::error_code& err) {
+ assert(err.value() > 0);
+ ret = -err.value();
+ return seastar::now();
+ })).get(); // we're blocking here which requires `seastar::thread`.
+ return ret;
+}
+
+int cls_call(cls_method_context_t hctx, const char *cls, const char *method,
+ char *indata, int datalen,
+ char **outdata, int *outdatalen)
+{
+// FIXME, HACK: this is for testing only. Let's use the dynamic linker to verify
+// our dependencies
+ return 0;
+}
+
+int cls_getxattr(cls_method_context_t hctx,
+ const char *name,
+ char **outdata,
+ int *outdatalen)
+{
+ return 0;
+}
+
+int cls_setxattr(cls_method_context_t hctx,
+ const char *name,
+ const char *value,
+ int val_len)
+{
+ return 0;
+}
+
+int cls_read(cls_method_context_t hctx,
+ int ofs, int len,
+ char **outdata,
+ int *outdatalen)
+{
+ return 0;
+}
+
+int cls_get_request_origin(cls_method_context_t hctx, entity_inst_t *origin)
+{
+ assert(origin);
+
+ try {
+ const auto& message = \
+ reinterpret_cast<crimson::osd::OpsExecuter*>(hctx)->get_message();
+ *origin = message.get_orig_source_inst();
+ return 0;
+ } catch (crimson::osd::error& e) {
+ return -e.code().value();
+ }
+}
+
+int cls_cxx_create(cls_method_context_t hctx, const bool exclusive)
+{
+ OSDOp op{CEPH_OSD_OP_CREATE};
+ op.op.flags = (exclusive ? CEPH_OSD_OP_FLAG_EXCL : 0);
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_remove(cls_method_context_t hctx)
+{
+ OSDOp op{CEPH_OSD_OP_DELETE};
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_stat(cls_method_context_t hctx, uint64_t *size, time_t *mtime)
+{
+ OSDOp op{CEPH_OSD_OP_STAT};
+ if (const auto ret = execute_osd_op(hctx, op); ret < 0) {
+ return ret;
+ }
+ utime_t ut;
+ uint64_t s;
+ try {
+ auto iter = op.outdata.cbegin();
+ decode(s, iter);
+ decode(ut, iter);
+ } catch (buffer::error& err) {
+ return -EIO;
+ }
+ if (size) {
+ *size = s;
+ }
+ if (mtime) {
+ *mtime = ut.sec();
+ }
+ return 0;
+}
+
+int cls_cxx_stat2(cls_method_context_t hctx,
+ uint64_t *size,
+ ceph::real_time *mtime)
+{
+ return 0;
+}
+
+int cls_cxx_read2(cls_method_context_t hctx,
+ int ofs,
+ int len,
+ bufferlist *outbl,
+ uint32_t op_flags)
+{
+ OSDOp op{CEPH_OSD_OP_SYNC_READ};
+ op.op.extent.offset = ofs;
+ op.op.extent.length = len;
+ op.op.flags = op_flags;
+ if (const auto ret = execute_osd_op(hctx, op); ret < 0) {
+ return ret;
+ }
+ *outbl = std::move(op.outdata);
+ return outbl->length();
+}
+
+int cls_cxx_write2(cls_method_context_t hctx,
+ int ofs,
+ int len,
+ bufferlist *inbl,
+ uint32_t op_flags)
+{
+ OSDOp op{CEPH_OSD_OP_WRITE};
+ op.op.extent.offset = ofs;
+ op.op.extent.length = len;
+ op.op.flags = op_flags;
+ op.indata = *inbl;
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_write_full(cls_method_context_t hctx, bufferlist * const inbl)
+{
+ OSDOp op{CEPH_OSD_OP_WRITEFULL};
+ op.op.extent.offset = 0;
+ op.op.extent.length = inbl->length();
+ op.indata = *inbl;
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_replace(cls_method_context_t hctx,
+ int ofs,
+ int len,
+ bufferlist *inbl)
+{
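+ // emulate a "replace" as a truncate to zero followed by a write of the
+ // new data at [ofs, ofs+len)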
+ {
+ OSDOp top{CEPH_OSD_OP_TRUNCATE};
+ top.op.extent.offset = 0;
+ top.op.extent.length = 0;
+ if (const auto ret = execute_osd_op(hctx, top); ret < 0) {
+ return ret;
+ }
+ }
+
+ {
+ OSDOp wop{CEPH_OSD_OP_WRITE};
+ wop.op.extent.offset = ofs;
+ wop.op.extent.length = len;
+ wop.indata = *inbl;
+ if (const auto ret = execute_osd_op(hctx, wop); ret < 0) {
+ return ret;
+ }
+ }
+ return 0;
+}
+
+int cls_cxx_truncate(cls_method_context_t hctx, int ofs)
+{
+ OSDOp op{CEPH_OSD_OP_TRUNCATE};
+ op.op.extent.offset = ofs;
+ op.op.extent.length = 0;
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_write_zero(cls_method_context_t hctx, int offset, int len)
+{
+ OSDOp op{CEPH_OSD_OP_ZERO};
+ op.op.extent.offset = offset;
+ op.op.extent.length = len;
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_getxattr(cls_method_context_t hctx,
+ const char *name,
+ bufferlist *outbl)
+{
+ OSDOp op{CEPH_OSD_OP_GETXATTR};
+ op.op.xattr.name_len = strlen(name);
+ op.indata.append(name, op.op.xattr.name_len);
+ if (const auto ret = execute_osd_op(hctx, op); ret < 0) {
+ return ret;
+ }
+ *outbl = std::move(op.outdata);
+ return outbl->length();
+}
+
+int cls_cxx_getxattrs(cls_method_context_t hctx,
+ map<string, bufferlist> *attrset)
+{
+ return 0;
+}
+
+int cls_cxx_setxattr(cls_method_context_t hctx,
+ const char *name,
+ bufferlist *inbl)
+{
+ OSDOp op{CEPH_OSD_OP_SETXATTR};
+ op.op.xattr.name_len = std::strlen(name);
+ op.op.xattr.value_len = inbl->length();
+ op.indata.append(name, op.op.xattr.name_len);
+ op.indata.append(*inbl);
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_snap_revert(cls_method_context_t hctx, snapid_t snapid)
+{
+ OSDOp op{CEPH_OSD_OP_ROLLBACK};
+ op.op.snap.snapid = snapid;
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_map_get_all_vals(cls_method_context_t hctx,
+ map<string, bufferlist>* vals,
+ bool *more)
+{
+ return 0;
+}
+
+int cls_cxx_map_get_keys(cls_method_context_t hctx,
+ const std::string& start_obj,
+ const uint64_t max_to_get,
+ std::set<std::string>* const keys,
+ bool* const more)
+{
+ OSDOp op{CEPH_OSD_OP_OMAPGETKEYS};
+ encode(start_obj, op.indata);
+ encode(max_to_get, op.indata);
+ if (const auto ret = execute_osd_op(hctx, op); ret < 0) {
+ return ret;
+ }
+ try {
+ auto iter = op.outdata.cbegin();
+ decode(*keys, iter);
+ decode(*more, iter);
+ } catch (buffer::error&) {
+ return -EIO;
+ }
+ return keys->size();
+}
+
+int cls_cxx_map_get_vals(cls_method_context_t hctx,
+ const std::string& start_obj,
+ const std::string& filter_prefix,
+ const uint64_t max_to_get,
+ std::map<std::string, ceph::bufferlist> *vals,
+ bool* const more)
+{
+ OSDOp op{CEPH_OSD_OP_OMAPGETVALS};
+ encode(start_obj, op.indata);
+ encode(max_to_get, op.indata);
+ encode(filter_prefix, op.indata);
+ if (const auto ret = execute_osd_op(hctx, op); ret < 0) {
+ return ret;
+ }
+ try {
+ auto iter = op.outdata.cbegin();
+ decode(*vals, iter);
+ decode(*more, iter);
+ } catch (buffer::error&) {
+ return -EIO;
+ }
+ return vals->size();
+}
+
+int cls_cxx_map_get_vals_by_keys(cls_method_context_t hctx,
+ const std::set<std::string> &keys,
+ std::map<std::string, ceph::bufferlist> *vals)
+{
+ OSDOp op{CEPH_OSD_OP_OMAPGETVALSBYKEYS};
+ encode(keys, op.indata);
+ if (const auto ret = execute_osd_op(hctx, op); ret < 0) {
+ return ret;
+ }
+ try {
+ auto iter = op.outdata.cbegin();
+ decode(*vals, iter);
+ } catch (buffer::error&) {
+ return -EIO;
+ }
+ return 0;
+}
+
+int cls_cxx_map_read_header(cls_method_context_t hctx, bufferlist *outbl)
+{
+ OSDOp op{CEPH_OSD_OP_OMAPGETHEADER};
+ if (const auto ret = execute_osd_op(hctx, op); ret < 0) {
+ return ret;
+ }
+ *outbl = std::move(op.outdata);
+ return 0;
+}
+
+int cls_cxx_map_get_val(cls_method_context_t hctx,
+ const string &key,
+ bufferlist *outbl)
+{
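+ // look up a single key by issuing OMAPGETVALSBYKEYS with a one-element key set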
+ OSDOp op{CEPH_OSD_OP_OMAPGETVALSBYKEYS};
+ {
+ std::set<std::string> k{key};
+ encode(k, op.indata);
+ }
+ if (const auto ret = execute_osd_op(hctx, op); ret < 0) {
+ return ret;
+ }
+ std::map<std::string, ceph::bufferlist> m;
+ try {
+ auto iter = op.outdata.cbegin();
+ decode(m, iter);
+ } catch (buffer::error&) {
+ return -EIO;
+ }
+ if (auto iter = std::begin(m); iter != std::end(m)) {
+ *outbl = std::move(iter->second);
+ return 0;
+ } else {
+ return -ENOENT;
+ }
+}
+
+int cls_cxx_map_set_val(cls_method_context_t hctx,
+ const string &key,
+ bufferlist *inbl)
+{
+ OSDOp op{CEPH_OSD_OP_OMAPSETVALS};
+ {
+ std::map<std::string, ceph::bufferlist> m;
+ m[key] = *inbl;
+ encode(m, op.indata);
+ }
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_map_set_vals(cls_method_context_t hctx,
+ const std::map<string, ceph::bufferlist> *map)
+{
+ OSDOp op{CEPH_OSD_OP_OMAPSETVALS};
+ encode(*map, op.indata);
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_map_clear(cls_method_context_t hctx)
+{
+ return 0;
+}
+
+int cls_cxx_map_write_header(cls_method_context_t hctx, bufferlist *inbl)
+{
+ OSDOp op{CEPH_OSD_OP_OMAPSETHEADER};
+ op.indata = std::move(*inbl);
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_map_remove_range(cls_method_context_t hctx,
+ const std::string& key_begin,
+ const std::string& key_end)
+{
+ OSDOp op{CEPH_OSD_OP_OMAPRMKEYRANGE};
+ encode(key_begin, op.indata);
+ encode(key_end, op.indata);
+ return execute_osd_op(hctx, op);
+}
+
+int cls_cxx_map_remove_key(cls_method_context_t hctx, const string &key)
+{
+ return 0;
+}
+
+int cls_cxx_list_watchers(cls_method_context_t hctx,
+ obj_list_watch_response_t *watchers)
+{
+ return 0;
+}
+
+uint64_t cls_current_version(cls_method_context_t hctx)
+{
+ return 0;
+}
+
+
+int cls_current_subop_num(cls_method_context_t hctx)
+{
+ auto* ox = reinterpret_cast<crimson::osd::OpsExecuter*>(hctx);
+ // in contrast to classical OSD, crimson doesn't count OP_CALL and
+ // OP_STAT which seems fine regarding how the plugins we take care
+ // about use this part of API.
+ return ox->get_processed_rw_ops_num();
+}
+
+uint64_t cls_get_features(cls_method_context_t hctx)
+{
+ return 0;
+}
+
+uint64_t cls_get_client_features(cls_method_context_t hctx)
+{
+ try {
+ const auto& message = \
+ reinterpret_cast<crimson::osd::OpsExecuter*>(hctx)->get_message();
+ return message.get_features();
+ } catch (crimson::osd::error& e) {
+ return -e.code().value();
+ }
+}
+
+uint64_t cls_get_pool_stripe_width(cls_method_context_t hctx)
+{
+ auto* ox = reinterpret_cast<crimson::osd::OpsExecuter*>(hctx);
+ return ox->get_pool_stripe_width();
+}
+
+ceph_release_t cls_get_required_osd_release(cls_method_context_t hctx)
+{
+ // FIXME
+ return ceph_release_t::nautilus;
+}
+
+ceph_release_t cls_get_min_compatible_client(cls_method_context_t hctx)
+{
+ // FIXME
+ return ceph_release_t::nautilus;
+}
+
+int cls_get_snapset_seq(cls_method_context_t hctx, uint64_t *snap_seq)
+{
+ return 0;
+}
+
+int cls_cxx_chunk_write_and_set(cls_method_context_t hctx,
+ int ofs,
+ int len,
+ bufferlist *write_inbl,
+ uint32_t op_flags,
+ bufferlist *set_inbl,
+ int set_len)
+{
+ return 0;
+}
+
+int cls_get_manifest_ref_count(cls_method_context_t hctx, string fp_oid)
+{
+ return 0;
+}
+
+uint64_t cls_get_osd_min_alloc_size(cls_method_context_t hctx) {
+ // FIXME
+ return 4096;
+}
diff --git a/src/crimson/osd/object_context.cc b/src/crimson/osd/object_context.cc
new file mode 100644
index 000000000..bbc71d3f9
--- /dev/null
+++ b/src/crimson/osd/object_context.cc
@@ -0,0 +1,34 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/osd/object_context.h"
+
+#include "common/Formatter.h"
+#include "crimson/common/config_proxy.h"
+
+namespace crimson::osd {
+
+ObjectContextRegistry::ObjectContextRegistry(crimson::common::ConfigProxy &conf)
+{
+ obc_lru.set_target_size(conf.get_val<uint64_t>("crimson_osd_obc_lru_size"));
+ conf.add_observer(this);
+}
+
+const char** ObjectContextRegistry::get_tracked_conf_keys() const
+{
+ static const char* KEYS[] = {
+ "crimson_osd_obc_lru_size",
+ nullptr
+ };
+ return KEYS;
+}
+
+void ObjectContextRegistry::handle_conf_change(
+ const crimson::common::ConfigProxy& conf,
+ const std::set <std::string> &changed)
+{
+ obc_lru.set_target_size(conf.get_val<uint64_t>("crimson_osd_obc_lru_size"));
+}
+
+
+}
diff --git a/src/crimson/osd/object_context.h b/src/crimson/osd/object_context.h
new file mode 100644
index 000000000..be238851e
--- /dev/null
+++ b/src/crimson/osd/object_context.h
@@ -0,0 +1,189 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <map>
+#include <optional>
+#include <utility>
+#include <seastar/core/shared_future.hh>
+#include <seastar/core/shared_ptr.hh>
+
+#include "common/intrusive_lru.h"
+#include "osd/object_state.h"
+#include "crimson/common/exception.h"
+#include "crimson/common/tri_mutex.h"
+#include "crimson/osd/osd_operation.h"
+
+namespace ceph {
+ class Formatter;
+}
+
+namespace crimson::common {
+ class ConfigProxy;
+}
+
+namespace crimson::osd {
+
+class Watch;
+
+template <typename OBC>
+struct obc_to_hoid {
+ using type = hobject_t;
+ const type &operator()(const OBC &obc) {
+ return obc.obs.oi.soid;
+ }
+};
+
+class ObjectContext : public ceph::common::intrusive_lru_base<
+ ceph::common::intrusive_lru_config<
+ hobject_t, ObjectContext, obc_to_hoid<ObjectContext>>>
+{
+public:
+ Ref head; // Ref defined as part of ceph::common::intrusive_lru_base
+ ObjectState obs;
+ std::optional<SnapSet> ss;
+ bool loaded : 1;
+ // the watch / notify machinery stays away from the hot and frequently
+ // used paths. std::map is used here mostly for the developer's
+ // convenience.
+ using watch_key_t = std::pair<uint64_t, entity_name_t>;
+ std::map<watch_key_t, seastar::shared_ptr<crimson::osd::Watch>> watchers;
+
+ ObjectContext(const hobject_t &hoid) : obs(hoid), loaded(false) {}
+
+ const hobject_t &get_oid() const {
+ return obs.oi.soid;
+ }
+
+ bool is_head() const {
+ return get_oid().is_head();
+ }
+
+ const SnapSet &get_ro_ss() const {
+ if (is_head()) {
+ ceph_assert(ss);
+ return *ss;
+ } else {
+ ceph_assert(head);
+ return head->get_ro_ss();
+ }
+ }
+
+ void set_head_state(ObjectState &&_obs, SnapSet &&_ss) {
+ ceph_assert(is_head());
+ obs = std::move(_obs);
+ ss = std::move(_ss);
+ loaded = true;
+ }
+
+ void set_clone_state(ObjectState &&_obs, Ref &&_head) {
+ ceph_assert(!is_head());
+ obs = std::move(_obs);
+ head = _head;
+ loaded = true;
+ }
+
+ /// pass the provided exception to any waiting consumers of this ObjectContext
+ template<typename Exception>
+ void interrupt(Exception ex) {
+ lock.abort(std::move(ex));
+ if (recovery_read_marker) {
+ drop_recovery_read();
+ }
+ }
+
+private:
+ tri_mutex lock;
+ bool recovery_read_marker = false;
+
+ template <typename Lock, typename Func>
+ auto _with_lock(Lock&& lock, Func&& func) {
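+ // keep a reference to this obc so it stays alive until the lock is released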
+ Ref obc = this;
+ return lock.lock().then([&lock, func = std::forward<Func>(func), obc]() mutable {
+ return seastar::futurize_invoke(func).finally([&lock, obc] {
+ lock.unlock();
+ });
+ });
+ }
+
+public:
+ template<RWState::State Type, typename Func>
+ auto with_lock(Func&& func) {
+ switch (Type) {
+ case RWState::RWWRITE:
+ return _with_lock(lock.for_write(), std::forward<Func>(func));
+ case RWState::RWREAD:
+ return _with_lock(lock.for_read(), std::forward<Func>(func));
+ case RWState::RWEXCL:
+ return _with_lock(lock.for_excl(), std::forward<Func>(func));
+ case RWState::RWNONE:
+ return seastar::futurize_invoke(std::forward<Func>(func));
+ default:
+ assert(0 == "noop");
+ }
+ }
+ template<RWState::State Type, typename Func>
+ auto with_promoted_lock(Func&& func) {
+ switch (Type) {
+ case RWState::RWWRITE:
+ return _with_lock(lock.excl_from_write(), std::forward<Func>(func));
+ case RWState::RWREAD:
+ return _with_lock(lock.excl_from_read(), std::forward<Func>(func));
+ case RWState::RWEXCL:
+ return _with_lock(lock.excl_from_excl(), std::forward<Func>(func));
+ case RWState::RWNONE:
+ return _with_lock(lock.for_excl(), std::forward<Func>(func));
+ default:
+ assert(0 == "noop");
+ }
+ }
+
+ bool empty() const {
+ return !lock.is_acquired();
+ }
+ bool is_request_pending() const {
+ return lock.is_acquired();
+ }
+
+ bool get_recovery_read() {
+ if (lock.try_lock_for_read()) {
+ recovery_read_marker = true;
+ return true;
+ } else {
+ return false;
+ }
+ }
+ void wait_recovery_read() {
+ assert(lock.get_readers() > 0);
+ recovery_read_marker = true;
+ }
+ void drop_recovery_read() {
+ assert(recovery_read_marker);
+ recovery_read_marker = false;
+ }
+ bool maybe_get_excl() {
+ return lock.try_lock_for_excl();
+ }
+};
+using ObjectContextRef = ObjectContext::Ref;
+
+class ObjectContextRegistry : public md_config_obs_t {
+ ObjectContext::lru_t obc_lru;
+
+public:
+ ObjectContextRegistry(crimson::common::ConfigProxy &conf);
+
+ std::pair<ObjectContextRef, bool> get_cached_obc(const hobject_t &hoid) {
+ return obc_lru.get_or_create(hoid);
+ }
+ ObjectContextRef maybe_get_cached_obc(const hobject_t &hoid) {
+ return obc_lru.get(hoid);
+ }
+
+ const char** get_tracked_conf_keys() const final;
+ void handle_conf_change(const crimson::common::ConfigProxy& conf,
+ const std::set <std::string> &changed) final;
+};
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/ops_executer.cc b/src/crimson/osd/ops_executer.cc
new file mode 100644
index 000000000..6b6614e93
--- /dev/null
+++ b/src/crimson/osd/ops_executer.cc
@@ -0,0 +1,980 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ops_executer.h"
+
+#include <boost/range/adaptor/filtered.hpp>
+#include <boost/range/adaptor/map.hpp>
+#include <boost/range/adaptor/transformed.hpp>
+#include <boost/range/algorithm_ext/push_back.hpp>
+#include <boost/range/algorithm/max_element.hpp>
+#include <boost/range/numeric.hpp>
+
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+
+#include <seastar/core/thread.hh>
+
+#include "crimson/osd/exceptions.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/watch.h"
+#include "osd/ClassHandler.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+OpsExecuter::call_errorator::future<> OpsExecuter::do_op_call(OSDOp& osd_op)
+{
+ std::string cname, mname;
+ ceph::bufferlist indata;
+ try {
+ auto bp = std::begin(osd_op.indata);
+ bp.copy(osd_op.op.cls.class_len, cname);
+ bp.copy(osd_op.op.cls.method_len, mname);
+ bp.copy(osd_op.op.cls.indata_len, indata);
+ } catch (buffer::error&) {
+ logger().warn("call unable to decode class + method + indata");
+ return crimson::ct_error::invarg::make();
+ }
+
+ // NOTE: opening a class can actually result in dlopen(), and thus
+ // blocking the entire reactor. Thanks to ClassHandler's cache,
+ // this is supposed to be extremely infrequent.
+ ClassHandler::ClassData* cls;
+ int r = ClassHandler::get_instance().open_class(cname, &cls);
+ if (r) {
+ logger().warn("class {} open got {}", cname, cpp_strerror(r));
+ if (r == -ENOENT) {
+ return crimson::ct_error::operation_not_supported::make();
+ } else if (r == -EPERM) {
+ // propagate permission errors
+ return crimson::ct_error::permission_denied::make();
+ }
+ return crimson::ct_error::input_output_error::make();
+ }
+
+ ClassHandler::ClassMethod* method = cls->get_method(mname);
+ if (!method) {
+ logger().warn("call method {}.{} does not exist", cname, mname);
+ return crimson::ct_error::operation_not_supported::make();
+ }
+
+ const auto flags = method->get_flags();
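+ // a method without the WR capability cannot create the object, so fail
+ // early if the object does not exist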
+ if (!obc->obs.exists && (flags & CLS_METHOD_WR) == 0) {
+ return crimson::ct_error::enoent::make();
+ }
+
+#if 0
+ if (flags & CLS_METHOD_WR) {
+ ctx->user_modify = true;
+ }
+#endif
+
+ logger().debug("calling method {}.{}, num_read={}, num_write={}",
+ cname, mname, num_read, num_write);
+ const auto prev_rd = num_read;
+ const auto prev_wr = num_write;
+ return seastar::async(
+ [this, method, indata=std::move(indata)]() mutable {
+ ceph::bufferlist outdata;
+ auto cls_context = reinterpret_cast<cls_method_context_t>(this);
+ const auto ret = method->exec(cls_context, indata, outdata);
+ return std::make_pair(ret, std::move(outdata));
+ }
+ ).then(
+ [this, prev_rd, prev_wr, &osd_op, flags]
+ (auto outcome) -> call_errorator::future<> {
+ auto& [ret, outdata] = outcome;
+ osd_op.rval = ret;
+
+ logger().debug("do_op_call: method returned ret={}, outdata.length()={}"
+ " while num_read={}, num_write={}",
+ ret, outdata.length(), num_read, num_write);
+ if (num_read > prev_rd && !(flags & CLS_METHOD_RD)) {
+ logger().error("method tried to read object but is not marked RD");
+ osd_op.rval = -EIO;
+ return crimson::ct_error::input_output_error::make();
+ }
+ if (num_write > prev_wr && !(flags & CLS_METHOD_WR)) {
+ logger().error("method tried to update object but is not marked WR");
+ osd_op.rval = -EIO;
+ return crimson::ct_error::input_output_error::make();
+ }
+ // ceph-osd has this implemented in `PrimaryLogPG::execute_ctx`,
+ // grep for `ignore_out_data`.
+ using crimson::common::local_conf;
+ if (op_info.allows_returnvec() &&
+ op_info.may_write() &&
+ ret >= 0 &&
+ outdata.length() > local_conf()->osd_max_write_op_reply_len) {
+ // the justification of this limit is to not inflate the pg log;
+ // that's the reason why we don't worry about pure reads.
+ logger().error("outdata overflow due to .length()={}, limit={}",
+ outdata.length(),
+ local_conf()->osd_max_write_op_reply_len);
+ osd_op.rval = -EOVERFLOW;
+ return crimson::ct_error::value_too_large::make();
+ }
+ // for write calls we never return data, except on errors or with RETURNVEC.
+ // please refer to cls/cls_hello.cc for details.
+ if (!op_info.may_write() || op_info.allows_returnvec() || ret < 0) {
+ osd_op.op.extent.length = outdata.length();
+ osd_op.outdata.claim_append(outdata);
+ }
+ if (ret < 0) {
+ return crimson::stateful_ec{
+ std::error_code(-ret, std::generic_category()) };
+ } else {
+ return seastar::now();
+ }
+ }
+ );
+}
+
+static watch_info_t create_watch_info(const OSDOp& osd_op,
+ const MOSDOp& msg)
+{
+ using crimson::common::local_conf;
+ const uint32_t timeout =
+ osd_op.op.watch.timeout == 0 ? local_conf()->osd_client_watch_timeout
+ : osd_op.op.watch.timeout;
+ return {
+ osd_op.op.watch.cookie,
+ timeout,
+ msg.get_connection()->get_peer_addr()
+ };
+}
+
+OpsExecuter::watch_errorator::future<> OpsExecuter::do_op_watch_subop_watch(
+ OSDOp& osd_op,
+ ObjectState& os,
+ ceph::os::Transaction& txn)
+{
+ struct connect_ctx_t {
+ ObjectContext::watch_key_t key;
+ crimson::net::ConnectionRef conn;
+ watch_info_t info;
+
+ connect_ctx_t(const OSDOp& osd_op, const MOSDOp& msg)
+ : key(osd_op.op.watch.cookie, msg.get_reqid().name),
+ conn(msg.get_connection()),
+ info(create_watch_info(osd_op, msg)) {
+ }
+ };
+ return with_effect_on_obc(connect_ctx_t{ osd_op, get_message() },
+ [&] (auto& ctx) {
+ const auto& entity = ctx.key.second;
+ auto [it, emplaced] =
+ os.oi.watchers.try_emplace(ctx.key, std::move(ctx.info));
+ if (emplaced) {
+ logger().info("registered new watch {} by {}", it->second, entity);
+ txn.nop();
+ } else {
+ logger().info("found existing watch {} by {}", it->second, entity);
+ }
+ return seastar::now();
+ },
+ [] (auto&& ctx, ObjectContextRef obc) {
+ auto [it, emplaced] = obc->watchers.try_emplace(ctx.key, nullptr);
+ if (emplaced) {
+ const auto& [cookie, entity] = ctx.key;
+ it->second = crimson::osd::Watch::create(obc, ctx.info, entity);
+ logger().info("op_effect: added new watcher: {}", ctx.key);
+ } else {
+ logger().info("op_effect: found existing watcher: {}", ctx.key);
+ }
+ return it->second->connect(std::move(ctx.conn), true /* will_ping */);
+ });
+}
+
+OpsExecuter::watch_errorator::future<> OpsExecuter::do_op_watch_subop_reconnect(
+ OSDOp& osd_op,
+ ObjectState& os,
+ ceph::os::Transaction& txn)
+{
+ const entity_name_t& entity = get_message().get_reqid().name;
+ const auto& cookie = osd_op.op.watch.cookie;
+ if (!os.oi.watchers.count(std::make_pair(cookie, entity))) {
+ return crimson::ct_error::not_connected::make();
+ } else {
+ logger().info("found existing watch by {}", entity);
+ return do_op_watch_subop_watch(osd_op, os, txn);
+ }
+}
+
+OpsExecuter::watch_errorator::future<> OpsExecuter::do_op_watch_subop_unwatch(
+ OSDOp& osd_op,
+ ObjectState& os,
+ ceph::os::Transaction& txn)
+{
+ logger().info("{}", __func__);
+
+ struct disconnect_ctx_t {
+ ObjectContext::watch_key_t key;
+ bool send_disconnect{ false };
+
+ disconnect_ctx_t(const OSDOp& osd_op, const MOSDOp& msg)
+ : key(osd_op.op.watch.cookie, msg.get_reqid().name) {
+ }
+ };
+ return with_effect_on_obc(disconnect_ctx_t{ osd_op, get_message() },
+ [&] (auto& ctx) {
+ const auto& entity = ctx.key.second;
+ if (auto nh = os.oi.watchers.extract(ctx.key); !nh.empty()) {
+ logger().info("removed watch {} by {}", nh.mapped(), entity);
+ txn.nop();
+ } else {
+ logger().info("can't remove: no watch by {}", entity);
+ }
+ return seastar::now();
+ },
+ [] (auto&& ctx, ObjectContextRef obc) {
+ if (auto nh = obc->watchers.extract(ctx.key); !nh.empty()) {
+ return seastar::do_with(std::move(nh.mapped()),
+ [ctx](auto&& watcher) {
+ logger().info("op_effect: disconnect watcher {}", ctx.key);
+ return watcher->remove(ctx.send_disconnect);
+ });
+ } else {
+ logger().info("op_effect: disconnect failed to find watcher {}", ctx.key);
+ return seastar::now();
+ }
+ });
+}
+
+OpsExecuter::watch_errorator::future<> OpsExecuter::do_op_watch_subop_ping(
+ OSDOp& osd_op,
+ ObjectState& os,
+ ceph::os::Transaction& txn)
+{
+ const entity_name_t& entity = get_message().get_reqid().name;
+ const auto& cookie = osd_op.op.watch.cookie;
+ const auto key = std::make_pair(cookie, entity);
+
+ // Note: WATCH with PING doesn't cause may_write() to return true,
+ // so if there is nothing else in the transaction, this is going
+ // to run do_osd_op_effects, but not write out a log entry
+ if (!os.oi.watchers.count(key)) {
+ return crimson::ct_error::not_connected::make();
+ }
+ auto it = obc->watchers.find(key);
+ if (it == std::end(obc->watchers) || !it->second->is_connected()) {
+ return crimson::ct_error::timed_out::make();
+ }
+ logger().info("found existing watch by {}", entity);
+ it->second->got_ping(ceph_clock_now());
+ return seastar::now();
+}
+
+OpsExecuter::watch_errorator::future<> OpsExecuter::do_op_watch(
+ OSDOp& osd_op,
+ ObjectState& os,
+ ceph::os::Transaction& txn)
+{
+ logger().debug("{}", __func__);
+ if (!os.exists) {
+ return crimson::ct_error::enoent::make();
+ }
+ switch (osd_op.op.watch.op) {
+ case CEPH_OSD_WATCH_OP_WATCH:
+ return do_op_watch_subop_watch(osd_op, os, txn);
+ case CEPH_OSD_WATCH_OP_RECONNECT:
+ return do_op_watch_subop_reconnect(osd_op, os, txn);
+ case CEPH_OSD_WATCH_OP_PING:
+ return do_op_watch_subop_ping(osd_op, os, txn);
+ case CEPH_OSD_WATCH_OP_UNWATCH:
+ return do_op_watch_subop_unwatch(osd_op, os, txn);
+ case CEPH_OSD_WATCH_OP_LEGACY_WATCH:
+ logger().warn("ignoring CEPH_OSD_WATCH_OP_LEGACY_WATCH");
+ return crimson::ct_error::invarg::make();
+ }
+ logger().warn("unrecognized WATCH subop: {}", osd_op.op.watch.op);
+ return crimson::ct_error::invarg::make();
+}
+
+static uint64_t get_next_notify_id(epoch_t e)
+{
+ // FIXME
+ static std::uint64_t next_notify_id = 0;
+ return (((uint64_t)e) << 32) | ((uint64_t)(next_notify_id++));
+}
+
+OpsExecuter::watch_errorator::future<> OpsExecuter::do_op_notify(
+ OSDOp& osd_op,
+ const ObjectState& os)
+{
+ logger().debug("{}, msg epoch: {}", __func__, get_message().get_map_epoch());
+
+ if (!os.exists) {
+ return crimson::ct_error::enoent::make();
+ }
+ struct notify_ctx_t {
+ crimson::net::ConnectionRef conn;
+ notify_info_t ninfo;
+ const uint64_t client_gid;
+ const epoch_t epoch;
+
+ notify_ctx_t(const MOSDOp& msg)
+ : conn(msg.get_connection()),
+ client_gid(msg.get_reqid().name.num()),
+ epoch(msg.get_map_epoch()) {
+ }
+ };
+ return with_effect_on_obc(notify_ctx_t{ get_message() },
+ [&] (auto& ctx) {
+ try {
+ auto bp = osd_op.indata.cbegin();
+ uint32_t ver; // obsolete
+ ceph::decode(ver, bp);
+ ceph::decode(ctx.ninfo.timeout, bp);
+ ceph::decode(ctx.ninfo.bl, bp);
+ } catch (const buffer::error&) {
+ ctx.ninfo.timeout = 0;
+ }
+ if (!ctx.ninfo.timeout) {
+ using crimson::common::local_conf;
+ ctx.ninfo.timeout = local_conf()->osd_default_notify_timeout;
+ }
+ ctx.ninfo.notify_id = get_next_notify_id(ctx.epoch);
+ ctx.ninfo.cookie = osd_op.op.notify.cookie;
+ // return our unique notify id to the client
+ ceph::encode(ctx.ninfo.notify_id, osd_op.outdata);
+ return seastar::now();
+ },
+ [] (auto&& ctx, ObjectContextRef obc) {
+ auto alive_watchers = obc->watchers | boost::adaptors::map_values
+ | boost::adaptors::filtered(
+ [] (const auto& w) {
+ // FIXME: filter as for the `is_ping` in `Watch::start_notify`
+ return w->is_alive();
+ });
+ return crimson::osd::Notify::create_n_propagate(
+ std::begin(alive_watchers),
+ std::end(alive_watchers),
+ std::move(ctx.conn),
+ ctx.ninfo,
+ ctx.client_gid,
+ obc->obs.oi.user_version);
+ });
+}
+
+OpsExecuter::watch_errorator::future<> OpsExecuter::do_op_notify_ack(
+ OSDOp& osd_op,
+ const ObjectState& os)
+{
+ logger().debug("{}", __func__);
+
+ struct notifyack_ctx_t {
+ const entity_name_t entity;
+ uint64_t watch_cookie;
+ uint64_t notify_id;
+ ceph::bufferlist reply_bl;
+
+ notifyack_ctx_t(const MOSDOp& msg) : entity(msg.get_reqid().name) {
+ }
+ };
+ return with_effect_on_obc(notifyack_ctx_t{ get_message() },
+ [&] (auto& ctx) -> watch_errorator::future<> {
+ try {
+ auto bp = osd_op.indata.cbegin();
+ ceph::decode(ctx.notify_id, bp);
+ ceph::decode(ctx.watch_cookie, bp);
+ if (!bp.end()) {
+ ceph::decode(ctx.reply_bl, bp);
+ }
+ } catch (const buffer::error&) {
+ // here we behave differently than ceph-osd. For historical reasons,
+ // it falls back to using `osd_op.op.watch.cookie` as `ctx.notify_id`.
+ // crimson just returns EINVAL if the data cannot be decoded.
+ return crimson::ct_error::invarg::make();
+ }
+ return watch_errorator::now();
+ },
+ [] (auto&& ctx, ObjectContextRef obc) {
+ logger().info("notify_ack watch_cookie={}, notify_id={}",
+ ctx.watch_cookie, ctx.notify_id);
+ return seastar::do_for_each(obc->watchers,
+ [ctx=std::move(ctx)] (auto& kv) {
+ const auto& [key, watchp] = kv;
+ static_assert(
+ std::is_same_v<std::decay_t<decltype(watchp)>,
+ seastar::shared_ptr<crimson::osd::Watch>>);
+ auto& [cookie, entity] = key;
+ if (ctx.entity != entity) {
+ logger().debug("skipping watch {}; entity name {} != {}",
+ key, entity, ctx.entity);
+ return seastar::now();
+ }
+ if (ctx.watch_cookie != cookie) {
+ logger().debug("skipping watch {}; cookie {} != {}",
+ key, ctx.watch_cookie, cookie);
+ return seastar::now();
+ }
+ logger().info("acking notify on watch {}", key);
+ return watchp->notify_ack(ctx.notify_id, ctx.reply_bl);
+ });
+ });
+}
+
+OpsExecuter::osd_op_errorator::future<>
+OpsExecuter::execute_op(OSDOp& osd_op)
+{
+ // TODO: dispatch via call table?
+ // TODO: we might want to find a way to unify both input and output
+ // of each op.
+ logger().debug(
+ "handling op {} on object {}",
+ ceph_osd_op_name(osd_op.op.op),
+ get_target());
+ switch (const ceph_osd_op& op = osd_op.op; op.op) {
+ case CEPH_OSD_OP_SYNC_READ:
+ [[fallthrough]];
+ case CEPH_OSD_OP_READ:
+ return do_read_op([&osd_op] (auto& backend, const auto& os) {
+ return backend.read(os, osd_op);
+ });
+ case CEPH_OSD_OP_SPARSE_READ:
+ return do_read_op([&osd_op] (auto& backend, const auto& os) {
+ return backend.sparse_read(os, osd_op);
+ });
+ case CEPH_OSD_OP_CHECKSUM:
+ return do_read_op([&osd_op] (auto& backend, const auto& os) {
+ return backend.checksum(os, osd_op);
+ });
+ case CEPH_OSD_OP_CMPEXT:
+ return do_read_op([&osd_op] (auto& backend, const auto& os) {
+ return backend.cmp_ext(os, osd_op);
+ });
+ case CEPH_OSD_OP_GETXATTR:
+ return do_read_op([&osd_op] (auto& backend, const auto& os) {
+ return backend.getxattr(os, osd_op);
+ });
+ case CEPH_OSD_OP_GETXATTRS:
+ return do_read_op([&osd_op] (auto& backend, const auto& os) {
+ return backend.get_xattrs(os, osd_op);
+ });
+ case CEPH_OSD_OP_RMXATTR:
+ return do_write_op([&osd_op] (auto& backend, auto& os, auto& txn) {
+ return backend.rm_xattr(os, osd_op, txn);
+ }, true);
+ case CEPH_OSD_OP_CREATE:
+ return do_write_op([&osd_op] (auto& backend, auto& os, auto& txn) {
+ return backend.create(os, osd_op, txn);
+ }, true);
+ case CEPH_OSD_OP_WRITE:
+ return do_write_op([this, &osd_op] (auto& backend, auto& os, auto& txn) {
+ return backend.write(os, osd_op, txn, *osd_op_params);
+ }, true);
+ case CEPH_OSD_OP_WRITESAME:
+ return do_write_op([this, &osd_op] (auto& backend, auto& os, auto& txn) {
+ return backend.write_same(os, osd_op, txn, *osd_op_params);
+ }, true);
+ case CEPH_OSD_OP_WRITEFULL:
+ return do_write_op([this, &osd_op] (auto& backend, auto& os, auto& txn) {
+ return backend.writefull(os, osd_op, txn, *osd_op_params);
+ }, true);
+ case CEPH_OSD_OP_APPEND:
+ return do_write_op([this, &osd_op] (auto& backend, auto& os, auto& txn) {
+ return backend.append(os, osd_op, txn, *osd_op_params);
+ }, true);
+ case CEPH_OSD_OP_TRUNCATE:
+ return do_write_op([this, &osd_op] (auto& backend, auto& os, auto& txn) {
+ // FIXME: rework needed. Move this out to do_write_op(), introduce
+ // do_write_op_no_user_modify()...
+ return backend.truncate(os, osd_op, txn, *osd_op_params);
+ }, true);
+ case CEPH_OSD_OP_ZERO:
+ return do_write_op([this, &osd_op] (auto& backend, auto& os, auto& txn) {
+ return backend.zero(os, osd_op, txn, *osd_op_params);
+ }, true);
+ case CEPH_OSD_OP_SETALLOCHINT:
+ return osd_op_errorator::now();
+ case CEPH_OSD_OP_SETXATTR:
+ return do_write_op([&osd_op] (auto& backend, auto& os, auto& txn) {
+ return backend.setxattr(os, osd_op, txn);
+ }, true);
+ case CEPH_OSD_OP_DELETE:
+ return do_write_op([] (auto& backend, auto& os, auto& txn) {
+ return backend.remove(os, txn);
+ }, true);
+ case CEPH_OSD_OP_CALL:
+ return this->do_op_call(osd_op);
+ case CEPH_OSD_OP_STAT:
+ // note: stat does not require RD
+ return do_const_op([&osd_op] (/* const */auto& backend, const auto& os) {
+ return backend.stat(os, osd_op);
+ });
+ case CEPH_OSD_OP_TMAPUP:
+ // TODO: there was an effort to kill TMAP in ceph-osd. According to
+ // @dzafman this isn't possible yet. Maybe it can be accomplished
+ // before crimson is ready, and then we luckily won't need to carry it.
+ return dont_do_legacy_op();
+
+ // OMAP
+ case CEPH_OSD_OP_OMAPGETKEYS:
+ return do_read_op([&osd_op] (auto& backend, const auto& os) {
+ return backend.omap_get_keys(os, osd_op);
+ });
+ case CEPH_OSD_OP_OMAPGETVALS:
+ return do_read_op([&osd_op] (auto& backend, const auto& os) {
+ return backend.omap_get_vals(os, osd_op);
+ });
+ case CEPH_OSD_OP_OMAPGETHEADER:
+ return do_read_op([&osd_op] (auto& backend, const auto& os) {
+ return backend.omap_get_header(os, osd_op);
+ });
+ case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
+ return do_read_op([&osd_op] (auto& backend, const auto& os) {
+ return backend.omap_get_vals_by_keys(os, osd_op);
+ });
+ case CEPH_OSD_OP_OMAPSETVALS:
+#if 0
+ if (!pg.get_pool().info.supports_omap()) {
+ return crimson::ct_error::operation_not_supported::make();
+ }
+#endif
+ return do_write_op([this, &osd_op] (auto& backend, auto& os, auto& txn) {
+ return backend.omap_set_vals(os, osd_op, txn, *osd_op_params);
+ }, true);
+ case CEPH_OSD_OP_OMAPSETHEADER:
+#if 0
+ if (!pg.get_pool().info.supports_omap()) {
+ return crimson::ct_error::operation_not_supported::make();
+ }
+#endif
+ return do_write_op([&osd_op] (auto& backend, auto& os, auto& txn) {
+ return backend.omap_set_header(os, osd_op, txn);
+ }, true);
+ case CEPH_OSD_OP_OMAPRMKEYRANGE:
+#if 0
+ if (!pg.get_pool().info.supports_omap()) {
+ return crimson::ct_error::operation_not_supported::make();
+ }
+#endif
+ return do_write_op([&osd_op] (auto& backend, auto& os, auto& txn) {
+ return backend.omap_remove_range(os, osd_op, txn);
+ }, true);
+ case CEPH_OSD_OP_OMAPCLEAR:
+ return do_write_op([this, &osd_op] (auto& backend, auto& os, auto& txn) {
+ return backend.omap_clear(os, osd_op, txn, *osd_op_params);
+ }, true);
+
+ // watch/notify
+ case CEPH_OSD_OP_WATCH:
+ return do_write_op([this, &osd_op] (auto& backend, auto& os, auto& txn) {
+ return do_op_watch(osd_op, os, txn);
+ }, false);
+ case CEPH_OSD_OP_NOTIFY:
+ return do_read_op([this, &osd_op] (auto&, const auto& os) {
+ return do_op_notify(osd_op, os);
+ });
+ case CEPH_OSD_OP_NOTIFY_ACK:
+ return do_read_op([this, &osd_op] (auto&, const auto& os) {
+ return do_op_notify_ack(osd_op, os);
+ });
+
+ default:
+ logger().warn("unknown op {}", ceph_osd_op_name(op.op));
+ throw std::runtime_error(
+ fmt::format("op '{}' not supported", ceph_osd_op_name(op.op)));
+ }
+}
+
+static inline std::unique_ptr<const PGLSFilter> get_pgls_filter(
+ const std::string& type,
+ bufferlist::const_iterator& iter)
+{
+ // storing non-const PGLSFilter for the sake of ::init()
+ std::unique_ptr<PGLSFilter> filter;
+ if (type.compare("plain") == 0) {
+ filter = std::make_unique<PGLSPlainFilter>();
+ } else {
+ std::size_t dot = type.find(".");
+ if (dot == type.npos || dot == 0 || dot == type.size() - 1) {
+ throw crimson::osd::invalid_argument{};
+ }
+
+ const std::string class_name = type.substr(0, dot);
+ const std::string filter_name = type.substr(dot + 1);
+ ClassHandler::ClassData *cls = nullptr;
+ int r = ClassHandler::get_instance().open_class(class_name, &cls);
+ if (r != 0) {
+ logger().warn("can't open class {}: {}", class_name, cpp_strerror(r));
+ if (r == -EPERM) {
+ // propagate permission error
+ throw crimson::osd::permission_denied{};
+ } else {
+ throw crimson::osd::invalid_argument{};
+ }
+ } else {
+ ceph_assert(cls);
+ }
+
+ ClassHandler::ClassFilter * const class_filter = cls->get_filter(filter_name);
+ if (class_filter == nullptr) {
+ logger().warn("can't find filter {} in class {}", filter_name, class_name);
+ throw crimson::osd::invalid_argument{};
+ }
+
+ filter.reset(class_filter->fn());
+ if (!filter) {
+ // Object classes are obliged to return us something, but let's
+ // give an error rather than asserting out.
+ logger().warn("buggy class {} failed to construct filter {}",
+ class_name, filter_name);
+ throw crimson::osd::invalid_argument{};
+ }
+ }
+
+ ceph_assert(filter);
+ int r = filter->init(iter);
+ if (r < 0) {
+ logger().warn("error initializing filter {}: {}", type, cpp_strerror(r));
+ throw crimson::osd::invalid_argument{};
+ }
+
+ // successfully constructed and initialized, return it.
+ return filter;
+}
+
+static seastar::future<hobject_t> pgls_filter(
+ const PGLSFilter& filter,
+ const PGBackend& backend,
+ const hobject_t& sobj)
+{
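+ // an accepted object is returned as-is; a rejected one maps to an empty
+ // hobject_t, which the caller later drops via its is_matched predicate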
+ if (const auto xattr = filter.get_xattr(); !xattr.empty()) {
+ logger().debug("pgls_filter: filter is interested in xattr={} for obj={}",
+ xattr, sobj);
+ return backend.getxattr(sobj, xattr).safe_then(
+ [&filter, sobj] (ceph::bufferptr bp) {
+ logger().debug("pgls_filter: got xvalue for obj={}", sobj);
+
+ ceph::bufferlist val;
+ val.push_back(std::move(bp));
+ const bool filtered = filter.filter(sobj, val);
+ return seastar::make_ready_future<hobject_t>(filtered ? sobj : hobject_t{});
+ }, PGBackend::get_attr_errorator::all_same_way([&filter, sobj] {
+ logger().debug("pgls_filter: got error for obj={}", sobj);
+
+ if (filter.reject_empty_xattr()) {
+ return seastar::make_ready_future<hobject_t>(hobject_t{});
+ }
+ ceph::bufferlist val;
+ const bool filtered = filter.filter(sobj, val);
+ return seastar::make_ready_future<hobject_t>(filtered ? sobj : hobject_t{});
+ }));
+ } else {
+ ceph::bufferlist empty_lvalue_bl;
+ const bool filtered = filter.filter(sobj, empty_lvalue_bl);
+ return seastar::make_ready_future<hobject_t>(filtered ? sobj : hobject_t{});
+ }
+}
+
+static seastar::future<ceph::bufferlist> do_pgnls_common(
+ const hobject_t& pg_start,
+ const hobject_t& pg_end,
+ const PGBackend& backend,
+ const hobject_t& lower_bound,
+ const std::string& nspace,
+ const uint64_t limit,
+ const PGLSFilter* const filter)
+{
+ if (!(lower_bound.is_min() ||
+ lower_bound.is_max() ||
+ (lower_bound >= pg_start && lower_bound < pg_end))) {
+ // this should only happen with a buggy client.
+ throw std::invalid_argument("outside of PG bounds");
+ }
+
+ return backend.list_objects(lower_bound, limit).then(
+ [&backend, filter, nspace](auto&& ret) {
+ auto& [objects, next] = ret;
+ auto in_my_namespace = [&nspace](const hobject_t& obj) {
+ using crimson::common::local_conf;
+ if (obj.get_namespace() == local_conf()->osd_hit_set_namespace) {
+ return false;
+ } else if (nspace == librados::all_nspaces) {
+ return true;
+ } else {
+ return obj.get_namespace() == nspace;
+ }
+ };
+ auto to_pglsed = [&backend, filter] (const hobject_t& obj) {
+ // this transformation looks costly. However, I don't have any
+ // reason to think PGLS* operations are critical for, let's say,
+ // general performance.
+ //
+ // from tchaikov: "another way is to use seastar::map_reduce(),
+ // to 1) save the effort to filter the already filtered objects
+ // 2) avoid the space to keep the tuple<bool, object> even if
+ // the object is filtered out".
+ if (filter) {
+ return pgls_filter(*filter, backend, obj);
+ } else {
+ return seastar::make_ready_future<hobject_t>(obj);
+ }
+ };
+
+ auto range = objects | boost::adaptors::filtered(in_my_namespace)
+ | boost::adaptors::transformed(to_pglsed);
+ logger().debug("do_pgnls_common: finishing the 1st stage of pgls");
+ return seastar::when_all_succeed(std::begin(range),
+ std::end(range)).then(
+ [next=std::move(next)] (auto items) mutable {
+ // the sole purpose of this chaining is to pass `next` to 2nd
+ // stage altogether with items
+ logger().debug("do_pgnls_common: 1st done");
+ return seastar::make_ready_future<
+ std::tuple<std::vector<hobject_t>, hobject_t>>(
+ std::make_tuple(std::move(items), std::move(next)));
+ });
+ }).then(
+ [pg_end] (auto&& ret) {
+ auto& [items, next] = ret;
+ auto is_matched = [] (const auto& obj) {
+ return !obj.is_min();
+ };
+ auto to_entry = [] (const auto& obj) {
+ return librados::ListObjectImpl{
+ obj.get_namespace(), obj.oid.name, obj.get_key()
+ };
+ };
+
+ pg_nls_response_t response;
+ boost::push_back(response.entries, items | boost::adaptors::filtered(is_matched)
+ | boost::adaptors::transformed(to_entry));
+ response.handle = next.is_max() ? pg_end : next;
+ ceph::bufferlist out;
+ encode(response, out);
+    logger().debug("{}: response.entries.size()={}",
+                   __func__, response.entries.size());
+ return seastar::make_ready_future<ceph::bufferlist>(std::move(out));
+ });
+}
+
+static seastar::future<> do_pgnls(
+ const PG& pg,
+ const std::string& nspace,
+ OSDOp& osd_op)
+{
+ hobject_t lower_bound;
+ try {
+ ceph::decode(lower_bound, osd_op.indata);
+ } catch (const buffer::error&) {
+ throw std::invalid_argument("unable to decode PGNLS handle");
+ }
+ const auto pg_start = pg.get_pgid().pgid.get_hobj_start();
+  const auto pg_end =
+ pg.get_pgid().pgid.get_hobj_end(pg.get_pool().info.get_pg_num());
+ return do_pgnls_common(pg_start,
+ pg_end,
+ pg.get_backend(),
+ lower_bound,
+ nspace,
+ osd_op.op.pgls.count,
+ nullptr /* no filter */)
+ .then([&osd_op](bufferlist bl) {
+ osd_op.outdata = std::move(bl);
+ return seastar::now();
+ });
+}
+
+static seastar::future<> do_pgnls_filtered(
+ const PG& pg,
+ const std::string& nspace,
+ OSDOp& osd_op)
+{
+ std::string cname, mname, type;
+ auto bp = osd_op.indata.cbegin();
+ try {
+ ceph::decode(cname, bp);
+ ceph::decode(mname, bp);
+ ceph::decode(type, bp);
+ } catch (const buffer::error&) {
+ throw crimson::osd::invalid_argument{};
+ }
+
+ auto filter = get_pgls_filter(type, bp);
+
+ hobject_t lower_bound;
+ try {
+ lower_bound.decode(bp);
+ } catch (const buffer::error&) {
+ throw std::invalid_argument("unable to decode PGNLS_FILTER description");
+ }
+
+ logger().debug("{}: cname={}, mname={}, type={}, lower_bound={}, filter={}",
+ __func__, cname, mname, type, lower_bound,
+ static_cast<const void*>(filter.get()));
+ return seastar::do_with(std::move(filter),
+ [&, lower_bound=std::move(lower_bound)](auto&& filter) {
+ const auto pg_start = pg.get_pgid().pgid.get_hobj_start();
+ const auto pg_end = pg.get_pgid().pgid.get_hobj_end(pg.get_pool().info.get_pg_num());
+ return do_pgnls_common(pg_start,
+ pg_end,
+ pg.get_backend(),
+ lower_bound,
+ nspace,
+ osd_op.op.pgls.count,
+ filter.get())
+ .then([&osd_op](bufferlist bl) {
+ osd_op.outdata = std::move(bl);
+ return seastar::now();
+ });
+ });
+}
+
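+// Note: unlike do_pgnls_common() above, this variant takes the
+// seastar::map_reduce() route suggested in the comment there: objects that
+// are filtered out (or belong to another namespace) map to an empty
+// hobject_t and are dropped by the reduce step, so no intermediate
+// tuple<bool, object> needs to be kept around.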
+static seastar::future<ceph::bufferlist> do_pgls_common(
+ const hobject_t& pg_start,
+ const hobject_t& pg_end,
+ const PGBackend& backend,
+ const hobject_t& lower_bound,
+ const std::string& nspace,
+ const uint64_t limit,
+ const PGLSFilter* const filter)
+{
+ if (!(lower_bound.is_min() ||
+ lower_bound.is_max() ||
+ (lower_bound >= pg_start && lower_bound < pg_end))) {
+ // this should only happen with a buggy client.
+ throw std::invalid_argument("outside of PG bounds");
+ }
+
+ using entries_t = decltype(pg_ls_response_t::entries);
+ return backend.list_objects(lower_bound, limit).then(
+ [&backend, filter, nspace](auto&& ret) {
+ auto& [objects, next] = ret;
+ return seastar::when_all(
+ seastar::map_reduce(std::move(objects),
+ [&backend, filter, nspace](const hobject_t& obj) {
+ if (obj.get_namespace() == nspace) {
+ if (filter) {
+ return pgls_filter(*filter, backend, obj);
+ } else {
+ return seastar::make_ready_future<hobject_t>(obj);
+ }
+ } else {
+ return seastar::make_ready_future<hobject_t>(hobject_t{});
+ }
+ },
+ entries_t{},
+ [](entries_t entries, hobject_t obj) {
+ if (!obj.is_min()) {
+ entries.emplace_back(obj.oid, obj.get_key());
+ }
+ return entries;
+ }),
+ seastar::make_ready_future<hobject_t>(next));
+ }).then([pg_end](auto&& ret) {
+ auto entries = std::move(std::get<0>(ret).get0());
+ auto next = std::move(std::get<1>(ret).get0());
+ pg_ls_response_t response;
+ response.handle = next.is_max() ? pg_end : next;
+ response.entries = std::move(entries);
+ ceph::bufferlist out;
+ encode(response, out);
+    logger().debug("{}: response.entries.size()={}",
+                   __func__, response.entries.size());
+ return seastar::make_ready_future<ceph::bufferlist>(std::move(out));
+ });
+}
+
+static seastar::future<> do_pgls(
+ const PG& pg,
+ const std::string& nspace,
+ OSDOp& osd_op)
+{
+ hobject_t lower_bound;
+ auto bp = osd_op.indata.cbegin();
+ try {
+ lower_bound.decode(bp);
+ } catch (const buffer::error&) {
+ throw std::invalid_argument{"unable to decode PGLS handle"};
+ }
+ const auto pg_start = pg.get_pgid().pgid.get_hobj_start();
+ const auto pg_end =
+ pg.get_pgid().pgid.get_hobj_end(pg.get_pool().info.get_pg_num());
+ return do_pgls_common(pg_start,
+ pg_end,
+ pg.get_backend(),
+ lower_bound,
+ nspace,
+ osd_op.op.pgls.count,
+ nullptr /* no filter */)
+ .then([&osd_op](bufferlist bl) {
+ osd_op.outdata = std::move(bl);
+ return seastar::now();
+ });
+}
+
+static seastar::future<> do_pgls_filtered(
+ const PG& pg,
+ const std::string& nspace,
+ OSDOp& osd_op)
+{
+ std::string cname, mname, type;
+ auto bp = osd_op.indata.cbegin();
+ try {
+ ceph::decode(cname, bp);
+ ceph::decode(mname, bp);
+ ceph::decode(type, bp);
+ } catch (const buffer::error&) {
+ throw crimson::osd::invalid_argument{};
+ }
+
+ auto filter = get_pgls_filter(type, bp);
+
+ hobject_t lower_bound;
+ try {
+ lower_bound.decode(bp);
+ } catch (const buffer::error&) {
+ throw std::invalid_argument("unable to decode PGLS_FILTER description");
+ }
+
+ logger().debug("{}: cname={}, mname={}, type={}, lower_bound={}, filter={}",
+ __func__, cname, mname, type, lower_bound,
+ static_cast<const void*>(filter.get()));
+ return seastar::do_with(std::move(filter),
+ [&, lower_bound=std::move(lower_bound)](auto&& filter) {
+ const auto pg_start = pg.get_pgid().pgid.get_hobj_start();
+ const auto pg_end = pg.get_pgid().pgid.get_hobj_end(pg.get_pool().info.get_pg_num());
+ return do_pgls_common(pg_start,
+ pg_end,
+ pg.get_backend(),
+ lower_bound,
+ nspace,
+ osd_op.op.pgls.count,
+ filter.get())
+ .then([&osd_op](bufferlist bl) {
+ osd_op.outdata = std::move(bl);
+ return seastar::now();
+ });
+ });
+}
+
+seastar::future<>
+PgOpsExecuter::execute_op(OSDOp& osd_op)
+{
+ logger().warn("handling op {}", ceph_osd_op_name(osd_op.op.op));
+ switch (const ceph_osd_op& op = osd_op.op; op.op) {
+ case CEPH_OSD_OP_PGLS:
+ return do_pgls(pg, nspace, osd_op);
+ case CEPH_OSD_OP_PGLS_FILTER:
+ return do_pgls_filtered(pg, nspace, osd_op);
+ case CEPH_OSD_OP_PGNLS:
+ return do_pgnls(pg, nspace, osd_op);
+ case CEPH_OSD_OP_PGNLS_FILTER:
+ return do_pgnls_filtered(pg, nspace, osd_op);
+ default:
+ logger().warn("unknown op {}", ceph_osd_op_name(op.op));
+ throw std::runtime_error(
+ fmt::format("op '{}' not supported", ceph_osd_op_name(op.op)));
+ }
+}
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/ops_executer.h b/src/crimson/osd/ops_executer.h
new file mode 100644
index 000000000..42fcf61b8
--- /dev/null
+++ b/src/crimson/osd/ops_executer.h
@@ -0,0 +1,283 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <memory>
+#include <type_traits>
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <boost/smart_ptr/local_shared_ptr.hpp>
+#include <seastar/core/chunked_fifo.hh>
+#include <seastar/core/future.hh>
+#include <seastar/core/shared_future.hh>
+
+#include "common/dout.h"
+#include "crimson/net/Fwd.h"
+#include "os/Transaction.h"
+#include "osd/osd_types.h"
+#include "crimson/osd/object_context.h"
+
+#include "crimson/common/errorator.h"
+#include "crimson/common/type_helpers.h"
+#include "crimson/osd/osd_operations/client_request.h"
+#include "crimson/osd/osd_operations/peering_event.h"
+#include "crimson/osd/shard_services.h"
+#include "crimson/osd/osdmap_gate.h"
+
+#include "crimson/osd/pg_backend.h"
+#include "crimson/osd/exceptions.h"
+
+#include "messages/MOSDOp.h"
+
+class PG;
+class PGLSFilter;
+class OSDOp;
+
+namespace crimson::osd {
+
+// OpsExecuter -- a class for executing ops targeting a certain object.
+class OpsExecuter {
+ using call_errorator = crimson::errorator<
+ crimson::stateful_ec,
+ crimson::ct_error::enoent,
+ crimson::ct_error::invarg,
+ crimson::ct_error::permission_denied,
+ crimson::ct_error::operation_not_supported,
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::value_too_large>;
+ using read_errorator = PGBackend::read_errorator;
+ using write_ertr = PGBackend::write_ertr;
+ using get_attr_errorator = PGBackend::get_attr_errorator;
+ using watch_errorator = crimson::errorator<
+ crimson::ct_error::enoent,
+ crimson::ct_error::invarg,
+ crimson::ct_error::not_connected,
+ crimson::ct_error::timed_out>;
+
+public:
+  // because OpsExecuter is a pretty heavy-weight object, we want to ensure
+  // it's neither copied nor moved by accident. Performance is the sole
+  // reason for prohibiting that.
+ OpsExecuter(OpsExecuter&&) = delete;
+ OpsExecuter(const OpsExecuter&) = delete;
+
+ using osd_op_errorator = crimson::compound_errorator_t<
+ call_errorator,
+ read_errorator,
+ write_ertr,
+ get_attr_errorator,
+ watch_errorator,
+ PGBackend::stat_errorator>;
+
+private:
+  // an operation can be divided into two stages: the main one and the
+  // effect-exposing one. The former is performed immediately on the call to
+  // `do_osd_op()` while the latter on `submit_changes()` – after
+  // successfully processing the main stages of all involved operations.
+  // When any main stage fails, none of the scheduled effect-exposing stages
+  // will be executed.
+  // When an operation requires this division, some variant of
+  // `with_effect()` should be used.
+ struct effect_t {
+ virtual osd_op_errorator::future<> execute() = 0;
+ virtual ~effect_t() = default;
+ };
+
+ ObjectContextRef obc;
+ const OpInfo& op_info;
+ const pg_pool_t& pool_info; // for the sake of the ObjClass API
+ PGBackend& backend;
+ const MOSDOp& msg;
+ std::optional<osd_op_params_t> osd_op_params;
+ bool user_modify = false;
+ ceph::os::Transaction txn;
+
+ size_t num_read = 0; ///< count read ops
+ size_t num_write = 0; ///< count update ops
+
+  // this gizmo could be wrapped in std::optional for the sake of lazy
+  // initialization. We don't need it for ops that don't have any effect.
+  // TODO: verify the init overhead of chunked_fifo
+ seastar::chunked_fifo<std::unique_ptr<effect_t>> op_effects;
+
+ template <class Context, class MainFunc, class EffectFunc>
+ auto with_effect_on_obc(
+ Context&& ctx,
+ MainFunc&& main_func,
+ EffectFunc&& effect_func);
+
+ call_errorator::future<> do_op_call(class OSDOp& osd_op);
+ watch_errorator::future<> do_op_watch(
+ class OSDOp& osd_op,
+ class ObjectState& os,
+ ceph::os::Transaction& txn);
+ watch_errorator::future<> do_op_watch_subop_watch(
+ class OSDOp& osd_op,
+ class ObjectState& os,
+ ceph::os::Transaction& txn);
+ watch_errorator::future<> do_op_watch_subop_reconnect(
+ class OSDOp& osd_op,
+ class ObjectState& os,
+ ceph::os::Transaction& txn);
+ watch_errorator::future<> do_op_watch_subop_unwatch(
+ class OSDOp& osd_op,
+ class ObjectState& os,
+ ceph::os::Transaction& txn);
+ watch_errorator::future<> do_op_watch_subop_ping(
+ class OSDOp& osd_op,
+ class ObjectState& os,
+ ceph::os::Transaction& txn);
+ watch_errorator::future<> do_op_notify(
+ class OSDOp& osd_op,
+ const class ObjectState& os);
+ watch_errorator::future<> do_op_notify_ack(
+ class OSDOp& osd_op,
+ const class ObjectState& os);
+
+ hobject_t &get_target() const {
+ return obc->obs.oi.soid;
+ }
+
+ template <class Func>
+ auto do_const_op(Func&& f) {
+ // TODO: pass backend as read-only
+ return std::forward<Func>(f)(backend, std::as_const(obc->obs));
+ }
+
+ template <class Func>
+ auto do_read_op(Func&& f) {
+ ++num_read;
+ // TODO: pass backend as read-only
+ return do_const_op(std::forward<Func>(f));
+ }
+
+ template <class Func>
+ auto do_write_op(Func&& f, bool um) {
+ ++num_write;
+ if (!osd_op_params) {
+ osd_op_params.emplace();
+ }
+ user_modify = um;
+ return std::forward<Func>(f)(backend, obc->obs, txn);
+ }
+
+ decltype(auto) dont_do_legacy_op() {
+ return crimson::ct_error::operation_not_supported::make();
+ }
+
+public:
+ OpsExecuter(ObjectContextRef obc,
+ const OpInfo& op_info,
+ const pg_pool_t& pool_info,
+ PGBackend& backend,
+ const MOSDOp& msg)
+ : obc(std::move(obc)),
+ op_info(op_info),
+ pool_info(pool_info),
+ backend(backend),
+ msg(msg) {
+ }
+
+ osd_op_errorator::future<> execute_op(class OSDOp& osd_op);
+
+ template <typename Func, typename MutFunc>
+ osd_op_errorator::future<> flush_changes(Func&& func, MutFunc&& mut_func) &&;
+
+ const auto& get_message() const {
+ return msg;
+ }
+
+ size_t get_processed_rw_ops_num() const {
+ return num_read + num_write;
+ }
+
+ uint32_t get_pool_stripe_width() const {
+ return pool_info.get_stripe_width();
+ }
+
+ bool has_seen_write() const {
+ return num_write > 0;
+ }
+};
+
+template <class Context, class MainFunc, class EffectFunc>
+auto OpsExecuter::with_effect_on_obc(
+ Context&& ctx,
+ MainFunc&& main_func,
+ EffectFunc&& effect_func)
+{
+ using context_t = std::decay_t<Context>;
+  // the language offers implicit conversion to pointer-to-function for a
+  // lambda only when it's captureless. We enforce this restriction due to
+  // the fact that `flush_changes()` std::moves many of the executer's parts.
+ using allowed_effect_func_t =
+ seastar::future<> (*)(context_t&&, ObjectContextRef);
+ static_assert(std::is_convertible_v<EffectFunc, allowed_effect_func_t>,
+ "with_effect function is not allowed to capture");
+ struct task_t final : effect_t {
+ context_t ctx;
+ EffectFunc effect_func;
+ ObjectContextRef obc;
+
+ task_t(Context&& ctx, EffectFunc&& effect_func, ObjectContextRef obc)
+ : ctx(std::move(ctx)),
+ effect_func(std::move(effect_func)),
+ obc(std::move(obc)) {
+ }
+ osd_op_errorator::future<> execute() final {
+ return std::move(effect_func)(std::move(ctx), std::move(obc));
+ }
+ };
+ auto task =
+ std::make_unique<task_t>(std::move(ctx), std::move(effect_func), obc);
+ auto& ctx_ref = task->ctx;
+ op_effects.emplace_back(std::move(task));
+ return std::forward<MainFunc>(main_func)(ctx_ref);
+}
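+
+// A rough usage sketch of with_effect_on_obc() -- notify_ctx_t and the two
+// helpers are hypothetical names, used only for illustration. The main
+// stage runs immediately; the effect stage is deferred until
+// flush_changes() has successfully committed all main stages:
+//
+//   return with_effect_on_obc(
+//     notify_ctx_t{},                                  // per-op context
+//     [&] (notify_ctx_t& ctx) {                        // main stage
+//       fill_notify_ctx(ctx, osd_op);
+//       return seastar::now();
+//     },
+//     [] (notify_ctx_t&& ctx, ObjectContextRef obc) {  // effect stage
+//       return do_send_notify(std::move(ctx), std::move(obc));
+//     });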
+
+template <typename Func,
+ typename MutFunc>
+OpsExecuter::osd_op_errorator::future<> OpsExecuter::flush_changes(
+ Func&& func,
+ MutFunc&& mut_func) &&
+{
+ const bool want_mutate = !txn.empty();
+ // osd_op_params are instantiated by every wr-like operation.
+ assert(osd_op_params || !want_mutate);
+ assert(obc);
+ if (__builtin_expect(op_effects.empty(), true)) {
+ return want_mutate ? std::forward<MutFunc>(mut_func)(std::move(txn),
+ std::move(obc),
+ std::move(*osd_op_params),
+ user_modify)
+ : std::forward<Func>(func)(std::move(obc));
+ } else {
+ return (want_mutate ? std::forward<MutFunc>(mut_func)(std::move(txn),
+ std::move(obc),
+ std::move(*osd_op_params),
+ user_modify)
+ : std::forward<Func>(func)(std::move(obc))
+ ).safe_then([this] {
+      // let's do the cleaning of `op_effects` in the destructor
+ return crimson::do_for_each(op_effects, [] (auto& op_effect) {
+ return op_effect->execute();
+ });
+ });
+ }
+}
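+
+// A rough sketch of the expected call site of flush_changes() -- the two
+// helpers are illustrative, not part of this code. The first callable
+// handles the read-only case, the second receives the accumulated
+// transaction together with the collected write parameters:
+//
+//   std::move(ox).flush_changes(
+//     [] (ObjectContextRef obc) {                   // nothing was mutated
+//       return complete_read_only(std::move(obc));  // hypothetical helper
+//     },
+//     [this] (ceph::os::Transaction&& txn, ObjectContextRef obc,
+//             osd_op_params_t&& params, bool user_modify) {
+//       return submit_transaction(std::move(obc), std::move(txn),  // hypothetical
+//                                 std::move(params), user_modify);
+//     });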
+
+// PgOpsExecuter -- a class for executing ops targeting a certain PG.
+class PgOpsExecuter {
+public:
+ PgOpsExecuter(const PG& pg, const MOSDOp& msg)
+ : pg(pg), nspace(msg.get_hobj().nspace) {
+ }
+
+ seastar::future<> execute_op(class OSDOp& osd_op);
+
+private:
+ const PG& pg;
+ const std::string& nspace;
+};
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/osd.cc b/src/crimson/osd/osd.cc
new file mode 100644
index 000000000..521cb9ba3
--- /dev/null
+++ b/src/crimson/osd/osd.cc
@@ -0,0 +1,1364 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "osd.h"
+
+#include <sys/utsname.h>
+
+#include <boost/iterator/counting_iterator.hpp>
+#include <boost/range/join.hpp>
+#include <boost/smart_ptr/make_local_shared.hpp>
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+#include <seastar/core/timer.hh>
+
+#include "common/pick_address.h"
+#include "include/util.h"
+
+#include "messages/MCommand.h"
+#include "messages/MOSDBeacon.h"
+#include "messages/MOSDBoot.h"
+#include "messages/MOSDMap.h"
+#include "messages/MOSDMarkMeDown.h"
+#include "messages/MOSDOp.h"
+#include "messages/MOSDPGLog.h"
+#include "messages/MOSDPGPull.h"
+#include "messages/MOSDPGPush.h"
+#include "messages/MOSDPGPushReply.h"
+#include "messages/MOSDPGRecoveryDelete.h"
+#include "messages/MOSDPGRecoveryDeleteReply.h"
+#include "messages/MOSDRepOpReply.h"
+#include "messages/MOSDScrub2.h"
+#include "messages/MPGStats.h"
+
+#include "os/Transaction.h"
+#include "osd/ClassHandler.h"
+#include "osd/OSDCap.h"
+#include "osd/PGPeeringEvent.h"
+#include "osd/PeeringState.h"
+
+#include "crimson/admin/osd_admin.h"
+#include "crimson/admin/pg_commands.h"
+#include "crimson/common/exception.h"
+#include "crimson/mon/MonClient.h"
+#include "crimson/net/Connection.h"
+#include "crimson/net/Messenger.h"
+#include "crimson/os/futurized_collection.h"
+#include "crimson/os/futurized_store.h"
+#include "crimson/osd/heartbeat.h"
+#include "crimson/osd/osd_meta.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/pg_backend.h"
+#include "crimson/osd/pg_meta.h"
+#include "crimson/osd/osd_operations/client_request.h"
+#include "crimson/osd/osd_operations/compound_peering_request.h"
+#include "crimson/osd/osd_operations/peering_event.h"
+#include "crimson/osd/osd_operations/pg_advance_map.h"
+#include "crimson/osd/osd_operations/recovery_subrequest.h"
+#include "crimson/osd/osd_operations/replicated_request.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+ static constexpr int TICK_INTERVAL = 1;
+}
+
+using crimson::common::local_conf;
+using crimson::os::FuturizedStore;
+
+namespace crimson::osd {
+
+OSD::OSD(int id, uint32_t nonce,
+ crimson::net::MessengerRef cluster_msgr,
+ crimson::net::MessengerRef public_msgr,
+ crimson::net::MessengerRef hb_front_msgr,
+ crimson::net::MessengerRef hb_back_msgr)
+ : whoami{id},
+ nonce{nonce},
+ // do this in background
+ beacon_timer{[this] { (void)send_beacon(); }},
+ cluster_msgr{cluster_msgr},
+ public_msgr{public_msgr},
+ monc{new crimson::mon::Client{*public_msgr, *this}},
+ mgrc{new crimson::mgr::Client{*public_msgr, *this}},
+ store{crimson::os::FuturizedStore::create(
+ local_conf().get_val<std::string>("osd_objectstore"),
+ local_conf().get_val<std::string>("osd_data"),
+ local_conf().get_config_values())},
+ shard_services{*this, whoami, *cluster_msgr, *public_msgr, *monc, *mgrc, *store},
+ heartbeat{new Heartbeat{whoami, shard_services, *monc, hb_front_msgr, hb_back_msgr}},
+ // do this in background
+ tick_timer{[this] {
+ update_heartbeat_peers();
+ update_stats();
+ }},
+ asok{seastar::make_lw_shared<crimson::admin::AdminSocket>()},
+ osdmap_gate("OSD::osdmap_gate", std::make_optional(std::ref(shard_services)))
+{
+ osdmaps[0] = boost::make_local_shared<OSDMap>();
+ for (auto msgr : {std::ref(cluster_msgr), std::ref(public_msgr),
+ std::ref(hb_front_msgr), std::ref(hb_back_msgr)}) {
+ msgr.get()->set_auth_server(monc.get());
+ msgr.get()->set_auth_client(monc.get());
+ }
+
+ if (local_conf()->osd_open_classes_on_start) {
+ const int r = ClassHandler::get_instance().open_all_classes();
+ if (r) {
+ logger().warn("{} warning: got an error loading one or more classes: {}",
+ __func__, cpp_strerror(r));
+ }
+ }
+}
+
+OSD::~OSD() = default;
+
+namespace {
+// Initial features in new superblock.
+// Features here are also automatically upgraded
+CompatSet get_osd_initial_compat_set()
+{
+ CompatSet::FeatureSet ceph_osd_feature_compat;
+ CompatSet::FeatureSet ceph_osd_feature_ro_compat;
+ CompatSet::FeatureSet ceph_osd_feature_incompat;
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
+ return CompatSet(ceph_osd_feature_compat,
+ ceph_osd_feature_ro_compat,
+ ceph_osd_feature_incompat);
+}
+}
+
+seastar::future<> OSD::mkfs(uuid_d osd_uuid, uuid_d cluster_fsid)
+{
+ return store->start().then([this, osd_uuid] {
+ return store->mkfs(osd_uuid);
+ }).then([this] {
+ return store->mount();
+ }).then([cluster_fsid, this] {
+ superblock.cluster_fsid = cluster_fsid;
+ superblock.osd_fsid = store->get_fsid();
+ superblock.whoami = whoami;
+ superblock.compat_features = get_osd_initial_compat_set();
+
+ logger().info(
+ "{} writing superblock cluster_fsid {} osd_fsid {}",
+ __func__,
+ cluster_fsid,
+ superblock.osd_fsid);
+ return store->create_new_collection(coll_t::meta());
+ }).then([this] (auto ch) {
+    meta_coll = make_unique<OSDMeta>(ch, store.get());
+ ceph::os::Transaction t;
+ meta_coll->create(t);
+ meta_coll->store_superblock(t, superblock);
+ return store->do_transaction(meta_coll->collection(), std::move(t));
+ }).then([cluster_fsid, this] {
+ return when_all_succeed(
+ store->write_meta("ceph_fsid", cluster_fsid.to_string()),
+ store->write_meta("whoami", std::to_string(whoami)));
+ }).then_unpack([cluster_fsid, this] {
+ fmt::print("created object store {} for osd.{} fsid {}\n",
+ local_conf().get_val<std::string>("osd_data"),
+ whoami, cluster_fsid);
+ return seastar::now();
+ });
+}
+
+namespace {
+ entity_addrvec_t pick_addresses(int what) {
+ entity_addrvec_t addrs;
+ crimson::common::CephContext cct;
+ if (int r = ::pick_addresses(&cct, what, &addrs, -1); r < 0) {
+ throw std::runtime_error("failed to pick address");
+ }
+ for (auto addr : addrs.v) {
+ logger().info("picked address {}", addr);
+ }
+ return addrs;
+ }
+ std::pair<entity_addrvec_t, bool>
+ replace_unknown_addrs(entity_addrvec_t maybe_unknowns,
+ const entity_addrvec_t& knowns) {
+ bool changed = false;
+ auto maybe_replace = [&](entity_addr_t addr) {
+ if (!addr.is_blank_ip()) {
+ return addr;
+ }
+ for (auto& b : knowns.v) {
+ if (addr.get_family() == b.get_family()) {
+ auto a = b;
+ a.set_nonce(addr.get_nonce());
+ a.set_type(addr.get_type());
+ a.set_port(addr.get_port());
+ changed = true;
+ return a;
+ }
+ }
+ throw std::runtime_error("failed to replace unknown address");
+ };
+ entity_addrvec_t replaced;
+ std::transform(maybe_unknowns.v.begin(),
+ maybe_unknowns.v.end(),
+ std::back_inserter(replaced.v),
+ maybe_replace);
+ return {replaced, changed};
+ }
+}
+
+seastar::future<> OSD::start()
+{
+ logger().info("start");
+
+ startup_time = ceph::mono_clock::now();
+
+ return store->start().then([this] {
+ return store->mount();
+ }).then([this] {
+ return store->open_collection(coll_t::meta());
+ }).then([this](auto ch) {
+ meta_coll = make_unique<OSDMeta>(ch, store.get());
+ return meta_coll->load_superblock();
+ }).then([this](OSDSuperblock&& sb) {
+ superblock = std::move(sb);
+ return get_map(superblock.current_epoch);
+ }).then([this](cached_map_t&& map) {
+ shard_services.update_map(map);
+ osdmap_gate.got_map(map->get_epoch());
+ osdmap = std::move(map);
+ return load_pgs();
+ }).then([this] {
+
+ uint64_t osd_required =
+ CEPH_FEATURE_UID |
+ CEPH_FEATURE_PGID64 |
+ CEPH_FEATURE_OSDENC;
+ using crimson::net::SocketPolicy;
+
+ public_msgr->set_default_policy(SocketPolicy::stateless_server(0));
+ public_msgr->set_policy(entity_name_t::TYPE_MON,
+ SocketPolicy::lossy_client(osd_required));
+ public_msgr->set_policy(entity_name_t::TYPE_MGR,
+ SocketPolicy::lossy_client(osd_required));
+ public_msgr->set_policy(entity_name_t::TYPE_OSD,
+ SocketPolicy::stateless_server(0));
+
+ cluster_msgr->set_default_policy(SocketPolicy::stateless_server(0));
+ cluster_msgr->set_policy(entity_name_t::TYPE_MON,
+ SocketPolicy::lossy_client(0));
+ cluster_msgr->set_policy(entity_name_t::TYPE_OSD,
+ SocketPolicy::lossless_peer(osd_required));
+ cluster_msgr->set_policy(entity_name_t::TYPE_CLIENT,
+ SocketPolicy::stateless_server(0));
+
+ crimson::net::dispatchers_t dispatchers{this, monc.get(), mgrc.get()};
+ return seastar::when_all_succeed(
+ cluster_msgr->try_bind(pick_addresses(CEPH_PICK_ADDRESS_CLUSTER),
+ local_conf()->ms_bind_port_min,
+ local_conf()->ms_bind_port_max)
+ .safe_then([this, dispatchers]() mutable {
+ return cluster_msgr->start(dispatchers);
+ }, crimson::net::Messenger::bind_ertr::all_same_way(
+ [] (const std::error_code& e) {
+ logger().error("cluster messenger try_bind(): address range is unavailable.");
+ ceph_abort();
+ })),
+ public_msgr->try_bind(pick_addresses(CEPH_PICK_ADDRESS_PUBLIC),
+ local_conf()->ms_bind_port_min,
+ local_conf()->ms_bind_port_max)
+ .safe_then([this, dispatchers]() mutable {
+ return public_msgr->start(dispatchers);
+ }, crimson::net::Messenger::bind_ertr::all_same_way(
+ [] (const std::error_code& e) {
+ logger().error("public messenger try_bind(): address range is unavailable.");
+ ceph_abort();
+ })));
+ }).then_unpack([this] {
+ return seastar::when_all_succeed(monc->start(),
+ mgrc->start());
+ }).then_unpack([this] {
+ return _add_me_to_crush();
+ }).then([this] {
+ monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);
+ monc->sub_want("mgrmap", 0, 0);
+ monc->sub_want("osdmap", 0, 0);
+ return monc->renew_subs();
+ }).then([this] {
+ if (auto [addrs, changed] =
+ replace_unknown_addrs(cluster_msgr->get_myaddrs(),
+ public_msgr->get_myaddrs()); changed) {
+ return cluster_msgr->set_myaddrs(addrs);
+ } else {
+ return seastar::now();
+ }
+ }).then([this] {
+ return heartbeat->start(public_msgr->get_myaddrs(),
+ cluster_msgr->get_myaddrs());
+ }).then([this] {
+ // create the admin-socket server, and the objects that register
+ // to handle incoming commands
+ return start_asok_admin();
+ }).then([this] {
+ return start_boot();
+ });
+}
+
+seastar::future<> OSD::start_boot()
+{
+ state.set_preboot();
+ return monc->get_version("osdmap").then([this](auto&& ret) {
+ auto [newest, oldest] = ret;
+ return _preboot(oldest, newest);
+ });
+}
+
+seastar::future<> OSD::_preboot(version_t oldest, version_t newest)
+{
+ logger().info("osd.{}: _preboot", whoami);
+ if (osdmap->get_epoch() == 0) {
+ logger().info("waiting for initial osdmap");
+ } else if (osdmap->is_destroyed(whoami)) {
+ logger().warn("osdmap says I am destroyed");
+ // provide a small margin so we don't livelock seeing if we
+ // un-destroyed ourselves.
+ if (osdmap->get_epoch() > newest - 1) {
+ throw std::runtime_error("i am destroyed");
+ }
+ } else if (osdmap->is_noup(whoami)) {
+ logger().warn("osdmap NOUP flag is set, waiting for it to clear");
+ } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
+ logger().error("osdmap SORTBITWISE OSDMap flag is NOT set; please set it");
+ } else if (osdmap->require_osd_release < ceph_release_t::octopus) {
+ logger().error("osdmap require_osd_release < octopus; please upgrade to octopus");
+ } else if (false) {
+ // TODO: update mon if current fullness state is different from osdmap
+ } else if (version_t n = local_conf()->osd_map_message_max;
+ osdmap->get_epoch() >= oldest - 1 &&
+ osdmap->get_epoch() + n > newest) {
+ return _send_boot();
+ }
+ // get all the latest maps
+ if (osdmap->get_epoch() + 1 >= oldest) {
+ return shard_services.osdmap_subscribe(osdmap->get_epoch() + 1, false);
+ } else {
+ return shard_services.osdmap_subscribe(oldest - 1, true);
+ }
+}
+
+seastar::future<> OSD::_send_boot()
+{
+ state.set_booting();
+
+ logger().info("hb_back_msgr: {}", heartbeat->get_back_addrs());
+ logger().info("hb_front_msgr: {}", heartbeat->get_front_addrs());
+ logger().info("cluster_msgr: {}", cluster_msgr->get_myaddr());
+ auto m = make_message<MOSDBoot>(superblock,
+ osdmap->get_epoch(),
+ osdmap->get_epoch(),
+ heartbeat->get_back_addrs(),
+ heartbeat->get_front_addrs(),
+ cluster_msgr->get_myaddrs(),
+ CEPH_FEATURES_ALL);
+ collect_sys_info(&m->metadata, NULL);
+ return monc->send_message(m);
+}
+
+seastar::future<> OSD::_add_me_to_crush()
+{
+ if (!local_conf().get_val<bool>("osd_crush_update_on_start")) {
+ return seastar::now();
+ }
+ auto get_weight = [this] {
+ if (auto w = local_conf().get_val<double>("osd_crush_initial_weight");
+ w >= 0) {
+ return seastar::make_ready_future<double>(w);
+ } else {
+ return store->stat().then([](auto st) {
+ auto total = st.total;
+ return seastar::make_ready_future<double>(
+ std::max(.00001,
+ double(total) / double(1ull << 40))); // TB
+ });
+ }
+ };
+ return get_weight().then([this](auto weight) {
+ const crimson::crush::CrushLocation loc{make_unique<CephContext>().get()};
+ logger().info("{} crush location is {}", __func__, loc);
+ string cmd = fmt::format(R"({{
+ "prefix": "osd crush create-or-move",
+ "id": {},
+ "weight": {:.4f},
+ "args": [{}]
+ }})", whoami, weight, loc);
+ return monc->run_command({cmd}, {});
+ }).then([](auto&& command_result) {
+ [[maybe_unused]] auto [code, message, out] = std::move(command_result);
+ if (code) {
+      logger().warn("failed to add to crush: {} ({})", message, code);
+      throw std::runtime_error("failed to add to crush");
+ } else {
+ logger().info("added to crush: {}", message);
+ }
+ return seastar::now();
+ });
+}
+
+seastar::future<> OSD::handle_command(crimson::net::ConnectionRef conn,
+ Ref<MCommand> m)
+{
+ return asok->handle_command(conn, std::move(m));
+}
+
+/*
+  The OSD's Admin Socket object created here has two servers (i.e. blocks of
+  commands to handle) registered to it:
+  - OSD-specific commands are handled by the OSD object;
+  - some common commands are registered to be handled directly by the
+    AdminSocket object itself.
+*/
+seastar::future<> OSD::start_asok_admin()
+{
+ auto asok_path = local_conf().get_val<std::string>("admin_socket");
+ using namespace crimson::admin;
+ return asok->start(asok_path).then([this] {
+ return seastar::when_all_succeed(
+ asok->register_admin_commands(),
+ asok->register_command(make_asok_hook<OsdStatusHook>(std::as_const(*this))),
+ asok->register_command(make_asok_hook<SendBeaconHook>(*this)),
+ asok->register_command(make_asok_hook<FlushPgStatsHook>(*this)),
+ asok->register_command(make_asok_hook<DumpPGStateHistory>(std::as_const(*this))),
+ asok->register_command(make_asok_hook<SeastarMetricsHook>()),
+ // PG commands
+ asok->register_command(make_asok_hook<pg::QueryCommand>(*this)),
+ asok->register_command(make_asok_hook<pg::MarkUnfoundLostCommand>(*this)));
+ }).then_unpack([] {
+ return seastar::now();
+ });
+}
+
+seastar::future<> OSD::stop()
+{
+ logger().info("stop");
+ // see also OSD::shutdown()
+ return prepare_to_stop().then([this] {
+ state.set_stopping();
+ logger().debug("prepared to stop");
+ public_msgr->stop();
+ cluster_msgr->stop();
+ auto gate_close_fut = gate.close();
+ return asok->stop().then([this] {
+ return heartbeat->stop();
+ }).then([this] {
+ return store->umount();
+ }).then([this] {
+ return store->stop();
+ }).then([this] {
+ return seastar::parallel_for_each(pg_map.get_pgs(),
+ [](auto& p) {
+ return p.second->stop();
+ });
+ }).then([this] {
+ return monc->stop();
+ }).then([this] {
+ return mgrc->stop();
+ }).then([fut=std::move(gate_close_fut)]() mutable {
+ return std::move(fut);
+ }).then([this] {
+ return when_all_succeed(
+ public_msgr->shutdown(),
+ cluster_msgr->shutdown());
+ }).then_unpack([] {
+ return seastar::now();
+ }).handle_exception([](auto ep) {
+ logger().error("error while stopping osd: {}", ep);
+ });
+ });
+}
+
+void OSD::dump_status(Formatter* f) const
+{
+ f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
+ f->dump_stream("osd_fsid") << superblock.osd_fsid;
+ f->dump_unsigned("whoami", superblock.whoami);
+ f->dump_string("state", state.to_string());
+ f->dump_unsigned("oldest_map", superblock.oldest_map);
+ f->dump_unsigned("newest_map", superblock.newest_map);
+ f->dump_unsigned("num_pgs", pg_map.get_pgs().size());
+}
+
+void OSD::dump_pg_state_history(Formatter* f) const
+{
+ f->open_array_section("pgs");
+ for (auto [pgid, pg] : pg_map.get_pgs()) {
+ f->open_object_section("pg");
+ f->dump_stream("pg") << pgid;
+ const auto& peering_state = pg->get_peering_state();
+ f->dump_string("currently", peering_state.get_current_state());
+ peering_state.dump_history(f);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void OSD::print(std::ostream& out) const
+{
+ out << "{osd." << superblock.whoami << " "
+ << superblock.osd_fsid << " [" << superblock.oldest_map
+ << "," << superblock.newest_map << "] " << pg_map.get_pgs().size()
+ << " pgs}";
+}
+
+seastar::future<> OSD::load_pgs()
+{
+ return store->list_collections().then([this](auto colls) {
+ return seastar::parallel_for_each(colls, [this](auto coll) {
+ spg_t pgid;
+ if (coll.is_pg(&pgid)) {
+ return load_pg(pgid).then([pgid, this](auto&& pg) {
+ logger().info("load_pgs: loaded {}", pgid);
+ pg_map.pg_loaded(pgid, std::move(pg));
+ shard_services.inc_pg_num();
+ return seastar::now();
+ });
+ } else if (coll.is_temp(&pgid)) {
+ // TODO: remove the collection
+ return seastar::now();
+ } else {
+ logger().warn("ignoring unrecognized collection: {}", coll);
+ return seastar::now();
+ }
+ });
+ });
+}
+
+seastar::future<Ref<PG>> OSD::make_pg(cached_map_t create_map,
+ spg_t pgid,
+ bool do_create)
+{
+ using ec_profile_t = map<string,string>;
+ auto get_pool_info = [create_map, pgid, this] {
+ if (create_map->have_pg_pool(pgid.pool())) {
+ pg_pool_t pi = *create_map->get_pg_pool(pgid.pool());
+ string name = create_map->get_pool_name(pgid.pool());
+ ec_profile_t ec_profile;
+ if (pi.is_erasure()) {
+ ec_profile = create_map->get_erasure_code_profile(pi.erasure_code_profile);
+ }
+ return seastar::make_ready_future<std::tuple<pg_pool_t, string, ec_profile_t>>(
+ std::make_tuple(std::move(pi),
+ std::move(name),
+ std::move(ec_profile)));
+ } else {
+ // pool was deleted; grab final pg_pool_t off disk.
+ return meta_coll->load_final_pool_info(pgid.pool());
+ }
+ };
+ auto get_collection = [pgid, do_create, this] {
+ const coll_t cid{pgid};
+ if (do_create) {
+ return store->create_new_collection(cid);
+ } else {
+ return store->open_collection(cid);
+ }
+ };
+ return seastar::when_all(
+ std::move(get_pool_info),
+ std::move(get_collection)
+ ).then([pgid, create_map, this] (auto&& ret) {
+ auto [pool, name, ec_profile] = std::move(std::get<0>(ret).get0());
+ auto coll = std::move(std::get<1>(ret).get0());
+ return seastar::make_ready_future<Ref<PG>>(
+ new PG{pgid,
+ pg_shard_t{whoami, pgid.shard},
+ std::move(coll),
+ std::move(pool),
+ std::move(name),
+ create_map,
+ shard_services,
+ ec_profile});
+ });
+}
+
+seastar::future<Ref<PG>> OSD::load_pg(spg_t pgid)
+{
+ logger().debug("{}: {}", __func__, pgid);
+
+ return seastar::do_with(PGMeta(store.get(), pgid), [] (auto& pg_meta) {
+ return pg_meta.get_epoch();
+ }).then([this](epoch_t e) {
+ return get_map(e);
+ }).then([pgid, this] (auto&& create_map) {
+ return make_pg(std::move(create_map), pgid, false);
+ }).then([this](Ref<PG> pg) {
+ return pg->read_state(store.get()).then([pg] {
+ return seastar::make_ready_future<Ref<PG>>(std::move(pg));
+ });
+ }).handle_exception([pgid](auto ep) {
+ logger().info("pg {} saw exception on load {}", pgid, ep);
+ ceph_abort("Could not load pg" == 0);
+ return seastar::make_exception_future<Ref<PG>>(ep);
+ });
+}
+
+std::optional<seastar::future<>>
+OSD::ms_dispatch(crimson::net::ConnectionRef conn, MessageRef m)
+{
+ if (state.is_stopping()) {
+ return {};
+ }
+ bool dispatched = true;
+ gate.dispatch_in_background(__func__, *this, [this, conn, &m, &dispatched] {
+ switch (m->get_type()) {
+ case CEPH_MSG_OSD_MAP:
+ return handle_osd_map(conn, boost::static_pointer_cast<MOSDMap>(m));
+ case CEPH_MSG_OSD_OP:
+ return handle_osd_op(conn, boost::static_pointer_cast<MOSDOp>(m));
+ case MSG_OSD_PG_CREATE2:
+ shard_services.start_operation<CompoundPeeringRequest>(
+ *this,
+ conn,
+ m);
+ return seastar::now();
+ case MSG_COMMAND:
+ return handle_command(conn, boost::static_pointer_cast<MCommand>(m));
+ case MSG_OSD_MARK_ME_DOWN:
+ return handle_mark_me_down(conn, boost::static_pointer_cast<MOSDMarkMeDown>(m));
+ case MSG_OSD_PG_PULL:
+ [[fallthrough]];
+ case MSG_OSD_PG_PUSH:
+ [[fallthrough]];
+ case MSG_OSD_PG_PUSH_REPLY:
+ [[fallthrough]];
+ case MSG_OSD_PG_RECOVERY_DELETE:
+ [[fallthrough]];
+ case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
+ [[fallthrough]];
+ case MSG_OSD_PG_SCAN:
+ [[fallthrough]];
+ case MSG_OSD_PG_BACKFILL:
+ [[fallthrough]];
+ case MSG_OSD_PG_BACKFILL_REMOVE:
+ return handle_recovery_subreq(conn, boost::static_pointer_cast<MOSDFastDispatchOp>(m));
+ case MSG_OSD_PG_LEASE:
+ [[fallthrough]];
+ case MSG_OSD_PG_LEASE_ACK:
+ [[fallthrough]];
+ case MSG_OSD_PG_NOTIFY2:
+ [[fallthrough]];
+ case MSG_OSD_PG_INFO2:
+ [[fallthrough]];
+ case MSG_OSD_PG_QUERY2:
+ [[fallthrough]];
+ case MSG_OSD_BACKFILL_RESERVE:
+ [[fallthrough]];
+ case MSG_OSD_RECOVERY_RESERVE:
+ [[fallthrough]];
+ case MSG_OSD_PG_LOG:
+ return handle_peering_op(conn, boost::static_pointer_cast<MOSDPeeringOp>(m));
+ case MSG_OSD_REPOP:
+ return handle_rep_op(conn, boost::static_pointer_cast<MOSDRepOp>(m));
+ case MSG_OSD_REPOPREPLY:
+ return handle_rep_op_reply(conn, boost::static_pointer_cast<MOSDRepOpReply>(m));
+ case MSG_OSD_SCRUB2:
+ return handle_scrub(conn, boost::static_pointer_cast<MOSDScrub2>(m));
+ default:
+ dispatched = false;
+ return seastar::now();
+ }
+ });
+ return (dispatched ? std::make_optional(seastar::now()) : std::nullopt);
+}
+
+void OSD::ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace)
+{
+ // TODO: cleanup the session attached to this connection
+ logger().warn("ms_handle_reset");
+}
+
+void OSD::ms_handle_remote_reset(crimson::net::ConnectionRef conn)
+{
+ logger().warn("ms_handle_remote_reset");
+}
+
+void OSD::handle_authentication(const EntityName& name,
+ const AuthCapsInfo& caps_info)
+{
+ // TODO: store the parsed cap and associate it with the connection
+ if (caps_info.allow_all) {
+ logger().debug("{} {} has all caps", __func__, name);
+ return;
+ }
+ if (caps_info.caps.length() > 0) {
+ auto p = caps_info.caps.cbegin();
+ string str;
+ try {
+ decode(str, p);
+ } catch (ceph::buffer::error& e) {
+ logger().warn("{} {} failed to decode caps string", __func__, name);
+ return;
+ }
+ OSDCap caps;
+ if (caps.parse(str)) {
+ logger().debug("{} {} has caps {}", __func__, name, str);
+ } else {
+ logger().warn("{} {} failed to parse caps {}", __func__, name, str);
+ }
+ }
+}
+
+void OSD::update_stats()
+{
+ osd_stat_seq++;
+ osd_stat.up_from = get_up_epoch();
+ osd_stat.hb_peers = heartbeat->get_peers();
+ osd_stat.seq = (static_cast<uint64_t>(get_up_epoch()) << 32) | osd_stat_seq;
+ gate.dispatch_in_background("statfs", *this, [this] {
+ (void) store->stat().then([this](store_statfs_t&& st) {
+ osd_stat.statfs = st;
+ });
+ });
+}
+
+MessageRef OSD::get_stats() const
+{
+ // todo: m-to-n: collect stats using map-reduce
+ // MPGStats::had_map_for is not used since PGMonitor was removed
+ auto m = make_message<MPGStats>(monc->get_fsid(), osdmap->get_epoch());
+ m->osd_stat = osd_stat;
+ for (auto [pgid, pg] : pg_map.get_pgs()) {
+ if (pg->is_primary()) {
+ auto stats = pg->get_stats();
+ // todo: update reported_epoch,reported_seq,last_fresh
+ stats.reported_epoch = osdmap->get_epoch();
+ m->pg_stat.emplace(pgid.pgid, std::move(stats));
+ }
+ }
+ return m;
+}
+
+uint64_t OSD::send_pg_stats()
+{
+ // mgr client sends the report message in background
+ mgrc->report();
+ return osd_stat.seq;
+}
+
+OSD::cached_map_t OSD::get_map() const
+{
+ return osdmap;
+}
+
+seastar::future<OSD::cached_map_t> OSD::get_map(epoch_t e)
+{
+ // TODO: use LRU cache for managing osdmap, fallback to disk if we have to
+ if (auto found = osdmaps.find(e); found) {
+ return seastar::make_ready_future<cached_map_t>(std::move(found));
+ } else {
+ return load_map(e).then([e, this](unique_ptr<OSDMap> osdmap) {
+ return seastar::make_ready_future<cached_map_t>(
+ osdmaps.insert(e, std::move(osdmap)));
+ });
+ }
+}
+
+void OSD::store_map_bl(ceph::os::Transaction& t,
+ epoch_t e, bufferlist&& bl)
+{
+ meta_coll->store_map(t, e, bl);
+ map_bl_cache.insert(e, std::move(bl));
+}
+
+seastar::future<bufferlist> OSD::load_map_bl(epoch_t e)
+{
+ if (std::optional<bufferlist> found = map_bl_cache.find(e); found) {
+ return seastar::make_ready_future<bufferlist>(*found);
+ } else {
+ return meta_coll->load_map(e);
+ }
+}
+
+seastar::future<std::map<epoch_t, bufferlist>> OSD::load_map_bls(
+ epoch_t first,
+ epoch_t last)
+{
+ return seastar::map_reduce(boost::make_counting_iterator<epoch_t>(first),
+ boost::make_counting_iterator<epoch_t>(last + 1),
+ [this](epoch_t e) {
+ return load_map_bl(e).then([e](auto&& bl) {
+ return seastar::make_ready_future<pair<epoch_t, bufferlist>>(
+ std::make_pair(e, std::move(bl)));
+ });
+ },
+ std::map<epoch_t, bufferlist>{},
+ [](auto&& bls, auto&& epoch_bl) {
+ bls.emplace(std::move(epoch_bl));
+ return std::move(bls);
+ });
+}
+
+seastar::future<std::unique_ptr<OSDMap>> OSD::load_map(epoch_t e)
+{
+ auto o = std::make_unique<OSDMap>();
+ if (e > 0) {
+ return load_map_bl(e).then([o=std::move(o)](bufferlist bl) mutable {
+ o->decode(bl);
+ return seastar::make_ready_future<unique_ptr<OSDMap>>(std::move(o));
+ });
+ } else {
+ return seastar::make_ready_future<unique_ptr<OSDMap>>(std::move(o));
+ }
+}
+
+seastar::future<> OSD::store_maps(ceph::os::Transaction& t,
+ epoch_t start, Ref<MOSDMap> m)
+{
+ return seastar::do_for_each(boost::make_counting_iterator(start),
+ boost::make_counting_iterator(m->get_last() + 1),
+ [&t, m, this](epoch_t e) {
+ if (auto p = m->maps.find(e); p != m->maps.end()) {
+ auto o = std::make_unique<OSDMap>();
+ o->decode(p->second);
+ logger().info("store_maps osdmap.{}", e);
+      store_map_bl(t, e, std::move(p->second));
+ osdmaps.insert(e, std::move(o));
+ return seastar::now();
+ } else if (auto p = m->incremental_maps.find(e);
+ p != m->incremental_maps.end()) {
+ return load_map(e - 1).then([e, bl=p->second, &t, this](auto o) {
+ OSDMap::Incremental inc;
+ auto i = bl.cbegin();
+ inc.decode(i);
+ o->apply_incremental(inc);
+ bufferlist fbl;
+ o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
+ store_map_bl(t, e, std::move(fbl));
+ osdmaps.insert(e, std::move(o));
+ return seastar::now();
+ });
+ } else {
+ logger().error("MOSDMap lied about what maps it had?");
+ return seastar::now();
+ }
+ });
+}
+
+bool OSD::require_mon_peer(crimson::net::Connection *conn, Ref<Message> m)
+{
+ if (!conn->peer_is_mon()) {
+ logger().info("{} received from non-mon {}, {}",
+ __func__,
+ conn->get_peer_addr(),
+ *m);
+ return false;
+ }
+ return true;
+}
+
+seastar::future<Ref<PG>> OSD::handle_pg_create_info(
+ std::unique_ptr<PGCreateInfo> info) {
+ return seastar::do_with(
+ std::move(info),
+ [this](auto &info) -> seastar::future<Ref<PG>> {
+ return get_map(info->epoch).then(
+ [&info, this](cached_map_t startmap) ->
+ seastar::future<std::tuple<Ref<PG>, cached_map_t>> {
+ const spg_t &pgid = info->pgid;
+ if (info->by_mon) {
+ int64_t pool_id = pgid.pgid.pool();
+ const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
+ if (!pool) {
+ logger().debug(
+ "{} ignoring pgid {}, pool dne",
+ __func__,
+ pgid);
+ return seastar::make_ready_future<std::tuple<Ref<PG>, cached_map_t>>(
+ std::make_tuple(Ref<PG>(), startmap));
+ }
+ ceph_assert(osdmap->require_osd_release >= ceph_release_t::octopus);
+ if (!pool->has_flag(pg_pool_t::FLAG_CREATING)) {
+          // this ensures we do not process old creating messages after the
+          // pool's initial pgs have been created (and pgs are subsequently
+          // allowed to split or merge).
+ logger().debug(
+ "{} dropping {} create, pool does not have CREATING flag set",
+ __func__,
+ pgid);
+ return seastar::make_ready_future<std::tuple<Ref<PG>, cached_map_t>>(
+ std::make_tuple(Ref<PG>(), startmap));
+ }
+ }
+ return make_pg(startmap, pgid, true).then(
+ [startmap=std::move(startmap)](auto pg) mutable {
+ return seastar::make_ready_future<std::tuple<Ref<PG>, cached_map_t>>(
+ std::make_tuple(std::move(pg), std::move(startmap)));
+ });
+ }).then([this, &info](auto&& ret) ->
+ seastar::future<Ref<PG>> {
+ auto [pg, startmap] = std::move(ret);
+ if (!pg)
+ return seastar::make_ready_future<Ref<PG>>(Ref<PG>());
+ PeeringCtx rctx{ceph_release_t::octopus};
+ const pg_pool_t* pp = startmap->get_pg_pool(info->pgid.pool());
+
+ int up_primary, acting_primary;
+ vector<int> up, acting;
+ startmap->pg_to_up_acting_osds(
+ info->pgid.pgid, &up, &up_primary, &acting, &acting_primary);
+
+ int role = startmap->calc_pg_role(pg_shard_t(whoami, info->pgid.shard),
+ acting);
+
+ create_pg_collection(
+ rctx.transaction,
+ info->pgid,
+ info->pgid.get_split_bits(pp->get_pg_num()));
+ init_pg_ondisk(
+ rctx.transaction,
+ info->pgid,
+ pp);
+
+ pg->init(
+ role,
+ up,
+ up_primary,
+ acting,
+ acting_primary,
+ info->history,
+ info->past_intervals,
+ false,
+ rctx.transaction);
+
+ return shard_services.start_operation<PGAdvanceMap>(
+ *this, pg, pg->get_osdmap_epoch(),
+ osdmap->get_epoch(), std::move(rctx), true).second.then([pg=pg] {
+ return seastar::make_ready_future<Ref<PG>>(pg);
+ });
+ });
+ });
+}
+
+seastar::future<> OSD::handle_osd_map(crimson::net::ConnectionRef conn,
+ Ref<MOSDMap> m)
+{
+ logger().info("handle_osd_map {}", *m);
+ if (m->fsid != superblock.cluster_fsid) {
+ logger().warn("fsid mismatched");
+ return seastar::now();
+ }
+ if (state.is_initializing()) {
+ logger().warn("i am still initializing");
+ return seastar::now();
+ }
+
+ const auto first = m->get_first();
+ const auto last = m->get_last();
+ logger().info("handle_osd_map epochs [{}..{}], i have {}, src has [{}..{}]",
+ first, last, superblock.newest_map, m->oldest_map, m->newest_map);
+ // make sure there is something new, here, before we bother flushing
+ // the queues and such
+ if (last <= superblock.newest_map) {
+ return seastar::now();
+ }
+ // missing some?
+ bool skip_maps = false;
+ epoch_t start = superblock.newest_map + 1;
+ if (first > start) {
+ logger().info("handle_osd_map message skips epochs {}..{}",
+ start, first - 1);
+ if (m->oldest_map <= start) {
+ return shard_services.osdmap_subscribe(start, false);
+ }
+ // always try to get the full range of maps--as many as we can. this
+ // 1- is good to have
+ // 2- is at present the only way to ensure that we get a *full* map as
+ // the first map!
+ if (m->oldest_map < first) {
+ return shard_services.osdmap_subscribe(m->oldest_map - 1, true);
+ }
+ skip_maps = true;
+ start = first;
+ }
+
+ return seastar::do_with(ceph::os::Transaction{},
+ [=](auto& t) {
+ return store_maps(t, start, m).then([=, &t] {
+ // even if this map isn't from a mon, we may have satisfied our subscription
+ monc->sub_got("osdmap", last);
+ if (!superblock.oldest_map || skip_maps) {
+ superblock.oldest_map = first;
+ }
+ superblock.newest_map = last;
+ superblock.current_epoch = last;
+
+ // note in the superblock that we were clean thru the prior epoch
+ if (boot_epoch && boot_epoch >= superblock.mounted) {
+ superblock.mounted = boot_epoch;
+ superblock.clean_thru = last;
+ }
+ meta_coll->store_superblock(t, superblock);
+ return store->do_transaction(meta_coll->collection(), std::move(t));
+ });
+ }).then([=] {
+ // TODO: write to superblock and commit the transaction
+ return committed_osd_maps(start, last, m);
+ });
+}
+
+seastar::future<> OSD::committed_osd_maps(version_t first,
+ version_t last,
+ Ref<MOSDMap> m)
+{
+ logger().info("osd.{}: committed_osd_maps({}, {})", whoami, first, last);
+ // advance through the new maps
+ return seastar::do_for_each(boost::make_counting_iterator(first),
+ boost::make_counting_iterator(last + 1),
+ [this](epoch_t cur) {
+ return get_map(cur).then([this](cached_map_t&& o) {
+ osdmap = std::move(o);
+ shard_services.update_map(osdmap);
+ if (up_epoch == 0 &&
+ osdmap->is_up(whoami) &&
+ osdmap->get_addrs(whoami) == public_msgr->get_myaddrs()) {
+ up_epoch = osdmap->get_epoch();
+ if (!boot_epoch) {
+ boot_epoch = osdmap->get_epoch();
+ }
+ }
+ });
+ }).then([m, this] {
+ if (osdmap->is_up(whoami) &&
+ osdmap->get_addrs(whoami) == public_msgr->get_myaddrs() &&
+ bind_epoch < osdmap->get_up_from(whoami)) {
+ if (state.is_booting()) {
+ logger().info("osd.{}: activating...", whoami);
+ state.set_active();
+ beacon_timer.arm_periodic(
+ std::chrono::seconds(local_conf()->osd_beacon_report_interval));
+ tick_timer.arm_periodic(
+ std::chrono::seconds(TICK_INTERVAL));
+ }
+ } else if (!osdmap->is_up(whoami)) {
+ if (state.is_prestop()) {
+ got_stop_ack();
+ return seastar::now();
+ }
+ }
+ check_osdmap_features();
+ // yay!
+ return consume_map(osdmap->get_epoch());
+ }).then([m, this] {
+ if (state.is_active()) {
+ logger().info("osd.{}: now active", whoami);
+ if (!osdmap->exists(whoami)) {
+ return shutdown();
+ }
+ if (should_restart()) {
+ return restart();
+ } else {
+ return seastar::now();
+ }
+ } else if (state.is_preboot()) {
+ logger().info("osd.{}: now preboot", whoami);
+
+ if (m->get_source().is_mon()) {
+ return _preboot(m->oldest_map, m->newest_map);
+ } else {
+ logger().info("osd.{}: start_boot", whoami);
+ return start_boot();
+ }
+ } else {
+ logger().info("osd.{}: now {}", whoami, state);
+ // XXX
+ return seastar::now();
+ }
+ });
+}
+
+seastar::future<> OSD::handle_osd_op(crimson::net::ConnectionRef conn,
+ Ref<MOSDOp> m)
+{
+ (void) shard_services.start_operation<ClientRequest>(
+ *this,
+ conn,
+ std::move(m));
+ return seastar::now();
+}
+
+seastar::future<> OSD::send_incremental_map(crimson::net::ConnectionRef conn,
+ epoch_t first)
+{
+ if (first >= superblock.oldest_map) {
+ return load_map_bls(first, superblock.newest_map)
+ .then([this, conn, first](auto&& bls) {
+ auto m = make_message<MOSDMap>(monc->get_fsid(),
+ osdmap->get_encoding_features());
+ m->oldest_map = first;
+ m->newest_map = superblock.newest_map;
+ m->maps = std::move(bls);
+ return conn->send(m);
+ });
+ } else {
+ return load_map_bl(osdmap->get_epoch())
+ .then([this, conn](auto&& bl) mutable {
+ auto m = make_message<MOSDMap>(monc->get_fsid(),
+ osdmap->get_encoding_features());
+ m->oldest_map = superblock.oldest_map;
+ m->newest_map = superblock.newest_map;
+ m->maps.emplace(osdmap->get_epoch(), std::move(bl));
+ return conn->send(m);
+ });
+ }
+}
+
+seastar::future<> OSD::handle_rep_op(crimson::net::ConnectionRef conn,
+ Ref<MOSDRepOp> m)
+{
+ m->finish_decode();
+ (void) shard_services.start_operation<RepRequest>(
+ *this,
+ std::move(conn),
+ std::move(m));
+ return seastar::now();
+}
+
+seastar::future<> OSD::handle_rep_op_reply(crimson::net::ConnectionRef conn,
+ Ref<MOSDRepOpReply> m)
+{
+ const auto& pgs = pg_map.get_pgs();
+ if (auto pg = pgs.find(m->get_spg()); pg != pgs.end()) {
+ m->finish_decode();
+ pg->second->handle_rep_op_reply(conn, *m);
+ } else {
+ logger().warn("stale reply: {}", *m);
+ }
+ return seastar::now();
+}
+
+seastar::future<> OSD::handle_scrub(crimson::net::ConnectionRef conn,
+ Ref<MOSDScrub2> m)
+{
+ if (m->fsid != superblock.cluster_fsid) {
+ logger().warn("fsid mismatched");
+ return seastar::now();
+ }
+ return seastar::parallel_for_each(std::move(m->scrub_pgs),
+ [m, conn, this](spg_t pgid) {
+ pg_shard_t from_shard{static_cast<int>(m->get_source().num()),
+ pgid.shard};
+ PeeringState::RequestScrub scrub_request{m->deep, m->repair};
+ return shard_services.start_operation<RemotePeeringEvent>(
+ *this,
+ conn,
+ shard_services,
+ from_shard,
+ pgid,
+ PGPeeringEvent{m->epoch, m->epoch, scrub_request}).second;
+ });
+}
+
+seastar::future<> OSD::handle_mark_me_down(crimson::net::ConnectionRef conn,
+ Ref<MOSDMarkMeDown> m)
+{
+ if (state.is_prestop()) {
+ got_stop_ack();
+ }
+ return seastar::now();
+}
+
+seastar::future<> OSD::handle_recovery_subreq(crimson::net::ConnectionRef conn,
+ Ref<MOSDFastDispatchOp> m)
+{
+ (void) shard_services.start_operation<RecoverySubRequest>(
+ *this,
+ conn,
+ std::move(m));
+ return seastar::now();
+}
+
+bool OSD::should_restart() const
+{
+ if (!osdmap->is_up(whoami)) {
+ logger().info("map e {} marked osd.{} down",
+ osdmap->get_epoch(), whoami);
+ return true;
+ } else if (osdmap->get_addrs(whoami) != public_msgr->get_myaddrs()) {
+ logger().error("map e {} had wrong client addr ({} != my {})",
+ osdmap->get_epoch(),
+ osdmap->get_addrs(whoami),
+ public_msgr->get_myaddrs());
+ return true;
+ } else if (osdmap->get_cluster_addrs(whoami) != cluster_msgr->get_myaddrs()) {
+ logger().error("map e {} had wrong cluster addr ({} != my {})",
+ osdmap->get_epoch(),
+ osdmap->get_cluster_addrs(whoami),
+ cluster_msgr->get_myaddrs());
+ return true;
+ } else {
+ return false;
+ }
+}
+
+seastar::future<> OSD::restart()
+{
+ beacon_timer.cancel();
+ tick_timer.cancel();
+ up_epoch = 0;
+ bind_epoch = osdmap->get_epoch();
+ // TODO: promote to shutdown if being marked down for multiple times
+ // rebind messengers
+ return start_boot();
+}
+
+seastar::future<> OSD::shutdown()
+{
+ // TODO
+ superblock.mounted = boot_epoch;
+ superblock.clean_thru = osdmap->get_epoch();
+ return seastar::now();
+}
+
+seastar::future<> OSD::send_beacon()
+{
+ if (!state.is_active()) {
+ return seastar::now();
+ }
+ // FIXME: min lec should be calculated from pg_stat
+ // and should set m->pgs
+ epoch_t min_last_epoch_clean = osdmap->get_epoch();
+ auto m = make_message<MOSDBeacon>(osdmap->get_epoch(),
+ min_last_epoch_clean,
+ superblock.last_purged_snaps_scrub,
+ local_conf()->osd_beacon_report_interval);
+ return monc->send_message(m);
+}
+
+void OSD::update_heartbeat_peers()
+{
+ if (!state.is_active()) {
+ return;
+ }
+ for (auto& pg : pg_map.get_pgs()) {
+ vector<int> up, acting;
+ osdmap->pg_to_up_acting_osds(pg.first.pgid,
+ &up, nullptr,
+ &acting, nullptr);
+ for (int osd : boost::join(up, acting)) {
+ if (osd == CRUSH_ITEM_NONE || osd == whoami) {
+ continue;
+ } else {
+ heartbeat->add_peer(osd, osdmap->get_epoch());
+ }
+ }
+ }
+ heartbeat->update_peers(whoami);
+}
+
+seastar::future<> OSD::handle_peering_op(
+ crimson::net::ConnectionRef conn,
+ Ref<MOSDPeeringOp> m)
+{
+ const int from = m->get_source().num();
+ logger().debug("handle_peering_op on {} from {}", m->get_spg(), from);
+ std::unique_ptr<PGPeeringEvent> evt(m->get_event());
+ (void) shard_services.start_operation<RemotePeeringEvent>(
+ *this,
+ conn,
+ shard_services,
+ pg_shard_t{from, m->get_spg().shard},
+ m->get_spg(),
+ std::move(*evt));
+ return seastar::now();
+}
+
+void OSD::check_osdmap_features()
+{
+ heartbeat->set_require_authorizer(true);
+}
+
+seastar::future<> OSD::consume_map(epoch_t epoch)
+{
+  // TODO: m-to-n: broadcast this news to all shards
+ auto &pgs = pg_map.get_pgs();
+ return seastar::parallel_for_each(pgs.begin(), pgs.end(), [=](auto& pg) {
+ return shard_services.start_operation<PGAdvanceMap>(
+ *this, pg.second, pg.second->get_osdmap_epoch(), epoch,
+ PeeringCtx{ceph_release_t::octopus}, false).second;
+ }).then([epoch, this] {
+ osdmap_gate.got_map(epoch);
+ return seastar::make_ready_future();
+ });
+}
+
+
+blocking_future<Ref<PG>>
+OSD::get_or_create_pg(
+ spg_t pgid,
+ epoch_t epoch,
+ std::unique_ptr<PGCreateInfo> info)
+{
+ if (info) {
+ auto [fut, creating] = pg_map.wait_for_pg(pgid);
+ if (!creating) {
+ pg_map.set_creating(pgid);
+ (void)handle_pg_create_info(std::move(info));
+ }
+ return std::move(fut);
+ } else {
+ return make_ready_blocking_future<Ref<PG>>(pg_map.get_pg(pgid));
+ }
+}
+
+blocking_future<Ref<PG>> OSD::wait_for_pg(
+ spg_t pgid)
+{
+ return pg_map.wait_for_pg(pgid).first;
+}
+
+Ref<PG> OSD::get_pg(spg_t pgid)
+{
+ return pg_map.get_pg(pgid);
+}
+
+seastar::future<> OSD::prepare_to_stop()
+{
+ if (osdmap && osdmap->is_up(whoami)) {
+ state.set_prestop();
+ const auto timeout =
+ std::chrono::duration_cast<std::chrono::milliseconds>(
+ std::chrono::duration<double>(
+ local_conf().get_val<double>("osd_mon_shutdown_timeout")));
+
+ return seastar::with_timeout(
+ seastar::timer<>::clock::now() + timeout,
+ monc->send_message(
+ make_message<MOSDMarkMeDown>(
+ monc->get_fsid(),
+ whoami,
+ osdmap->get_addrs(whoami),
+ osdmap->get_epoch(),
+ true)).then([this] {
+ return stop_acked.get_future();
+ })
+ ).handle_exception_type(
+ [](seastar::timed_out_error&) {
+ return seastar::now();
+ });
+ }
+ return seastar::now();
+}
+
+}
diff --git a/src/crimson/osd/osd.h b/src/crimson/osd/osd.h
new file mode 100644
index 000000000..889960ced
--- /dev/null
+++ b/src/crimson/osd/osd.h
@@ -0,0 +1,250 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/core/shared_future.hh>
+#include <seastar/core/gate.hh>
+#include <seastar/core/shared_ptr.hh>
+#include <seastar/core/shared_future.hh>
+#include <seastar/core/timer.hh>
+
+#include "crimson/common/type_helpers.h"
+#include "crimson/common/auth_handler.h"
+#include "crimson/common/gated.h"
+#include "crimson/admin/admin_socket.h"
+#include "crimson/common/simple_lru.h"
+#include "crimson/common/shared_lru.h"
+#include "crimson/mgr/client.h"
+#include "crimson/net/Dispatcher.h"
+#include "crimson/osd/osdmap_service.h"
+#include "crimson/osd/state.h"
+#include "crimson/osd/shard_services.h"
+#include "crimson/osd/osdmap_gate.h"
+#include "crimson/osd/pg_map.h"
+#include "crimson/osd/osd_operations/peering_event.h"
+
+#include "messages/MOSDOp.h"
+#include "osd/PeeringState.h"
+#include "osd/osd_types.h"
+#include "osd/osd_perf_counters.h"
+#include "osd/PGPeeringEvent.h"
+
+class MCommand;
+class MOSDMap;
+class MOSDRepOpReply;
+class MOSDRepOp;
+class MOSDScrub2;
+class OSDMap;
+class OSDMeta;
+class Heartbeat;
+
+namespace ceph::os {
+ class Transaction;
+}
+
+namespace crimson::mon {
+ class Client;
+}
+
+namespace crimson::net {
+ class Messenger;
+}
+
+namespace crimson::os {
+ class FuturizedStore;
+}
+
+namespace crimson::osd {
+class PG;
+
+class OSD final : public crimson::net::Dispatcher,
+ private OSDMapService,
+ private crimson::common::AuthHandler,
+ private crimson::mgr::WithStats {
+ const int whoami;
+ const uint32_t nonce;
+ seastar::timer<seastar::lowres_clock> beacon_timer;
+ // talk with osd
+ crimson::net::MessengerRef cluster_msgr;
+ // talk with client/mon/mgr
+ crimson::net::MessengerRef public_msgr;
+ std::unique_ptr<crimson::mon::Client> monc;
+ std::unique_ptr<crimson::mgr::Client> mgrc;
+
+ SharedLRU<epoch_t, OSDMap> osdmaps;
+ SimpleLRU<epoch_t, bufferlist, false> map_bl_cache;
+ cached_map_t osdmap;
+ // TODO: use a wrapper for ObjectStore
+ std::unique_ptr<crimson::os::FuturizedStore> store;
+ std::unique_ptr<OSDMeta> meta_coll;
+
+ OSDState state;
+
+ /// _first_ epoch we were marked up (after this process started)
+ epoch_t boot_epoch = 0;
+ /// _most_recent_ epoch we were marked up
+ epoch_t up_epoch = 0;
+  ///< epoch at which we last bound to a new set of ip:ports
+ epoch_t bind_epoch = 0;
+  ///< epoch since which there have been no pending pg creates from the mon
+ epoch_t last_pg_create_epoch = 0;
+
+ ceph::mono_time startup_time;
+
+ OSDSuperblock superblock;
+
+ // Dispatcher methods
+ std::optional<seastar::future<>> ms_dispatch(crimson::net::ConnectionRef, MessageRef) final;
+ void ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace) final;
+ void ms_handle_remote_reset(crimson::net::ConnectionRef conn) final;
+
+ // mgr::WithStats methods
+ // pg statistics including osd ones
+ osd_stat_t osd_stat;
+ uint32_t osd_stat_seq = 0;
+ void update_stats();
+ MessageRef get_stats() const final;
+
+ // AuthHandler methods
+ void handle_authentication(const EntityName& name,
+ const AuthCapsInfo& caps) final;
+
+ crimson::osd::ShardServices shard_services;
+
+ std::unique_ptr<Heartbeat> heartbeat;
+ seastar::timer<seastar::lowres_clock> tick_timer;
+
+ // admin-socket
+ seastar::lw_shared_ptr<crimson::admin::AdminSocket> asok;
+
+public:
+ OSD(int id, uint32_t nonce,
+ crimson::net::MessengerRef cluster_msgr,
+ crimson::net::MessengerRef client_msgr,
+ crimson::net::MessengerRef hb_front_msgr,
+ crimson::net::MessengerRef hb_back_msgr);
+ ~OSD() final;
+
+ seastar::future<> mkfs(uuid_d osd_uuid, uuid_d cluster_fsid);
+
+ seastar::future<> start();
+ seastar::future<> stop();
+
+ void dump_status(Formatter*) const;
+ void dump_pg_state_history(Formatter*) const;
+ void print(std::ostream&) const;
+
+ seastar::future<> send_incremental_map(crimson::net::ConnectionRef conn,
+ epoch_t first);
+
+ /// @return the seq id of the pg stats being sent
+ uint64_t send_pg_stats();
+
+private:
+ seastar::future<> start_boot();
+ seastar::future<> _preboot(version_t oldest_osdmap, version_t newest_osdmap);
+ seastar::future<> _send_boot();
+ seastar::future<> _add_me_to_crush();
+
+ seastar::future<Ref<PG>> make_pg(cached_map_t create_map,
+ spg_t pgid,
+ bool do_create);
+ seastar::future<Ref<PG>> load_pg(spg_t pgid);
+ seastar::future<> load_pgs();
+
+ // OSDMapService methods
+ epoch_t get_up_epoch() const final {
+ return up_epoch;
+ }
+ seastar::future<cached_map_t> get_map(epoch_t e) final;
+ cached_map_t get_map() const final;
+ seastar::future<std::unique_ptr<OSDMap>> load_map(epoch_t e);
+ seastar::future<bufferlist> load_map_bl(epoch_t e);
+ seastar::future<std::map<epoch_t, bufferlist>>
+ load_map_bls(epoch_t first, epoch_t last);
+ void store_map_bl(ceph::os::Transaction& t,
+ epoch_t e, bufferlist&& bl);
+ seastar::future<> store_maps(ceph::os::Transaction& t,
+ epoch_t start, Ref<MOSDMap> m);
+ seastar::future<> osdmap_subscribe(version_t epoch, bool force_request);
+
+ void write_superblock(ceph::os::Transaction& t);
+ seastar::future<> read_superblock();
+
+ bool require_mon_peer(crimson::net::Connection *conn, Ref<Message> m);
+
+ seastar::future<Ref<PG>> handle_pg_create_info(
+ std::unique_ptr<PGCreateInfo> info);
+
+ seastar::future<> handle_osd_map(crimson::net::ConnectionRef conn,
+ Ref<MOSDMap> m);
+ seastar::future<> handle_osd_op(crimson::net::ConnectionRef conn,
+ Ref<MOSDOp> m);
+ seastar::future<> handle_rep_op(crimson::net::ConnectionRef conn,
+ Ref<MOSDRepOp> m);
+ seastar::future<> handle_rep_op_reply(crimson::net::ConnectionRef conn,
+ Ref<MOSDRepOpReply> m);
+ seastar::future<> handle_peering_op(crimson::net::ConnectionRef conn,
+ Ref<MOSDPeeringOp> m);
+ seastar::future<> handle_recovery_subreq(crimson::net::ConnectionRef conn,
+ Ref<MOSDFastDispatchOp> m);
+ seastar::future<> handle_scrub(crimson::net::ConnectionRef conn,
+ Ref<MOSDScrub2> m);
+ seastar::future<> handle_mark_me_down(crimson::net::ConnectionRef conn,
+ Ref<MOSDMarkMeDown> m);
+
+ seastar::future<> committed_osd_maps(version_t first,
+ version_t last,
+ Ref<MOSDMap> m);
+
+ void check_osdmap_features();
+
+ seastar::future<> handle_command(crimson::net::ConnectionRef conn,
+ Ref<MCommand> m);
+ seastar::future<> start_asok_admin();
+
+public:
+ OSDMapGate osdmap_gate;
+
+ ShardServices &get_shard_services() {
+ return shard_services;
+ }
+
+ seastar::future<> consume_map(epoch_t epoch);
+
+private:
+ PGMap pg_map;
+ crimson::common::Gated gate;
+
+ seastar::promise<> stop_acked;
+ void got_stop_ack() {
+ stop_acked.set_value();
+ }
+ seastar::future<> prepare_to_stop();
+public:
+ blocking_future<Ref<PG>> get_or_create_pg(
+ spg_t pgid,
+ epoch_t epoch,
+ std::unique_ptr<PGCreateInfo> info);
+ blocking_future<Ref<PG>> wait_for_pg(
+ spg_t pgid);
+ Ref<PG> get_pg(spg_t pgid);
+
+ bool should_restart() const;
+ seastar::future<> restart();
+ seastar::future<> shutdown();
+
+ seastar::future<> send_beacon();
+ void update_heartbeat_peers();
+
+ friend class PGAdvanceMap;
+};
+
+inline std::ostream& operator<<(std::ostream& out, const OSD& osd) {
+ osd.print(out);
+ return out;
+}
+
+}
diff --git a/src/crimson/osd/osd_connection_priv.h b/src/crimson/osd/osd_connection_priv.h
new file mode 100644
index 000000000..a265bb432
--- /dev/null
+++ b/src/crimson/osd/osd_connection_priv.h
@@ -0,0 +1,27 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/net/Connection.h"
+#include "crimson/osd/osd_operation.h"
+#include "crimson/osd/osd_operations/client_request.h"
+#include "crimson/osd/osd_operations/peering_event.h"
+#include "crimson/osd/osd_operations/replicated_request.h"
+
+namespace crimson::osd {
+
+struct OSDConnectionPriv : public crimson::net::Connection::user_private_t {
+ ClientRequest::ConnectionPipeline client_request_conn_pipeline;
+ RemotePeeringEvent::ConnectionPipeline peering_request_conn_pipeline;
+ RepRequest::ConnectionPipeline replicated_request_conn_pipeline;
+};
+
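+// get_osd_priv() lazily attaches the per-connection pipeline state above to a
+// Connection the first time an op from that connection is dispatched;
+// subsequent calls return the same instance. Illustrative use (as in the op
+// implementations later in this series):
+//
+//   auto &pipeline = get_osd_priv(conn.get()).client_request_conn_pipeline;
+//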
+static OSDConnectionPriv &get_osd_priv(crimson::net::Connection *conn) {
+ if (!conn->has_user_private()) {
+ conn->set_user_private(std::make_unique<OSDConnectionPriv>());
+ }
+ return static_cast<OSDConnectionPriv&>(conn->get_user_private());
+}
+
+}
diff --git a/src/crimson/osd/osd_meta.cc b/src/crimson/osd/osd_meta.cc
new file mode 100644
index 000000000..9b9215f5b
--- /dev/null
+++ b/src/crimson/osd/osd_meta.cc
@@ -0,0 +1,99 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "osd_meta.h"
+
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+
+#include "crimson/os/futurized_collection.h"
+#include "crimson/os/futurized_store.h"
+#include "os/Transaction.h"
+
+using read_errorator = crimson::os::FuturizedStore::read_errorator;
+
+void OSDMeta::create(ceph::os::Transaction& t)
+{
+ t.create_collection(coll->get_cid(), 0);
+}
+
+void OSDMeta::store_map(ceph::os::Transaction& t,
+ epoch_t e, const bufferlist& m)
+{
+ t.write(coll->get_cid(), osdmap_oid(e), 0, m.length(), m);
+}
+
+seastar::future<bufferlist> OSDMeta::load_map(epoch_t e)
+{
+ return store->read(coll,
+ osdmap_oid(e), 0, 0,
+ CEPH_OSD_OP_FLAG_FADVISE_WILLNEED).handle_error(
+ read_errorator::all_same_way([e] {
+ throw std::runtime_error(fmt::format("read gave enoent on {}",
+ osdmap_oid(e)));
+ }));
+}
+
+void OSDMeta::store_superblock(ceph::os::Transaction& t,
+ const OSDSuperblock& superblock)
+{
+ bufferlist bl;
+ encode(superblock, bl);
+ t.write(coll->get_cid(), superblock_oid(), 0, bl.length(), bl);
+}
+
+seastar::future<OSDSuperblock> OSDMeta::load_superblock()
+{
+ return store->read(coll, superblock_oid(), 0, 0).safe_then(
+ [] (bufferlist&& bl) {
+ auto p = bl.cbegin();
+ OSDSuperblock superblock;
+ decode(superblock, p);
+ return seastar::make_ready_future<OSDSuperblock>(std::move(superblock));
+ }, read_errorator::all_same_way([] {
+ throw std::runtime_error(fmt::format("read gave enoent on {}",
+ superblock_oid()));
+ }));
+}
+
+seastar::future<std::tuple<pg_pool_t,
+ std::string,
+ OSDMeta::ec_profile_t>>
+OSDMeta::load_final_pool_info(int64_t pool) {
+ return store->read(coll, final_pool_info_oid(pool),
+ 0, 0).safe_then([] (bufferlist&& bl) {
+ auto p = bl.cbegin();
+ pg_pool_t pi;
+ string name;
+ ec_profile_t ec_profile;
+ decode(pi, p);
+ decode(name, p);
+ decode(ec_profile, p);
+ return seastar::make_ready_future<std::tuple<pg_pool_t,
+ string,
+ ec_profile_t>>(
+ std::make_tuple(std::move(pi),
+ std::move(name),
+ std::move(ec_profile)));
+  }, read_errorator::all_same_way([pool] {
+ throw std::runtime_error(fmt::format("read gave enoent on {}",
+ final_pool_info_oid(pool)));
+ }));
+}
+
+ghobject_t OSDMeta::osdmap_oid(epoch_t epoch)
+{
+ string name = fmt::format("osdmap.{}", epoch);
+ return ghobject_t(hobject_t(sobject_t(object_t(name), 0)));
+}
+
+ghobject_t OSDMeta::final_pool_info_oid(int64_t pool)
+{
+ string name = fmt::format("final_pool_{}", pool);
+ return ghobject_t(hobject_t(sobject_t(object_t(name), CEPH_NOSNAP)));
+}
+
+ghobject_t OSDMeta::superblock_oid()
+{
+ return ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0)));
+}
diff --git a/src/crimson/osd/osd_meta.h b/src/crimson/osd/osd_meta.h
new file mode 100644
index 000000000..841572087
--- /dev/null
+++ b/src/crimson/osd/osd_meta.h
@@ -0,0 +1,56 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <seastar/core/future.hh>
+#include "osd/osd_types.h"
+#include "crimson/os/futurized_collection.h"
+
+namespace ceph::os {
+ class Transaction;
+}
+
+namespace crimson::os {
+ class FuturizedCollection;
+ class FuturizedStore;
+}
+
+/// metadata shared across PGs, or, put another way,
+/// metadata not specific to any particular PG.
+class OSDMeta {
+ template<typename T> using Ref = boost::intrusive_ptr<T>;
+
+ crimson::os::FuturizedStore* store;
+ Ref<crimson::os::FuturizedCollection> coll;
+
+public:
+ OSDMeta(Ref<crimson::os::FuturizedCollection> coll,
+ crimson::os::FuturizedStore* store)
+ : store{store}, coll{coll}
+ {}
+
+ auto collection() {
+ return coll;
+ }
+ void create(ceph::os::Transaction& t);
+
+ void store_map(ceph::os::Transaction& t,
+ epoch_t e, const bufferlist& m);
+ seastar::future<bufferlist> load_map(epoch_t e);
+
+ void store_superblock(ceph::os::Transaction& t,
+ const OSDSuperblock& sb);
+ seastar::future<OSDSuperblock> load_superblock();
+
+ using ec_profile_t = std::map<std::string, std::string>;
+ seastar::future<std::tuple<pg_pool_t,
+ std::string,
+ ec_profile_t>> load_final_pool_info(int64_t pool);
+private:
+ static ghobject_t osdmap_oid(epoch_t epoch);
+ static ghobject_t final_pool_info_oid(int64_t pool);
+ static ghobject_t superblock_oid();
+};
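+
+// Illustrative sketch only (assumes FuturizedStore::do_transaction() as the
+// submission path): mkfs would populate the meta collection through a
+// transaction, while boot reads the superblock back.
+//
+//   ceph::os::Transaction t;
+//   meta_coll->create(t);
+//   meta_coll->store_superblock(t, superblock);
+//   // store->do_transaction(meta_coll->collection(), std::move(t)) ...
+//
+//   meta_coll->load_superblock().then([this](OSDSuperblock&& sb) {
+//     superblock = std::move(sb);
+//   });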
diff --git a/src/crimson/osd/osd_operation.cc b/src/crimson/osd/osd_operation.cc
new file mode 100644
index 000000000..b5f3c3cbb
--- /dev/null
+++ b/src/crimson/osd/osd_operation.cc
@@ -0,0 +1,159 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "osd_operation.h"
+#include "common/Formatter.h"
+
+namespace crimson::osd {
+
+void Operation::dump(ceph::Formatter* f)
+{
+ f->open_object_section("operation");
+ f->dump_string("type", get_type_name());
+ f->dump_unsigned("id", id);
+ {
+ f->open_object_section("detail");
+ dump_detail(f);
+ f->close_section();
+ }
+ f->open_array_section("blockers");
+ for (auto &blocker : blockers) {
+ blocker->dump(f);
+ }
+ f->close_section();
+ f->close_section();
+}
+
+void Operation::dump_brief(ceph::Formatter* f)
+{
+ f->open_object_section("operation");
+ f->dump_string("type", get_type_name());
+ f->dump_unsigned("id", id);
+ f->close_section();
+}
+
+std::ostream &operator<<(std::ostream &lhs, const Operation &rhs) {
+ lhs << rhs.get_type_name() << "(id=" << rhs.get_id() << ", detail=";
+ rhs.print(lhs);
+ lhs << ")";
+ return lhs;
+}
+
+void Blocker::dump(ceph::Formatter* f) const
+{
+ f->open_object_section("blocker");
+ f->dump_string("op_type", get_type_name());
+ {
+ f->open_object_section("detail");
+ dump_detail(f);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void AggregateBlocker::dump_detail(ceph::Formatter *f) const
+{
+ f->open_array_section("parent_blockers");
+ for (auto b : parent_blockers) {
+ f->open_object_section("parent_blocker");
+ b->dump(f);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+OperationThrottler::OperationThrottler(ConfigProxy &conf)
+ : scheduler(crimson::osd::scheduler::make_scheduler(conf))
+{
+ conf.add_observer(this);
+ update_from_config(conf);
+}
+
+void OperationThrottler::wake()
+{
+ while ((!max_in_progress || in_progress < max_in_progress) &&
+ !scheduler->empty()) {
+ auto item = scheduler->dequeue();
+ item.wake.set_value();
+ ++in_progress;
+ --pending;
+ }
+}
+
+void OperationThrottler::release_throttle()
+{
+ ceph_assert(in_progress > 0);
+ --in_progress;
+ wake();
+}
+
+blocking_future<> OperationThrottler::acquire_throttle(
+ crimson::osd::scheduler::params_t params)
+{
+ crimson::osd::scheduler::item_t item{params, seastar::promise<>()};
+ auto fut = item.wake.get_future();
+ scheduler->enqueue(std::move(item));
+ return make_blocking_future(std::move(fut));
+}
+
+void OperationThrottler::dump_detail(Formatter *f) const
+{
+ f->dump_unsigned("max_in_progress", max_in_progress);
+ f->dump_unsigned("in_progress", in_progress);
+ f->open_object_section("scheduler");
+ {
+ scheduler->dump(*f);
+ }
+ f->close_section();
+}
+
+void OperationThrottler::update_from_config(const ConfigProxy &conf)
+{
+ max_in_progress = conf.get_val<uint64_t>("crimson_osd_scheduler_concurrency");
+ wake();
+}
+
+const char** OperationThrottler::get_tracked_conf_keys() const
+{
+ static const char* KEYS[] = {
+ "crimson_osd_scheduler_concurrency",
+ NULL
+ };
+ return KEYS;
+}
+
+void OperationThrottler::handle_conf_change(
+ const ConfigProxy& conf,
+ const std::set<std::string> &changed)
+{
+ update_from_config(conf);
+}
+
+
+void OrderedPipelinePhase::Handle::exit()
+{
+ if (phase) {
+ phase->mutex.unlock();
+ phase = nullptr;
+ }
+}
+
+blocking_future<> OrderedPipelinePhase::Handle::enter(
+ OrderedPipelinePhase &new_phase)
+{
+ auto fut = new_phase.mutex.lock();
+ exit();
+ phase = &new_phase;
+ return new_phase.make_blocking_future(std::move(fut));
+}
+
+OrderedPipelinePhase::Handle::~Handle()
+{
+ exit();
+}
+
+void OrderedPipelinePhase::dump_detail(ceph::Formatter* f) const
+{
+}
+
+}
diff --git a/src/crimson/osd/osd_operation.h b/src/crimson/osd/osd_operation.h
new file mode 100644
index 000000000..5178749b0
--- /dev/null
+++ b/src/crimson/osd/osd_operation.h
@@ -0,0 +1,427 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <set>
+#include <vector>
+#include <boost/intrusive/list.hpp>
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <seastar/core/shared_mutex.hh>
+#include <seastar/core/future.hh>
+#include <seastar/core/timer.hh>
+#include <seastar/core/lowres_clock.hh>
+
+#include "include/ceph_assert.h"
+#include "crimson/osd/scheduler/scheduler.h"
+
+namespace ceph {
+ class Formatter;
+}
+
+namespace crimson::osd {
+
+enum class OperationTypeCode {
+ client_request = 0,
+ peering_event,
+ compound_peering_request,
+ pg_advance_map,
+ pg_creation,
+ replicated_request,
+ background_recovery,
+ background_recovery_sub,
+ last_op
+};
+
+static constexpr const char* const OP_NAMES[] = {
+ "client_request",
+ "peering_event",
+ "compound_peering_request",
+ "pg_advance_map",
+ "pg_creation",
+ "replicated_request",
+ "background_recovery",
+ "background_recovery_sub",
+};
+
+// prevent the addition of OperationTypeCode-s with no matching OP_NAMES entry:
+static_assert(
+ (sizeof(OP_NAMES)/sizeof(OP_NAMES[0])) ==
+ static_cast<int>(OperationTypeCode::last_op));
+
+class OperationRegistry;
+
+using registry_hook_t = boost::intrusive::list_member_hook<
+ boost::intrusive::link_mode<boost::intrusive::auto_unlink>>;
+
+class Operation;
+class Blocker;
+
+/**
+ * Provides an abstraction for registering and unregistering a blocker
+ * for the duration of a future becoming available.
+ */
+template <typename Fut>
+class blocking_future_detail {
+ friend class Operation;
+ friend class Blocker;
+ Blocker *blocker;
+ Fut fut;
+ blocking_future_detail(Blocker *b, Fut &&f)
+ : blocker(b), fut(std::move(f)) {}
+
+ template <typename V, typename U>
+ friend blocking_future_detail<seastar::future<V>> make_ready_blocking_future(U&& args);
+ template <typename V, typename Exception>
+ friend blocking_future_detail<seastar::future<V>>
+ make_exception_blocking_future(Exception&& e);
+
+ template <typename U>
+ friend blocking_future_detail<seastar::future<>> join_blocking_futures(U &&u);
+
+ template <typename U>
+ friend class blocking_future_detail;
+
+public:
+ template <typename F>
+ auto then(F &&f) && {
+ using result = decltype(std::declval<Fut>().then(f));
+ return blocking_future_detail<seastar::futurize_t<result>>(
+ blocker,
+ std::move(fut).then(std::forward<F>(f)));
+ }
+};
+
+template <typename T=void>
+using blocking_future = blocking_future_detail<seastar::future<T>>;
+
+template <typename V, typename U>
+blocking_future_detail<seastar::future<V>> make_ready_blocking_future(U&& args) {
+ return blocking_future<V>(
+ nullptr,
+ seastar::make_ready_future<V>(std::forward<U>(args)));
+}
+
+template <typename V, typename Exception>
+blocking_future_detail<seastar::future<V>>
+make_exception_blocking_future(Exception&& e) {
+ return blocking_future<V>(
+ nullptr,
+ seastar::make_exception_future<V>(e));
+}
+
+/**
+ * Provides an interface for dumping diagnostic information about
+ * why a particular op is not making progress.
+ */
+class Blocker {
+public:
+ template <typename T>
+ blocking_future<T> make_blocking_future(seastar::future<T> &&f) {
+ return blocking_future<T>(this, std::move(f));
+ }
+ void dump(ceph::Formatter *f) const;
+ virtual ~Blocker() = default;
+
+private:
+ virtual void dump_detail(ceph::Formatter *f) const = 0;
+ virtual const char *get_type_name() const = 0;
+};
+
+template <typename T>
+class BlockerT : public Blocker {
+public:
+ virtual ~BlockerT() = default;
+private:
+ const char *get_type_name() const final {
+ return T::type_name;
+ }
+};
+
+class AggregateBlocker : public BlockerT<AggregateBlocker> {
+  std::vector<Blocker*> parent_blockers;
+public:
+  AggregateBlocker(std::vector<Blocker*> &&parent_blockers)
+ : parent_blockers(std::move(parent_blockers)) {}
+ static constexpr const char *type_name = "AggregateBlocker";
+private:
+ void dump_detail(ceph::Formatter *f) const final;
+};
+
+template <typename T>
+blocking_future<> join_blocking_futures(T &&t) {
+  std::vector<Blocker*> blockers;
+ blockers.reserve(t.size());
+ for (auto &&bf: t) {
+ blockers.push_back(bf.blocker);
+ bf.blocker = nullptr;
+ }
+ auto agg = std::make_unique<AggregateBlocker>(std::move(blockers));
+ return agg->make_blocking_future(
+ seastar::parallel_for_each(
+ std::forward<T>(t),
+ [](auto &&bf) {
+ return std::move(bf.fut);
+ }).then([agg=std::move(agg)] {
+ return seastar::make_ready_future<>();
+ }));
+}
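+
+// Illustrative sketch (not part of the interface above): a Blocker subclass
+// wraps a not-yet-ready future via make_blocking_future(), and an Operation
+// consumes it through with_blocking_future() (defined below), which keeps the
+// blocker registered for the duration of the wait. OsdmapBlocker and its
+// promise are hypothetical names used only for this example.
+//
+//   struct OsdmapBlocker : BlockerT<OsdmapBlocker> {
+//     static constexpr const char *type_name = "OsdmapBlocker";
+//     seastar::shared_promise<epoch_t> ready;
+//     blocking_future<epoch_t> wait() {
+//       return make_blocking_future(ready.get_shared_future());
+//     }
+//     void dump_detail(ceph::Formatter *) const final {}
+//   };
+//
+//   // inside some OperationT<...>::start():
+//   // return with_blocking_future(blocker.wait()).then([](epoch_t e) { ... });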
+
+
+/**
+ * Common base for all crimson-osd operations. Mainly provides
+ * an interface for registering ops in flight and dumping
+ * diagnostic information.
+ */
+class Operation : public boost::intrusive_ref_counter<
+ Operation, boost::thread_unsafe_counter> {
+ public:
+ uint64_t get_id() const {
+ return id;
+ }
+
+ virtual OperationTypeCode get_type() const = 0;
+ virtual const char *get_type_name() const = 0;
+ virtual void print(std::ostream &) const = 0;
+
+ template <typename T>
+ seastar::future<T> with_blocking_future(blocking_future<T> &&f) {
+ if (f.fut.available()) {
+ return std::move(f.fut);
+ }
+ assert(f.blocker);
+ add_blocker(f.blocker);
+ return std::move(f.fut).then_wrapped([this, blocker=f.blocker](auto &&arg) {
+ clear_blocker(blocker);
+ return std::move(arg);
+ });
+ }
+
+ void dump(ceph::Formatter *f);
+ void dump_brief(ceph::Formatter *f);
+ virtual ~Operation() = default;
+
+ private:
+ virtual void dump_detail(ceph::Formatter *f) const = 0;
+
+ private:
+ registry_hook_t registry_hook;
+
+ std::vector<Blocker*> blockers;
+ uint64_t id = 0;
+ void set_id(uint64_t in_id) {
+ id = in_id;
+ }
+
+ void add_blocker(Blocker *b) {
+ blockers.push_back(b);
+ }
+
+ void clear_blocker(Blocker *b) {
+ auto iter = std::find(blockers.begin(), blockers.end(), b);
+ if (iter != blockers.end()) {
+ blockers.erase(iter);
+ }
+ }
+
+ friend class OperationRegistry;
+};
+using OperationRef = boost::intrusive_ptr<Operation>;
+
+std::ostream &operator<<(std::ostream &, const Operation &op);
+
+template <typename T>
+class OperationT : public Operation {
+public:
+ static constexpr const char *type_name = OP_NAMES[static_cast<int>(T::type)];
+ using IRef = boost::intrusive_ptr<T>;
+
+ OperationTypeCode get_type() const final {
+ return T::type;
+ }
+
+ const char *get_type_name() const final {
+ return T::type_name;
+ }
+
+ virtual ~OperationT() = default;
+
+private:
+ virtual void dump_detail(ceph::Formatter *f) const = 0;
+};
+
+/**
+ * Maintains a set of lists of all active ops.
+ */
+class OperationRegistry {
+ friend class Operation;
+ using op_list_member_option = boost::intrusive::member_hook<
+ Operation,
+ registry_hook_t,
+ &Operation::registry_hook
+ >;
+ using op_list = boost::intrusive::list<
+ Operation,
+ op_list_member_option,
+ boost::intrusive::constant_time_size<false>>;
+
+ std::array<
+ op_list,
+ static_cast<int>(OperationTypeCode::last_op)
+ > registries;
+
+ std::array<
+ uint64_t,
+ static_cast<int>(OperationTypeCode::last_op)
+ > op_id_counters = {};
+
+ seastar::timer<seastar::lowres_clock> shutdown_timer;
+ seastar::promise<> shutdown;
+public:
+ template <typename T, typename... Args>
+ typename T::IRef create_operation(Args&&... args) {
+ typename T::IRef op = new T(std::forward<Args>(args)...);
+ registries[static_cast<int>(T::type)].push_back(*op);
+ op->set_id(op_id_counters[static_cast<int>(T::type)]++);
+ return op;
+ }
+
+ seastar::future<> stop() {
+ shutdown_timer.set_callback([this] {
+ if (std::all_of(registries.begin(),
+ registries.end(),
+ [](auto& opl) {
+ return opl.empty();
+ })) {
+ shutdown.set_value();
+ shutdown_timer.cancel();
+ }
+ });
+ shutdown_timer.arm_periodic(std::chrono::milliseconds(100/*TODO: use option instead*/));
+ return shutdown.get_future();
+ }
+};
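+
+// For orientation: operations are normally instantiated via
+// ShardServices::start_operation<T>(...) (see the callers elsewhere in this
+// series), which forwards to create_operation() above so the op is registered
+// and given an id before its start() runs. Roughly:
+//
+//   auto [op, fut] = shard_services.start_operation<UrgentRecovery>(
+//     soid, ver, pgref, shard_services, epoch);   // as in client_request.cc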
+
+/**
+ * Throttles set of currently running operations
+ *
+ * Very primitive currently, assumes all ops are equally
+ * expensive and simply limits the number that can be
+ * concurrently active.
+ */
+class OperationThrottler : public Blocker,
+ private md_config_obs_t {
+public:
+ OperationThrottler(ConfigProxy &conf);
+
+ const char** get_tracked_conf_keys() const final;
+ void handle_conf_change(const ConfigProxy& conf,
+ const std::set<std::string> &changed) final;
+ void update_from_config(const ConfigProxy &conf);
+
+ template <typename F>
+ auto with_throttle(
+ OperationRef op,
+ crimson::osd::scheduler::params_t params,
+ F &&f) {
+ if (!max_in_progress) return f();
+ auto fut = acquire_throttle(params);
+ return op->with_blocking_future(std::move(fut))
+ .then(std::forward<F>(f))
+ .then([this](auto x) {
+ release_throttle();
+ return x;
+ });
+ }
+
+ template <typename F>
+ seastar::future<> with_throttle_while(
+ OperationRef op,
+ crimson::osd::scheduler::params_t params,
+ F &&f) {
+ return with_throttle(op, params, f).then([this, params, op, f](bool cont) {
+ if (cont)
+ return with_throttle_while(op, params, f);
+ else
+ return seastar::make_ready_future<>();
+ });
+ }
+
+private:
+ void dump_detail(Formatter *f) const final;
+ const char *get_type_name() const final {
+ return "OperationThrottler";
+ }
+
+private:
+ crimson::osd::scheduler::SchedulerRef scheduler;
+
+ uint64_t max_in_progress = 0;
+ uint64_t in_progress = 0;
+
+ uint64_t pending = 0;
+
+ void wake();
+
+ blocking_future<> acquire_throttle(
+ crimson::osd::scheduler::params_t params);
+
+ void release_throttle();
+};
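+
+// Illustrative use, mirroring BackgroundRecovery::start() in this series:
+// with_throttle_while() re-runs the throttled step until it reports that no
+// further work is needed, releasing the throttle between iterations.
+//
+//   return ss.throttler.with_throttle_while(
+//     this, get_scheduler_params(), [this] {
+//       return do_recovery();   // resolves to false once recovery is done
+//     });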
+
+/**
+ * Ensures that at most one op may consider itself in the phase at a time.
+ * Ops will see enter() unblock in the order in which they tried to enter
+ * the phase.  Entering a new phase (though not necessarily waiting for its
+ * future to resolve) prior to exiting the previous one ensures that
+ * the op ordering is preserved.
+ */
+class OrderedPipelinePhase : public Blocker {
+private:
+ void dump_detail(ceph::Formatter *f) const final;
+ const char *get_type_name() const final {
+ return name;
+ }
+
+public:
+ /**
+ * Used to encapsulate pipeline residency state.
+ */
+ class Handle {
+ OrderedPipelinePhase *phase = nullptr;
+
+ public:
+ Handle() = default;
+
+ Handle(const Handle&) = delete;
+ Handle(Handle&&) = delete;
+ Handle &operator=(const Handle&) = delete;
+ Handle &operator=(Handle&&) = delete;
+
+ /**
+ * Returns a future which unblocks when the handle has entered the passed
+ * OrderedPipelinePhase. If already in a phase, enter will also release
+ * that phase after placing itself in the queue for the next one to preserve
+ * ordering.
+ */
+ blocking_future<> enter(OrderedPipelinePhase &phase);
+
+ /**
+ * Releases the current phase if there is one. Called in ~Handle().
+ */
+ void exit();
+
+ ~Handle();
+ };
+
+ OrderedPipelinePhase(const char *name) : name(name) {}
+
+private:
+ const char * name;
+ seastar::shared_mutex mutex;
+};
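+
+// Illustrative sketch of the intended pipeline pattern (the `pipeline` object
+// and its phase names are hypothetical here; the real ones live in the op
+// headers): an op owns a Handle and walks a fixed sequence of phases. Because
+// enter() queues on the next phase's mutex before exit() releases the current
+// one, two ops can never swap order between phases.
+//
+//   return with_blocking_future(handle.enter(pipeline.await_map)
+//   ).then([this] {
+//     return with_blocking_future(handle.enter(pipeline.process));
+//   }).then([this] {
+//     /* do the work; ~Handle() exits the final phase */
+//   });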
+
+}
diff --git a/src/crimson/osd/osd_operations/background_recovery.cc b/src/crimson/osd/osd_operations/background_recovery.cc
new file mode 100644
index 000000000..126e0e902
--- /dev/null
+++ b/src/crimson/osd/osd_operations/background_recovery.cc
@@ -0,0 +1,140 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <seastar/core/future.hh>
+
+#include "messages/MOSDOp.h"
+
+#include "crimson/osd/pg.h"
+#include "crimson/osd/shard_services.h"
+#include "common/Formatter.h"
+#include "crimson/osd/osd_operations/background_recovery.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+BackgroundRecovery::BackgroundRecovery(
+ Ref<PG> pg,
+ ShardServices &ss,
+ epoch_t epoch_started,
+ crimson::osd::scheduler::scheduler_class_t scheduler_class)
+ : pg(pg),
+ epoch_started(epoch_started),
+ ss(ss),
+ scheduler_class(scheduler_class)
+{}
+
+void BackgroundRecovery::print(std::ostream &lhs) const
+{
+ lhs << "BackgroundRecovery(" << pg->get_pgid() << ")";
+}
+
+void BackgroundRecovery::dump_detail(Formatter *f) const
+{
+ f->dump_stream("pgid") << pg->get_pgid();
+ f->open_object_section("recovery_detail");
+ {
+ // TODO pg->dump_recovery_state(f);
+ }
+ f->close_section();
+}
+
+seastar::future<> BackgroundRecovery::start()
+{
+ logger().debug("{}: start", *this);
+
+ IRef ref = this;
+ return ss.throttler.with_throttle_while(
+ this, get_scheduler_params(), [this] {
+ return do_recovery();
+ }).handle_exception_type([ref, this](const std::system_error& err) {
+ if (err.code() == std::make_error_code(std::errc::interrupted)) {
+      logger().debug("{} recovery interrupted: {}", *pg, err.what());
+ return seastar::now();
+ }
+ return seastar::make_exception_future<>(err);
+ });
+}
+
+seastar::future<bool> UrgentRecovery::do_recovery()
+{
+ if (!pg->has_reset_since(epoch_started)) {
+ return with_blocking_future(
+ pg->get_recovery_handler()->recover_missing(soid, need)
+ ).then([] {
+ return seastar::make_ready_future<bool>(false);
+ });
+ }
+ return seastar::make_ready_future<bool>(false);
+}
+
+void UrgentRecovery::print(std::ostream &lhs) const
+{
+ lhs << "UrgentRecovery(" << pg->get_pgid() << ", "
+ << soid << ", v" << need << ")";
+}
+
+void UrgentRecovery::dump_detail(Formatter *f) const
+{
+ f->dump_stream("pgid") << pg->get_pgid();
+ f->open_object_section("recovery_detail");
+ {
+ f->dump_stream("oid") << soid;
+ f->dump_stream("version") << need;
+ }
+ f->close_section();
+}
+
+PglogBasedRecovery::PglogBasedRecovery(
+ Ref<PG> pg,
+ ShardServices &ss,
+ const epoch_t epoch_started)
+ : BackgroundRecovery(
+ std::move(pg),
+ ss,
+ epoch_started,
+ crimson::osd::scheduler::scheduler_class_t::background_recovery)
+{}
+
+seastar::future<bool> PglogBasedRecovery::do_recovery()
+{
+ if (pg->has_reset_since(epoch_started))
+ return seastar::make_ready_future<bool>(false);
+ return with_blocking_future(
+ pg->get_recovery_handler()->start_recovery_ops(
+ crimson::common::local_conf()->osd_recovery_max_single_start));
+}
+
+BackfillRecovery::BackfillRecoveryPipeline &BackfillRecovery::bp(PG &pg)
+{
+ return pg.backfill_pipeline;
+}
+
+seastar::future<bool> BackfillRecovery::do_recovery()
+{
+ logger().debug("{}", __func__);
+
+ if (pg->has_reset_since(epoch_started)) {
+ logger().debug("{}: pg got reset since epoch_started={}",
+ __func__, epoch_started);
+ return seastar::make_ready_future<bool>(false);
+ }
+ // TODO: limits
+ return with_blocking_future(
+ // process_event() of our boost::statechart machine is non-reentrant.
+    // the backfill_pipeline protects it from being entered a second time by
+    // the implementation of BackfillListener.
+    // Additionally, this stage serves to synchronize with PeeringEvent.
+ handle.enter(bp(*pg).process)
+ ).then([this] {
+ pg->get_recovery_handler()->dispatch_backfill_event(std::move(evt));
+ return seastar::make_ready_future<bool>(false);
+ });
+}
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/osd_operations/background_recovery.h b/src/crimson/osd/osd_operations/background_recovery.h
new file mode 100644
index 000000000..37e46c588
--- /dev/null
+++ b/src/crimson/osd/osd_operations/background_recovery.h
@@ -0,0 +1,126 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/statechart/event_base.hpp>
+
+#include "crimson/net/Connection.h"
+#include "crimson/osd/osd_operation.h"
+#include "crimson/common/type_helpers.h"
+
+#include "messages/MOSDOp.h"
+
+namespace crimson::osd {
+class PG;
+class ShardServices;
+
+class BackgroundRecovery : public OperationT<BackgroundRecovery> {
+public:
+ static constexpr OperationTypeCode type = OperationTypeCode::background_recovery;
+
+ BackgroundRecovery(
+ Ref<PG> pg,
+ ShardServices &ss,
+ epoch_t epoch_started,
+ crimson::osd::scheduler::scheduler_class_t scheduler_class);
+
+ virtual void print(std::ostream &) const;
+ seastar::future<> start();
+
+protected:
+ Ref<PG> pg;
+ const epoch_t epoch_started;
+
+private:
+ virtual void dump_detail(Formatter *f) const;
+ crimson::osd::scheduler::params_t get_scheduler_params() const {
+ return {
+ 1, // cost
+ 0, // owner
+ scheduler_class
+ };
+ }
+ virtual seastar::future<bool> do_recovery() = 0;
+ ShardServices &ss;
+ const crimson::osd::scheduler::scheduler_class_t scheduler_class;
+};
+
+/// Represents a recovery initiated to serve a client request.
+///
+/// Unlike @c PglogBasedRecovery and @c BackfillRecovery,
+/// @c UrgentRecovery is not throttled by the scheduler, and it
+/// uses @c RecoveryBackend directly to recover the unreadable
+/// object.
+class UrgentRecovery final : public BackgroundRecovery {
+public:
+ UrgentRecovery(
+ const hobject_t& soid,
+ const eversion_t& need,
+ Ref<PG> pg,
+ ShardServices& ss,
+ epoch_t epoch_started)
+ : BackgroundRecovery{pg, ss, epoch_started,
+ crimson::osd::scheduler::scheduler_class_t::immediate},
+ soid{soid}, need(need) {}
+ void print(std::ostream&) const final;
+
+private:
+ void dump_detail(Formatter* f) const final;
+ seastar::future<bool> do_recovery() override;
+ const hobject_t soid;
+ const eversion_t need;
+};
+
+class PglogBasedRecovery final : public BackgroundRecovery {
+public:
+ PglogBasedRecovery(
+ Ref<PG> pg,
+ ShardServices &ss,
+ epoch_t epoch_started);
+
+private:
+ seastar::future<bool> do_recovery() override;
+};
+
+class BackfillRecovery final : public BackgroundRecovery {
+public:
+ class BackfillRecoveryPipeline {
+ OrderedPipelinePhase process = {
+ "BackfillRecovery::PGPipeline::process"
+ };
+ friend class BackfillRecovery;
+ friend class PeeringEvent;
+ };
+
+ template <class EventT>
+ BackfillRecovery(
+ Ref<PG> pg,
+ ShardServices &ss,
+ epoch_t epoch_started,
+ const EventT& evt);
+
+ static BackfillRecoveryPipeline &bp(PG &pg);
+
+private:
+ boost::intrusive_ptr<const boost::statechart::event_base> evt;
+ OrderedPipelinePhase::Handle handle;
+ seastar::future<bool> do_recovery() override;
+};
+
+template <class EventT>
+BackfillRecovery::BackfillRecovery(
+ Ref<PG> pg,
+ ShardServices &ss,
+ const epoch_t epoch_started,
+ const EventT& evt)
+ : BackgroundRecovery(
+ std::move(pg),
+ ss,
+ epoch_started,
+ crimson::osd::scheduler::scheduler_class_t::background_best_effort),
+ evt(evt.intrusive_from_this())
+{}
+
+
+}
diff --git a/src/crimson/osd/osd_operations/client_request.cc b/src/crimson/osd/osd_operations/client_request.cc
new file mode 100644
index 000000000..87b8fc788
--- /dev/null
+++ b/src/crimson/osd/osd_operations/client_request.cc
@@ -0,0 +1,201 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <seastar/core/future.hh>
+
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+
+#include "crimson/common/exception.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/osd.h"
+#include "common/Formatter.h"
+#include "crimson/osd/osd_operations/client_request.h"
+#include "crimson/osd/osd_connection_priv.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+ClientRequest::ClientRequest(
+ OSD &osd, crimson::net::ConnectionRef conn, Ref<MOSDOp> &&m)
+ : osd(osd), conn(conn), m(m)
+{}
+
+void ClientRequest::print(std::ostream &lhs) const
+{
+ lhs << *m;
+}
+
+void ClientRequest::dump_detail(Formatter *f) const
+{
+}
+
+ClientRequest::ConnectionPipeline &ClientRequest::cp()
+{
+ return get_osd_priv(conn.get()).client_request_conn_pipeline;
+}
+
+ClientRequest::PGPipeline &ClientRequest::pp(PG &pg)
+{
+ return pg.client_request_pg_pipeline;
+}
+
+bool ClientRequest::is_pg_op() const
+{
+ return std::any_of(
+ begin(m->ops), end(m->ops),
+ [](auto& op) { return ceph_osd_op_type_pg(op.op.op); });
+}
+
+seastar::future<> ClientRequest::start()
+{
+ logger().debug("{}: start", *this);
+
+ IRef opref = this;
+ return crimson::common::handle_system_shutdown(
+ [this, opref=std::move(opref)]() mutable {
+ return seastar::repeat([this, opref]() mutable {
+ return with_blocking_future(handle.enter(cp().await_map))
+ .then([this]() {
+ return with_blocking_future(osd.osdmap_gate.wait_for_map(m->get_min_epoch()));
+ }).then([this](epoch_t epoch) {
+ return with_blocking_future(handle.enter(cp().get_pg));
+ }).then([this] {
+ return with_blocking_future(osd.wait_for_pg(m->get_spg()));
+ }).then([this, opref](Ref<PG> pgref) {
+ PG &pg = *pgref;
+ if (pg.can_discard_op(*m)) {
+ return osd.send_incremental_map(conn, m->get_map_epoch());
+ }
+ return with_blocking_future(
+ handle.enter(pp(pg).await_map)
+ ).then([this, &pg]() mutable {
+ return with_blocking_future(
+ pg.osdmap_gate.wait_for_map(m->get_min_epoch()));
+ }).then([this, &pg](auto map) mutable {
+ return with_blocking_future(
+ handle.enter(pp(pg).wait_for_active));
+ }).then([this, &pg]() mutable {
+ return with_blocking_future(pg.wait_for_active_blocker.wait());
+ }).then([this, pgref=std::move(pgref)]() mutable {
+ if (m->finish_decode()) {
+ m->clear_payload();
+ }
+ if (is_pg_op()) {
+ return process_pg_op(pgref);
+ } else {
+ return process_op(pgref);
+ }
+ });
+ }).then([] {
+ return seastar::stop_iteration::yes;
+ }).handle_exception_type([](crimson::common::actingset_changed& e) {
+ if (e.is_primary()) {
+ logger().debug("operation restart, acting set changed");
+ return seastar::stop_iteration::no;
+ } else {
+ logger().debug("operation abort, up primary changed");
+ return seastar::stop_iteration::yes;
+ }
+ });
+ });
+ });
+}
+
+seastar::future<> ClientRequest::process_pg_op(
+ Ref<PG> &pg)
+{
+ return pg->do_pg_ops(m)
+ .then([this, pg=std::move(pg)](Ref<MOSDOpReply> reply) {
+ return conn->send(reply);
+ });
+}
+
+seastar::future<> ClientRequest::process_op(
+ Ref<PG> &pgref)
+{
+ PG& pg = *pgref;
+ return with_blocking_future(
+ handle.enter(pp(pg).recover_missing)
+ ).then([this, &pg, pgref] {
+ eversion_t ver;
+ const hobject_t& soid = m->get_hobj();
+ logger().debug("{} check for recovery, {}", *this, soid);
+ if (pg.is_unreadable_object(soid, &ver) ||
+ pg.is_degraded_or_backfilling_object(soid)) {
+ logger().debug("{} need to wait for recovery, {}", *this, soid);
+ if (pg.get_recovery_backend()->is_recovering(soid)) {
+ return pg.get_recovery_backend()->get_recovering(soid).wait_for_recovered();
+ } else {
+ auto [op, fut] = osd.get_shard_services().start_operation<UrgentRecovery>(
+ soid, ver, pgref, osd.get_shard_services(), pg.get_osdmap_epoch());
+ return std::move(fut);
+ }
+ }
+ return seastar::now();
+ }).then([this, &pg] {
+ return with_blocking_future(handle.enter(pp(pg).get_obc));
+ }).then([this, &pg]() -> PG::load_obc_ertr::future<> {
+ op_info.set_from_op(&*m, *pg.get_osdmap());
+ return pg.with_locked_obc(m, op_info, this, [this, &pg](auto obc) {
+ return with_blocking_future(
+ handle.enter(pp(pg).process)
+ ).then([this, &pg, obc] {
+ if (!pg.is_primary()) {
+ // primary can handle both normal ops and balanced reads
+ if (is_misdirected(pg)) {
+ logger().trace("process_op: dropping misdirected op");
+ return seastar::make_ready_future<Ref<MOSDOpReply>>();
+ } else if (const hobject_t& hoid = m->get_hobj();
+ !pg.get_peering_state().can_serve_replica_read(hoid)) {
+ auto reply = make_message<MOSDOpReply>(
+ m.get(), -EAGAIN, pg.get_osdmap_epoch(),
+ m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK),
+ !m->has_flag(CEPH_OSD_FLAG_RETURNVEC));
+ return seastar::make_ready_future<Ref<MOSDOpReply>>(std::move(reply));
+ }
+ }
+ return pg.do_osd_ops(m, obc, op_info);
+ }).then([this](Ref<MOSDOpReply> reply) {
+ if (reply) {
+ return conn->send(std::move(reply));
+ } else {
+ return seastar::now();
+ }
+ });
+ });
+ }).safe_then([pgref=std::move(pgref)] {
+ return seastar::now();
+ }, PG::load_obc_ertr::all_same_way([](auto &code) {
+ logger().error("ClientRequest saw error code {}", code);
+ return seastar::now();
+ }));
+}
+
+bool ClientRequest::is_misdirected(const PG& pg) const
+{
+  // balanced/localized reads may be served by a replica; take a closer look
+ if (const int flags = m->get_flags();
+ flags & CEPH_OSD_FLAG_BALANCE_READS ||
+ flags & CEPH_OSD_FLAG_LOCALIZE_READS) {
+ if (!op_info.may_read()) {
+      // no read involved, so it can't be a balanced read
+ return true;
+ }
+ if (op_info.may_write() || op_info.may_cache()) {
+      // write op, but we are not the primary
+ return true;
+ }
+ // balanced reads; any replica will do
+ return pg.is_nonprimary();
+ }
+ // neither balanced nor localize reads
+ return true;
+}
+
+}
diff --git a/src/crimson/osd/osd_operations/client_request.h b/src/crimson/osd/osd_operations/client_request.h
new file mode 100644
index 000000000..ea3124a93
--- /dev/null
+++ b/src/crimson/osd/osd_operations/client_request.h
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "osd/osd_op_util.h"
+#include "crimson/net/Connection.h"
+#include "crimson/osd/osd_operation.h"
+#include "crimson/common/type_helpers.h"
+#include "messages/MOSDOp.h"
+
+namespace crimson::osd {
+class PG;
+class OSD;
+
+class ClientRequest final : public OperationT<ClientRequest> {
+ OSD &osd;
+ crimson::net::ConnectionRef conn;
+ Ref<MOSDOp> m;
+ OpInfo op_info;
+ OrderedPipelinePhase::Handle handle;
+
+public:
+ class ConnectionPipeline {
+ OrderedPipelinePhase await_map = {
+ "ClientRequest::ConnectionPipeline::await_map"
+ };
+ OrderedPipelinePhase get_pg = {
+ "ClientRequest::ConnectionPipeline::get_pg"
+ };
+ friend class ClientRequest;
+ };
+ class PGPipeline {
+ OrderedPipelinePhase await_map = {
+ "ClientRequest::PGPipeline::await_map"
+ };
+ OrderedPipelinePhase wait_for_active = {
+ "ClientRequest::PGPipeline::wait_for_active"
+ };
+ OrderedPipelinePhase recover_missing = {
+ "ClientRequest::PGPipeline::recover_missing"
+ };
+ OrderedPipelinePhase get_obc = {
+ "ClientRequest::PGPipeline::get_obc"
+ };
+ OrderedPipelinePhase process = {
+ "ClientRequest::PGPipeline::process"
+ };
+ friend class ClientRequest;
+ };
+
+ static constexpr OperationTypeCode type = OperationTypeCode::client_request;
+
+ ClientRequest(OSD &osd, crimson::net::ConnectionRef, Ref<MOSDOp> &&m);
+
+ void print(std::ostream &) const final;
+ void dump_detail(Formatter *f) const final;
+
+public:
+ seastar::future<> start();
+
+private:
+ seastar::future<> process_pg_op(
+ Ref<PG> &pg);
+ seastar::future<> process_op(
+ Ref<PG> &pg);
+ bool is_pg_op() const;
+
+ ConnectionPipeline &cp();
+ PGPipeline &pp(PG &pg);
+
+private:
+ bool is_misdirected(const PG& pg) const;
+};
+
+}
diff --git a/src/crimson/osd/osd_operations/compound_peering_request.cc b/src/crimson/osd/osd_operations/compound_peering_request.cc
new file mode 100644
index 000000000..e55760096
--- /dev/null
+++ b/src/crimson/osd/osd_operations/compound_peering_request.cc
@@ -0,0 +1,170 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <seastar/core/future.hh>
+
+#include "osd/PeeringState.h"
+
+#include "messages/MOSDPGQuery.h"
+#include "messages/MOSDPGCreate2.h"
+
+#include "common/Formatter.h"
+
+#include "crimson/common/exception.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/osd.h"
+#include "crimson/osd/osd_operations/compound_peering_request.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace {
+using namespace crimson::osd;
+
+struct compound_state {
+ seastar::promise<BufferedRecoveryMessages> promise;
+ // assuming crimson-osd won't need to be compatible with pre-octopus
+ // releases
+ BufferedRecoveryMessages ctx{ceph_release_t::octopus};
+ compound_state() = default;
+ ~compound_state() {
+ promise.set_value(std::move(ctx));
+ }
+};
+using compound_state_ref = seastar::lw_shared_ptr<compound_state>;
+
+class PeeringSubEvent : public RemotePeeringEvent {
+ compound_state_ref state;
+public:
+ template <typename... Args>
+ PeeringSubEvent(compound_state_ref state, Args &&... args) :
+ RemotePeeringEvent(std::forward<Args>(args)...), state(state) {}
+
+ seastar::future<> complete_rctx(Ref<crimson::osd::PG> pg) final {
+ logger().debug("{}: submitting ctx transaction", *this);
+ state->ctx.accept_buffered_messages(ctx);
+ state = {};
+ if (!pg) {
+ ceph_assert(ctx.transaction.empty());
+ return seastar::now();
+ } else {
+ return osd.get_shard_services().dispatch_context_transaction(
+ pg->get_collection_ref(), ctx);
+ }
+ }
+};
+
+std::vector<OperationRef> handle_pg_create(
+ OSD &osd,
+ crimson::net::ConnectionRef conn,
+ compound_state_ref state,
+ Ref<MOSDPGCreate2> m)
+{
+ std::vector<OperationRef> ret;
+ for (auto& [pgid, when] : m->pgs) {
+ const auto &[created, created_stamp] = when;
+ auto q = m->pg_extra.find(pgid);
+ ceph_assert(q != m->pg_extra.end());
+ auto& [history, pi] = q->second;
+ logger().debug(
+ "{}: {} e{} @{} "
+ "history {} pi {}",
+ __func__, pgid, created, created_stamp,
+ history, pi);
+ if (!pi.empty() &&
+ m->epoch < pi.get_bounds().second) {
+ logger().error(
+ "got pg_create on {} epoch {} "
+ "unmatched past_intervals {} (history {})",
+ pgid, m->epoch,
+ pi, history);
+ } else {
+ auto op = osd.get_shard_services().start_operation<PeeringSubEvent>(
+ state,
+ osd,
+ conn,
+ osd.get_shard_services(),
+ pg_shard_t(),
+ pgid,
+ m->epoch,
+ m->epoch,
+ NullEvt(),
+ true,
+ new PGCreateInfo(pgid, m->epoch, history, pi, true)).first;
+ ret.push_back(op);
+ }
+ }
+ return ret;
+}
+
+struct SubOpBlocker : BlockerT<SubOpBlocker> {
+ static constexpr const char * type_name = "CompoundOpBlocker";
+
+ std::vector<OperationRef> subops;
+ SubOpBlocker(std::vector<OperationRef> &&subops) : subops(subops) {}
+
+ virtual void dump_detail(Formatter *f) const {
+ f->open_array_section("dependent_operations");
+ {
+ for (auto &i : subops) {
+ i->dump_brief(f);
+ }
+ }
+ f->close_section();
+ }
+};
+
+} // namespace
+
+namespace crimson::osd {
+
+CompoundPeeringRequest::CompoundPeeringRequest(
+ OSD &osd, crimson::net::ConnectionRef conn, Ref<Message> m)
+ : osd(osd),
+ conn(conn),
+ m(m)
+{}
+
+void CompoundPeeringRequest::print(std::ostream &lhs) const
+{
+ lhs << *m;
+}
+
+void CompoundPeeringRequest::dump_detail(Formatter *f) const
+{
+ f->dump_stream("message") << *m;
+}
+
+seastar::future<> CompoundPeeringRequest::start()
+{
+ logger().info("{}: starting", *this);
+ auto state = seastar::make_lw_shared<compound_state>();
+ auto blocker = std::make_unique<SubOpBlocker>(
+ [&] {
+ assert((m->get_type() == MSG_OSD_PG_CREATE2));
+ return handle_pg_create(
+ osd,
+ conn,
+ state,
+ boost::static_pointer_cast<MOSDPGCreate2>(m));
+ }());
+
+ IRef ref = this;
+ logger().info("{}: about to fork future", *this);
+ return crimson::common::handle_system_shutdown(
+ [this, ref, blocker=std::move(blocker), state]() mutable {
+ return with_blocking_future(
+ blocker->make_blocking_future(state->promise.get_future())
+ ).then([this, blocker=std::move(blocker)](auto &&ctx) {
+ logger().info("{}: sub events complete", *this);
+ return osd.get_shard_services().dispatch_context_messages(std::move(ctx));
+ }).then([this, ref=std::move(ref)] {
+ logger().info("{}: complete", *this);
+ });
+ });
+}
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/osd_operations/compound_peering_request.h b/src/crimson/osd/osd_operations/compound_peering_request.h
new file mode 100644
index 000000000..495306d75
--- /dev/null
+++ b/src/crimson/osd/osd_operations/compound_peering_request.h
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+#include <seastar/core/future.hh>
+
+#include "msg/MessageRef.h"
+
+#include "crimson/net/Connection.h"
+#include "crimson/osd/osd_operation.h"
+
+namespace crimson::osd {
+
+class OSD;
+class PG;
+
+using osd_id_t = int;
+
+class CompoundPeeringRequest : public OperationT<CompoundPeeringRequest> {
+public:
+ static constexpr OperationTypeCode type =
+ OperationTypeCode::compound_peering_request;
+
+private:
+ OSD &osd;
+ crimson::net::ConnectionRef conn;
+ Ref<Message> m;
+
+public:
+ CompoundPeeringRequest(
+ OSD &osd, crimson::net::ConnectionRef conn, Ref<Message> m);
+
+ void print(std::ostream &) const final;
+ void dump_detail(Formatter *f) const final;
+ seastar::future<> start();
+};
+
+}
diff --git a/src/crimson/osd/osd_operations/osdop_params.h b/src/crimson/osd/osd_operations/osdop_params.h
new file mode 100644
index 000000000..a0bd9dcbb
--- /dev/null
+++ b/src/crimson/osd/osd_operations/osdop_params.h
@@ -0,0 +1,27 @@
+#pragma once
+
+#include "messages/MOSDOp.h"
+#include "osd/osd_types.h"
+#include "crimson/common/type_helpers.h"
+
+// The fields in this struct are parameters that may be needed at multiple
+// levels of processing. They are enclosed in this struct to avoid passing
+// each of them as a separate method parameter.
+struct osd_op_params_t {
+ Ref<MOSDOp> req;
+ eversion_t at_version;
+ eversion_t pg_trim_to;
+ eversion_t min_last_complete_ondisk;
+ eversion_t last_complete;
+ version_t user_at_version = 0;
+ bool user_modify = false;
+ ObjectCleanRegions clean_regions;
+
+ osd_op_params_t() = default;
+ osd_op_params_t(Ref<MOSDOp>&& req) : req(req) {}
+ osd_op_params_t(Ref<MOSDOp>&& req, eversion_t at_version, eversion_t pg_trim_to,
+ eversion_t mlcod, eversion_t lc, version_t user_at_version) :
+ req(req), at_version(at_version), pg_trim_to(pg_trim_to),
+ min_last_complete_ondisk(mlcod), last_complete(lc),
+ user_at_version(user_at_version) {}
+};
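+
+// Illustrative sketch only (next_version is a hypothetical local): a mutation
+// path fills these fields in while preparing the transaction and then passes
+// the struct on, instead of threading five separate arguments around.
+//
+//   osd_op_params_t params(std::move(req));
+//   params.at_version = next_version;
+//   params.user_modify = true;
+//   // ... hand `params` to the replication/submission machinery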
diff --git a/src/crimson/osd/osd_operations/peering_event.cc b/src/crimson/osd/osd_operations/peering_event.cc
new file mode 100644
index 000000000..d3c6ccf81
--- /dev/null
+++ b/src/crimson/osd/osd_operations/peering_event.cc
@@ -0,0 +1,173 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <seastar/core/future.hh>
+
+#include "messages/MOSDPGLog.h"
+
+#include "common/Formatter.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/osd.h"
+#include "crimson/osd/osd_operations/peering_event.h"
+#include "crimson/osd/osd_connection_priv.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+void PeeringEvent::print(std::ostream &lhs) const
+{
+ lhs << "PeeringEvent("
+ << "from=" << from
+ << " pgid=" << pgid
+ << " sent=" << evt.get_epoch_sent()
+ << " requested=" << evt.get_epoch_requested()
+ << " evt=" << evt.get_desc()
+ << ")";
+}
+
+void PeeringEvent::dump_detail(Formatter *f) const
+{
+ f->open_object_section("PeeringEvent");
+ f->dump_stream("from") << from;
+ f->dump_stream("pgid") << pgid;
+ f->dump_int("sent", evt.get_epoch_sent());
+ f->dump_int("requested", evt.get_epoch_requested());
+ f->dump_string("evt", evt.get_desc());
+ f->close_section();
+}
+
+
+PeeringEvent::PGPipeline &PeeringEvent::pp(PG &pg)
+{
+ return pg.peering_request_pg_pipeline;
+}
+
+seastar::future<> PeeringEvent::start()
+{
+
+ logger().debug("{}: start", *this);
+
+ IRef ref = this;
+ return [this] {
+ if (delay) {
+ return seastar::sleep(std::chrono::milliseconds(
+ std::lround(delay*1000)));
+ } else {
+ return seastar::now();
+ }
+ }().then([this] {
+ return get_pg();
+ }).then([this](Ref<PG> pg) {
+ if (!pg) {
+ logger().warn("{}: pg absent, did not create", *this);
+ on_pg_absent();
+ handle.exit();
+ return complete_rctx(pg);
+ } else {
+ logger().debug("{}: pg present", *this);
+ return with_blocking_future(handle.enter(pp(*pg).await_map)
+ ).then([this, pg] {
+ return with_blocking_future(
+ pg->osdmap_gate.wait_for_map(evt.get_epoch_sent()));
+ }).then([this, pg](auto) {
+ return with_blocking_future(handle.enter(pp(*pg).process));
+ }).then([this, pg] {
+ // TODO: likely we should synchronize also with the pg log-based
+ // recovery.
+ return with_blocking_future(
+ handle.enter(BackfillRecovery::bp(*pg).process));
+ }).then([this, pg] {
+ pg->do_peering_event(evt, ctx);
+ handle.exit();
+ return complete_rctx(pg);
+ }).then([this, pg] {
+ return pg->get_need_up_thru() ? shard_services.send_alive(pg->get_same_interval_since())
+ : seastar::now();
+ });
+ }
+ }).then([this] {
+ return shard_services.send_pg_temp();
+ }).then([this, ref=std::move(ref)] {
+ logger().debug("{}: complete", *this);
+ });
+}
+
+void PeeringEvent::on_pg_absent()
+{
+ logger().debug("{}: pg absent, dropping", *this);
+}
+
+seastar::future<> PeeringEvent::complete_rctx(Ref<PG> pg)
+{
+ logger().debug("{}: submitting ctx", *this);
+ return shard_services.dispatch_context(
+ pg->get_collection_ref(),
+ std::move(ctx));
+}
+
+RemotePeeringEvent::ConnectionPipeline &RemotePeeringEvent::cp()
+{
+ return get_osd_priv(conn.get()).peering_request_conn_pipeline;
+}
+
+void RemotePeeringEvent::on_pg_absent()
+{
+ if (auto& e = get_event().get_event();
+ e.dynamic_type() == MQuery::static_type()) {
+ const auto map_epoch =
+ shard_services.get_osdmap_service().get_map()->get_epoch();
+ const auto& q = static_cast<const MQuery&>(e);
+ const pg_info_t empty{spg_t{pgid.pgid, q.query.to}};
+ if (q.query.type == q.query.LOG ||
+ q.query.type == q.query.FULLLOG) {
+ auto m = ceph::make_message<MOSDPGLog>(q.query.from, q.query.to,
+ map_epoch, empty,
+ q.query.epoch_sent);
+ ctx.send_osd_message(q.from.osd, std::move(m));
+ } else {
+ ctx.send_notify(q.from.osd, {q.query.from, q.query.to,
+ q.query.epoch_sent,
+ map_epoch, empty,
+ PastIntervals{}});
+ }
+ }
+}
+
+seastar::future<> RemotePeeringEvent::complete_rctx(Ref<PG> pg)
+{
+ if (pg) {
+ return PeeringEvent::complete_rctx(pg);
+ } else {
+ return shard_services.dispatch_context_messages(std::move(ctx));
+ }
+}
+
+seastar::future<Ref<PG>> RemotePeeringEvent::get_pg()
+{
+ return with_blocking_future(
+ handle.enter(cp().await_map)
+ ).then([this] {
+ return with_blocking_future(
+ osd.osdmap_gate.wait_for_map(evt.get_epoch_sent()));
+ }).then([this](auto epoch) {
+ logger().debug("{}: got map {}", *this, epoch);
+ return with_blocking_future(handle.enter(cp().get_pg));
+ }).then([this] {
+ return with_blocking_future(
+ osd.get_or_create_pg(
+ pgid, evt.get_epoch_sent(), std::move(evt.create_info)));
+ });
+}
+
+seastar::future<Ref<PG>> LocalPeeringEvent::get_pg() {
+ return seastar::make_ready_future<Ref<PG>>(pg);
+}
+
+LocalPeeringEvent::~LocalPeeringEvent() {}
+
+}
diff --git a/src/crimson/osd/osd_operations/peering_event.h b/src/crimson/osd/osd_operations/peering_event.h
new file mode 100644
index 000000000..3a6c0678c
--- /dev/null
+++ b/src/crimson/osd/osd_operations/peering_event.h
@@ -0,0 +1,142 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+#include <seastar/core/future.hh>
+
+#include "crimson/osd/osd_operation.h"
+#include "osd/osd_types.h"
+#include "osd/PGPeeringEvent.h"
+#include "osd/PeeringState.h"
+
+namespace ceph {
+ class Formatter;
+}
+
+namespace crimson::osd {
+
+class OSD;
+class ShardServices;
+class PG;
+
+class PeeringEvent : public OperationT<PeeringEvent> {
+public:
+ static constexpr OperationTypeCode type = OperationTypeCode::peering_event;
+
+ class PGPipeline {
+ OrderedPipelinePhase await_map = {
+ "PeeringEvent::PGPipeline::await_map"
+ };
+ OrderedPipelinePhase process = {
+ "PeeringEvent::PGPipeline::process"
+ };
+ friend class PeeringEvent;
+ friend class PGAdvanceMap;
+ };
+
+protected:
+ OrderedPipelinePhase::Handle handle;
+ PGPipeline &pp(PG &pg);
+
+ ShardServices &shard_services;
+ PeeringCtx ctx;
+ pg_shard_t from;
+ spg_t pgid;
+ float delay = 0;
+ PGPeeringEvent evt;
+
+ const pg_shard_t get_from() const {
+ return from;
+ }
+
+ const spg_t get_pgid() const {
+ return pgid;
+ }
+
+ const PGPeeringEvent &get_event() const {
+ return evt;
+ }
+
+ virtual void on_pg_absent();
+ virtual seastar::future<> complete_rctx(Ref<PG>);
+ virtual seastar::future<Ref<PG>> get_pg() = 0;
+
+public:
+ template <typename... Args>
+ PeeringEvent(
+ ShardServices &shard_services, const pg_shard_t &from, const spg_t &pgid,
+ Args&&... args) :
+ shard_services(shard_services),
+ ctx{ceph_release_t::octopus},
+ from(from),
+ pgid(pgid),
+ evt(std::forward<Args>(args)...)
+ {}
+ template <typename... Args>
+ PeeringEvent(
+ ShardServices &shard_services, const pg_shard_t &from, const spg_t &pgid,
+ float delay, Args&&... args) :
+ shard_services(shard_services),
+ ctx{ceph_release_t::octopus},
+ from(from),
+ pgid(pgid),
+ delay(delay),
+ evt(std::forward<Args>(args)...)
+ {}
+
+ void print(std::ostream &) const final;
+ void dump_detail(ceph::Formatter* f) const final;
+ seastar::future<> start();
+};
+
+class RemotePeeringEvent : public PeeringEvent {
+protected:
+ OSD &osd;
+ crimson::net::ConnectionRef conn;
+
+ void on_pg_absent() final;
+ seastar::future<> complete_rctx(Ref<PG> pg) override;
+ seastar::future<Ref<PG>> get_pg() final;
+
+public:
+ class ConnectionPipeline {
+ OrderedPipelinePhase await_map = {
+ "PeeringRequest::ConnectionPipeline::await_map"
+ };
+ OrderedPipelinePhase get_pg = {
+ "PeeringRequest::ConnectionPipeline::get_pg"
+ };
+ friend class RemotePeeringEvent;
+ };
+
+ template <typename... Args>
+ RemotePeeringEvent(OSD &osd, crimson::net::ConnectionRef conn, Args&&... args) :
+ PeeringEvent(std::forward<Args>(args)...),
+ osd(osd),
+ conn(conn)
+ {}
+
+private:
+ ConnectionPipeline &cp();
+};
+
+class LocalPeeringEvent final : public PeeringEvent {
+protected:
+ seastar::future<Ref<PG>> get_pg() final;
+
+ Ref<PG> pg;
+
+public:
+ template <typename... Args>
+ LocalPeeringEvent(Ref<PG> pg, Args&&... args) :
+ PeeringEvent(std::forward<Args>(args)...),
+ pg(pg)
+ {}
+
+ virtual ~LocalPeeringEvent();
+};
+
+
+}
diff --git a/src/crimson/osd/osd_operations/pg_advance_map.cc b/src/crimson/osd/osd_operations/pg_advance_map.cc
new file mode 100644
index 000000000..a96479d40
--- /dev/null
+++ b/src/crimson/osd/osd_operations/pg_advance_map.cc
@@ -0,0 +1,97 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/osd/osd_operations/pg_advance_map.h"
+
+#include <boost/smart_ptr/local_shared_ptr.hpp>
+#include <seastar/core/future.hh>
+
+#include "include/types.h"
+#include "common/Formatter.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/osd.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+PGAdvanceMap::PGAdvanceMap(
+ OSD &osd, Ref<PG> pg, epoch_t from, epoch_t to,
+ PeeringCtx &&rctx, bool do_init)
+ : osd(osd), pg(pg), from(from), to(to),
+ rctx(std::move(rctx)), do_init(do_init) {}
+
+PGAdvanceMap::~PGAdvanceMap() {}
+
+void PGAdvanceMap::print(std::ostream &lhs) const
+{
+ lhs << "PGAdvanceMap("
+ << "pg=" << pg->get_pgid()
+ << " from=" << from
+ << " to=" << to;
+ if (do_init) {
+ lhs << " do_init";
+ }
+ lhs << ")";
+}
+
+void PGAdvanceMap::dump_detail(Formatter *f) const
+{
+ f->open_object_section("PGAdvanceMap");
+ f->dump_stream("pgid") << pg->get_pgid();
+ f->dump_int("from", from);
+ f->dump_int("to", to);
+ f->dump_bool("do_init", do_init);
+ f->close_section();
+}
+
+seastar::future<> PGAdvanceMap::start()
+{
+ using cached_map_t = boost::local_shared_ptr<const OSDMap>;
+
+ logger().debug("{}: start", *this);
+
+ IRef ref = this;
+ return with_blocking_future(
+ handle.enter(pg->peering_request_pg_pipeline.process))
+ .then([this] {
+ if (do_init) {
+ pg->handle_initialize(rctx);
+ pg->handle_activate_map(rctx);
+ }
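+      // walk the pg through every epoch in (from, to], one map at a time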
+ return seastar::do_for_each(
+ boost::make_counting_iterator(from + 1),
+ boost::make_counting_iterator(to + 1),
+ [this](epoch_t next_epoch) {
+ return osd.get_map(next_epoch).then(
+ [this] (cached_map_t&& next_map) {
+ pg->handle_advance_map(next_map, rctx);
+ });
+ }).then([this] {
+ pg->handle_activate_map(rctx);
+ handle.exit();
+ if (do_init) {
+ osd.pg_map.pg_created(pg->get_pgid(), pg);
+ osd.shard_services.inc_pg_num();
+ logger().info("PGAdvanceMap::start new pg {}", *pg);
+ }
+ return seastar::when_all_succeed(
+          pg->get_need_up_thru()
+ ? osd.shard_services.send_alive(pg->get_same_interval_since())
+ : seastar::now(),
+ osd.shard_services.dispatch_context(
+ pg->get_collection_ref(),
+ std::move(rctx)));
+ }).then_unpack([this] {
+ return osd.shard_services.send_pg_temp();
+ });
+ }).then([this, ref=std::move(ref)] {
+ logger().debug("{}: complete", *this);
+ });
+}
+
+}
diff --git a/src/crimson/osd/osd_operations/pg_advance_map.h b/src/crimson/osd/osd_operations/pg_advance_map.h
new file mode 100644
index 000000000..1b27037eb
--- /dev/null
+++ b/src/crimson/osd/osd_operations/pg_advance_map.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+#include <seastar/core/future.hh>
+
+#include "crimson/osd/osd_operation.h"
+#include "osd/osd_types.h"
+#include "crimson/common/type_helpers.h"
+#include "osd/PeeringState.h"
+
+namespace ceph {
+ class Formatter;
+}
+
+namespace crimson::osd {
+
+class OSD;
+class PG;
+
+class PGAdvanceMap : public OperationT<PGAdvanceMap> {
+public:
+ static constexpr OperationTypeCode type = OperationTypeCode::pg_advance_map;
+
+protected:
+ OrderedPipelinePhase::Handle handle;
+
+ OSD &osd;
+ Ref<PG> pg;
+
+ epoch_t from;
+ epoch_t to;
+
+ PeeringCtx rctx;
+ const bool do_init;
+
+public:
+ PGAdvanceMap(
+ OSD &osd, Ref<PG> pg, epoch_t from, epoch_t to,
+ PeeringCtx &&rctx, bool do_init);
+ ~PGAdvanceMap();
+
+ void print(std::ostream &) const final;
+ void dump_detail(ceph::Formatter *f) const final;
+ seastar::future<> start();
+};
+
+}
diff --git a/src/crimson/osd/osd_operations/recovery_subrequest.cc b/src/crimson/osd/osd_operations/recovery_subrequest.cc
new file mode 100644
index 000000000..820c7beab
--- /dev/null
+++ b/src/crimson/osd/osd_operations/recovery_subrequest.cc
@@ -0,0 +1,29 @@
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+
+#include "crimson/osd/osd_operations/recovery_subrequest.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+seastar::future<> RecoverySubRequest::start() {
+ logger().debug("{}: start", *this);
+
+ IRef opref = this;
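+  // wait for the osdmap the message requires, then for the target pg to
+  // exist, and hand the message to the pg's recovery backend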
+ return with_blocking_future(osd.osdmap_gate.wait_for_map(m->get_min_epoch()))
+ .then([this] (epoch_t epoch) {
+ return with_blocking_future(osd.wait_for_pg(m->get_spg()));
+ }).then([this, opref=std::move(opref)] (Ref<PG> pgref) {
+ return seastar::do_with(std::move(pgref), std::move(opref),
+ [this](auto& pgref, auto& opref) {
+ return pgref->get_recovery_backend()->handle_recovery_op(m);
+ });
+ });
+}
+
+}
diff --git a/src/crimson/osd/osd_operations/recovery_subrequest.h b/src/crimson/osd/osd_operations/recovery_subrequest.h
new file mode 100644
index 000000000..b151e5c1d
--- /dev/null
+++ b/src/crimson/osd/osd_operations/recovery_subrequest.h
@@ -0,0 +1,45 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "osd/osd_op_util.h"
+#include "crimson/net/Connection.h"
+#include "crimson/osd/osd_operation.h"
+#include "crimson/osd/osd.h"
+#include "crimson/common/type_helpers.h"
+#include "messages/MOSDPGPull.h"
+#include "messages/MOSDPGPush.h"
+#include "messages/MOSDPGPushReply.h"
+#include "messages/MOSDPGRecoveryDelete.h"
+#include "messages/MOSDPGRecoveryDeleteReply.h"
+
+namespace crimson::osd {
+
+class OSD;
+class PG;
+
+class RecoverySubRequest final : public OperationT<RecoverySubRequest> {
+public:
+ static constexpr OperationTypeCode type = OperationTypeCode::background_recovery_sub;
+
+  RecoverySubRequest(OSD &osd, crimson::net::ConnectionRef conn, Ref<MOSDFastDispatchOp>&& m)
+    : osd(osd), conn(conn), m(std::move(m)) {}
+
+ void print(std::ostream& out) const final
+ {
+ out << *m;
+ }
+
+ void dump_detail(Formatter *f) const final
+ {
+ }
+
+ seastar::future<> start();
+private:
+ OSD& osd;
+ crimson::net::ConnectionRef conn;
+ Ref<MOSDFastDispatchOp> m;
+};
+
+}
diff --git a/src/crimson/osd/osd_operations/replicated_request.cc b/src/crimson/osd/osd_operations/replicated_request.cc
new file mode 100644
index 000000000..34487f9e4
--- /dev/null
+++ b/src/crimson/osd/osd_operations/replicated_request.cc
@@ -0,0 +1,74 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "replicated_request.h"
+
+#include "common/Formatter.h"
+#include "messages/MOSDRepOp.h"
+
+#include "crimson/osd/osd.h"
+#include "crimson/osd/osd_connection_priv.h"
+#include "crimson/osd/pg.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+RepRequest::RepRequest(OSD &osd,
+ crimson::net::ConnectionRef&& conn,
+ Ref<MOSDRepOp> &&req)
+ : osd{osd},
+ conn{std::move(conn)},
+    req{std::move(req)}
+{}
+
+void RepRequest::print(std::ostream& os) const
+{
+ os << "RepRequest("
+ << "from=" << req->from
+ << " req=" << *req
+ << ")";
+}
+
+void RepRequest::dump_detail(Formatter *f) const
+{
+ f->open_object_section("RepRequest");
+ f->dump_stream("reqid") << req->reqid;
+ f->dump_stream("pgid") << req->get_spg();
+ f->dump_unsigned("map_epoch", req->get_map_epoch());
+ f->dump_unsigned("min_epoch", req->get_min_epoch());
+ f->dump_stream("oid") << req->poid;
+ f->dump_stream("from") << req->from;
+ f->close_section();
+}
+
+RepRequest::ConnectionPipeline &RepRequest::cp()
+{
+ return get_osd_priv(conn.get()).replicated_request_conn_pipeline;
+}
+
+RepRequest::PGPipeline &RepRequest::pp(PG &pg)
+{
+ return pg.replicated_request_pg_pipeline;
+}
+
+seastar::future<> RepRequest::start()
+{
+ logger().debug("{} start", *this);
+ IRef ref = this;
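+  // wait for the osdmap the request requires, look up the target pg,
+  // and forward the rep op to it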
+ return with_blocking_future(handle.enter(cp().await_map))
+ .then([this]() {
+ return with_blocking_future(osd.osdmap_gate.wait_for_map(req->get_min_epoch()));
+ }).then([this](epoch_t epoch) {
+ return with_blocking_future(handle.enter(cp().get_pg));
+ }).then([this] {
+ return with_blocking_future(osd.wait_for_pg(req->get_spg()));
+ }).then([this, ref=std::move(ref)](Ref<PG> pg) {
+ return pg->handle_rep_op(std::move(req));
+ });
+}
+}
diff --git a/src/crimson/osd/osd_operations/replicated_request.h b/src/crimson/osd/osd_operations/replicated_request.h
new file mode 100644
index 000000000..8e9cfc9fe
--- /dev/null
+++ b/src/crimson/osd/osd_operations/replicated_request.h
@@ -0,0 +1,58 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/net/Connection.h"
+#include "crimson/osd/osd_operation.h"
+#include "crimson/common/type_helpers.h"
+
+class MOSDRepOp;
+
+namespace ceph {
+ class Formatter;
+}
+
+namespace crimson::osd {
+
+class OSD;
+class PG;
+
+class RepRequest final : public OperationT<RepRequest> {
+public:
+ class ConnectionPipeline {
+ OrderedPipelinePhase await_map = {
+ "RepRequest::ConnectionPipeline::await_map"
+ };
+ OrderedPipelinePhase get_pg = {
+ "RepRequest::ConnectionPipeline::get_pg"
+ };
+ friend RepRequest;
+ };
+ class PGPipeline {
+ OrderedPipelinePhase await_map = {
+ "RepRequest::PGPipeline::await_map"
+ };
+ OrderedPipelinePhase process = {
+ "RepRequest::PGPipeline::process"
+ };
+ friend RepRequest;
+ };
+ static constexpr OperationTypeCode type = OperationTypeCode::replicated_request;
+ RepRequest(OSD&, crimson::net::ConnectionRef&&, Ref<MOSDRepOp>&&);
+
+ void print(std::ostream &) const final;
+ void dump_detail(ceph::Formatter* f) const final;
+ seastar::future<> start();
+
+private:
+ ConnectionPipeline &cp();
+ PGPipeline &pp(PG &pg);
+
+ OSD &osd;
+ crimson::net::ConnectionRef conn;
+ Ref<MOSDRepOp> req;
+ OrderedPipelinePhase::Handle handle;
+};
+
+}
diff --git a/src/crimson/osd/osdmap_gate.cc b/src/crimson/osd/osdmap_gate.cc
new file mode 100644
index 000000000..90afc32b4
--- /dev/null
+++ b/src/crimson/osd/osdmap_gate.cc
@@ -0,0 +1,71 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/common/exception.h"
+#include "crimson/osd/osdmap_gate.h"
+#include "crimson/osd/shard_services.h"
+#include "common/Formatter.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+void OSDMapGate::OSDMapBlocker::dump_detail(Formatter *f) const
+{
+ f->open_object_section("OSDMapGate");
+ f->dump_int("epoch", epoch);
+ f->close_section();
+}
+
+blocking_future<epoch_t> OSDMapGate::wait_for_map(epoch_t epoch)
+{
+ if (__builtin_expect(stopping, false)) {
+ return make_exception_blocking_future<epoch_t>(
+ crimson::common::system_shutdown_exception());
+ }
+ if (current >= epoch) {
+ return make_ready_blocking_future<epoch_t>(current);
+ } else {
+    logger().info("evt epoch is {}, I have {}, will wait", epoch, current);
+ auto &blocker = waiting_peering.emplace(
+ epoch, make_pair(blocker_type, epoch)).first->second;
+ auto fut = blocker.promise.get_shared_future();
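+    // if shard services are available, proactively request newer osdmaps;
+    // otherwise just wait for got_map() to deliver one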
+ if (shard_services) {
+ return blocker.make_blocking_future(
+ (*shard_services).get().osdmap_subscribe(current, true).then(
+ [fut=std::move(fut)]() mutable {
+ return std::move(fut);
+ }));
+ } else {
+ return blocker.make_blocking_future(std::move(fut));
+ }
+ }
+}
+
+void OSDMapGate::got_map(epoch_t epoch) {
+ current = epoch;
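+  // wake up every waiter expecting an epoch less than or equal to the one
+  // we just received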
+ auto first = waiting_peering.begin();
+ auto last = waiting_peering.upper_bound(epoch);
+ std::for_each(first, last, [epoch](auto& blocked_requests) {
+ blocked_requests.second.promise.set_value(epoch);
+ });
+ waiting_peering.erase(first, last);
+}
+
+seastar::future<> OSDMapGate::stop() {
+ logger().info("osdmap::stop");
+ stopping = true;
+ auto first = waiting_peering.begin();
+ auto last = waiting_peering.end();
+ std::for_each(first, last, [](auto& blocked_requests) {
+ blocked_requests.second.promise.set_exception(
+ crimson::common::system_shutdown_exception());
+ });
+ return seastar::now();
+}
+
+}
diff --git a/src/crimson/osd/osdmap_gate.h b/src/crimson/osd/osdmap_gate.h
new file mode 100644
index 000000000..2b73d8959
--- /dev/null
+++ b/src/crimson/osd/osdmap_gate.h
@@ -0,0 +1,68 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <functional>
+#include <map>
+#include <optional>
+
+#include <seastar/core/future.hh>
+#include <seastar/core/shared_future.hh>
+
+#include "include/types.h"
+#include "crimson/osd/osd_operation.h"
+
+namespace ceph {
+ class Formatter;
+}
+
+namespace crimson::osd {
+
+class ShardServices;
+
+class OSDMapGate {
+ struct OSDMapBlocker : public Blocker {
+ const char * type_name;
+ epoch_t epoch;
+
+ OSDMapBlocker(std::pair<const char *, epoch_t> args)
+ : type_name(args.first), epoch(args.second) {}
+
+ OSDMapBlocker(const OSDMapBlocker &) = delete;
+ OSDMapBlocker(OSDMapBlocker &&) = delete;
+ OSDMapBlocker &operator=(const OSDMapBlocker &) = delete;
+ OSDMapBlocker &operator=(OSDMapBlocker &&) = delete;
+
+ seastar::shared_promise<epoch_t> promise;
+
+ void dump_detail(Formatter *f) const final;
+ private:
+ const char *get_type_name() const final {
+ return type_name;
+ }
+ };
+
+ // order the promises in ascending order of the waited osdmap epoch,
+ // so we can access all the waiters expecting a map whose epoch is less
+ // than or equal to a given epoch
+ using waiting_peering_t = std::map<epoch_t,
+ OSDMapBlocker>;
+ const char *blocker_type;
+ waiting_peering_t waiting_peering;
+ epoch_t current = 0;
+ std::optional<std::reference_wrapper<ShardServices>> shard_services;
+ bool stopping = false;
+public:
+ OSDMapGate(
+ const char *blocker_type,
+ std::optional<std::reference_wrapper<ShardServices>> shard_services)
+ : blocker_type(blocker_type), shard_services(shard_services) {}
+
+ // wait for an osdmap whose epoch is greater or equal to given epoch
+ blocking_future<epoch_t> wait_for_map(epoch_t epoch);
+ void got_map(epoch_t epoch);
+ seastar::future<> stop();
+};
+
+}
diff --git a/src/crimson/osd/osdmap_service.h b/src/crimson/osd/osdmap_service.h
new file mode 100644
index 000000000..effd45b79
--- /dev/null
+++ b/src/crimson/osd/osdmap_service.h
@@ -0,0 +1,20 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/smart_ptr/local_shared_ptr.hpp>
+
+#include "include/types.h"
+
+class OSDMap;
+
+class OSDMapService {
+public:
+ using cached_map_t = boost::local_shared_ptr<const OSDMap>;
+ virtual ~OSDMapService() = default;
+ virtual seastar::future<cached_map_t> get_map(epoch_t e) = 0;
+ /// get the latest map
+ virtual cached_map_t get_map() const = 0;
+ virtual epoch_t get_up_epoch() const = 0;
+};
diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc
new file mode 100644
index 000000000..0f01c1607
--- /dev/null
+++ b/src/crimson/osd/pg.cc
@@ -0,0 +1,1102 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "pg.h"
+
+#include <functional>
+
+#include <boost/range/adaptor/filtered.hpp>
+#include <boost/range/adaptor/map.hpp>
+#include <boost/range/adaptor/transformed.hpp>
+#include <boost/range/algorithm/copy.hpp>
+#include <boost/range/algorithm/max_element.hpp>
+#include <boost/range/numeric.hpp>
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+#include "messages/MOSDRepOp.h"
+#include "messages/MOSDRepOpReply.h"
+
+#include "osd/OSDMap.h"
+
+#include "os/Transaction.h"
+
+#include "crimson/common/exception.h"
+#include "crimson/net/Connection.h"
+#include "crimson/net/Messenger.h"
+#include "crimson/os/cyanstore/cyan_store.h"
+#include "crimson/os/futurized_collection.h"
+#include "crimson/osd/exceptions.h"
+#include "crimson/osd/pg_meta.h"
+#include "crimson/osd/pg_backend.h"
+#include "crimson/osd/ops_executer.h"
+#include "crimson/osd/osd_operations/osdop_params.h"
+#include "crimson/osd/osd_operations/peering_event.h"
+#include "crimson/osd/pg_recovery.h"
+#include "crimson/osd/replicated_recovery_backend.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace std::chrono {
+std::ostream& operator<<(std::ostream& out, const signedspan& d)
+{
+ auto s = std::chrono::duration_cast<std::chrono::seconds>(d).count();
+ auto ns = std::abs((d % 1s).count());
+ fmt::print(out, "{}{}s", s, ns ? fmt::format(".{:0>9}", ns) : "");
+ return out;
+}
+}
+
+namespace crimson::osd {
+
+using crimson::common::local_conf;
+
+class RecoverablePredicate : public IsPGRecoverablePredicate {
+public:
+ bool operator()(const set<pg_shard_t> &have) const override {
+ return !have.empty();
+ }
+};
+
+class ReadablePredicate: public IsPGReadablePredicate {
+ pg_shard_t whoami;
+public:
+ explicit ReadablePredicate(pg_shard_t whoami) : whoami(whoami) {}
+ bool operator()(const set<pg_shard_t> &have) const override {
+ return have.count(whoami);
+ }
+};
+
+PG::PG(
+ spg_t pgid,
+ pg_shard_t pg_shard,
+ crimson::os::CollectionRef coll_ref,
+ pg_pool_t&& pool,
+ std::string&& name,
+ cached_map_t osdmap,
+ ShardServices &shard_services,
+ ec_profile_t profile)
+ : pgid{pgid},
+ pg_whoami{pg_shard},
+ coll_ref{coll_ref},
+ pgmeta_oid{pgid.make_pgmeta_oid()},
+ osdmap_gate("PG::osdmap_gate", std::nullopt),
+ shard_services{shard_services},
+ osdmap{osdmap},
+ backend(
+ PGBackend::create(
+ pgid.pgid,
+ pg_shard,
+ pool,
+ coll_ref,
+ shard_services,
+ profile)),
+ recovery_backend(
+ std::make_unique<ReplicatedRecoveryBackend>(
+ *this, shard_services, coll_ref, backend.get())),
+ recovery_handler(
+ std::make_unique<PGRecovery>(this)),
+ peering_state(
+ shard_services.get_cct(),
+ pg_shard,
+ pgid,
+ PGPool(
+ osdmap,
+ pgid.pool(),
+ pool,
+ name),
+ osdmap,
+ this,
+ this),
+ wait_for_active_blocker(this)
+{
+ peering_state.set_backend_predicates(
+ new ReadablePredicate(pg_whoami),
+ new RecoverablePredicate());
+ osdmap_gate.got_map(osdmap->get_epoch());
+}
+
+PG::~PG() {}
+
+bool PG::try_flush_or_schedule_async() {
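+  // flush by submitting an empty transaction to the store; once it commits,
+  // deliver an IntervalFlush event to the peering state machine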
+ (void)shard_services.get_store().do_transaction(
+ coll_ref,
+ ObjectStore::Transaction()).then(
+ [this, epoch=get_osdmap_epoch()]() {
+ return shard_services.start_operation<LocalPeeringEvent>(
+ this,
+ shard_services,
+ pg_whoami,
+ pgid,
+ epoch,
+ epoch,
+ PeeringState::IntervalFlush());
+ });
+ return false;
+}
+
+void PG::queue_check_readable(epoch_t last_peering_reset, ceph::timespan delay)
+{
+ // handle the peering event in the background
+ check_readable_timer.cancel();
+ check_readable_timer.set_callback([last_peering_reset, this] {
+ (void) shard_services.start_operation<LocalPeeringEvent>(
+ this,
+ shard_services,
+ pg_whoami,
+ pgid,
+ last_peering_reset,
+ last_peering_reset,
+ PeeringState::CheckReadable{});
+ });
+ check_readable_timer.arm(
+ std::chrono::duration_cast<seastar::lowres_clock::duration>(delay));
+}
+
+void PG::recheck_readable()
+{
+ bool changed = false;
+ const auto mnow = shard_services.get_mnow();
+ if (peering_state.state_test(PG_STATE_WAIT)) {
+ auto prior_readable_until_ub = peering_state.get_prior_readable_until_ub();
+ if (mnow < prior_readable_until_ub) {
+ logger().info("{} will wait (mnow {} < prior_readable_until_ub {})",
+ __func__, mnow, prior_readable_until_ub);
+ } else {
+ logger().info("{} no longer wait (mnow {} >= prior_readable_until_ub {})",
+ __func__, mnow, prior_readable_until_ub);
+ peering_state.state_clear(PG_STATE_WAIT);
+ peering_state.clear_prior_readable_until_ub();
+ changed = true;
+ }
+ }
+ if (peering_state.state_test(PG_STATE_LAGGY)) {
+ auto readable_until = peering_state.get_readable_until();
+ if (readable_until == readable_until.zero()) {
+ logger().info("{} still laggy (mnow {}, readable_until zero)",
+ __func__, mnow);
+ } else if (mnow >= readable_until) {
+ logger().info("{} still laggy (mnow {} >= readable_until {})",
+ __func__, mnow, readable_until);
+ } else {
+ logger().info("{} no longer laggy (mnow {} < readable_until {})",
+ __func__, mnow, readable_until);
+ peering_state.state_clear(PG_STATE_LAGGY);
+ changed = true;
+ }
+ }
+ if (changed) {
+ publish_stats_to_osd();
+ if (!peering_state.state_test(PG_STATE_WAIT) &&
+ !peering_state.state_test(PG_STATE_LAGGY)) {
+ // TODO: requeue ops waiting for readable
+ }
+ }
+}
+
+unsigned PG::get_target_pg_log_entries() const
+{
+ const unsigned num_pgs = shard_services.get_pg_num();
+ const unsigned target =
+ local_conf().get_val<uint64_t>("osd_target_pg_log_entries_per_osd");
+ const unsigned min_pg_log_entries =
+ local_conf().get_val<uint64_t>("osd_min_pg_log_entries");
+ if (num_pgs > 0 && target > 0) {
+ // target an even spread of our budgeted log entries across all
+ // PGs. note that while we only get to control the entry count
+ // for primary PGs, we'll normally be responsible for a mix of
+ // primary and replica PGs (for the same pool(s) even), so this
+ // will work out.
+ const unsigned max_pg_log_entries =
+ local_conf().get_val<uint64_t>("osd_max_pg_log_entries");
+ return std::clamp(target / num_pgs,
+ min_pg_log_entries,
+ max_pg_log_entries);
+ } else {
+ // fall back to a per-pg value.
+ return min_pg_log_entries;
+ }
+}
+
+void PG::on_activate(interval_set<snapid_t>)
+{
+ projected_last_update = peering_state.get_info().last_update;
+}
+
+void PG::on_activate_complete()
+{
+ wait_for_active_blocker.on_active();
+
+ if (peering_state.needs_recovery()) {
+ logger().info("{}: requesting recovery",
+ __func__);
+ (void) shard_services.start_operation<LocalPeeringEvent>(
+ this,
+ shard_services,
+ pg_whoami,
+ pgid,
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::DoRecovery{});
+ } else if (peering_state.needs_backfill()) {
+ logger().info("{}: requesting backfill",
+ __func__);
+ (void) shard_services.start_operation<LocalPeeringEvent>(
+ this,
+ shard_services,
+ pg_whoami,
+ pgid,
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::RequestBackfill{});
+ } else {
+    logger().debug("{}: no need to recover or backfill, AllReplicasRecovered"
+		   " for pg: {}", __func__, pgid);
+ (void) shard_services.start_operation<LocalPeeringEvent>(
+ this,
+ shard_services,
+ pg_whoami,
+ pgid,
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::AllReplicasRecovered{});
+ }
+ backend->on_activate_complete();
+}
+
+void PG::prepare_write(pg_info_t &info,
+ pg_info_t &last_written_info,
+ PastIntervals &past_intervals,
+ PGLog &pglog,
+ bool dirty_info,
+ bool dirty_big_info,
+ bool need_write_epoch,
+ ceph::os::Transaction &t)
+{
+ std::map<string,bufferlist> km;
+ std::string key_to_remove;
+ if (dirty_big_info || dirty_info) {
+ int ret = prepare_info_keymap(
+ shard_services.get_cct(),
+ &km,
+ &key_to_remove,
+ get_osdmap_epoch(),
+ info,
+ last_written_info,
+ past_intervals,
+ dirty_big_info,
+ need_write_epoch,
+ true,
+ nullptr,
+ this);
+ ceph_assert(ret == 0);
+ }
+ pglog.write_log_and_missing(
+ t, &km, coll_ref->get_cid(), pgmeta_oid,
+ peering_state.get_pool().info.require_rollback());
+ if (!km.empty()) {
+ t.omap_setkeys(coll_ref->get_cid(), pgmeta_oid, km);
+ }
+ if (!key_to_remove.empty()) {
+ t.omap_rmkey(coll_ref->get_cid(), pgmeta_oid, key_to_remove);
+ }
+}
+
+std::pair<ghobject_t, bool>
+PG::do_delete_work(ceph::os::Transaction &t, ghobject_t _next)
+{
+ // TODO
+ shard_services.dec_pg_num();
+ return {_next, false};
+}
+
+void PG::scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type)
+{
+ // TODO: should update the stats upon finishing the scrub
+ peering_state.update_stats([scrub_level, this](auto& history, auto& stats) {
+ const utime_t now = ceph_clock_now();
+ history.last_scrub = peering_state.get_info().last_update;
+ history.last_scrub_stamp = now;
+ history.last_clean_scrub_stamp = now;
+ if (scrub_level == scrub_level_t::deep) {
+ history.last_deep_scrub = history.last_scrub;
+ history.last_deep_scrub_stamp = now;
+ }
+ // yes, please publish the stats
+ return true;
+ });
+}
+
+void PG::log_state_enter(const char *state) {
+ logger().info("Entering state: {}", state);
+}
+
+void PG::log_state_exit(
+ const char *state_name, utime_t enter_time,
+ uint64_t events, utime_t event_dur) {
+ logger().info(
+ "Exiting state: {}, entered at {}, {} spent on {} events",
+ state_name,
+ enter_time,
+ event_dur,
+ events);
+}
+
+ceph::signedspan PG::get_mnow()
+{
+ return shard_services.get_mnow();
+}
+
+HeartbeatStampsRef PG::get_hb_stamps(int peer)
+{
+ return shard_services.get_hb_stamps(peer);
+}
+
+void PG::schedule_renew_lease(epoch_t last_peering_reset, ceph::timespan delay)
+{
+ // handle the peering event in the background
+ renew_lease_timer.cancel();
+ renew_lease_timer.set_callback([last_peering_reset, this] {
+ (void) shard_services.start_operation<LocalPeeringEvent>(
+ this,
+ shard_services,
+ pg_whoami,
+ pgid,
+ last_peering_reset,
+ last_peering_reset,
+ RenewLease{});
+ });
+ renew_lease_timer.arm(
+ std::chrono::duration_cast<seastar::lowres_clock::duration>(delay));
+}
+
+
+void PG::init(
+ int role,
+ const vector<int>& newup, int new_up_primary,
+ const vector<int>& newacting, int new_acting_primary,
+ const pg_history_t& history,
+ const PastIntervals& pi,
+ bool backfill,
+ ObjectStore::Transaction &t)
+{
+ peering_state.init(
+ role, newup, new_up_primary, newacting,
+ new_acting_primary, history, pi, backfill, t);
+}
+
+seastar::future<> PG::read_state(crimson::os::FuturizedStore* store)
+{
+ if (__builtin_expect(stopping, false)) {
+ return seastar::make_exception_future<>(
+ crimson::common::system_shutdown_exception());
+ }
+
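+  // load pg_info and past_intervals from the pgmeta object, then rebuild the
+  // pg log and missing set from disk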
+ return seastar::do_with(PGMeta(store, pgid), [] (auto& pg_meta) {
+ return pg_meta.load();
+ }).then([this, store](auto&& ret) {
+ auto [pg_info, past_intervals] = std::move(ret);
+ return peering_state.init_from_disk_state(
+ std::move(pg_info),
+ std::move(past_intervals),
+ [this, store] (PGLog &pglog) {
+ return pglog.read_log_and_missing_crimson(
+ *store,
+ coll_ref,
+ peering_state.get_info(),
+ pgmeta_oid);
+ });
+ }).then([this]() {
+ int primary, up_primary;
+ vector<int> acting, up;
+ peering_state.get_osdmap()->pg_to_up_acting_osds(
+ pgid.pgid, &up, &up_primary, &acting, &primary);
+ peering_state.init_primary_up_acting(
+ up,
+ acting,
+ up_primary,
+ primary);
+ int rr = OSDMap::calc_pg_role(pg_whoami, acting);
+ peering_state.set_role(rr);
+
+ epoch_t epoch = get_osdmap_epoch();
+ (void) shard_services.start_operation<LocalPeeringEvent>(
+ this,
+ shard_services,
+ pg_whoami,
+ pgid,
+ epoch,
+ epoch,
+ PeeringState::Initialize());
+
+ return seastar::now();
+ });
+}
+
+void PG::do_peering_event(
+ const boost::statechart::event_base &evt,
+ PeeringCtx &rctx)
+{
+ peering_state.handle_event(
+ evt,
+ &rctx);
+ peering_state.write_if_dirty(rctx.transaction);
+}
+
+void PG::do_peering_event(
+ PGPeeringEvent& evt, PeeringCtx &rctx)
+{
+ if (!peering_state.pg_has_reset_since(evt.get_epoch_requested())) {
+ logger().debug("{} handling {} for pg: {}", __func__, evt.get_desc(), pgid);
+ do_peering_event(evt.get_event(), rctx);
+ } else {
+ logger().debug("{} ignoring {} -- pg has reset", __func__, evt.get_desc());
+ }
+}
+
+void PG::handle_advance_map(
+ cached_map_t next_map, PeeringCtx &rctx)
+{
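+  // compute the up/acting sets under the next map and feed them, together
+  // with the map itself, to the peering state machine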
+ vector<int> newup, newacting;
+ int up_primary, acting_primary;
+ next_map->pg_to_up_acting_osds(
+ pgid.pgid,
+ &newup, &up_primary,
+ &newacting, &acting_primary);
+ peering_state.advance_map(
+ next_map,
+ peering_state.get_osdmap(),
+ newup,
+ up_primary,
+ newacting,
+ acting_primary,
+ rctx);
+ osdmap_gate.got_map(next_map->get_epoch());
+}
+
+void PG::handle_activate_map(PeeringCtx &rctx)
+{
+ peering_state.activate_map(rctx);
+}
+
+void PG::handle_initialize(PeeringCtx &rctx)
+{
+ PeeringState::Initialize evt;
+ peering_state.handle_event(evt, &rctx);
+}
+
+
+void PG::print(ostream& out) const
+{
+ out << peering_state << " ";
+}
+
+void PG::dump_primary(Formatter* f)
+{
+ peering_state.dump_peering_state(f);
+
+ f->open_array_section("recovery_state");
+ PeeringState::QueryState q(f);
+ peering_state.handle_event(q, 0);
+ f->close_section();
+
+ // TODO: snap_trimq
+ // TODO: scrubber state
+ // TODO: agent state
+}
+
+std::ostream& operator<<(std::ostream& os, const PG& pg)
+{
+ os << " pg_epoch " << pg.get_osdmap_epoch() << " ";
+ pg.print(os);
+ return os;
+}
+
+void PG::WaitForActiveBlocker::dump_detail(Formatter *f) const
+{
+ f->dump_stream("pgid") << pg->pgid;
+}
+
+void PG::WaitForActiveBlocker::on_active()
+{
+ p.set_value();
+ p = {};
+}
+
+blocking_future<> PG::WaitForActiveBlocker::wait()
+{
+ if (pg->peering_state.is_active()) {
+ return make_blocking_future(seastar::now());
+ } else {
+ return make_blocking_future(p.get_shared_future());
+ }
+}
+
+seastar::future<> PG::WaitForActiveBlocker::stop()
+{
+ p.set_exception(crimson::common::system_shutdown_exception());
+ return seastar::now();
+}
+
+seastar::future<> PG::submit_transaction(const OpInfo& op_info,
+ const std::vector<OSDOp>& ops,
+ ObjectContextRef&& obc,
+ ceph::os::Transaction&& txn,
+ const osd_op_params_t& osd_op_p)
+{
+ if (__builtin_expect(stopping, false)) {
+ return seastar::make_exception_future<>(
+ crimson::common::system_shutdown_exception());
+ }
+
+ epoch_t map_epoch = get_osdmap_epoch();
+
+ if (__builtin_expect(osd_op_p.at_version.epoch != map_epoch, false)) {
+ throw crimson::common::actingset_changed(is_primary());
+ }
+
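+  // record this mutation in the pg log before handing the transaction to the
+  // backend for replication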
+ std::vector<pg_log_entry_t> log_entries;
+ log_entries.emplace_back(obc->obs.exists ?
+ pg_log_entry_t::MODIFY : pg_log_entry_t::DELETE,
+ obc->obs.oi.soid, osd_op_p.at_version, obc->obs.oi.version,
+ osd_op_p.user_modify ? osd_op_p.at_version.version : 0,
+ osd_op_p.req->get_reqid(), osd_op_p.req->get_mtime(),
+ op_info.allows_returnvec() && !ops.empty() ? ops.back().rval.code : 0);
+ // TODO: refactor the submit_transaction
+ if (op_info.allows_returnvec()) {
+ // also the per-op values are recorded in the pg log
+ log_entries.back().set_op_returns(ops);
+ logger().debug("{} op_returns: {}",
+ __func__, log_entries.back().op_returns);
+ }
+ log_entries.back().clean_regions = std::move(osd_op_p.clean_regions);
+ peering_state.pre_submit_op(obc->obs.oi.soid, log_entries, osd_op_p.at_version);
+ peering_state.append_log_with_trim_to_updated(std::move(log_entries), osd_op_p.at_version,
+ txn, true, false);
+
+ return backend->mutate_object(peering_state.get_acting_recovery_backfill(),
+ std::move(obc),
+ std::move(txn),
+ std::move(osd_op_p),
+ peering_state.get_last_peering_reset(),
+ map_epoch,
+ std::move(log_entries)).then(
+ [this, last_complete=peering_state.get_info().last_complete,
+ at_version=osd_op_p.at_version](auto acked) {
+ for (const auto& peer : acked) {
+ peering_state.update_peer_last_complete_ondisk(
+ peer.shard, peer.last_complete_ondisk);
+ }
+ peering_state.complete_write(at_version, last_complete);
+ return seastar::now();
+ });
+}
+
+osd_op_params_t&& PG::fill_op_params_bump_pg_version(
+ osd_op_params_t&& osd_op_p,
+ Ref<MOSDOp> m,
+ const bool user_modify)
+{
+ osd_op_p.req = std::move(m);
+ osd_op_p.at_version = next_version();
+ osd_op_p.pg_trim_to = get_pg_trim_to();
+ osd_op_p.min_last_complete_ondisk = get_min_last_complete_ondisk();
+ osd_op_p.last_complete = get_info().last_complete;
+ if (user_modify) {
+ osd_op_p.user_at_version = osd_op_p.at_version.version;
+ }
+ return std::move(osd_op_p);
+}
+
+seastar::future<Ref<MOSDOpReply>> PG::handle_failed_op(
+ const std::error_code& e,
+ ObjectContextRef obc,
+ const OpsExecuter& ox,
+ const MOSDOp& m) const
+{
+  // An operation has failed. do_osd_ops(), together with
+  // OpsExecuter, has already dropped the ObjectStore::Transaction
+  // if there was one. However, this is not enough for a complete
+  // rollback, as we handed OpsExecuter the single copy of `obc`
+  // we maintain, and we did so for both reading and writing.
+  // Now all modifications must be reverted.
+  //
+  // Let's just reload from the store. Evicting from the shared
+  // LRU would be tricky, as the next MOSDOp (the one at the
+  // `get_obc` phase) could already have finished the lookup.
+  // Fortunately, this is supposed to live on cold paths, so
+  // performance is not a concern -- simplicity wins.
+  //
+  // The conditional's purpose is to efficiently handle hot errors
+  // which may appear as a result of e.g. CEPH_OSD_OP_CMPXATTR or
+  // CEPH_OSD_OP_OMAP_CMP. These are read-like ops and clients
+  // typically place them before any write. If OpsExecuter hasn't
+  // seen any modifying operation, `obc` is supposed to be kept
+  // unchanged.
+ assert(e.value() > 0);
+ const bool need_reload_obc = ox.has_seen_write();
+ logger().debug(
+ "{}: {} - object {} got error code {}, {}; need_reload_obc {}",
+ __func__,
+ m,
+ obc->obs.oi.soid,
+ e.value(),
+ e.message(),
+ need_reload_obc);
+ return (need_reload_obc ? reload_obc(*obc)
+ : load_obc_ertr::now()
+ ).safe_then([&e, &m, obc = std::move(obc), this] {
+ auto reply = make_message<MOSDOpReply>(
+ &m, -e.value(), get_osdmap_epoch(), 0, false);
+ reply->set_enoent_reply_versions(
+ peering_state.get_info().last_update,
+ peering_state.get_info().last_user_version);
+ return seastar::make_ready_future<Ref<MOSDOpReply>>(std::move(reply));
+ }, load_obc_ertr::assert_all{ "can't live with object state messed up" });
+}
+
+seastar::future<Ref<MOSDOpReply>> PG::do_osd_ops(
+ Ref<MOSDOp> m,
+ ObjectContextRef obc,
+ const OpInfo &op_info)
+{
+ if (__builtin_expect(stopping, false)) {
+ throw crimson::common::system_shutdown_exception();
+ }
+
+ using osd_op_errorator = OpsExecuter::osd_op_errorator;
+ const auto oid = m->get_snapid() == CEPH_SNAPDIR ? m->get_hobj().get_head()
+ : m->get_hobj();
+ auto ox = std::make_unique<OpsExecuter>(
+ obc, op_info, get_pool().info, get_backend(), *m);
+ return crimson::do_for_each(
+ m->ops, [obc, m, ox = ox.get()](OSDOp& osd_op) {
+ logger().debug(
+ "do_osd_ops: {} - object {} - handling op {}",
+ *m,
+ obc->obs.oi.soid,
+ ceph_osd_op_name(osd_op.op.op));
+ return ox->execute_op(osd_op);
+ }).safe_then([this, obc, m, ox = ox.get(), &op_info] {
+ logger().debug(
+ "do_osd_ops: {} - object {} all operations successful",
+ *m,
+ obc->obs.oi.soid);
+ return std::move(*ox).flush_changes(
+ [m] (auto&& obc) -> osd_op_errorator::future<> {
+ logger().debug(
+ "do_osd_ops: {} - object {} txn is empty, bypassing mutate",
+ *m,
+ obc->obs.oi.soid);
+ return osd_op_errorator::now();
+ },
+ [this, m, &op_info] (auto&& txn,
+ auto&& obc,
+ auto&& osd_op_p,
+ bool user_modify) -> osd_op_errorator::future<> {
+ logger().debug(
+ "do_osd_ops: {} - object {} submitting txn",
+ *m,
+ obc->obs.oi.soid);
+ auto filled_osd_op_p = fill_op_params_bump_pg_version(
+ std::move(osd_op_p),
+ std::move(m),
+ user_modify);
+ return submit_transaction(
+ op_info,
+ filled_osd_op_p.req->ops,
+ std::move(obc),
+ std::move(txn),
+ std::move(filled_osd_op_p));
+ });
+ }).safe_then([this,
+ m,
+ obc,
+ rvec = op_info.allows_returnvec()] {
+ // TODO: should stop at the first op which returns a negative retval,
+ // cmpext uses it for returning the index of first unmatched byte
+ int result = m->ops.empty() ? 0 : m->ops.back().rval.code;
+ if (result > 0 && !rvec) {
+ result = 0;
+ }
+ auto reply = make_message<MOSDOpReply>(m.get(),
+ result,
+ get_osdmap_epoch(),
+ 0,
+ false);
+ reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
+ logger().debug(
+ "do_osd_ops: {} - object {} sending reply",
+ *m,
+ obc->obs.oi.soid);
+ return seastar::make_ready_future<Ref<MOSDOpReply>>(std::move(reply));
+ }, osd_op_errorator::all_same_way([ox = ox.get(),
+ m,
+ obc,
+ this] (const std::error_code& e) {
+ return handle_failed_op(e, std::move(obc), *ox, *m);
+ })).handle_exception_type([ox_deleter = std::move(ox),
+ m,
+ obc,
+ this] (const crimson::osd::error& e) {
+    // we need this handler because some throwing paths aren't errorated yet.
+ logger().debug("encountered the legacy error handling path!");
+ return handle_failed_op(e.code(), std::move(obc), *ox_deleter, *m);
+ });
+}
+
+seastar::future<Ref<MOSDOpReply>> PG::do_pg_ops(Ref<MOSDOp> m)
+{
+ if (__builtin_expect(stopping, false)) {
+ throw crimson::common::system_shutdown_exception();
+ }
+
+ auto ox = std::make_unique<PgOpsExecuter>(std::as_const(*this),
+ std::as_const(*m));
+ return seastar::do_for_each(m->ops, [ox = ox.get()](OSDOp& osd_op) {
+ logger().debug("will be handling pg op {}", ceph_osd_op_name(osd_op.op.op));
+ return ox->execute_op(osd_op);
+ }).then([m, this, ox = std::move(ox)] {
+ auto reply = make_message<MOSDOpReply>(m.get(), 0, get_osdmap_epoch(),
+ CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
+ false);
+ return seastar::make_ready_future<Ref<MOSDOpReply>>(std::move(reply));
+ }).handle_exception_type([=](const crimson::osd::error& e) {
+ auto reply = make_message<MOSDOpReply>(
+ m.get(), -e.code().value(), get_osdmap_epoch(), 0, false);
+ reply->set_enoent_reply_versions(peering_state.get_info().last_update,
+ peering_state.get_info().last_user_version);
+ return seastar::make_ready_future<Ref<MOSDOpReply>>(std::move(reply));
+ });
+}
+
+hobject_t PG::get_oid(const MOSDOp &m)
+{
+ return (m.get_snapid() == CEPH_SNAPDIR ?
+ m.get_hobj().get_head() :
+ m.get_hobj());
+}
+
+RWState::State PG::get_lock_type(const OpInfo &op_info)
+{
+
+ if (op_info.rwordered() && op_info.may_read()) {
+ return RWState::RWEXCL;
+ } else if (op_info.rwordered()) {
+ return RWState::RWWRITE;
+ } else {
+ ceph_assert(op_info.may_read());
+ return RWState::RWREAD;
+ }
+}
+
+std::optional<hobject_t> PG::resolve_oid(
+ const SnapSet &ss,
+ const hobject_t &oid)
+{
+ if (oid.snap > ss.seq) {
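+    // the requested snap is newer than the most recent snapshot of this
+    // object, so the data lives in the head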
+ return oid.get_head();
+ } else {
+ // which clone would it be?
+ auto clone = std::upper_bound(
+ begin(ss.clones), end(ss.clones),
+ oid.snap);
+ if (clone == end(ss.clones)) {
+ // Doesn't exist, > last clone, < ss.seq
+ return std::nullopt;
+ }
+ auto citer = ss.clone_snaps.find(*clone);
+ // TODO: how do we want to handle this kind of logic error?
+ ceph_assert(citer != ss.clone_snaps.end());
+
+ if (std::find(
+ citer->second.begin(),
+ citer->second.end(),
+ *clone) == citer->second.end()) {
+ return std::nullopt;
+ } else {
+ auto soid = oid;
+ soid.snap = *clone;
+ return std::optional<hobject_t>(soid);
+ }
+ }
+}
+
+template<RWState::State State>
+PG::load_obc_ertr::future<>
+PG::with_head_obc(hobject_t oid, with_obc_func_t&& func)
+{
+ assert(oid.is_head());
+ auto [obc, existed] = shard_services.obc_registry.get_cached_obc(oid);
+ return obc->with_lock<State>(
+ [oid=std::move(oid), existed=existed, obc=std::move(obc),
+ func=std::move(func), this] {
+ auto loaded = load_obc_ertr::make_ready_future<ObjectContextRef>(obc);
+ if (existed) {
+ logger().debug("with_head_obc: found {} in cache", oid);
+ } else {
+ logger().debug("with_head_obc: cache miss on {}", oid);
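+        // not in cache yet: promote the lock while loading the object state
+        // from the store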
+ loaded = obc->with_promoted_lock<State>([this, obc] {
+ return load_head_obc(obc);
+ });
+ }
+ return loaded.safe_then([func=std::move(func)](auto obc) {
+ return func(std::move(obc));
+ });
+ });
+}
+
+template<RWState::State State>
+PG::load_obc_ertr::future<>
+PG::with_clone_obc(hobject_t oid, with_obc_func_t&& func)
+{
+ assert(!oid.is_head());
+ return with_head_obc<RWState::RWREAD>(oid.get_head(),
+ [oid, func=std::move(func), this](auto head) -> load_obc_ertr::future<> {
+ auto coid = resolve_oid(head->get_ro_ss(), oid);
+ if (!coid) {
+ // TODO: return crimson::ct_error::enoent::make();
+      logger().error("with_clone_obc: {} clone not found", oid);
+ return load_obc_ertr::make_ready_future<>();
+ }
+ auto [clone, existed] = shard_services.obc_registry.get_cached_obc(*coid);
+ return clone->template with_lock<State>(
+ [coid=*coid, existed=existed,
+ head=std::move(head), clone=std::move(clone),
+ func=std::move(func), this]() -> load_obc_ertr::future<> {
+ auto loaded = load_obc_ertr::make_ready_future<ObjectContextRef>(clone);
+ if (existed) {
+ logger().debug("with_clone_obc: found {} in cache", coid);
+ } else {
+ logger().debug("with_clone_obc: cache miss on {}", coid);
+ loaded = clone->template with_promoted_lock<State>(
+ [coid, clone, head, this] {
+ return backend->load_metadata(coid).safe_then(
+ [coid, clone=std::move(clone), head=std::move(head)](auto md) mutable {
+ clone->set_clone_state(std::move(md->os), std::move(head));
+ return clone;
+ });
+ });
+ }
+ return loaded.safe_then([func=std::move(func)](auto clone) {
+ return func(std::move(clone));
+ });
+ });
+ });
+}
+
+// explicitly instantiate the template specializations that are used
+template PG::load_obc_ertr::future<>
+PG::with_head_obc<RWState::RWNONE>(hobject_t, with_obc_func_t&&);
+
+PG::load_obc_ertr::future<crimson::osd::ObjectContextRef>
+PG::load_head_obc(ObjectContextRef obc)
+{
+ hobject_t oid = obc->get_oid();
+ return backend->load_metadata(oid).safe_then([obc=std::move(obc)](auto md)
+ -> load_obc_ertr::future<crimson::osd::ObjectContextRef> {
+ const hobject_t& oid = md->os.oi.soid;
+ logger().debug(
+ "load_head_obc: loaded obs {} for {}", md->os.oi, oid);
+ if (!md->ss) {
+ logger().error(
+ "load_head_obc: oid {} missing snapset", oid);
+ return crimson::ct_error::object_corrupted::make();
+ }
+ obc->set_head_state(std::move(md->os), std::move(*(md->ss)));
+ logger().debug(
+ "load_head_obc: returning obc {} for {}",
+ obc->obs.oi, obc->obs.oi.soid);
+ return load_obc_ertr::make_ready_future<
+ crimson::osd::ObjectContextRef>(obc);
+ });
+}
+
+PG::load_obc_ertr::future<>
+PG::reload_obc(crimson::osd::ObjectContext& obc) const
+{
+ assert(obc.is_head());
+ return backend->load_metadata(obc.get_oid()).safe_then([&obc](auto md)
+ -> load_obc_ertr::future<> {
+ logger().debug(
+ "{}: reloaded obs {} for {}",
+ __func__,
+ md->os.oi,
+ obc.get_oid());
+ if (!md->ss) {
+ logger().error(
+ "{}: oid {} missing snapset",
+ __func__,
+ obc.get_oid());
+ return crimson::ct_error::object_corrupted::make();
+ }
+ obc.set_head_state(std::move(md->os), std::move(*(md->ss)));
+ return load_obc_ertr::now();
+ });
+}
+
+PG::load_obc_ertr::future<>
+PG::with_locked_obc(Ref<MOSDOp> &m, const OpInfo &op_info,
+ Operation *op, PG::with_obc_func_t &&f)
+{
+ if (__builtin_expect(stopping, false)) {
+ throw crimson::common::system_shutdown_exception();
+ }
+ const hobject_t oid = get_oid(*m);
+ switch (get_lock_type(op_info)) {
+ case RWState::RWREAD:
+ if (oid.is_head()) {
+ return with_head_obc<RWState::RWREAD>(oid, std::move(f));
+ } else {
+ return with_clone_obc<RWState::RWREAD>(oid, std::move(f));
+ }
+ case RWState::RWWRITE:
+ if (oid.is_head()) {
+ return with_head_obc<RWState::RWWRITE>(oid, std::move(f));
+ } else {
+ return with_clone_obc<RWState::RWWRITE>(oid, std::move(f));
+ }
+ case RWState::RWEXCL:
+ if (oid.is_head()) {
+ return with_head_obc<RWState::RWWRITE>(oid, std::move(f));
+ } else {
+ return with_clone_obc<RWState::RWWRITE>(oid, std::move(f));
+ }
+ default:
+ ceph_abort();
+ };
+}
+
+seastar::future<> PG::handle_rep_op(Ref<MOSDRepOp> req)
+{
+ if (__builtin_expect(stopping, false)) {
+ return seastar::make_exception_future<>(
+ crimson::common::system_shutdown_exception());
+ }
+
+ if (can_discard_replica_op(*req)) {
+ return seastar::now();
+ }
+
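+  // decode the transaction and log entries shipped by the primary, apply
+  // them locally, and acknowledge with our last_complete_ondisk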
+ ceph::os::Transaction txn;
+ auto encoded_txn = req->get_data().cbegin();
+ decode(txn, encoded_txn);
+ auto p = req->logbl.cbegin();
+ std::vector<pg_log_entry_t> log_entries;
+ decode(log_entries, p);
+ peering_state.append_log(std::move(log_entries), req->pg_trim_to,
+ req->version, req->min_last_complete_ondisk, txn, !txn.empty(), false);
+ return shard_services.get_store().do_transaction(coll_ref, std::move(txn))
+ .then([req, lcod=peering_state.get_info().last_complete, this] {
+ peering_state.update_last_complete_ondisk(lcod);
+ const auto map_epoch = get_osdmap_epoch();
+ auto reply = make_message<MOSDRepOpReply>(
+ req.get(), pg_whoami, 0,
+ map_epoch, req->get_min_epoch(), CEPH_OSD_FLAG_ONDISK);
+ reply->set_last_complete_ondisk(lcod);
+ return shard_services.send_to_osd(req->from.osd, reply, map_epoch);
+ });
+}
+
+void PG::handle_rep_op_reply(crimson::net::ConnectionRef conn,
+ const MOSDRepOpReply& m)
+{
+ if (!can_discard_replica_op(m)) {
+ backend->got_rep_op_reply(m);
+ }
+}
+
+template <typename MsgType>
+bool PG::can_discard_replica_op(const MsgType& m) const
+{
+ // if a repop is replied after a replica goes down in a new osdmap, and
+ // before the pg advances to this new osdmap, the repop replies before this
+ // repop can be discarded by that replica OSD, because the primary resets the
+ // connection to it when handling the new osdmap marking it down, and also
+  // resets the messenger session when the replica reconnects. to avoid the
+ // out-of-order replies, the messages from that replica should be discarded.
+ const auto osdmap = peering_state.get_osdmap();
+ const int from_osd = m.get_source().num();
+ if (osdmap->is_down(from_osd)) {
+ return true;
+ }
+ // Mostly, this overlaps with the old_peering_msg
+ // condition. An important exception is pushes
+ // sent by replicas not in the acting set, since
+ // if such a replica goes down it does not cause
+ // a new interval.
+ if (osdmap->get_down_at(from_osd) >= m.map_epoch) {
+ return true;
+ }
+ // same pg?
+ // if pg changes *at all*, we reset and repeer!
+ if (epoch_t lpr = peering_state.get_last_peering_reset();
+ lpr > m.map_epoch) {
+ logger().debug("{}: pg changed {} after {}, dropping",
+ __func__, get_info().history, m.map_epoch);
+ return true;
+ }
+ return false;
+}
+
+seastar::future<> PG::stop()
+{
+ logger().info("PG {} {}", pgid, __func__);
+ stopping = true;
+ return osdmap_gate.stop().then([this] {
+ return wait_for_active_blocker.stop();
+ }).then([this] {
+ return recovery_handler->stop();
+ }).then([this] {
+ return recovery_backend->stop();
+ }).then([this] {
+ return backend->stop();
+ });
+}
+
+void PG::on_change(ceph::os::Transaction &t) {
+ recovery_backend->on_peering_interval_change(t);
+ backend->on_actingset_changed({ is_primary() });
+}
+
+bool PG::can_discard_op(const MOSDOp& m) const {
+ return __builtin_expect(m.get_map_epoch()
+ < peering_state.get_info().history.same_primary_since, false);
+}
+
+bool PG::is_degraded_or_backfilling_object(const hobject_t& soid) const {
+ /* The conditions below may clear (on_local_recover, before we queue
+ * the transaction) before we actually requeue the degraded waiters
+ * in on_global_recover after the transaction completes.
+ */
+ if (peering_state.get_pg_log().get_missing().get_items().count(soid))
+ return true;
+ ceph_assert(!get_acting_recovery_backfill().empty());
+ for (auto& peer : get_acting_recovery_backfill()) {
+ if (peer == get_primary()) continue;
+ auto peer_missing_entry = peering_state.get_peer_missing().find(peer);
+ // If an object is missing on an async_recovery_target, return false.
+ // This will not block the op and the object is async recovered later.
+ if (peer_missing_entry != peering_state.get_peer_missing().end() &&
+ peer_missing_entry->second.get_items().count(soid)) {
+ return true;
+ }
+ // Object is degraded if after last_backfill AND
+ // we are backfilling it
+ if (is_backfill_target(peer) &&
+ peering_state.get_peer_info(peer).last_backfill <= soid &&
+ recovery_handler->backfill_state->get_last_backfill_started() >= soid &&
+ recovery_backend->is_recovering(soid)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+}
diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h
new file mode 100644
index 000000000..34676ee7a
--- /dev/null
+++ b/src/crimson/osd/pg.h
@@ -0,0 +1,704 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <memory>
+#include <optional>
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <boost/smart_ptr/local_shared_ptr.hpp>
+#include <seastar/core/future.hh>
+#include <seastar/core/shared_future.hh>
+#include <seastar/core/sleep.hh>
+
+#include "common/dout.h"
+#include "crimson/net/Fwd.h"
+#include "messages/MOSDRepOpReply.h"
+#include "messages/MOSDOpReply.h"
+#include "os/Transaction.h"
+#include "osd/osd_types.h"
+#include "crimson/osd/object_context.h"
+#include "osd/PeeringState.h"
+
+#include "crimson/common/type_helpers.h"
+#include "crimson/os/futurized_collection.h"
+#include "crimson/osd/backfill_state.h"
+#include "crimson/osd/osd_operations/client_request.h"
+#include "crimson/osd/osd_operations/peering_event.h"
+#include "crimson/osd/osd_operations/replicated_request.h"
+#include "crimson/osd/osd_operations/background_recovery.h"
+#include "crimson/osd/shard_services.h"
+#include "crimson/osd/osdmap_gate.h"
+#include "crimson/osd/pg_recovery.h"
+#include "crimson/osd/pg_recovery_listener.h"
+#include "crimson/osd/recovery_backend.h"
+
+class MQuery;
+class OSDMap;
+class PGBackend;
+class PGPeeringEvent;
+class osd_op_params_t;
+
+namespace recovery {
+ class Context;
+}
+
+namespace crimson::net {
+ class Messenger;
+}
+
+namespace crimson::os {
+ class FuturizedStore;
+}
+
+namespace crimson::osd {
+class ClientRequest;
+class OpsExecuter;
+
+class PG : public boost::intrusive_ref_counter<
+ PG,
+ boost::thread_unsafe_counter>,
+ public PGRecoveryListener,
+ PeeringState::PeeringListener,
+ DoutPrefixProvider
+{
+ using ec_profile_t = std::map<std::string,std::string>;
+ using cached_map_t = boost::local_shared_ptr<const OSDMap>;
+
+ ClientRequest::PGPipeline client_request_pg_pipeline;
+ PeeringEvent::PGPipeline peering_request_pg_pipeline;
+ RepRequest::PGPipeline replicated_request_pg_pipeline;
+
+ spg_t pgid;
+ pg_shard_t pg_whoami;
+ crimson::os::CollectionRef coll_ref;
+ ghobject_t pgmeta_oid;
+
+ seastar::timer<seastar::lowres_clock> check_readable_timer;
+ seastar::timer<seastar::lowres_clock> renew_lease_timer;
+
+public:
+ PG(spg_t pgid,
+ pg_shard_t pg_shard,
+ crimson::os::CollectionRef coll_ref,
+ pg_pool_t&& pool,
+ std::string&& name,
+ cached_map_t osdmap,
+ ShardServices &shard_services,
+ ec_profile_t profile);
+
+ ~PG();
+
+ const pg_shard_t& get_pg_whoami() const final {
+ return pg_whoami;
+ }
+
+ const spg_t& get_pgid() const final {
+ return pgid;
+ }
+
+ PGBackend& get_backend() {
+ return *backend;
+ }
+ const PGBackend& get_backend() const {
+ return *backend;
+ }
+ // EpochSource
+ epoch_t get_osdmap_epoch() const final {
+ return peering_state.get_osdmap_epoch();
+ }
+
+ eversion_t get_pg_trim_to() const {
+ return peering_state.get_pg_trim_to();
+ }
+
+ eversion_t get_min_last_complete_ondisk() const {
+ return peering_state.get_min_last_complete_ondisk();
+ }
+
+ const pg_info_t& get_info() const final {
+ return peering_state.get_info();
+ }
+
+ // DoutPrefixProvider
+ std::ostream& gen_prefix(std::ostream& out) const final {
+ return out << *this;
+ }
+ crimson::common::CephContext *get_cct() const final {
+ return shard_services.get_cct();
+ }
+ unsigned get_subsys() const final {
+ return ceph_subsys_osd;
+ }
+
+ crimson::os::CollectionRef get_collection_ref() {
+ return coll_ref;
+ }
+
+ // PeeringListener
+ void prepare_write(
+ pg_info_t &info,
+ pg_info_t &last_written_info,
+ PastIntervals &past_intervals,
+ PGLog &pglog,
+ bool dirty_info,
+ bool dirty_big_info,
+ bool need_write_epoch,
+ ceph::os::Transaction &t) final;
+
+ void on_info_history_change() final {
+ // Not needed yet -- mainly for scrub scheduling
+ }
+
+ void scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type) final;
+
+ uint64_t get_snap_trimq_size() const final {
+ return 0;
+ }
+
+ void send_cluster_message(
+ int osd, MessageRef m,
+ epoch_t epoch, bool share_map_update=false) final {
+ (void)shard_services.send_to_osd(osd, m, epoch);
+ }
+
+ void send_pg_created(pg_t pgid) final {
+ (void)shard_services.send_pg_created(pgid);
+ }
+
+ bool try_flush_or_schedule_async() final;
+
+ void start_flush_on_transaction(
+ ceph::os::Transaction &t) final {
+ t.register_on_commit(
+ new LambdaContext([this](int r){
+ peering_state.complete_flush();
+ }));
+ }
+
+ void on_flushed() final {
+ // will be needed for unblocking IO operations/peering
+ }
+
+ template <typename T>
+ void start_peering_event_operation(T &&evt, float delay = 0) {
+ (void) shard_services.start_operation<LocalPeeringEvent>(
+ this,
+ shard_services,
+ pg_whoami,
+ pgid,
+ delay,
+ std::forward<T>(evt));
+ }
+
+ void schedule_event_after(
+ PGPeeringEventRef event,
+ float delay) final {
+ start_peering_event_operation(std::move(*event), delay);
+ }
+ std::vector<pg_shard_t> get_replica_recovery_order() const final {
+ return peering_state.get_replica_recovery_order();
+ }
+ void request_local_background_io_reservation(
+ unsigned priority,
+ PGPeeringEventURef on_grant,
+ PGPeeringEventURef on_preempt) final {
+ shard_services.local_reserver.request_reservation(
+ pgid,
+ on_grant ? make_lambda_context([this, on_grant=std::move(on_grant)] (int) {
+ start_peering_event_operation(std::move(*on_grant));
+ }) : nullptr,
+ priority,
+ on_preempt ? make_lambda_context(
+ [this, on_preempt=std::move(on_preempt)] (int) {
+ start_peering_event_operation(std::move(*on_preempt));
+ }) : nullptr);
+ }
+
+ void update_local_background_io_priority(
+ unsigned priority) final {
+ shard_services.local_reserver.update_priority(
+ pgid,
+ priority);
+ }
+
+ void cancel_local_background_io_reservation() final {
+ shard_services.local_reserver.cancel_reservation(
+ pgid);
+ }
+
+ void request_remote_recovery_reservation(
+ unsigned priority,
+ PGPeeringEventURef on_grant,
+ PGPeeringEventURef on_preempt) final {
+ shard_services.remote_reserver.request_reservation(
+ pgid,
+ on_grant ? make_lambda_context([this, on_grant=std::move(on_grant)] (int) {
+ start_peering_event_operation(std::move(*on_grant));
+ }) : nullptr,
+ priority,
+ on_preempt ? make_lambda_context(
+ [this, on_preempt=std::move(on_preempt)] (int) {
+ start_peering_event_operation(std::move(*on_preempt));
+ }) : nullptr);
+ }
+
+ void cancel_remote_recovery_reservation() final {
+ shard_services.remote_reserver.cancel_reservation(
+ pgid);
+ }
+
+ void schedule_event_on_commit(
+ ceph::os::Transaction &t,
+ PGPeeringEventRef on_commit) final {
+ t.register_on_commit(
+ make_lambda_context(
+ [this, on_commit=std::move(on_commit)](int) {
+ start_peering_event_operation(std::move(*on_commit));
+ }));
+ }
+
+ void update_heartbeat_peers(set<int> peers) final {
+ // Not needed yet
+ }
+ void set_probe_targets(const set<pg_shard_t> &probe_set) final {
+ // Not needed yet
+ }
+ void clear_probe_targets() final {
+ // Not needed yet
+ }
+ void queue_want_pg_temp(const std::vector<int> &wanted) final {
+ shard_services.queue_want_pg_temp(pgid.pgid, wanted);
+ }
+ void clear_want_pg_temp() final {
+ shard_services.remove_want_pg_temp(pgid.pgid);
+ }
+ void publish_stats_to_osd() final {
+ if (!is_primary())
+ return;
+
+ (void) peering_state.prepare_stats_for_publish(
+ false,
+ pg_stat_t(),
+ object_stat_collection_t());
+ }
+ void clear_publish_stats() final {
+ // Not needed yet
+ }
+ void check_recovery_sources(const OSDMapRef& newmap) final {
+ // Not needed yet
+ }
+ void check_blocklisted_watchers() final {
+ // Not needed yet
+ }
+ void clear_primary_state() final {
+ // Not needed yet
+ }
+
+ void queue_check_readable(epoch_t last_peering_reset,
+ ceph::timespan delay) final;
+ void recheck_readable() final;
+
+ unsigned get_target_pg_log_entries() const final;
+
+ void on_pool_change() final {
+ // Not needed yet
+ }
+ void on_role_change() final {
+ // Not needed yet
+ }
+ void on_change(ceph::os::Transaction &t) final;
+ void on_activate(interval_set<snapid_t> to_trim) final;
+ void on_activate_complete() final;
+ void on_new_interval() final {
+ // Not needed yet
+ }
+ Context *on_clean() final {
+ // Not needed yet (will be needed for IO unblocking)
+ return nullptr;
+ }
+ void on_activate_committed() final {
+ // Not needed yet (will be needed for IO unblocking)
+ }
+ void on_active_exit() final {
+ // Not needed yet
+ }
+
+ void on_removal(ceph::os::Transaction &t) final {
+ // TODO
+ }
+ std::pair<ghobject_t, bool>
+ do_delete_work(ceph::os::Transaction &t, ghobject_t _next) final;
+
+ // merge/split not ready
+ void clear_ready_to_merge() final {}
+ void set_not_ready_to_merge_target(pg_t pgid, pg_t src) final {}
+ void set_not_ready_to_merge_source(pg_t pgid) final {}
+ void set_ready_to_merge_target(eversion_t lu, epoch_t les, epoch_t lec) final {}
+ void set_ready_to_merge_source(eversion_t lu) final {}
+
+ void on_active_actmap() final {
+ // Not needed yet
+ }
+ void on_active_advmap(const OSDMapRef &osdmap) final {
+ // Not needed yet
+ }
+ epoch_t oldest_stored_osdmap() final {
+ // TODO
+ return 0;
+ }
+
+ void on_backfill_reserved() final {
+ recovery_handler->on_backfill_reserved();
+ }
+ void on_backfill_canceled() final {
+ ceph_assert(0 == "Not implemented");
+ }
+
+ void on_recovery_reserved() final {
+ recovery_handler->start_pglogbased_recovery();
+ }
+
+
+ bool try_reserve_recovery_space(
+ int64_t primary_num_bytes, int64_t local_num_bytes) final {
+ // TODO
+ return true;
+ }
+ void unreserve_recovery_space() final {}
+
+ struct PGLogEntryHandler : public PGLog::LogEntryHandler {
+ PG *pg;
+ ceph::os::Transaction *t;
+ PGLogEntryHandler(PG *pg, ceph::os::Transaction *t) : pg(pg), t(t) {}
+
+ // LogEntryHandler
+ void remove(const hobject_t &hoid) override {
+ // TODO
+ }
+ void try_stash(const hobject_t &hoid, version_t v) override {
+ // TODO
+ }
+ void rollback(const pg_log_entry_t &entry) override {
+ // TODO
+ }
+ void rollforward(const pg_log_entry_t &entry) override {
+ // TODO
+ }
+ void trim(const pg_log_entry_t &entry) override {
+ // TODO
+ }
+ };
+ PGLog::LogEntryHandlerRef get_log_handler(
+ ceph::os::Transaction &t) final {
+ return std::make_unique<PG::PGLogEntryHandler>(this, &t);
+ }
+
+ void rebuild_missing_set_with_deletes(PGLog &pglog) final {
+ ceph_assert(0 == "Impossible for crimson");
+ }
+
+ PerfCounters &get_peering_perf() final {
+ return shard_services.get_recoverystate_perf_logger();
+ }
+ PerfCounters &get_perf_logger() final {
+ return shard_services.get_perf_logger();
+ }
+
+ void log_state_enter(const char *state) final;
+ void log_state_exit(
+ const char *state_name, utime_t enter_time,
+ uint64_t events, utime_t event_dur) final;
+
+ void dump_recovery_info(Formatter *f) const final {
+ }
+
+ OstreamTemp get_clog_info() final {
+ // not needed yet: stub to be replaced once wired up to monc
+ return OstreamTemp(CLOG_INFO, nullptr);
+ }
+ OstreamTemp get_clog_debug() final {
+ // not needed yet: stub to be replaced once wired up to monc
+ return OstreamTemp(CLOG_DEBUG, nullptr);
+ }
+ OstreamTemp get_clog_error() final {
+ // not needed yet: stub to be replaced once wired up to monc
+ return OstreamTemp(CLOG_ERROR, nullptr);
+ }
+
+ ceph::signedspan get_mnow() final;
+ HeartbeatStampsRef get_hb_stamps(int peer) final;
+ void schedule_renew_lease(epoch_t plr, ceph::timespan delay) final;
+
+
+ // Utility
+ bool is_primary() const final {
+ return peering_state.is_primary();
+ }
+ bool is_nonprimary() const {
+ return peering_state.is_nonprimary();
+ }
+ bool is_peered() const final {
+ return peering_state.is_peered();
+ }
+ bool is_recovering() const final {
+ return peering_state.is_recovering();
+ }
+ bool is_backfilling() const final {
+ return peering_state.is_backfilling();
+ }
+ pg_stat_t get_stats() {
+ auto stats = peering_state.prepare_stats_for_publish(
+ false,
+ pg_stat_t(),
+ object_stat_collection_t());
+ ceph_assert(stats);
+ return *stats;
+ }
+ bool get_need_up_thru() const {
+ return peering_state.get_need_up_thru();
+ }
+ epoch_t get_same_interval_since() const {
+ return get_info().history.same_interval_since;
+ }
+
+ const auto& get_pool() const {
+ return peering_state.get_pool();
+ }
+ pg_shard_t get_primary() const {
+ return peering_state.get_primary();
+ }
+
+ /// initialize created PG
+ void init(
+ int role,
+ const std::vector<int>& up,
+ int up_primary,
+ const std::vector<int>& acting,
+ int acting_primary,
+ const pg_history_t& history,
+ const PastIntervals& pim,
+ bool backfill,
+ ceph::os::Transaction &t);
+
+ seastar::future<> read_state(crimson::os::FuturizedStore* store);
+
+ void do_peering_event(
+ PGPeeringEvent& evt, PeeringCtx &rctx);
+
+ void handle_advance_map(cached_map_t next_map, PeeringCtx &rctx);
+ void handle_activate_map(PeeringCtx &rctx);
+ void handle_initialize(PeeringCtx &rctx);
+
+ static hobject_t get_oid(const MOSDOp &m);
+ static RWState::State get_lock_type(const OpInfo &op_info);
+ static std::optional<hobject_t> resolve_oid(
+ const SnapSet &snapset,
+ const hobject_t &oid);
+
+ using load_obc_ertr = crimson::errorator<
+ crimson::ct_error::object_corrupted>;
+
+ load_obc_ertr::future<crimson::osd::ObjectContextRef>
+ load_head_obc(ObjectContextRef obc);
+
+ load_obc_ertr::future<>
+ reload_obc(crimson::osd::ObjectContext& obc) const;
+
+public:
+ using with_obc_func_t =
+ std::function<load_obc_ertr::future<> (ObjectContextRef)>;
+
+ template<RWState::State State>
+ load_obc_ertr::future<> with_head_obc(hobject_t oid, with_obc_func_t&& func);
+
+ load_obc_ertr::future<> with_locked_obc(
+ Ref<MOSDOp> &m,
+ const OpInfo &op_info,
+ Operation *op,
+ with_obc_func_t&& f);
+
+ seastar::future<> handle_rep_op(Ref<MOSDRepOp> m);
+ void handle_rep_op_reply(crimson::net::ConnectionRef conn,
+ const MOSDRepOpReply& m);
+
+ void print(std::ostream& os) const;
+ void dump_primary(Formatter*);
+
+private:
+ template<RWState::State State>
+ load_obc_ertr::future<> with_clone_obc(hobject_t oid, with_obc_func_t&& func);
+
+ load_obc_ertr::future<ObjectContextRef> get_locked_obc(
+ Operation *op,
+ const hobject_t &oid,
+ RWState::State type);
+
+ void do_peering_event(
+ const boost::statechart::event_base &evt,
+ PeeringCtx &rctx);
+ osd_op_params_t&& fill_op_params_bump_pg_version(
+ osd_op_params_t&& osd_op_p,
+ Ref<MOSDOp> m,
+ const bool user_modify);
+ seastar::future<Ref<MOSDOpReply>> handle_failed_op(
+ const std::error_code& e,
+ ObjectContextRef obc,
+ const OpsExecuter& ox,
+ const MOSDOp& m) const;
+ seastar::future<Ref<MOSDOpReply>> do_osd_ops(
+ Ref<MOSDOp> m,
+ ObjectContextRef obc,
+ const OpInfo &op_info);
+ seastar::future<Ref<MOSDOpReply>> do_pg_ops(Ref<MOSDOp> m);
+ seastar::future<> submit_transaction(const OpInfo& op_info,
+ const std::vector<OSDOp>& ops,
+ ObjectContextRef&& obc,
+ ceph::os::Transaction&& txn,
+ const osd_op_params_t& oop);
+
+private:
+ OSDMapGate osdmap_gate;
+ ShardServices &shard_services;
+
+ cached_map_t osdmap;
+
+public:
+ cached_map_t get_osdmap() { return osdmap; }
+ eversion_t next_version() {
+ return eversion_t(get_osdmap_epoch(),
+ ++projected_last_update.version);
+ }
+ ShardServices& get_shard_services() final {
+ return shard_services;
+ }
+ seastar::future<> stop();
+
+private:
+ std::unique_ptr<PGBackend> backend;
+ std::unique_ptr<RecoveryBackend> recovery_backend;
+ std::unique_ptr<PGRecovery> recovery_handler;
+
+ PeeringState peering_state;
+ eversion_t projected_last_update;
+public:
+ RecoveryBackend* get_recovery_backend() final {
+ return recovery_backend.get();
+ }
+ PGRecovery* get_recovery_handler() final {
+ return recovery_handler.get();
+ }
+ PeeringState& get_peering_state() final {
+ return peering_state;
+ }
+ bool has_reset_since(epoch_t epoch) const final {
+ return peering_state.pg_has_reset_since(epoch);
+ }
+
+ const pg_missing_tracker_t& get_local_missing() const {
+ return peering_state.get_pg_log().get_missing();
+ }
+ epoch_t get_last_peering_reset() const final {
+ return peering_state.get_last_peering_reset();
+ }
+ const set<pg_shard_t> &get_acting_recovery_backfill() const {
+ return peering_state.get_acting_recovery_backfill();
+ }
+ bool is_backfill_target(pg_shard_t osd) const {
+ return peering_state.is_backfill_target(osd);
+ }
+ void begin_peer_recover(pg_shard_t peer, const hobject_t oid) {
+ peering_state.begin_peer_recover(peer, oid);
+ }
+ uint64_t min_peer_features() const {
+ return peering_state.get_min_peer_features();
+ }
+ const map<hobject_t, set<pg_shard_t>>&
+ get_missing_loc_shards() const {
+ return peering_state.get_missing_loc().get_missing_locs();
+ }
+ const map<pg_shard_t, pg_missing_t> &get_shard_missing() const {
+ return peering_state.get_peer_missing();
+ }
+ const pg_missing_const_i* get_shard_missing(pg_shard_t shard) const {
+ if (shard == pg_whoami)
+ return &get_local_missing();
+ else {
+ auto it = peering_state.get_peer_missing().find(shard);
+ if (it == peering_state.get_peer_missing().end())
+ return nullptr;
+ else
+ return &it->second;
+ }
+ }
+ int get_recovery_op_priority() const {
+ int64_t pri = 0;
+ get_pool().info.opts.get(pool_opts_t::RECOVERY_OP_PRIORITY, &pri);
+ return pri > 0 ? pri : crimson::common::local_conf()->osd_recovery_op_priority;
+ }
+ seastar::future<> mark_unfound_lost(int) {
+ // TODO: see PrimaryLogPG::mark_all_unfound_lost()
+ return seastar::now();
+ }
+
+private:
+ // instead of seastar::gate, we use a boolean flag to indicate
+ // whether the system is shutting down, as we don't need to track
+ // continuations here.
+ bool stopping = false;
+
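+ // blocks incoming client operations until this PG has reached the active state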
+ class WaitForActiveBlocker : public BlockerT<WaitForActiveBlocker> {
+ PG *pg;
+
+ const spg_t pgid;
+ seastar::shared_promise<> p;
+
+ protected:
+ void dump_detail(Formatter *f) const;
+
+ public:
+ static constexpr const char *type_name = "WaitForActiveBlocker";
+
+ WaitForActiveBlocker(PG *pg) : pg(pg) {}
+ void on_active();
+ blocking_future<> wait();
+ seastar::future<> stop();
+ } wait_for_active_blocker;
+
+ friend std::ostream& operator<<(std::ostream&, const PG& pg);
+ friend class ClientRequest;
+ friend class PGAdvanceMap;
+ friend class PeeringEvent;
+ friend class RepRequest;
+ friend class BackfillRecovery;
+ friend struct PGFacade;
+private:
+ seastar::future<bool> find_unfound() {
+ return seastar::make_ready_future<bool>(true);
+ }
+
+ template <typename MsgType>
+ bool can_discard_replica_op(const MsgType& m) const;
+ bool can_discard_op(const MOSDOp& m) const;
+ bool is_missing_object(const hobject_t& soid) const {
+ return peering_state.get_pg_log().get_missing().get_items().count(soid);
+ }
+ bool is_unreadable_object(const hobject_t &oid,
+ eversion_t* v = 0) const final {
+ return is_missing_object(oid) ||
+ !peering_state.get_missing_loc().readable_with_acting(
+ oid, get_actingset(), v);
+ }
+ bool is_degraded_or_backfilling_object(const hobject_t& soid) const;
+ const set<pg_shard_t> &get_actingset() const {
+ return peering_state.get_actingset();
+ }
+
+private:
+ BackfillRecovery::BackfillRecoveryPipeline backfill_pipeline;
+};
+
+std::ostream& operator<<(std::ostream&, const PG& pg);
+
+}
diff --git a/src/crimson/osd/pg_backend.cc b/src/crimson/osd/pg_backend.cc
new file mode 100644
index 000000000..38dbdbf41
--- /dev/null
+++ b/src/crimson/osd/pg_backend.cc
@@ -0,0 +1,1171 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "pg_backend.h"
+
+#include <optional>
+#include <boost/range/adaptor/filtered.hpp>
+#include <boost/range/adaptor/transformed.hpp>
+#include <boost/range/algorithm/copy.hpp>
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+#include <seastar/core/print.hh>
+
+#include "messages/MOSDOp.h"
+#include "os/Transaction.h"
+#include "common/Checksummer.h"
+#include "common/Clock.h"
+
+#include "crimson/common/exception.h"
+#include "crimson/os/futurized_collection.h"
+#include "crimson/os/futurized_store.h"
+#include "crimson/osd/osd_operation.h"
+#include "replicated_backend.h"
+#include "replicated_recovery_backend.h"
+#include "ec_backend.h"
+#include "exceptions.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+using crimson::common::local_conf;
+
+std::unique_ptr<PGBackend>
+PGBackend::create(pg_t pgid,
+ const pg_shard_t pg_shard,
+ const pg_pool_t& pool,
+ crimson::os::CollectionRef coll,
+ crimson::osd::ShardServices& shard_services,
+ const ec_profile_t& ec_profile)
+{
+ switch (pool.type) {
+ case pg_pool_t::TYPE_REPLICATED:
+ return std::make_unique<ReplicatedBackend>(pgid, pg_shard,
+ coll, shard_services);
+ case pg_pool_t::TYPE_ERASURE:
+ return std::make_unique<ECBackend>(pg_shard.shard, coll, shard_services,
+ std::move(ec_profile),
+ pool.stripe_width);
+ default:
+ throw runtime_error(seastar::format("unsupported pool type '{}'",
+ pool.type));
+ }
+}
+
+PGBackend::PGBackend(shard_id_t shard,
+ CollectionRef coll,
+ crimson::os::FuturizedStore* store)
+ : shard{shard},
+ coll{coll},
+ store{store}
+{}
+
+PGBackend::load_metadata_ertr::future<PGBackend::loaded_object_md_t::ref>
+PGBackend::load_metadata(const hobject_t& oid)
+{
+ if (__builtin_expect(stopping, false)) {
+ throw crimson::common::system_shutdown_exception();
+ }
+
+ return store->get_attrs(
+ coll,
+ ghobject_t{oid, ghobject_t::NO_GEN, shard}).safe_then(
+ [oid](auto &&attrs) -> load_metadata_ertr::future<loaded_object_md_t::ref>{
+ loaded_object_md_t::ref ret(new loaded_object_md_t());
+ if (auto oiiter = attrs.find(OI_ATTR); oiiter != attrs.end()) {
+ bufferlist bl;
+ bl.push_back(std::move(oiiter->second));
+ ret->os = ObjectState(
+ object_info_t(bl),
+ true);
+ } else {
+ logger().error(
+ "load_metadata: object {} present but missing object info",
+ oid);
+ return crimson::ct_error::object_corrupted::make();
+ }
+
+ if (oid.is_head()) {
+ if (auto ssiter = attrs.find(SS_ATTR); ssiter != attrs.end()) {
+ bufferlist bl;
+ bl.push_back(std::move(ssiter->second));
+ ret->ss = SnapSet(bl);
+ } else {
+ /* TODO: add support for writing out snapsets
+ logger().error(
+ "load_metadata: object {} present but missing snapset",
+ oid);
+ //return crimson::ct_error::object_corrupted::make();
+ */
+ ret->ss = SnapSet();
+ }
+ }
+
+ return load_metadata_ertr::make_ready_future<loaded_object_md_t::ref>(
+ std::move(ret));
+ }, crimson::ct_error::enoent::handle([oid] {
+ logger().debug(
+ "load_metadata: object {} doesn't exist, returning empty metadata",
+ oid);
+ return load_metadata_ertr::make_ready_future<loaded_object_md_t::ref>(
+ new loaded_object_md_t{
+ ObjectState(
+ object_info_t(oid),
+ false),
+ oid.is_head() ? std::optional<SnapSet>(SnapSet()) : std::nullopt
+ });
+ }));
+}
+
+seastar::future<crimson::osd::acked_peers_t>
+PGBackend::mutate_object(
+ std::set<pg_shard_t> pg_shards,
+ crimson::osd::ObjectContextRef &&obc,
+ ceph::os::Transaction&& txn,
+ const osd_op_params_t& osd_op_p,
+ epoch_t min_epoch,
+ epoch_t map_epoch,
+ std::vector<pg_log_entry_t>&& log_entries)
+{
+ logger().trace("mutate_object: num_ops={}", txn.get_num_ops());
+ if (obc->obs.exists) {
+#if 0
+ obc->obs.oi.version = ctx->at_version;
+ obc->obs.oi.prior_version = ctx->obs->oi.version;
+#endif
+
+ auto& m = osd_op_p.req;
+ obc->obs.oi.prior_version = obc->obs.oi.version;
+ obc->obs.oi.version = osd_op_p.at_version;
+ if (osd_op_p.user_at_version > obc->obs.oi.user_version)
+ obc->obs.oi.user_version = osd_op_p.user_at_version;
+ obc->obs.oi.last_reqid = m->get_reqid();
+ obc->obs.oi.mtime = m->get_mtime();
+ obc->obs.oi.local_mtime = ceph_clock_now();
+
+ // object_info_t
+ {
+ ceph::bufferlist osv;
+ encode(obc->obs.oi, osv, CEPH_FEATURES_ALL);
+ // TODO: get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
+ txn.setattr(coll->get_cid(), ghobject_t{obc->obs.oi.soid}, OI_ATTR, osv);
+ }
+ } else {
+ // reset cached ObjectState without enforcing eviction
+ obc->obs.oi = object_info_t(obc->obs.oi.soid);
+ }
+ return _submit_transaction(
+ std::move(pg_shards), obc->obs.oi.soid, std::move(txn),
+ std::move(osd_op_p), min_epoch, map_epoch, std::move(log_entries));
+}
+
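+// verify the whole-object crc32c only when the object carries a data digest
+// and the read covered the entire object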
+static inline bool _read_verify_data(
+ const object_info_t& oi,
+ const ceph::bufferlist& data)
+{
+ if (oi.is_data_digest() && oi.size == data.length()) {
+ // whole object? can we verify the checksum?
+ if (auto crc = data.crc32c(-1); crc != oi.data_digest) {
+ logger().error("full-object read crc {} != expected {} on {}",
+ crc, oi.data_digest, oi.soid);
+ // todo: mark soid missing, perform recovery, and retry
+ return false;
+ }
+ }
+ return true;
+}
+
+PGBackend::read_errorator::future<>
+PGBackend::read(const ObjectState& os, OSDOp& osd_op)
+{
+ const auto& oi = os.oi;
+ const ceph_osd_op& op = osd_op.op;
+ const uint64_t offset = op.extent.offset;
+ uint64_t length = op.extent.length;
+ logger().trace("read: {} {}~{}", oi.soid, offset, length);
+
+ if (!os.exists || os.oi.is_whiteout()) {
+ logger().debug("{}: {} DNE", __func__, os.oi.soid);
+ return crimson::ct_error::enoent::make();
+ }
+ // are we beyond truncate_size?
+ size_t size = oi.size;
+ if ((op.extent.truncate_seq > oi.truncate_seq) &&
+ (op.extent.truncate_size < offset + length) &&
+ (op.extent.truncate_size < size)) {
+ size = op.extent.truncate_size;
+ }
+ if (offset >= size) {
+ // read size was trimmed to zero, so there is nothing to do
+ return read_errorator::now();
+ }
+ if (!length) {
+ // read the whole object if length is 0
+ length = size;
+ }
+ return _read(oi.soid, offset, length, op.flags).safe_then(
+ [&oi, &osd_op](auto&& bl) -> read_errorator::future<> {
+ if (!_read_verify_data(oi, bl)) {
+ return crimson::ct_error::object_corrupted::make();
+ }
+ logger().debug("read: data length: {}", bl.length());
+ osd_op.rval = bl.length();
+ osd_op.outdata = std::move(bl);
+ return read_errorator::now();
+ });
+}
+
+PGBackend::read_errorator::future<>
+PGBackend::sparse_read(const ObjectState& os, OSDOp& osd_op)
+{
+ const auto& op = osd_op.op;
+ logger().trace("sparse_read: {} {}~{}",
+ os.oi.soid, op.extent.offset, op.extent.length);
+ return store->fiemap(coll, ghobject_t{os.oi.soid},
+ op.extent.offset,
+ op.extent.length).then([&os, &osd_op, this](auto&& m) {
+ return seastar::do_with(interval_set<uint64_t>{std::move(m)},
+ [&os, &osd_op, this](auto&& extents) {
+ return store->readv(coll, ghobject_t{os.oi.soid},
+ extents, osd_op.op.flags).safe_then(
+ [&os, &osd_op, &extents](auto&& bl) -> read_errorator::future<> {
+ if (_read_verify_data(os.oi, bl)) {
+ osd_op.op.extent.length = bl.length();
+ // re-encode since it might be modified
+ ceph::encode(extents, osd_op.outdata);
+ encode_destructively(bl, osd_op.outdata);
+ logger().trace("sparse_read got {} bytes from object {}",
+ osd_op.op.extent.length, os.oi.soid);
+ return read_errorator::make_ready_future<>();
+ } else {
+ // TODO: repair it if crc mismatches
+ return crimson::ct_error::object_corrupted::make();
+ }
+ });
+ });
+ });
+}
+
+namespace {
+
+ template<class CSum>
+ PGBackend::checksum_errorator::future<>
+ do_checksum(ceph::bufferlist& init_value_bl,
+ size_t chunk_size,
+ const ceph::bufferlist& buf,
+ ceph::bufferlist& result)
+ {
+ typename CSum::init_value_t init_value;
+ auto init_value_p = init_value_bl.cbegin();
+ try {
+ decode(init_value, init_value_p);
+ // chop off the consumed part
+ init_value_bl.splice(0, init_value_p.get_off());
+ } catch (const ceph::buffer::end_of_buffer&) {
+ logger().warn("{}: init value not provided", __func__);
+ return crimson::ct_error::invarg::make();
+ }
+ const uint32_t chunk_count = buf.length() / chunk_size;
+ ceph::bufferptr csum_data{
+ ceph::buffer::create(sizeof(typename CSum::value_t) * chunk_count)};
+ Checksummer::calculate<CSum>(
+ init_value, chunk_size, 0, buf.length(), buf, &csum_data);
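+ // output format: the number of chunks followed by one digest per chunk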
+ encode(chunk_count, result);
+ result.append(std::move(csum_data));
+ return PGBackend::checksum_errorator::now();
+ }
+}
+
+PGBackend::checksum_errorator::future<>
+PGBackend::checksum(const ObjectState& os, OSDOp& osd_op)
+{
+ // sanity checks and argument normalization
+ auto& checksum = osd_op.op.checksum;
+ if (checksum.offset == 0 && checksum.length == 0) {
+ // zeroed offset+length implies checksum whole object
+ checksum.length = os.oi.size;
+ } else if (checksum.offset >= os.oi.size) {
+ // read size was trimmed to zero, do nothing,
+ // see PGBackend::read()
+ return checksum_errorator::now();
+ }
+ if (checksum.chunk_size > 0) {
+ if (checksum.length == 0) {
+ logger().warn("{}: length required when chunk size provided", __func__);
+ return crimson::ct_error::invarg::make();
+ }
+ if (checksum.length % checksum.chunk_size != 0) {
+ logger().warn("{}: length not aligned to chunk size", __func__);
+ return crimson::ct_error::invarg::make();
+ }
+ } else {
+ checksum.chunk_size = checksum.length;
+ }
+ if (checksum.length == 0) {
+ uint32_t count = 0;
+ encode(count, osd_op.outdata);
+ return checksum_errorator::now();
+ }
+
+ // read the chunk to be checksum'ed
+ return _read(os.oi.soid, checksum.offset, checksum.length, osd_op.op.flags).safe_then(
+ [&osd_op](auto&& read_bl) mutable -> checksum_errorator::future<> {
+ auto& checksum = osd_op.op.checksum;
+ if (read_bl.length() != checksum.length) {
+ logger().warn("checksum: bytes read {} != {}",
+ read_bl.length(), checksum.length);
+ return crimson::ct_error::invarg::make();
+ }
+ // calculate its checksum and put the result in outdata
+ switch (checksum.type) {
+ case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32:
+ return do_checksum<Checksummer::xxhash32>(osd_op.indata,
+ checksum.chunk_size,
+ read_bl,
+ osd_op.outdata);
+ case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64:
+ return do_checksum<Checksummer::xxhash64>(osd_op.indata,
+ checksum.chunk_size,
+ read_bl,
+ osd_op.outdata);
+ case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C:
+ return do_checksum<Checksummer::crc32c>(osd_op.indata,
+ checksum.chunk_size,
+ read_bl,
+ osd_op.outdata);
+ default:
+ logger().warn("checksum: unknown crc type ({})",
+ static_cast<uint32_t>(checksum.type));
+ return crimson::ct_error::invarg::make();
+ }
+ });
+}
+
+PGBackend::cmp_ext_errorator::future<>
+PGBackend::cmp_ext(const ObjectState& os, OSDOp& osd_op)
+{
+ const ceph_osd_op& op = osd_op.op;
+ // return the index of the first unmatched byte in the payload, hence the
+ // strange limit and check
+ if (op.extent.length > MAX_ERRNO) {
+ return crimson::ct_error::invarg::make();
+ }
+ uint64_t obj_size = os.oi.size;
+ if (os.oi.truncate_seq < op.extent.truncate_seq &&
+ op.extent.offset + op.extent.length > op.extent.truncate_size) {
+ obj_size = op.extent.truncate_size;
+ }
+ uint64_t ext_len;
+ if (op.extent.offset >= obj_size) {
+ ext_len = 0;
+ } else if (op.extent.offset + op.extent.length > obj_size) {
+ ext_len = obj_size - op.extent.offset;
+ } else {
+ ext_len = op.extent.length;
+ }
+ auto read_ext = ll_read_errorator::make_ready_future<ceph::bufferlist>();
+ if (ext_len == 0) {
+ logger().debug("{}: zero length extent", __func__);
+ } else if (!os.exists || os.oi.is_whiteout()) {
+ logger().debug("{}: {} DNE", __func__, os.oi.soid);
+ } else {
+ read_ext = _read(os.oi.soid, op.extent.offset, ext_len, 0);
+ }
+ return read_ext.safe_then([&osd_op](auto&& read_bl) {
+ int32_t retcode = 0;
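+ // compare byte by byte; bytes beyond the on-disk extent are treated as zero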
+ for (unsigned index = 0; index < osd_op.indata.length(); index++) {
+ char byte_in_op = osd_op.indata[index];
+ char byte_from_disk = (index < read_bl.length() ? read_bl[index] : 0);
+ if (byte_in_op != byte_from_disk) {
+ logger().debug("cmp_ext: mismatch at {}", index);
+ retcode = -MAX_ERRNO - index;
+ break;
+ }
+ }
+ logger().debug("cmp_ext: {}", retcode);
+ osd_op.rval = retcode;
+ });
+}
+
+PGBackend::stat_errorator::future<> PGBackend::stat(
+ const ObjectState& os,
+ OSDOp& osd_op)
+{
+ if (os.exists/* TODO: && !os.is_whiteout() */) {
+ logger().debug("stat os.oi.size={}, os.oi.mtime={}", os.oi.size, os.oi.mtime);
+ encode(os.oi.size, osd_op.outdata);
+ encode(os.oi.mtime, osd_op.outdata);
+ } else {
+ logger().debug("stat object does not exist");
+ return crimson::ct_error::enoent::make();
+ }
+ return stat_errorator::now();
+ // TODO: ctx->delta_stats.num_rd++;
+}
+
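+// returns whether the object already existed; false means this call created it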
+bool PGBackend::maybe_create_new_object(
+ ObjectState& os,
+ ceph::os::Transaction& txn)
+{
+ if (!os.exists) {
+ ceph_assert(!os.oi.is_whiteout());
+ os.exists = true;
+ os.oi.new_object();
+
+ txn.touch(coll->get_cid(), ghobject_t{os.oi.soid});
+ // TODO: delta_stats.num_objects++
+ return false;
+ } else if (os.oi.is_whiteout()) {
+ os.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
+ // TODO: delta_stats.num_whiteouts--
+ }
+ return true;
+}
+
+static bool is_offset_and_length_valid(
+ const std::uint64_t offset,
+ const std::uint64_t length)
+{
+ if (const std::uint64_t max = local_conf()->osd_max_object_size;
+ offset >= max || length > max || offset + length > max) {
+ logger().debug("{} osd_max_object_size: {}, offset: {}, len: {}; "
+ "Hard limit of object size is 4GB",
+ __func__, max, offset, length);
+ return false;
+ } else {
+ return true;
+ }
+}
+
+seastar::future<> PGBackend::write(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params)
+{
+ const ceph_osd_op& op = osd_op.op;
+ uint64_t offset = op.extent.offset;
+ uint64_t length = op.extent.length;
+ bufferlist buf = osd_op.indata;
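+ // reconcile this write with any client-side truncate: a stale write (older
+ // truncate_seq) is clipped to the current object size, while a newer
+ // truncate_seq is applied before writing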
+ if (auto seq = os.oi.truncate_seq;
+ seq != 0 && op.extent.truncate_seq < seq) {
+ // old write, arrived after trimtrunc
+ if (offset + length > os.oi.size) {
+ // no-op
+ if (offset > os.oi.size) {
+ length = 0;
+ buf.clear();
+ } else {
+ // truncate
+ auto len = os.oi.size - offset;
+ buf.splice(len, length);
+ length = len;
+ }
+ }
+ } else if (op.extent.truncate_seq > seq) {
+ // write arrives before trimtrunc
+ if (os.exists && !os.oi.is_whiteout()) {
+ txn.truncate(coll->get_cid(),
+ ghobject_t{os.oi.soid}, op.extent.truncate_size);
+ if (op.extent.truncate_size != os.oi.size) {
+ // TODO: truncate_update_size_and_usage()
+ // mark the span between the old size and the new truncate_size dirty
+ // before updating the cached object size
+ if (op.extent.truncate_size > os.oi.size) {
+ osd_op_params.clean_regions.mark_data_region_dirty(os.oi.size,
+ op.extent.truncate_size - os.oi.size);
+ } else {
+ osd_op_params.clean_regions.mark_data_region_dirty(op.extent.truncate_size,
+ os.oi.size - op.extent.truncate_size);
+ }
+ os.oi.size = op.extent.truncate_size;
+ }
+ }
+ os.oi.truncate_seq = op.extent.truncate_seq;
+ os.oi.truncate_size = op.extent.truncate_size;
+ }
+ maybe_create_new_object(os, txn);
+ if (length == 0) {
+ if (offset > os.oi.size) {
+ txn.truncate(coll->get_cid(), ghobject_t{os.oi.soid}, op.extent.offset);
+ } else {
+ txn.nop();
+ }
+ } else {
+ txn.write(coll->get_cid(), ghobject_t{os.oi.soid},
+ offset, length, std::move(buf), op.flags);
+ os.oi.size = std::max(offset + length, os.oi.size);
+ }
+ osd_op_params.clean_regions.mark_data_region_dirty(op.extent.offset,
+ op.extent.length);
+
+ return seastar::now();
+}
+
+seastar::future<> PGBackend::write_same(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params)
+{
+ const ceph_osd_op& op = osd_op.op;
+ const uint64_t len = op.writesame.length;
+ if (len == 0) {
+ return seastar::now();
+ }
+ if (op.writesame.data_length == 0 ||
+ len % op.writesame.data_length != 0 ||
+ op.writesame.data_length != osd_op.indata.length()) {
+ throw crimson::osd::invalid_argument();
+ }
+ ceph::bufferlist repeated_indata;
+ for (uint64_t size = 0; size < len; size += op.writesame.data_length) {
+ repeated_indata.append(osd_op.indata);
+ }
+ maybe_create_new_object(os, txn);
+ txn.write(coll->get_cid(), ghobject_t{os.oi.soid},
+ op.writesame.offset, len,
+ std::move(repeated_indata), op.flags);
+ os.oi.size = std::max(os.oi.size, op.writesame.offset + len);
+ osd_op_params.clean_regions.mark_data_region_dirty(op.writesame.offset, len);
+ return seastar::now();
+}
+
+seastar::future<> PGBackend::writefull(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params)
+{
+ const ceph_osd_op& op = osd_op.op;
+ if (op.extent.length != osd_op.indata.length()) {
+ throw crimson::osd::invalid_argument();
+ }
+
+ const bool existing = maybe_create_new_object(os, txn);
+ if (existing && op.extent.length < os.oi.size) {
+ txn.truncate(coll->get_cid(), ghobject_t{os.oi.soid}, op.extent.length);
+ osd_op_params.clean_regions.mark_data_region_dirty(op.extent.length,
+ os.oi.size - op.extent.length);
+ }
+ if (op.extent.length) {
+ txn.write(coll->get_cid(), ghobject_t{os.oi.soid}, 0, op.extent.length,
+ osd_op.indata, op.flags);
+ os.oi.size = op.extent.length;
+ osd_op_params.clean_regions.mark_data_region_dirty(0,
+ std::max((uint64_t) op.extent.length, os.oi.size));
+ }
+ return seastar::now();
+}
+
+PGBackend::append_errorator::future<> PGBackend::append(
+ ObjectState& os,
+ OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params)
+{
+ const ceph_osd_op& op = osd_op.op;
+ if (op.extent.length != osd_op.indata.length()) {
+ return crimson::ct_error::invarg::make();
+ }
+ maybe_create_new_object(os, txn);
+ if (op.extent.length) {
+ txn.write(coll->get_cid(), ghobject_t{os.oi.soid},
+ os.oi.size /* offset */, op.extent.length,
+ std::move(osd_op.indata), op.flags);
+ // mark the appended extent dirty starting at the old object size,
+ // then bump the cached size
+ osd_op_params.clean_regions.mark_data_region_dirty(os.oi.size,
+ op.extent.length);
+ os.oi.size += op.extent.length;
+ }
+ return seastar::now();
+}
+
+PGBackend::write_ertr::future<> PGBackend::truncate(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params)
+{
+ if (!os.exists || os.oi.is_whiteout()) {
+ logger().debug("{} object dne, truncate is a no-op", __func__);
+ return write_ertr::now();
+ }
+ const ceph_osd_op& op = osd_op.op;
+ if (!is_offset_and_length_valid(op.extent.offset, op.extent.length)) {
+ return crimson::ct_error::file_too_large::make();
+ }
+ if (op.extent.truncate_seq) {
+ assert(op.extent.offset == op.extent.truncate_size);
+ if (op.extent.truncate_seq <= os.oi.truncate_seq) {
+ logger().debug("{} truncate seq {} <= current {}, no-op",
+ __func__, op.extent.truncate_seq, os.oi.truncate_seq);
+ return write_ertr::make_ready_future<>();
+ } else {
+ logger().debug("{} truncate seq {} > current {}, truncating",
+ __func__, op.extent.truncate_seq, os.oi.truncate_seq);
+ os.oi.truncate_seq = op.extent.truncate_seq;
+ os.oi.truncate_size = op.extent.truncate_size;
+ }
+ }
+ maybe_create_new_object(os, txn);
+ if (os.oi.size != op.extent.offset) {
+ txn.truncate(coll->get_cid(),
+ ghobject_t{os.oi.soid}, op.extent.offset);
+ if (os.oi.size > op.extent.offset) {
+ // TODO: modified_ranges.union_of(trim);
+ osd_op_params.clean_regions.mark_data_region_dirty(
+ op.extent.offset,
+ os.oi.size - op.extent.offset);
+ } else {
+ // os.oi.size < op.extent.offset
+ osd_op_params.clean_regions.mark_data_region_dirty(
+ os.oi.size,
+ op.extent.offset - os.oi.size);
+ }
+ os.oi.size = op.extent.offset;
+ os.oi.clear_data_digest();
+ }
+ // TODO: truncate_update_size_and_usage()
+ // TODO: ctx->delta_stats.num_wr++;
+ // ----
+ // do not set exists, or we will break the above DELETE -> TRUNCATE munging.
+ return write_ertr::now();
+}
+
+PGBackend::write_ertr::future<> PGBackend::zero(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params)
+{
+ if (!os.exists || os.oi.is_whiteout()) {
+ logger().debug("{} object dne, zero is a no-op", __func__);
+ return write_ertr::now();
+ }
+ const ceph_osd_op& op = osd_op.op;
+ if (!is_offset_and_length_valid(op.extent.offset, op.extent.length)) {
+ return crimson::ct_error::file_too_large::make();
+ }
+ assert(op.extent.length);
+ txn.zero(coll->get_cid(),
+ ghobject_t{os.oi.soid},
+ op.extent.offset,
+ op.extent.length);
+ // TODO: modified_ranges.union_of(zeroed);
+ osd_op_params.clean_regions.mark_data_region_dirty(op.extent.offset,
+ op.extent.length);
+ // TODO: ctx->delta_stats.num_wr++;
+ os.oi.clear_data_digest();
+ return write_ertr::now();
+}
+
+seastar::future<> PGBackend::create(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn)
+{
+ if (os.exists && !os.oi.is_whiteout() &&
+ (osd_op.op.flags & CEPH_OSD_OP_FLAG_EXCL)) {
+ // this is an exclusive create
+ throw crimson::osd::make_error(-EEXIST);
+ }
+
+ if (osd_op.indata.length()) {
+ // handle the legacy. `category` is no longer implemented.
+ try {
+ auto p = osd_op.indata.cbegin();
+ std::string category;
+ decode(category, p);
+ } catch (buffer::error&) {
+ throw crimson::osd::invalid_argument();
+ }
+ }
+ maybe_create_new_object(os, txn);
+ txn.nop();
+ return seastar::now();
+}
+
+seastar::future<> PGBackend::remove(ObjectState& os,
+ ceph::os::Transaction& txn)
+{
+ // todo: snapset
+ txn.remove(coll->get_cid(),
+ ghobject_t{os.oi.soid, ghobject_t::NO_GEN, shard});
+ os.oi.size = 0;
+ os.oi.new_object();
+ os.exists = false;
+ // todo: update watchers
+ if (os.oi.is_whiteout()) {
+ os.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
+ }
+ return seastar::now();
+}
+
+seastar::future<std::tuple<std::vector<hobject_t>, hobject_t>>
+PGBackend::list_objects(const hobject_t& start, uint64_t limit) const
+{
+ if (__builtin_expect(stopping, false)) {
+ throw crimson::common::system_shutdown_exception();
+ }
+
+ auto gstart = start.is_min() ? ghobject_t{} : ghobject_t{start, 0, shard};
+ return store->list_objects(coll,
+ gstart,
+ ghobject_t::get_max(),
+ limit)
+ .then([](auto ret) {
+ auto& [gobjects, next] = ret;
+ std::vector<hobject_t> objects;
+ boost::copy(gobjects |
+ boost::adaptors::filtered([](const ghobject_t& o) {
+ if (o.is_pgmeta()) {
+ return false;
+ } else if (o.hobj.is_temp()) {
+ return false;
+ } else {
+ return o.is_no_gen();
+ }
+ }) |
+ boost::adaptors::transformed([](const ghobject_t& o) {
+ return o.hobj;
+ }),
+ std::back_inserter(objects));
+ return seastar::make_ready_future<std::tuple<std::vector<hobject_t>, hobject_t>>(
+ std::make_tuple(objects, next.hobj));
+ });
+}
+
+seastar::future<> PGBackend::setxattr(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn)
+{
+ if (local_conf()->osd_max_attr_size > 0 &&
+ osd_op.op.xattr.value_len > local_conf()->osd_max_attr_size) {
+ throw crimson::osd::make_error(-EFBIG);
+ }
+
+ const auto max_name_len = std::min<uint64_t>(
+ store->get_max_attr_name_length(), local_conf()->osd_max_attr_name_len);
+ if (osd_op.op.xattr.name_len > max_name_len) {
+ throw crimson::osd::make_error(-ENAMETOOLONG);
+ }
+
+ maybe_create_new_object(os, txn);
+
+ std::string name{"_"};
+ ceph::bufferlist val;
+ {
+ auto bp = osd_op.indata.cbegin();
+ bp.copy(osd_op.op.xattr.name_len, name);
+ bp.copy(osd_op.op.xattr.value_len, val);
+ }
+ logger().debug("setxattr on obj={} for attr={}", os.oi.soid, name);
+
+ txn.setattr(coll->get_cid(), ghobject_t{os.oi.soid}, name, val);
+ return seastar::now();
+ //ctx->delta_stats.num_wr++;
+}
+
+PGBackend::get_attr_errorator::future<> PGBackend::getxattr(
+ const ObjectState& os,
+ OSDOp& osd_op) const
+{
+ std::string name;
+ ceph::bufferlist val;
+ {
+ auto bp = osd_op.indata.cbegin();
+ std::string aname;
+ bp.copy(osd_op.op.xattr.name_len, aname);
+ name = "_" + aname;
+ }
+ logger().debug("getxattr on obj={} for attr={}", os.oi.soid, name);
+ return getxattr(os.oi.soid, name).safe_then([&osd_op] (ceph::bufferptr val) {
+ osd_op.outdata.clear();
+ osd_op.outdata.push_back(std::move(val));
+ osd_op.op.xattr.value_len = osd_op.outdata.length();
+ return get_attr_errorator::now();
+ //ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
+ });
+ //ctx->delta_stats.num_rd++;
+}
+
+PGBackend::get_attr_errorator::future<ceph::bufferptr> PGBackend::getxattr(
+ const hobject_t& soid,
+ std::string_view key) const
+{
+ if (__builtin_expect(stopping, false)) {
+ throw crimson::common::system_shutdown_exception();
+ }
+
+ return store->get_attr(coll, ghobject_t{soid}, key);
+}
+
+PGBackend::get_attr_errorator::future<> PGBackend::get_xattrs(
+ const ObjectState& os,
+ OSDOp& osd_op) const
+{
+ if (__builtin_expect(stopping, false)) {
+ throw crimson::common::system_shutdown_exception();
+ }
+ return store->get_attrs(coll, ghobject_t{os.oi.soid}).safe_then(
+ [&osd_op](auto&& attrs) {
+ std::vector<std::pair<std::string, bufferlist>> user_xattrs;
+ for (auto& [key, val] : attrs) {
+ if (key.size() > 1 && key[0] == '_') {
+ ceph::bufferlist bl;
+ bl.append(std::move(val));
+ user_xattrs.emplace_back(key.substr(1), std::move(bl));
+ }
+ }
+ ceph::encode(user_xattrs, osd_op.outdata);
+ return get_attr_errorator::now();
+ });
+}
+
+PGBackend::rm_xattr_ertr::future<> PGBackend::rm_xattr(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn)
+{
+ if (__builtin_expect(stopping, false)) {
+ throw crimson::common::system_shutdown_exception();
+ }
+ if (!os.exists || os.oi.is_whiteout()) {
+ logger().debug("{}: {} DNE", __func__, os.oi.soid);
+ return crimson::ct_error::enoent::make();
+ }
+ auto bp = osd_op.indata.cbegin();
+ string attr_name{"_"};
+ bp.copy(osd_op.op.xattr.name_len, attr_name);
+ txn.rmattr(coll->get_cid(), ghobject_t{os.oi.soid}, attr_name);
+ return rm_xattr_ertr::now();
+}
+
+using get_omap_ertr =
+ crimson::os::FuturizedStore::read_errorator::extend<
+ crimson::ct_error::enodata>;
+static
+get_omap_ertr::future<
+ crimson::os::FuturizedStore::omap_values_t>
+maybe_get_omap_vals_by_keys(
+ crimson::os::FuturizedStore* store,
+ const crimson::os::CollectionRef& coll,
+ const object_info_t& oi,
+ const std::set<std::string>& keys_to_get)
+{
+ if (oi.is_omap()) {
+ return store->omap_get_values(coll, ghobject_t{oi.soid}, keys_to_get);
+ } else {
+ return crimson::ct_error::enodata::make();
+ }
+}
+
+static
+get_omap_ertr::future<
+ std::tuple<bool, crimson::os::FuturizedStore::omap_values_t>>
+maybe_get_omap_vals(
+ crimson::os::FuturizedStore* store,
+ const crimson::os::CollectionRef& coll,
+ const object_info_t& oi,
+ const std::string& start_after)
+{
+ if (oi.is_omap()) {
+ return store->omap_get_values(coll, ghobject_t{oi.soid}, start_after);
+ } else {
+ return crimson::ct_error::enodata::make();
+ }
+}
+
+PGBackend::ll_read_errorator::future<ceph::bufferlist>
+PGBackend::omap_get_header(
+ const crimson::os::CollectionRef& c,
+ const ghobject_t& oid) const
+{
+ return store->omap_get_header(c, oid);
+}
+
+PGBackend::ll_read_errorator::future<>
+PGBackend::omap_get_header(
+ const ObjectState& os,
+ OSDOp& osd_op) const
+{
+ return omap_get_header(coll, ghobject_t{os.oi.soid}).safe_then(
+ [&osd_op] (ceph::bufferlist&& header) {
+ osd_op.outdata = std::move(header);
+ return seastar::now();
+ });
+}
+
+PGBackend::ll_read_errorator::future<>
+PGBackend::omap_get_keys(
+ const ObjectState& os,
+ OSDOp& osd_op) const
+{
+ if (__builtin_expect(stopping, false)) {
+ throw crimson::common::system_shutdown_exception();
+ }
+ if (!os.exists || os.oi.is_whiteout()) {
+ logger().debug("{}: object does not exist: {}", os.oi.soid);
+ return crimson::ct_error::enoent::make();
+ }
+ std::string start_after;
+ uint64_t max_return;
+ try {
+ auto p = osd_op.indata.cbegin();
+ decode(start_after, p);
+ decode(max_return, p);
+ } catch (buffer::error&) {
+ throw crimson::osd::invalid_argument{};
+ }
+ max_return =
+ std::min(max_return, local_conf()->osd_max_omap_entries_per_request);
+
+ // TODO: truly chunk the reading
+ return maybe_get_omap_vals(store, coll, os.oi, start_after).safe_then(
+ [=, &osd_op](auto ret) {
+ ceph::bufferlist result;
+ bool truncated = false;
+ uint32_t num = 0;
+ for (auto &[key, val] : std::get<1>(ret)) {
+ if (num >= max_return ||
+ result.length() >= local_conf()->osd_max_omap_bytes_per_request) {
+ truncated = true;
+ break;
+ }
+ encode(key, result);
+ ++num;
+ }
+ encode(num, osd_op.outdata);
+ osd_op.outdata.claim_append(result);
+ encode(truncated, osd_op.outdata);
+ return seastar::now();
+ }).handle_error(
+ crimson::ct_error::enodata::handle([&osd_op] {
+ uint32_t num = 0;
+ bool truncated = false;
+ encode(num, osd_op.outdata);
+ encode(truncated, osd_op.outdata);
+ return seastar::now();
+ }),
+ ll_read_errorator::pass_further{}
+ );
+ // TODO:
+ //ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
+ //ctx->delta_stats.num_rd++;
+}
+
+PGBackend::ll_read_errorator::future<>
+PGBackend::omap_get_vals(
+ const ObjectState& os,
+ OSDOp& osd_op) const
+{
+ if (__builtin_expect(stopping, false)) {
+ throw crimson::common::system_shutdown_exception();
+ }
+
+ std::string start_after;
+ uint64_t max_return;
+ std::string filter_prefix;
+ try {
+ auto p = osd_op.indata.cbegin();
+ decode(start_after, p);
+ decode(max_return, p);
+ decode(filter_prefix, p);
+ } catch (buffer::error&) {
+ throw crimson::osd::invalid_argument{};
+ }
+
+ max_return =
+ std::min(max_return, local_conf()->osd_max_omap_entries_per_request);
+
+ // TODO: truly chunk the reading
+ return maybe_get_omap_vals(store, coll, os.oi, start_after).safe_then(
+ [=, &osd_op] (auto&& ret) {
+ auto [done, vals] = std::move(ret);
+ assert(done);
+ ceph::bufferlist result;
+ bool truncated = false;
+ uint32_t num = 0;
+ auto iter = filter_prefix > start_after ? vals.lower_bound(filter_prefix)
+ : std::begin(vals);
+ for (; iter != std::end(vals); ++iter) {
+ const auto& [key, value] = *iter;
+ if (key.substr(0, filter_prefix.size()) != filter_prefix) {
+ break;
+ } else if (num >= max_return ||
+ result.length() >= local_conf()->osd_max_omap_bytes_per_request) {
+ truncated = true;
+ break;
+ }
+ encode(key, result);
+ encode(value, result);
+ ++num;
+ }
+ encode(num, osd_op.outdata);
+ osd_op.outdata.claim_append(result);
+ encode(truncated, osd_op.outdata);
+ return ll_read_errorator::now();
+ }).handle_error(
+ crimson::ct_error::enodata::handle([&osd_op] {
+ encode(uint32_t{0} /* num */, osd_op.outdata);
+ encode(bool{false} /* truncated */, osd_op.outdata);
+ return ll_read_errorator::now();
+ }),
+ ll_read_errorator::pass_further{}
+ );
+
+ // TODO:
+ //ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
+ //ctx->delta_stats.num_rd++;
+}
+
+PGBackend::ll_read_errorator::future<>
+PGBackend::omap_get_vals_by_keys(
+ const ObjectState& os,
+ OSDOp& osd_op) const
+{
+ if (__builtin_expect(stopping, false)) {
+ throw crimson::common::system_shutdown_exception();
+ }
+ if (!os.exists || os.oi.is_whiteout()) {
+ logger().debug("{}: object does not exist: {}", os.oi.soid);
+ return crimson::ct_error::enoent::make();
+ }
+
+ std::set<std::string> keys_to_get;
+ try {
+ auto p = osd_op.indata.cbegin();
+ decode(keys_to_get, p);
+ } catch (buffer::error&) {
+ throw crimson::osd::invalid_argument();
+ }
+ return maybe_get_omap_vals_by_keys(store, coll, os.oi, keys_to_get).safe_then(
+ [&osd_op] (crimson::os::FuturizedStore::omap_values_t&& vals) {
+ encode(vals, osd_op.outdata);
+ return ll_read_errorator::now();
+ }).handle_error(
+ crimson::ct_error::enodata::handle([&osd_op] {
+ uint32_t num = 0;
+ encode(num, osd_op.outdata);
+ return ll_read_errorator::now();
+ }),
+ ll_read_errorator::pass_further{}
+ );
+
+ // TODO:
+ //ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
+ //ctx->delta_stats.num_rd++;
+}
+
+seastar::future<> PGBackend::omap_set_vals(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params)
+{
+ maybe_create_new_object(os, txn);
+
+ ceph::bufferlist to_set_bl;
+ try {
+ auto p = osd_op.indata.cbegin();
+ decode_str_str_map_to_bl(p, &to_set_bl);
+ } catch (buffer::error&) {
+ throw crimson::osd::invalid_argument{};
+ }
+
+ txn.omap_setkeys(coll->get_cid(), ghobject_t{os.oi.soid}, to_set_bl);
+
+ // TODO:
+ //ctx->clean_regions.mark_omap_dirty();
+
+ // TODO:
+ //ctx->delta_stats.num_wr++;
+ //ctx->delta_stats.num_wr_kb += shift_round_up(to_set_bl.length(), 10);
+ os.oi.set_flag(object_info_t::FLAG_OMAP);
+ os.oi.clear_omap_digest();
+ osd_op_params.clean_regions.mark_omap_dirty();
+ return seastar::now();
+}
+
+seastar::future<> PGBackend::omap_set_header(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn)
+{
+ maybe_create_new_object(os, txn);
+ txn.omap_setheader(coll->get_cid(), ghobject_t{os.oi.soid}, osd_op.indata);
+ //TODO:
+ //ctx->clean_regions.mark_omap_dirty();
+ //ctx->delta_stats.num_wr++;
+ os.oi.set_flag(object_info_t::FLAG_OMAP);
+ os.oi.clear_omap_digest();
+ return seastar::now();
+}
+
+seastar::future<> PGBackend::omap_remove_range(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& txn)
+{
+ std::string key_begin, key_end;
+ try {
+ auto p = osd_op.indata.cbegin();
+ decode(key_begin, p);
+ decode(key_end, p);
+ } catch (buffer::error& e) {
+ throw crimson::osd::invalid_argument{};
+ }
+ txn.omap_rmkeyrange(coll->get_cid(), ghobject_t{os.oi.soid}, key_begin, key_end);
+ //TODO:
+ //ctx->delta_stats.num_wr++;
+ os.oi.clear_omap_digest();
+ return seastar::now();
+}
+
+PGBackend::omap_clear_ertr::future<>
+PGBackend::omap_clear(
+ ObjectState& os,
+ OSDOp& osd_op,
+ ceph::os::Transaction& txn,
+ osd_op_params_t& osd_op_params)
+{
+ if (__builtin_expect(stopping, false)) {
+ throw crimson::common::system_shutdown_exception();
+ }
+ if (!os.exists || os.oi.is_whiteout()) {
+ logger().debug("{}: object does not exist: {}", os.oi.soid);
+ return crimson::ct_error::enoent::make();
+ }
+ if (!os.oi.is_omap()) {
+ return omap_clear_ertr::now();
+ }
+ txn.omap_clear(coll->get_cid(), ghobject_t{os.oi.soid});
+ osd_op_params.clean_regions.mark_omap_dirty();
+ os.oi.clear_omap_digest();
+ os.oi.clear_flag(object_info_t::FLAG_OMAP);
+ return omap_clear_ertr::now();
+}
+
+seastar::future<struct stat> PGBackend::stat(
+ CollectionRef c,
+ const ghobject_t& oid) const
+{
+ return store->stat(c, oid);
+}
+
+seastar::future<std::map<uint64_t, uint64_t>>
+PGBackend::fiemap(
+ CollectionRef c,
+ const ghobject_t& oid,
+ uint64_t off,
+ uint64_t len)
+{
+ return store->fiemap(c, oid, off, len);
+}
+
+void PGBackend::on_activate_complete()
+{
+ peering.reset();
+}
+
diff --git a/src/crimson/osd/pg_backend.h b/src/crimson/osd/pg_backend.h
new file mode 100644
index 000000000..d8fa8b2ac
--- /dev/null
+++ b/src/crimson/osd/pg_backend.h
@@ -0,0 +1,235 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <boost/smart_ptr/local_shared_ptr.hpp>
+#include <boost/container/flat_set.hpp>
+
+#include "crimson/os/futurized_store.h"
+#include "crimson/os/futurized_collection.h"
+#include "crimson/osd/acked_peers.h"
+#include "crimson/osd/pg.h"
+#include "crimson/common/shared_lru.h"
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+#include "os/Transaction.h"
+#include "osd/osd_types.h"
+#include "crimson/osd/object_context.h"
+#include "crimson/osd/osd_operation.h"
+#include "crimson/osd/osd_operations/osdop_params.h"
+
+struct hobject_t;
+
+namespace ceph::os {
+ class Transaction;
+}
+
+namespace crimson::osd {
+ class ShardServices;
+}
+
+class PGBackend
+{
+protected:
+ using CollectionRef = crimson::os::CollectionRef;
+ using ec_profile_t = std::map<std::string, std::string>;
+ // low-level read errorator
+ using ll_read_errorator = crimson::os::FuturizedStore::read_errorator;
+
+public:
+ using load_metadata_ertr = crimson::errorator<
+ crimson::ct_error::object_corrupted>;
+ PGBackend(shard_id_t shard, CollectionRef coll, crimson::os::FuturizedStore* store);
+ virtual ~PGBackend() = default;
+ static std::unique_ptr<PGBackend> create(pg_t pgid,
+ const pg_shard_t pg_shard,
+ const pg_pool_t& pool,
+ crimson::os::CollectionRef coll,
+ crimson::osd::ShardServices& shard_services,
+ const ec_profile_t& ec_profile);
+ using attrs_t =
+ std::map<std::string, ceph::bufferptr, std::less<>>;
+ using read_errorator = ll_read_errorator::extend<
+ crimson::ct_error::object_corrupted>;
+ read_errorator::future<> read(
+ const ObjectState& os,
+ OSDOp& osd_op);
+ read_errorator::future<> sparse_read(
+ const ObjectState& os,
+ OSDOp& osd_op);
+ using checksum_errorator = ll_read_errorator::extend<
+ crimson::ct_error::object_corrupted,
+ crimson::ct_error::invarg>;
+ checksum_errorator::future<> checksum(
+ const ObjectState& os,
+ OSDOp& osd_op);
+ using cmp_ext_errorator = ll_read_errorator::extend<
+ crimson::ct_error::invarg>;
+ cmp_ext_errorator::future<> cmp_ext(
+ const ObjectState& os,
+ OSDOp& osd_op);
+ using stat_errorator = crimson::errorator<crimson::ct_error::enoent>;
+ stat_errorator::future<> stat(
+ const ObjectState& os,
+ OSDOp& osd_op);
+
+ // TODO: switch the entire write family to errorator.
+ using write_ertr = crimson::errorator<
+ crimson::ct_error::file_too_large>;
+ seastar::future<> create(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans);
+ seastar::future<> remove(
+ ObjectState& os,
+ ceph::os::Transaction& txn);
+ seastar::future<> write(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ osd_op_params_t& osd_op_params);
+ seastar::future<> write_same(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ osd_op_params_t& osd_op_params);
+ seastar::future<> writefull(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ osd_op_params_t& osd_op_params);
+ using append_errorator = crimson::errorator<
+ crimson::ct_error::invarg>;
+ append_errorator::future<> append(
+ ObjectState& os,
+ OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ osd_op_params_t& osd_op_params);
+ write_ertr::future<> truncate(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ osd_op_params_t& osd_op_params);
+ write_ertr::future<> zero(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ osd_op_params_t& osd_op_params);
+ seastar::future<crimson::osd::acked_peers_t> mutate_object(
+ std::set<pg_shard_t> pg_shards,
+ crimson::osd::ObjectContextRef &&obc,
+ ceph::os::Transaction&& txn,
+ const osd_op_params_t& osd_op_p,
+ epoch_t min_epoch,
+ epoch_t map_epoch,
+ std::vector<pg_log_entry_t>&& log_entries);
+ seastar::future<std::tuple<std::vector<hobject_t>, hobject_t>> list_objects(
+ const hobject_t& start,
+ uint64_t limit) const;
+ seastar::future<> setxattr(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans);
+ using get_attr_errorator = crimson::os::FuturizedStore::get_attr_errorator;
+ get_attr_errorator::future<> getxattr(
+ const ObjectState& os,
+ OSDOp& osd_op) const;
+ get_attr_errorator::future<ceph::bufferptr> getxattr(
+ const hobject_t& soid,
+ std::string_view key) const;
+ get_attr_errorator::future<> get_xattrs(
+ const ObjectState& os,
+ OSDOp& osd_op) const;
+ using rm_xattr_ertr = crimson::errorator<crimson::ct_error::enoent>;
+ rm_xattr_ertr::future<> rm_xattr(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans);
+ seastar::future<struct stat> stat(
+ CollectionRef c,
+ const ghobject_t& oid) const;
+ seastar::future<std::map<uint64_t, uint64_t>> fiemap(
+ CollectionRef c,
+ const ghobject_t& oid,
+ uint64_t off,
+ uint64_t len);
+
+ // OMAP
+ ll_read_errorator::future<> omap_get_keys(
+ const ObjectState& os,
+ OSDOp& osd_op) const;
+ ll_read_errorator::future<> omap_get_vals(
+ const ObjectState& os,
+ OSDOp& osd_op) const;
+ ll_read_errorator::future<> omap_get_vals_by_keys(
+ const ObjectState& os,
+ OSDOp& osd_op) const;
+ seastar::future<> omap_set_vals(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ osd_op_params_t& osd_op_params);
+ ll_read_errorator::future<ceph::bufferlist> omap_get_header(
+ const crimson::os::CollectionRef& c,
+ const ghobject_t& oid) const;
+ ll_read_errorator::future<> omap_get_header(
+ const ObjectState& os,
+ OSDOp& osd_op) const;
+ seastar::future<> omap_set_header(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans);
+ seastar::future<> omap_remove_range(
+ ObjectState& os,
+ const OSDOp& osd_op,
+ ceph::os::Transaction& trans);
+ using omap_clear_ertr = crimson::errorator<crimson::ct_error::enoent>;
+ omap_clear_ertr::future<> omap_clear(
+ ObjectState& os,
+ OSDOp& osd_op,
+ ceph::os::Transaction& trans,
+ osd_op_params_t& osd_op_params);
+
+ virtual void got_rep_op_reply(const MOSDRepOpReply&) {}
+ virtual seastar::future<> stop() = 0;
+ struct peering_info_t {
+ bool is_primary;
+ };
+ virtual void on_actingset_changed(peering_info_t pi) = 0;
+ virtual void on_activate_complete();
+protected:
+ const shard_id_t shard;
+ CollectionRef coll;
+ crimson::os::FuturizedStore* store;
+ bool stopping = false;
+ std::optional<peering_info_t> peering;
+public:
+ struct loaded_object_md_t {
+ ObjectState os;
+ std::optional<SnapSet> ss;
+ using ref = std::unique_ptr<loaded_object_md_t>;
+ };
+ load_metadata_ertr::future<loaded_object_md_t::ref> load_metadata(
+ const hobject_t &oid);
+
+private:
+ virtual ll_read_errorator::future<ceph::bufferlist> _read(
+ const hobject_t& hoid,
+ size_t offset,
+ size_t length,
+ uint32_t flags) = 0;
+
+ bool maybe_create_new_object(ObjectState& os, ceph::os::Transaction& txn);
+ virtual seastar::future<crimson::osd::acked_peers_t>
+ _submit_transaction(std::set<pg_shard_t>&& pg_shards,
+ const hobject_t& hoid,
+ ceph::os::Transaction&& txn,
+ const osd_op_params_t& osd_op_p,
+ epoch_t min_epoch, epoch_t max_epoch,
+ std::vector<pg_log_entry_t>&& log_entries) = 0;
+ friend class ReplicatedRecoveryBackend;
+};
diff --git a/src/crimson/osd/pg_map.cc b/src/crimson/osd/pg_map.cc
new file mode 100644
index 000000000..08071f260
--- /dev/null
+++ b/src/crimson/osd/pg_map.cc
@@ -0,0 +1,77 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/osd/pg_map.h"
+
+#include "crimson/osd/pg.h"
+#include "common/Formatter.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+PGMap::PGCreationState::PGCreationState(spg_t pgid) : pgid(pgid) {}
+PGMap::PGCreationState::~PGCreationState() {}
+
+void PGMap::PGCreationState::dump_detail(Formatter *f) const
+{
+ f->dump_stream("pgid") << pgid;
+ f->dump_bool("creating", creating);
+}
+
+std::pair<blocking_future<Ref<PG>>, bool> PGMap::wait_for_pg(spg_t pgid)
+{
+ if (auto pg = get_pg(pgid)) {
+ return make_pair(make_ready_blocking_future<Ref<PG>>(pg), true);
+ } else {
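+ // PG not instantiated yet: wait on the creation blocker; the returned bool
+ // reports whether creation has already been kicked off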
+ auto &state = pgs_creating.emplace(pgid, pgid).first->second;
+ return make_pair(
+ state.make_blocking_future(state.promise.get_shared_future()),
+ state.creating);
+ }
+}
+
+Ref<PG> PGMap::get_pg(spg_t pgid)
+{
+ if (auto pg = pgs.find(pgid); pg != pgs.end()) {
+ return pg->second;
+ } else {
+ return nullptr;
+ }
+}
+
+void PGMap::set_creating(spg_t pgid)
+{
+ logger().debug("Creating {}", pgid);
+ ceph_assert(pgs.count(pgid) == 0);
+ auto pg = pgs_creating.find(pgid);
+ ceph_assert(pg != pgs_creating.end());
+ ceph_assert(pg->second.creating == false);
+ pg->second.creating = true;
+}
+
+void PGMap::pg_created(spg_t pgid, Ref<PG> pg)
+{
+ logger().debug("Created {}", pgid);
+ ceph_assert(!pgs.count(pgid));
+ pgs.emplace(pgid, pg);
+
+ auto state = pgs_creating.find(pgid);
+ ceph_assert(state != pgs_creating.end());
+ state->second.promise.set_value(pg);
+ pgs_creating.erase(pgid);
+}
+
+void PGMap::pg_loaded(spg_t pgid, Ref<PG> pg)
+{
+ ceph_assert(!pgs.count(pgid));
+ pgs.emplace(pgid, pg);
+}
+
+PGMap::~PGMap() {}
+
+}
diff --git a/src/crimson/osd/pg_map.h b/src/crimson/osd/pg_map.h
new file mode 100644
index 000000000..b3fe4b562
--- /dev/null
+++ b/src/crimson/osd/pg_map.h
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <map>
+
+#include <seastar/core/future.hh>
+#include <seastar/core/shared_future.hh>
+
+#include "include/types.h"
+#include "crimson/common/type_helpers.h"
+#include "crimson/osd/osd_operation.h"
+#include "crimson/osd/pg.h"
+#include "osd/osd_types.h"
+
+namespace crimson::osd {
+class PG;
+
+class PGMap {
+ struct PGCreationState : BlockerT<PGCreationState> {
+ static constexpr const char * type_name = "PGCreation";
+
+ void dump_detail(Formatter *f) const final;
+
+ spg_t pgid;
+ seastar::shared_promise<Ref<PG>> promise;
+ bool creating = false;
+ PGCreationState(spg_t pgid);
+
+ PGCreationState(const PGCreationState &) = delete;
+ PGCreationState(PGCreationState &&) = delete;
+ PGCreationState &operator=(const PGCreationState &) = delete;
+ PGCreationState &operator=(PGCreationState &&) = delete;
+
+ ~PGCreationState();
+ };
+
+ std::map<spg_t, PGCreationState> pgs_creating;
+ using pgs_t = std::map<spg_t, Ref<PG>>;
+ pgs_t pgs;
+
+public:
+ /**
+ * Get future for pg with a bool indicating whether it's already being
+ * created.
+ */
+ std::pair<blocking_future<Ref<PG>>, bool> wait_for_pg(spg_t pgid);
+
+ /**
+ * get PG in non-blocking manner
+ */
+ Ref<PG> get_pg(spg_t pgid);
+
+ /**
+ * Set creating
+ */
+ void set_creating(spg_t pgid);
+
+ /**
+ * Set newly created pg
+ */
+ void pg_created(spg_t pgid, Ref<PG> pg);
+
+ /**
+ * Add newly loaded pg
+ */
+ void pg_loaded(spg_t pgid, Ref<PG> pg);
+
+ pgs_t& get_pgs() { return pgs; }
+ const pgs_t& get_pgs() const { return pgs; }
+ PGMap() = default;
+ ~PGMap();
+};
+
+}
diff --git a/src/crimson/osd/pg_meta.cc b/src/crimson/osd/pg_meta.cc
new file mode 100644
index 000000000..ad5385963
--- /dev/null
+++ b/src/crimson/osd/pg_meta.cc
@@ -0,0 +1,108 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "pg_meta.h"
+
+#include <string_view>
+
+#include "crimson/os/futurized_collection.h"
+#include "crimson/os/futurized_store.h"
+
+// prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
+// easily skip them
+using crimson::os::FuturizedStore;
+
+PGMeta::PGMeta(FuturizedStore* store, spg_t pgid)
+ : store{store},
+ pgid{pgid}
+{}
+
+namespace {
+ template<typename T>
+ std::optional<T> find_value(const FuturizedStore::omap_values_t& values,
+ string_view key)
+ {
+ auto found = values.find(key);
+ if (found == values.end()) {
+ return {};
+ }
+ auto p = found->second.cbegin();
+ T value;
+ decode(value, p);
+ return std::make_optional(std::move(value));
+ }
+}
+
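+// Fetch only the info-version and epoch keys from the pgmeta object's omap.
+// This lets a caller that just needs the epoch (e.g. to pick the right
+// OSDMap before a full load() -- an assumption about usage outside this
+// file) avoid decoding the whole pg_info_t.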
+seastar::future<epoch_t> PGMeta::get_epoch()
+{
+ return store->open_collection(coll_t{pgid}).then([this](auto ch) {
+ return store->omap_get_values(ch,
+ pgid.make_pgmeta_oid(),
+ {string{infover_key},
+ string{epoch_key}}).safe_then(
+ [](auto&& values) {
+ {
+ // sanity check
+ auto infover = find_value<__u8>(values, infover_key);
+ assert(infover);
+ if (*infover < 10) {
+ throw std::runtime_error("incompatible pg meta");
+ }
+ }
+ {
+ auto epoch = find_value<epoch_t>(values, epoch_key);
+ assert(epoch);
+ return seastar::make_ready_future<epoch_t>(*epoch);
+ }
+ },
+ FuturizedStore::read_errorator::assert_all{
+ "PGMeta::get_epoch: unable to read pgmeta"
+ });
+ });
+}
+
+seastar::future<std::tuple<pg_info_t, PastIntervals>> PGMeta::load()
+{
+ return store->open_collection(coll_t{pgid}).then([this](auto ch) {
+ return store->omap_get_values(ch,
+ pgid.make_pgmeta_oid(),
+ {string{infover_key},
+ string{info_key},
+ string{biginfo_key},
+ string{fastinfo_key}});
+ }).safe_then([](auto&& values) {
+ {
+ // sanity check
+ auto infover = find_value<__u8>(values, infover_key);
+ assert(infover);
+      if (*infover < 10) {
+ throw std::runtime_error("incompatible pg meta");
+ }
+ }
+ pg_info_t info;
+ {
+ auto found = find_value<pg_info_t>(values, info_key);
+ assert(found);
+ info = *std::move(found);
+ }
+ PastIntervals past_intervals;
+ {
+ using biginfo_t = std::pair<PastIntervals, decltype(info.purged_snaps)>;
+ auto big_info = find_value<biginfo_t>(values, biginfo_key);
+ assert(big_info);
+ past_intervals = std::move(big_info->first);
+ info.purged_snaps = std::move(big_info->second);
+ }
+ {
+ auto fast_info = find_value<pg_fast_info_t>(values, fastinfo_key);
+ if (fast_info) {
+ fast_info->try_apply_to(&info);
+ }
+ }
+ return seastar::make_ready_future<std::tuple<pg_info_t, PastIntervals>>(
+ std::make_tuple(std::move(info), std::move(past_intervals)));
+ },
+ FuturizedStore::read_errorator::assert_all{
+ "PGMeta::load: unable to read pgmeta"
+ });
+}
diff --git a/src/crimson/osd/pg_meta.h b/src/crimson/osd/pg_meta.h
new file mode 100644
index 000000000..e0aa02716
--- /dev/null
+++ b/src/crimson/osd/pg_meta.h
@@ -0,0 +1,23 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <tuple>
+#include <seastar/core/future.hh>
+#include "osd/osd_types.h"
+
+namespace crimson::os {
+ class FuturizedStore;
+}
+
+/// PG related metadata
+class PGMeta
+{
+ crimson::os::FuturizedStore* store;
+ const spg_t pgid;
+public:
+ PGMeta(crimson::os::FuturizedStore *store, spg_t pgid);
+ seastar::future<epoch_t> get_epoch();
+ seastar::future<std::tuple<pg_info_t, PastIntervals>> load();
+};
diff --git a/src/crimson/osd/pg_recovery.cc b/src/crimson/osd/pg_recovery.cc
new file mode 100644
index 000000000..7d70b5e8f
--- /dev/null
+++ b/src/crimson/osd/pg_recovery.cc
@@ -0,0 +1,550 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+
+#include "crimson/common/type_helpers.h"
+#include "crimson/osd/backfill_facades.h"
+#include "crimson/osd/osd_operations/background_recovery.h"
+#include "crimson/osd/osd_operations/peering_event.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/pg_backend.h"
+#include "crimson/osd/pg_recovery.h"
+
+#include "messages/MOSDPGPull.h"
+#include "messages/MOSDPGPush.h"
+#include "messages/MOSDPGPushReply.h"
+#include "messages/MOSDPGRecoveryDelete.h"
+#include "messages/MOSDPGRecoveryDeleteReply.h"
+
+#include "osd/osd_types.h"
+#include "osd/PeeringState.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+void PGRecovery::start_pglogbased_recovery()
+{
+ using PglogBasedRecovery = crimson::osd::PglogBasedRecovery;
+ (void) pg->get_shard_services().start_operation<PglogBasedRecovery>(
+ static_cast<crimson::osd::PG*>(pg),
+ pg->get_shard_services(),
+ pg->get_osdmap_epoch());
+}
+
+crimson::osd::blocking_future<bool>
+PGRecovery::start_recovery_ops(size_t max_to_start)
+{
+ assert(pg->is_primary());
+ assert(pg->is_peered());
+ assert(pg->is_recovering());
+  // in ceph-osd the do_recovery() path handles both the pg log-based
+  // recovery and the backfill, albeit they are separated at the layer
+  // of PeeringState. In crimson-osd backfill has been cut out of it,
+  // so do_recovery() is actually solely for pg log-based recovery.
+  // At the time of writing, moving it into the FSM and fixing the
+  // naming is under consideration.
+ assert(!pg->is_backfilling());
+ assert(!pg->get_peering_state().is_deleting());
+
+ std::vector<crimson::osd::blocking_future<>> started;
+ started.reserve(max_to_start);
+ max_to_start -= start_primary_recovery_ops(max_to_start, &started);
+ if (max_to_start > 0) {
+ max_to_start -= start_replica_recovery_ops(max_to_start, &started);
+ }
+ return crimson::osd::join_blocking_futures(std::move(started)).then(
+ [this] {
+ bool done = !pg->get_peering_state().needs_recovery();
+ if (done) {
+ logger().debug("start_recovery_ops: AllReplicasRecovered for pg: {}",
+ pg->get_pgid());
+ using LocalPeeringEvent = crimson::osd::LocalPeeringEvent;
+ if (!pg->get_peering_state().needs_backfill()) {
+ logger().debug("start_recovery_ops: AllReplicasRecovered for pg: {}",
+ pg->get_pgid());
+ (void) pg->get_shard_services().start_operation<LocalPeeringEvent>(
+ static_cast<crimson::osd::PG*>(pg),
+ pg->get_shard_services(),
+ pg->get_pg_whoami(),
+ pg->get_pgid(),
+ pg->get_osdmap_epoch(),
+ pg->get_osdmap_epoch(),
+ PeeringState::AllReplicasRecovered{});
+ } else {
+ logger().debug("start_recovery_ops: RequestBackfill for pg: {}",
+ pg->get_pgid());
+ (void) pg->get_shard_services().start_operation<LocalPeeringEvent>(
+ static_cast<crimson::osd::PG*>(pg),
+ pg->get_shard_services(),
+ pg->get_pg_whoami(),
+ pg->get_pgid(),
+ pg->get_osdmap_epoch(),
+ pg->get_osdmap_epoch(),
+ PeeringState::RequestBackfill{});
+ }
+ }
+ return seastar::make_ready_future<bool>(!done);
+ });
+}
+
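+// Walk the primary's own missing set (oldest version first, resuming from
+// last_requested) and start up to max_to_start recovery ops, collecting the
+// per-object blocking futures in *out; returns how many ops were started.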
+size_t PGRecovery::start_primary_recovery_ops(
+ size_t max_to_start,
+ std::vector<crimson::osd::blocking_future<>> *out)
+{
+ if (!pg->is_recovering()) {
+ return 0;
+ }
+
+ if (!pg->get_peering_state().have_missing()) {
+ pg->get_peering_state().local_recovery_complete();
+ return 0;
+ }
+
+ const auto &missing = pg->get_peering_state().get_pg_log().get_missing();
+
+ logger().info("{} recovering {} in pg {}, missing {}", __func__,
+ pg->get_recovery_backend()->total_recovering(),
+ *static_cast<crimson::osd::PG*>(pg),
+ missing);
+
+ unsigned started = 0;
+ int skipped = 0;
+
+ map<version_t, hobject_t>::const_iterator p =
+ missing.get_rmissing().lower_bound(pg->get_peering_state().get_pg_log().get_log().last_requested);
+ while (started < max_to_start && p != missing.get_rmissing().end()) {
+ // TODO: chain futures here to enable yielding to scheduler?
+ hobject_t soid;
+ version_t v = p->first;
+
+ auto it_objects = pg->get_peering_state().get_pg_log().get_log().objects.find(p->second);
+ if (it_objects != pg->get_peering_state().get_pg_log().get_log().objects.end()) {
+ // look at log!
+ pg_log_entry_t *latest = it_objects->second;
+ assert(latest->is_update() || latest->is_delete());
+ soid = latest->soid;
+ } else {
+ soid = p->second;
+ }
+ const pg_missing_item& item = missing.get_items().find(p->second)->second;
+ ++p;
+
+ hobject_t head = soid.get_head();
+
+ logger().info(
+ "{} {} item.need {} {} {} {} {}",
+ __func__,
+ soid,
+ item.need,
+ missing.is_missing(soid) ? " (missing)":"",
+ missing.is_missing(head) ? " (missing head)":"",
+ pg->get_recovery_backend()->is_recovering(soid) ? " (recovering)":"",
+ pg->get_recovery_backend()->is_recovering(head) ? " (recovering head)":"");
+
+ // TODO: handle lost/unfound
+ if (pg->get_recovery_backend()->is_recovering(soid)) {
+ auto& recovery_waiter = pg->get_recovery_backend()->get_recovering(soid);
+ out->push_back(recovery_waiter.wait_for_recovered_blocking());
+ ++started;
+ } else if (pg->get_recovery_backend()->is_recovering(head)) {
+ ++skipped;
+ } else {
+ out->push_back(recover_missing(soid, item.need));
+ ++started;
+ }
+
+ if (!skipped)
+ pg->get_peering_state().set_last_requested(v);
+ }
+
+ logger().info("{} started {} skipped {}", __func__, started, skipped);
+
+ return started;
+}
+
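+// Same idea as above, but for objects missing on replicas/backfill peers,
+// visited in the order given by get_replica_recovery_order(). Deleted
+// objects become push_delete()s, everything else becomes object pushes;
+// unfound objects and objects the primary itself still misses are skipped.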
+size_t PGRecovery::start_replica_recovery_ops(
+ size_t max_to_start,
+ std::vector<crimson::osd::blocking_future<>> *out)
+{
+ if (!pg->is_recovering()) {
+ return 0;
+ }
+ uint64_t started = 0;
+
+ assert(!pg->get_peering_state().get_acting_recovery_backfill().empty());
+
+ auto recovery_order = get_replica_recovery_order();
+ for (auto &peer : recovery_order) {
+ assert(peer != pg->get_peering_state().get_primary());
+ const auto& pm = pg->get_peering_state().get_peer_missing(peer);
+
+ logger().debug("{}: peer osd.{} missing {} objects", __func__,
+ peer, pm.num_missing());
+ logger().trace("{}: peer osd.{} missing {}", __func__,
+ peer, pm.get_items());
+
+ // recover oldest first
+ for (auto p = pm.get_rmissing().begin();
+ p != pm.get_rmissing().end() && started < max_to_start;
+ ++p) {
+ const auto &soid = p->second;
+
+ if (pg->get_peering_state().get_missing_loc().is_unfound(soid)) {
+ logger().debug("{}: object {} still unfound", __func__, soid);
+ continue;
+ }
+
+ const pg_info_t &pi = pg->get_peering_state().get_peer_info(peer);
+ if (soid > pi.last_backfill) {
+ if (!pg->get_recovery_backend()->is_recovering(soid)) {
+ logger().error(
+ "{}: object {} in missing set for backfill (last_backfill {})"
+ " but not in recovering",
+ __func__,
+ soid,
+ pi.last_backfill);
+ ceph_abort();
+ }
+ continue;
+ }
+
+ if (pg->get_recovery_backend()->is_recovering(soid)) {
+ logger().debug("{}: already recovering object {}", __func__, soid);
+ auto& recovery_waiter = pg->get_recovery_backend()->get_recovering(soid);
+ out->push_back(recovery_waiter.wait_for_recovered_blocking());
+ started++;
+ continue;
+ }
+
+ if (pg->get_peering_state().get_missing_loc().is_deleted(soid)) {
+ logger().debug("{}: soid {} is a delete, removing", __func__, soid);
+ map<hobject_t,pg_missing_item>::const_iterator r =
+ pm.get_items().find(soid);
+ started += prep_object_replica_deletes(
+ soid, r->second.need, out);
+ continue;
+ }
+
+ if (soid.is_snap() &&
+ pg->get_peering_state().get_pg_log().get_missing().is_missing(
+ soid.get_head())) {
+ logger().debug("{}: head {} still missing on primary", __func__,
+ soid.get_head());
+ continue;
+ }
+
+ if (pg->get_peering_state().get_pg_log().get_missing().is_missing(soid)) {
+ logger().debug("{}: soid {} still missing on primary", __func__, soid);
+ continue;
+ }
+
+ logger().debug("{}: recover_object_replicas({})", __func__,soid);
+ map<hobject_t,pg_missing_item>::const_iterator r = pm.get_items().find(
+ soid);
+ started += prep_object_replica_pushes(
+ soid, r->second.need, out);
+ }
+ }
+
+ return started;
+}
+
+crimson::osd::blocking_future<> PGRecovery::recover_missing(
+ const hobject_t &soid, eversion_t need)
+{
+ if (pg->get_peering_state().get_missing_loc().is_deleted(soid)) {
+ return pg->get_recovery_backend()->add_recovering(soid).make_blocking_future(
+ pg->get_recovery_backend()->recover_delete(soid, need));
+ } else {
+ return pg->get_recovery_backend()->add_recovering(soid).make_blocking_future(
+ pg->get_recovery_backend()->recover_object(soid, need).handle_exception(
+ [=, soid = std::move(soid)] (auto e) {
+ on_failed_recover({ pg->get_pg_whoami() }, soid, need);
+ return seastar::make_ready_future<>();
+ })
+ );
+ }
+}
+
+size_t PGRecovery::prep_object_replica_deletes(
+ const hobject_t& soid,
+ eversion_t need,
+ std::vector<crimson::osd::blocking_future<>> *in_progress)
+{
+ in_progress->push_back(
+ pg->get_recovery_backend()->add_recovering(soid).make_blocking_future(
+ pg->get_recovery_backend()->push_delete(soid, need).then([=] {
+ object_stat_sum_t stat_diff;
+ stat_diff.num_objects_recovered = 1;
+ on_global_recover(soid, stat_diff, true);
+ return seastar::make_ready_future<>();
+ })
+ )
+ );
+ return 1;
+}
+
+size_t PGRecovery::prep_object_replica_pushes(
+ const hobject_t& soid,
+ eversion_t need,
+ std::vector<crimson::osd::blocking_future<>> *in_progress)
+{
+ in_progress->push_back(
+ pg->get_recovery_backend()->add_recovering(soid).make_blocking_future(
+ pg->get_recovery_backend()->recover_object(soid, need).handle_exception(
+ [=, soid = std::move(soid)] (auto e) {
+ on_failed_recover({ pg->get_pg_whoami() }, soid, need);
+ return seastar::make_ready_future<>();
+ })
+ )
+ );
+ return 1;
+}
+
+void PGRecovery::on_local_recover(
+ const hobject_t& soid,
+ const ObjectRecoveryInfo& recovery_info,
+ const bool is_delete,
+ ceph::os::Transaction& t)
+{
+ pg->get_peering_state().recover_got(soid,
+ recovery_info.version, is_delete, t);
+
+ if (pg->is_primary()) {
+ if (!is_delete) {
+ auto& obc = pg->get_recovery_backend()->get_recovering(soid).obc; //TODO: move to pg backend?
+ obc->obs.exists = true;
+ obc->obs.oi = recovery_info.oi;
+ }
+ if (!pg->is_unreadable_object(soid)) {
+ pg->get_recovery_backend()->get_recovering(soid).set_readable();
+ }
+ pg->publish_stats_to_osd();
+ }
+}
+
+void PGRecovery::on_global_recover (
+ const hobject_t& soid,
+ const object_stat_sum_t& stat_diff,
+ const bool is_delete)
+{
+ logger().info("{} {}", __func__, soid);
+ pg->get_peering_state().object_recovered(soid, stat_diff);
+ pg->publish_stats_to_osd();
+ auto& recovery_waiter = pg->get_recovery_backend()->get_recovering(soid);
+ if (!is_delete)
+ recovery_waiter.obc->drop_recovery_read();
+ recovery_waiter.set_recovered();
+ pg->get_recovery_backend()->remove_recovering(soid);
+}
+
+void PGRecovery::on_failed_recover(
+ const set<pg_shard_t>& from,
+ const hobject_t& soid,
+ const eversion_t& v)
+{
+ for (auto pg_shard : from) {
+ if (pg_shard != pg->get_pg_whoami()) {
+ pg->get_peering_state().force_object_missing(pg_shard, soid, v);
+ }
+ }
+}
+
+void PGRecovery::on_peer_recover(
+ pg_shard_t peer,
+ const hobject_t &oid,
+ const ObjectRecoveryInfo &recovery_info)
+{
+ crimson::get_logger(ceph_subsys_osd).debug(
+ "{}: {}, {} on {}", __func__, oid,
+ recovery_info.version, peer);
+ pg->get_peering_state().on_peer_recover(peer, oid, recovery_info.version);
+}
+
+void PGRecovery::_committed_pushed_object(epoch_t epoch,
+ eversion_t last_complete)
+{
+ if (!pg->has_reset_since(epoch)) {
+ pg->get_peering_state().recovery_committed_to(last_complete);
+ } else {
+ crimson::get_logger(ceph_subsys_osd).debug(
+ "{} pg has changed, not touching last_complete_ondisk",
+ __func__);
+ }
+}
+
+template <class EventT>
+void PGRecovery::start_backfill_recovery(const EventT& evt)
+{
+ using BackfillRecovery = crimson::osd::BackfillRecovery;
+ std::ignore = pg->get_shard_services().start_operation<BackfillRecovery>(
+ static_cast<crimson::osd::PG*>(pg),
+ pg->get_shard_services(),
+ pg->get_osdmap_epoch(),
+ evt);
+}
+
+void PGRecovery::request_replica_scan(
+ const pg_shard_t& target,
+ const hobject_t& begin,
+ const hobject_t& end)
+{
+ logger().debug("{}: target.osd={}", __func__, target.osd);
+ auto msg = make_message<MOSDPGScan>(
+ MOSDPGScan::OP_SCAN_GET_DIGEST,
+ pg->get_pg_whoami(),
+ pg->get_osdmap_epoch(),
+ pg->get_last_peering_reset(),
+ spg_t(pg->get_pgid().pgid, target.shard),
+ begin,
+ end);
+ std::ignore = pg->get_shard_services().send_to_osd(
+ target.osd,
+ std::move(msg),
+ pg->get_osdmap_epoch());
+}
+
+void PGRecovery::request_primary_scan(
+ const hobject_t& begin)
+{
+ logger().debug("{}", __func__);
+ using crimson::common::local_conf;
+ std::ignore = pg->get_recovery_backend()->scan_for_backfill(
+ begin,
+ local_conf()->osd_backfill_scan_min,
+ local_conf()->osd_backfill_scan_max
+ ).then([this] (BackfillInterval bi) {
+ logger().debug("request_primary_scan:{}", __func__);
+ using BackfillState = crimson::osd::BackfillState;
+ start_backfill_recovery(BackfillState::PrimaryScanned{ std::move(bi) });
+ });
+}
+
+void PGRecovery::enqueue_push(
+ const hobject_t& obj,
+ const eversion_t& v)
+{
+ logger().debug("{}: obj={} v={}",
+ __func__, obj, v);
+ pg->get_recovery_backend()->add_recovering(obj);
+  std::ignore = pg->get_recovery_backend()->recover_object(obj, v)
+    .handle_exception([] (auto) {
+ ceph_abort_msg("got exception on backfill's push");
+ return seastar::make_ready_future<>();
+ }).then([this, obj] {
+ logger().debug("enqueue_push:{}", __func__);
+ using BackfillState = crimson::osd::BackfillState;
+ start_backfill_recovery(BackfillState::ObjectPushed(std::move(obj)));
+ });
+}
+
+void PGRecovery::enqueue_drop(
+ const pg_shard_t& target,
+ const hobject_t& obj,
+ const eversion_t& v)
+{
+ // allocate a pair if target is seen for the first time
+ auto& req = backfill_drop_requests[target];
+ if (!req) {
+ req = ceph::make_message<MOSDPGBackfillRemove>(
+ spg_t(pg->get_pgid().pgid, target.shard), pg->get_osdmap_epoch());
+ }
+ req->ls.emplace_back(obj, v);
+}
+
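+// Send out the MOSDPGBackfillRemove messages batched up by enqueue_drop(),
+// one per backfill target, then drop the batch.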
+void PGRecovery::maybe_flush()
+{
+ for (auto& [target, req] : backfill_drop_requests) {
+ std::ignore = pg->get_shard_services().send_to_osd(
+ target.osd,
+ std::move(req),
+ pg->get_osdmap_epoch());
+ }
+ backfill_drop_requests.clear();
+}
+
+void PGRecovery::update_peers_last_backfill(
+ const hobject_t& new_last_backfill)
+{
+ logger().debug("{}: new_last_backfill={}",
+ __func__, new_last_backfill);
+ // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
+ // all the backfill targets. Otherwise, we will move last_backfill up on
+  // those targets that need it and send OP_BACKFILL_PROGRESS to them.
+ for (const auto& bt : pg->get_peering_state().get_backfill_targets()) {
+ if (const pg_info_t& pinfo = pg->get_peering_state().get_peer_info(bt);
+ new_last_backfill > pinfo.last_backfill) {
+ pg->get_peering_state().update_peer_last_backfill(bt, new_last_backfill);
+ auto m = make_message<MOSDPGBackfill>(
+ pinfo.last_backfill.is_max() ? MOSDPGBackfill::OP_BACKFILL_FINISH
+ : MOSDPGBackfill::OP_BACKFILL_PROGRESS,
+ pg->get_osdmap_epoch(),
+ pg->get_last_peering_reset(),
+ spg_t(pg->get_pgid().pgid, bt.shard));
+ // Use default priority here, must match sub_op priority
+ // TODO: if pinfo.last_backfill.is_max(), then
+ // start_recovery_op(hobject_t::get_max());
+ m->last_backfill = pinfo.last_backfill;
+ m->stats = pinfo.stats;
+ std::ignore = pg->get_shard_services().send_to_osd(
+ bt.osd, std::move(m), pg->get_osdmap_epoch());
+ logger().info("{}: peer {} num_objects now {} / {}",
+ __func__,
+ bt,
+ pinfo.stats.stats.sum.num_objects,
+ pg->get_info().stats.stats.sum.num_objects);
+ }
+ }
+}
+
+bool PGRecovery::budget_available() const
+{
+ // TODO: the limits!
+ return true;
+}
+
+void PGRecovery::backfilled()
+{
+ using LocalPeeringEvent = crimson::osd::LocalPeeringEvent;
+ std::ignore = pg->get_shard_services().start_operation<LocalPeeringEvent>(
+ static_cast<crimson::osd::PG*>(pg),
+ pg->get_shard_services(),
+ pg->get_pg_whoami(),
+ pg->get_pgid(),
+ pg->get_osdmap_epoch(),
+ pg->get_osdmap_epoch(),
+ PeeringState::Backfilled{});
+}
+
+void PGRecovery::dispatch_backfill_event(
+ boost::intrusive_ptr<const boost::statechart::event_base> evt)
+{
+ logger().debug("{}", __func__);
+ backfill_state->process_event(evt);
+}
+
+void PGRecovery::on_backfill_reserved()
+{
+ logger().debug("{}", __func__);
+  // PIMPL and dependency injection for the sake of unit-testability.
+  // Performance is not a concern here.
+ using BackfillState = crimson::osd::BackfillState;
+ backfill_state = std::make_unique<BackfillState>(
+ *this,
+ std::make_unique<crimson::osd::PeeringFacade>(pg->get_peering_state()),
+ std::make_unique<crimson::osd::PGFacade>(
+ *static_cast<crimson::osd::PG*>(pg)));
+ // yes, it's **not** backfilling yet. The PG_STATE_BACKFILLING
+ // will be set after on_backfill_reserved() returns.
+ // Backfill needs to take this into consideration when scheduling
+ // events -- they must be mutually exclusive with PeeringEvent
+ // instances. Otherwise the execution might begin without having
+ // the state updated.
+ ceph_assert(!pg->get_peering_state().is_backfilling());
+ start_backfill_recovery(BackfillState::Triggered{});
+}
diff --git a/src/crimson/osd/pg_recovery.h b/src/crimson/osd/pg_recovery.h
new file mode 100644
index 000000000..86f259de5
--- /dev/null
+++ b/src/crimson/osd/pg_recovery.h
@@ -0,0 +1,113 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/future.hh>
+
+#include "crimson/osd/backfill_state.h"
+#include "crimson/osd/osd_operation.h"
+#include "crimson/osd/pg_recovery_listener.h"
+#include "crimson/osd/scheduler/scheduler.h"
+#include "crimson/osd/shard_services.h"
+
+#include "osd/object_state.h"
+
+class MOSDPGBackfillRemove;
+class PGBackend;
+
+class PGRecovery : public crimson::osd::BackfillState::BackfillListener {
+public:
+ PGRecovery(PGRecoveryListener* pg) : pg(pg) {}
+ virtual ~PGRecovery() {}
+ void start_pglogbased_recovery();
+
+ crimson::osd::blocking_future<bool> start_recovery_ops(size_t max_to_start);
+ void on_backfill_reserved();
+ void dispatch_backfill_event(
+ boost::intrusive_ptr<const boost::statechart::event_base> evt);
+
+ seastar::future<> stop() { return seastar::now(); }
+private:
+ PGRecoveryListener* pg;
+ size_t start_primary_recovery_ops(
+ size_t max_to_start,
+ std::vector<crimson::osd::blocking_future<>> *out);
+ size_t start_replica_recovery_ops(
+ size_t max_to_start,
+ std::vector<crimson::osd::blocking_future<>> *out);
+
+ std::vector<pg_shard_t> get_replica_recovery_order() const {
+ return pg->get_replica_recovery_order();
+ }
+ crimson::osd::blocking_future<> recover_missing(
+ const hobject_t &soid, eversion_t need);
+ size_t prep_object_replica_deletes(
+ const hobject_t& soid,
+ eversion_t need,
+ std::vector<crimson::osd::blocking_future<>> *in_progress);
+ size_t prep_object_replica_pushes(
+ const hobject_t& soid,
+ eversion_t need,
+ std::vector<crimson::osd::blocking_future<>> *in_progress);
+
+ void on_local_recover(
+ const hobject_t& soid,
+ const ObjectRecoveryInfo& recovery_info,
+ bool is_delete,
+ ceph::os::Transaction& t);
+ void on_global_recover (
+ const hobject_t& soid,
+ const object_stat_sum_t& stat_diff,
+ bool is_delete);
+ void on_failed_recover(
+ const set<pg_shard_t>& from,
+ const hobject_t& soid,
+ const eversion_t& v);
+ void on_peer_recover(
+ pg_shard_t peer,
+ const hobject_t &oid,
+ const ObjectRecoveryInfo &recovery_info);
+ void _committed_pushed_object(epoch_t epoch,
+ eversion_t last_complete);
+ friend class ReplicatedRecoveryBackend;
+ friend class crimson::osd::UrgentRecovery;
+ seastar::future<> handle_pull(Ref<MOSDPGPull> m);
+ seastar::future<> handle_push(Ref<MOSDPGPush> m);
+ seastar::future<> handle_push_reply(Ref<MOSDPGPushReply> m);
+ seastar::future<> handle_recovery_delete(Ref<MOSDPGRecoveryDelete> m);
+ seastar::future<> handle_recovery_delete_reply(
+ Ref<MOSDPGRecoveryDeleteReply> m);
+ seastar::future<> handle_pull_response(Ref<MOSDPGPush> m);
+ seastar::future<> handle_scan(MOSDPGScan& m);
+
+ // backfill begin
+ std::unique_ptr<crimson::osd::BackfillState> backfill_state;
+ std::map<pg_shard_t,
+ ceph::ref_t<MOSDPGBackfillRemove>> backfill_drop_requests;
+
+ template <class EventT>
+ void start_backfill_recovery(
+ const EventT& evt);
+ void request_replica_scan(
+ const pg_shard_t& target,
+ const hobject_t& begin,
+ const hobject_t& end) final;
+ void request_primary_scan(
+ const hobject_t& begin) final;
+ void enqueue_push(
+ const hobject_t& obj,
+ const eversion_t& v) final;
+ void enqueue_drop(
+ const pg_shard_t& target,
+ const hobject_t& obj,
+ const eversion_t& v) final;
+ void maybe_flush() final;
+ void update_peers_last_backfill(
+ const hobject_t& new_last_backfill) final;
+ bool budget_available() const final;
+ void backfilled() final;
+ friend crimson::osd::BackfillState::PGFacade;
+ friend crimson::osd::PG;
+ // backfill end
+};
diff --git a/src/crimson/osd/pg_recovery_listener.h b/src/crimson/osd/pg_recovery_listener.h
new file mode 100644
index 000000000..c922b9956
--- /dev/null
+++ b/src/crimson/osd/pg_recovery_listener.h
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/future.hh>
+
+#include "common/hobject.h"
+#include "include/types.h"
+#include "osd/osd_types.h"
+
+namespace crimson::osd {
+ class ShardServices;
+};
+
+class RecoveryBackend;
+class PGRecovery;
+
+class PGRecoveryListener {
+public:
+ virtual crimson::osd::ShardServices& get_shard_services() = 0;
+ virtual PGRecovery* get_recovery_handler() = 0;
+ virtual epoch_t get_osdmap_epoch() const = 0;
+ virtual bool is_primary() const = 0;
+ virtual bool is_peered() const = 0;
+ virtual bool is_recovering() const = 0;
+ virtual bool is_backfilling() const = 0;
+ virtual PeeringState& get_peering_state() = 0;
+ virtual const pg_shard_t& get_pg_whoami() const = 0;
+ virtual const spg_t& get_pgid() const = 0;
+ virtual RecoveryBackend* get_recovery_backend() = 0;
+ virtual bool is_unreadable_object(const hobject_t&, eversion_t* v = 0) const = 0;
+ virtual bool has_reset_since(epoch_t) const = 0;
+ virtual std::vector<pg_shard_t> get_replica_recovery_order() const = 0;
+ virtual epoch_t get_last_peering_reset() const = 0;
+ virtual const pg_info_t& get_info() const= 0;
+ virtual seastar::future<> stop() = 0;
+ virtual void publish_stats_to_osd() = 0;
+};
diff --git a/src/crimson/osd/recovery_backend.cc b/src/crimson/osd/recovery_backend.cc
new file mode 100644
index 000000000..aeec0d14b
--- /dev/null
+++ b/src/crimson/osd/recovery_backend.cc
@@ -0,0 +1,298 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <fmt/format.h>
+
+#include "crimson/common/exception.h"
+#include "crimson/osd/recovery_backend.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/pg_backend.h"
+
+#include "messages/MOSDFastDispatchOp.h"
+#include "osd/osd_types.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+hobject_t RecoveryBackend::get_temp_recovery_object(
+ const hobject_t& target,
+ eversion_t version) const
+{
+ hobject_t hoid =
+ target.make_temp_hobject(fmt::format("temp_recovering_{}_{}_{}_{}",
+ pg.get_info().pgid,
+ version,
+ pg.get_info().history.same_interval_since,
+ target.snap));
+ logger().debug("{} {}", __func__, hoid);
+ return hoid;
+}
+
+void RecoveryBackend::clean_up(ceph::os::Transaction& t,
+ std::string_view why)
+{
+ for (auto& soid : temp_contents) {
+ t.remove(pg.get_collection_ref()->get_cid(),
+ ghobject_t(soid, ghobject_t::NO_GEN, pg.get_pg_whoami().shard));
+ }
+ temp_contents.clear();
+
+ for (auto& [soid, recovery_waiter] : recovering) {
+ if ((recovery_waiter.pi && recovery_waiter.pi->is_complete())
+ || (!recovery_waiter.pi
+ && recovery_waiter.obc && recovery_waiter.obc->obs.exists)) {
+ recovery_waiter.obc->interrupt(
+ ::crimson::common::actingset_changed(
+ pg.is_primary()));
+ recovery_waiter.interrupt(why);
+ }
+ }
+ recovering.clear();
+}
+
+void RecoveryBackend::WaitForObjectRecovery::stop() {
+ readable.set_exception(
+ crimson::common::system_shutdown_exception());
+ recovered.set_exception(
+ crimson::common::system_shutdown_exception());
+ pulled.set_exception(
+ crimson::common::system_shutdown_exception());
+ for (auto& [pg_shard, pr] : pushes) {
+ pr.set_exception(
+ crimson::common::system_shutdown_exception());
+ }
+}
+
+void RecoveryBackend::handle_backfill_finish(
+ MOSDPGBackfill& m)
+{
+ logger().debug("{}", __func__);
+ ceph_assert(!pg.is_primary());
+ ceph_assert(crimson::common::local_conf()->osd_kill_backfill_at != 1);
+ auto reply = make_message<MOSDPGBackfill>(
+ MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
+ pg.get_osdmap_epoch(),
+ m.query_epoch,
+ spg_t(pg.get_pgid().pgid, pg.get_primary().shard));
+ reply->set_priority(pg.get_recovery_op_priority());
+ std::ignore = m.get_connection()->send(std::move(reply));
+ shard_services.start_operation<crimson::osd::LocalPeeringEvent>(
+ static_cast<crimson::osd::PG*>(&pg),
+ shard_services,
+ pg.get_pg_whoami(),
+ pg.get_pgid(),
+ pg.get_osdmap_epoch(),
+ pg.get_osdmap_epoch(),
+ RecoveryDone{});
+}
+
+seastar::future<> RecoveryBackend::handle_backfill_progress(
+ MOSDPGBackfill& m)
+{
+ logger().debug("{}", __func__);
+ ceph_assert(!pg.is_primary());
+ ceph_assert(crimson::common::local_conf()->osd_kill_backfill_at != 2);
+
+ ObjectStore::Transaction t;
+ pg.get_peering_state().update_backfill_progress(
+ m.last_backfill,
+ m.stats,
+ m.op == MOSDPGBackfill::OP_BACKFILL_PROGRESS,
+ t);
+ return shard_services.get_store().do_transaction(
+ pg.get_collection_ref(), std::move(t)
+ ).or_terminate();
+}
+
+seastar::future<> RecoveryBackend::handle_backfill_finish_ack(
+ MOSDPGBackfill& m)
+{
+ logger().debug("{}", __func__);
+ ceph_assert(pg.is_primary());
+ ceph_assert(crimson::common::local_conf()->osd_kill_backfill_at != 3);
+ // TODO:
+ // finish_recovery_op(hobject_t::get_max());
+ return seastar::now();
+}
+
+seastar::future<> RecoveryBackend::handle_backfill(
+ MOSDPGBackfill& m)
+{
+ logger().debug("{}", __func__);
+ switch (m.op) {
+ case MOSDPGBackfill::OP_BACKFILL_FINISH:
+ handle_backfill_finish(m);
+ [[fallthrough]];
+ case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
+ return handle_backfill_progress(m);
+ case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK:
+ return handle_backfill_finish_ack(m);
+ default:
+ ceph_assert("unknown op type for pg backfill");
+ return seastar::now();
+ }
+}
+
+seastar::future<> RecoveryBackend::handle_backfill_remove(
+ MOSDPGBackfillRemove& m)
+{
+ logger().debug("{} m.ls={}", __func__, m.ls);
+ assert(m.get_type() == MSG_OSD_PG_BACKFILL_REMOVE);
+
+ ObjectStore::Transaction t;
+ for ([[maybe_unused]] const auto& [soid, ver] : m.ls) {
+ // TODO: the reserved space management. PG::try_reserve_recovery_space().
+ t.remove(pg.get_collection_ref()->get_cid(),
+ ghobject_t(soid, ghobject_t::NO_GEN, pg.get_pg_whoami().shard));
+ }
+ return shard_services.get_store().do_transaction(
+ pg.get_collection_ref(), std::move(t)
+ ).or_terminate();
+}
+
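+// Build a BackfillInterval starting at `start`: list up to `max` objects,
+// resolve each object's version from a cached obc when possible (on the
+// primary) or from on-disk metadata otherwise, and return the filled
+// interval. Note that `min` is currently unused here.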
+seastar::future<BackfillInterval> RecoveryBackend::scan_for_backfill(
+ const hobject_t& start,
+ [[maybe_unused]] const std::int64_t min,
+ const std::int64_t max)
+{
+ logger().debug("{} starting from {}", __func__, start);
+ auto version_map = seastar::make_lw_shared<std::map<hobject_t, eversion_t>>();
+ return backend->list_objects(start, max).then(
+ [this, start, version_map] (auto&& ret) {
+ auto&& [objects, next] = std::move(ret);
+ return seastar::parallel_for_each(std::move(objects),
+ [this, version_map] (const hobject_t& object) {
+ crimson::osd::ObjectContextRef obc;
+ if (pg.is_primary()) {
+ obc = shard_services.obc_registry.maybe_get_cached_obc(object);
+ }
+ if (obc) {
+ if (obc->obs.exists) {
+ logger().debug("scan_for_backfill found (primary): {} {}",
+ object, obc->obs.oi.version);
+ version_map->emplace(object, obc->obs.oi.version);
+ } else {
+ // if the object does not exist here, it must have been removed
+ // between the collection_list_partial and here. This can happen
+ // for the first item in the range, which is usually last_backfill.
+ }
+ return seastar::now();
+ } else {
+ return backend->load_metadata(object).safe_then(
+ [version_map, object] (auto md) {
+ if (md->os.exists) {
+ logger().debug("scan_for_backfill found: {} {}",
+ object, md->os.oi.version);
+ version_map->emplace(object, md->os.oi.version);
+ }
+ return seastar::now();
+ }, PGBackend::load_metadata_ertr::assert_all{});
+ }
+ }).then([version_map, start=std::move(start), next=std::move(next), this] {
+ BackfillInterval bi;
+ bi.begin = std::move(start);
+ bi.end = std::move(next);
+ bi.version = pg.get_info().last_update;
+ bi.objects = std::move(*version_map);
+ logger().debug("{} BackfillInterval filled, leaving",
+ "scan_for_backfill");
+ return seastar::make_ready_future<BackfillInterval>(std::move(bi));
+ });
+ });
+}
+
+seastar::future<> RecoveryBackend::handle_scan_get_digest(
+ MOSDPGScan& m)
+{
+ logger().debug("{}", __func__);
+ if (false /* FIXME: check for backfill too full */) {
+ std::ignore = shard_services.start_operation<crimson::osd::LocalPeeringEvent>(
+ // TODO: abstract start_background_recovery
+ static_cast<crimson::osd::PG*>(&pg),
+ shard_services,
+ pg.get_pg_whoami(),
+ pg.get_pgid(),
+ pg.get_osdmap_epoch(),
+ pg.get_osdmap_epoch(),
+ PeeringState::BackfillTooFull());
+ return seastar::now();
+ }
+ return scan_for_backfill(
+ std::move(m.begin),
+ crimson::common::local_conf().get_val<std::int64_t>("osd_backfill_scan_min"),
+ crimson::common::local_conf().get_val<std::int64_t>("osd_backfill_scan_max")
+ ).then([this,
+ query_epoch=m.query_epoch,
+ conn=m.get_connection()] (auto backfill_interval) {
+ auto reply = make_message<MOSDPGScan>(
+ MOSDPGScan::OP_SCAN_DIGEST,
+ pg.get_pg_whoami(),
+ pg.get_osdmap_epoch(),
+ query_epoch,
+ spg_t(pg.get_info().pgid.pgid, pg.get_primary().shard),
+ backfill_interval.begin,
+ backfill_interval.end);
+ encode(backfill_interval.objects, reply->get_data());
+ return conn->send(std::move(reply));
+ });
+}
+
+seastar::future<> RecoveryBackend::handle_scan_digest(
+ MOSDPGScan& m)
+{
+ logger().debug("{}", __func__);
+ // Check that from is in backfill_targets vector
+ ceph_assert(pg.is_backfill_target(m.from));
+
+ BackfillInterval bi;
+ bi.begin = m.begin;
+ bi.end = m.end;
+ {
+ auto p = m.get_data().cbegin();
+ // take care to preserve ordering!
+ bi.clear_objects();
+ ::decode_noclear(bi.objects, p);
+ }
+ shard_services.start_operation<crimson::osd::BackfillRecovery>(
+ static_cast<crimson::osd::PG*>(&pg),
+ shard_services,
+ pg.get_osdmap_epoch(),
+ crimson::osd::BackfillState::ReplicaScanned{ m.from, std::move(bi) });
+ return seastar::now();
+}
+
+seastar::future<> RecoveryBackend::handle_scan(
+ MOSDPGScan& m)
+{
+ logger().debug("{}", __func__);
+ switch (m.op) {
+ case MOSDPGScan::OP_SCAN_GET_DIGEST:
+ return handle_scan_get_digest(m);
+ case MOSDPGScan::OP_SCAN_DIGEST:
+ return handle_scan_digest(m);
+ default:
+ // FIXME: move to errorator
+ ceph_assert("unknown op type for pg scan");
+ return seastar::now();
+ }
+}
+
+seastar::future<> RecoveryBackend::handle_recovery_op(
+ Ref<MOSDFastDispatchOp> m)
+{
+ switch (m->get_header().type) {
+ case MSG_OSD_PG_BACKFILL:
+ return handle_backfill(*boost::static_pointer_cast<MOSDPGBackfill>(m));
+ case MSG_OSD_PG_BACKFILL_REMOVE:
+ return handle_backfill_remove(*boost::static_pointer_cast<MOSDPGBackfillRemove>(m));
+ case MSG_OSD_PG_SCAN:
+ return handle_scan(*boost::static_pointer_cast<MOSDPGScan>(m));
+ default:
+ return seastar::make_exception_future<>(
+ std::invalid_argument(fmt::format("invalid request type: {}",
+ m->get_header().type)));
+ }
+}
diff --git a/src/crimson/osd/recovery_backend.h b/src/crimson/osd/recovery_backend.h
new file mode 100644
index 000000000..cb0ae9f20
--- /dev/null
+++ b/src/crimson/osd/recovery_backend.h
@@ -0,0 +1,203 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/future.hh>
+
+#include "crimson/common/type_helpers.h"
+#include "crimson/os/futurized_store.h"
+#include "crimson/os/futurized_collection.h"
+#include "crimson/osd/object_context.h"
+#include "crimson/osd/shard_services.h"
+
+#include "messages/MOSDPGBackfill.h"
+#include "messages/MOSDPGBackfillRemove.h"
+#include "messages/MOSDPGScan.h"
+#include "osd/recovery_types.h"
+#include "osd/osd_types.h"
+
+namespace crimson::osd{
+ class PG;
+}
+
+class PGBackend;
+
+class RecoveryBackend {
+ void handle_backfill_finish(
+ MOSDPGBackfill& m);
+ seastar::future<> handle_backfill_progress(
+ MOSDPGBackfill& m);
+ seastar::future<> handle_backfill_finish_ack(
+ MOSDPGBackfill& m);
+ seastar::future<> handle_backfill(MOSDPGBackfill& m);
+
+ seastar::future<> handle_backfill_remove(MOSDPGBackfillRemove& m);
+
+ seastar::future<> handle_scan_get_digest(
+ MOSDPGScan& m);
+ seastar::future<> handle_scan_digest(
+ MOSDPGScan& m);
+ seastar::future<> handle_scan(
+ MOSDPGScan& m);
+protected:
+ class WaitForObjectRecovery;
+public:
+ RecoveryBackend(crimson::osd::PG& pg,
+ crimson::osd::ShardServices& shard_services,
+ crimson::os::CollectionRef coll,
+ PGBackend* backend)
+ : pg{pg},
+ shard_services{shard_services},
+ store{&shard_services.get_store()},
+ coll{coll},
+ backend{backend} {}
+ virtual ~RecoveryBackend() {}
+ WaitForObjectRecovery& add_recovering(const hobject_t& soid) {
+ auto [it, added] = recovering.emplace(soid, WaitForObjectRecovery{});
+ assert(added);
+ return it->second;
+ }
+ WaitForObjectRecovery& get_recovering(const hobject_t& soid) {
+ assert(is_recovering(soid));
+ return recovering.at(soid);
+ }
+ void remove_recovering(const hobject_t& soid) {
+ recovering.erase(soid);
+ }
+ bool is_recovering(const hobject_t& soid) const {
+ return recovering.count(soid) != 0;
+ }
+ uint64_t total_recovering() const {
+ return recovering.size();
+ }
+
+ virtual seastar::future<> handle_recovery_op(
+ Ref<MOSDFastDispatchOp> m);
+
+ virtual seastar::future<> recover_object(
+ const hobject_t& soid,
+ eversion_t need) = 0;
+ virtual seastar::future<> recover_delete(
+ const hobject_t& soid,
+ eversion_t need) = 0;
+ virtual seastar::future<> push_delete(
+ const hobject_t& soid,
+ eversion_t need) = 0;
+
+ seastar::future<BackfillInterval> scan_for_backfill(
+ const hobject_t& from,
+ std::int64_t min,
+ std::int64_t max);
+
+ void on_peering_interval_change(ceph::os::Transaction& t) {
+ clean_up(t, "new peering interval");
+ }
+
+ seastar::future<> stop() {
+ for (auto& [soid, recovery_waiter] : recovering) {
+ recovery_waiter.stop();
+ }
+ return on_stop();
+ }
+protected:
+ crimson::osd::PG& pg;
+ crimson::osd::ShardServices& shard_services;
+ crimson::os::FuturizedStore* store;
+ crimson::os::CollectionRef coll;
+ PGBackend* backend;
+
+ struct PullInfo {
+ pg_shard_t from;
+ hobject_t soid;
+ ObjectRecoveryProgress recovery_progress;
+ ObjectRecoveryInfo recovery_info;
+ crimson::osd::ObjectContextRef head_ctx;
+ crimson::osd::ObjectContextRef obc;
+ object_stat_sum_t stat;
+ bool is_complete() const {
+ return recovery_progress.is_complete(recovery_info);
+ }
+ };
+
+ struct PushInfo {
+ ObjectRecoveryProgress recovery_progress;
+ ObjectRecoveryInfo recovery_info;
+ crimson::osd::ObjectContextRef obc;
+ object_stat_sum_t stat;
+ };
+
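+  // Tracks one object's in-flight recovery: promises for the object
+  // becoming readable, fully recovered, pulled from a peer, and per-shard
+  // push acknowledgements. The set_*() helpers below fulfil them;
+  // interrupt()/stop() fail them all.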
+ class WaitForObjectRecovery : public crimson::osd::BlockerT<WaitForObjectRecovery> {
+ seastar::shared_promise<> readable, recovered, pulled;
+ std::map<pg_shard_t, seastar::shared_promise<>> pushes;
+ public:
+ static constexpr const char* type_name = "WaitForObjectRecovery";
+
+ crimson::osd::ObjectContextRef obc;
+ std::optional<PullInfo> pi;
+ std::map<pg_shard_t, PushInfo> pushing;
+
+ seastar::future<> wait_for_readable() {
+ return readable.get_shared_future();
+ }
+ seastar::future<> wait_for_pushes(pg_shard_t shard) {
+ return pushes[shard].get_shared_future();
+ }
+ seastar::future<> wait_for_recovered() {
+ return recovered.get_shared_future();
+ }
+ crimson::osd::blocking_future<>
+ wait_for_recovered_blocking() {
+ return make_blocking_future(
+ recovered.get_shared_future());
+ }
+ seastar::future<> wait_for_pull() {
+ return pulled.get_shared_future();
+ }
+ void set_readable() {
+ readable.set_value();
+ }
+ void set_recovered() {
+ recovered.set_value();
+ }
+ void set_pushed(pg_shard_t shard) {
+ pushes[shard].set_value();
+ }
+ void set_pulled() {
+ pulled.set_value();
+ }
+ void set_push_failed(pg_shard_t shard, std::exception_ptr e) {
+ pushes.at(shard).set_exception(e);
+ }
+ void interrupt(std::string_view why) {
+ readable.set_exception(std::system_error(
+ std::make_error_code(std::errc::interrupted), why.data()));
+ recovered.set_exception(std::system_error(
+ std::make_error_code(std::errc::interrupted), why.data()));
+ pulled.set_exception(std::system_error(
+ std::make_error_code(std::errc::interrupted), why.data()));
+ for (auto& [pg_shard, pr] : pushes) {
+ pr.set_exception(std::system_error(
+ std::make_error_code(std::errc::interrupted), why.data()));
+ }
+ }
+ void stop();
+ void dump_detail(Formatter* f) const {
+ }
+ };
+ std::map<hobject_t, WaitForObjectRecovery> recovering;
+ hobject_t get_temp_recovery_object(
+ const hobject_t& target,
+ eversion_t version) const;
+
+ boost::container::flat_set<hobject_t> temp_contents;
+
+ void add_temp_obj(const hobject_t &oid) {
+ temp_contents.insert(oid);
+ }
+ void clear_temp_obj(const hobject_t &oid) {
+ temp_contents.erase(oid);
+ }
+ void clean_up(ceph::os::Transaction& t, std::string_view why);
+ virtual seastar::future<> on_stop() = 0;
+};
diff --git a/src/crimson/osd/replicated_backend.cc b/src/crimson/osd/replicated_backend.cc
new file mode 100644
index 000000000..3a131278b
--- /dev/null
+++ b/src/crimson/osd/replicated_backend.cc
@@ -0,0 +1,144 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "replicated_backend.h"
+
+#include "messages/MOSDRepOpReply.h"
+
+#include "crimson/common/exception.h"
+#include "crimson/common/log.h"
+#include "crimson/os/futurized_store.h"
+#include "crimson/osd/shard_services.h"
+#include "osd/PeeringState.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+ReplicatedBackend::ReplicatedBackend(pg_t pgid,
+ pg_shard_t whoami,
+ ReplicatedBackend::CollectionRef coll,
+ crimson::osd::ShardServices& shard_services)
+ : PGBackend{whoami.shard, coll, &shard_services.get_store()},
+ pgid{pgid},
+ whoami{whoami},
+ shard_services{shard_services}
+{}
+
+ReplicatedBackend::ll_read_errorator::future<ceph::bufferlist>
+ReplicatedBackend::_read(const hobject_t& hoid,
+ const uint64_t off,
+ const uint64_t len,
+ const uint32_t flags)
+{
+ if (__builtin_expect(stopping, false)) {
+ throw crimson::common::system_shutdown_exception();
+ }
+ return store->read(coll, ghobject_t{hoid}, off, len, flags);
+}
+
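+// Encode the transaction once, apply it locally and send it as an MOSDRepOp
+// to every other shard in `pg_shards`; the returned future resolves with the
+// acked peers once the local commit and all peer replies have arrived
+// (tracked by the pending_on_t entry keyed by tid).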
+seastar::future<crimson::osd::acked_peers_t>
+ReplicatedBackend::_submit_transaction(std::set<pg_shard_t>&& pg_shards,
+ const hobject_t& hoid,
+ ceph::os::Transaction&& txn,
+ const osd_op_params_t& osd_op_p,
+ epoch_t min_epoch, epoch_t map_epoch,
+ std::vector<pg_log_entry_t>&& log_entries)
+{
+ if (__builtin_expect(stopping, false)) {
+ throw crimson::common::system_shutdown_exception();
+ }
+ if (__builtin_expect((bool)peering, false)) {
+ throw crimson::common::actingset_changed(peering->is_primary);
+ }
+
+ const ceph_tid_t tid = next_txn_id++;
+ auto req_id = osd_op_p.req->get_reqid();
+ auto pending_txn =
+ pending_trans.emplace(tid, pg_shards.size()).first;
+ bufferlist encoded_txn;
+ encode(txn, encoded_txn);
+
+ return seastar::parallel_for_each(std::move(pg_shards),
+ [=, encoded_txn=std::move(encoded_txn), txn=std::move(txn)]
+ (auto pg_shard) mutable {
+ if (pg_shard == whoami) {
+ return shard_services.get_store().do_transaction(coll,std::move(txn));
+ } else {
+ auto m = make_message<MOSDRepOp>(req_id, whoami,
+ spg_t{pgid, pg_shard.shard}, hoid,
+ CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
+ map_epoch, min_epoch,
+ tid, osd_op_p.at_version);
+ m->set_data(encoded_txn);
+ pending_txn->second.acked_peers.push_back({pg_shard, eversion_t{}});
+ encode(log_entries, m->logbl);
+ m->pg_trim_to = osd_op_p.pg_trim_to;
+ m->min_last_complete_ondisk = osd_op_p.min_last_complete_ondisk;
+ m->set_rollback_to(osd_op_p.at_version);
+ // TODO: set more stuff. e.g., pg_states
+ return shard_services.send_to_osd(pg_shard.osd, std::move(m), map_epoch);
+ }
+ }).then([this, peers=pending_txn->second.weak_from_this()] {
+ if (!peers) {
+ // for now, only actingset_changed can cause peers
+ // to be nullptr
+ assert(peering);
+ throw crimson::common::actingset_changed(peering->is_primary);
+ }
+ if (--peers->pending == 0) {
+ peers->all_committed.set_value();
+ peers->all_committed = {};
+ return seastar::now();
+ }
+ return peers->all_committed.get_future();
+ }).then([pending_txn, this] {
+ auto acked_peers = std::move(pending_txn->second.acked_peers);
+ pending_trans.erase(pending_txn);
+ return seastar::make_ready_future<crimson::osd::acked_peers_t>(std::move(acked_peers));
+ });
+}
+
+void ReplicatedBackend::on_actingset_changed(peering_info_t pi)
+{
+ peering.emplace(pi);
+ crimson::common::actingset_changed e_actingset_changed{peering->is_primary};
+ for (auto& [tid, pending_txn] : pending_trans) {
+ pending_txn.all_committed.set_exception(e_actingset_changed);
+ }
+ pending_trans.clear();
+}
+
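+// Match the reply to its in-flight transaction by tid, record the peer's
+// last_complete_ondisk, and complete all_committed once every ack is in.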
+void ReplicatedBackend::got_rep_op_reply(const MOSDRepOpReply& reply)
+{
+ auto found = pending_trans.find(reply.get_tid());
+ if (found == pending_trans.end()) {
+ logger().warn("{}: no matched pending rep op: {}", __func__, reply);
+ return;
+ }
+ auto& peers = found->second;
+ for (auto& peer : peers.acked_peers) {
+ if (peer.shard == reply.from) {
+ peer.last_complete_ondisk = reply.get_last_complete_ondisk();
+ if (--peers.pending == 0) {
+ peers.all_committed.set_value();
+ peers.all_committed = {};
+ }
+ return;
+ }
+ }
+}
+
+seastar::future<> ReplicatedBackend::stop()
+{
+ logger().info("ReplicatedBackend::stop {}", coll->get_cid());
+ stopping = true;
+ for (auto& [tid, pending_on] : pending_trans) {
+ pending_on.all_committed.set_exception(
+ crimson::common::system_shutdown_exception());
+ }
+ pending_trans.clear();
+ return seastar::now();
+}
diff --git a/src/crimson/osd/replicated_backend.h b/src/crimson/osd/replicated_backend.h
new file mode 100644
index 000000000..01c0bba64
--- /dev/null
+++ b/src/crimson/osd/replicated_backend.h
@@ -0,0 +1,55 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/intrusive_ptr.hpp>
+#include <seastar/core/future.hh>
+#include <seastar/core/weak_ptr.hh>
+#include "include/buffer_fwd.h"
+#include "osd/osd_types.h"
+
+#include "acked_peers.h"
+#include "pg_backend.h"
+
+namespace crimson::osd {
+ class ShardServices;
+}
+
+class ReplicatedBackend : public PGBackend
+{
+public:
+ ReplicatedBackend(pg_t pgid, pg_shard_t whoami,
+ CollectionRef coll,
+ crimson::osd::ShardServices& shard_services);
+ void got_rep_op_reply(const MOSDRepOpReply& reply) final;
+ seastar::future<> stop() final;
+ void on_actingset_changed(peering_info_t pi) final;
+private:
+ ll_read_errorator::future<ceph::bufferlist> _read(const hobject_t& hoid,
+ uint64_t off,
+ uint64_t len,
+ uint32_t flags) override;
+ seastar::future<crimson::osd::acked_peers_t>
+ _submit_transaction(std::set<pg_shard_t>&& pg_shards,
+ const hobject_t& hoid,
+ ceph::os::Transaction&& txn,
+ const osd_op_params_t& osd_op_p,
+ epoch_t min_epoch, epoch_t max_epoch,
+ std::vector<pg_log_entry_t>&& log_entries) final;
+ const pg_t pgid;
+ const pg_shard_t whoami;
+ crimson::osd::ShardServices& shard_services;
+ ceph_tid_t next_txn_id = 0;
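+  // Book-keeping for one in-flight replicated transaction: how many commits
+  // are still outstanding, which peers have acked (with their
+  // last_complete_ondisk), and a promise fulfilled once all of them are in.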
+ class pending_on_t : public seastar::weakly_referencable<pending_on_t> {
+ public:
+ pending_on_t(size_t pending)
+ : pending{static_cast<unsigned>(pending)}
+ {}
+ unsigned pending;
+ crimson::osd::acked_peers_t acked_peers;
+ seastar::promise<> all_committed;
+ };
+ using pending_transactions_t = std::map<ceph_tid_t, pending_on_t>;
+ pending_transactions_t pending_trans;
+};
diff --git a/src/crimson/osd/replicated_recovery_backend.cc b/src/crimson/osd/replicated_recovery_backend.cc
new file mode 100644
index 000000000..0812003bb
--- /dev/null
+++ b/src/crimson/osd/replicated_recovery_backend.cc
@@ -0,0 +1,1076 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+#include <seastar/core/future.hh>
+#include <seastar/core/do_with.hh>
+
+#include "crimson/osd/pg.h"
+#include "crimson/osd/pg_backend.h"
+#include "replicated_recovery_backend.h"
+
+#include "msg/Message.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+seastar::future<> ReplicatedRecoveryBackend::recover_object(
+ const hobject_t& soid,
+ eversion_t need)
+{
+ logger().debug("{}: {}, {}", __func__, soid, need);
+ // always add_recovering(soid) before recover_object(soid)
+ assert(is_recovering(soid));
+ // start tracking the recovery of soid
+ return maybe_pull_missing_obj(soid, need).then([this, soid, need] {
+ logger().debug("recover_object: loading obc: {}", soid);
+ return pg.with_head_obc<RWState::RWREAD>(soid,
+ [this, soid, need](auto obc) {
+ logger().debug("recover_object: loaded obc: {}", obc->obs.oi.soid);
+ auto& recovery_waiter = recovering.at(soid);
+ recovery_waiter.obc = obc;
+ recovery_waiter.obc->wait_recovery_read();
+ return maybe_push_shards(soid, need);
+ }).handle_error(
+ crimson::osd::PG::load_obc_ertr::all_same_way([soid](auto& code) {
+ // TODO: may need eio handling?
+ logger().error("recover_object saw error code {}, ignoring object {}",
+ code, soid);
+ }));
+ });
+}
+
+seastar::future<>
+ReplicatedRecoveryBackend::maybe_push_shards(
+ const hobject_t& soid,
+ eversion_t need)
+{
+ return seastar::parallel_for_each(get_shards_to_push(soid),
+ [this, need, soid](auto shard) {
+ return prep_push(soid, need, shard).then([this, soid, shard](auto push) {
+ auto msg = make_message<MOSDPGPush>();
+ msg->from = pg.get_pg_whoami();
+ msg->pgid = pg.get_pgid();
+ msg->map_epoch = pg.get_osdmap_epoch();
+ msg->min_epoch = pg.get_last_peering_reset();
+ msg->pushes.push_back(std::move(push));
+ msg->set_priority(pg.get_recovery_op_priority());
+ return shard_services.send_to_osd(shard.osd,
+ std::move(msg),
+ pg.get_osdmap_epoch()).then(
+ [this, soid, shard] {
+ return recovering.at(soid).wait_for_pushes(shard);
+ });
+ });
+ }).then([this, soid] {
+ auto &recovery = recovering.at(soid);
+ auto push_info = recovery.pushing.begin();
+ object_stat_sum_t stat = {};
+ if (push_info != recovery.pushing.end()) {
+ stat = push_info->second.stat;
+ } else {
+ // no push happened, take pull_info's stat
+ assert(recovery.pi);
+ stat = recovery.pi->stat;
+ }
+ pg.get_recovery_handler()->on_global_recover(soid, stat, false);
+ return seastar::make_ready_future<>();
+ }).handle_exception([this, soid](auto e) {
+ auto &recovery = recovering.at(soid);
+ if (recovery.obc) {
+ recovery.obc->drop_recovery_read();
+ }
+ recovering.erase(soid);
+ return seastar::make_exception_future<>(e);
+ });
+}
+
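+// If the object is missing locally, set up a PullInfo, send an MOSDPGPull to
+// the chosen source OSD and wait until the pull completes; otherwise no-op.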
+seastar::future<>
+ReplicatedRecoveryBackend::maybe_pull_missing_obj(
+ const hobject_t& soid,
+ eversion_t need)
+{
+ pg_missing_tracker_t local_missing = pg.get_local_missing();
+ if (!local_missing.is_missing(soid)) {
+ return seastar::make_ready_future<>();
+ }
+ PullOp po;
+ auto& recovery_waiter = recovering.at(soid);
+ recovery_waiter.pi = std::make_optional<RecoveryBackend::PullInfo>();
+ auto& pi = *recovery_waiter.pi;
+ prepare_pull(po, pi, soid, need);
+ auto msg = make_message<MOSDPGPull>();
+ msg->from = pg.get_pg_whoami();
+ msg->set_priority(pg.get_recovery_op_priority());
+ msg->pgid = pg.get_pgid();
+ msg->map_epoch = pg.get_osdmap_epoch();
+ msg->min_epoch = pg.get_last_peering_reset();
+ msg->set_pulls({std::move(po)});
+ return shard_services.send_to_osd(
+ pi.from.osd,
+ std::move(msg),
+ pg.get_osdmap_epoch()
+ ).then([&recovery_waiter] {
+ return recovery_waiter.wait_for_pull();
+ });
+}
+
+seastar::future<> ReplicatedRecoveryBackend::push_delete(
+ const hobject_t& soid,
+ eversion_t need)
+{
+ logger().debug("{}: {}, {}", __func__, soid, need);
+ recovering[soid];
+ epoch_t min_epoch = pg.get_last_peering_reset();
+
+ assert(pg.get_acting_recovery_backfill().size() > 0);
+ return seastar::parallel_for_each(pg.get_acting_recovery_backfill(),
+ [this, soid, need, min_epoch](pg_shard_t shard) {
+ if (shard == pg.get_pg_whoami())
+ return seastar::make_ready_future<>();
+ auto iter = pg.get_shard_missing().find(shard);
+ if (iter == pg.get_shard_missing().end())
+ return seastar::make_ready_future<>();
+ if (iter->second.is_missing(soid)) {
+ logger().debug("push_delete: will remove {} from {}", soid, shard);
+ pg.begin_peer_recover(shard, soid);
+ spg_t target_pg(pg.get_info().pgid.pgid, shard.shard);
+ auto msg = make_message<MOSDPGRecoveryDelete>(
+ pg.get_pg_whoami(), target_pg, pg.get_osdmap_epoch(), min_epoch);
+ msg->set_priority(pg.get_recovery_op_priority());
+ msg->objects.push_back(std::make_pair(soid, need));
+ return shard_services.send_to_osd(shard.osd, std::move(msg),
+ pg.get_osdmap_epoch()).then(
+ [this, soid, shard] {
+ return recovering.at(soid).wait_for_pushes(shard);
+ });
+ }
+ return seastar::make_ready_future<>();
+ });
+}
+
+seastar::future<> ReplicatedRecoveryBackend::handle_recovery_delete(
+ Ref<MOSDPGRecoveryDelete> m)
+{
+ logger().debug("{}: {}", __func__, *m);
+
+ auto& p = m->objects.front(); //TODO: only one delete per message for now.
+ return local_recover_delete(p.first, p.second, pg.get_osdmap_epoch()).then(
+ [this, m] {
+ auto reply = make_message<MOSDPGRecoveryDeleteReply>();
+ reply->from = pg.get_pg_whoami();
+ reply->set_priority(m->get_priority());
+ reply->pgid = spg_t(pg.get_info().pgid.pgid, m->from.shard);
+ reply->map_epoch = m->map_epoch;
+ reply->min_epoch = m->min_epoch;
+ reply->objects = m->objects;
+ return shard_services.send_to_osd(m->from.osd, std::move(reply), pg.get_osdmap_epoch());
+ });
+}
+
+seastar::future<> ReplicatedRecoveryBackend::on_local_recover_persist(
+ const hobject_t& soid,
+ const ObjectRecoveryInfo& _recovery_info,
+ bool is_delete,
+ epoch_t epoch_frozen)
+{
+ logger().debug("{}", __func__);
+ ceph::os::Transaction t;
+ pg.get_recovery_handler()->on_local_recover(soid, _recovery_info, is_delete, t);
+ return shard_services.get_store().do_transaction(coll, std::move(t)).then(
+ [this, epoch_frozen, last_complete = pg.get_info().last_complete] {
+ pg.get_recovery_handler()->_committed_pushed_object(epoch_frozen, last_complete);
+ return seastar::make_ready_future<>();
+ });
+}
+
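+// Remove the local copy of the object (if it exists) and record the deletion
+// as a local recovery event once the removal transaction commits.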
+seastar::future<> ReplicatedRecoveryBackend::local_recover_delete(
+ const hobject_t& soid,
+ eversion_t need,
+ epoch_t epoch_to_freeze)
+{
+ logger().debug("{}: {}, {}", __func__, soid, need);
+ return backend->load_metadata(soid).safe_then([this]
+ (auto lomt) {
+ if (lomt->os.exists) {
+ return seastar::do_with(ceph::os::Transaction(),
+ [this, lomt = std::move(lomt)](auto& txn) {
+ return backend->remove(lomt->os, txn).then([this, &txn]() mutable {
+ return shard_services.get_store().do_transaction(coll,
+ std::move(txn));
+ });
+ });
+ }
+ return seastar::make_ready_future<>();
+ }).safe_then([this, soid, epoch_to_freeze, need] {
+ ObjectRecoveryInfo recovery_info;
+ recovery_info.soid = soid;
+ recovery_info.version = need;
+ return on_local_recover_persist(soid, recovery_info,
+ true, epoch_to_freeze);
+ }, PGBackend::load_metadata_ertr::all_same_way(
+ [this, soid, epoch_to_freeze, need] (auto e) {
+ ObjectRecoveryInfo recovery_info;
+ recovery_info.soid = soid;
+ recovery_info.version = need;
+ return on_local_recover_persist(soid, recovery_info,
+ true, epoch_to_freeze);
+ })
+ );
+}
+
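+// Delete the object locally first; if no replica still lists it as missing
+// the recovery is already complete, otherwise propagate the delete to the
+// replicas via push_delete() before marking the object globally recovered.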
+seastar::future<> ReplicatedRecoveryBackend::recover_delete(
+ const hobject_t &soid, eversion_t need)
+{
+ logger().debug("{}: {}, {}", __func__, soid, need);
+
+ epoch_t cur_epoch = pg.get_osdmap_epoch();
+ return seastar::do_with(object_stat_sum_t(),
+ [this, soid, need, cur_epoch](auto& stat_diff) {
+ return local_recover_delete(soid, need, cur_epoch).then(
+ [this, &stat_diff, cur_epoch, soid, need] {
+ if (!pg.has_reset_since(cur_epoch)) {
+ bool object_missing = false;
+ for (const auto& shard : pg.get_acting_recovery_backfill()) {
+ if (shard == pg.get_pg_whoami())
+ continue;
+ if (pg.get_shard_missing(shard)->is_missing(soid)) {
+          logger().debug("recover_delete: soid {} needs to be deleted from replica {}",
+ soid, shard);
+ object_missing = true;
+ break;
+ }
+ }
+
+ if (!object_missing) {
+ stat_diff.num_objects_recovered = 1;
+ return seastar::make_ready_future<>();
+ } else {
+ return push_delete(soid, need);
+ }
+ }
+ return seastar::make_ready_future<>();
+ }).then([this, soid, &stat_diff] {
+ pg.get_recovery_handler()->on_global_recover(soid, stat_diff, true);
+ return seastar::make_ready_future<>();
+ });
+ });
+}
+
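+// Prepare a push to a single replica: work out which extents the replica is
+// missing (only its dirty regions when all peers speak SERVER_OCTOPUS),
+// record the push in recovery_waiter.pushing and build the first PushOp.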
+seastar::future<PushOp>
+ReplicatedRecoveryBackend::prep_push(
+ const hobject_t& soid,
+ eversion_t need,
+ pg_shard_t pg_shard)
+{
+ logger().debug("{}: {}, {}", __func__, soid, need);
+
+ auto& recovery_waiter = recovering.at(soid);
+ auto& obc = recovery_waiter.obc;
+ interval_set<uint64_t> data_subset;
+ if (obc->obs.oi.size) {
+ data_subset.insert(0, obc->obs.oi.size);
+ }
+ const auto& missing = pg.get_shard_missing().find(pg_shard)->second;
+ if (HAVE_FEATURE(pg.min_peer_features(), SERVER_OCTOPUS)) {
+ const auto it = missing.get_items().find(soid);
+ assert(it != missing.get_items().end());
+ data_subset.intersection_of(it->second.clean_regions.get_dirty_regions());
+ logger().debug("prep_push: {} data_subset {}", soid, data_subset);
+ }
+
+ logger().debug("prep_push: {} to {}", soid, pg_shard);
+ auto& pi = recovery_waiter.pushing[pg_shard];
+ pg.begin_peer_recover(pg_shard, soid);
+ const auto pmissing_iter = pg.get_shard_missing().find(pg_shard);
+ const auto missing_iter = pmissing_iter->second.get_items().find(soid);
+ assert(missing_iter != pmissing_iter->second.get_items().end());
+
+ pi.obc = obc;
+ pi.recovery_info.size = obc->obs.oi.size;
+ pi.recovery_info.copy_subset = data_subset;
+ pi.recovery_info.soid = soid;
+ pi.recovery_info.oi = obc->obs.oi;
+ pi.recovery_info.version = obc->obs.oi.version;
+ pi.recovery_info.object_exist =
+ missing_iter->second.clean_regions.object_is_exist();
+ pi.recovery_progress.omap_complete =
+ (!missing_iter->second.clean_regions.omap_is_dirty() &&
+ HAVE_FEATURE(pg.min_peer_features(), SERVER_OCTOPUS));
+
+ return build_push_op(pi.recovery_info, pi.recovery_progress, &pi.stat).then(
+ [this, soid, pg_shard](auto pop) {
+ auto& recovery_waiter = recovering.at(soid);
+ auto& pi = recovery_waiter.pushing[pg_shard];
+ pi.recovery_progress = pop.after_progress;
+ return pop;
+ });
+}
+
+void ReplicatedRecoveryBackend::prepare_pull(PullOp& po, PullInfo& pi,
+ const hobject_t& soid,
+ eversion_t need) {
+ logger().debug("{}: {}, {}", __func__, soid, need);
+
+ pg_missing_tracker_t local_missing = pg.get_local_missing();
+ const auto missing_iter = local_missing.get_items().find(soid);
+ auto m = pg.get_missing_loc_shards();
+ pg_shard_t fromshard = *(m[soid].begin());
+
+ //TODO: skipped snap objects case for now
+ po.recovery_info.copy_subset.insert(0, (uint64_t) -1);
+ if (HAVE_FEATURE(pg.min_peer_features(), SERVER_OCTOPUS))
+ po.recovery_info.copy_subset.intersection_of(
+ missing_iter->second.clean_regions.get_dirty_regions());
+ po.recovery_info.size = ((uint64_t) -1);
+ po.recovery_info.object_exist =
+ missing_iter->second.clean_regions.object_is_exist();
+ po.recovery_info.soid = soid;
+ po.soid = soid;
+ po.recovery_progress.data_complete = false;
+ po.recovery_progress.omap_complete =
+ !missing_iter->second.clean_regions.omap_is_dirty() &&
+ HAVE_FEATURE(pg.min_peer_features(), SERVER_OCTOPUS);
+ po.recovery_progress.data_recovered_to = 0;
+ po.recovery_progress.first = true;
+
+ pi.from = fromshard;
+ pi.soid = soid;
+ pi.recovery_info = po.recovery_info;
+ pi.recovery_progress = po.recovery_progress;
+}
+
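+// Assemble one PushOp chunk: object metadata (omap header and xattrs) on the
+// first chunk, then omap entries and object data, bounded by
+// osd_recovery_max_chunk bytes per op.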
+seastar::future<PushOp> ReplicatedRecoveryBackend::build_push_op(
+ const ObjectRecoveryInfo& recovery_info,
+ const ObjectRecoveryProgress& progress,
+ object_stat_sum_t* stat)
+{
+ logger().debug("{} {} @{}",
+ __func__, recovery_info.soid, recovery_info.version);
+ return seastar::do_with(ObjectRecoveryProgress(progress),
+ uint64_t(crimson::common::local_conf()
+ ->osd_recovery_max_chunk),
+ recovery_info.version,
+ PushOp(),
+ [this, &recovery_info, &progress, stat]
+ (auto new_progress, auto available, auto v, auto pop) {
+ return read_metadata_for_push_op(recovery_info.soid,
+ progress, new_progress,
+ v, &pop).then([&](eversion_t local_ver) mutable {
+ // If requestor didn't know the version, use ours
+ if (v == eversion_t()) {
+ v = local_ver;
+ } else if (v != local_ver) {
+ logger().error("build_push_op: {} push {} v{} failed because local copy is {}",
+ pg.get_pgid(), recovery_info.soid, recovery_info.version, local_ver);
+ // TODO: bail out
+ }
+ return read_omap_for_push_op(recovery_info.soid,
+ progress,
+ new_progress,
+ &available, &pop);
+ }).then([this, &recovery_info, &progress, &available, &pop]() mutable {
+ logger().debug("build_push_op: available: {}, copy_subset: {}",
+ available, recovery_info.copy_subset);
+ return read_object_for_push_op(recovery_info.soid,
+ recovery_info.copy_subset,
+ progress.data_recovered_to,
+ available, &pop);
+ }).then([&recovery_info, &v, &progress, &new_progress, stat, &pop]
+ (uint64_t recovered_to) mutable {
+ new_progress.data_recovered_to = recovered_to;
+ if (new_progress.is_complete(recovery_info)) {
+ new_progress.data_complete = true;
+ if (stat)
+ stat->num_objects_recovered++;
+ } else if (progress.first && progress.omap_complete) {
+        // omap was not dirty, so it was marked complete up front; but since
+        // the object cannot be recovered in a single chunk, omap still has to
+        // be pushed along with a later chunk
+ new_progress.omap_complete = false;
+ }
+ if (stat) {
+ stat->num_keys_recovered += pop.omap_entries.size();
+ stat->num_bytes_recovered += pop.data.length();
+ }
+ pop.version = v;
+ pop.soid = recovery_info.soid;
+ pop.recovery_info = recovery_info;
+ pop.after_progress = new_progress;
+ pop.before_progress = progress;
+ logger().debug("build_push_op: pop version: {}, pop data length: {}",
+ pop.version, pop.data.length());
+ return seastar::make_ready_future<PushOp>(std::move(pop));
+ });
+ });
+}
+
+seastar::future<eversion_t>
+ReplicatedRecoveryBackend::read_metadata_for_push_op(
+ const hobject_t& oid,
+ const ObjectRecoveryProgress& progress,
+ ObjectRecoveryProgress& new_progress,
+ eversion_t ver,
+ PushOp* push_op)
+{
+ if (!progress.first) {
+ return seastar::make_ready_future<eversion_t>(ver);
+ }
+ return seastar::when_all_succeed(
+ backend->omap_get_header(coll, ghobject_t(oid)).handle_error(
+ crimson::os::FuturizedStore::read_errorator::all_same_way(
+ [] (const std::error_code& e) {
+ return seastar::make_ready_future<bufferlist>();
+ })),
+ store->get_attrs(coll, ghobject_t(oid)).handle_error(
+ crimson::os::FuturizedStore::get_attrs_ertr::all_same_way(
+ [] (const std::error_code& e) {
+ return seastar::make_ready_future<crimson::os::FuturizedStore::attrs_t>();
+ }))
+ ).then_unpack([&new_progress, push_op](auto bl, auto attrs) {
+ if (bl.length() == 0) {
+      logger().error("read_metadata_for_push_op: failed to read omap header");
+ return eversion_t{};
+ } else if (attrs.empty()) {
+      logger().error("read_metadata_for_push_op: failed to read attrs");
+ return eversion_t{};
+ }
+ push_op->omap_header.claim_append(std::move(bl));
+ for (auto&& [key, val] : std::move(attrs)) {
+ push_op->attrset[key].push_back(val);
+ }
+ logger().debug("read_metadata_for_push_op: {}", push_op->attrset[OI_ATTR]);
+ object_info_t oi;
+ oi.decode(push_op->attrset[OI_ATTR]);
+ new_progress.first = false;
+ return oi.version;
+ });
+}
+
+seastar::future<uint64_t>
+ReplicatedRecoveryBackend::read_object_for_push_op(
+ const hobject_t& oid,
+ const interval_set<uint64_t>& copy_subset,
+ uint64_t offset,
+ uint64_t max_len,
+ PushOp* push_op)
+{
+ if (max_len == 0 || copy_subset.empty()) {
+ push_op->data_included.clear();
+ return seastar::make_ready_future<uint64_t>(offset);
+ }
+ // 1. get the extents in the interested range
+ return backend->fiemap(coll, ghobject_t{oid},
+ 0, copy_subset.range_end()).then_wrapped(
+ [=](auto&& fiemap_included) mutable {
+ interval_set<uint64_t> extents;
+ try {
+ extents.intersection_of(copy_subset, fiemap_included.get0());
+ } catch (std::exception &) {
+ // if fiemap() fails, we will read nothing, as the intersection of
+ // copy_subset and an empty interval_set would be empty anyway
+ extents.clear();
+ }
+ // 2. we can read up to "max_len" bytes from "offset", so truncate the
+ // extents down to this quota. no need to return the number of consumed
+ // bytes, as this is the last consumer of this quota
+ push_op->data_included.span_of(extents, offset, max_len);
+ // 3. read the truncated extents
+ // TODO: check if the returned extents are pruned
+ return store->readv(coll, ghobject_t{oid}, push_op->data_included, 0);
+ }).safe_then([push_op, range_end=copy_subset.range_end()](auto &&bl) {
+ push_op->data.claim_append(std::move(bl));
+ uint64_t recovered_to = 0;
+ if (push_op->data_included.empty()) {
+ // zero filled section, skip to end!
+ recovered_to = range_end;
+ } else {
+ // note down the progress, we will start from there next time
+ recovered_to = push_op->data_included.range_end();
+ }
+ return seastar::make_ready_future<uint64_t>(recovered_to);
+ }, PGBackend::read_errorator::all_same_way([](auto e) {
+ logger().debug("build_push_op: read exception");
+ return seastar::make_exception_future<uint64_t>(e);
+ }));
+}
+
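+// Copy omap entries into the push op, stopping once either
+// osd_recovery_max_omap_entries_per_chunk entries have been collected or the
+// remaining byte budget (*max_len) is used up.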
+seastar::future<>
+ReplicatedRecoveryBackend::read_omap_for_push_op(
+ const hobject_t& oid,
+ const ObjectRecoveryProgress& progress,
+ ObjectRecoveryProgress& new_progress,
+ uint64_t* max_len,
+ PushOp* push_op)
+{
+ if (progress.omap_complete) {
+ return seastar::make_ready_future<>();
+ }
+ return shard_services.get_store().get_omap_iterator(coll, ghobject_t{oid})
+ .then([&progress, &new_progress, max_len, push_op](auto omap_iter) {
+ return omap_iter->lower_bound(progress.omap_recovered_to).then(
+ [omap_iter, &new_progress, max_len, push_op] {
+ return seastar::do_until([omap_iter, &new_progress, max_len, push_op] {
+ if (!omap_iter->valid()) {
+ new_progress.omap_complete = true;
+ return true;
+ }
+ if (push_op->omap_entries.empty()) {
+ return false;
+ }
+ if (const uint64_t entries_per_chunk =
+ crimson::common::local_conf()->osd_recovery_max_omap_entries_per_chunk;
+ entries_per_chunk > 0 &&
+ push_op->omap_entries.size() >= entries_per_chunk) {
+ new_progress.omap_recovered_to = omap_iter->key();
+ return true;
+ }
+ if (omap_iter->key().size() + omap_iter->value().length() > *max_len) {
+ new_progress.omap_recovered_to = omap_iter->key();
+ return true;
+ }
+ return false;
+ },
+ [omap_iter, max_len, push_op] {
+ push_op->omap_entries.emplace(omap_iter->key(), omap_iter->value());
+        // consume the remaining byte budget, guarding against underflow
+        if (const uint64_t entry_size =
+              omap_iter->key().size() + omap_iter->value().length();
+            entry_size <= *max_len) {
+          *max_len -= entry_size;
+        } else {
+          *max_len = 0;
+        }
+ return omap_iter->next();
+ });
+ });
+ });
+}
+
+std::vector<pg_shard_t>
+ReplicatedRecoveryBackend::get_shards_to_push(const hobject_t& soid) const
+{
+ std::vector<pg_shard_t> shards;
+ assert(pg.get_acting_recovery_backfill().size() > 0);
+ for (const auto& peer : pg.get_acting_recovery_backfill()) {
+ if (peer == pg.get_pg_whoami())
+ continue;
+ auto shard_missing =
+ pg.get_shard_missing().find(peer);
+ assert(shard_missing != pg.get_shard_missing().end());
+ if (shard_missing->second.is_missing(soid)) {
+ shards.push_back(shard_missing->first);
+ }
+ }
+ return shards;
+}
+
+seastar::future<> ReplicatedRecoveryBackend::handle_pull(Ref<MOSDPGPull> m)
+{
+ logger().debug("{}: {}", __func__, *m);
+ return seastar::parallel_for_each(m->take_pulls(),
+ [this, from=m->from](auto& pull_op) {
+ const hobject_t& soid = pull_op.soid;
+ logger().debug("handle_pull: {}", soid);
+ return backend->stat(coll, ghobject_t(soid)).then(
+ [this, &pull_op](auto st) {
+ ObjectRecoveryInfo &recovery_info = pull_op.recovery_info;
+ ObjectRecoveryProgress &progress = pull_op.recovery_progress;
+ if (progress.first && recovery_info.size == ((uint64_t) -1)) {
+ // Adjust size and copy_subset
+ recovery_info.size = st.st_size;
+ if (st.st_size) {
+ interval_set<uint64_t> object_range;
+ object_range.insert(0, st.st_size);
+ recovery_info.copy_subset.intersection_of(object_range);
+ } else {
+ recovery_info.copy_subset.clear();
+ }
+ assert(recovery_info.clone_subset.empty());
+ }
+ return build_push_op(recovery_info, progress, 0);
+ }).then([this, from](auto pop) {
+ auto msg = make_message<MOSDPGPush>();
+ msg->from = pg.get_pg_whoami();
+ msg->pgid = pg.get_pgid();
+ msg->map_epoch = pg.get_osdmap_epoch();
+ msg->min_epoch = pg.get_last_peering_reset();
+ msg->set_priority(pg.get_recovery_op_priority());
+ msg->pushes.push_back(std::move(pop));
+ return shard_services.send_to_osd(from.osd, std::move(msg),
+ pg.get_osdmap_epoch());
+ });
+ });
+}
+
+seastar::future<bool> ReplicatedRecoveryBackend::_handle_pull_response(
+ pg_shard_t from,
+ const PushOp& pop,
+ PullOp* response,
+ ceph::os::Transaction* t)
+{
+ logger().debug("handle_pull_response {} {} data.size() is {} data_included: {}",
+ pop.recovery_info, pop.after_progress, pop.data.length(), pop.data_included);
+
+ const hobject_t &hoid = pop.soid;
+ auto& recovery_waiter = recovering.at(hoid);
+ auto& pi = *recovery_waiter.pi;
+ if (pi.recovery_info.size == (uint64_t(-1))) {
+ pi.recovery_info.size = pop.recovery_info.size;
+ pi.recovery_info.copy_subset.intersection_of(
+ pop.recovery_info.copy_subset);
+ }
+
+ // If primary doesn't have object info and didn't know version
+ if (pi.recovery_info.version == eversion_t())
+ pi.recovery_info.version = pop.version;
+
+ auto prepare_waiter = seastar::make_ready_future<>();
+ if (pi.recovery_progress.first) {
+ prepare_waiter = pg.with_head_obc<RWState::RWNONE>(
+ pi.recovery_info.soid, [&pi, &recovery_waiter, &pop](auto obc) {
+ pi.obc = obc;
+ recovery_waiter.obc = obc;
+ obc->obs.oi.decode(pop.attrset.at(OI_ATTR));
+ pi.recovery_info.oi = obc->obs.oi;
+ return crimson::osd::PG::load_obc_ertr::now();
+ }).handle_error(crimson::ct_error::assert_all{});
+  }
+ return prepare_waiter.then([this, &pi, &pop, t, response]() mutable {
+ const bool first = pi.recovery_progress.first;
+ pi.recovery_progress = pop.after_progress;
+ logger().debug("new recovery_info {}, new progress {}",
+ pi.recovery_info, pi.recovery_progress);
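+    // the byte range the pusher covered in this chunk; whatever part of it
+    // carries no data (holes found by fiemap) will be zero-filled by
+    // submit_push_data()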
+ interval_set<uint64_t> data_zeros;
+ {
+ uint64_t offset = pop.before_progress.data_recovered_to;
+ uint64_t length = (pop.after_progress.data_recovered_to -
+ pop.before_progress.data_recovered_to);
+ if (length) {
+ data_zeros.insert(offset, length);
+ }
+ }
+ auto [usable_intervals, data] =
+ trim_pushed_data(pi.recovery_info.copy_subset,
+ pop.data_included, pop.data);
+ bool complete = pi.is_complete();
+ bool clear_omap = !pop.before_progress.omap_complete;
+ return submit_push_data(pi.recovery_info, first, complete, clear_omap,
+ std::move(data_zeros), usable_intervals, data, pop.omap_header,
+ pop.attrset, pop.omap_entries, t).then(
+ [this, response, &pi, &pop, complete, t, bytes_recovered=data.length()] {
+ pi.stat.num_keys_recovered += pop.omap_entries.size();
+ pi.stat.num_bytes_recovered += bytes_recovered;
+
+ if (complete) {
+ pi.stat.num_objects_recovered++;
+ pg.get_recovery_handler()->on_local_recover(
+ pop.soid, recovering.at(pop.soid).pi->recovery_info,
+ false, *t);
+ return true;
+ } else {
+ response->soid = pop.soid;
+ response->recovery_info = pi.recovery_info;
+ response->recovery_progress = pi.recovery_progress;
+ return false;
+ }
+ });
+ });
+}
+
+seastar::future<> ReplicatedRecoveryBackend::handle_pull_response(
+ Ref<MOSDPGPush> m)
+{
+ const PushOp& pop = m->pushes[0]; //TODO: only one push per message for now.
+ if (pop.version == eversion_t()) {
+ // replica doesn't have it!
+ pg.get_recovery_handler()->on_failed_recover({ m->from }, pop.soid,
+ get_recovering(pop.soid).pi->recovery_info.version);
+ return seastar::make_exception_future<>(
+ std::runtime_error(fmt::format(
+ "Error on pushing side {} when pulling obj {}",
+ m->from, pop.soid)));
+ }
+
+ logger().debug("{}: {}", __func__, *m);
+ return seastar::do_with(PullOp(), [this, m](auto& response) {
+ return seastar::do_with(ceph::os::Transaction(), m.get(),
+ [this, &response](auto& t, auto& m) {
+ pg_shard_t from = m->from;
+ PushOp& pop = m->pushes[0]; // only one push per message for now
+ return _handle_pull_response(from, pop, &response, &t).then(
+ [this, &t](bool complete) {
+ epoch_t epoch_frozen = pg.get_osdmap_epoch();
+ return shard_services.get_store().do_transaction(coll, std::move(t))
+ .then([this, epoch_frozen, complete,
+ last_complete = pg.get_info().last_complete] {
+ pg.get_recovery_handler()->_committed_pushed_object(epoch_frozen, last_complete);
+ return seastar::make_ready_future<bool>(complete);
+ });
+ });
+ }).then([this, m, &response](bool complete) {
+ if (complete) {
+ auto& pop = m->pushes[0];
+ recovering.at(pop.soid).set_pulled();
+ return seastar::make_ready_future<>();
+ } else {
+ auto reply = make_message<MOSDPGPull>();
+ reply->from = pg.get_pg_whoami();
+ reply->set_priority(m->get_priority());
+ reply->pgid = pg.get_info().pgid;
+ reply->map_epoch = m->map_epoch;
+ reply->min_epoch = m->min_epoch;
+ reply->set_pulls({std::move(response)});
+ return shard_services.send_to_osd(m->from.osd, std::move(reply), pg.get_osdmap_epoch());
+ }
+ });
+ });
+}
+
+seastar::future<> ReplicatedRecoveryBackend::_handle_push(
+ pg_shard_t from,
+ const PushOp &pop,
+ PushReplyOp *response,
+ ceph::os::Transaction *t)
+{
+ logger().debug("{}", __func__);
+
+ bool first = pop.before_progress.first;
+ interval_set<uint64_t> data_zeros;
+ {
+ uint64_t offset = pop.before_progress.data_recovered_to;
+ uint64_t length = (pop.after_progress.data_recovered_to -
+ pop.before_progress.data_recovered_to);
+ if (length) {
+ data_zeros.insert(offset, length);
+ }
+ }
+ bool complete = (pop.after_progress.data_complete &&
+ pop.after_progress.omap_complete);
+ bool clear_omap = !pop.before_progress.omap_complete;
+ response->soid = pop.recovery_info.soid;
+
+ return submit_push_data(pop.recovery_info, first, complete, clear_omap,
+ std::move(data_zeros), pop.data_included, pop.data, pop.omap_header,
+ pop.attrset, pop.omap_entries, t).then([this, complete, &pop, t] {
+ if (complete) {
+ pg.get_recovery_handler()->on_local_recover(
+ pop.recovery_info.soid, pop.recovery_info,
+ false, *t);
+ }
+ });
+}
+
+seastar::future<> ReplicatedRecoveryBackend::handle_push(
+ Ref<MOSDPGPush> m)
+{
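+  // a push arriving at the primary must be the continuation of a pull that
+  // the primary itself issued, so treat it as a pull response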
+ if (pg.is_primary()) {
+ return handle_pull_response(m);
+ }
+
+ logger().debug("{}: {}", __func__, *m);
+ return seastar::do_with(PushReplyOp(), [this, m](auto& response) {
+ const PushOp& pop = m->pushes[0]; //TODO: only one push per message for now
+ return seastar::do_with(ceph::os::Transaction(),
+ [this, m, &pop, &response](auto& t) {
+ return _handle_push(m->from, pop, &response, &t).then(
+ [this, &t] {
+ epoch_t epoch_frozen = pg.get_osdmap_epoch();
+ return shard_services.get_store().do_transaction(coll, std::move(t)).then(
+ [this, epoch_frozen, last_complete = pg.get_info().last_complete] {
+ //TODO: this should be grouped with pg.on_local_recover somehow.
+ pg.get_recovery_handler()->_committed_pushed_object(epoch_frozen, last_complete);
+ });
+ });
+ }).then([this, m, &response]() mutable {
+ auto reply = make_message<MOSDPGPushReply>();
+ reply->from = pg.get_pg_whoami();
+ reply->set_priority(m->get_priority());
+ reply->pgid = pg.get_info().pgid;
+ reply->map_epoch = m->map_epoch;
+ reply->min_epoch = m->min_epoch;
+ std::vector<PushReplyOp> replies = { std::move(response) };
+ reply->replies.swap(replies);
+ return shard_services.send_to_osd(m->from.osd,
+ std::move(reply), pg.get_osdmap_epoch());
+ });
+ });
+}
+
+seastar::future<std::optional<PushOp>>
+ReplicatedRecoveryBackend::_handle_push_reply(
+ pg_shard_t peer,
+ const PushReplyOp &op)
+{
+ const hobject_t& soid = op.soid;
+ logger().debug("{}, soid {}, from {}", __func__, soid, peer);
+ auto recovering_iter = recovering.find(soid);
+ if (recovering_iter == recovering.end()
+ || !recovering_iter->second.pushing.count(peer)) {
+ logger().debug("huh, i wasn't pushing {} to osd.{}", soid, peer);
+ return seastar::make_ready_future<std::optional<PushOp>>();
+ } else {
+ auto& pi = recovering_iter->second.pushing[peer];
+ bool error = pi.recovery_progress.error;
+ if (!pi.recovery_progress.data_complete && !error) {
+ return build_push_op(pi.recovery_info, pi.recovery_progress,
+ &pi.stat).then([&pi] (auto pop) {
+ pi.recovery_progress = pop.after_progress;
+ return seastar::make_ready_future<std::optional<PushOp>>(std::move(pop));
+ }).handle_exception([recovering_iter, &pi, peer] (auto e) {
+ pi.recovery_progress.error = true;
+ recovering_iter->second.set_push_failed(peer, e);
+ return seastar::make_ready_future<std::optional<PushOp>>();
+ });
+ }
+ if (!error) {
+ pg.get_recovery_handler()->on_peer_recover(peer, soid, pi.recovery_info);
+ }
+ recovering_iter->second.set_pushed(peer);
+ return seastar::make_ready_future<std::optional<PushOp>>();
+ }
+}
+
+seastar::future<> ReplicatedRecoveryBackend::handle_push_reply(
+ Ref<MOSDPGPushReply> m)
+{
+ logger().debug("{}: {}", __func__, *m);
+ auto from = m->from;
+ auto& push_reply = m->replies[0]; //TODO: only one reply per message
+
+ return _handle_push_reply(from, push_reply).then(
+ [this, from](std::optional<PushOp> push_op) {
+ if (push_op) {
+ auto msg = make_message<MOSDPGPush>();
+ msg->from = pg.get_pg_whoami();
+ msg->pgid = pg.get_pgid();
+ msg->map_epoch = pg.get_osdmap_epoch();
+ msg->min_epoch = pg.get_last_peering_reset();
+ msg->set_priority(pg.get_recovery_op_priority());
+ msg->pushes.push_back(std::move(*push_op));
+ return shard_services.send_to_osd(from.osd,
+ std::move(msg),
+ pg.get_osdmap_epoch());
+ } else {
+ return seastar::make_ready_future<>();
+ }
+ });
+}
+
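+// Keep only the received extents (and the corresponding bytes) that fall
+// inside copy_subset; anything outside the wanted range is discarded.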
+std::pair<interval_set<uint64_t>,
+ bufferlist>
+ReplicatedRecoveryBackend::trim_pushed_data(
+ const interval_set<uint64_t> &copy_subset,
+ const interval_set<uint64_t> &intervals_received,
+ ceph::bufferlist data_received)
+{
+ logger().debug("{}", __func__);
+  // everything received lies within the wanted range, keep it all as-is
+ if (intervals_received.subset_of(copy_subset)) {
+ return {intervals_received, data_received};
+ }
+ // only collect the extents included by copy_subset and intervals_received
+ interval_set<uint64_t> intervals_usable;
+ bufferlist data_usable;
+ intervals_usable.intersection_of(copy_subset, intervals_received);
+ uint64_t have_off = 0;
+ for (auto [have_start, have_len] : intervals_received) {
+ interval_set<uint64_t> want;
+ want.insert(have_start, have_len);
+ want.intersection_of(copy_subset);
+ for (auto [want_start, want_len] : want) {
+ bufferlist sub;
+ uint64_t data_off = have_off + (want_start - have_start);
+ sub.substr_of(data_received, data_off, want_len);
+ data_usable.claim_append(sub);
+ }
+ have_off += have_len;
+ }
+ return {intervals_usable, data_usable};
+}
+
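+// Apply one push chunk to the object store.  Unless the object is recovered
+// in a single chunk, data is staged in a temporary recovery object that is
+// renamed over the target once the final chunk has been applied.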
+seastar::future<> ReplicatedRecoveryBackend::submit_push_data(
+ const ObjectRecoveryInfo &recovery_info,
+ bool first,
+ bool complete,
+ bool clear_omap,
+ interval_set<uint64_t> data_zeros,
+ const interval_set<uint64_t> &intervals_included,
+ bufferlist data_included,
+ bufferlist omap_header,
+ const map<string, bufferlist> &attrs,
+ const map<string, bufferlist> &omap_entries,
+ ObjectStore::Transaction *t)
+{
+ logger().debug("{}", __func__);
+ hobject_t target_oid;
+ if (first && complete) {
+ target_oid = recovery_info.soid;
+ } else {
+ target_oid = get_temp_recovery_object(recovery_info.soid,
+ recovery_info.version);
+ if (first) {
+ logger().debug("{}: Adding oid {} in the temp collection",
+ __func__, target_oid);
+ add_temp_obj(target_oid);
+ }
+ }
+
+ return [this, &recovery_info, first, complete, t,
+ &omap_header, &attrs, target_oid, clear_omap] {
+ if (first) {
+ if (!complete) {
+ t->remove(coll->get_cid(), ghobject_t(target_oid));
+ t->touch(coll->get_cid(), ghobject_t(target_oid));
+ bufferlist bv = attrs.at(OI_ATTR);
+ object_info_t oi(bv);
+ t->set_alloc_hint(coll->get_cid(), ghobject_t(target_oid),
+ oi.expected_object_size,
+ oi.expected_write_size,
+ oi.alloc_hint_flags);
+ } else {
+ if (!recovery_info.object_exist) {
+ t->remove(coll->get_cid(), ghobject_t(target_oid));
+ t->touch(coll->get_cid(), ghobject_t(target_oid));
+ bufferlist bv = attrs.at(OI_ATTR);
+ object_info_t oi(bv);
+ t->set_alloc_hint(coll->get_cid(), ghobject_t(target_oid),
+ oi.expected_object_size,
+ oi.expected_write_size,
+ oi.alloc_hint_flags);
+ }
+        // remove the xattrs; they are re-set later if the original object is overwritten
+        t->rmattrs(coll->get_cid(), ghobject_t(target_oid));
+        // if the omap needs to be updated, clear its previous contents first
+ if (clear_omap)
+ t->omap_clear(coll->get_cid(), ghobject_t(target_oid));
+ }
+
+ t->truncate(coll->get_cid(), ghobject_t(target_oid), recovery_info.size);
+ if (omap_header.length())
+ t->omap_setheader(coll->get_cid(), ghobject_t(target_oid), omap_header);
+
+ return store->stat(coll, ghobject_t(recovery_info.soid)).then(
+ [this, &recovery_info, complete, t, target_oid,
+ omap_header = std::move(omap_header)] (auto st) {
+ //TODO: pg num bytes counting
+ if (!complete) {
+        // clone the overlapping content from the existing local object
+ if (recovery_info.object_exist) {
+ uint64_t local_size = std::min(recovery_info.size, (uint64_t)st.st_size);
+ interval_set<uint64_t> local_intervals_included, local_intervals_excluded;
+ if (local_size) {
+ local_intervals_included.insert(0, local_size);
+ local_intervals_excluded.intersection_of(local_intervals_included, recovery_info.copy_subset);
+ local_intervals_included.subtract(local_intervals_excluded);
+ }
+ for (auto [off, len] : local_intervals_included) {
+ logger().debug(" clone_range {} {}~{}",
+ recovery_info.soid, off, len);
+ t->clone_range(coll->get_cid(), ghobject_t(recovery_info.soid),
+ ghobject_t(target_oid), off, len, off);
+ }
+ }
+ }
+ return seastar::make_ready_future<>();
+ });
+ }
+ return seastar::make_ready_future<>();
+ }().then([this, data_zeros=std::move(data_zeros),
+ &recovery_info, &intervals_included, t, target_oid,
+ &omap_entries, &attrs, data_included, complete, first]() mutable {
+ uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL;
+ // Punch zeros for data, if fiemap indicates nothing but it is marked dirty
+ if (!data_zeros.empty()) {
+ data_zeros.intersection_of(recovery_info.copy_subset);
+ assert(intervals_included.subset_of(data_zeros));
+ data_zeros.subtract(intervals_included);
+
+ logger().debug("submit_push_data recovering object {} copy_subset: {} "
+ "intervals_included: {} data_zeros: {}",
+ recovery_info.soid, recovery_info.copy_subset,
+ intervals_included, data_zeros);
+
+ for (auto [start, len] : data_zeros) {
+ t->zero(coll->get_cid(), ghobject_t(target_oid), start, len);
+ }
+ }
+ uint64_t off = 0;
+ for (auto [start, len] : intervals_included) {
+ bufferlist bit;
+ bit.substr_of(data_included, off, len);
+ t->write(coll->get_cid(), ghobject_t(target_oid),
+ start, len, bit, fadvise_flags);
+ off += len;
+ }
+
+ if (!omap_entries.empty())
+ t->omap_setkeys(coll->get_cid(), ghobject_t(target_oid), omap_entries);
+ if (!attrs.empty())
+ t->setattrs(coll->get_cid(), ghobject_t(target_oid), attrs);
+
+ if (complete) {
+ if (!first) {
+ logger().debug("submit_push_data: Removing oid {} from the temp collection",
+ target_oid);
+ clear_temp_obj(target_oid);
+ t->remove(coll->get_cid(), ghobject_t(recovery_info.soid));
+ t->collection_move_rename(coll->get_cid(), ghobject_t(target_oid),
+ coll->get_cid(), ghobject_t(recovery_info.soid));
+ }
+ submit_push_complete(recovery_info, t);
+ }
+ logger().debug("submit_push_data: done");
+ return seastar::make_ready_future<>();
+ });
+}
+
+void ReplicatedRecoveryBackend::submit_push_complete(
+ const ObjectRecoveryInfo &recovery_info,
+ ObjectStore::Transaction *t)
+{
+ for (const auto& [oid, extents] : recovery_info.clone_subset) {
+ for (const auto [off, len] : extents) {
+ logger().debug(" clone_range {} {}~{}", oid, off, len);
+ t->clone_range(coll->get_cid(), ghobject_t(oid), ghobject_t(recovery_info.soid),
+ off, len, off);
+ }
+ }
+}
+
+seastar::future<> ReplicatedRecoveryBackend::handle_recovery_delete_reply(
+ Ref<MOSDPGRecoveryDeleteReply> m)
+{
+ auto& p = m->objects.front();
+ hobject_t soid = p.first;
+ ObjectRecoveryInfo recovery_info;
+ recovery_info.version = p.second;
+ pg.get_recovery_handler()->on_peer_recover(m->from, soid, recovery_info);
+ get_recovering(soid).set_pushed(m->from);
+ return seastar::now();
+}
+
+seastar::future<> ReplicatedRecoveryBackend::handle_recovery_op(Ref<MOSDFastDispatchOp> m)
+{
+ switch (m->get_header().type) {
+ case MSG_OSD_PG_PULL:
+ return handle_pull(boost::static_pointer_cast<MOSDPGPull>(m));
+ case MSG_OSD_PG_PUSH:
+ return handle_push(boost::static_pointer_cast<MOSDPGPush>(m));
+ case MSG_OSD_PG_PUSH_REPLY:
+ return handle_push_reply(
+ boost::static_pointer_cast<MOSDPGPushReply>(m));
+ case MSG_OSD_PG_RECOVERY_DELETE:
+ return handle_recovery_delete(
+ boost::static_pointer_cast<MOSDPGRecoveryDelete>(m));
+ case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
+ return handle_recovery_delete_reply(
+ boost::static_pointer_cast<MOSDPGRecoveryDeleteReply>(m));
+ default:
+ // delegate to parent class for handling backend-agnostic recovery ops.
+ return RecoveryBackend::handle_recovery_op(std::move(m));
+ }
+}
+
diff --git a/src/crimson/osd/replicated_recovery_backend.h b/src/crimson/osd/replicated_recovery_backend.h
new file mode 100644
index 000000000..d99538a75
--- /dev/null
+++ b/src/crimson/osd/replicated_recovery_backend.h
@@ -0,0 +1,151 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/osd/recovery_backend.h"
+
+#include "messages/MOSDPGPull.h"
+#include "messages/MOSDPGPush.h"
+#include "messages/MOSDPGPushReply.h"
+#include "messages/MOSDPGRecoveryDelete.h"
+#include "messages/MOSDPGRecoveryDeleteReply.h"
+#include "os/ObjectStore.h"
+
+class ReplicatedRecoveryBackend : public RecoveryBackend {
+public:
+ ReplicatedRecoveryBackend(crimson::osd::PG& pg,
+ crimson::osd::ShardServices& shard_services,
+ crimson::os::CollectionRef coll,
+ PGBackend* backend)
+ : RecoveryBackend(pg, shard_services, coll, backend) {}
+ seastar::future<> handle_recovery_op(
+ Ref<MOSDFastDispatchOp> m) final;
+
+ seastar::future<> recover_object(
+ const hobject_t& soid,
+ eversion_t need) final;
+ seastar::future<> recover_delete(
+ const hobject_t& soid,
+ eversion_t need) final;
+ seastar::future<> push_delete(
+ const hobject_t& soid,
+ eversion_t need) final;
+protected:
+ seastar::future<> handle_pull(
+ Ref<MOSDPGPull> m);
+ seastar::future<> handle_pull_response(
+ Ref<MOSDPGPush> m);
+ seastar::future<> handle_push(
+ Ref<MOSDPGPush> m);
+ seastar::future<> handle_push_reply(
+ Ref<MOSDPGPushReply> m);
+ seastar::future<> handle_recovery_delete(
+ Ref<MOSDPGRecoveryDelete> m);
+ seastar::future<> handle_recovery_delete_reply(
+ Ref<MOSDPGRecoveryDeleteReply> m);
+ seastar::future<PushOp> prep_push(
+ const hobject_t& soid,
+ eversion_t need,
+ pg_shard_t pg_shard);
+ void prepare_pull(
+ PullOp& po,
+ PullInfo& pi,
+ const hobject_t& soid,
+ eversion_t need);
+ std::vector<pg_shard_t> get_shards_to_push(
+ const hobject_t& soid) const;
+ seastar::future<PushOp> build_push_op(
+ const ObjectRecoveryInfo& recovery_info,
+ const ObjectRecoveryProgress& progress,
+ object_stat_sum_t* stat);
+  /// @returns true if this push op is the last one needed to recover
+  /// @c pop.soid
+ seastar::future<bool> _handle_pull_response(
+ pg_shard_t from,
+ const PushOp& pop,
+ PullOp* response,
+ ceph::os::Transaction* t);
+ std::pair<interval_set<uint64_t>, ceph::bufferlist> trim_pushed_data(
+ const interval_set<uint64_t> &copy_subset,
+ const interval_set<uint64_t> &intervals_received,
+ ceph::bufferlist data_received);
+ seastar::future<> submit_push_data(
+ const ObjectRecoveryInfo &recovery_info,
+ bool first,
+ bool complete,
+ bool clear_omap,
+ interval_set<uint64_t> data_zeros,
+ const interval_set<uint64_t> &intervals_included,
+ ceph::bufferlist data_included,
+ ceph::bufferlist omap_header,
+ const std::map<string, bufferlist> &attrs,
+ const std::map<string, bufferlist> &omap_entries,
+ ceph::os::Transaction *t);
+ void submit_push_complete(
+ const ObjectRecoveryInfo &recovery_info,
+ ObjectStore::Transaction *t);
+ seastar::future<> _handle_push(
+ pg_shard_t from,
+ const PushOp &pop,
+ PushReplyOp *response,
+ ceph::os::Transaction *t);
+ seastar::future<std::optional<PushOp>> _handle_push_reply(
+ pg_shard_t peer,
+ const PushReplyOp &op);
+ seastar::future<> on_local_recover_persist(
+ const hobject_t& soid,
+ const ObjectRecoveryInfo& _recovery_info,
+ bool is_delete,
+ epoch_t epoch_to_freeze);
+ seastar::future<> local_recover_delete(
+ const hobject_t& soid,
+ eversion_t need,
+ epoch_t epoch_frozen);
+ seastar::future<> on_stop() final {
+ return seastar::now();
+ }
+private:
+ /// pull missing object from peer
+ seastar::future<> maybe_pull_missing_obj(
+ const hobject_t& soid,
+ eversion_t need);
+
+ /// load object context for recovery if it is not ready yet
+ using load_obc_ertr = crimson::errorator<
+ crimson::ct_error::object_corrupted>;
+
+ seastar::future<> maybe_push_shards(
+ const hobject_t& soid,
+ eversion_t need);
+
+  /// read the metadata (omap header and xattrs) attached to the given object;
+  /// they are expected to be relatively small.
+ ///
+ /// @return @c oi.version
+ seastar::future<eversion_t> read_metadata_for_push_op(
+ const hobject_t& oid,
+ const ObjectRecoveryProgress& progress,
+ ObjectRecoveryProgress& new_progress,
+ eversion_t ver,
+ PushOp* push_op);
+  /// read the remaining extents of the object being recovered and fill
+  /// push_op with them
+  ///
+  /// @param oid object being recovered
+  /// @param copy_subset extents we want
+  /// @param offset the offset in the object from which to start reading
+  /// @param max_len the maximum number of bytes to read
+  /// @return the new offset
+ seastar::future<uint64_t> read_object_for_push_op(
+ const hobject_t& oid,
+ const interval_set<uint64_t>& copy_subset,
+ uint64_t offset,
+ uint64_t max_len,
+ PushOp* push_op);
+ seastar::future<> read_omap_for_push_op(
+ const hobject_t& oid,
+ const ObjectRecoveryProgress& progress,
+ ObjectRecoveryProgress& new_progress,
+ uint64_t* max_len,
+ PushOp* push_op);
+};
diff --git a/src/crimson/osd/scheduler/mclock_scheduler.cc b/src/crimson/osd/scheduler/mclock_scheduler.cc
new file mode 100644
index 000000000..195ea8dd8
--- /dev/null
+++ b/src/crimson/osd/scheduler/mclock_scheduler.cc
@@ -0,0 +1,165 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include <memory>
+#include <functional>
+
+#include "crimson/osd/scheduler/mclock_scheduler.h"
+#include "common/dout.h"
+
+namespace dmc = crimson::dmclock;
+using namespace std::placeholders;
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix *_dout
+
+
+namespace crimson::osd::scheduler {
+
+mClockScheduler::mClockScheduler(ConfigProxy &conf) :
+ scheduler(
+ std::bind(&mClockScheduler::ClientRegistry::get_info,
+ &client_registry,
+ _1),
+ dmc::AtLimit::Allow,
+ conf.get_val<double>("osd_mclock_scheduler_anticipation_timeout"))
+{
+ conf.add_observer(this);
+ client_registry.update_from_config(conf);
+}
+
+void mClockScheduler::ClientRegistry::update_from_config(const ConfigProxy &conf)
+{
+ default_external_client_info.update(
+ conf.get_val<uint64_t>("osd_mclock_scheduler_client_res"),
+ conf.get_val<uint64_t>("osd_mclock_scheduler_client_wgt"),
+ conf.get_val<uint64_t>("osd_mclock_scheduler_client_lim"));
+
+ internal_client_infos[
+ static_cast<size_t>(scheduler_class_t::background_recovery)].update(
+ conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_res"),
+ conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_wgt"),
+ conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_lim"));
+
+ internal_client_infos[
+ static_cast<size_t>(scheduler_class_t::background_best_effort)].update(
+ conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_res"),
+ conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_wgt"),
+ conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_lim"));
+}
+
+const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_external_client(
+ const client_profile_id_t &client) const
+{
+ auto ret = external_client_infos.find(client);
+ if (ret == external_client_infos.end())
+ return &default_external_client_info;
+ else
+ return &(ret->second);
+}
+
+const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_info(
+ const scheduler_id_t &id) const {
+ switch (id.class_id) {
+ case scheduler_class_t::immediate:
+ ceph_assert(0 == "Cannot schedule immediate");
+ return (dmc::ClientInfo*)nullptr;
+ case scheduler_class_t::repop:
+ case scheduler_class_t::client:
+ return get_external_client(id.client_profile_id);
+ default:
+ ceph_assert(static_cast<size_t>(id.class_id) < internal_client_infos.size());
+ return &internal_client_infos[static_cast<size_t>(id.class_id)];
+ }
+}
+
+void mClockScheduler::dump(ceph::Formatter &f) const
+{
+}
+
+void mClockScheduler::enqueue(item_t&& item)
+{
+ auto id = get_scheduler_id(item);
+ auto cost = item.params.cost;
+
+ if (scheduler_class_t::immediate == item.params.klass) {
+ immediate.push_front(std::move(item));
+ } else {
+ scheduler.add_request(
+ std::move(item),
+ id,
+ cost);
+ }
+}
+
+void mClockScheduler::enqueue_front(item_t&& item)
+{
+ immediate.push_back(std::move(item));
+ // TODO: item may not be immediate, update mclock machinery to permit
+ // putting the item back in the queue
+}
+
+item_t mClockScheduler::dequeue()
+{
+ if (!immediate.empty()) {
+ auto ret = std::move(immediate.back());
+ immediate.pop_back();
+ return ret;
+ } else {
+ mclock_queue_t::PullReq result = scheduler.pull_request();
+ if (result.is_future()) {
+ ceph_assert(
+ 0 == "Not implemented, user would have to be able to be woken up");
+ return std::move(*(item_t*)nullptr);
+ } else if (result.is_none()) {
+ ceph_assert(
+ 0 == "Impossible, must have checked empty() first");
+ return std::move(*(item_t*)nullptr);
+ } else {
+ ceph_assert(result.is_retn());
+
+ auto &retn = result.get_retn();
+ return std::move(*retn.request);
+ }
+ }
+}
+
+const char** mClockScheduler::get_tracked_conf_keys() const
+{
+ static const char* KEYS[] = {
+ "osd_mclock_scheduler_client_res",
+ "osd_mclock_scheduler_client_wgt",
+ "osd_mclock_scheduler_client_lim",
+ "osd_mclock_scheduler_background_recovery_res",
+ "osd_mclock_scheduler_background_recovery_wgt",
+ "osd_mclock_scheduler_background_recovery_lim",
+ "osd_mclock_scheduler_background_best_effort_res",
+ "osd_mclock_scheduler_background_best_effort_wgt",
+ "osd_mclock_scheduler_background_best_effort_lim",
+ NULL
+ };
+ return KEYS;
+}
+
+void mClockScheduler::handle_conf_change(
+ const ConfigProxy& conf,
+ const std::set<std::string> &changed)
+{
+ client_registry.update_from_config(conf);
+}
+
+}
diff --git a/src/crimson/osd/scheduler/mclock_scheduler.h b/src/crimson/osd/scheduler/mclock_scheduler.h
new file mode 100644
index 000000000..c3edbe729
--- /dev/null
+++ b/src/crimson/osd/scheduler/mclock_scheduler.h
@@ -0,0 +1,130 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#pragma once
+
+#include <ostream>
+#include <map>
+#include <vector>
+
+#include "boost/variant.hpp"
+
+#include "dmclock/src/dmclock_server.h"
+
+#include "crimson/osd/scheduler/scheduler.h"
+#include "common/config.h"
+#include "include/cmp.h"
+#include "common/ceph_context.h"
+
+
+namespace crimson::osd::scheduler {
+
+using client_id_t = uint64_t;
+using profile_id_t = uint64_t;
+
+struct client_profile_id_t {
+ client_id_t client_id;
+ profile_id_t profile_id;
+};
+
+WRITE_EQ_OPERATORS_2(client_profile_id_t, client_id, profile_id)
+WRITE_CMP_OPERATORS_2(client_profile_id_t, client_id, profile_id)
+
+
+struct scheduler_id_t {
+ scheduler_class_t class_id;
+ client_profile_id_t client_profile_id;
+};
+
+WRITE_EQ_OPERATORS_2(scheduler_id_t, class_id, client_profile_id)
+WRITE_CMP_OPERATORS_2(scheduler_id_t, class_id, client_profile_id)
+
+/**
+ * Scheduler implementation based on mclock.
+ *
+ * TODO: explain configs
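+ *
+ * Each scheduler class is served according to a dmclock reservation, weight,
+ * and limit taken from the osd_mclock_scheduler_* options (see
+ * ClientRegistry::update_from_config()).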
+ */
+class mClockScheduler : public Scheduler, md_config_obs_t {
+
+ class ClientRegistry {
+ std::array<
+ crimson::dmclock::ClientInfo,
+ static_cast<size_t>(scheduler_class_t::client)
+ > internal_client_infos = {
+ // Placeholder, gets replaced with configured values
+ crimson::dmclock::ClientInfo(1, 1, 1),
+ crimson::dmclock::ClientInfo(1, 1, 1)
+ };
+
+ crimson::dmclock::ClientInfo default_external_client_info = {1, 1, 1};
+ std::map<client_profile_id_t,
+ crimson::dmclock::ClientInfo> external_client_infos;
+ const crimson::dmclock::ClientInfo *get_external_client(
+ const client_profile_id_t &client) const;
+ public:
+ void update_from_config(const ConfigProxy &conf);
+ const crimson::dmclock::ClientInfo *get_info(
+ const scheduler_id_t &id) const;
+ } client_registry;
+
+ using mclock_queue_t = crimson::dmclock::PullPriorityQueue<
+ scheduler_id_t,
+ item_t,
+ true,
+ true,
+ 2>;
+ mclock_queue_t scheduler;
+ std::list<item_t> immediate;
+
+ static scheduler_id_t get_scheduler_id(const item_t &item) {
+ return scheduler_id_t{
+ item.params.klass,
+ client_profile_id_t{
+ item.params.owner,
+ 0
+ }
+ };
+ }
+
+public:
+ mClockScheduler(ConfigProxy &conf);
+
+ // Enqueue op in the back of the regular queue
+ void enqueue(item_t &&item) final;
+
+ // Enqueue the op in the front of the regular queue
+ void enqueue_front(item_t &&item) final;
+
+  // Return an op to be dispatched
+ item_t dequeue() final;
+
+  // Returns true if the queue is empty
+ bool empty() const final {
+ return immediate.empty() && scheduler.empty();
+ }
+
+ // Formatted output of the queue
+ void dump(ceph::Formatter &f) const final;
+
+ void print(std::ostream &ostream) const final {
+ ostream << "mClockScheduler";
+ }
+
+ const char** get_tracked_conf_keys() const final;
+ void handle_conf_change(const ConfigProxy& conf,
+ const std::set<std::string> &changed) final;
+};
+
+}
diff --git a/src/crimson/osd/scheduler/scheduler.cc b/src/crimson/osd/scheduler/scheduler.cc
new file mode 100644
index 000000000..c85cb388e
--- /dev/null
+++ b/src/crimson/osd/scheduler/scheduler.cc
@@ -0,0 +1,181 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <ostream>
+
+#include <seastar/core/print.hh>
+
+#include "crimson/osd/scheduler/scheduler.h"
+#include "crimson/osd/scheduler/mclock_scheduler.h"
+#include "common/WeightedPriorityQueue.h"
+
+namespace crimson::osd::scheduler {
+
+std::ostream &operator<<(std::ostream &lhs, const scheduler_class_t &c)
+{
+ switch (c) {
+ case scheduler_class_t::background_best_effort:
+ return lhs << "background_best_effort";
+ case scheduler_class_t::background_recovery:
+ return lhs << "background_recovery";
+ case scheduler_class_t::client:
+ return lhs << "client";
+ case scheduler_class_t::repop:
+ return lhs << "repop";
+ case scheduler_class_t::immediate:
+ return lhs << "immediate";
+ default:
+ return lhs;
+ }
+}
+
+/**
+ * Implements Scheduler in terms of OpQueue
+ *
+ * Templated on the queue type to avoid dynamic dispatch; T should implement
+ * OpQueue<item_t, client_t>. This adapter is mainly responsible for
+ * the boilerplate priority cutoff/strict concept which is needed for
+ * OpQueue based implementations.
+ */
+template <typename T>
+class ClassedOpQueueScheduler final : public Scheduler {
+ const scheduler_class_t cutoff;
+ T queue;
+
+ using priority_t = uint64_t;
+ std::array<
+ priority_t,
+ static_cast<size_t>(scheduler_class_t::immediate)
+ > priority_map = {
+ // Placeholder, gets replaced with configured values
+ 0, 0, 0
+ };
+
+ static scheduler_class_t get_io_prio_cut(ConfigProxy &conf) {
+ if (conf.get_val<std::string>("osd_op_queue_cut_off") == "debug_random") {
+ srand(time(NULL));
+ return (rand() % 2 < 1) ?
+ scheduler_class_t::repop : scheduler_class_t::immediate;
+ } else if (conf.get_val<std::string>("osd_op_queue_cut_off") == "high") {
+ return scheduler_class_t::immediate;
+ } else {
+ return scheduler_class_t::repop;
+ }
+ }
+
+ bool use_strict(scheduler_class_t kl) const {
+ return static_cast<uint8_t>(kl) >= static_cast<uint8_t>(cutoff);
+ }
+
+ priority_t get_priority(scheduler_class_t kl) const {
+ ceph_assert(static_cast<size_t>(kl) <
+ static_cast<size_t>(scheduler_class_t::immediate));
+ return priority_map[static_cast<size_t>(kl)];
+ }
+
+public:
+ template <typename... Args>
+ ClassedOpQueueScheduler(ConfigProxy &conf, Args&&... args) :
+ cutoff(get_io_prio_cut(conf)),
+ queue(std::forward<Args>(args)...)
+ {
+ priority_map[
+ static_cast<size_t>(scheduler_class_t::background_best_effort)
+ ] = conf.get_val<uint64_t>("osd_scrub_priority");
+ priority_map[
+ static_cast<size_t>(scheduler_class_t::background_recovery)
+ ] = conf.get_val<uint64_t>("osd_recovery_op_priority");
+ priority_map[
+ static_cast<size_t>(scheduler_class_t::client)
+ ] = conf.get_val<uint64_t>("osd_client_op_priority");
+ priority_map[
+ static_cast<size_t>(scheduler_class_t::repop)
+ ] = conf.get_val<uint64_t>("osd_client_op_priority");
+ }
+
+ void enqueue(item_t &&item) final {
+ if (use_strict(item.params.klass))
+ queue.enqueue_strict(
+ item.params.owner, get_priority(item.params.klass), std::move(item));
+ else
+ queue.enqueue(
+ item.params.owner, get_priority(item.params.klass),
+ item.params.cost, std::move(item));
+ }
+
+ void enqueue_front(item_t &&item) final {
+ if (use_strict(item.params.klass))
+ queue.enqueue_strict_front(
+ item.params.owner, get_priority(item.params.klass), std::move(item));
+ else
+ queue.enqueue_front(
+ item.params.owner, get_priority(item.params.klass),
+ item.params.cost, std::move(item));
+ }
+
+ bool empty() const final {
+ return queue.empty();
+ }
+
+ item_t dequeue() final {
+ return queue.dequeue();
+ }
+
+ void dump(ceph::Formatter &f) const final {
+ return queue.dump(&f);
+ }
+
+ void print(std::ostream &out) const final {
+ out << "ClassedOpQueueScheduler(queue=";
+ queue.print(out);
+ out << ", cutoff=" << cutoff << ")";
+ }
+
+ ~ClassedOpQueueScheduler() final {};
+};
+
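+// Factory: picks the scheduler implementation named by osd_op_queue;
+// "debug_random" selects one of the known implementations at random.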
+SchedulerRef make_scheduler(ConfigProxy &conf)
+{
+ const std::string _type = conf.get_val<std::string>("osd_op_queue");
+ const std::string *type = &_type;
+ if (*type == "debug_random") {
+ static const std::string index_lookup[] = { "mclock_scheduler",
+ "wpq" };
+ srand(time(NULL));
+ unsigned which = rand() % (sizeof(index_lookup) / sizeof(index_lookup[0]));
+ type = &index_lookup[which];
+ }
+
+ if (*type == "wpq" ) {
+ // default is 'wpq'
+ return std::make_unique<
+ ClassedOpQueueScheduler<WeightedPriorityQueue<item_t, client_t>>>(
+ conf,
+ conf.get_val<uint64_t>("osd_op_pq_max_tokens_per_priority"),
+ conf->osd_op_pq_min_cost
+ );
+ } else if (*type == "mclock_scheduler") {
+ return std::make_unique<mClockScheduler>(conf);
+ } else {
+ ceph_assert("Invalid choice of wq" == 0);
+ return std::unique_ptr<mClockScheduler>();
+ }
+}
+
+std::ostream &operator<<(std::ostream &lhs, const Scheduler &rhs) {
+ rhs.print(lhs);
+ return lhs;
+}
+
+}
diff --git a/src/crimson/osd/scheduler/scheduler.h b/src/crimson/osd/scheduler/scheduler.h
new file mode 100644
index 000000000..a014991ab
--- /dev/null
+++ b/src/crimson/osd/scheduler/scheduler.h
@@ -0,0 +1,82 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <ostream>
+
+#include "crimson/common/config_proxy.h"
+
+namespace crimson::osd::scheduler {
+
+enum class scheduler_class_t : uint8_t {
+ background_best_effort = 0,
+ background_recovery,
+ client,
+ repop,
+ immediate,
+};
+
+std::ostream &operator<<(std::ostream &, const scheduler_class_t &);
+
+using client_t = uint64_t;
+using cost_t = uint64_t;
+
+struct params_t {
+ cost_t cost = 1;
+ client_t owner;
+ scheduler_class_t klass;
+};
+
+struct item_t {
+ params_t params;
+ seastar::promise<> wake;
+};
+
+/**
+ * Base interface for classes responsible for choosing
+ * op processing order in the OSD.
+ */
+class Scheduler {
+public:
+ // Enqueue op for scheduling
+ virtual void enqueue(item_t &&item) = 0;
+
+ // Enqueue op for processing as though it were enqueued prior
+ // to other items already scheduled.
+ virtual void enqueue_front(item_t &&item) = 0;
+
+ // Returns true iff there are no ops scheduled
+ virtual bool empty() const = 0;
+
+ // Return next op to be processed
+ virtual item_t dequeue() = 0;
+
+ // Dump formatted representation for the queue
+ virtual void dump(ceph::Formatter &f) const = 0;
+
+ // Print human readable brief description with relevant parameters
+ virtual void print(std::ostream &out) const = 0;
+
+ // Destructor
+ virtual ~Scheduler() {};
+};
+
+std::ostream &operator<<(std::ostream &lhs, const Scheduler &);
+using SchedulerRef = std::unique_ptr<Scheduler>;
+
+SchedulerRef make_scheduler(ConfigProxy &);
+
+}
diff --git a/src/crimson/osd/shard_services.cc b/src/crimson/osd/shard_services.cc
new file mode 100644
index 000000000..8c2cfc415
--- /dev/null
+++ b/src/crimson/osd/shard_services.cc
@@ -0,0 +1,311 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/osd/shard_services.h"
+
+#include "messages/MOSDAlive.h"
+
+#include "osd/osd_perf_counters.h"
+#include "osd/PeeringState.h"
+#include "crimson/common/config_proxy.h"
+#include "crimson/mgr/client.h"
+#include "crimson/mon/MonClient.h"
+#include "crimson/net/Messenger.h"
+#include "crimson/net/Connection.h"
+#include "crimson/os/cyanstore/cyan_store.h"
+#include "crimson/osd/osdmap_service.h"
+#include "messages/MOSDPGTemp.h"
+#include "messages/MOSDPGCreated.h"
+#include "messages/MOSDPGNotify.h"
+#include "messages/MOSDPGInfo.h"
+#include "messages/MOSDPGQuery.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+ShardServices::ShardServices(
+ OSDMapService &osdmap_service,
+ const int whoami,
+ crimson::net::Messenger &cluster_msgr,
+ crimson::net::Messenger &public_msgr,
+ crimson::mon::Client &monc,
+ crimson::mgr::Client &mgrc,
+ crimson::os::FuturizedStore &store)
+ : osdmap_service(osdmap_service),
+ whoami(whoami),
+ cluster_msgr(cluster_msgr),
+ public_msgr(public_msgr),
+ monc(monc),
+ mgrc(mgrc),
+ store(store),
+ throttler(crimson::common::local_conf()),
+ obc_registry(crimson::common::local_conf()),
+ local_reserver(
+ &cct,
+ &finisher,
+ crimson::common::local_conf()->osd_max_backfills,
+ crimson::common::local_conf()->osd_min_recovery_priority),
+ remote_reserver(
+ &cct,
+ &finisher,
+ crimson::common::local_conf()->osd_max_backfills,
+ crimson::common::local_conf()->osd_min_recovery_priority)
+{
+ perf = build_osd_logger(&cct);
+ cct.get_perfcounters_collection()->add(perf);
+
+ recoverystate_perf = build_recoverystate_perf(&cct);
+ cct.get_perfcounters_collection()->add(recoverystate_perf);
+
+ crimson::common::local_conf().add_observer(this);
+}
+
+const char** ShardServices::get_tracked_conf_keys() const
+{
+ static const char* KEYS[] = {
+ "osd_max_backfills",
+ "osd_min_recovery_priority",
+ nullptr
+ };
+ return KEYS;
+}
+
+void ShardServices::handle_conf_change(const ConfigProxy& conf,
+ const std::set <std::string> &changed)
+{
+ if (changed.count("osd_max_backfills")) {
+ local_reserver.set_max(conf->osd_max_backfills);
+ remote_reserver.set_max(conf->osd_max_backfills);
+ }
+ if (changed.count("osd_min_recovery_priority")) {
+ local_reserver.set_min_priority(conf->osd_min_recovery_priority);
+ remote_reserver.set_min_priority(conf->osd_min_recovery_priority);
+ }
+}
+
+seastar::future<> ShardServices::send_to_osd(
+ int peer, Ref<Message> m, epoch_t from_epoch) {
+ if (osdmap->is_down(peer)) {
+ logger().info("{}: osd.{} is_down", __func__, peer);
+ return seastar::now();
+ } else if (osdmap->get_info(peer).up_from > from_epoch) {
+ logger().info("{}: osd.{} {} > {}", __func__, peer,
+ osdmap->get_info(peer).up_from, from_epoch);
+ return seastar::now();
+ } else {
+ auto conn = cluster_msgr.connect(
+ osdmap->get_cluster_addrs(peer).front(), CEPH_ENTITY_TYPE_OSD);
+ return conn->send(m);
+ }
+}
+
+seastar::future<> ShardServices::dispatch_context_transaction(
+ crimson::os::CollectionRef col, PeeringCtx &ctx) {
+ auto ret = store.do_transaction(
+ col,
+ std::move(ctx.transaction));
+ ctx.reset_transaction();
+ return ret;
+}
+
+seastar::future<> ShardServices::dispatch_context_messages(
+ BufferedRecoveryMessages &&ctx)
+{
+ auto ret = seastar::parallel_for_each(std::move(ctx.message_map),
+ [this](auto& osd_messages) {
+ auto& [peer, messages] = osd_messages;
+ logger().debug("dispatch_context_messages sending messages to {}", peer);
+ return seastar::parallel_for_each(
+ std::move(messages), [=, peer=peer](auto& m) {
+ return send_to_osd(peer, m, osdmap->get_epoch());
+ });
+ });
+ ctx.message_map.clear();
+ return ret;
+}
+
+seastar::future<> ShardServices::dispatch_context(
+ crimson::os::CollectionRef col,
+ PeeringCtx &&ctx)
+{
+ ceph_assert(col || ctx.transaction.empty());
+ return seastar::when_all_succeed(
+ dispatch_context_messages(
+ BufferedRecoveryMessages{ceph_release_t::octopus, ctx}),
+ col ? dispatch_context_transaction(col, ctx) : seastar::now()
+ ).then_unpack([] {
+ return seastar::now();
+ });
+}
+
+void ShardServices::queue_want_pg_temp(pg_t pgid,
+ const vector<int>& want,
+ bool forced)
+{
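+  // only record a new want if it differs from what is already pending,
+  // or if the request is forced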
+ auto p = pg_temp_pending.find(pgid);
+ if (p == pg_temp_pending.end() ||
+ p->second.acting != want ||
+ forced) {
+ pg_temp_wanted[pgid] = {want, forced};
+ }
+}
+
+void ShardServices::remove_want_pg_temp(pg_t pgid)
+{
+ pg_temp_wanted.erase(pgid);
+ pg_temp_pending.erase(pgid);
+}
+
+void ShardServices::requeue_pg_temp()
+{
+ unsigned old_wanted = pg_temp_wanted.size();
+ unsigned old_pending = pg_temp_pending.size();
+ pg_temp_wanted.merge(pg_temp_pending);
+ pg_temp_pending.clear();
+ logger().debug(
+ "{}: {} + {} -> {}",
+ __func__ ,
+ old_wanted,
+ old_pending,
+ pg_temp_wanted.size());
+}
+
+std::ostream& operator<<(
+ std::ostream& out,
+ const ShardServices::pg_temp_t& pg_temp)
+{
+ out << pg_temp.acting;
+ if (pg_temp.forced) {
+ out << " (forced)";
+ }
+ return out;
+}
+
+seastar::future<> ShardServices::send_pg_temp()
+{
+ if (pg_temp_wanted.empty())
+ return seastar::now();
+ logger().debug("{}: {}", __func__, pg_temp_wanted);
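+  // one message per forced-ness: ms[0] collects regular pg_temp entries,
+  // ms[1] collects forced ones (the bool flag indexes the array below)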
+ boost::intrusive_ptr<MOSDPGTemp> ms[2] = {nullptr, nullptr};
+ for (auto& [pgid, pg_temp] : pg_temp_wanted) {
+ auto& m = ms[pg_temp.forced];
+ if (!m) {
+ m = make_message<MOSDPGTemp>(osdmap->get_epoch());
+ m->forced = pg_temp.forced;
+ }
+ m->pg_temp.emplace(pgid, pg_temp.acting);
+ }
+ pg_temp_pending.merge(pg_temp_wanted);
+ pg_temp_wanted.clear();
+ return seastar::parallel_for_each(std::begin(ms), std::end(ms),
+ [this](auto m) {
+ if (m) {
+ return monc.send_message(m);
+ } else {
+ return seastar::now();
+ }
+ });
+}
+
+void ShardServices::update_map(cached_map_t new_osdmap)
+{
+ osdmap = std::move(new_osdmap);
+}
+
+ShardServices::cached_map_t &ShardServices::get_osdmap()
+{
+ return osdmap;
+}
+
+seastar::future<> ShardServices::send_pg_created(pg_t pgid)
+{
+ logger().debug(__func__);
+ auto o = get_osdmap();
+ ceph_assert(o->require_osd_release >= ceph_release_t::luminous);
+ pg_created.insert(pgid);
+ return monc.send_message(make_message<MOSDPGCreated>(pgid));
+}
+
+seastar::future<> ShardServices::send_pg_created()
+{
+ logger().debug(__func__);
+ auto o = get_osdmap();
+ ceph_assert(o->require_osd_release >= ceph_release_t::luminous);
+ return seastar::parallel_for_each(pg_created,
+ [this](auto &pgid) {
+ return monc.send_message(make_message<MOSDPGCreated>(pgid));
+ });
+}
+
+void ShardServices::prune_pg_created()
+{
+ logger().debug(__func__);
+ auto o = get_osdmap();
+ auto i = pg_created.begin();
+ while (i != pg_created.end()) {
+ auto p = o->get_pg_pool(i->pool());
+ if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
+ logger().debug("{} pruning {}", __func__, *i);
+ i = pg_created.erase(i);
+ } else {
+ logger().debug(" keeping {}", __func__, *i);
+ ++i;
+ }
+ }
+}
+
+seastar::future<> ShardServices::osdmap_subscribe(version_t epoch, bool force_request)
+{
+ logger().info("{}({})", __func__, epoch);
+ if (monc.sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
+ force_request) {
+ return monc.renew_subs();
+ } else {
+ return seastar::now();
+ }
+}
+
+HeartbeatStampsRef ShardServices::get_hb_stamps(int peer)
+{
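+  // try_emplace leaves the ref null on first insertion; allocate the
+  // shared stamps only for a newly tracked peer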
+ auto [stamps, added] = heartbeat_stamps.try_emplace(peer);
+ if (added) {
+ stamps->second = ceph::make_ref<HeartbeatStamps>(peer);
+ }
+ return stamps->second;
+}
+
+seastar::future<> ShardServices::send_alive(const epoch_t want)
+{
+ logger().info(
+ "{} want={} up_thru_wanted={}",
+ __func__,
+ want,
+ up_thru_wanted);
+
+ if (want > up_thru_wanted) {
+ up_thru_wanted = want;
+ } else {
+ logger().debug("{} want={} <= up_thru_wanted={}; skipping",
+ __func__, want, up_thru_wanted);
+ return seastar::now();
+ }
+ if (!osdmap->exists(whoami)) {
+ logger().warn("{} DNE", __func__);
+ return seastar::now();
+  }
+  if (const epoch_t up_thru = osdmap->get_up_thru(whoami);
+ up_thru_wanted > up_thru) {
+ logger().debug("{} up_thru_wanted={} up_thru={}", __func__, want, up_thru);
+ return monc.send_message(
+ make_message<MOSDAlive>(osdmap->get_epoch(), want));
+ } else {
+ logger().debug("{} {} <= {}", __func__, want, osdmap->get_up_thru(whoami));
+ return seastar::now();
+ }
+}
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/shard_services.h b/src/crimson/osd/shard_services.h
new file mode 100644
index 000000000..2957639c6
--- /dev/null
+++ b/src/crimson/osd/shard_services.h
@@ -0,0 +1,215 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/intrusive_ptr.hpp>
+#include <seastar/core/future.hh>
+
+#include "include/common_fwd.h"
+#include "osd_operation.h"
+#include "msg/MessageRef.h"
+#include "crimson/common/exception.h"
+#include "crimson/os/futurized_collection.h"
+#include "osd/PeeringState.h"
+#include "crimson/osd/osdmap_service.h"
+#include "crimson/osd/object_context.h"
+#include "common/AsyncReserver.h"
+
+namespace crimson::net {
+ class Messenger;
+}
+
+namespace crimson::mgr {
+ class Client;
+}
+
+namespace crimson::mon {
+ class Client;
+}
+
+namespace crimson::os {
+ class FuturizedStore;
+}
+
+class OSDMap;
+class PeeringCtx;
+class BufferedRecoveryMessages;
+
+namespace crimson::osd {
+
+/**
+ * Represents services available to each PG
+ */
+class ShardServices : public md_config_obs_t {
+ using cached_map_t = boost::local_shared_ptr<const OSDMap>;
+ OSDMapService &osdmap_service;
+ const int whoami;
+ crimson::net::Messenger &cluster_msgr;
+ crimson::net::Messenger &public_msgr;
+ crimson::mon::Client &monc;
+ crimson::mgr::Client &mgrc;
+ crimson::os::FuturizedStore &store;
+
+ crimson::common::CephContext cct;
+
+ PerfCounters *perf = nullptr;
+ PerfCounters *recoverystate_perf = nullptr;
+
+ const char** get_tracked_conf_keys() const final;
+ void handle_conf_change(const ConfigProxy& conf,
+ const std::set <std::string> &changed) final;
+public:
+ ShardServices(
+ OSDMapService &osdmap_service,
+ const int whoami,
+ crimson::net::Messenger &cluster_msgr,
+ crimson::net::Messenger &public_msgr,
+ crimson::mon::Client &monc,
+ crimson::mgr::Client &mgrc,
+ crimson::os::FuturizedStore &store);
+
+ seastar::future<> send_to_osd(
+ int peer,
+ MessageRef m,
+ epoch_t from_epoch);
+
+ crimson::os::FuturizedStore &get_store() {
+ return store;
+ }
+
+ crimson::common::CephContext *get_cct() {
+ return &cct;
+ }
+
+ // OSDMapService
+ const OSDMapService &get_osdmap_service() const {
+ return osdmap_service;
+ }
+
+ // Op Management
+ OperationRegistry registry;
+ OperationThrottler throttler;
+
+ template <typename T, typename... Args>
+ auto start_operation(Args&&... args) {
+ if (__builtin_expect(stopping, false)) {
+ throw crimson::common::system_shutdown_exception();
+ }
+ auto op = registry.create_operation<T>(std::forward<Args>(args)...);
+ return std::make_pair(op, op->start());
+ }
+
+ seastar::future<> stop() {
+ stopping = true;
+ return registry.stop();
+ }
+
+ // Loggers
+ PerfCounters &get_recoverystate_perf_logger() {
+ return *recoverystate_perf;
+ }
+ PerfCounters &get_perf_logger() {
+ return *perf;
+ }
+
+ /// Dispatch and reset ctx transaction
+ seastar::future<> dispatch_context_transaction(
+ crimson::os::CollectionRef col, PeeringCtx &ctx);
+
+ /// Dispatch and reset ctx messages
+ seastar::future<> dispatch_context_messages(
+ BufferedRecoveryMessages &&ctx);
+
+ /// Dispatch ctx and dispose of context
+ seastar::future<> dispatch_context(
+ crimson::os::CollectionRef col,
+ PeeringCtx &&ctx);
+
+ /// Dispatch ctx and dispose of ctx, transaction must be empty
+ seastar::future<> dispatch_context(
+ PeeringCtx &&ctx) {
+ return dispatch_context({}, std::move(ctx));
+ }
+
+ // PG Temp State
+private:
+ // TODO: hook into map processing and some kind of heartbeat/peering
+ // message processing
+ struct pg_temp_t {
+ std::vector<int> acting;
+ bool forced = false;
+ };
+ map<pg_t, pg_temp_t> pg_temp_wanted;
+ map<pg_t, pg_temp_t> pg_temp_pending;
+ friend std::ostream& operator<<(std::ostream&, const pg_temp_t&);
+public:
+ void queue_want_pg_temp(pg_t pgid, const vector<int>& want,
+ bool forced = false);
+ void remove_want_pg_temp(pg_t pgid);
+ void requeue_pg_temp();
+ seastar::future<> send_pg_temp();
+
+ // Shard-local OSDMap
+private:
+ cached_map_t osdmap;
+public:
+ void update_map(cached_map_t new_osdmap);
+ cached_map_t &get_osdmap();
+
+ // PG Created State
+private:
+ set<pg_t> pg_created;
+public:
+ seastar::future<> send_pg_created(pg_t pgid);
+ seastar::future<> send_pg_created();
+ void prune_pg_created();
+
+ unsigned get_pg_num() const {
+ return num_pgs;
+ }
+ void inc_pg_num() {
+ ++num_pgs;
+ }
+ void dec_pg_num() {
+ --num_pgs;
+ }
+
+ seastar::future<> osdmap_subscribe(version_t epoch, bool force_request);
+
+ // Time state
+ ceph::mono_time startup_time = ceph::mono_clock::now();
+ ceph::signedspan get_mnow() const {
+ return ceph::mono_clock::now() - startup_time;
+ }
+ HeartbeatStampsRef get_hb_stamps(int peer);
+ std::map<int, HeartbeatStampsRef> heartbeat_stamps;
+
+ crimson::osd::ObjectContextRegistry obc_registry;
+
+ // Async Reservers
+private:
+ unsigned num_pgs = 0;
+
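+  // the reservers below use this as their finisher type; it completes
+  // each queued context inline instead of deferring it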
+ struct DirectFinisher {
+ void queue(Context *c) {
+ c->complete(0);
+ }
+ } finisher;
+  // Prevent creating new OSD operations while the system is shutting
+  // down. This is necessary because a new operation could otherwise be
+  // created after all ongoing operations have been interrupted, and then
+  // create and wait on a new, possibly never-resolving future, in which
+  // case the shutdown would never complete.
+ bool stopping = false;
+public:
+ AsyncReserver<spg_t, DirectFinisher> local_reserver;
+ AsyncReserver<spg_t, DirectFinisher> remote_reserver;
+
+private:
+ epoch_t up_thru_wanted = 0;
+public:
+ seastar::future<> send_alive(epoch_t want);
+};
+
+}
diff --git a/src/crimson/osd/state.h b/src/crimson/osd/state.h
new file mode 100644
index 000000000..ba48cd36f
--- /dev/null
+++ b/src/crimson/osd/state.h
@@ -0,0 +1,79 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <string_view>
+#include <ostream>
+
+class OSDMap;
+
+class OSDState {
+
+ enum class State {
+ INITIALIZING,
+ PREBOOT,
+ BOOTING,
+ ACTIVE,
+ PRESTOP,
+ STOPPING,
+ WAITING_FOR_HEALTHY,
+ };
+
+ State state = State::INITIALIZING;
+
+public:
+ bool is_initializing() const {
+ return state == State::INITIALIZING;
+ }
+ bool is_preboot() const {
+ return state == State::PREBOOT;
+ }
+ bool is_booting() const {
+ return state == State::BOOTING;
+ }
+ bool is_active() const {
+ return state == State::ACTIVE;
+ }
+ bool is_prestop() const {
+ return state == State::PRESTOP;
+ }
+ bool is_stopping() const {
+ return state == State::STOPPING;
+ }
+ bool is_waiting_for_healthy() const {
+ return state == State::WAITING_FOR_HEALTHY;
+ }
+ void set_preboot() {
+ state = State::PREBOOT;
+ }
+ void set_booting() {
+ state = State::BOOTING;
+ }
+ void set_active() {
+ state = State::ACTIVE;
+ }
+ void set_prestop() {
+ state = State::PRESTOP;
+ }
+ void set_stopping() {
+ state = State::STOPPING;
+ }
+ std::string_view to_string() const {
+ switch (state) {
+ case State::INITIALIZING: return "initializing";
+ case State::PREBOOT: return "preboot";
+ case State::BOOTING: return "booting";
+ case State::ACTIVE: return "active";
+ case State::PRESTOP: return "prestop";
+ case State::STOPPING: return "stopping";
+ case State::WAITING_FOR_HEALTHY: return "waiting_for_healthy";
+ default: return "???";
+ }
+ }
+};
+
+inline std::ostream&
+operator<<(std::ostream& os, const OSDState& s) {
+ return os << s.to_string();
+}
diff --git a/src/crimson/osd/watch.cc b/src/crimson/osd/watch.cc
new file mode 100644
index 000000000..a7a3311aa
--- /dev/null
+++ b/src/crimson/osd/watch.cc
@@ -0,0 +1,169 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/osd/watch.h"
+#include "messages/MWatchNotify.h"
+
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_osd);
+ }
+}
+
+namespace crimson::osd {
+
+bool Watch::NotifyCmp::operator()(NotifyRef lhs, NotifyRef rhs) const
+{
+ ceph_assert(lhs);
+ ceph_assert(rhs);
+ return lhs->get_id() < rhs->get_id();
+}
+
+seastar::future<> Watch::connect(crimson::net::ConnectionRef conn, bool)
+{
+ if (this->conn == conn) {
+ logger().debug("conn={} already connected", conn);
+ }
+
+ this->conn = std::move(conn);
+ return seastar::now();
+}
+
+seastar::future<> Watch::send_notify_msg(NotifyRef notify)
+{
+ logger().info("{} for notify(id={})", __func__, notify->ninfo.notify_id);
+ return conn->send(make_message<MWatchNotify>(
+ winfo.cookie,
+ notify->user_version,
+ notify->ninfo.notify_id,
+ CEPH_WATCH_EVENT_NOTIFY,
+ notify->ninfo.bl,
+ notify->client_gid));
+}
+
+seastar::future<> Watch::start_notify(NotifyRef notify)
+{
+ logger().info("{} adding notify(id={})", __func__, notify->ninfo.notify_id);
+ auto [ it, emplaced ] = in_progress_notifies.emplace(std::move(notify));
+ ceph_assert(emplaced);
+ ceph_assert(is_alive());
+ return is_connected() ? send_notify_msg(*it) : seastar::now();
+}
+
+seastar::future<> Watch::notify_ack(
+ const uint64_t notify_id,
+ const ceph::bufferlist& reply_bl)
+{
+ logger().info("{}", __func__);
+ return seastar::do_for_each(in_progress_notifies,
+ [this_shared=shared_from_this(), &reply_bl] (auto notify) {
+ return notify->complete_watcher(this_shared, reply_bl);
+ }
+ ).then([this] {
+ in_progress_notifies.clear();
+ return seastar::now();
+ });
+}
+
+seastar::future<> Watch::send_disconnect_msg()
+{
+ if (!is_connected()) {
+ return seastar::now();
+ }
+ ceph::bufferlist empty;
+ return conn->send(make_message<MWatchNotify>(
+ winfo.cookie,
+ 0,
+ 0,
+ CEPH_WATCH_EVENT_DISCONNECT,
+ empty));
+}
+
+void Watch::discard_state()
+{
+ ceph_assert(obc);
+ in_progress_notifies.clear();
+}
+
+seastar::future<> Watch::remove(const bool send_disconnect)
+{
+ logger().info("{}", __func__);
+ auto disconnected = send_disconnect ? send_disconnect_msg()
+ : seastar::now();
+ return std::move(disconnected).then([this] {
+ return seastar::do_for_each(in_progress_notifies,
+ [this_shared=shared_from_this()] (auto notify) {
+ return notify->remove_watcher(this_shared);
+ }).then([this] {
+ discard_state();
+ return seastar::now();
+ });
+ });
+}
+
+bool notify_reply_t::operator<(const notify_reply_t& rhs) const
+{
+ // comparing std::pairs to emphasize our legacy. ceph-osd stores
+ // notify_replies as std::multimap<std::pair<gid, cookie>, bl>.
+ // unfortunately, what seems to be an implementation detail, got
+ // exposed as part of our public API (the `reply_buffer` parameter
+ // of the `rados_notify` family).
+ const auto lhsp = std::make_pair(watcher_gid, watcher_cookie);
+ const auto rhsp = std::make_pair(rhs.watcher_gid, rhs.watcher_cookie);
+ return lhsp < rhsp;
+}
+
+seastar::future<> Notify::remove_watcher(WatchRef watch)
+{
+ if (discarded || complete) {
+ return seastar::now();
+ }
+ [[maybe_unused]] const auto num_removed = watchers.erase(watch);
+ assert(num_removed > 0);
+ return maybe_send_completion();
+}
+
+
+seastar::future<> Notify::complete_watcher(
+ WatchRef watch,
+ const ceph::bufferlist& reply_bl)
+{
+ if (discarded || complete) {
+ return seastar::now();
+ }
+ notify_replies.emplace(notify_reply_t{
+ watch->get_watcher_gid(),
+ watch->get_cookie(),
+ reply_bl});
+ return remove_watcher(std::move(watch));
+}
+
+seastar::future<> Notify::maybe_send_completion()
+{
+ logger().info("{} -- {} in progress watchers", __func__, watchers.size());
+ if (watchers.empty()) {
+ // prepare reply
+ ceph::bufferlist bl;
+ encode(notify_replies, bl);
+ // FIXME: this is just a stub
+ std::list<std::pair<uint64_t,uint64_t>> missed;
+ encode(missed, bl);
+
+ complete = true;
+
+ ceph::bufferlist empty;
+ auto reply = make_message<MWatchNotify>(
+ ninfo.cookie,
+ user_version,
+ ninfo.notify_id,
+ CEPH_WATCH_EVENT_NOTIFY_COMPLETE,
+ empty,
+ client_gid);
+ reply->set_data(bl);
+ return conn->send(std::move(reply));
+ }
+ return seastar::now();
+}
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/watch.h b/src/crimson/osd/watch.h
new file mode 100644
index 000000000..6049e16cf
--- /dev/null
+++ b/src/crimson/osd/watch.h
@@ -0,0 +1,194 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iterator>
+#include <map>
+#include <set>
+
+#include <seastar/core/shared_ptr.hh>
+
+#include "crimson/net/Connection.h"
+#include "crimson/osd/object_context.h"
+#include "include/denc.h"
+
+namespace crimson::osd {
+
+class Notify;
+using NotifyRef = seastar::shared_ptr<Notify>;
+
+// NOTE: really need to have this public. Otherwise `shared_from_this()`
+// will abort. According to cppreference.com:
+//
+// "The constructors of std::shared_ptr detect the presence
+// of an unambiguous and accessible (ie. public inheritance
+// is mandatory) (since C++17) enable_shared_from_this base".
+//
+// I expect the `seastar::shared_ptr` shares this behaviour.
+class Watch : public seastar::enable_shared_from_this<Watch> {
+  // a private tag for the public constructor, turning it into a de facto
+  // private one. The hack is needed because create() uses make_shared,
+  // which requires a public constructor.
+ struct private_ctag_t{};
+
+ struct NotifyCmp {
+ inline bool operator()(NotifyRef lhs, NotifyRef rhs) const;
+ };
+ std::set<NotifyRef, NotifyCmp> in_progress_notifies;
+ crimson::net::ConnectionRef conn;
+ crimson::osd::ObjectContextRef obc;
+
+ watch_info_t winfo;
+ entity_name_t entity_name;
+
+ seastar::future<> start_notify(NotifyRef);
+ seastar::future<> send_notify_msg(NotifyRef);
+ seastar::future<> send_disconnect_msg();
+ void discard_state();
+
+ friend Notify;
+
+public:
+ Watch(private_ctag_t,
+ crimson::osd::ObjectContextRef obc,
+ const watch_info_t& winfo,
+ const entity_name_t& entity_name)
+ : obc(std::move(obc)),
+ winfo(winfo),
+ entity_name(entity_name) {
+ }
+
+ seastar::future<> connect(crimson::net::ConnectionRef, bool);
+ bool is_alive() const {
+ return true;
+ }
+ bool is_connected() const {
+ return static_cast<bool>(conn);
+ }
+ void got_ping(utime_t) {
+ // NOP
+ }
+
+ seastar::future<> remove(bool send_disconnect);
+
+ /// Call when notify_ack received on notify_id
+ seastar::future<> notify_ack(
+ uint64_t notify_id, ///< [in] id of acked notify
+ const ceph::bufferlist& reply_bl); ///< [in] notify reply buffer
+
+ template <class... Args>
+ static seastar::shared_ptr<Watch> create(Args&&... args) {
+ return seastar::make_shared<Watch>(private_ctag_t{},
+ std::forward<Args>(args)...);
+ };
+
+ uint64_t get_watcher_gid() const {
+ return entity_name.num();
+ }
+ uint64_t get_cookie() const {
+ return winfo.cookie;
+ }
+};
+
+using WatchRef = seastar::shared_ptr<Watch>;
+
+struct notify_reply_t {
+ uint64_t watcher_gid;
+ uint64_t watcher_cookie;
+ ceph::bufferlist bl;
+
+ bool operator<(const notify_reply_t& rhs) const;
+ DENC(notify_reply_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.watcher_gid, p);
+ denc(v.watcher_cookie, p);
+ denc(v.bl, p);
+ DENC_FINISH(p);
+ }
+};
+
+class Notify {
+ std::set<WatchRef> watchers;
+ notify_info_t ninfo;
+ crimson::net::ConnectionRef conn;
+ uint64_t client_gid;
+ uint64_t user_version;
+ bool complete = false;
+ bool discarded = false;
+
+ /// (gid,cookie) -> reply_bl for everyone who acked the notify
+ std::multiset<notify_reply_t> notify_replies;
+
+ uint64_t get_id() const { return ninfo.notify_id; }
+ seastar::future<> maybe_send_completion();
+
+ template <class WatchIteratorT>
+ Notify(WatchIteratorT begin,
+ WatchIteratorT end,
+ crimson::net::ConnectionRef conn,
+ const notify_info_t& ninfo,
+ const uint64_t client_gid,
+ const uint64_t user_version);
+  // a private tag for the public constructor, turning it into a de facto
+  // private one. The hack is needed because the create_n_propagate
+  // factory uses make_shared, which requires a public constructor.
+ struct private_ctag_t{};
+
+ friend Watch;
+
+public:
+ template <class... Args>
+ Notify(private_ctag_t, Args&&... args) : Notify(std::forward<Args>(args)...) {
+ }
+
+ template <class WatchIteratorT, class... Args>
+ static seastar::future<> create_n_propagate(
+ WatchIteratorT begin,
+ WatchIteratorT end,
+ Args&&... args);
+
+ seastar::future<> remove_watcher(WatchRef watch);
+ seastar::future<> complete_watcher(WatchRef watch,
+ const ceph::bufferlist& reply_bl);
+};
+
+
+template <class WatchIteratorT>
+Notify::Notify(WatchIteratorT begin,
+ WatchIteratorT end,
+ crimson::net::ConnectionRef conn,
+ const notify_info_t& ninfo,
+ const uint64_t client_gid,
+ const uint64_t user_version)
+ : watchers(begin, end),
+ ninfo(ninfo),
+ conn(std::move(conn)),
+ client_gid(client_gid),
+ user_version(user_version) {
+}
+
+template <class WatchIteratorT, class... Args>
+seastar::future<> Notify::create_n_propagate(
+ WatchIteratorT begin,
+ WatchIteratorT end,
+ Args&&... args)
+{
+ static_assert(
+ std::is_same_v<typename std::iterator_traits<WatchIteratorT>::value_type,
+ crimson::osd::WatchRef>);
+ auto notify = seastar::make_shared<Notify>(
+ private_ctag_t{},
+ begin,
+ end,
+ std::forward<Args>(args)...);
+ return seastar::do_for_each(begin, end, [=] (auto& watchref) {
+ return watchref->start_notify(notify);
+ }).then([notify = std::move(notify)] {
+ return notify->maybe_send_completion();
+ });
+}
+
+} // namespace crimson::osd
+
+WRITE_CLASS_DENC(crimson::osd::notify_reply_t)
diff --git a/src/crimson/tools/CMakeLists.txt b/src/crimson/tools/CMakeLists.txt
new file mode 100644
index 000000000..1a59a9a11
--- /dev/null
+++ b/src/crimson/tools/CMakeLists.txt
@@ -0,0 +1,6 @@
+add_executable(crimson-store-nbd
+ store-nbd.cc
+ )
+target_link_libraries(crimson-store-nbd
+ crimson-seastore)
+install(TARGETS crimson-store-nbd DESTINATION bin)
diff --git a/src/crimson/tools/store-nbd.cc b/src/crimson/tools/store-nbd.cc
new file mode 100644
index 000000000..cdf853d15
--- /dev/null
+++ b/src/crimson/tools/store-nbd.cc
@@ -0,0 +1,621 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+
+/**
+ * crimson-store-nbd
+ *
+ * This tool exposes crimson object store internals as an nbd server
+ * for use with fio in basic benchmarking.
+ *
+ * Example usage:
+ *
+ * $ ./bin/crimson-store-nbd --device-path /dev/nvme1n1 -c 1 --total-device-size=107374182400 --mkfs true --uds-path /tmp/store_nbd_socket.sock
+ *
+ * $ cat nbd.fio
+ * [global]
+ * ioengine=nbd
+ * uri=nbd+unix:///?socket=/tmp/store_nbd_socket.sock
+ * rw=randrw
+ * time_based
+ * runtime=120
+ * group_reporting
+ * iodepth=1
+ * size=500G
+ *
+ * [job0]
+ * offset=0
+ *
+ * $ fio nbd.fio
+ */
+
+#include <random>
+
+#include <boost/program_options/variables_map.hpp>
+#include <boost/program_options/parsers.hpp>
+
+#include <linux/nbd.h>
+#include <linux/fs.h>
+
+#include <seastar/core/byteorder.hh>
+
+#include "crimson/os/seastore/cache.h"
+#include "crimson/os/seastore/segment_cleaner.h"
+#include "crimson/os/seastore/segment_manager.h"
+#include "crimson/os/seastore/segment_manager/block.h"
+#include "crimson/os/seastore/transaction_manager.h"
+
+#include "test/crimson/seastar_runner.h"
+#include "test/crimson/seastore/test_block.h"
+
+namespace po = boost::program_options;
+
+using namespace ceph;
+using namespace crimson;
+using namespace crimson::os;
+using namespace crimson::os::seastore;
+using namespace crimson::os::seastore::segment_manager::block;
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_test);
+ }
+}
+
+/**
+ * BlockDriver
+ *
+ * Simple interface enabling the throughput test to compare raw disk
+ * access against transaction_manager, etc.
+ */
+class BlockDriver {
+public:
+ struct config_t {
+ std::string type;
+ bool mkfs = false;
+ std::optional<std::string> path;
+ size_t segment_size;
+ size_t total_device_size;
+
+ void populate_options(
+ po::options_description &desc)
+ {
+ desc.add_options()
+ ("type",
+ po::value<std::string>()
+ ->default_value("transaction_manager")
+ ->notifier([this](auto s) { type = s; }),
+ "Backend to use, options are transaction_manager"
+ )
+ ("segment-size",
+ po::value<size_t>()
+ ->default_value(16ul << 20 /* 16MB */)
+ ->notifier([this](auto s) { segment_size = s; }),
+ "Total working set size"
+ )
+ ("total-device-size",
+ po::value<size_t>()
+ ->default_value(10ul << 30 /* 10G */)
+ ->notifier([this](auto s) { total_device_size = s; }),
+ "Size of writes"
+ )
+ ("device-path",
+ po::value<std::string>()
+ ->required()
+ ->notifier([this](auto s) { path = s; }),
+ "Number of writes outstanding"
+ )
+ ("mkfs",
+ po::value<bool>()
+ ->default_value(false)
+ ->notifier([this](auto s) { mkfs = s; }),
+ "Do mkfs first"
+ );
+ }
+ };
+
+ virtual bufferptr get_buffer(size_t size) = 0;
+
+ virtual seastar::future<> write(
+ off_t offset,
+ bufferptr ptr) = 0;
+
+ virtual seastar::future<bufferlist> read(
+ off_t offset,
+ size_t size) = 0;
+
+ virtual size_t get_size() const = 0;
+
+ virtual seastar::future<> mount() = 0;
+ virtual seastar::future<> close() = 0;
+
+ virtual ~BlockDriver() {}
+};
+using BlockDriverRef = std::unique_ptr<BlockDriver>;
+
+BlockDriverRef get_backend(BlockDriver::config_t config);
+
+struct request_context_t {
+ uint32_t magic = 0;
+ uint32_t type = 0;
+
+ char handle[8] = {0};
+
+ uint64_t from = 0;
+ uint32_t len = 0;
+
+ unsigned err = 0;
+ std::optional<bufferptr> in_buffer;
+ std::optional<bufferlist> out_buffer;
+
+ bool check_magic() const {
+ // todo
+ return true;
+ }
+
+ uint32_t get_command() const {
+ return type & 0xff;
+ }
+
+ bool has_input_buffer() const {
+ return get_command() == NBD_CMD_WRITE;
+ }
+
+ seastar::future<> read_request(seastar::input_stream<char> &in) {
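+    // read the fixed-size nbd_request header and decode its big-endian
+    // fields; for writes, a payload of `len` bytes follows immediately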
+ return in.read_exactly(sizeof(struct nbd_request)
+ ).then([this, &in](auto buf) {
+ auto p = buf.get();
+ magic = seastar::consume_be<uint32_t>(p);
+ type = seastar::consume_be<uint32_t>(p);
+ memcpy(handle, p, sizeof(handle));
+ p += sizeof(handle);
+ from = seastar::consume_be<uint64_t>(p);
+ len = seastar::consume_be<uint32_t>(p);
+ logger().debug(
+ "Got request, magic {}, type {}, from {}, len {}",
+ magic, type, from, len);
+
+ if (has_input_buffer()) {
+ return in.read_exactly(len).then([this](auto buf) {
+ in_buffer = ceph::buffer::create_page_aligned(len);
+ in_buffer->copy_in(0, len, buf.get());
+ return seastar::now();
+ });
+ } else {
+ return seastar::now();
+ }
+ });
+ }
+
+ seastar::future<> write_reply(seastar::output_stream<char> &out) {
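+    // emit the nbd_reply header (magic, error, handle) and, for reads,
+    // stream the out_buffer contents before flushing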
+ seastar::temporary_buffer<char> buffer{sizeof(struct nbd_reply)};
+ auto p = buffer.get_write();
+ seastar::produce_be<uint32_t>(p, NBD_REPLY_MAGIC);
+ seastar::produce_be<uint32_t>(p, err);
+ memcpy(p, handle, sizeof(handle));
+ return out.write(std::move(buffer)).then([this, &out] {
+ if (out_buffer) {
+ return seastar::do_for_each(
+ out_buffer->mut_buffers(),
+ [&out](bufferptr &ptr) {
+ return out.write(
+ seastar::temporary_buffer<char>(
+ ptr.c_str(),
+ ptr.length(),
+ seastar::make_deleter([ptr](){}))
+ );
+ });
+ } else {
+ return seastar::now();
+ }
+ }).then([&out] {
+ return out.flush();
+ });
+ }
+};
+
+/**
+ * NBDHandler
+ *
+ * Exposes a BlockDriver over an NBD unix domain socket so it can be
+ * exercised with fio for basic benchmarking.
+ */
+class NBDHandler {
+ BlockDriver &backend;
+ std::string uds_path;
+public:
+ struct config_t {
+ std::string uds_path;
+
+ void populate_options(
+ po::options_description &desc)
+ {
+ desc.add_options()
+ ("uds-path",
+ po::value<std::string>()
+ ->default_value("/tmp/store_nbd_socket.sock")
+ ->notifier([this](auto s) {
+ uds_path = s;
+ }),
+ "Path to domain socket for nbd"
+ );
+ }
+ };
+
+ NBDHandler(
+ BlockDriver &backend,
+ config_t config) :
+ backend(backend),
+ uds_path(config.uds_path)
+ {}
+
+ seastar::future<> run();
+};
+
+int main(int argc, char** argv)
+{
+ po::options_description desc{"Allowed options"};
+ bool debug = false;
+ desc.add_options()
+ ("help,h", "show help message")
+ ("debug", po::value<bool>(&debug)->default_value(false),
+ "enable debugging");
+
+ po::options_description nbd_pattern_options{"NBD Pattern Options"};
+ NBDHandler::config_t nbd_config;
+ nbd_config.populate_options(nbd_pattern_options);
+ desc.add(nbd_pattern_options);
+
+ po::options_description backend_pattern_options{"Backend Options"};
+ BlockDriver::config_t backend_config;
+ backend_config.populate_options(backend_pattern_options);
+ desc.add(backend_pattern_options);
+
+ po::variables_map vm;
+ std::vector<std::string> unrecognized_options;
+ try {
+ auto parsed = po::command_line_parser(argc, argv)
+ .options(desc)
+ .allow_unregistered()
+ .run();
+ po::store(parsed, vm);
+ if (vm.count("help")) {
+ std::cout << desc << std::endl;
+ return 0;
+ }
+
+ po::notify(vm);
+ unrecognized_options =
+ po::collect_unrecognized(parsed.options, po::include_positional);
+ } catch(const po::error& e) {
+ std::cerr << "error: " << e.what() << std::endl;
+ return 1;
+ }
+ std::vector<const char*> args(argv, argv + argc);
+
+ seastar::app_template app;
+
+ std::vector<char*> av{argv[0]};
+ std::transform(begin(unrecognized_options),
+ end(unrecognized_options),
+ std::back_inserter(av),
+ [](auto& s) {
+ return const_cast<char*>(s.c_str());
+ });
+
+ SeastarRunner sc;
+ sc.init(av.size(), av.data());
+
+ if (debug) {
+ seastar::global_logger_registry().set_all_loggers_level(
+ seastar::log_level::debug
+ );
+ }
+
+ sc.run([=] {
+ auto backend = get_backend(backend_config);
+ return seastar::do_with(
+ NBDHandler(*backend, nbd_config),
+ std::move(backend),
+ [](auto &nbd, auto &backend) {
+ return backend->mount(
+ ).then([&] {
+ logger().debug("Running nbd server...");
+ return nbd.run();
+ }).then([&] {
+ return backend->close();
+ });
+ });
+ });
+ sc.stop();
+}
+
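+// layout of the 152-byte oldstyle NBD server handshake: magic values,
+// export size, flags and reserved padding, all in network byte order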
+class nbd_oldstyle_negotiation_t {
+ uint64_t magic = seastar::cpu_to_be(0x4e42444d41474943); // "NBDMAGIC"
+  uint64_t magic2 = seastar::cpu_to_be(0x00420281861253); // oldstyle cliserv_magic
+ uint64_t size = 0;
+ uint32_t flags = seastar::cpu_to_be(0);
+ char reserved[124] = {0};
+
+public:
+ nbd_oldstyle_negotiation_t(uint64_t size, uint32_t flags)
+ : size(seastar::cpu_to_be(size)), flags(seastar::cpu_to_be(flags)) {}
+} __attribute__((packed));
+
+seastar::future<> send_negotiation(
+ size_t size,
+ seastar::output_stream<char>& out)
+{
+ seastar::temporary_buffer<char> buf{sizeof(nbd_oldstyle_negotiation_t)};
+ new (buf.get_write()) nbd_oldstyle_negotiation_t(size, 1);
+ return out.write(std::move(buf)
+ ).then([&out] {
+ return out.flush();
+ });
+}
+
+seastar::future<> handle_command(
+ BlockDriver &backend,
+ request_context_t &context,
+ seastar::output_stream<char> &out)
+{
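+  // map the NBD command onto the corresponding backend operation; DISC,
+  // TRIM and anything unknown are rejected as bad messages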
+ logger().debug("got command {}", context.get_command());
+ return ([&] {
+ switch (context.get_command()) {
+ case NBD_CMD_WRITE:
+ return backend.write(
+ context.from,
+ *context.in_buffer);
+ case NBD_CMD_READ:
+ return backend.read(
+ context.from,
+ context.len).then([&context] (auto buffer) {
+ context.out_buffer = buffer;
+ });
+ case NBD_CMD_DISC:
+ throw std::system_error(std::make_error_code(std::errc::bad_message));
+ case NBD_CMD_TRIM:
+ throw std::system_error(std::make_error_code(std::errc::bad_message));
+ default:
+ throw std::system_error(std::make_error_code(std::errc::bad_message));
+ }
+ })().then([&] {
+ logger().debug("Writing reply");
+ return context.write_reply(out);
+ });
+}
+
+
+seastar::future<> handle_commands(
+ BlockDriver &backend,
+ seastar::input_stream<char>& in,
+ seastar::output_stream<char>& out)
+{
+ logger().debug("handle_commands");
+ return seastar::keep_doing(
+ [&] {
+ logger().debug("waiting for command");
+ auto request_ref = std::make_unique<request_context_t>();
+ auto &request = *request_ref;
+ return request.read_request(in
+ ).then([&] {
+ return handle_command(backend, request, out);
+ }).then([req=std::move(request_ref)] {
+ logger().debug("complete");
+ });
+ });
+}
+
+seastar::future<> NBDHandler::run()
+{
+ logger().debug("About to listen on {}", uds_path);
+ return seastar::do_with(
+ seastar::engine().listen(
+ seastar::socket_address{
+ seastar::unix_domain_addr{uds_path}}),
+ [=](auto &socket) {
+ return seastar::keep_doing(
+ [this, &socket] {
+ return socket.accept().then([this](auto acc) {
+ logger().debug("Accepted");
+ return seastar::do_with(
+ std::move(acc.connection),
+ [this](auto &conn) {
+ return seastar::do_with(
+ conn.input(),
+ conn.output(),
+ [&, this](auto &input, auto &output) {
+ return send_negotiation(
+ backend.get_size(),
+ output
+ ).then([&, this] {
+ return handle_commands(backend, input, output);
+ }).finally([&] {
+ return input.close();
+ }).finally([&] {
+ return output.close();
+ }).handle_exception([](auto e) {
+ return seastar::now();
+ });
+ });
+ });
+ });
+ });
+ });
+}
+
+class TMDriver final : public BlockDriver {
+ const config_t config;
+ std::unique_ptr<segment_manager::block::BlockSegmentManager> segment_manager;
+ std::unique_ptr<SegmentCleaner> segment_cleaner;
+ std::unique_ptr<Journal> journal;
+ std::unique_ptr<Cache> cache;
+ LBAManagerRef lba_manager;
+ std::unique_ptr<TransactionManager> tm;
+
+public:
+ TMDriver(config_t config) : config(config) {}
+ ~TMDriver() final {}
+
+ bufferptr get_buffer(size_t size) final {
+ return ceph::buffer::create_page_aligned(size);
+ }
+
+ seastar::future<> write(
+ off_t offset,
+ bufferptr ptr) final {
+ logger().debug("Writing offset {}", offset);
+ assert(offset % segment_manager->get_block_size() == 0);
+ assert(ptr.length() == (size_t)segment_manager->get_block_size());
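+    // overwrite = dec_ref any existing extent at this offset (ENOENT is
+    // fine), then allocate a fresh extent and swap the payload in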
+ return seastar::do_with(
+ tm->create_transaction(),
+ std::move(ptr),
+ [this, offset](auto &t, auto &ptr) {
+ return tm->dec_ref(
+ *t,
+ offset
+ ).safe_then([](auto){}).handle_error(
+ crimson::ct_error::enoent::handle([](auto) { return seastar::now(); }),
+ crimson::ct_error::pass_further_all{}
+ ).safe_then([=, &t, &ptr] {
+ logger().debug("dec_ref complete");
+ return tm->alloc_extent<TestBlock>(
+ *t,
+ offset,
+ ptr.length());
+ }).safe_then([=, &t, &ptr](auto ext) mutable {
+ assert(ext->get_laddr() == (size_t)offset);
+ assert(ext->get_bptr().length() == ptr.length());
+ ext->get_bptr().swap(ptr);
+ logger().debug("submitting transaction");
+ return tm->submit_transaction(std::move(t));
+ });
+ }).handle_error(
+ crimson::ct_error::assert_all{}
+ );
+ }
+
+ seastar::future<bufferlist> read(
+ off_t offset,
+ size_t size) final {
+ logger().debug("Reading offset {}", offset);
+ assert(offset % segment_manager->get_block_size() == 0);
+ assert(size % (size_t)segment_manager->get_block_size() == 0);
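+    // read whatever extents exist in [offset, offset+size) and zero-fill
+    // any holes so the result is exactly `size` bytes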
+ return seastar::do_with(
+ tm->create_transaction(),
+ [this, offset, size](auto &t) {
+ return tm->read_extents<TestBlock>(*t, offset, size
+ ).safe_then([=](auto ext_list) mutable {
+ size_t cur = offset;
+ bufferlist bl;
+ for (auto &i: ext_list) {
+ if (cur != i.first) {
+ assert(cur < i.first);
+ bl.append_zero(i.first - cur);
+ cur = i.first;
+ }
+ bl.append(i.second->get_bptr());
+ cur += i.second->get_bptr().length();
+ }
+ if (bl.length() != size) {
+ assert(bl.length() < size);
+ bl.append_zero(size - bl.length());
+ }
+ return seastar::make_ready_future<bufferlist>(std::move(bl));
+ });
+ }).handle_error(
+ crimson::ct_error::assert_all{}
+ );
+ }
+
+ void init() {
+ segment_cleaner = std::make_unique<SegmentCleaner>(
+ SegmentCleaner::config_t::default_from_segment_manager(
+ *segment_manager),
+ true);
+ journal = std::make_unique<Journal>(*segment_manager);
+ cache = std::make_unique<Cache>(*segment_manager);
+ lba_manager = lba_manager::create_lba_manager(*segment_manager, *cache);
+ tm = std::make_unique<TransactionManager>(
+ *segment_manager, *segment_cleaner, *journal, *cache, *lba_manager);
+ journal->set_segment_provider(&*segment_cleaner);
+ segment_cleaner->set_extent_callback(&*tm);
+ }
+
+ void clear() {
+ tm.reset();
+ lba_manager.reset();
+ cache.reset();
+ journal.reset();
+ segment_cleaner.reset();
+ }
+
+ size_t get_size() const final {
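+    // expose only half of the device; presumably this leaves headroom
+    // for the segment cleaner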
+ return segment_manager->get_size() * .5;
+ }
+
+ seastar::future<> mkfs() {
+ assert(config.path);
+ segment_manager = std::make_unique<
+ segment_manager::block::BlockSegmentManager
+ >();
+ logger().debug("mkfs");
+ return segment_manager->mkfs(
+ { *config.path, config.segment_size, config.total_device_size }
+ ).safe_then([this] {
+ logger().debug("");
+ return segment_manager->mount({ *config.path });
+ }).safe_then([this] {
+ init();
+ logger().debug("tm mkfs");
+ return tm->mkfs();
+ }).safe_then([this] {
+ logger().debug("tm close");
+ return tm->close();
+ }).safe_then([this] {
+ logger().debug("sm close");
+ return segment_manager->close();
+ }).safe_then([this] {
+ clear();
+ logger().debug("mkfs complete");
+ return TransactionManager::mkfs_ertr::now();
+ }).handle_error(
+ crimson::ct_error::assert_all{}
+ );
+ }
+
+ seastar::future<> mount() final {
+ return (config.mkfs ? mkfs() : seastar::now()
+ ).then([this] {
+ segment_manager = std::make_unique<
+ segment_manager::block::BlockSegmentManager
+ >();
+ return segment_manager->mount({ *config.path });
+ }).safe_then([this] {
+ init();
+ return tm->mount();
+ }).handle_error(
+ crimson::ct_error::assert_all{}
+ );
+ };
+
+ seastar::future<> close() final {
+ return segment_manager->close(
+ ).safe_then([this] {
+ return tm->close();
+ }).safe_then([this] {
+ clear();
+ return seastar::now();
+ }).handle_error(
+ crimson::ct_error::assert_all{}
+ );
+ }
+};
+
+BlockDriverRef get_backend(BlockDriver::config_t config)
+{
+ if (config.type == "transaction_manager") {
+ return std::make_unique<TMDriver>(config);
+ } else {
+ ceph_assert(0 == "invalid option");
+ return BlockDriverRef();
+ }
+}