summary refs log tree commit diff stats
path: root/src/crimson/os
diff options
context:
space:
mode:
Diffstat (limited to 'src/crimson/os')
-rw-r--r--src/crimson/os/CMakeLists.txt15
-rw-r--r--src/crimson/os/alienstore/CMakeLists.txt76
-rw-r--r--src/crimson/os/alienstore/alien_collection.h26
-rw-r--r--src/crimson/os/alienstore/alien_store.cc575
-rw-r--r--src/crimson/os/alienstore/alien_store.h125
-rw-r--r--src/crimson/os/alienstore/thread_pool.cc80
-rw-r--r--src/crimson/os/alienstore/thread_pool.h132
-rw-r--r--src/crimson/os/cyanstore/CMakeLists.txt7
-rw-r--r--src/crimson/os/cyanstore/cyan_collection.cc76
-rw-r--r--src/crimson/os/cyanstore/cyan_collection.h51
-rw-r--r--src/crimson/os/cyanstore/cyan_object.cc89
-rw-r--r--src/crimson/os/cyanstore/cyan_object.h45
-rw-r--r--src/crimson/os/cyanstore/cyan_store.cc835
-rw-r--r--src/crimson/os/cyanstore/cyan_store.h185
-rw-r--r--src/crimson/os/futurized_collection.h37
-rw-r--r--src/crimson/os/futurized_store.cc22
-rw-r--r--src/crimson/os/futurized_store.h167
-rw-r--r--src/crimson/os/seastore/CMakeLists.txt37
-rw-r--r--src/crimson/os/seastore/cache.cc541
-rw-r--r--src/crimson/os/seastore/cache.h516
-rw-r--r--src/crimson/os/seastore/cached_extent.cc96
-rw-r--r--src/crimson/os/seastore/cached_extent.h659
-rw-r--r--src/crimson/os/seastore/extentmap_manager.cc32
-rw-r--r--src/crimson/os/seastore/extentmap_manager.h124
-rw-r--r--src/crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.cc118
-rw-r--r--src/crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.h64
-rw-r--r--src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node.h145
-rw-r--r--src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.cc373
-rw-r--r--src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.h281
-rw-r--r--src/crimson/os/seastore/journal.cc756
-rw-r--r--src/crimson/os/seastore/journal.h405
-rw-r--r--src/crimson/os/seastore/lba_manager.cc17
-rw-r--r--src/crimson/os/seastore/lba_manager.h207
-rw-r--r--src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc580
-rw-r--r--src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h188
-rw-r--r--src/crimson/os/seastore/lba_manager/btree/btree_range_pin.cc153
-rw-r--r--src/crimson/os/seastore/lba_manager/btree/btree_range_pin.h274
-rw-r--r--src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h269
-rw-r--r--src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.cc701
-rw-r--r--src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h555
-rw-r--r--src/crimson/os/seastore/onode.cc44
-rw-r--r--src/crimson/os/seastore/onode.h48
-rw-r--r--src/crimson/os/seastore/onode_manager.h57
-rw-r--r--src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.cc71
-rw-r--r--src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.h65
-rw-r--r--src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.cc188
-rw-r--r--src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.h70
-rw-r--r--src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.cc567
-rw-r--r--src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.h942
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/fwd.h93
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node.cc809
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node.h476
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h42
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h413
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.cc35
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h86
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h156
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.cc88
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h126
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h67
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.cc39
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h80
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.cc76
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h197
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h613
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node_layout_replayable.h75
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h64
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc165
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h180
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc32
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h846
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc318
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h226
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc96
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h366
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h2186
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h411
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc208
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h341
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/super.cc26
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/super.h143
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/tree.cc235
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/tree.h119
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/tree_types.h125
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h333
-rw-r--r--src/crimson/os/seastore/root_block.h109
-rw-r--r--src/crimson/os/seastore/seastore.cc532
-rw-r--r--src/crimson/os/seastore/seastore.h181
-rw-r--r--src/crimson/os/seastore/seastore_types.cc105
-rw-r--r--src/crimson/os/seastore/seastore_types.h369
-rw-r--r--src/crimson/os/seastore/segment_cleaner.cc340
-rw-r--r--src/crimson/os/seastore/segment_cleaner.h691
-rw-r--r--src/crimson/os/seastore/segment_manager.h128
-rw-r--r--src/crimson/os/seastore/segment_manager/block.cc402
-rw-r--r--src/crimson/os/seastore/segment_manager/block.h222
-rw-r--r--src/crimson/os/seastore/segment_manager/ephemeral.cc226
-rw-r--r--src/crimson/os/seastore/segment_manager/ephemeral.h111
-rw-r--r--src/crimson/os/seastore/transaction.h145
-rw-r--r--src/crimson/os/seastore/transaction_manager.cc306
-rw-r--r--src/crimson/os/seastore/transaction_manager.h296
100 files changed, 25740 insertions, 0 deletions
diff --git a/src/crimson/os/CMakeLists.txt b/src/crimson/os/CMakeLists.txt
new file mode 100644
index 000000000..f221dd7c1
--- /dev/null
+++ b/src/crimson/os/CMakeLists.txt
@@ -0,0 +1,15 @@
+# Static library for the crimson object-store layer: builds
+# futurized_store.cc together with src/os/Transaction.cc.
+add_library(crimson-os STATIC
+ futurized_store.cc
+ ${PROJECT_SOURCE_DIR}/src/os/Transaction.cc)
+add_subdirectory(cyanstore)
+
+# The alienstore subdirectory is only added when WITH_BLUESTORE is set.
+if(WITH_BLUESTORE)
+ add_subdirectory(alienstore)
+endif()
+
+add_subdirectory(seastore)
+# NOTE(review): crimson-alienstore is listed unconditionally here although its
+# subdirectory is guarded by WITH_BLUESTORE above -- confirm that builds
+# without WITH_BLUESTORE are not expected to link.
+target_link_libraries(crimson-os
+ crimson-cyanstore
+ crimson-alienstore
+ crimson-seastore
+ crimson)
diff --git a/src/crimson/os/alienstore/CMakeLists.txt b/src/crimson/os/alienstore/CMakeLists.txt
new file mode 100644
index 000000000..659a3c6ce
--- /dev/null
+++ b/src/crimson/os/alienstore/CMakeLists.txt
@@ -0,0 +1,76 @@
+# RocksDB headers are added as a SYSTEM include directory.
+include_directories(SYSTEM "${CMAKE_SOURCE_DIR}/src/rocksdb/include")
+
+# Interface-only target that carries the WITH_SEASTAR / WITH_ALIEN compile
+# definitions and Seastar's interface include directories. Only these two
+# properties are set on it.
+add_library(alien::cflags INTERFACE IMPORTED)
+set_target_properties(alien::cflags PROPERTIES
+ INTERFACE_COMPILE_DEFINITIONS "WITH_SEASTAR;WITH_ALIEN"
+ INTERFACE_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:Seastar::seastar,INTERFACE_INCLUDE_DIRECTORIES>)
+
+# Static library compiled from selected src/common, src/crush and src/global
+# sources plus the compressor and priority-cache object files. It links
+# crimson-common and alien::cflags (declared earlier in this file).
+add_library(crimson-alien-common STATIC
+ ${PROJECT_SOURCE_DIR}/src/common/admin_socket.cc
+ ${PROJECT_SOURCE_DIR}/src/common/blkdev.cc
+ ${PROJECT_SOURCE_DIR}/src/common/ceph_context.cc
+ ${PROJECT_SOURCE_DIR}/src/common/ceph_crypto.cc
+ ${PROJECT_SOURCE_DIR}/src/common/condition_variable_debug.cc
+ ${PROJECT_SOURCE_DIR}/src/common/cmdparse.cc
+ ${PROJECT_SOURCE_DIR}/src/common/Finisher.cc
+ ${PROJECT_SOURCE_DIR}/src/common/HeartbeatMap.cc
+ ${PROJECT_SOURCE_DIR}/src/common/PluginRegistry.cc
+ ${PROJECT_SOURCE_DIR}/src/common/lockdep.cc
+ ${PROJECT_SOURCE_DIR}/src/common/mutex_debug.cc
+ ${PROJECT_SOURCE_DIR}/src/common/perf_counters.cc
+ ${PROJECT_SOURCE_DIR}/src/common/perf_counters_collection.cc
+ ${PROJECT_SOURCE_DIR}/src/common/RefCountedObj.cc
+ ${PROJECT_SOURCE_DIR}/src/common/shared_mutex_debug.cc
+ ${PROJECT_SOURCE_DIR}/src/common/SubProcess.cc
+ ${PROJECT_SOURCE_DIR}/src/common/Throttle.cc
+ ${PROJECT_SOURCE_DIR}/src/common/Timer.cc
+ ${PROJECT_SOURCE_DIR}/src/common/TrackedOp.cc
+ ${PROJECT_SOURCE_DIR}/src/common/WorkQueue.cc
+ ${PROJECT_SOURCE_DIR}/src/common/util.cc
+ ${PROJECT_SOURCE_DIR}/src/crush/CrushLocation.cc
+ ${PROJECT_SOURCE_DIR}/src/global/global_context.cc
+ $<TARGET_OBJECTS:compressor_objs>
+ $<TARGET_OBJECTS:common_prioritycache_obj>)
+target_link_libraries(crimson-alien-common
+ crimson-common
+ alien::cflags)
+
+# Sources of the alienstore library: the two local files plus ObjectStore.cc
+# and the listed BlueStore sources from src/os.
+set(alien_store_srcs
+ alien_store.cc
+ thread_pool.cc
+ ${PROJECT_SOURCE_DIR}/src/os/ObjectStore.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/Allocator.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/AvlAllocator.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/BitmapFreelistManager.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/BlueFS.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/bluefs_types.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/BlueRocksEnv.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/BlueStore.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/bluestore_types.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/fastbmap_allocator_impl.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/FreelistManager.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/HybridAllocator.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/StupidAllocator.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/BitmapAllocator.cc)
+# The zoned sources are appended only when WITH_ZBD is set.
+if(WITH_ZBD)
+ list(APPEND alien_store_srcs
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/zoned_types.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/ZonedFreelistManager.cc
+ ${PROJECT_SOURCE_DIR}/src/os/bluestore/ZonedAllocator.cc)
+endif()
+# Static library built from ${alien_store_srcs}.
+add_library(crimson-alienstore STATIC
+ ${alien_store_srcs})
+# With LTTng enabled the library is built after the bluestore-tp target
+# (presumably because that target generates tracepoint headers -- confirm).
+if(WITH_LTTNG)
+ add_dependencies(crimson-alienstore bluestore-tp)
+endif()
+# All link dependencies are PRIVATE to this library.
+target_link_libraries(crimson-alienstore
+ PRIVATE
+ alien::cflags
+ fmt::fmt
+ kv
+ heap_profiler
+ crimson-alien-common
+ ${BLKID_LIBRARIES}
+ ${UDEV_LIBRARIES}
+ crimson
+ blk)
diff --git a/src/crimson/os/alienstore/alien_collection.h b/src/crimson/os/alienstore/alien_collection.h
new file mode 100644
index 000000000..98b8fdef4
--- /dev/null
+++ b/src/crimson/os/alienstore/alien_collection.h
@@ -0,0 +1,26 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "os/ObjectStore.h"
+
+#include "crimson/os/futurized_collection.h"
+#include "crimson/os/futurized_store.h"
+#include "alien_store.h"
+
+namespace crimson::os {
+
+/// FuturizedCollection that additionally keeps the ObjectStore collection
+/// handle which AlienStore hands to the wrapped store.
+class AlienCollection final : public FuturizedCollection {
+public:
+  /// Wrap @p ch; the base class is constructed from the handle's cid.
+  AlienCollection(ObjectStore::CollectionHandle ch)
+    : FuturizedCollection(ch->cid),
+      collection(ch)
+  {}
+
+  ~AlienCollection() = default;
+
+private:
+  // AlienStore is the only code that reads or replaces the wrapped handle.
+  friend AlienStore;
+  ObjectStore::CollectionHandle collection;
+};
+}
diff --git a/src/crimson/os/alienstore/alien_store.cc b/src/crimson/os/alienstore/alien_store.cc
new file mode 100644
index 000000000..cb5553254
--- /dev/null
+++ b/src/crimson/os/alienstore/alien_store.cc
@@ -0,0 +1,575 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "alien_collection.h"
+#include "alien_store.h"
+
+#include <map>
+#include <string_view>
+#include <boost/algorithm/string/trim.hpp>
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+
+#include <seastar/core/alien.hh>
+#include <seastar/core/future-util.hh>
+#include <seastar/core/reactor.hh>
+
+#include "common/ceph_context.h"
+#include "global/global_context.h"
+#include "include/Context.h"
+#include "os/bluestore/BlueStore.h"
+#include "os/ObjectStore.h"
+#include "os/Transaction.h"
+
+#include "crimson/common/log.h"
+#include "crimson/os/futurized_store.h"
+
+namespace {
+  /// Logger used by this file; it is registered under the filestore subsystem.
+  seastar::logger& logger()
+  {
+    auto& filestore_log = crimson::get_logger(ceph_subsys_filestore);
+    return filestore_log;
+  }
+
+/// Commit callback attached to a transaction that is queued on the wrapped
+/// store. finish() hands the completion over to reactor shard `cpuid`: there
+/// it completes the collected context (if any) and fulfils the promise the
+/// submitter is waiting on.
+class OnCommit final : public Context
+{
+  int cpuid;
+  Context *oncommit;
+  seastar::promise<> &alien_done;
+
+public:
+  /// @param id       reactor shard on which the completion must run
+  /// @param done     promise fulfilled once the completion has run
+  /// @param oncommit optional context completed with 0 before @p done is set
+  /// The trailing transaction argument is accepted but not used.
+  OnCommit(int id,
+           seastar::promise<> &done,
+           Context *oncommit,
+           ceph::os::Transaction& /* txn */)
+    : cpuid(id),
+      oncommit(oncommit),
+      alien_done(done)
+  {}
+
+  void finish(int) final {
+    auto complete_on_reactor = [this] {
+      if (oncommit) {
+        oncommit->complete(0);
+      }
+      alien_done.set_value();
+      return seastar::make_ready_future<>();
+    };
+    // Wait on the calling thread until the reactor has run the completion.
+    seastar::alien::submit_to(cpuid, std::move(complete_on_reactor)).wait();
+  }
+};
+}
+
+namespace crimson::os {
+
+/// Create the CephContext (also published through g_ceph_context), apply the
+/// given config values, construct the BlueStore at @p path and create a
+/// one-thread pool with a queue size of 128.
+AlienStore::AlienStore(const std::string& path, const ConfigValues& values)
+  : path{path}
+{
+  cct = std::make_unique<CephContext>(CEPH_ENTITY_TYPE_OSD);
+  g_ceph_context = cct.get();
+  cct->_conf.set_config_values(values);
+  store = std::make_unique<BlueStore>(cct.get(), path);
+
+  // The worker is pinned to the last online CPU. When the CPU count cannot
+  // be determined a negative id is passed on instead.
+  const long nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+  long cpu_id = -1;
+  if (nr_cpus == -1) {
+    logger().error("{}: unable to get nproc: {}", __func__, errno);
+  } else {
+    cpu_id = nr_cpus - 1;
+  }
+  tp = std::make_unique<crimson::os::ThreadPool>(1, 128, cpu_id);
+}
+
+/// Start the thread pool; the returned future is the pool's own start future.
+seastar::future<> AlienStore::start()
+{
+  auto pool_started = tp->start();
+  return pool_started;
+}
+
+/// Release every cached collection handle and the store itself on the thread
+/// pool, then stop the pool.
+seastar::future<> AlienStore::stop()
+{
+  return tp->submit([this] {
+    // Bind by reference: copying each entry would duplicate the key and the
+    // ref-counted handle on the worker thread for no benefit.
+    for (auto& [cid, ch] : coll_map) {
+      static_cast<AlienCollection*>(ch.get())->collection.reset();
+    }
+    store.reset();
+  }).then([this] {
+    return tp->stop();
+  });
+}
+
+// Defaulted out of line, in the translation unit that includes the headers of
+// the store and thread-pool types held by the unique_ptr members.
+AlienStore::~AlienStore() = default;
+
+/// Mount the wrapped store on the thread pool; a non-zero result trips the
+/// assertion.
+seastar::future<> AlienStore::mount()
+{
+  logger().debug("{}", __func__);
+  auto mounted = tp->submit([this] {
+    return store->mount();
+  });
+  return mounted.then([] (int r) {
+    assert(r == 0);
+    return seastar::make_ready_future<>();
+  });
+}
+
+/// Close the transaction gate first, then unmount the wrapped store on the
+/// thread pool; a non-zero result trips the assertion.
+seastar::future<> AlienStore::umount()
+{
+  logger().info("{}", __func__);
+  auto unmount_on_pool = [this] {
+    return tp->submit([this] {
+      return store->umount();
+    });
+  };
+  return transaction_gate.close().then(
+    std::move(unmount_on_pool)
+  ).then([] (int r) {
+    assert(r == 0);
+    return seastar::make_ready_future<>();
+  });
+}
+
+/// Set the store's fsid on the calling thread, then run mkfs() on the thread
+/// pool; a non-zero result trips the assertion.
+seastar::future<> AlienStore::mkfs(uuid_d osd_fsid)
+{
+  logger().debug("{}", __func__);
+  store->set_fsid(osd_fsid);
+  auto formatted = tp->submit([this] {
+    return store->mkfs();
+  });
+  return formatted.then([] (int r) {
+    assert(r == 0);
+    return seastar::make_ready_future<>();
+  });
+}
+
+/// List objects of @p ch between @p start and @p end on the thread pool.
+/// Resolves to the collected objects together with the continuation key
+/// filled in by the store.
+///
+/// NOTE(review): @p limit is only used to reserve the result vector; the
+/// number of entries requested from the store is get_ideal_list_max() --
+/// confirm that this is intended.
+seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>>
+AlienStore::list_objects(CollectionRef ch,
+                         const ghobject_t& start,
+                         const ghobject_t& end,
+                         uint64_t limit) const
+{
+  logger().debug("{}", __func__);
+  using result_t = std::tuple<std::vector<ghobject_t>, ghobject_t>;
+  // start/end are copied into the closures so they outlive the caller's
+  // arguments while the request runs on the pool.
+  return seastar::do_with(std::vector<ghobject_t>(), ghobject_t(),
+    [this, ch, start, end, limit] (auto &objects, auto &next) {
+      objects.reserve(limit);
+      return tp->submit([this, ch, start, end, &objects, &next] {
+        auto coll = static_cast<AlienCollection*>(ch.get());
+        return store->collection_list(coll->collection, start, end,
+                                      store->get_ideal_list_max(),
+                                      &objects, &next);
+      }).then([&objects, &next] (int r) {
+        assert(r == 0);
+        return seastar::make_ready_future<result_t>(
+          std::make_tuple(std::move(objects), std::move(next)));
+      });
+    });
+}
+
+/// Create collection @p cid in the wrapped store (on the thread pool) and
+/// return the AlienCollection registered for it in coll_map. A new wrapper is
+/// inserted on first use; an existing wrapper is kept and only its handle is
+/// refreshed when the store returned a different one.
+seastar::future<CollectionRef> AlienStore::create_new_collection(const coll_t& cid)
+{
+  logger().debug("{}", __func__);
+  return tp->submit([this, cid] {
+    return store->create_new_collection(cid);
+  }).then([this] (ObjectStore::CollectionHandle c) {
+    // cid is not needed here: the handle carries its own collection id.
+    CollectionRef ch;
+    auto cp = coll_map.find(c->cid);
+    if (cp == coll_map.end()) {
+      ch = new AlienCollection(c);
+      // The key is known to be absent, so insert without a second lookup.
+      coll_map.emplace(c->cid, ch);
+    } else {
+      ch = cp->second;
+      auto ach = static_cast<AlienCollection*>(ch.get());
+      if (ach->collection != c) {
+        ach->collection = c;
+      }
+    }
+    return seastar::make_ready_future<CollectionRef>(ch);
+  });
+}
+
+/// Open collection @p cid in the wrapped store (on the thread pool) and
+/// return the AlienCollection registered for it in coll_map, creating the
+/// wrapper on first use and refreshing its handle when it differs.
+seastar::future<CollectionRef> AlienStore::open_collection(const coll_t& cid)
+{
+  logger().debug("{}", __func__);
+  return tp->submit([this, cid] {
+    return store->open_collection(cid);
+  }).then([this] (ObjectStore::CollectionHandle c) {
+    if (auto known = coll_map.find(c->cid); known != coll_map.end()) {
+      // Already wrapped: keep the wrapper, update the handle if it changed.
+      CollectionRef ch{known->second};
+      auto ach = static_cast<AlienCollection*>(ch.get());
+      if (ach->collection != c) {
+        ach->collection = c;
+      }
+      return seastar::make_ready_future<CollectionRef>(ch);
+    }
+    CollectionRef ch{new AlienCollection(c)};
+    coll_map[c->cid] = ch;
+    return seastar::make_ready_future<CollectionRef>(ch);
+  });
+}
+
+/// Collect the ids of all collections known to the wrapped store; a non-zero
+/// result from the store trips the assertion.
+seastar::future<std::vector<coll_t>> AlienStore::list_collections()
+{
+  logger().debug("{}", __func__);
+
+  using coll_list_t = std::vector<coll_t>;
+  return seastar::do_with(coll_list_t{}, [this] (auto &ls) {
+    return tp->submit([this, &ls] {
+      return store->list_collections(ls);
+    }).then([&ls] (int r) {
+      assert(r == 0);
+      return seastar::make_ready_future<coll_list_t>(std::move(ls));
+    });
+  });
+}
+
+/// Read @p len bytes at @p offset from @p oid on the thread pool.
+/// -ENOENT and -EIO are mapped to the corresponding errorator errors; every
+/// other result resolves to the buffer filled by the store.
+AlienStore::read_errorator::future<ceph::bufferlist>
+AlienStore::read(CollectionRef ch,
+                 const ghobject_t& oid,
+                 uint64_t offset,
+                 size_t len,
+                 uint32_t op_flags)
+{
+  logger().debug("{}", __func__);
+  return seastar::do_with(ceph::bufferlist{},
+    [this, ch, oid, offset, len, op_flags] (auto &bl) {
+      return tp->submit([this, ch, oid, offset, len, op_flags, &bl] {
+        auto coll = static_cast<AlienCollection*>(ch.get());
+        return store->read(coll->collection, oid, offset, len, bl, op_flags);
+      }).then([&bl] (int r) -> read_errorator::future<ceph::bufferlist> {
+        switch (r) {
+        case -ENOENT:
+          return crimson::ct_error::enoent::make();
+        case -EIO:
+          return crimson::ct_error::input_output_error::make();
+        default:
+          return read_errorator::make_ready_future<ceph::bufferlist>(std::move(bl));
+        }
+      });
+    });
+}
+
+/// Vectored read of the extents in @p m from @p oid on the thread pool.
+/// @p m is captured by reference, so it has to stay alive until the returned
+/// future resolves. -ENOENT and -EIO are mapped to errorator errors; every
+/// other result resolves to the buffer filled by the store.
+AlienStore::read_errorator::future<ceph::bufferlist>
+AlienStore::readv(CollectionRef ch,
+                  const ghobject_t& oid,
+                  interval_set<uint64_t>& m,
+                  uint32_t op_flags)
+{
+  logger().debug("{}", __func__);
+  return seastar::do_with(ceph::bufferlist{},
+    [this, ch, oid, &m, op_flags] (auto& bl) {
+      return tp->submit([this, ch, oid, &m, op_flags, &bl] {
+        auto coll = static_cast<AlienCollection*>(ch.get());
+        return store->readv(coll->collection, oid, m, bl, op_flags);
+      }).then([&bl] (int r) -> read_errorator::future<ceph::bufferlist> {
+        if (r == -ENOENT) {
+          return crimson::ct_error::enoent::make();
+        }
+        if (r == -EIO) {
+          return crimson::ct_error::input_output_error::make();
+        }
+        return read_errorator::make_ready_future<ceph::bufferlist>(std::move(bl));
+      });
+    });
+}
+
+/// Fetch attribute @p name of @p oid on the thread pool.
+/// -ENOENT and -ENODATA are mapped to the corresponding errorator errors;
+/// every other result resolves to the value buffer filled by the store.
+/// @p name is a view: the characters it refers to must stay valid until the
+/// request has run on the pool.
+AlienStore::get_attr_errorator::future<ceph::bufferptr>
+AlienStore::get_attr(CollectionRef ch,
+                     const ghobject_t& oid,
+                     std::string_view name) const
+{
+  logger().debug("{}", __func__);
+  return seastar::do_with(ceph::bufferptr{}, [=] (auto &value) {
+    return tp->submit([=, &value] {
+      auto c = static_cast<AlienCollection*>(ch.get());
+      // getattr() takes a C string, so build a temporary std::string.
+      return store->getattr(c->collection, oid,
+                            static_cast<std::string>(name).c_str(), value);
+    }).then([&value] (int r) -> get_attr_errorator::future<ceph::bufferptr> {
+      // oid is not used here, so it is no longer copied into this closure.
+      if (r == -ENOENT) {
+        return crimson::ct_error::enoent::make();
+      } else if (r == -ENODATA) {
+        return crimson::ct_error::enodata::make();
+      } else {
+        return get_attr_errorator::make_ready_future<ceph::bufferptr>(
+          std::move(value));
+      }
+    });
+  });
+}
+
+/// Fetch all attributes of @p oid on the thread pool. -ENOENT is mapped to
+/// the enoent error; every other result resolves to the collected set.
+AlienStore::get_attrs_ertr::future<AlienStore::attrs_t>
+AlienStore::get_attrs(CollectionRef ch,
+                      const ghobject_t& oid)
+{
+  logger().debug("{}", __func__);
+  return seastar::do_with(attrs_t{}, [this, ch, oid] (auto &aset) {
+    return tp->submit([this, ch, oid, &aset] {
+      auto coll = static_cast<AlienCollection*>(ch.get());
+      // The store API expects a plain map reference, hence the cast.
+      return store->getattrs(coll->collection, oid,
+                             reinterpret_cast<map<string,bufferptr>&>(aset));
+    }).then([&aset] (int r) -> get_attrs_ertr::future<attrs_t> {
+      if (r != -ENOENT) {
+        return get_attrs_ertr::make_ready_future<attrs_t>(std::move(aset));
+      }
+      return crimson::ct_error::enoent::make();
+    });
+  });
+}
+
+/// Look up the omap values of @p oid for the given @p keys on the thread
+/// pool. -ENOENT is mapped to the enoent error; any other non-zero result
+/// trips the assertion.
+auto AlienStore::omap_get_values(CollectionRef ch,
+                                 const ghobject_t& oid,
+                                 const set<string>& keys)
+  -> read_errorator::future<omap_values_t>
+{
+  logger().debug("{}", __func__);
+  // oid and keys are copied into the closures that run on the pool.
+  return seastar::do_with(omap_values_t{}, [this, ch, oid, keys] (auto &values) {
+    return tp->submit([this, ch, oid, keys, &values] {
+      auto coll = static_cast<AlienCollection*>(ch.get());
+      return store->omap_get_values(coll->collection, oid, keys,
+        reinterpret_cast<map<string, bufferlist>*>(&values));
+    }).then([&values] (int r) -> read_errorator::future<omap_values_t> {
+      if (r == -ENOENT) {
+        return crimson::ct_error::enoent::make();
+      }
+      assert(r == 0);
+      return read_errorator::make_ready_future<omap_values_t>(std::move(values));
+    });
+  });
+}
+
+/// Fetch omap values of @p oid beginning at the optional @p start key on the
+/// thread pool. -ENOENT maps to enoent, any other negative result is logged
+/// and mapped to input_output_error. On success the first tuple element is
+/// always true.
+auto AlienStore::omap_get_values(CollectionRef ch,
+                                 const ghobject_t &oid,
+                                 const std::optional<string> &start)
+  -> read_errorator::future<std::tuple<bool, omap_values_t>>
+{
+  logger().debug("{} with_start", __func__);
+  using result_t = std::tuple<bool, omap_values_t>;
+  return seastar::do_with(omap_values_t{}, [this, ch, oid, start] (auto &values) {
+    return tp->submit([this, ch, oid, start, &values] {
+      auto coll = static_cast<AlienCollection*>(ch.get());
+      return store->omap_get_values(coll->collection, oid, start,
+        reinterpret_cast<map<string, bufferlist>*>(&values));
+    }).then([&values] (int r) -> read_errorator::future<result_t> {
+      if (r == -ENOENT) {
+        return crimson::ct_error::enoent::make();
+      }
+      if (r < 0) {
+        logger().error("omap_get_values(start): {}", r);
+        return crimson::ct_error::input_output_error::make();
+      }
+      return read_errorator::make_ready_future<result_t>(
+        std::make_tuple(true, std::move(values)));
+    });
+  });
+}
+
+// Queue `txn` on the wrapped store and resolve once its commit callback has
+// run.
+//
+// Flow, as written below:
+//  1. The calling shard id is recorded so that OnCommit can deliver the
+//     completion back to this shard.
+//  2. The transaction and a promise are moved into do_with(), which keeps
+//     them alive for the whole chain.
+//  3. The chain runs inside transaction_gate (umount() closes that gate) and
+//     under tp_mutex, which is taken before the contexts are collected and
+//     released once queue_transaction() has returned.
+//  4. The returned future is the promise's future; OnCommit::finish() sets it.
+//
+// NOTE(review): tp_mutex.unlock() sits in a plain .then() continuation, so it
+// only runs when the submit future resolves successfully -- confirm that a
+// failed future cannot leave the mutex locked.
+seastar::future<> AlienStore::do_transaction(CollectionRef ch,
+ ceph::os::Transaction&& txn)
+{
+ logger().debug("{}", __func__);
+ auto id = seastar::this_shard_id();
+ auto done = seastar::promise<>();
+ return seastar::do_with(
+ std::move(txn),
+ std::move(done),
+ [this, ch, id] (auto &txn, auto &done) {
+ return seastar::with_gate(transaction_gate, [this, ch, id, &txn, &done] {
+ return tp_mutex.lock().then ([this, ch, id, &txn, &done] {
+ // Collected on the reactor thread, before the work moves to the pool.
+ Context *crimson_wrapper =
+ ceph::os::Transaction::collect_all_contexts(txn);
+ return tp->submit([this, ch, id, crimson_wrapper, &txn, &done] {
+ // The callback is registered on the pool thread, right before queuing.
+ txn.register_on_commit(new OnCommit(id, done, crimson_wrapper, txn));
+ auto c = static_cast<AlienCollection*>(ch.get());
+ return store->queue_transaction(c->collection, std::move(txn));
+ });
+ }).then([this, &done] (int r) {
+ assert(r == 0);
+ tp_mutex.unlock();
+ return done.get_future();
+ });
+ });
+ });
+}
+
+/// Store the @p key / @p value metadata pair through the wrapped store on the
+/// thread pool; a non-zero result trips the assertion.
+seastar::future<> AlienStore::write_meta(const std::string& key,
+                                         const std::string& value)
+{
+  logger().debug("{}", __func__);
+  // Both strings are copied into the closure that runs on the pool.
+  auto written = tp->submit([this, key, value] {
+    return store->write_meta(key, value);
+  });
+  return written.then([] (int r) {
+    assert(r == 0);
+    return seastar::now();
+  });
+}
+
+/// Read the metadata value stored under @p key on the thread pool.
+/// Resolves to (result code, value). When the store returns a positive code
+/// the value is cut to that many characters and trailing whitespace is
+/// trimmed; otherwise the value is returned empty.
+///
+/// NOTE(review): a result of 0 also takes the "clear" branch -- confirm that
+/// the wrapped store never reports success with 0 while filling in a value.
+seastar::future<std::tuple<int, std::string>>
+AlienStore::read_meta(const std::string& key)
+{
+  logger().debug("{}", __func__);
+  return tp->submit([this, key] {
+    std::string value;
+    int r = store->read_meta(key, &value);
+    if (r > 0) {
+      value.resize(r);
+      boost::algorithm::trim_right_if(value,
+        [] (unsigned char c) {return isspace(c);});
+    } else {
+      value.clear();
+    }
+    // Move the string into the pair instead of copying it.
+    return std::make_pair(r, std::move(value));
+  }).then([] (auto entry) {
+    return seastar::make_ready_future<std::tuple<int, std::string>>(
+      std::move(entry));
+  });
+}
+
+/// Return the wrapped store's fsid; called directly, without the thread pool.
+uuid_d AlienStore::get_fsid() const
+{
+  logger().debug("{}", __func__);
+  uuid_d fsid = store->get_fsid();
+  return fsid;
+}
+
+/// Query store-wide usage statistics on the thread pool; a non-zero result
+/// from statfs() trips the assertion.
+seastar::future<store_statfs_t> AlienStore::stat() const
+{
+  logger().info("{}", __func__);
+  return seastar::do_with(store_statfs_t{}, [this] (store_statfs_t &stats) {
+    auto queried = tp->submit([this, &stats] {
+      return store->statfs(&stats, nullptr);
+    });
+    return queried.then([&stats] (int r) {
+      assert(r == 0);
+      return seastar::make_ready_future<store_statfs_t>(std::move(stats));
+    });
+  });
+}
+
+/// Maximum attribute name length reported by this store (a fixed 256).
+unsigned AlienStore::get_max_attr_name_length() const
+{
+  logger().info("{}", __func__);
+  constexpr unsigned max_attr_name_length = 256;
+  return max_attr_name_length;
+}
+
+/// Stat object @p oid on the thread pool and resolve to the filled structure.
+/// The store's return code is not inspected, so a failed call resolves to
+/// whatever the store left in the zero-initialised structure.
+seastar::future<struct stat> AlienStore::stat(
+  CollectionRef ch,
+  const ghobject_t& oid)
+{
+  return seastar::do_with((struct stat){}, [this, ch, oid] (auto& st) {
+    return tp->submit([this, ch, oid, &st] {
+      auto coll = static_cast<AlienCollection*>(ch.get());
+      store->stat(coll->collection, oid, &st);
+      return st;
+    });
+  });
+}
+
+/// Read the omap header of @p oid on the thread pool. -ENOENT maps to enoent;
+/// any other negative result is logged and mapped to input_output_error.
+auto AlienStore::omap_get_header(CollectionRef ch,
+                                 const ghobject_t& oid)
+  -> read_errorator::future<ceph::bufferlist>
+{
+  return seastar::do_with(ceph::bufferlist(), [this, ch, oid] (auto& bl) {
+    return tp->submit([this, ch, oid, &bl] {
+      auto coll = static_cast<AlienCollection*>(ch.get());
+      return store->omap_get_header(coll->collection, oid, &bl);
+    }).then([&bl] (int r) -> read_errorator::future<ceph::bufferlist> {
+      if (r == -ENOENT) {
+        return crimson::ct_error::enoent::make();
+      }
+      if (r < 0) {
+        logger().error("omap_get_header: {}", r);
+        return crimson::ct_error::input_output_error::make();
+      }
+      return read_errorator::make_ready_future<ceph::bufferlist>(std::move(bl));
+    });
+  });
+}
+
+/// Query the extent map of @p oid for [off, off + len) on the thread pool.
+/// The store's return code is ignored; the future always resolves to the map
+/// as the store left it.
+seastar::future<std::map<uint64_t, uint64_t>> AlienStore::fiemap(
+  CollectionRef ch,
+  const ghobject_t& oid,
+  uint64_t off,
+  uint64_t len)
+{
+  using extent_map_t = std::map<uint64_t, uint64_t>;
+  return seastar::do_with(extent_map_t(), [this, ch, oid, off, len] (auto& destmap) {
+    return tp->submit([this, ch, oid, off, len, &destmap] {
+      auto coll = static_cast<AlienCollection*>(ch.get());
+      return store->fiemap(coll->collection, oid, off, len, destmap);
+    }).then([&destmap] (int) {
+      return seastar::make_ready_future<extent_map_t>(std::move(destmap));
+    });
+  });
+}
+
+/// Create an omap iterator for @p oid on the thread pool and wrap it in an
+/// AlienOmapIterator that refers back to this store.
+seastar::future<FuturizedStore::OmapIteratorRef> AlienStore::get_omap_iterator(
+  CollectionRef ch,
+  const ghobject_t& oid)
+{
+  return tp->submit([this, ch, oid] {
+    auto coll = static_cast<AlienCollection*>(ch.get());
+    auto iter = store->get_omap_iterator(coll->collection, oid);
+    auto wrapped = new AlienStore::AlienOmapIterator(iter, this);
+    return FuturizedStore::OmapIteratorRef(wrapped);
+  });
+}
+
+//TODO: each iterator op needs one submit, this is not efficient,
+// needs further optimization.
+/// Position the wrapped iterator on its first entry, on the store's thread
+/// pool; a non-zero result trips the assertion.
+seastar::future<> AlienStore::AlienOmapIterator::seek_to_first()
+{
+  auto moved = store->tp->submit([this] {
+    return iter->seek_to_first();
+  });
+  return moved.then([] (int r) {
+    assert(r == 0);
+    return seastar::make_ready_future<>();
+  });
+}
+
+/// Forward upper_bound(@p after) to the wrapped iterator on the store's
+/// thread pool; a non-zero result trips the assertion.
+seastar::future<> AlienStore::AlienOmapIterator::upper_bound(
+  const std::string& after)
+{
+  // The key is copied into the closure that runs on the pool.
+  auto moved = store->tp->submit([this, after] {
+    return iter->upper_bound(after);
+  });
+  return moved.then([] (int r) {
+    assert(r == 0);
+    return seastar::make_ready_future<>();
+  });
+}
+
+/// Forward lower_bound(@p to) to the wrapped iterator on the store's thread
+/// pool; a non-zero result trips the assertion.
+seastar::future<> AlienStore::AlienOmapIterator::lower_bound(
+  const std::string& to)
+{
+  // The key is copied into the closure that runs on the pool.
+  auto moved = store->tp->submit([this, to] {
+    return iter->lower_bound(to);
+  });
+  return moved.then([] (int r) {
+    assert(r == 0);
+    return seastar::make_ready_future<>();
+  });
+}
+
+/// Advance the wrapped iterator by one entry on the store's thread pool; a
+/// non-zero result trips the assertion.
+seastar::future<> AlienStore::AlienOmapIterator::next()
+{
+  auto advanced = store->tp->submit([this] {
+    return iter->next();
+  });
+  return advanced.then([] (int r) {
+    assert(r == 0);
+    return seastar::make_ready_future<>();
+  });
+}
+
+/// Ask the wrapped iterator whether it is valid; called directly, without
+/// the thread pool.
+bool AlienStore::AlienOmapIterator::valid() const
+{
+  const bool is_valid = iter->valid();
+  return is_valid;
+}
+
+/// Return the wrapped iterator's key(); called directly, without the thread
+/// pool.
+std::string AlienStore::AlienOmapIterator::key()
+{
+  std::string current_key = iter->key();
+  return current_key;
+}
+
+/// Resolve to the wrapped iterator's tail_key(), evaluated on the store's
+/// thread pool.
+seastar::future<std::string> AlienStore::AlienOmapIterator::tail_key()
+{
+  auto read_tail_key = [this] {
+    return iter->tail_key();
+  };
+  return store->tp->submit(std::move(read_tail_key));
+}
+
+/// Return the wrapped iterator's value(); called directly, without the
+/// thread pool.
+ceph::buffer::list AlienStore::AlienOmapIterator::value()
+{
+  ceph::buffer::list current_value = iter->value();
+  return current_value;
+}
+
+/// Return the wrapped iterator's status code; called directly, without the
+/// thread pool.
+int AlienStore::AlienOmapIterator::status() const
+{
+  const int iter_status = iter->status();
+  return iter_status;
+}
+
+}
diff --git a/src/crimson/os/alienstore/alien_store.h b/src/crimson/os/alienstore/alien_store.h
new file mode 100644
index 000000000..92739340e
--- /dev/null
+++ b/src/crimson/os/alienstore/alien_store.h
@@ -0,0 +1,125 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/core/shared_mutex.hh>
+
+#include "common/ceph_context.h"
+#include "os/ObjectStore.h"
+#include "osd/osd_types.h"
+
+#include "crimson/os/alienstore/thread_pool.h"
+#include "crimson/os/futurized_collection.h"
+#include "crimson/os/futurized_store.h"
+
+namespace ceph::os {
+class Transaction;
+}
+
+namespace crimson::os {
+// FuturizedStore implementation that forwards its operations to a classic
+// ObjectStore (alien_store.cc constructs a BlueStore) through a
+// crimson::os::ThreadPool and returns the results as seastar futures.
+class AlienStore final : public FuturizedStore {
+public:
+ // Omap iterator that wraps an ObjectMap::ObjectMapIterator. The calls that
+ // return futures are submitted to the owning store's thread pool; valid(),
+ // key(), value() and status() call the wrapped iterator directly.
+ class AlienOmapIterator final : public OmapIterator {
+ public:
+ AlienOmapIterator(ObjectMap::ObjectMapIterator& it,
+ AlienStore* store) : iter(it), store(store) {}
+ seastar::future<> seek_to_first();
+ seastar::future<> upper_bound(const std::string& after);
+ seastar::future<> lower_bound(const std::string& to);
+ bool valid() const;
+ seastar::future<> next();
+ std::string key();
+ seastar::future<std::string> tail_key();
+ ceph::buffer::list value();
+ int status() const;
+ private:
+ ObjectMap::ObjectMapIterator iter;
+ // Non-owning back pointer, used to reach the store's thread pool.
+ AlienStore* store;
+ };
+ AlienStore(const std::string& path, const ConfigValues& values);
+ ~AlienStore() final;
+
+ // Lifecycle: start()/stop() manage the thread pool, mount()/umount() the
+ // wrapped store.
+ seastar::future<> start() final;
+ seastar::future<> stop() final;
+ seastar::future<> mount() final;
+ seastar::future<> umount() final;
+
+ seastar::future<> mkfs(uuid_d new_osd_fsid) final;
+ read_errorator::future<ceph::bufferlist> read(CollectionRef c,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ uint32_t op_flags = 0) final;
+ // `m` is captured by reference by the implementation and must outlive the
+ // returned future.
+ read_errorator::future<ceph::bufferlist> readv(CollectionRef c,
+ const ghobject_t& oid,
+ interval_set<uint64_t>& m,
+ uint32_t op_flags = 0) final;
+
+
+ get_attr_errorator::future<ceph::bufferptr> get_attr(CollectionRef c,
+ const ghobject_t& oid,
+ std::string_view name) const final;
+ get_attrs_ertr::future<attrs_t> get_attrs(CollectionRef c,
+ const ghobject_t& oid) final;
+
+ read_errorator::future<omap_values_t> omap_get_values(
+ CollectionRef c,
+ const ghobject_t& oid,
+ const omap_keys_t& keys) final;
+
+ /// Retrieves paged set of values > start (if present)
+ read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values(
+ CollectionRef c, ///< [in] collection
+ const ghobject_t &oid, ///< [in] oid
+ const std::optional<std::string> &start ///< [in] start, empty for begin
+ ) final; ///< @return <done, values> values.empty() iff done
+
+ seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects(
+ CollectionRef c,
+ const ghobject_t& start,
+ const ghobject_t& end,
+ uint64_t limit) const final;
+
+ seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final;
+ seastar::future<CollectionRef> open_collection(const coll_t& cid) final;
+ seastar::future<std::vector<coll_t>> list_collections() final;
+
+ // Resolves once the commit callback of the queued transaction has run.
+ seastar::future<> do_transaction(CollectionRef c,
+ ceph::os::Transaction&& txn) final;
+
+ seastar::future<> write_meta(const std::string& key,
+ const std::string& value) final;
+ // Resolves to (result code, value).
+ seastar::future<std::tuple<int, std::string>> read_meta(
+ const std::string& key) final;
+ uuid_d get_fsid() const final;
+ // Store-wide statistics (statfs).
+ seastar::future<store_statfs_t> stat() const final;
+ unsigned get_max_attr_name_length() const final;
+ // Per-object stat.
+ seastar::future<struct stat> stat(
+ CollectionRef,
+ const ghobject_t&) final;
+ read_errorator::future<ceph::bufferlist> omap_get_header(
+ CollectionRef,
+ const ghobject_t&) final;
+ seastar::future<std::map<uint64_t, uint64_t>> fiemap(
+ CollectionRef,
+ const ghobject_t&,
+ uint64_t off,
+ uint64_t len) final;
+ seastar::future<FuturizedStore::OmapIteratorRef> get_omap_iterator(
+ CollectionRef ch,
+ const ghobject_t& oid) final;
+
+private:
+ // NOTE(review): MAX_KEYS_PER_OMAP_GET_CALL and used_bytes are not referenced
+ // by the member functions shown in alien_store.cc -- confirm they are needed.
+ constexpr static unsigned MAX_KEYS_PER_OMAP_GET_CALL = 32;
+ // mutable so that const member functions can submit work to the pool.
+ mutable std::unique_ptr<crimson::os::ThreadPool> tp;
+ const std::string path;
+ uint64_t used_bytes = 0;
+ // NOTE(review): members are destroyed in reverse declaration order, so cct
+ // goes away before store unless stop() has already reset store -- confirm.
+ std::unique_ptr<ObjectStore> store;
+ std::unique_ptr<CephContext> cct;
+ // Entered by do_transaction(), closed by umount().
+ seastar::gate transaction_gate;
+ // Collections handed out by create_new_collection()/open_collection().
+ std::unordered_map<coll_t, CollectionRef> coll_map;
+ // Taken by do_transaction() while a transaction is being queued.
+ seastar::shared_mutex tp_mutex;
+};
+}
diff --git a/src/crimson/os/alienstore/thread_pool.cc b/src/crimson/os/alienstore/thread_pool.cc
new file mode 100644
index 000000000..e127d87d5
--- /dev/null
+++ b/src/crimson/os/alienstore/thread_pool.cc
@@ -0,0 +1,80 @@
+#include "thread_pool.h"
+
+#include <chrono>
+#include <pthread.h>
+
+#include "include/ceph_assert.h"
+#include "crimson/common/config_proxy.h"
+
+using crimson::common::local_conf;
+
+namespace crimson::os {
+
+ /// Build the pool: round the queue depth up to a multiple of the shard
+ /// count, then spawn @p n_threads workers. Each worker pins itself to
+ /// @p cpu_id when that id is non-negative and then serves the queue.
+ ThreadPool::ThreadPool(size_t n_threads,
+                        size_t queue_sz,
+                        long cpu_id)
+   : queue_size{round_up_to(queue_sz, seastar::smp::count)},
+     pending{queue_size}
+ {
+   const auto max_wait =
+     std::chrono::seconds(local_conf()->threadpool_empty_queue_max_wait);
+   const bool want_pinning = cpu_id >= 0;
+   for (size_t n = 0; n != n_threads; ++n) {
+     threads.emplace_back([this, cpu_id, want_pinning, max_wait] {
+       if (want_pinning) {
+         pin(cpu_id);
+       }
+       loop(max_wait);
+     });
+   }
+ }
+
+ /// Tear the pool down: ask every worker to exit, then join them.
+ ///
+ /// stop() normally raises the flag already; raising it again is harmless
+ /// and keeps join() from blocking forever when the pool is destroyed
+ /// without stop() having been called. Workers still drain whatever is left
+ /// in the pending queue before they leave loop().
+ ThreadPool::~ThreadPool()
+ {
+   {
+     // publish the flag under the mutex the workers wait on so that none of
+     // them can miss the notification below
+     std::unique_lock lock{mutex};
+     stopping = true;
+   }
+   cond.notify_all();
+   for (auto& thread : threads) {
+     if (thread.joinable()) {
+       thread.join();
+     }
+   }
+ }
+
+ /// Restrict the calling thread to the single core @p cpu_id; asserts that
+ /// the affinity call succeeded.
+ void ThreadPool::pin(unsigned cpu_id)
+ {
+   cpu_set_t affinity;
+   CPU_ZERO(&affinity);
+   CPU_SET(cpu_id, &affinity);
+   [[maybe_unused]] const int rc =
+     pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
+   ceph_assert(rc == 0);
+ }
+
+ /// Worker body: wait (at most @p queue_max_wait per round) for a work item
+ /// or for the stop flag, run the item if one was popped, and leave once the
+ /// pool is stopping and nothing was popped.
+ void ThreadPool::loop(std::chrono::milliseconds queue_max_wait)
+ {
+   while (true) {
+     WorkItem* next = nullptr;
+     {
+       std::unique_lock guard{mutex};
+       cond.wait_for(guard, queue_max_wait, [this, &next] {
+         return pending.pop(next) || is_stopping();
+       });
+     }
+     if (next != nullptr) {
+       next->process();
+       continue;
+     }
+     if (is_stopping()) {
+       return;
+     }
+   }
+ }
+
+ /// Start the sharded submit queue, giving every shard an equal share of
+ /// the queue slots.
+ seastar::future<> ThreadPool::start()
+ {
+   return submit_queue.start(queue_size / seastar::smp::count);
+ }
+
+ /// Stop accepting work: close the per-shard submit queues, then raise the
+ /// stop flag and wake every worker.
+ seastar::future<> ThreadPool::stop()
+ {
+   return submit_queue.stop().then([this] {
+     {
+       // Flip the flag while holding the mutex the workers wait on. Without
+       // it a worker that has just evaluated its wait predicate could miss
+       // the notification and sleep until its wait times out.
+       std::unique_lock lock{mutex};
+       stopping = true;
+     }
+     cond.notify_all();
+   });
+ }
+
+} // namespace crimson::os
diff --git a/src/crimson/os/alienstore/thread_pool.h b/src/crimson/os/alienstore/thread_pool.h
new file mode 100644
index 000000000..27840da18
--- /dev/null
+++ b/src/crimson/os/alienstore/thread_pool.h
@@ -0,0 +1,132 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+#include <atomic>
+#include <condition_variable>
+#include <tuple>
+#include <type_traits>
+#include <boost/lockfree/queue.hpp>
+#include <boost/optional.hpp>
+#include <seastar/core/future.hh>
+#include <seastar/core/gate.hh>
+#include <seastar/core/reactor.hh>
+#include <seastar/core/semaphore.hh>
+#include <seastar/core/sharded.hh>
+
+namespace crimson::os {
+
+ /// Type-erased unit of work executed by a pool thread.
+ struct WorkItem {
+   virtual ~WorkItem() = default;
+   /// run the work; implemented by the concrete item
+   virtual void process() = 0;
+ };
+
+ /// A WorkItem wrapping a nullary callable of type @c Func.
+ ///
+ /// process() invokes the callable and records its result (or the exception
+ /// it threw) in @c state, then signals @c on_done. get_future() waits on
+ /// @c on_done and converts the recorded state into a future of the
+ /// callable's result type.
+ template<typename Func>
+ struct Task final : WorkItem {
+ using T = std::invoke_result_t<Func>;
+ // storage type for the result; the void case maps to the empty stored type
+ using future_stored_type_t =
+ std::conditional_t<std::is_void_v<T>,
+ seastar::internal::future_stored_type_t<>,
+ seastar::internal::future_stored_type_t<T>>;
+ using futurator_t = seastar::futurize<T>;
+ public:
+ explicit Task(Func&& f)
+ : func(std::move(f))
+ {}
+ /// Run the callable, capture value or exception, then signal completion.
+ void process() override {
+ try {
+ if constexpr (std::is_void_v<T>) {
+ func();
+ state.set();
+ } else {
+ state.set(func());
+ }
+ } catch (...) {
+ state.set_exception(std::current_exception());
+ }
+ // signalled on both the success and the failure path
+ on_done.write_side().signal(1);
+ }
+ /// @return a future that resolves, once process() has signalled, to the
+ ///         callable's value or to the exception it threw
+ typename futurator_t::type get_future() {
+ return on_done.wait().then([this](size_t) {
+ if (state.failed()) {
+ return futurator_t::make_exception_future(state.get_exception());
+ } else {
+ return futurator_t::from_tuple(state.get_value());
+ }
+ });
+ }
+ private:
+ Func func;
+ seastar::future_state<future_stored_type_t> state;
+ seastar::readable_eventfd on_done;
+ };
+
+ /// Per-shard submission bookkeeping used through seastar::sharded<>.
+ struct SubmitQueue {
+ // number of tasks this shard may still hand to the pool
+ seastar::semaphore free_slots;
+ // gate entered for the duration of every submitted task
+ seastar::gate pending_tasks;
+ explicit SubmitQueue(size_t num_free_slots)
+ : free_slots(num_free_slots)
+ {}
+ /// Close the gate; the returned future is the one gate::close() yields.
+ seastar::future<> stop() {
+ return pending_tasks.close();
+ }
+ };
+
+ /// an engine for scheduling non-seastar tasks from seastar fibers
+ class ThreadPool {
+   std::atomic<bool> stopping = false;
+   std::mutex mutex;
+   std::condition_variable cond;
+   std::vector<std::thread> threads;
+   seastar::sharded<SubmitQueue> submit_queue;
+   const size_t queue_size;
+   boost::lockfree::queue<WorkItem*> pending;
+
+   /// worker thread body, see thread_pool.cc
+   void loop(std::chrono::milliseconds queue_max_wait);
+   bool is_stopping() const {
+     return stopping.load(std::memory_order_relaxed);
+   }
+   static void pin(unsigned cpu_id);
+   seastar::semaphore& local_free_slots() {
+     return submit_queue.local().free_slots;
+   }
+   ThreadPool(const ThreadPool&) = delete;
+   ThreadPool& operator=(const ThreadPool&) = delete;
+ public:
+   /**
+    * @param n_threads the number of threads in this thread pool.
+    * @param queue_sz the depth of pending queue. before a task is scheduled,
+    *                 it waits in this queue. we will round this number to
+    *                 multiple of the number of cores.
+    * @param cpu the CPU core the worker threads are pinned to; a negative
+    *            value disables pinning
+    * @note each @c Task owns a seastar::readable_eventfd, which possesses
+    *       an fd, so we should keep the size of queue under a reasonable limit.
+    */
+   ThreadPool(size_t n_threads, size_t queue_sz, long cpu);
+   ~ThreadPool();
+   seastar::future<> start();
+   seastar::future<> stop();
+   /**
+    * Run @p func(@p args...) on a pool thread.
+    *
+    * @return a future carrying the result (or exception) of the call
+    * @note @p func is forwarded into the task: an rvalue is moved, an lvalue
+    *       is copied and left intact.
+    * @note @p args are captured by reference (std::forward_as_tuple), so the
+    *       caller must keep them alive until the returned future resolves.
+    */
+   template<typename Func, typename...Args>
+   auto submit(Func&& func, Args&&... args) {
+     // std::forward, not std::move: Func&& is a forwarding reference and an
+     // unconditional move would silently gut a callable passed as an lvalue
+     auto packaged = [func=std::forward<Func>(func),
+                      args=std::forward_as_tuple(args...)] {
+       return std::apply(std::move(func), std::move(args));
+     };
+     return seastar::with_gate(submit_queue.local().pending_tasks,
+       [packaged=std::move(packaged), this] {
+         // wait for a free slot on this shard before queueing the task
+         return local_free_slots().wait()
+           .then([packaged=std::move(packaged), this] {
+             auto task = new Task{std::move(packaged)};
+             auto fut = task->get_future();
+             pending.push(task);
+             cond.notify_one();
+             // release the slot and the task whether it succeeded or threw
+             return fut.finally([task, this] {
+               local_free_slots().signal();
+               delete task;
+             });
+           });
+       });
+   }
+ };
+
+} // namespace crimson::os
diff --git a/src/crimson/os/cyanstore/CMakeLists.txt b/src/crimson/os/cyanstore/CMakeLists.txt
new file mode 100644
index 000000000..65f2b5498
--- /dev/null
+++ b/src/crimson/os/cyanstore/CMakeLists.txt
@@ -0,0 +1,7 @@
+ # static library holding the CyanStore sources
+ add_library(crimson-cyanstore STATIC
+ cyan_store.cc
+ cyan_collection.cc
+ cyan_object.cc)
+ # links against the crimson and crimson-os targets
+ target_link_libraries(crimson-cyanstore
+ crimson
+ crimson-os)
diff --git a/src/crimson/os/cyanstore/cyan_collection.cc b/src/crimson/os/cyanstore/cyan_collection.cc
new file mode 100644
index 000000000..f44234e84
--- /dev/null
+++ b/src/crimson/os/cyanstore/cyan_collection.cc
@@ -0,0 +1,76 @@
+#include "cyan_collection.h"
+
+#include "cyan_object.h"
+
+namespace crimson::os
+{
+
+ /// Create an empty collection identified by @p c.
+ Collection::Collection(const coll_t& c)
+ : FuturizedCollection{c}
+ {}
+
+ // defined out of line; Object is only forward-declared in cyan_collection.h
+ Collection::~Collection() = default;
+
+ /// Allocate a fresh, empty Object; it is not registered in this collection.
+ Collection::ObjectRef Collection::create_object() const
+ {
+   return ObjectRef{new Object};
+ }
+
+ /// Look @p oid up in the hash index.
+ /// @return the object, or a null reference when it is not present
+ Collection::ObjectRef Collection::get_object(ghobject_t oid)
+ {
+   if (auto found = object_hash.find(oid); found != object_hash.end()) {
+     return found->second;
+   }
+   return ObjectRef();
+ }
+
+ /// Return the object stored under @p oid, creating and registering it in
+ /// both indexes when it does not exist yet.
+ Collection::ObjectRef Collection::get_or_create_object(ghobject_t oid)
+ {
+   auto [slot, inserted] = object_hash.emplace(oid, ObjectRef{});
+   if (inserted) {
+     slot->second = create_object();
+     object_map[oid] = slot->second;
+   }
+   return slot->second;
+ }
+
+ /// Sum of the data sizes of every object in the collection.
+ uint64_t Collection::used_bytes() const
+ {
+   uint64_t total = 0;
+   for ([[maybe_unused]] const auto& [oid, obj] : object_map) {
+     total += obj->get_size();
+   }
+   return total;
+ }
+
+ /// Serialize the collection: xattrs, the page-set flag, the object count
+ /// and then every (oid, object) pair in object_map order.
+ void Collection::encode(bufferlist& bl) const
+ {
+   ENCODE_START(1, 1, bl);
+   encode(xattr, bl);
+   encode(use_page_set, bl);
+   uint32_t count = object_map.size();
+   encode(count, bl);
+   for (auto it = object_map.begin(); it != object_map.end(); ++it) {
+     encode(it->first, bl);
+     it->second->encode(bl);
+   }
+   ENCODE_FINISH(bl);
+ }
+
+ /// Inverse of encode(): read xattrs, the page-set flag and the objects,
+ /// registering every decoded object in both indexes.
+ void Collection::decode(bufferlist::const_iterator& p)
+ {
+   DECODE_START(1, p);
+   decode(xattr, p);
+   decode(use_page_set, p);
+   uint32_t remaining;
+   decode(remaining, p);
+   for (; remaining > 0; --remaining) {
+     ghobject_t oid;
+     decode(oid, p);
+     auto obj = create_object();
+     obj->decode(p);
+     object_map.emplace(oid, obj);
+     object_hash.emplace(oid, obj);
+   }
+   DECODE_FINISH(p);
+ }
+
+}
diff --git a/src/crimson/os/cyanstore/cyan_collection.h b/src/crimson/os/cyanstore/cyan_collection.h
new file mode 100644
index 000000000..068e427d8
--- /dev/null
+++ b/src/crimson/os/cyanstore/cyan_collection.h
@@ -0,0 +1,51 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include "include/buffer.h"
+#include "osd/osd_types.h"
+
+#include "crimson/os/futurized_collection.h"
+
+namespace crimson::os {
+
+class Object;
+/**
+ * a collection also orders transactions
+ *
+ * Any transactions queued under a given collection will be applied in
+ * sequence. Transactions queued under different collections may run
+ * in parallel.
+ *
+ * ObjectStore users may get collection handles with open_collection() (or,
+ * for bootstrapping a new collection, create_new_collection()).
+ */
+struct Collection final : public FuturizedCollection {
+ using ObjectRef = boost::intrusive_ptr<Object>;
+ int bits = 0;
+ // always use bufferlist object for testing
+ bool use_page_set = false;
+ std::unordered_map<ghobject_t, ObjectRef> object_hash; ///< for lookup
+ std::map<ghobject_t, ObjectRef> object_map; ///< for iteration
+ std::map<std::string,bufferptr> xattr;
+ bool exists = true;
+
+ Collection(const coll_t& c);
+ ~Collection() final;
+
+ ObjectRef create_object() const;
+ ObjectRef get_object(ghobject_t oid);
+ ObjectRef get_or_create_object(ghobject_t oid);
+ uint64_t used_bytes() const;
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& p);
+};
+
+}
diff --git a/src/crimson/os/cyanstore/cyan_object.cc b/src/crimson/os/cyanstore/cyan_object.cc
new file mode 100644
index 000000000..34bc13b7f
--- /dev/null
+++ b/src/crimson/os/cyanstore/cyan_object.cc
@@ -0,0 +1,89 @@
+#include "cyan_object.h"
+#include "include/encoding.h"
+
+namespace crimson::os {
+
+ /// @return length in bytes of the object's data buffer
+ size_t Object::get_size() const {
+ return data.length();
+ }
+
+ /// Return the @p len bytes of data starting at @p offset. The range is
+ /// passed to substr_of() unchanged, so callers clamp it beforehand.
+ ceph::bufferlist Object::read(uint64_t offset, uint64_t len)
+ {
+   bufferlist slice;
+   slice.substr_of(data, offset, len);
+   return slice;
+ }
+
+ /// Overwrite the data at @p offset with @p src, zero-filling any gap
+ /// between the current end of the object and @p offset and keeping whatever
+ /// lies beyond the written range.
+ /// @return always 0
+ int Object::write(uint64_t offset, const bufferlist &src)
+ {
+   const unsigned src_len = src.length();
+   const auto old_size = get_size();
+
+   // head: existing bytes in front of the write, padded with zeros if the
+   // write starts past the current end
+   bufferlist merged;
+   if (old_size < offset) {
+     if (old_size) {
+       merged.substr_of(data, 0, old_size);
+     }
+     merged.append_zero(offset - old_size);
+   } else {
+     merged.substr_of(data, 0, offset);
+   }
+
+   merged.append(src);
+
+   // tail: existing bytes behind the written range
+   const uint64_t write_end = offset + src_len;
+   if (old_size > write_end) {
+     bufferlist rest;
+     rest.substr_of(data, write_end, old_size - write_end);
+     merged.append(rest);
+   }
+
+   data = std::move(merged);
+   return 0;
+ }
+
+ /// Copy @p len bytes from @p src at @p srcoff into this object at
+ /// @p dstoff. When the offsets match and @p len equals the source size the
+ /// whole data buffer is taken over instead.
+ /// @return 0, or the result of write()
+ int Object::clone(Object *src, uint64_t srcoff, uint64_t len,
+                   uint64_t dstoff)
+ {
+   const bool whole_object = srcoff == dstoff && len == src->get_size();
+   if (whole_object) {
+     data = src->data;
+     return 0;
+   }
+   bufferlist range;
+   range.substr_of(src->data, srcoff, len);
+   return write(dstoff, range);
+ }
+
+ /// Resize the data to exactly @p size bytes: shrink by dropping the tail,
+ /// grow by appending zeros, do nothing when the size already matches.
+ /// @return always 0
+ int Object::truncate(uint64_t size)
+ {
+   const auto current = get_size();
+   if (current < size) {
+     data.append_zero(size - current);
+   } else if (current > size) {
+     bufferlist kept;
+     kept.substr_of(data, 0, size);
+     data = std::move(kept);
+   }
+   return 0;
+ }
+
+ /// Serialize the object as data, xattr, omap_header, omap (in that order).
+ void Object::encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(data, bl);
+ encode(xattr, bl);
+ encode(omap_header, bl);
+ encode(omap, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ /// Inverse of encode(): fields are read in the order they were written.
+ void Object::decode(bufferlist::const_iterator& p) {
+ DECODE_START(1, p);
+ decode(data, p);
+ decode(xattr, p);
+ decode(omap_header, p);
+ decode(omap, p);
+ DECODE_FINISH(p);
+ }
+
+}
diff --git a/src/crimson/os/cyanstore/cyan_object.h b/src/crimson/os/cyanstore/cyan_object.h
new file mode 100644
index 000000000..f19b87212
--- /dev/null
+++ b/src/crimson/os/cyanstore/cyan_object.h
@@ -0,0 +1,45 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+#include <cstddef>
+#include <map>
+#include <string>
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include "include/buffer.h"
+
+namespace crimson::os {
+
+ /// An in-memory object: a data buffer plus xattrs and an omap.
+ /// Reference-counted with a non-atomic (thread_unsafe) counter.
+ struct Object : public boost::intrusive_ref_counter<
+ Object,
+ boost::thread_unsafe_counter>
+ {
+ using bufferlist = ceph::bufferlist;
+
+ // object payload; get_size() is its length
+ bufferlist data;
+ // use transparent comparator for better performance, see
+ // https://en.cppreference.com/w/cpp/utility/functional/less_void
+ std::map<std::string,bufferptr,std::less<>> xattr;
+ bufferlist omap_header;
+ std::map<std::string,bufferlist> omap;
+
+ typedef boost::intrusive_ptr<Object> Ref;
+
+ Object() = default;
+
+ // interface for object data
+ size_t get_size() const;
+ /// @return the bytes [offset, offset + len) of the data
+ ceph::bufferlist read(uint64_t offset, uint64_t len);
+ /// overwrite at @p offset, zero-filling a gap past the current end
+ int write(uint64_t offset, const bufferlist &bl);
+ /// copy a byte range of @p src into this object at @p dstoff
+ int clone(Object *src, uint64_t srcoff, uint64_t len,
+ uint64_t dstoff);
+ /// shrink or zero-extend the data to @p offset bytes
+ int truncate(uint64_t offset);
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& p);
+ };
+using ObjectRef = boost::intrusive_ptr<Object>;
+
+}
diff --git a/src/crimson/os/cyanstore/cyan_store.cc b/src/crimson/os/cyanstore/cyan_store.cc
new file mode 100644
index 000000000..eb93d72ec
--- /dev/null
+++ b/src/crimson/os/cyanstore/cyan_store.cc
@@ -0,0 +1,835 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "cyan_store.h"
+
+#include <boost/algorithm/string/trim.hpp>
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+
+#include "common/safe_io.h"
+#include "os/Transaction.h"
+
+#include "crimson/common/buffer_io.h"
+#include "crimson/common/config_proxy.h"
+#include "cyan_collection.h"
+#include "cyan_object.h"
+
+ namespace {
+ // file-local accessor for the logger of the filestore subsystem
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+ }
+
+using crimson::common::local_conf;
+
+namespace crimson::os {
+
+using ObjectRef = boost::intrusive_ptr<Object>;
+
+ /// Remember @p path; nothing is read from it until mount()/mkfs().
+ CyanStore::CyanStore(const std::string& path)
+ : path{path}
+ {}
+
+ CyanStore::~CyanStore() = default;
+
+ /// Load the store from disk: read the set of collection ids from
+ /// "<path>/collections", then decode every collection from "<path>/<cid>"
+ /// into coll_map and add its size to used_bytes.
+ ///
+ /// @throws std::runtime_error (synchronously) when a file cannot be read;
+ ///         the message names the file and carries the error reported by
+ ///         read_file() so that the failure can be diagnosed
+ seastar::future<> CyanStore::mount()
+ {
+   std::string err;
+   const std::string index_fn = path + "/collections";
+   ceph::bufferlist index_bl;
+   if (int r = index_bl.read_file(index_fn.c_str(), &err); r < 0) {
+     throw std::runtime_error(
+       fmt::format("read_file {} failed: {} ({})", index_fn, err, r));
+   }
+
+   std::set<coll_t> collections;
+   auto index_it = index_bl.cbegin();
+   ceph::decode(collections, index_it);
+
+   for (auto& coll : collections) {
+     const std::string coll_fn = fmt::format("{}/{}", path, coll);
+     ceph::bufferlist cbl;
+     if (int r = cbl.read_file(coll_fn.c_str(), &err); r < 0) {
+       throw std::runtime_error(
+         fmt::format("read_file {} failed: {} ({})", coll_fn, err, r));
+     }
+     boost::intrusive_ptr<Collection> c{new Collection{coll}};
+     auto coll_it = cbl.cbegin();
+     c->decode(coll_it);
+     coll_map[coll] = c;
+     used_bytes += c->used_bytes();
+   }
+   return seastar::now();
+ }
+
+ /// Persist the store: write every collection in coll_map to
+ /// "<path>/<cid>", then write the set of collection ids to
+ /// "<path>/collections".
+ seastar::future<> CyanStore::umount()
+ {
+ // do_with keeps the id set alive across the asynchronous writes
+ return seastar::do_with(std::set<coll_t>{}, [this](auto& collections) {
+ return seastar::do_for_each(coll_map, [&collections, this](auto& coll) {
+ auto& [col, ch] = coll;
+ collections.insert(col);
+ ceph::bufferlist bl;
+ ceph_assert(ch);
+ ch->encode(bl);
+ std::string fn = fmt::format("{}/{}", path, col);
+ return crimson::write_file(std::move(bl), fn);
+ }).then([&collections, this] {
+ // the index is written last, after all collection files
+ ceph::bufferlist bl;
+ ceph::encode(collections, bl);
+ std::string fn = fmt::format("{}/collections", path);
+ return crimson::write_file(std::move(bl), fn);
+ });
+ });
+ }
+
+ /// Initialise the store directory.
+ ///
+ /// If no "fsid" meta entry exists, adopt @p new_osd_fsid (or a random fsid
+ /// when it is zero) and persist it; otherwise parse the stored fsid and
+ /// require it to equal @p new_osd_fsid. Afterwards an empty collection
+ /// index is written and the "type" meta entry is set to "memstore".
+ ///
+ /// NOTE(review): with an existing fsid, a zero @p new_osd_fsid is compared
+ /// like any other value and therefore reported as a mismatch — confirm this
+ /// is intended.
+ seastar::future<> CyanStore::mkfs(uuid_d new_osd_fsid)
+ {
+ return read_meta("fsid").then([=](auto&& ret) {
+ auto& [r, fsid_str] = ret;
+ if (r == -ENOENT) {
+ if (new_osd_fsid.is_zero()) {
+ osd_fsid.generate_random();
+ } else {
+ osd_fsid = new_osd_fsid;
+ }
+ return write_meta("fsid", fmt::format("{}", osd_fsid));
+ } else if (r < 0) {
+ throw std::runtime_error("read_meta");
+ } else {
+ logger().info("{} already has fsid {}", __func__, fsid_str);
+ if (!osd_fsid.parse(fsid_str.c_str())) {
+ throw std::runtime_error("failed to parse fsid");
+ } else if (osd_fsid != new_osd_fsid) {
+ logger().error("on-disk fsid {} != provided {}", osd_fsid, new_osd_fsid);
+ throw std::runtime_error("unmatched osd_fsid");
+ } else {
+ return seastar::now();
+ }
+ }
+ }).then([this]{
+ // (re)write the collection index as an empty set
+ std::string fn = path + "/collections";
+ ceph::bufferlist bl;
+ std::set<coll_t> collections;
+ ceph::encode(collections, bl);
+ return crimson::write_file(std::move(bl), fn);
+ }).then([this] {
+ return write_meta("type", "memstore");
+ });
+ }
+
+ /// Report capacity: total comes from the memstore_device_bytes option,
+ /// available is total minus the bytes currently accounted in used_bytes.
+ seastar::future<store_statfs_t> CyanStore::stat() const
+ {
+   logger().debug("{}", __func__);
+   store_statfs_t fs_stats;
+   fs_stats.total =
+     crimson::common::local_conf().get_val<Option::size_t>("memstore_device_bytes");
+   fs_stats.available = fs_stats.total - used_bytes;
+   return seastar::make_ready_future<store_statfs_t>(std::move(fs_stats));
+ }
+
+ /// List up to @p limit object ids of collection @p ch in the range
+ /// [@p start, @p end), in object_map order.
+ ///
+ /// @return <objects, next>: next is the first id that was not returned
+ ///         (because it reached @p end or the limit), or
+ ///         ghobject_t::get_max() when the collection was exhausted
+ seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>>
+ CyanStore::list_objects(CollectionRef ch,
+                         const ghobject_t& start,
+                         const ghobject_t& end,
+                         uint64_t limit) const
+ {
+   auto c = static_cast<Collection*>(ch.get());
+   logger().debug("{} {} {} {} {}",
+                  __func__, c->get_cid(), start, end, limit);
+   std::vector<ghobject_t> objects;
+   // Never reserve more than can be returned: callers may pass a very large
+   // limit, and reserving that much would throw or waste memory.
+   const uint64_t available = c->object_map.size();
+   objects.reserve(limit < available ? limit : available);
+   ghobject_t next = ghobject_t::get_max();
+   for (auto it = c->object_map.lower_bound(start);
+        it != c->object_map.end();
+        ++it) {
+     const ghobject_t& oid = it->first;
+     if (oid >= end || objects.size() >= limit) {
+       next = oid;
+       break;
+     }
+     objects.push_back(oid);
+   }
+   return seastar::make_ready_future<std::tuple<std::vector<ghobject_t>, ghobject_t>>(
+     std::make_tuple(std::move(objects), next));
+ }
+
+ /// Allocate a collection for @p cid and park it in new_coll_map; it is
+ /// moved into coll_map by _create_collection().
+ seastar::future<CollectionRef> CyanStore::create_new_collection(const coll_t& cid)
+ {
+   boost::intrusive_ptr<Collection> coll{new Collection{cid}};
+   new_coll_map[cid] = coll;
+   return seastar::make_ready_future<CollectionRef>(std::move(coll));
+ }
+
+ /// Resolve @p cid through coll_map; the reference is null when no such
+ /// collection exists.
+ seastar::future<CollectionRef> CyanStore::open_collection(const coll_t& cid)
+ {
+   auto coll = _get_collection(cid);
+   return seastar::make_ready_future<CollectionRef>(std::move(coll));
+ }
+
+ /// @return the ids of all collections currently in coll_map
+ seastar::future<std::vector<coll_t>> CyanStore::list_collections()
+ {
+   std::vector<coll_t> ids;
+   for ([[maybe_unused]] auto& [cid, coll] : coll_map) {
+     ids.push_back(cid);
+   }
+   return seastar::make_ready_future<std::vector<coll_t>>(std::move(ids));
+ }
+
+ /// Read up to @p len bytes of @p oid starting at @p offset.
+ ///
+ /// The range is clamped to the object size; a read starting at or past the
+ /// end yields an empty buffer. @p len == 0 together with @p offset == 0
+ /// means "read the entire object".
+ ///
+ /// @return the data, or enoent when the collection is marked as not
+ ///         existing or the object is missing
+ CyanStore::read_errorator::future<ceph::bufferlist> CyanStore::read(
+   CollectionRef ch,
+   const ghobject_t& oid,
+   uint64_t offset,
+   size_t len,
+   uint32_t op_flags)
+ {
+   auto c = static_cast<Collection*>(ch.get());
+   logger().debug("{} {} {} {}~{}",
+                  __func__, c->get_cid(), oid, offset, len);
+   if (!c->exists) {
+     return crimson::ct_error::enoent::make();
+   }
+   ObjectRef o = c->get_object(oid);
+   if (!o) {
+     return crimson::ct_error::enoent::make();
+   }
+   const uint64_t size = o->get_size();
+   if (offset >= size)
+     return read_errorator::make_ready_future<ceph::bufferlist>();
+   // offset < size holds here, so the subtraction cannot underflow; comparing
+   // against the remainder avoids the wrap-around of "offset + len"
+   const uint64_t remaining = size - offset;
+   size_t l = len;
+   if (l == 0 && offset == 0) // note: len == 0 means read the entire object
+     l = size;
+   else if (l > remaining)
+     l = remaining;
+   return read_errorator::make_ready_future<ceph::bufferlist>(o->read(offset, l));
+ }
+
+ /// Read every extent listed in @p m via read() and concatenate the results
+ /// in iteration order.
+ ///
+ /// @p m is captured by reference, so it has to stay valid until the
+ /// returned future resolves.
+ CyanStore::read_errorator::future<ceph::bufferlist> CyanStore::readv(
+ CollectionRef ch,
+ const ghobject_t& oid,
+ interval_set<uint64_t>& m,
+ uint32_t op_flags)
+ {
+ // do_with owns the accumulated buffer for the duration of the chain
+ return seastar::do_with(ceph::bufferlist{},
+ [this, ch, oid, &m, op_flags](auto& bl) {
+ return crimson::do_for_each(m,
+ [this, ch, oid, op_flags, &bl](auto& p) {
+ // p.first / p.second are passed as offset / length of the extent
+ return read(ch, oid, p.first, p.second, op_flags)
+ .safe_then([&bl](auto ret) {
+ bl.claim_append(ret);
+ });
+ }).safe_then([&bl] {
+ return read_errorator::make_ready_future<ceph::bufferlist>(std::move(bl));
+ });
+ });
+ }
+
+
+ /// Fetch the xattr @p name of @p oid.
+ /// @return the value; enoent if the object is missing, enodata if the
+ ///         attribute is not set
+ CyanStore::get_attr_errorator::future<ceph::bufferptr> CyanStore::get_attr(
+   CollectionRef ch,
+   const ghobject_t& oid,
+   std::string_view name) const
+ {
+   auto coll = static_cast<Collection*>(ch.get());
+   logger().debug("{} {} {}",
+                  __func__, coll->get_cid(), oid);
+   auto obj = coll->get_object(oid);
+   if (!obj) {
+     return crimson::ct_error::enoent::make();
+   }
+   auto attr = obj->xattr.find(name);
+   if (attr == obj->xattr.end()) {
+     return crimson::ct_error::enodata::make();
+   }
+   return get_attr_errorator::make_ready_future<ceph::bufferptr>(attr->second);
+ }
+
+ /// Fetch all xattrs of @p oid.
+ /// @return a copy of the attribute map, or enoent if the object is missing
+ CyanStore::get_attrs_ertr::future<CyanStore::attrs_t> CyanStore::get_attrs(
+   CollectionRef ch,
+   const ghobject_t& oid)
+ {
+   auto coll = static_cast<Collection*>(ch.get());
+   logger().debug("{} {} {}",
+                  __func__, coll->get_cid(), oid);
+   if (auto obj = coll->get_object(oid); obj) {
+     return get_attrs_ertr::make_ready_future<attrs_t>(obj->xattr);
+   }
+   return crimson::ct_error::enoent::make();
+ }
+
+ /// Look up the omap entries of @p oid for the given @p keys; keys that are
+ /// not present are skipped.
+ /// @return the found entries, or enoent if the object is missing
+ auto CyanStore::omap_get_values(CollectionRef ch,
+                                 const ghobject_t& oid,
+                                 const omap_keys_t& keys)
+   -> read_errorator::future<omap_values_t>
+ {
+   auto coll = static_cast<Collection*>(ch.get());
+   logger().debug("{} {} {}", __func__, coll->get_cid(), oid);
+   auto obj = coll->get_object(oid);
+   if (!obj) {
+     return crimson::ct_error::enoent::make();
+   }
+   omap_values_t found_values;
+   for (auto& key : keys) {
+     auto entry = obj->omap.find(key);
+     if (entry == obj->omap.end()) {
+       continue;
+     }
+     found_values.insert(*entry);
+   }
+   return seastar::make_ready_future<omap_values_t>(std::move(found_values));
+ }
+
+ /// Return omap entries of @p oid whose keys sort strictly after @p start
+ /// (or from the beginning when @p start is empty), at most
+ /// MAX_KEYS_PER_OMAP_GET_CALL of them.
+ ///
+ /// NOTE(review): the first tuple element is always true, even when the
+ /// loop stopped at the MAX_KEYS_PER_OMAP_GET_CALL cap with entries left —
+ /// confirm how callers interpret it before relying on it as "done".
+ auto
+ CyanStore::omap_get_values(CollectionRef ch,
+ const ghobject_t &oid,
+ const std::optional<string> &start)
+ -> read_errorator::future<std::tuple<bool, omap_values_t>>
+ {
+ auto c = static_cast<Collection*>(ch.get());
+ logger().debug("{} {} {}", __func__, c->get_cid(), oid);
+ auto o = c->get_object(oid);
+ if (!o) {
+ return crimson::ct_error::enoent::make();
+ }
+ omap_values_t values;
+ // upper_bound: the start key itself is excluded
+ for (auto i = start ? o->omap.upper_bound(*start) : o->omap.begin();
+ values.size() < MAX_KEYS_PER_OMAP_GET_CALL && i != o->omap.end();
+ ++i) {
+ values.insert(*i);
+ }
+ return seastar::make_ready_future<std::tuple<bool, omap_values_t>>(
+ std::make_tuple(true, std::move(values)));
+ }
+
+ /// @return a copy of the omap header of @p oid, or enoent if the object is
+ ///         missing
+ auto
+ CyanStore::omap_get_header(CollectionRef ch,
+                            const ghobject_t& oid)
+   -> read_errorator::future<ceph::bufferlist>
+ {
+   auto coll = static_cast<Collection*>(ch.get());
+   if (auto obj = coll->get_object(oid); obj) {
+     return read_errorator::make_ready_future<ceph::bufferlist>(
+       obj->omap_header);
+   }
+   return crimson::ct_error::enoent::make();
+ }
+
+ /// Apply every op of transaction @p t in order by dispatching to the
+ /// matching private helper (_remove, _write, ...).
+ ///
+ /// The first op that returns a negative result, or any std::exception
+ /// (mapped to -EINVAL), stops processing; the transaction is then dumped to
+ /// the log and ceph_assert(r == 0) fires. On success the on_applied,
+ /// on_commit and on_applied_sync contexts are completed with 0.
+ seastar::future<> CyanStore::do_transaction(CollectionRef ch,
+ ceph::os::Transaction&& t)
+ {
+ using ceph::os::Transaction;
+ int r = 0;
+ try {
+ auto i = t.begin();
+ while (i.have_op()) {
+ r = 0;
+ switch (auto op = i.decode_op(); op->op) {
+ case Transaction::OP_NOP:
+ break;
+ case Transaction::OP_REMOVE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ r = _remove(cid, oid);
+ // removing something that is already gone is not an error
+ if (r == -ENOENT) {
+ r = 0;
+ }
+ }
+ break;
+ case Transaction::OP_TOUCH:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ r = _touch(cid, oid);
+ }
+ break;
+ case Transaction::OP_WRITE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ uint32_t fadvise_flags = i.get_fadvise_flags();
+ ceph::bufferlist bl;
+ i.decode_bl(bl);
+ r = _write(cid, oid, off, len, bl, fadvise_flags);
+ }
+ break;
+ case Transaction::OP_ZERO:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ r = _zero(cid, oid, off, len);
+ }
+ break;
+ case Transaction::OP_TRUNCATE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ uint64_t off = op->off;
+ r = _truncate(cid, oid, off);
+ }
+ break;
+ case Transaction::OP_SETATTR:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ std::string name = i.decode_string();
+ ceph::bufferlist bl;
+ i.decode_bl(bl);
+ // a single attribute is routed through the multi-attribute helper
+ std::map<std::string, bufferptr> to_set;
+ to_set[name] = bufferptr(bl.c_str(), bl.length());
+ r = _setattrs(cid, oid, to_set);
+ }
+ break;
+ case Transaction::OP_RMATTR:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ std::string name = i.decode_string();
+ r = _rm_attr(cid, oid, name);
+ }
+ break;
+ case Transaction::OP_MKCOLL:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ r = _create_collection(cid, op->split_bits);
+ }
+ break;
+ case Transaction::OP_OMAP_CLEAR:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ r = _omap_clear(cid, oid);
+ }
+ break;
+ case Transaction::OP_OMAP_SETKEYS:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ std::map<std::string, ceph::bufferlist> aset;
+ i.decode_attrset(aset);
+ r = _omap_set_values(cid, oid, std::move(aset));
+ }
+ break;
+ case Transaction::OP_OMAP_SETHEADER:
+ {
+ const coll_t &cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ ceph::bufferlist bl;
+ i.decode_bl(bl);
+ r = _omap_set_header(cid, oid, bl);
+ }
+ break;
+ case Transaction::OP_OMAP_RMKEYS:
+ {
+ const coll_t &cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ omap_keys_t keys;
+ i.decode_keyset(keys);
+ r = _omap_rmkeys(cid, oid, keys);
+ }
+ break;
+ case Transaction::OP_OMAP_RMKEYRANGE:
+ {
+ const coll_t &cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ string first, last;
+ first = i.decode_string();
+ last = i.decode_string();
+ r = _omap_rmkeyrange(cid, oid, first, last);
+ }
+ break;
+ case Transaction::OP_COLL_HINT:
+ {
+ // the payload still has to be decoded to advance the iterator
+ ceph::bufferlist hint;
+ i.decode_bl(hint);
+ // ignored
+ break;
+ }
+ default:
+ logger().error("bad op {}", static_cast<unsigned>(op->op));
+ abort();
+ }
+ // stop at the first failing op
+ if (r < 0) {
+ break;
+ }
+ }
+ } catch (std::exception &e) {
+ logger().error("{} got exception {}", __func__, e);
+ r = -EINVAL;
+ }
+ if (r < 0) {
+ // log the whole transaction as JSON before asserting
+ logger().error(" transaction dump:\n");
+ JSONFormatter f(true);
+ f.open_object_section("transaction");
+ t.dump(&f);
+ f.close_section();
+ std::stringstream str;
+ f.flush(str);
+ logger().error("{}", str.str());
+ ceph_assert(r == 0);
+ }
+ // complete whichever completion contexts the transaction carries
+ for (auto i : {
+ t.get_on_applied(),
+ t.get_on_commit(),
+ t.get_on_applied_sync()}) {
+ if (i) {
+ i->complete(0);
+ }
+ }
+ return seastar::now();
+ }
+
+ /// Drop @p oid from collection @p cid and subtract its size from
+ /// used_bytes.
+ /// @return 0, or -ENOENT if the collection or the object does not exist
+ int CyanStore::_remove(const coll_t& cid, const ghobject_t& oid)
+ {
+   logger().debug("{} cid={} oid={}",
+                  __func__, cid, oid);
+   auto coll = _get_collection(cid);
+   if (!coll) {
+     return -ENOENT;
+   }
+
+   auto victim = coll->object_hash.find(oid);
+   if (victim == coll->object_hash.end()) {
+     return -ENOENT;
+   }
+   used_bytes -= victim->second->get_size();
+   // keep both indexes in sync
+   coll->object_hash.erase(victim);
+   coll->object_map.erase(oid);
+   return 0;
+ }
+
+ /// Make sure @p oid exists in collection @p cid.
+ /// @return 0, or -ENOENT if the collection does not exist
+ int CyanStore::_touch(const coll_t& cid, const ghobject_t& oid)
+ {
+   logger().debug("{} cid={} oid={}",
+                  __func__, cid, oid);
+   if (auto coll = _get_collection(cid); coll) {
+     coll->get_or_create_object(oid);
+     return 0;
+   }
+   return -ENOENT;
+ }
+
+ /// Write @p bl (of length @p len) into @p oid at @p offset, creating the
+ /// object if needed, and account the size change in used_bytes. The data
+ /// write is skipped for empty writes and when
+ /// memstore_debug_omit_block_device_write is set.
+ /// @return 0, or -ENOENT if the collection does not exist
+ int CyanStore::_write(const coll_t& cid, const ghobject_t& oid,
+                       uint64_t offset, size_t len, const ceph::bufferlist& bl,
+                       uint32_t fadvise_flags)
+ {
+   logger().debug("{} {} {} {} ~ {}",
+                  __func__, cid, oid, offset, len);
+   assert(len == bl.length());
+
+   auto coll = _get_collection(cid);
+   if (!coll) {
+     return -ENOENT;
+   }
+
+   ObjectRef obj = coll->get_or_create_object(oid);
+   const bool skip_data =
+     len == 0 || local_conf()->memstore_debug_omit_block_device_write;
+   if (!skip_data) {
+     const ssize_t before = obj->get_size();
+     obj->write(offset, bl);
+     used_bytes += (obj->get_size() - before);
+   }
+
+   return 0;
+ }
+
+ /// Zero @p len bytes of @p oid at @p offset by writing a zero-filled
+ /// buffer through _write().
+ int CyanStore::_zero(const coll_t& cid, const ghobject_t& oid,
+                      uint64_t offset, size_t len)
+ {
+   logger().debug("{} {} {} {} ~ {}",
+                  __func__, cid, oid, offset, len);
+
+   ceph::buffer::list zeros;
+   zeros.append_zero(len);
+   return _write(cid, oid, offset, len, zeros, 0);
+ }
+
+ /// Remove all omap entries and the omap header of @p oid.
+ /// @return 0, or -ENOENT if the collection or the object does not exist
+ int CyanStore::_omap_clear(
+   const coll_t& cid,
+   const ghobject_t& oid)
+ {
+   logger().debug("{} {} {}", __func__, cid, oid);
+
+   auto coll = _get_collection(cid);
+   if (!coll) {
+     return -ENOENT;
+   }
+   if (ObjectRef obj = coll->get_object(oid); obj) {
+     obj->omap.clear();
+     obj->omap_header.clear();
+     return 0;
+   }
+   return -ENOENT;
+ }
+
+ /// Set the omap entries in @p aset on @p oid, creating the object if
+ /// needed. Keys that already exist are overwritten with the new value.
+ /// @return 0, or -ENOENT if the collection does not exist
+ int CyanStore::_omap_set_values(
+   const coll_t& cid,
+   const ghobject_t& oid,
+   std::map<std::string, ceph::bufferlist> &&aset)
+ {
+   logger().debug(
+     "{} {} {} {} keys",
+     __func__, cid, oid, aset.size());
+
+   auto c = _get_collection(cid);
+   if (!c)
+     return -ENOENT;
+
+   ObjectRef o = c->get_or_create_object(oid);
+   for (auto& [key, value] : aset) {
+     // insert() would leave the value of an existing key untouched, i.e.
+     // silently ignore updates; "set" has to replace it
+     o->omap.insert_or_assign(key, std::move(value));
+   }
+   return 0;
+ }
+
+ /// Replace the omap header of @p oid with @p header, creating the object
+ /// if needed.
+ /// @return 0, or -ENOENT if the collection does not exist
+ int CyanStore::_omap_set_header(
+   const coll_t& cid,
+   const ghobject_t& oid,
+   const ceph::bufferlist &header)
+ {
+   logger().debug(
+     "{} {} {} {} bytes",
+     __func__, cid, oid, header.length());
+
+   auto coll = _get_collection(cid);
+   if (!coll) {
+     return -ENOENT;
+   }
+   ObjectRef obj = coll->get_or_create_object(oid);
+   obj->omap_header = header;
+   return 0;
+ }
+
+ /// Erase the omap entries of @p oid named in @p aset; the object is created
+ /// if it does not exist.
+ /// @return 0, or -ENOENT if the collection does not exist
+ int CyanStore::_omap_rmkeys(
+   const coll_t& cid,
+   const ghobject_t& oid,
+   const omap_keys_t& aset)
+ {
+   logger().debug(
+     "{} {} {} {} keys",
+     __func__, cid, oid, aset.size());
+
+   auto coll = _get_collection(cid);
+   if (!coll) {
+     return -ENOENT;
+   }
+   ObjectRef obj = coll->get_or_create_object(oid);
+   for (auto& key : aset) {
+     obj->omap.erase(key);
+   }
+   return 0;
+ }
+
+ /// Erase the omap entries of @p oid whose keys lie in the half-open range
+ /// [@p first, @p last); the object is created if it does not exist.
+ /// @return 0, or -ENOENT if the collection does not exist
+ int CyanStore::_omap_rmkeyrange(
+   const coll_t& cid,
+   const ghobject_t& oid,
+   const std::string &first,
+   const std::string &last)
+ {
+   logger().debug(
+     "{} {} {} first={} last={}",
+     __func__, cid, oid, first, last);
+
+   auto c = _get_collection(cid);
+   if (!c)
+     return -ENOENT;
+
+   ObjectRef o = c->get_or_create_object(oid);
+   // @p last is the first key past the range and must survive; an empty or
+   // inverted range removes nothing
+   if (first < last) {
+     o->omap.erase(o->omap.lower_bound(first), o->omap.lower_bound(last));
+   }
+   return 0;
+ }
+
+ /// Resize @p oid to @p size bytes and account the size change in
+ /// used_bytes; a no-op when memstore_debug_omit_block_device_write is set.
+ /// @return the result of Object::truncate(), or -ENOENT if the collection
+ ///         or the object does not exist
+ int CyanStore::_truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size)
+ {
+   logger().debug("{} cid={} oid={} size={}",
+                  __func__, cid, oid, size);
+   auto coll = _get_collection(cid);
+   if (!coll) {
+     return -ENOENT;
+   }
+   ObjectRef obj = coll->get_object(oid);
+   if (!obj) {
+     return -ENOENT;
+   }
+   if (local_conf()->memstore_debug_omit_block_device_write) {
+     return 0;
+   }
+   const ssize_t before = obj->get_size();
+   const int rc = obj->truncate(size);
+   used_bytes += (obj->get_size() - before);
+   return rc;
+ }
+
+ /// Set (or overwrite) the xattrs in @p aset on an existing object.
+ /// @return 0, or -ENOENT if the collection or the object does not exist
+ int CyanStore::_setattrs(const coll_t& cid, const ghobject_t& oid,
+                          std::map<std::string,bufferptr>& aset)
+ {
+   logger().debug("{} cid={} oid={}",
+                  __func__, cid, oid);
+   auto coll = _get_collection(cid);
+   if (!coll) {
+     return -ENOENT;
+   }
+   ObjectRef obj = coll->get_object(oid);
+   if (!obj) {
+     return -ENOENT;
+   }
+   for (const auto& [name, value] : aset) {
+     obj->xattr[name] = value;
+   }
+   return 0;
+ }
+
+ /// Remove the xattr @p name from @p oid.
+ /// @return 0; -ENOENT if the collection or the object does not exist;
+ ///         -ENODATA if the attribute is not set
+ int CyanStore::_rm_attr(const coll_t& cid, const ghobject_t& oid,
+                         std::string_view name)
+ {
+   logger().debug("{} cid={} oid={} name={}", __func__, cid, oid, name);
+   auto coll = _get_collection(cid);
+   if (!coll) {
+     return -ENOENT;
+   }
+   ObjectRef obj = coll->get_object(oid);
+   if (!obj) {
+     return -ENOENT;
+   }
+   if (auto attr = obj->xattr.find(name); attr != obj->xattr.end()) {
+     obj->xattr.erase(attr);
+     return 0;
+   }
+   return -ENODATA;
+ }
+
+ /// Promote the collection previously staged by create_new_collection()
+ /// from new_coll_map into coll_map and record its @p bits.
+ /// @return 0, or -EEXIST if @p cid is already present in coll_map
+ int CyanStore::_create_collection(const coll_t& cid, int bits)
+ {
+   auto result = coll_map.try_emplace(cid);
+   if (!result.second)
+     return -EEXIST;
+   auto p = new_coll_map.find(cid);
+   // ceph_assert (as used elsewhere in this file) rather than assert: with a
+   // plain assert compiled out the end() iterator below would be dereferenced
+   ceph_assert(p != new_coll_map.end());
+   result.first->second = p->second;
+   result.first->second->bits = bits;
+   new_coll_map.erase(p);
+   return 0;
+ }
+
+ /// @return the collection registered under @p cid in coll_map, or a null
+ ///         pointer when there is none
+ boost::intrusive_ptr<Collection> CyanStore::_get_collection(const coll_t& cid)
+ {
+   if (auto cp = coll_map.find(cid); cp != coll_map.end()) {
+     return cp->second;
+   }
+   return {};
+ }
+
+ /// Store @p value, followed by a newline, in the file @p key under the
+ /// store path.
+ /// @throws std::runtime_error (synchronously) if the file cannot be written
+ seastar::future<> CyanStore::write_meta(const std::string& key,
+                                         const std::string& value)
+ {
+   const std::string content = value + "\n";
+   const int r = safe_write_file(path.c_str(), key.c_str(),
+                                 content.c_str(), content.length(), 0600);
+   if (r < 0) {
+     throw std::runtime_error{fmt::format("unable to write_meta({})", key)};
+   }
+   return seastar::make_ready_future<>();
+ }
+
+ /// Read the file @p key under the store path (at most 4096 bytes).
+ /// @return <r, value>: r is the result of safe_read_file(); value is the
+ ///         content with trailing whitespace removed, or empty when r <= 0
+ seastar::future<std::tuple<int, std::string>>
+ CyanStore::read_meta(const std::string& key)
+ {
+   std::string buf(4096, '\0');
+   const int r = safe_read_file(path.c_str(), key.c_str(), buf.data(), buf.size());
+   if (r <= 0) {
+     buf.clear();
+   } else {
+     buf.resize(r);
+     // drop trailing newlines
+     boost::algorithm::trim_right_if(buf,
+       [](unsigned char c) {return isspace(c);});
+   }
+   return seastar::make_ready_future<std::tuple<int, std::string>>(
+     std::make_tuple(r, buf));
+ }
+
+ /// @return the fsid held in osd_fsid (assigned in mkfs())
+ uuid_d CyanStore::get_fsid() const
+ {
+ return osd_fsid;
+ }
+
+ /// @return the maximum supported xattr name length (fixed at 256)
+ unsigned CyanStore::get_max_attr_name_length() const
+ {
+ // arbitrary limitation exactly like in the case of MemStore.
+ return 256;
+ }
+
+ /// Create an iterator over the omap of @p oid, positioned at its first key.
+ /// @throws std::runtime_error (synchronously) if the object does not exist
+ seastar::future<FuturizedStore::OmapIteratorRef> CyanStore::get_omap_iterator(
+   CollectionRef ch,
+   const ghobject_t& oid)
+ {
+   auto coll = static_cast<Collection*>(ch.get());
+   auto obj = coll->get_object(oid);
+   if (obj) {
+     return seastar::make_ready_future<FuturizedStore::OmapIteratorRef>(
+       new CyanStore::CyanOmapIterator(obj));
+   }
+   throw std::runtime_error(fmt::format("object does not exist: {}", oid));
+ }
+
+ /// Report the extent map of @p oid as a single extent {0 -> object size}.
+ ///
+ /// NOTE(review): @p off and @p len are not used — the whole object is
+ /// always reported; confirm callers do not expect a clipped range.
+ /// @throws std::runtime_error (synchronously) if the object does not exist
+ seastar::future<std::map<uint64_t, uint64_t>>
+ CyanStore::fiemap(
+ CollectionRef ch,
+ const ghobject_t& oid,
+ uint64_t off,
+ uint64_t len)
+ {
+ auto c = static_cast<Collection*>(ch.get());
+
+ ObjectRef o = c->get_object(oid);
+ if (!o) {
+ throw std::runtime_error(fmt::format("object does not exist: {}", oid));
+ }
+ std::map<uint64_t, uint64_t> m{{0, o->get_size()}};
+ return seastar::make_ready_future<std::map<uint64_t, uint64_t>>(std::move(m));
+ }
+
+ /// Return a stat record for @p oid. Only st_size is meaningful; every
+ /// other field is zero.
+ /// @throws std::runtime_error (synchronously) if the object does not exist
+ seastar::future<struct stat>
+ CyanStore::stat(
+   CollectionRef ch,
+   const ghobject_t& oid)
+ {
+   auto c = static_cast<Collection*>(ch.get());
+   auto o = c->get_object(oid);
+   if (!o) {
+     throw std::runtime_error(fmt::format("object does not exist: {}", oid));
+   }
+   // value-initialize: otherwise every field except st_size would be handed
+   // to the caller uninitialized
+   struct stat st = {};
+   st.st_size = o->get_size();
+   return seastar::make_ready_future<struct stat>(std::move(st));
+ }
+
+ /// Position the iterator at the first omap entry.
+ seastar::future<> CyanStore::CyanOmapIterator::seek_to_first()
+ {
+   iter = obj->omap.begin();
+   return seastar::now();
+ }
+
+ /// Position the iterator at the first entry whose key is greater than
+ /// @p after.
+ seastar::future<> CyanStore::CyanOmapIterator::upper_bound(const std::string& after)
+ {
+   iter = obj->omap.upper_bound(after);
+   return seastar::now();
+ }
+
+ /// Position the iterator at the first entry whose key is not less than
+ /// @p to.
+ seastar::future<> CyanStore::CyanOmapIterator::lower_bound(const std::string &to)
+ {
+   iter = obj->omap.lower_bound(to);
+   return seastar::now();
+ }
+
+ /// @return true while the iterator points at an entry, false at the end
+ bool CyanStore::CyanOmapIterator::valid() const
+ {
+   const bool at_end = (iter == obj->omap.end());
+   return !at_end;
+ }
+
+ /// Advance to the following entry; the position is not checked first.
+ seastar::future<> CyanStore::CyanOmapIterator::next()
+ {
+   ++iter;
+   return seastar::now();
+ }
+
+}
diff --git a/src/crimson/os/cyanstore/cyan_store.h b/src/crimson/os/cyanstore/cyan_store.h
new file mode 100644
index 000000000..07a8ff29e
--- /dev/null
+++ b/src/crimson/os/cyanstore/cyan_store.h
@@ -0,0 +1,185 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include <map>
+#include <typeinfo>
+#include <vector>
+
+#include <optional>
+#include <seastar/core/future.hh>
+#include <seastar/core/future-util.hh>
+
+#include "osd/osd_types.h"
+#include "include/uuid.h"
+
+#include "crimson/os/cyanstore/cyan_object.h"
+#include "crimson/os/futurized_store.h"
+
+namespace ceph::os {
+class Transaction;
+}
+
+namespace crimson::os {
+class Collection;
+
+/**
+ * CyanStore
+ *
+ * FuturizedStore implementation; FuturizedStore::create() instantiates it
+ * for the "memstore" type.  The state declared here consists of maps from
+ * coll_t to Collection plus a byte counter and the OSD fsid.
+ */
+class CyanStore final : public FuturizedStore {
+  // presumably caps the page size of the paged omap_get_values() --
+  // confirm against cyan_store.cc
+  constexpr static unsigned MAX_KEYS_PER_OMAP_GET_CALL = 32;
+
+  const std::string path;
+  std::unordered_map<coll_t, boost::intrusive_ptr<Collection>> coll_map;
+  std::map<coll_t, boost::intrusive_ptr<Collection>> new_coll_map;
+  uint64_t used_bytes = 0;
+  uuid_d osd_fsid;
+
+public:
+  /**
+   * CyanOmapIterator
+   *
+   * OmapIterator over the omap (std::map<std::string, bufferlist>) of a
+   * single object.  Holds an ObjectRef to the iterated object.
+   */
+  class CyanOmapIterator final : public OmapIterator {
+  public:
+    /// Default-constructed iterator has no object; it must not be used.
+    CyanOmapIterator() {}
+    /// Iterate over obj's omap, starting at its first entry.
+    CyanOmapIterator(ObjectRef obj) : obj(obj) {
+      iter = obj->omap.begin();
+    }
+    seastar::future<> seek_to_first() final;
+    seastar::future<> upper_bound(const std::string &after) final;
+    seastar::future<> lower_bound(const std::string &to) final;
+    bool valid() const final;
+    seastar::future<> next() final;
+    /// Key of the current entry; only meaningful while valid().
+    std::string key() final {
+      return iter->first;
+    }
+    /// Key of the last omap entry, or an empty string for an empty omap.
+    virtual seastar::future<std::string> tail_key(){
+      // The last entry sits *before* end(); incrementing end() (as this
+      // method used to do) is undefined behaviour.
+      if (obj->omap.empty()) {
+        return seastar::make_ready_future<std::string>();
+      }
+      return seastar::make_ready_future<std::string>(
+        obj->omap.rbegin()->first);
+    }
+    /// Value of the current entry; only meaningful while valid().
+    virtual ceph::buffer::list value() {
+      return iter->second;
+    }
+    /// 0 while the iterator points at an entry, -1 once it reached end().
+    virtual int status() const {
+      return iter != obj->omap.end() ? 0 : -1;
+    }
+    virtual ~CyanOmapIterator() {}
+  private:
+    std::map<std::string, bufferlist>::const_iterator iter;
+    ObjectRef obj;
+  };
+
+  CyanStore(const std::string& path);
+  ~CyanStore() final;
+
+  /// Nothing to tear down: resolves immediately.
+  seastar::future<> stop() final {
+    return seastar::now();
+  }
+  seastar::future<> mount() final;
+  seastar::future<> umount() final;
+
+  seastar::future<> mkfs(uuid_d new_osd_fsid) final;
+  /// Store-wide statistics.
+  seastar::future<store_statfs_t> stat() const final;
+  /// Per-object stat.
+  seastar::future<struct stat> stat(
+    CollectionRef c,
+    const ghobject_t& oid) final;
+
+  read_errorator::future<ceph::bufferlist> read(
+    CollectionRef c,
+    const ghobject_t& oid,
+    uint64_t offset,
+    size_t len,
+    uint32_t op_flags = 0) final;
+  read_errorator::future<ceph::bufferlist> readv(
+    CollectionRef c,
+    const ghobject_t& oid,
+    interval_set<uint64_t>& m,
+    uint32_t op_flags = 0) final;
+
+  get_attr_errorator::future<ceph::bufferptr> get_attr(
+    CollectionRef c,
+    const ghobject_t& oid,
+    std::string_view name) const final;
+  get_attrs_ertr::future<attrs_t> get_attrs(
+    CollectionRef c,
+    const ghobject_t& oid);
+
+  read_errorator::future<omap_values_t> omap_get_values(
+    CollectionRef c,
+    const ghobject_t& oid,
+    const omap_keys_t& keys) final;
+
+  /// Retrieves paged set of values > start (if present)
+  read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values(
+    CollectionRef c,           ///< [in] collection
+    const ghobject_t &oid,     ///< [in] oid
+    const std::optional<std::string> &start ///< [in] start, empty for begin
+    ) final; ///< @return <done, values> values.empty() iff done
+
+  seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects(
+    CollectionRef c,
+    const ghobject_t& start,
+    const ghobject_t& end,
+    uint64_t limit) const final;
+
+  read_errorator::future<ceph::bufferlist> omap_get_header(
+    CollectionRef c,
+    const ghobject_t& oid) final;
+
+  seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final;
+  seastar::future<CollectionRef> open_collection(const coll_t& cid) final;
+  seastar::future<std::vector<coll_t>> list_collections() final;
+
+  seastar::future<> do_transaction(CollectionRef ch,
+                                   ceph::os::Transaction&& txn) final;
+
+  seastar::future<> write_meta(const std::string& key,
+                               const std::string& value) final;
+  seastar::future<std::tuple<int, std::string>>
+  read_meta(const std::string& key) final;
+  uuid_d get_fsid() const final;
+  unsigned get_max_attr_name_length() const final;
+
+  seastar::future<OmapIteratorRef> get_omap_iterator(
+    CollectionRef c,
+    const ghobject_t& oid);
+
+  seastar::future<std::map<uint64_t, uint64_t>> fiemap(CollectionRef c,
+                                                       const ghobject_t& oid,
+                                                       uint64_t off,
+                                                       uint64_t len);
+
+private:
+  // Helpers taking a collection id and object id and returning an int
+  // status; their definitions are outside this header.
+  int _remove(const coll_t& cid, const ghobject_t& oid);
+  int _touch(const coll_t& cid, const ghobject_t& oid);
+  int _write(const coll_t& cid, const ghobject_t& oid,
+             uint64_t offset, size_t len, const ceph::bufferlist& bl,
+             uint32_t fadvise_flags);
+  int _zero(const coll_t& cid, const ghobject_t& oid,
+            uint64_t offset, size_t len);
+  int _omap_clear(
+    const coll_t& cid,
+    const ghobject_t& oid);
+  int _omap_set_values(
+    const coll_t& cid,
+    const ghobject_t& oid,
+    std::map<std::string, ceph::bufferlist> &&aset);
+  int _omap_set_header(
+    const coll_t& cid,
+    const ghobject_t& oid,
+    const ceph::bufferlist &header);
+  int _omap_rmkeys(
+    const coll_t& cid,
+    const ghobject_t& oid,
+    const omap_keys_t& aset);
+  int _omap_rmkeyrange(
+    const coll_t& cid,
+    const ghobject_t& oid,
+    const std::string &first,
+    const std::string &last);
+  int _truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size);
+  int _setattrs(const coll_t& cid, const ghobject_t& oid,
+                std::map<std::string,bufferptr>& aset);
+  int _rm_attr(const coll_t& cid, const ghobject_t& oid,
+               string_view name);
+  int _create_collection(const coll_t& cid, int bits);
+  boost::intrusive_ptr<Collection> _get_collection(const coll_t& cid);
+};
+
+}
diff --git a/src/crimson/os/futurized_collection.h b/src/crimson/os/futurized_collection.h
new file mode 100644
index 000000000..06f7d2f47
--- /dev/null
+++ b/src/crimson/os/futurized_collection.h
@@ -0,0 +1,37 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <seastar/core/future.hh>
+
+#include "osd/osd_types.h"
+
+namespace crimson::os {
+class FuturizedStore;
+
+/// Reference-counted (non-atomic counter) handle identifying a collection
+/// by its coll_t.  flush() and flush_commit() have trivially-ready default
+/// implementations that subclasses may override.
+class FuturizedCollection
+  : public boost::intrusive_ref_counter<FuturizedCollection,
+                                        boost::thread_unsafe_counter>
+{
+public:
+  FuturizedCollection(const coll_t& cid)
+    : cid(cid)
+  {}
+  virtual ~FuturizedCollection() = default;
+
+  /// Default: nothing to flush, resolves immediately.
+  virtual seastar::future<> flush() {
+    return seastar::make_ready_future<>();
+  }
+  /// Default: resolves immediately with true.
+  virtual seastar::future<bool> flush_commit() {
+    return seastar::make_ready_future<bool>(true);
+  }
+
+  /// Identifier this collection was constructed with.
+  const coll_t& get_cid() const {
+    return cid;
+  }
+
+private:
+  const coll_t cid;
+};
+
+using CollectionRef = boost::intrusive_ptr<FuturizedCollection>;
+}
diff --git a/src/crimson/os/futurized_store.cc b/src/crimson/os/futurized_store.cc
new file mode 100644
index 000000000..bb73c3478
--- /dev/null
+++ b/src/crimson/os/futurized_store.cc
@@ -0,0 +1,22 @@
+#include "futurized_store.h"
+#include "cyanstore/cyan_store.h"
+#include "alienstore/alien_store.h"
+
+namespace crimson::os {
+
+// Factory: map an objectstore type name onto a concrete FuturizedStore.
+//   "memstore"  -> CyanStore(data)
+//   "bluestore" -> AlienStore(data, values)
+// Any other type name aborts the process.
+std::unique_ptr<FuturizedStore>
+FuturizedStore::create(const std::string& type,
+                       const std::string& data,
+                       const ConfigValues& values)
+{
+  if (type == "memstore") {
+    return std::make_unique<crimson::os::CyanStore>(data);
+  }
+  if (type == "bluestore") {
+    return std::make_unique<crimson::os::AlienStore>(data, values);
+  }
+  ceph_abort_msgf("unsupported objectstore type: %s", type.c_str());
+  return {};
+}
+
+}
diff --git a/src/crimson/os/futurized_store.h b/src/crimson/os/futurized_store.h
new file mode 100644
index 000000000..bb173056b
--- /dev/null
+++ b/src/crimson/os/futurized_store.h
@@ -0,0 +1,167 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <string>
+#include <map>
+#include <optional>
+#include <vector>
+
+#include <seastar/core/future.hh>
+
+#include "crimson/osd/exceptions.h"
+#include "include/buffer_fwd.h"
+#include "include/uuid.h"
+#include "osd/osd_types.h"
+
+namespace ceph::os {
+class Transaction;
+}
+
+namespace crimson::os {
+class FuturizedCollection;
+
+// Abstract object store interface whose operations return seastar
+// futures (or errorator futures).  Concrete instances are obtained
+// through the static create() factory.
+class FuturizedStore {
+
+public:
+ // Cursor over the omap key/value pairs of one object.  Instances are
+ // intrusively reference counted through `count` and the friend
+ // intrusive_ptr_add_ref()/intrusive_ptr_release() functions.
+ class OmapIterator {
+ public:
+ virtual seastar::future<> seek_to_first() = 0;
+ virtual seastar::future<> upper_bound(const std::string &after) = 0;
+ virtual seastar::future<> lower_bound(const std::string &to) = 0;
+ // Default implementation: never valid.
+ virtual bool valid() const {
+ return false;
+ }
+ virtual seastar::future<> next() = 0;
+ // Default implementation: empty key.
+ virtual std::string key() {
+ return {};
+ }
+ // Default implementation: ready future holding an empty string.
+ virtual seastar::future<std::string> tail_key() {
+ return seastar::make_ready_future<std::string>();
+ }
+ // Default implementation: empty buffer list.
+ virtual ceph::buffer::list value() {
+ return {};
+ }
+ // Default implementation: 0.
+ virtual int status() const {
+ return 0;
+ }
+ virtual ~OmapIterator() {}
+ private:
+ // Intrusive reference count; the object is deleted when it drops to 0.
+ unsigned count = 0;
+ friend void intrusive_ptr_add_ref(FuturizedStore::OmapIterator* iter);
+ friend void intrusive_ptr_release(FuturizedStore::OmapIterator* iter);
+ };
+ using OmapIteratorRef = boost::intrusive_ptr<OmapIterator>;
+
+ // Build the store implementation selected by `type`; `data` and
+ // `values` are forwarded to the chosen implementation's constructor.
+ static std::unique_ptr<FuturizedStore> create(const std::string& type,
+ const std::string& data,
+ const ConfigValues& values);
+ FuturizedStore() = default;
+ virtual ~FuturizedStore() = default;
+
+ // no copying
+ explicit FuturizedStore(const FuturizedStore& o) = delete;
+ const FuturizedStore& operator=(const FuturizedStore& o) = delete;
+
+ // Default start() resolves immediately; stop/mount/umount must be
+ // provided by the implementation.
+ virtual seastar::future<> start() {
+ return seastar::now();
+ }
+ virtual seastar::future<> stop() = 0;
+ virtual seastar::future<> mount() = 0;
+ virtual seastar::future<> umount() = 0;
+
+ virtual seastar::future<> mkfs(uuid_d new_osd_fsid) = 0;
+ // Store-wide statistics (the per-object overload is declared below).
+ virtual seastar::future<store_statfs_t> stat() const = 0;
+
+ using CollectionRef = boost::intrusive_ptr<FuturizedCollection>;
+ // Errors a read-style operation may report: enoent or input_output_error.
+ using read_errorator = crimson::errorator<crimson::ct_error::enoent,
+ crimson::ct_error::input_output_error>;
+ virtual read_errorator::future<ceph::bufferlist> read(
+ CollectionRef c,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ uint32_t op_flags = 0) = 0;
+ virtual read_errorator::future<ceph::bufferlist> readv(
+ CollectionRef c,
+ const ghobject_t& oid,
+ interval_set<uint64_t>& m,
+ uint32_t op_flags = 0) = 0;
+
+ // Errors get_attr() may report: enoent or enodata.
+ using get_attr_errorator = crimson::errorator<
+ crimson::ct_error::enoent,
+ crimson::ct_error::enodata>;
+ virtual get_attr_errorator::future<ceph::bufferptr> get_attr(
+ CollectionRef c,
+ const ghobject_t& oid,
+ std::string_view name) const = 0;
+
+ // Errors get_attrs() may report: enoent.
+ using get_attrs_ertr = crimson::errorator<
+ crimson::ct_error::enoent>;
+ // Attribute name -> value; std::less<> enables heterogeneous lookup.
+ using attrs_t = std::map<std::string, ceph::bufferptr, std::less<>>;
+ virtual get_attrs_ertr::future<attrs_t> get_attrs(
+ CollectionRef c,
+ const ghobject_t& oid) = 0;
+ // Per-object stat.
+ virtual seastar::future<struct stat> stat(
+ CollectionRef c,
+ const ghobject_t& oid) = 0;
+
+ using omap_values_t = std::map<std::string, bufferlist, std::less<>>;
+ using omap_keys_t = std::set<std::string>;
+ // Look up the omap values for an explicit set of keys.
+ virtual read_errorator::future<omap_values_t> omap_get_values(
+ CollectionRef c,
+ const ghobject_t& oid,
+ const omap_keys_t& keys) = 0;
+ // Resolves to a tuple of (object list, ghobject_t).
+ // NOTE(review): the second element is presumably the position to resume
+ // listing from -- confirm against an implementation.
+ virtual seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects(
+ CollectionRef c,
+ const ghobject_t& start,
+ const ghobject_t& end,
+ uint64_t limit) const = 0;
+ // Paged variant of omap_get_values(), starting from an optional key.
+ virtual read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values(
+ CollectionRef c, ///< [in] collection
+ const ghobject_t &oid, ///< [in] oid
+ const std::optional<std::string> &start ///< [in] start, empty for begin
+ ) = 0; ///< @return <done, values> values.empty() iff done
+
+ virtual read_errorator::future<bufferlist> omap_get_header(
+ CollectionRef c,
+ const ghobject_t& oid) = 0;
+
+ virtual seastar::future<CollectionRef> create_new_collection(const coll_t& cid) = 0;
+ virtual seastar::future<CollectionRef> open_collection(const coll_t& cid) = 0;
+ virtual seastar::future<std::vector<coll_t>> list_collections() = 0;
+
+ // Takes the transaction by rvalue reference.
+ virtual seastar::future<> do_transaction(CollectionRef ch,
+ ceph::os::Transaction&& txn) = 0;
+ virtual seastar::future<OmapIteratorRef> get_omap_iterator(
+ CollectionRef ch,
+ const ghobject_t& oid) = 0;
+ // Resolves to a map of uint64_t -> uint64_t.
+ // NOTE(review): presumably extent offset -> length -- confirm.
+ virtual seastar::future<std::map<uint64_t, uint64_t>> fiemap(
+ CollectionRef ch,
+ const ghobject_t& oid,
+ uint64_t off,
+ uint64_t len) = 0;
+
+ virtual seastar::future<> write_meta(const std::string& key,
+ const std::string& value) = 0;
+ // Resolves to a tuple of (int, string).
+ // NOTE(review): presumably (status code, value) -- confirm.
+ virtual seastar::future<std::tuple<int, std::string>> read_meta(
+ const std::string& key) = 0;
+ virtual uuid_d get_fsid() const = 0;
+ virtual unsigned get_max_attr_name_length() const = 0;
+};
+
+/// boost::intrusive_ptr hook: take one more reference on `iter`.
+inline void intrusive_ptr_add_ref(FuturizedStore::OmapIterator* iter) {
+  assert(iter);
+  ++iter->count;
+}
+
+/// boost::intrusive_ptr hook: drop one reference on `iter` and delete the
+/// iterator when the last reference goes away.
+inline void intrusive_ptr_release(FuturizedStore::OmapIterator* iter) {
+  assert(iter);
+  assert(iter->count > 0);
+  iter->count -= 1;
+  if (iter->count != 0) {
+    return;
+  }
+  delete iter;
+}
+
+}
diff --git a/src/crimson/os/seastore/CMakeLists.txt b/src/crimson/os/seastore/CMakeLists.txt
new file mode 100644
index 000000000..77f8465cf
--- /dev/null
+++ b/src/crimson/os/seastore/CMakeLists.txt
@@ -0,0 +1,37 @@
+# Static library holding the SeaStore sources listed below.
+# Note: the last entry pulls test_block.cc in from the test tree; cache.cc
+# includes test/crimson/seastore/test_block.h and instantiates TestBlock.
+add_library(crimson-seastore STATIC
+ cached_extent.cc
+ seastore_types.cc
+ segment_manager/ephemeral.cc
+ segment_manager/block.cc
+ transaction_manager.cc
+ journal.cc
+ cache.cc
+ lba_manager.cc
+ segment_cleaner.cc
+ lba_manager/btree/btree_lba_manager.cc
+ lba_manager/btree/lba_btree_node_impl.cc
+ lba_manager/btree/btree_range_pin.cc
+ onode.cc
+ onode_manager/simple-fltree/onode_block.cc
+ onode_manager/simple-fltree/onode_delta.cc
+ onode_manager/simple-fltree/onode_node.cc
+ onode_manager/staged-fltree/node.cc
+ onode_manager/staged-fltree/node_extent_manager.cc
+ onode_manager/staged-fltree/node_extent_manager/seastore.cc
+ onode_manager/staged-fltree/node_extent_mutable.cc
+ onode_manager/staged-fltree/node_impl.cc
+ onode_manager/staged-fltree/stages/item_iterator_stage.cc
+ onode_manager/staged-fltree/stages/key_layout.cc
+ onode_manager/staged-fltree/stages/node_stage_layout.cc
+ onode_manager/staged-fltree/stages/node_stage.cc
+ onode_manager/staged-fltree/stages/sub_items_stage.cc
+ onode_manager/staged-fltree/super.cc
+ onode_manager/staged-fltree/tree.cc
+ extentmap_manager.cc
+ extentmap_manager/btree/extentmap_btree_node_impl.cc
+ extentmap_manager/btree/btree_extentmap_manager.cc
+ seastore.cc
+ ../../../test/crimson/seastore/test_block.cc
+ )
+# Link the library against the `crimson` target.
+target_link_libraries(crimson-seastore
+ crimson)
diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc
new file mode 100644
index 000000000..6a406c1b8
--- /dev/null
+++ b/src/crimson/os/seastore/cache.cc
@@ -0,0 +1,541 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/os/seastore/cache.h"
+#include "crimson/common/log.h"
+
+// included for get_extent_by_type
+#include "crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.h"
+#include "crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h"
+#include "crimson/os/seastore/onode_manager/simple-fltree/onode_block.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h"
+#include "test/crimson/seastore/test_block.h"
+
+namespace {
+  // File-local accessor for the logger of the ceph_subsys_filestore
+  // subsystem, used by every log statement in this file.
+  seastar::logger& logger() {
+    auto& subsys_logger = crimson::get_logger(ceph_subsys_filestore);
+    return subsys_logger;
+  }
+}
+
+namespace crimson::os::seastore {
+
+// Bind the cache to the segment manager it is constructed with.
+Cache::Cache(SegmentManager &segment_manager)
+  : segment_manager(segment_manager)
+{}
+
+// Every extent must have been released before the cache is destroyed:
+// log each extent still indexed, then assert that the index is empty.
+Cache::~Cache()
+{
+  for (auto &leftover: extents) {
+    logger().error("~Cache: extent {} still alive", leftover);
+  }
+  ceph_assert(extents.empty());
+}
+
+// Add the extent at `addr` to t's retired set if the transaction or the
+// cache knows about it; otherwise do nothing.
+//  - found in t.write_set: retired immediately;
+//  - found in the cache index: retired once its pending io completes;
+//  - not found: no-op.
+Cache::retire_extent_ret Cache::retire_extent_if_cached(
+  Transaction &t, paddr_t addr)
+{
+  auto pending = t.write_set.find_offset(addr);
+  if (pending != t.write_set.end()) {
+    logger().debug("{}: found {} in t.write_set", __func__, addr);
+    t.add_to_retired_set(CachedExtentRef(&*pending));
+    return retire_extent_ertr::now();
+  }
+
+  auto cached = extents.find_offset(addr);
+  if (cached == extents.end()) {
+    return retire_extent_ertr::now();
+  }
+
+  auto ret = CachedExtentRef(&*cached);
+  return ret->wait_io().then([&t, ret=std::move(ret)]() mutable {
+    t.add_to_retired_set(ret);
+    return retire_extent_ertr::now();
+  });
+}
+
+// Index a valid extent in the cache; dirty extents are additionally
+// linked into the dirty list.
+void Cache::add_extent(CachedExtentRef ref)
+{
+  assert(ref->is_valid());
+  extents.insert(*ref);
+
+  const bool is_dirty = ref->is_dirty();
+  if (!is_dirty) {
+    // a non-dirty extent must not already be linked into the dirty list
+    ceph_assert(!ref->primary_ref_list_hook.is_linked());
+  } else {
+    add_to_dirty(ref);
+  }
+  logger().debug("add_extent: {}", *ref);
+}
+
+// Transition an extent to DIRTY and link it into the dirty list.
+// Calling it on an extent that is already dirty is a no-op.
+void Cache::mark_dirty(CachedExtentRef ref)
+{
+  if (!ref->is_dirty()) {
+    add_to_dirty(ref);
+    ref->state = CachedExtent::extent_state_t::DIRTY;
+
+    logger().debug("mark_dirty: {}", *ref);
+  } else {
+    // already dirty, so it must already be on the dirty list
+    assert(ref->primary_ref_list_hook.is_linked());
+  }
+}
+
+// Append a valid, not-yet-linked extent to the dirty list.  The list
+// holds its own reference, taken here and dropped when the extent is
+// unlinked again.
+void Cache::add_to_dirty(CachedExtentRef ref)
+{
+  assert(ref->is_valid());
+  assert(!ref->primary_ref_list_hook.is_linked());
+  auto *extent = ref.get();
+  intrusive_ptr_add_ref(extent);
+  dirty.push_back(*extent);
+}
+
+// Drop a valid extent from the cache index and, if it is dirty, from the
+// dirty list as well (releasing the reference the list was holding).
+void Cache::remove_extent(CachedExtentRef ref)
+{
+  logger().debug("remove_extent: {}", *ref);
+  assert(ref->is_valid());
+  extents.erase(*ref);
+
+  if (!ref->is_dirty()) {
+    ceph_assert(!ref->primary_ref_list_hook.is_linked());
+    return;
+  }
+
+  ceph_assert(ref->primary_ref_list_hook.is_linked());
+  dirty.erase(dirty.s_iterator_to(*ref));
+  intrusive_ptr_release(&*ref);
+}
+
+// Swap `prev` for `next` in the cache index.  `next` must live at the
+// same paddr as `prev` and be exactly one version newer.
+// If `prev` was dirty, `next` takes over its position in the dirty list;
+// otherwise `next` is appended to the dirty list.
+void Cache::replace_extent(CachedExtentRef next, CachedExtentRef prev)
+{
+ assert(next->get_paddr() == prev->get_paddr());
+ assert(next->version == prev->version + 1);
+ extents.replace(*next, *prev);
+
+ if (prev->is_dirty()) {
+ ceph_assert(prev->primary_ref_list_hook.is_linked());
+ // insert next in front of prev, then unlink prev, so that next ends up
+ // at prev's former position
+ auto prev_it = dirty.iterator_to(*prev);
+ dirty.insert(prev_it, *next);
+ dirty.erase(prev_it);
+ // hand the dirty list's reference over from prev to next
+ intrusive_ptr_release(&*prev);
+ intrusive_ptr_add_ref(&*next);
+ } else {
+ add_to_dirty(next);
+ }
+}
+
+// Allocate a fresh extent of `length` bytes in transaction `t`, choosing
+// the concrete extent class from the runtime type tag.
+// ROOT and NONE are rejected by assertion.  The set of allocatable types
+// mirrors the one handled by get_extent_by_type(), including
+// ONODE_BLOCK_STAGED, which used to fall through to the "impossible"
+// default branch here.
+CachedExtentRef Cache::alloc_new_extent_by_type(
+  Transaction &t,       ///< [in, out] current transaction
+  extent_types_t type,  ///< [in] type tag
+  segment_off_t length  ///< [in] length
+)
+{
+  switch (type) {
+  case extent_types_t::ROOT:
+    assert(0 == "ROOT is never directly alloc'd");
+    return CachedExtentRef();
+  case extent_types_t::LADDR_INTERNAL:
+    return alloc_new_extent<lba_manager::btree::LBAInternalNode>(t, length);
+  case extent_types_t::LADDR_LEAF:
+    return alloc_new_extent<lba_manager::btree::LBALeafNode>(t, length);
+  case extent_types_t::ONODE_BLOCK:
+    return alloc_new_extent<OnodeBlock>(t, length);
+  case extent_types_t::ONODE_BLOCK_STAGED:
+    return alloc_new_extent<
+      crimson::os::seastore::onode::SeastoreNodeExtent>(t, length);
+  case extent_types_t::EXTMAP_INNER:
+    return alloc_new_extent<extentmap_manager::ExtMapInnerNode>(t, length);
+  case extent_types_t::EXTMAP_LEAF:
+    return alloc_new_extent<extentmap_manager::ExtMapLeafNode>(t, length);
+  case extent_types_t::TEST_BLOCK:
+    return alloc_new_extent<TestBlock>(t, length);
+  case extent_types_t::TEST_BLOCK_PHYSICAL:
+    return alloc_new_extent<TestBlockPhysical>(t, length);
+  case extent_types_t::NONE: {
+    ceph_assert(0 == "NONE is an invalid extent type");
+    return CachedExtentRef();
+  }
+  default:
+    ceph_assert(0 == "impossible");
+    return CachedExtentRef();
+  }
+}
+
+// Return a mutable copy of extent `i` for use within transaction `t`.
+// An extent that is already pending is returned unchanged.  Otherwise the
+// extent is duplicated: a root copy replaces t.root, any other copy
+// remembers `i` as its prior instance and joins t's mutated set.  The
+// copy's version is bumped and it is put in MUTATION_PENDING state.
+CachedExtentRef Cache::duplicate_for_write(
+  Transaction &t,
+  CachedExtentRef i) {
+  if (i->is_pending()) {
+    return i;
+  }
+
+  auto copy = i->duplicate_for_write();
+  const bool is_root = (copy->get_type() == extent_types_t::ROOT);
+  if (!is_root) {
+    copy->last_committed_crc = i->last_committed_crc;
+    copy->prior_instance = i;
+    t.add_mutated_extent(copy);
+  } else {
+    // root must be loaded before mutate
+    assert(t.root == i);
+    t.root = copy->cast<RootBlock>();
+  }
+
+  copy->version++;
+  copy->state = CachedExtent::extent_state_t::MUTATION_PENDING;
+  logger().debug("Cache::duplicate_for_write: {} -> {}", *i, *copy);
+  return copy;
+}
+
+// Build the journal record for transaction `t` and update the in-memory
+// cache state accordingly.
+// Returns std::nullopt (without touching the cache) if any extent in the
+// transaction's read set has become INVALID; otherwise returns a record
+// holding one delta per valid mutated extent (plus one for the root, if
+// t.root is set) and one extent_t per fresh block.
+std::optional<record_t> Cache::try_construct_record(Transaction &t)
+{
+ // First, validate read set
+ for (auto &i: t.read_set) {
+ if (i->state == CachedExtent::extent_state_t::INVALID)
+ return std::nullopt;
+ }
+
+ record_t record;
+
+ t.write_set.clear();
+
+ // Add new copy of mutated blocks, set_io_wait to block until written
+ record.deltas.reserve(t.mutated_block_list.size());
+ for (auto &i: t.mutated_block_list) {
+ if (!i->is_valid()) {
+ logger().debug("try_construct_record: ignoring invalid {}", *i);
+ continue;
+ }
+ logger().debug("try_construct_record: mutating {}", *i);
+
+ // the mutated copy takes its prior instance's place in the cache
+ assert(i->prior_instance);
+ replace_extent(i, i->prior_instance);
+
+ i->prepare_write();
+ i->set_io_wait();
+
+ assert(i->get_version() > 0);
+ auto final_crc = i->get_crc32c();
+ // positional delta_info_t initializer; note that the version recorded
+ // is the one the delta applies on top of (current version - 1)
+ record.deltas.push_back(
+ delta_info_t{
+ i->get_type(),
+ i->get_paddr(),
+ (i->is_logical()
+ ? i->cast<LogicalCachedExtent>()->get_laddr()
+ : L_ADDR_NULL),
+ i->last_committed_crc,
+ final_crc,
+ (segment_off_t)i->get_length(),
+ i->get_version() - 1,
+ i->get_delta()
+ });
+ i->last_committed_crc = final_crc;
+ }
+
+ // the root delta carries no address, crc or length information
+ if (t.root) {
+ logger().debug(
+ "{}: writing out root delta for {}",
+ __func__,
+ *t.root);
+ record.deltas.push_back(
+ delta_info_t{
+ extent_types_t::ROOT,
+ paddr_t{},
+ L_ADDR_NULL,
+ 0,
+ 0,
+ 0,
+ t.root->get_version() - 1,
+ t.root->get_delta()
+ });
+ }
+
+ // Transaction is now a go, set up in-memory cache state
+ // invalidate now invalid blocks
+ for (auto &i: t.retired_set) {
+ logger().debug("try_construct_record: retiring {}", *i);
+ ceph_assert(i->is_valid());
+ remove_extent(i);
+ i->state = CachedExtent::extent_state_t::INVALID;
+ }
+
+ // fresh blocks are written out in full: copy each block's buffer into
+ // the record
+ record.extents.reserve(t.fresh_block_list.size());
+ for (auto &i: t.fresh_block_list) {
+ logger().debug("try_construct_record: fresh block {}", *i);
+ bufferlist bl;
+ i->prepare_write();
+ bl.append(i->get_bptr());
+ if (i->get_type() == extent_types_t::ROOT) {
+ assert(0 == "ROOT never gets written as a fresh block");
+ }
+
+ assert(bl.length() == i->get_length());
+ record.extents.push_back(extent_t{
+ i->get_type(),
+ i->is_logical()
+ ? i->cast<LogicalCachedExtent>()->get_laddr()
+ : L_ADDR_NULL,
+ std::move(bl)
+ });
+ }
+
+ return std::make_optional<record_t>(std::move(record));
+}
+
+// Finish transaction `t` once its record has been given a location:
+//  - install t.root (if set) as the cache's root,
+//  - give fresh blocks their final address (final_block_start plus their
+//    relative paddr) and add the still-valid ones to the cache as CLEAN,
+//  - mark still-valid mutated blocks DIRTY,
+//  - report used/freed space to `cleaner` when one is supplied,
+//  - finally call complete_io() on every mutated block.
+void Cache::complete_commit(
+ Transaction &t,
+ paddr_t final_block_start,
+ journal_seq_t seq,
+ SegmentCleaner *cleaner)
+{
+ if (t.root) {
+ remove_extent(root);
+ root = t.root;
+ root->state = CachedExtent::extent_state_t::DIRTY;
+ root->on_delta_write(final_block_start);
+ root->dirty_from = seq;
+ add_extent(root);
+ logger().debug("complete_commit: new root {}", *t.root);
+ }
+
+ for (auto &i: t.fresh_block_list) {
+ // note: address, crc and on_initial_write() are updated even for a
+ // block that turns out to be invalid below
+ i->set_paddr(final_block_start.add_relative(i->get_paddr()));
+ i->last_committed_crc = i->get_crc32c();
+ i->on_initial_write();
+
+ if (!i->is_valid()) {
+ logger().debug("complete_commit: invalid {}", *i);
+ continue;
+ }
+
+ i->state = CachedExtent::extent_state_t::CLEAN;
+ logger().debug("complete_commit: fresh {}", *i);
+ add_extent(i);
+ if (cleaner) {
+ cleaner->mark_space_used(
+ i->get_paddr(),
+ i->get_length());
+ }
+ }
+
+ // Finalize mutated blocks: notify them of the delta write, drop the link
+ // to the prior instance and mark the still-valid ones DIRTY
+ for (auto &i: t.mutated_block_list) {
+ logger().debug("complete_commit: mutated {}", *i);
+ assert(i->prior_instance);
+ i->on_delta_write(final_block_start);
+ i->prior_instance = CachedExtentRef();
+ if (!i->is_valid()) {
+ logger().debug("complete_commit: not dirtying invalid {}", *i);
+ continue;
+ }
+ i->state = CachedExtent::extent_state_t::DIRTY;
+ // dirty_from is only set for an extent at version 1
+ if (i->version == 1) {
+ i->dirty_from = seq;
+ }
+ }
+
+ if (cleaner) {
+ for (auto &i: t.retired_set) {
+ cleaner->mark_space_free(
+ i->get_paddr(),
+ i->get_length());
+ }
+ }
+
+ // separate final pass: complete io on every mutated block
+ for (auto &i: t.mutated_block_list) {
+ i->complete_io();
+ }
+}
+
+// (Re)initialize the cache with a brand new, DIRTY root block.  Any
+// previously installed root is removed from the cache first.
+void Cache::init() {
+  if (root) {
+    // initial creation will do mkfs followed by mount each of which calls init
+    remove_extent(root);
+    root.reset();
+  }
+  root = new RootBlock();
+  root->state = CachedExtent::extent_state_t::DIRTY;
+  add_extent(root);
+}
+
+// mkfs step of the cache: fetch the root within `t` and duplicate it for
+// write (the returned copy itself is not used here).
+Cache::mkfs_ertr::future<> Cache::mkfs(Transaction &t)
+{
+  auto root_fut = get_root(t);
+  return root_fut.safe_then([this, &t](auto current_root) {
+    duplicate_for_write(t, current_root);
+    return mkfs_ertr::now();
+  });
+}
+
+// Drop the root reference and empty the dirty list, releasing the
+// reference the list held on each extent.
+Cache::close_ertr::future<> Cache::close()
+{
+  root.reset();
+  auto it = dirty.begin();
+  while (it != dirty.end()) {
+    auto *extent = &*it;
+    // erase() hands back the iterator following the removed element
+    it = dirty.erase(it);
+    intrusive_ptr_release(extent);
+  }
+  return close_ertr::now();
+}
+
+// Apply one journal delta during replay.
+//
+// A ROOT delta is applied directly to the cached root.  For any other
+// type the target extent is obtained first:
+//  - delta.pversion == 0: the extent is fetched via get_extent_by_type();
+//  - otherwise: only an extent already present in the cache is used; if
+//    there is none, the delta is treated as obsolete and skipped.
+// After a successful application the extent's version is bumped and the
+// extent is marked dirty; dirty_from is set when the delta was applied on
+// top of version 0.
+//
+// `delta` is captured by reference in the continuation.
+Cache::replay_delta_ret
+Cache::replay_delta(
+  journal_seq_t journal_seq,
+  paddr_t record_base,
+  const delta_info_t &delta)
+{
+  if (delta.type == extent_types_t::ROOT) {
+    logger().debug("replay_delta: found root delta");
+    root->apply_delta_and_adjust_crc(record_base, delta.bl);
+    root->dirty_from = journal_seq;
+    return replay_delta_ertr::now();
+  } else {
+    // cache-only lookup: yields a null ref when nothing is cached at addr
+    auto get_extent_if_cached = [this](paddr_t addr)
+      -> replay_delta_ertr::future<CachedExtentRef> {
+      auto retiter = extents.find_offset(addr);
+      if (retiter != extents.end()) {
+        return replay_delta_ertr::make_ready_future<CachedExtentRef>(&*retiter);
+      } else {
+        return replay_delta_ertr::make_ready_future<CachedExtentRef>();
+      }
+    };
+    auto extent_fut = delta.pversion == 0 ?
+      get_extent_by_type(
+        delta.type,
+        delta.paddr,
+        delta.laddr,
+        delta.length) :
+      get_extent_if_cached(
+        delta.paddr);
+    return extent_fut.safe_then([=, &delta](auto extent) {
+      if (!extent) {
+        assert(delta.pversion > 0);
+        logger().debug(
+          "replay_delta: replaying {}, extent not present so delta is obsolete",
+          delta);
+        return;
+      }
+
+      // argument order matches the message: the delta is replayed on the
+      // extent (the two arguments used to be swapped)
+      logger().debug(
+        "replay_delta: replaying {} on {}",
+        delta,
+        *extent);
+
+      assert(extent->version == delta.pversion);
+
+      assert(extent->last_committed_crc == delta.prev_crc);
+      extent->apply_delta_and_adjust_crc(record_base, delta.bl);
+      assert(extent->last_committed_crc == delta.final_crc);
+
+      if (extent->version == 0) {
+        extent->dirty_from = journal_seq;
+      }
+      extent->version++;
+      mark_dirty(extent);
+    });
+  }
+}
+
+// Collect the leading run of dirty-list extents whose dirty_from is older
+// than `seq`, wait for pending io on each of them, and resolve to the
+// collected references.  Scanning stops at the first extent that is not
+// older than `seq`; the assertion checks that the collected extents are
+// ordered by dirty_from.
+Cache::get_next_dirty_extents_ret Cache::get_next_dirty_extents(
+  journal_seq_t seq)
+{
+  std::vector<CachedExtentRef> ret;
+  for (auto i = dirty.begin(); i != dirty.end(); ++i) {
+    if (i->dirty_from < seq) {
+      assert(ret.empty() || ret.back()->dirty_from <= i->dirty_from);
+      ret.push_back(&*i);
+    } else {
+      break;
+    }
+  }
+  // do_with keeps the vector alive until every wait_io() has resolved
+  return seastar::do_with(
+    std::move(ret),
+    [](auto &ret) {
+      return seastar::do_for_each(
+        ret,
+        [](auto &ext) {
+          logger().debug(
+            "get_next_dirty_extents: waiting on {}",
+            *ext);
+          return ext->wait_io();
+        }).then([&ret]() mutable {
+          return seastar::make_ready_future<std::vector<CachedExtentRef>>(
+            std::move(ret));
+        });
+    });
+}
+
+// Resolve to the root block as seen by transaction `t`.  If the
+// transaction has no root yet, the cache's root is used once its pending
+// io has completed, and it is recorded in t.root.
+Cache::get_root_ret Cache::get_root(Transaction &t)
+{
+  if (!t.root) {
+    auto ret = root;
+    return ret->wait_io().then([ret, &t] {
+      t.root = ret;
+      return get_root_ret(
+        get_root_ertr::ready_future_marker{},
+        ret);
+    });
+  }
+  return get_root_ret(
+    get_root_ertr::ready_future_marker{},
+    t.root);
+}
+
+using StagedOnodeBlock = crimson::os::seastore::onode::SeastoreNodeExtent;
+
+// Fetch the extent at `offset`/`length` through get_extent<T>(), with T
+// chosen from the runtime type tag, and hand it back as a type-erased
+// CachedExtentRef.  ROOT and NONE are rejected by assertion.
+// For logical extents the supplied `laddr` is stored on the extent; the
+// assertion in the final continuation requires laddr to be non-null
+// exactly when the extent is logical.
+Cache::get_extent_ertr::future<CachedExtentRef> Cache::get_extent_by_type(
+ extent_types_t type,
+ paddr_t offset,
+ laddr_t laddr,
+ segment_off_t length)
+{
+ // immediately-invoked lambda: every branch must yield the same future
+ // type, hence the conversion to CachedExtentRef in each case
+ return [=] {
+ switch (type) {
+ case extent_types_t::ROOT:
+ assert(0 == "ROOT is never directly read");
+ return get_extent_ertr::make_ready_future<CachedExtentRef>();
+ case extent_types_t::LADDR_INTERNAL:
+ return get_extent<lba_manager::btree::LBAInternalNode>(offset, length
+ ).safe_then([](auto extent) {
+ // detach() + add_ref=false moves the existing reference into the
+ // type-erased ref without touching the reference count
+ return CachedExtentRef(extent.detach(), false /* add_ref */);
+ });
+ case extent_types_t::LADDR_LEAF:
+ return get_extent<lba_manager::btree::LBALeafNode>(offset, length
+ ).safe_then([](auto extent) {
+ return CachedExtentRef(extent.detach(), false /* add_ref */);
+ });
+ case extent_types_t::EXTMAP_INNER:
+ return get_extent<extentmap_manager::ExtMapInnerNode>(offset, length
+ ).safe_then([](auto extent) {
+ return CachedExtentRef(extent.detach(), false /* add_ref */);
+ });
+ case extent_types_t::EXTMAP_LEAF:
+ return get_extent<extentmap_manager::ExtMapLeafNode>(offset, length
+ ).safe_then([](auto extent) {
+ return CachedExtentRef(extent.detach(), false /* add_ref */);
+ });
+ case extent_types_t::ONODE_BLOCK:
+ return get_extent<OnodeBlock>(offset, length
+ ).safe_then([](auto extent) {
+ return CachedExtentRef(extent.detach(), false /* add_ref */);
+ });
+ case extent_types_t::ONODE_BLOCK_STAGED:
+ return get_extent<StagedOnodeBlock>(offset, length
+ ).safe_then([](auto extent) {
+ return CachedExtentRef(extent.detach(), false /* add_ref */);
+ });
+ case extent_types_t::TEST_BLOCK:
+ return get_extent<TestBlock>(offset, length
+ ).safe_then([](auto extent) {
+ return CachedExtentRef(extent.detach(), false /* add_ref */);
+ });
+ case extent_types_t::TEST_BLOCK_PHYSICAL:
+ return get_extent<TestBlockPhysical>(offset, length
+ ).safe_then([](auto extent) {
+ return CachedExtentRef(extent.detach(), false /* add_ref */);
+ });
+ case extent_types_t::NONE: {
+ ceph_assert(0 == "NONE is an invalid extent type");
+ return get_extent_ertr::make_ready_future<CachedExtentRef>();
+ }
+ default:
+ ceph_assert(0 == "impossible");
+ return get_extent_ertr::make_ready_future<CachedExtentRef>();
+ }
+ }().safe_then([laddr](CachedExtentRef e) {
+ assert(e->is_logical() == (laddr != L_ADDR_NULL));
+ if (e->is_logical()) {
+ e->cast<LogicalCachedExtent>()->set_laddr(laddr);
+ }
+ return get_extent_ertr::make_ready_future<CachedExtentRef>(e);
+ });
+}
+
+}
diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h
new file mode 100644
index 000000000..624272162
--- /dev/null
+++ b/src/crimson/os/seastore/cache.h
@@ -0,0 +1,516 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+
+#include "seastar/core/shared_future.hh"
+
+#include "include/buffer.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/transaction.h"
+#include "crimson/os/seastore/segment_manager.h"
+#include "crimson/common/errorator.h"
+#include "crimson/os/seastore/cached_extent.h"
+#include "crimson/os/seastore/root_block.h"
+#include "crimson/os/seastore/segment_cleaner.h"
+
+namespace crimson::os::seastore {
+
+/**
+ * Cache
+ *
+ * This component is responsible for buffer management, including
+ * transaction lifecycle.
+ *
+ * Seastore transactions are expressed as an atomic combination of
+ * 1) newly written blocks
+ * 2) logical mutations to existing physical blocks
+ *
+ * See record_t
+ *
+ * As such, any transaction has 3 components:
+ * 1) read_set: references to extents read during the transaction
+ * See Transaction::read_set
+ * 2) write_set: references to extents to be written as:
+ * a) new physical blocks, see Transaction::fresh_block_list
+ * b) mutations to existing physical blocks,
+ * see Transaction::mutated_block_list
+ * 3) retired_set: extent refs to be retired either due to 2b or
+ * due to releasing the extent generally.
+
+ * In the case of 2b, the CachedExtent will have been copied into
+ * a fresh CachedExtentRef such that the source extent ref is present
+ * in the read set and the newly allocated extent is present in the
+ * write_set.
+ *
+ * A transaction has 3 phases:
+ * 1) construction: user calls Cache::get_transaction() and populates
+ * the returned transaction by calling Cache methods
+ * 2) submission: user calls Cache::try_start_transaction(). If
+ * successful, the user may construct a record and submit the
+ * transaction to the journal.
+ * 3) completion: once the transaction is durable, the user must call
+ * Cache::complete_transaction() with the block offset to complete
+ * the transaction.
+ *
+ * Internally, in phase 1, the fields in Transaction are filled in.
+ * - reads may block if the referenced extent is being written
+ * - once a read obtains a particular CachedExtentRef for a paddr_t,
+ * it'll always get the same one until overwritten
+ * - once a paddr_t is overwritten or written, subsequent reads of
+ * that addr will get the new ref
+ *
+ * In phase 2, if all extents in the read set are valid (not expired),
+ * we can commit (otherwise, we fail and the user must retry).
+ * - Expire all extents in the retired_set (they must all be valid)
+ * - Remove all extents in the retired_set from Cache::extents
+ * - Mark all extents in the write_set wait_io(), add promises to
+ * transaction
+ * - Merge Transaction::write_set into Cache::extents
+ *
+ * After phase 2, the user will submit the record to the journal.
+ * Once complete, we perform phase 3:
+ * - For each CachedExtent in block_list, call
+ * CachedExtent::complete_initial_write(paddr_t) with the block's
+ * final offset (inferred from the extent's position in the block_list
+ * and extent lengths).
+ * - For each block in mutation_list, call
+ * CachedExtent::delta_written(paddr_t) with the address of the start
+ * of the record
+ * - Complete all promises with the final record start paddr_t
+ */
+class Cache {
+public:
+ Cache(SegmentManager &segment_manager);
+ ~Cache();
+
+ /**
+ * drop_from_cache
+ *
+ * Drop extent from cache. Intended for use when
+ * ref refers to a logically dead extent as during
+ * replay.
+ */
+ void drop_from_cache(CachedExtentRef ref) {
+ remove_extent(ref);
+ }
+
+ /// Declare ref retired in t
+ void retire_extent(Transaction &t, CachedExtentRef ref) {
+ t.add_to_retired_set(ref);
+ }
+
+ /// Declare paddr retired in t, noop if not cached
+ using retire_extent_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using retire_extent_ret = retire_extent_ertr::future<>;
+ retire_extent_ret retire_extent_if_cached(
+ Transaction &t, paddr_t addr);
+
+ /**
+ * get_root
+ *
+ * returns ref to current root or t.root if modified in t
+ */
+ using get_root_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using get_root_ret = get_root_ertr::future<RootBlockRef>;
+ get_root_ret get_root(Transaction &t);
+
+ /**
+ * get_root_fast
+ *
+ * returns t.root and assume it is already present/read in t
+ */
+ RootBlockRef get_root_fast(Transaction &t) {
+ assert(t.root);
+ return t.root;
+ }
+
+ /**
+ * get_extent
+ *
+ * returns ref to extent at offset~length of type T either from
+ * - extent_set if already in cache
+ * - disk
+ *
+ * On a cache hit the returned future resolves after the extent's
+ * wait_io() future resolves. On a miss a fresh extent is allocated,
+ * filled through segment_manager.read(), and only then handed to
+ * add_extent().
+ *
+ * NOTE(review): the hit path static_casts the indexed extent to T
+ * without checking its concrete type -- presumably callers guarantee
+ * the type matches; confirm.
+ * NOTE(review): on a miss the new extent is not passed to add_extent()
+ * until the read has completed, so a second lookup of the same offset
+ * issued in the meantime would not find it in extents -- confirm that
+ * callers cannot race here.
+ */
+ using get_extent_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ template <typename T>
+ get_extent_ertr::future<TCachedExtentRef<T>> get_extent(
+ paddr_t offset, ///< [in] starting addr
+ segment_off_t length ///< [in] length
+ ) {
+ if (auto iter = extents.find_offset(offset);
+ iter != extents.end()) {
+ // Cache hit: hand back the indexed extent once any pending io is done
+ auto ret = TCachedExtentRef<T>(static_cast<T*>(&*iter));
+ return ret->wait_io().then([ret=std::move(ret)]() mutable {
+ return get_extent_ertr::make_ready_future<TCachedExtentRef<T>>(
+ std::move(ret));
+ });
+ } else {
+ // Cache miss: build a CLEAN extent at offset and read its contents
+ auto ref = CachedExtent::make_cached_extent_ref<T>(
+ alloc_cache_buf(length));
+ ref->set_io_wait();
+ ref->set_paddr(offset);
+ ref->state = CachedExtent::extent_state_t::CLEAN;
+
+ return segment_manager.read(
+ offset,
+ length,
+ ref->get_bptr()).safe_then(
+ [this, ref=std::move(ref)]() mutable {
+ /* TODO: crc should be checked against LBA manager */
+ ref->last_committed_crc = ref->get_crc32c();
+
+ ref->on_clean_read();
+ ref->complete_io();
+ add_extent(ref);
+ return get_extent_ertr::make_ready_future<TCachedExtentRef<T>>(
+ std::move(ref));
+ },
+ get_extent_ertr::pass_further{},
+ crimson::ct_error::discard_all{});
+ }
+ }
+
+  /**
+   * get_extent_if_cached
+   *
+   * Looks offset up first in t and then in the cache index, without
+   * touching disk. Any status other than ABSENT reported by t is returned
+   * unchanged. When only the cache index holds the extent, *out (if
+   * non-null) is pointed at it and PRESENT is returned.
+   */
+  Transaction::get_extent_ret get_extent_if_cached(
+    Transaction &t,
+    paddr_t offset,
+    CachedExtentRef *out) {
+    using status_t = Transaction::get_extent_ret;
+
+    // The transaction's own view of offset takes precedence.
+    const auto from_txn = t.get_extent(offset, out);
+    if (from_txn != status_t::ABSENT) {
+      return from_txn;
+    }
+
+    auto cached = extents.find_offset(offset);
+    if (cached == extents.end()) {
+      return status_t::ABSENT;
+    }
+    if (out != nullptr) {
+      *out = &*cached;
+    }
+    return status_t::PRESENT;
+  }
+
+  /**
+   * get_extent
+   *
+   * Transaction-aware lookup of the extent at offset~length as type T.
+   * The extent comes from, in order:
+   * - t, if t already knows about offset
+   * - the cache index or disk (via the non-transactional overload), in
+   *   which case it is also added to t's read set
+   *
+   * t *must not* have retired offset
+   */
+  template <typename T>
+  get_extent_ertr::future<TCachedExtentRef<T>> get_extent(
+    Transaction &t,       ///< [in,out] current transaction
+    paddr_t offset,       ///< [in] starting addr
+    segment_off_t length  ///< [in] length
+  ) {
+    CachedExtentRef in_txn;
+    const auto status = t.get_extent(offset, &in_txn);
+
+    if (status == Transaction::get_extent_ret::ABSENT) {
+      // Unknown to t: fetch it and record the read in t.
+      return get_extent<T>(offset, length).safe_then(
+        [&t](auto fetched) mutable {
+          t.add_to_read_set(fetched);
+          return get_extent_ertr::make_ready_future<TCachedExtentRef<T>>(
+            std::move(fetched));
+        });
+    }
+
+    assert(status != Transaction::get_extent_ret::RETIRED);
+    return get_extent_ertr::make_ready_future<TCachedExtentRef<T>>(
+      in_txn->cast<T>());
+  }
+
+ /**
+ * get_extent_by_type
+ *
+ * Based on type, instantiate the correct concrete type
+ * and read in the extent at location offset~length.
+ */
+ get_extent_ertr::future<CachedExtentRef> get_extent_by_type(
+ extent_types_t type, ///< [in] type tag
+ paddr_t offset, ///< [in] starting addr
+ laddr_t laddr, ///< [in] logical address if logical
+ segment_off_t length ///< [in] length
+ );
+
+  /**
+   * get_extent_by_type
+   *
+   * Transaction-aware variant. Resolves to a null ref when
+   * get_extent_if_cached() reports RETIRED and to the cached ref when it
+   * reports PRESENT. Otherwise the extent is loaded through the
+   * non-transactional overload and added to t's read set.
+   */
+  get_extent_ertr::future<CachedExtentRef> get_extent_by_type(
+    Transaction &t,
+    extent_types_t type,
+    paddr_t offset,
+    laddr_t laddr,
+    segment_off_t length) {
+    CachedExtentRef cached;
+    const auto status = get_extent_if_cached(t, offset, &cached);
+
+    if (status == Transaction::get_extent_ret::RETIRED) {
+      return get_extent_ertr::make_ready_future<CachedExtentRef>();
+    }
+    if (status == Transaction::get_extent_ret::PRESENT) {
+      return get_extent_ertr::make_ready_future<CachedExtentRef>(cached);
+    }
+
+    return get_extent_by_type(type, offset, laddr, length
+    ).safe_then([&t](CachedExtentRef fetched) {
+      t.add_to_read_set(fetched);
+      return get_extent_ertr::make_ready_future<CachedExtentRef>(
+        std::move(fetched));
+    });
+  }
+
+  /**
+   * get_extents
+   *
+   * Looks up every (offset, length) entry of extents through
+   * get_extent<T>(t, offset, length), i.e. from:
+   * - t if modified by t
+   * - extent_set if already in cache
+   * - disk
+   *
+   * The result holds one (offset, extent ref) pair per input entry, in
+   * input order. Both the input list and the result list are kept on the
+   * heap until the final continuation has run, so the references captured
+   * by the per-entry lambdas stay valid.
+   *
+   * NOTE(review): assumes the first member of each paddr_list_t entry is
+   * a paddr_t -- confirm against the paddr_list_t definition.
+   */
+  template<typename T>
+  get_extent_ertr::future<t_pextent_list_t<T>> get_extents(
+    Transaction &t,        ///< [in, out] current transaction
+    paddr_list_t &&extents ///< [in] extent list for lookup
+  ) {
+    auto retref = std::make_unique<t_pextent_list_t<T>>();
+    auto &ret = *retref;
+    auto ext = std::make_unique<paddr_list_t>(std::move(extents));
+    return crimson::do_for_each(
+      ext->begin(),
+      ext->end(),
+      [this, &t, &ret](auto &p) {
+        auto &[offset, len] = p;
+        // T only appears in get_extent's return type, so it has to be
+        // spelled out explicitly; it cannot be deduced from the arguments.
+        return get_extent<T>(t, offset, len).safe_then(
+          [&ret, addr=offset](auto cext) {
+            // t_pextent_list_t<T> stores (paddr, ref) pairs.
+            ret.emplace_back(addr, std::move(cext));
+          });
+      }).safe_then([retref=std::move(retref), ext=std::move(ext)]() mutable {
+        return get_extent_ertr::make_ready_future<t_pextent_list_t<T>>(
+          std::move(*retref));
+      });
+  }
+
+ /**
+ * alloc_new_extent
+ *
+ * Allocates a fresh extent. addr will be relative until commit.
+ */
+ template <typename T>
+ TCachedExtentRef<T> alloc_new_extent(
+ Transaction &t, ///< [in, out] current transaction
+ segment_off_t length ///< [in] length
+ ) {
+ auto ret = CachedExtent::make_cached_extent_ref<T>(
+ alloc_cache_buf(length));
+ t.add_fresh_extent(ret);
+ ret->state = CachedExtent::extent_state_t::INITIAL_WRITE_PENDING;
+ return ret;
+ }
+
+ /**
+ * alloc_new_extent_by_type
+ *
+ * Allocates a fresh extent. addr will be relative until commit.
+ */
+ CachedExtentRef alloc_new_extent_by_type(
+ Transaction &t, ///< [in, out] current transaction
+ extent_types_t type, ///< [in] type tag
+ segment_off_t length ///< [in] length
+ );
+
+ /**
+ * Allocates mutable buffer from extent_set on offset~len
+ *
+ * TODO: Note, currently all implementations literally copy the
+ * buffer. This needn't be true, CachedExtent implementations could
+ * choose to refer to the same buffer unmodified until commit and just
+ * buffer the mutations in an ancillary data structure.
+ *
+ * @param current transaction
+ * @param extent to duplicate
+ * @return mutable extent
+ */
+ CachedExtentRef duplicate_for_write(
+ Transaction &t, ///< [in, out] current transaction
+ CachedExtentRef i ///< [in] ref to existing extent
+ );
+
+ /**
+ * try_construct_record
+ *
+ * First checks for conflicts. If a racing write has mutated/retired
+ * an extent mutated by this transaction, nullopt will be returned.
+ *
+ * Otherwise, a record will be returned valid for use with Journal.
+ */
+ std::optional<record_t> try_construct_record(
+ Transaction &t ///< [in, out] current transaction
+ );
+
+ /**
+ * complete_commit
+ *
+ * Must be called upon completion of write. Releases blocks on mutating
+ * extents, fills in addresses, and calls relevant callbacks on fresh
+ * and mutated extents.
+ */
+ void complete_commit(
+ Transaction &t, ///< [in, out] current transaction
+ paddr_t final_block_start, ///< [in] offset of initial block
+ journal_seq_t seq, ///< [in] journal commit seq
+ SegmentCleaner *cleaner=nullptr ///< [out] optional segment stat listener
+ );
+
+ /**
+ * init
+ */
+ void init();
+
+ /**
+ * mkfs
+ *
+ * Alloc initial root node and add to t. The intention is for other
+ * components to use t to adjust the resulting root ref prior to commit.
+ */
+ using mkfs_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ mkfs_ertr::future<> mkfs(Transaction &t);
+
+ /**
+ * close
+ *
+ * TODO: should flush dirty blocks
+ */
+ using close_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ close_ertr::future<> close();
+
+ /**
+ * replay_delta
+ *
+ * Intended for use in Journal::delta. For each delta, should decode delta,
+ * read relevant block from disk or cache (using correct type), and call
+ * CachedExtent::apply_delta marking the extent dirty.
+ */
+ using replay_delta_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using replay_delta_ret = replay_delta_ertr::future<>;
+ replay_delta_ret replay_delta(
+ journal_seq_t seq,
+ paddr_t record_block_base,
+ const delta_info_t &delta);
+
+  /**
+   * init_cached_extents
+   *
+   * Calls f(t, extent) for each extent currently held in the cache index.
+   * Intended for use after replay to allow lba_manager (or w/e) to read
+   * in any ancestor blocks.
+   *
+   * A ref to every indexed extent is taken up front. The callback and the
+   * collected refs are then kept alive by seastar::do_with for the
+   * duration of the iteration.
+   */
+  using init_cached_extents_ertr = crimson::errorator<
+    crimson::ct_error::input_output_error>;
+  using init_cached_extents_ret = init_cached_extents_ertr::future<>;
+  template <typename F>
+  init_cached_extents_ret init_cached_extents(
+    Transaction &t,
+    F &&f)
+  {
+    // Named so as not to shadow the Cache::dirty member list.
+    std::vector<CachedExtentRef> snapshot;
+    for (auto &e : extents) {
+      snapshot.push_back(CachedExtentRef(&e));
+    }
+    return seastar::do_with(
+      std::forward<F>(f),
+      std::move(snapshot),
+      [&t](auto &f, auto &refs) mutable {
+        return crimson::do_for_each(
+          refs,
+          [&t, &f](auto &e) { return f(t, e); });
+      });
+  }
+
+  /**
+   * update_extent_from_transaction
+   *
+   * Maps extent onto t's view of it. For a ROOT extent this is t.root
+   * when t has one, else extent itself. For any other type t is queried
+   * by the extent's paddr: a null ref is returned when t reports it as
+   * RETIRED, otherwise the ref as left by the t.get_extent() call.
+   */
+  CachedExtentRef update_extent_from_transaction(
+    Transaction &t,
+    CachedExtentRef extent) {
+    if (extent->get_type() == extent_types_t::ROOT) {
+      if (t.root) {
+        return t.root;
+      }
+      return extent;
+    }
+
+    const auto status = t.get_extent(extent->get_paddr(), &extent);
+    if (status == Transaction::get_extent_ret::RETIRED) {
+      return CachedExtentRef();
+    }
+    return extent;
+  }
+
+ /**
+ * print
+ *
+ * Dump summary of contents (TODO)
+ */
+ std::ostream &print(
+ std::ostream &out) const {
+ return out;
+ }
+
+ /// returns extents with dirty_from < seq
+ using get_next_dirty_extents_ertr = crimson::errorator<>;
+ using get_next_dirty_extents_ret = get_next_dirty_extents_ertr::future<
+ std::vector<CachedExtentRef>>;
+ get_next_dirty_extents_ret get_next_dirty_extents(
+ journal_seq_t seq);
+
+private:
+ SegmentManager &segment_manager; ///< ref to segment_manager
+ RootBlockRef root; ///< ref to current root
+ ExtentIndex extents; ///< set of live extents
+
+ /**
+ * dirty
+ *
+ * holds refs to dirty extents. Ordered by CachedExtent::dirty_from.
+ */
+ CachedExtent::list dirty;
+
+  /// Allocate a zeroed buffer of size bytes (via create_page_aligned) for a cached extent
+  bufferptr alloc_cache_buf(size_t size) {
+    // TODO: memory pooling etc
+    ceph::bufferptr zeroed(buffer::create_page_aligned(size));
+    zeroed.zero();
+    return zeroed;
+  }
+
+ /// Add extent to extents handling dirty and refcounting
+ void add_extent(CachedExtentRef ref);
+
+ /// Mark existing extent ref dirty -- mainly for replay
+ void mark_dirty(CachedExtentRef ref);
+
+ /// Add dirty extent to dirty list
+ void add_to_dirty(CachedExtentRef ref);
+
+ /// Remove extent from extents handling dirty and refcounting
+ void remove_extent(CachedExtentRef ref);
+
+ /// Replace prev with next
+ void replace_extent(CachedExtentRef next, CachedExtentRef prev);
+};
+
+}
diff --git a/src/crimson/os/seastore/cached_extent.cc b/src/crimson/os/seastore/cached_extent.cc
new file mode 100644
index 000000000..7019b9fb8
--- /dev/null
+++ b/src/crimson/os/seastore/cached_extent.cc
@@ -0,0 +1,96 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/os/seastore/cached_extent.h"
+
+#include "crimson/common/log.h"
+
+namespace {
+ [[maybe_unused]] seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+}
+
+namespace crimson::os::seastore {
+
+#ifdef DEBUG_CACHED_EXTENT_REF
+
+// Only compiled when DEBUG_CACHED_EXTENT_REF is defined: forwards to the
+// boost::intrusive_ref_counter base overload, then logs the extent at
+// debug level.
+void intrusive_ptr_add_ref(CachedExtent *ptr)
+{
+ intrusive_ptr_add_ref(
+ static_cast<boost::intrusive_ref_counter<
+ CachedExtent,
+ boost::thread_unsafe_counter>*>(ptr));
+ logger().debug("intrusive_ptr_add_ref: {}", *ptr);
+}
+
+// Only compiled when DEBUG_CACHED_EXTENT_REF is defined: logs the extent at
+// debug level, then forwards to the boost::intrusive_ref_counter base
+// overload. The log comes first, presumably because the release may drop
+// the last reference to *ptr.
+void intrusive_ptr_release(CachedExtent *ptr)
+{
+ logger().debug("intrusive_ptr_release: {}", *ptr);
+ intrusive_ptr_release(
+ static_cast<boost::intrusive_ref_counter<
+ CachedExtent,
+ boost::thread_unsafe_counter>*>(ptr));
+}
+
+#endif
+
+/// Streams the symbolic name of state, or "UNKNOWN" for an unlisted value.
+std::ostream &operator<<(std::ostream &out, CachedExtent::extent_state_t state)
+{
+  using state_t = CachedExtent::extent_state_t;
+
+  const char *label = "UNKNOWN";
+  switch (state) {
+  case state_t::INITIAL_WRITE_PENDING:
+    label = "INITIAL_WRITE_PENDING";
+    break;
+  case state_t::MUTATION_PENDING:
+    label = "MUTATION_PENDING";
+    break;
+  case state_t::CLEAN:
+    label = "CLEAN";
+    break;
+  case state_t::DIRTY:
+    label = "DIRTY";
+    break;
+  case state_t::INVALID:
+    label = "INVALID";
+    break;
+  default:
+    break;
+  }
+  return out << label;
+}
+
+std::ostream &operator<<(std::ostream &out, const CachedExtent &ext)
+{
+ return ext.print(out);
+}
+
+// Erases this extent from the ExtentIndex recorded in parent_index, if one
+// is still set.
+CachedExtent::~CachedExtent()
+{
+ if (parent_index) {
+ parent_index->erase(*this);
+ }
+}
+
+/// Appends ", laddr=<laddr>, pin=<pin or empty>" followed by whatever
+/// print_detail_l() adds.
+std::ostream &LogicalCachedExtent::print_detail(std::ostream &out) const
+{
+  out << ", laddr=" << laddr << ", pin=";
+  if (pin) {
+    out << *pin;
+  } else {
+    out << "empty";
+  }
+  return print_detail_l(out);
+}
+
+/// Streams rhs as "LBAPin(<laddr>~<length>-><paddr>)".
+std::ostream &operator<<(std::ostream &out, const LBAPin &rhs)
+{
+  // The closing parenthesis balances the one opened by "LBAPin(".
+  return out << "LBAPin(" << rhs.get_laddr() << "~" << rhs.get_length()
+             << "->" << rhs.get_paddr() << ")";
+}
+
+/// Streams the pins of rhs as "[pin,pin,...]".
+std::ostream &operator<<(std::ostream &out, const lba_pin_list_t &rhs)
+{
+  const char *separator = "";
+  out << '[';
+  for (auto &pin_ref : rhs) {
+    out << separator << *pin_ref;
+    separator = ",";
+  }
+  return out << ']';
+}
+
+}
diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h
new file mode 100644
index 000000000..974988489
--- /dev/null
+++ b/src/crimson/os/seastore/cached_extent.h
@@ -0,0 +1,659 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+
+#include <boost/intrusive/list.hpp>
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include "seastar/core/shared_future.hh"
+
+#include "include/buffer.h"
+#include "crimson/common/errorator.h"
+#include "crimson/os/seastore/seastore_types.h"
+
+namespace crimson::os::seastore {
+
+class CachedExtent;
+using CachedExtentRef = boost::intrusive_ptr<CachedExtent>;
+
+// #define DEBUG_CACHED_EXTENT_REF
+#ifdef DEBUG_CACHED_EXTENT_REF
+
+void intrusive_ptr_add_ref(CachedExtent *);
+void intrusive_ptr_release(CachedExtent *);
+
+#endif
+
+template <typename T>
+using TCachedExtentRef = boost::intrusive_ptr<T>;
+
+/**
+ * CachedExtent
+ */
+namespace onode {
+ class DummyNodeExtent;
+ class TestReplayExtent;
+}
+class ExtentIndex;
+class CachedExtent : public boost::intrusive_ref_counter<
+ CachedExtent, boost::thread_unsafe_counter> {
+ enum class extent_state_t : uint8_t {
+ INITIAL_WRITE_PENDING, // In Transaction::write_set and fresh_block_list
+ MUTATION_PENDING, // In Transaction::write_set and mutated_block_list
+ CLEAN, // In Cache::extent_index, Transaction::read_set
+ // during write, contents match disk, version == 0
+ DIRTY, // Same as CLEAN, but contents do not match disk,
+ // version > 0
+ INVALID // Part of no ExtentIndex set
+ } state = extent_state_t::INVALID;
+ friend std::ostream &operator<<(std::ostream &, extent_state_t);
+ // allow a dummy extent to pretend it is at a specific state
+ friend class onode::DummyNodeExtent;
+ friend class onode::TestReplayExtent;
+
+ uint32_t last_committed_crc = 0;
+
+ // Points at current version while in state MUTATION_PENDING
+ CachedExtentRef prior_instance;
+
+ /**
+ * dirty_from
+ *
+ * When dirty, indicates the oldest journal entry which mutates
+ * this extent.
+ */
+ journal_seq_t dirty_from;
+
+public:
+ /**
+ * duplicate_for_write
+ *
+ * Implementation should return a fresh CachedExtentRef
+ * which represents a copy of *this until on_delta_write()
+ * is complete, at which point the user may assume *this
+ * will be in state INVALID. As such, the implementation
+ * may involve a copy of get_bptr(), or an ancillary
+ * structure which defers updating the actual buffer until
+ * on_delta_write().
+ */
+ virtual CachedExtentRef duplicate_for_write() = 0;
+
+ /**
+ * prepare_write
+ *
+ * Called prior to reading buffer.
+ * Implementation may use this callback to fully write out
+ * updates to the buffer.
+ */
+ virtual void prepare_write() {}
+
+ /**
+ * on_initial_write
+ *
+ * Called after commit of extent. State will be CLEAN.
+ * Implementation may use this call to fixup the buffer
+ * with the newly available absolute get_paddr().
+ */
+ virtual void on_initial_write() {}
+
+ /**
+ * on_clean_read
+ *
+ * Called after read of initially written extent.
+ * State will be CLEAN. Implementation may use this
+ * call to fixup the buffer with the newly available
+ * absolute get_paddr().
+ */
+ virtual void on_clean_read() {}
+
+ /**
+ * on_delta_write
+ *
+ * Called after commit of delta. State will be DIRTY.
+ * Implementation may use this call to fixup any relative
+ * references in the buffer with the passed
+ * record_block_offset record location.
+ */
+ virtual void on_delta_write(paddr_t record_block_offset) {}
+
+ /**
+ * get_type
+ *
+ * Returns concrete type.
+ */
+ virtual extent_types_t get_type() const = 0;
+
+ virtual bool is_logical() const {
+ return false;
+ }
+
+ friend std::ostream &operator<<(std::ostream &, extent_state_t);
+ virtual std::ostream &print_detail(std::ostream &out) const { return out; }
+ std::ostream &print(std::ostream &out) const {
+ out << "CachedExtent(addr=" << this
+ << ", type=" << get_type()
+ << ", version=" << version
+ << ", dirty_from=" << dirty_from
+ << ", paddr=" << get_paddr()
+ << ", state=" << state
+ << ", last_committed_crc=" << last_committed_crc
+ << ", refcount=" << use_count();
+ print_detail(out);
+ return out << ")";
+ }
+
+ /**
+ * get_delta
+ *
+ * Must return a valid delta usable in apply_delta() in submit_transaction
+ * if state == MUTATION_PENDING.
+ */
+ virtual ceph::bufferlist get_delta() = 0;
+
+ /**
+ * apply_delta
+ *
+ * bl is a delta obtained previously from get_delta. The versions will
+ * match. Implementation should mutate buffer based on bl. base matches
+ * the address passed on_delta_write.
+ *
+ * Implementation *must* use set_last_committed_crc to update the crc to
+ * what the crc of the buffer would have been at submission. For physical
+ * extents that use base to adjust internal record-relative deltas, this
+ * means that the crc should be of the buffer after applying the delta,
+ * but before that adjustment. We do it this way because the crc in the
+ * commit path does not yet know the record base address.
+ *
+ * LogicalCachedExtent overrides this method and provides a simpler
+ * apply_delta override for LogicalCachedExtent implementers.
+ */
+ virtual void apply_delta_and_adjust_crc(
+ paddr_t base, const ceph::bufferlist &bl) = 0;
+
+ /**
+ * Called on dirty CachedExtent implementation after replay.
+ * Implementation should perform any reads/in-memory-setup
+ * necessary. (for instance, the lba implementation will use this
+ * to load in lba_manager blocks)
+ */
+ using complete_load_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ virtual complete_load_ertr::future<> complete_load() {
+ return complete_load_ertr::now();
+ }
+
+ /**
+ * cast
+ *
+ * Returns a TCachedExtentRef of the specified type.
+ * TODO: add dynamic check that the requested type is actually correct.
+ */
+ template <typename T>
+ TCachedExtentRef<T> cast() {
+ return TCachedExtentRef<T>(static_cast<T*>(this));
+ }
+ template <typename T>
+ TCachedExtentRef<const T> cast() const {
+ return TCachedExtentRef<const T>(static_cast<const T*>(this));
+ }
+
+ /// Returns true if extent is part of an open transaction
+ bool is_pending() const {
+ return state == extent_state_t::INITIAL_WRITE_PENDING ||
+ state == extent_state_t::MUTATION_PENDING;
+ }
+
+ /// Returns true if extent has a pending delta
+ bool is_mutation_pending() const {
+ return state == extent_state_t::MUTATION_PENDING;
+ }
+
+ /// Returns true if extent is a fresh extent
+ bool is_initial_pending() const {
+ return state == extent_state_t::INITIAL_WRITE_PENDING;
+ }
+
+ /// Returns true if extent is clean (does not have deltas on disk)
+ bool is_clean() const {
+ ceph_assert(is_valid());
+ return state == extent_state_t::INITIAL_WRITE_PENDING ||
+ state == extent_state_t::CLEAN;
+ }
+
+ /// Returns true if extent is dirty (has deltas on disk)
+ bool is_dirty() const {
+ ceph_assert(is_valid());
+ return !is_clean();
+ }
+
+ /// Returns true if extent has not been superseded or retired
+ bool is_valid() const {
+ return state != extent_state_t::INVALID;
+ }
+
+ /**
+ * get_dirty_from
+ *
+ * Return journal location of oldest relevant delta.
+ */
+ auto get_dirty_from() const { return dirty_from; }
+
+
+ /**
+ * get_paddr
+ *
+ * Returns current address of extent. If is_initial_pending(), address will
+ * be relative, otherwise address will be absolute.
+ */
+ paddr_t get_paddr() const { return poffset; }
+
+ /// Returns length of extent
+ extent_len_t get_length() const { return ptr.length(); }
+
+ /// Returns version, get_version() == 0 iff is_clean()
+ extent_version_t get_version() const {
+ return version;
+ }
+
+ /// Returns crc32c of buffer
+ uint32_t get_crc32c() {
+ return ceph_crc32c(
+ 1,
+ reinterpret_cast<const unsigned char *>(get_bptr().c_str()),
+ get_length());
+ }
+
+ /// Get ref to raw buffer
+ bufferptr &get_bptr() { return ptr; }
+ const bufferptr &get_bptr() const { return ptr; }
+
+ /// Compare by paddr
+ friend bool operator< (const CachedExtent &a, const CachedExtent &b) {
+ return a.poffset < b.poffset;
+ }
+ friend bool operator> (const CachedExtent &a, const CachedExtent &b) {
+ return a.poffset > b.poffset;
+ }
+ friend bool operator== (const CachedExtent &a, const CachedExtent &b) {
+ return a.poffset == b.poffset;
+ }
+
+ virtual ~CachedExtent();
+
+private:
+ friend struct paddr_cmp;
+ friend struct ref_paddr_cmp;
+ friend class ExtentIndex;
+
+ /// Pointer to containing index (or null)
+ ExtentIndex *parent_index = nullptr;
+
+ /// hook for intrusive extent_index
+ boost::intrusive::set_member_hook<> extent_index_hook;
+ using index_member_options = boost::intrusive::member_hook<
+ CachedExtent,
+ boost::intrusive::set_member_hook<>,
+ &CachedExtent::extent_index_hook>;
+ using index = boost::intrusive::set<CachedExtent, index_member_options>;
+ friend class ExtentIndex;
+ friend class Transaction;
+
+ /// hook for intrusive ref list (mainly dirty or lru list)
+ boost::intrusive::list_member_hook<> primary_ref_list_hook;
+ using primary_ref_list_member_options = boost::intrusive::member_hook<
+ CachedExtent,
+ boost::intrusive::list_member_hook<>,
+ &CachedExtent::primary_ref_list_hook>;
+ using list = boost::intrusive::list<
+ CachedExtent,
+ primary_ref_list_member_options>;
+
+ /// Actual data contents
+ ceph::bufferptr ptr;
+
+ /// number of deltas since initial write
+ extent_version_t version = EXTENT_VERSION_NULL;
+
+ /// address of original block -- relative iff is_pending() and is_clean()
+ paddr_t poffset;
+
+ /// used to wait while in-progress commit completes
+ std::optional<seastar::shared_promise<>> io_wait_promise;
+ /// Installs a fresh promise; asserts that none is currently installed.
+ void set_io_wait() {
+ ceph_assert(!io_wait_promise);
+ io_wait_promise = seastar::shared_promise<>();
+ }
+ /// Resolves the installed promise and then clears it; asserts that one
+ /// is installed.
+ void complete_io() {
+ ceph_assert(io_wait_promise);
+ io_wait_promise->set_value();
+ io_wait_promise = std::nullopt;
+ }
+ /// Returns a ready future when no promise is installed, otherwise a
+ /// shared future obtained from the installed promise.
+ seastar::future<> wait_io() {
+ if (!io_wait_promise) {
+ return seastar::now();
+ } else {
+ return io_wait_promise->get_shared_future();
+ }
+ }
+
+protected:
+ CachedExtent(CachedExtent &&other) = delete;
+ CachedExtent(ceph::bufferptr &&ptr) : ptr(std::move(ptr)) {}
+ CachedExtent(const CachedExtent &other)
+ : state(other.state),
+ dirty_from(other.dirty_from),
+ ptr(other.ptr.c_str(), other.ptr.length()),
+ version(other.version),
+ poffset(other.poffset) {}
+
+ struct share_buffer_t {};
+ CachedExtent(const CachedExtent &other, share_buffer_t) :
+ state(other.state),
+ dirty_from(other.dirty_from),
+ ptr(other.ptr),
+ version(other.version),
+ poffset(other.poffset) {}
+
+
+ friend class Cache;
+ template <typename T>
+ static TCachedExtentRef<T> make_cached_extent_ref(bufferptr &&ptr) {
+ return new T(std::move(ptr));
+ }
+
+ CachedExtentRef get_prior_instance() {
+ return prior_instance;
+ }
+
+ /// Sets last_committed_crc
+ void set_last_committed_crc(uint32_t crc) {
+ last_committed_crc = crc;
+ }
+
+ void set_paddr(paddr_t offset) { poffset = offset; }
+
+  /**
+   * maybe_generate_relative
+   *
+   * An extent may need to store three kinds of addresses:
+   * - an addr of a block in the same transaction, expressed relative to
+   *   this extent's own physical location, for the case where it is read
+   *   back as part of the initial read of the extent
+   * - an addr relative to the physical location of the next record,
+   *   pointing at a block within that record, for the case where it is
+   *   read from a delta and overlaid onto a dirty representation of the
+   *   extent
+   * - an absolute addr of a block already written outside of the current
+   *   transaction
+   *
+   * Given addr and the extent's current state, returns the form that
+   * should be stored.
+   */
+  paddr_t maybe_generate_relative(paddr_t addr) {
+    // Non-relative addrs, and any addr held by an extent with a pending
+    // mutation, are stored exactly as given.
+    if (!addr.is_relative() || is_mutation_pending()) {
+      return addr;
+    }
+
+    // Otherwise this must be a fresh extent; rebase addr onto its paddr.
+    ceph_assert(is_initial_pending());
+    ceph_assert(get_paddr().is_record_relative());
+    return addr - get_paddr();
+  }
+
+};
+
+std::ostream &operator<<(std::ostream &, CachedExtent::extent_state_t);
+std::ostream &operator<<(std::ostream &, const CachedExtent&);
+
+/// Compare extents by paddr
+///
+/// Orders a bare paddr_t against a CachedExtent's poffset in both argument
+/// orders. ExtentIndex passes it to the keyed find / lower_bound /
+/// upper_bound calls on its intrusive set.
+struct paddr_cmp {
+ bool operator()(paddr_t lhs, const CachedExtent &rhs) const {
+ return lhs < rhs.poffset;
+ }
+ bool operator()(const CachedExtent &lhs, paddr_t rhs) const {
+ return lhs.poffset < rhs;
+ }
+};
+
+/// Compare extent refs by paddr
+///
+/// Declares is_transparent and provides ref/ref, paddr/ref and ref/paddr
+/// overloads, so an ordered container using it (see pextent_set_t) can be
+/// searched with a bare paddr_t. Every overload dereferences its ref
+/// arguments, so they must be non-null.
+struct ref_paddr_cmp {
+ using is_transparent = paddr_t;
+ bool operator()(const CachedExtentRef &lhs, const CachedExtentRef &rhs) const {
+ return lhs->poffset < rhs->poffset;
+ }
+ bool operator()(const paddr_t &lhs, const CachedExtentRef &rhs) const {
+ return lhs < rhs->poffset;
+ }
+ bool operator()(const CachedExtentRef &lhs, const paddr_t &rhs) const {
+ return lhs->poffset < rhs;
+ }
+};
+
+template <typename T, typename C>
+class addr_extent_list_base_t
+ : public std::list<std::pair<T, C>> {};
+
+using pextent_list_t = addr_extent_list_base_t<paddr_t, CachedExtentRef>;
+
+template <typename T, typename C, typename Cmp>
+class addr_extent_set_base_t
+ : public std::set<C, Cmp> {};
+
+using pextent_set_t = addr_extent_set_base_t<
+ paddr_t,
+ CachedExtentRef,
+ ref_paddr_cmp
+ >;
+
+template <typename T>
+using t_pextent_list_t = addr_extent_list_base_t<paddr_t, TCachedExtentRef<T>>;
+
+/**
+ * ExtentIndex
+ *
+ * Index of CachedExtent & by poffset, does not hold a reference,
+ * user must ensure each extent is removed prior to deletion
+ *
+ * Each indexed extent carries a parent_index back-pointer which
+ * ~CachedExtent uses to unlink itself. Every operation below that links
+ * or unlinks an extent keeps that back-pointer in step with the index.
+ */
+class ExtentIndex {
+  friend class Cache;
+  CachedExtent::index extent_index;
+public:
+  /// Returns the iterator range [first, last) of indexed extents that
+  /// overlap addr~len
+  auto get_overlap(paddr_t addr, segment_off_t len) {
+    // Start from the last extent beginning at or before addr, if any...
+    auto bottom = extent_index.upper_bound(addr, paddr_cmp());
+    if (bottom != extent_index.begin())
+      --bottom;
+    // ...and skip it when it ends at or before addr.
+    if (bottom != extent_index.end() &&
+        bottom->get_paddr().add_offset(bottom->get_length()) <= addr)
+      ++bottom;
+
+    auto top = extent_index.lower_bound(addr.add_offset(len), paddr_cmp());
+    return std::make_pair(
+      bottom,
+      top
+    );
+  }
+
+  /// Unlinks every extent, resetting each one's parent_index so that its
+  /// destructor does not call back into this index
+  void clear() {
+    extent_index.clear_and_dispose([](CachedExtent *extent) {
+      extent->parent_index = nullptr;
+    });
+  }
+
+  /// Links extent into the index; asserts that it overlaps no indexed extent
+  void insert(CachedExtent &extent) {
+    // sanity check
+    auto [a, b] = get_overlap(
+      extent.get_paddr(),
+      extent.get_length());
+    ceph_assert(a == b);
+
+    extent_index.insert(extent);
+    extent.parent_index = this;
+  }
+
+  /// Unlinks extent from the index
+  void erase(CachedExtent &extent) {
+    extent_index.erase(extent);
+    extent.parent_index = nullptr;
+  }
+
+  /// Puts to into the index at the position currently held by from
+  void replace(CachedExtent &to, CachedExtent &from) {
+    extent_index.replace_node(extent_index.s_iterator_to(from), to);
+    from.parent_index = nullptr;
+    to.parent_index = this;
+  }
+
+  bool empty() const {
+    return extent_index.empty();
+  }
+
+  /// Returns an iterator to the extent whose paddr equals offset, or end()
+  auto find_offset(paddr_t offset) {
+    return extent_index.find(offset, paddr_cmp());
+  }
+
+  auto begin() {
+    return extent_index.begin();
+  }
+
+  auto end() {
+    return extent_index.end();
+  }
+
+  /// Moves every extent of other into this index
+  void merge(ExtentIndex &&other) {
+    for (auto it = other.extent_index.begin();
+         it != other.extent_index.end();
+         ) {
+      auto &ext = *it;
+      ++it;  // step past ext before it is unlinked from other
+      other.extent_index.erase(ext);
+      const bool linked = extent_index.insert(ext).second;
+      // Re-home the back-pointer: it must not keep naming other. If an
+      // extent with the same paddr was already present, ext ends up in
+      // no index at all.
+      ext.parent_index = linked ? this : nullptr;
+    }
+  }
+
+  /// Unlinks every extent in l, a range whose elements bind to
+  /// CachedExtent &
+  template <typename T>
+  void remove(T &l) {
+    for (auto &ext : l) {
+      erase(ext);
+    }
+  }
+};
+
+class LogicalCachedExtent;
+class LBAPin;
+using LBAPinRef = std::unique_ptr<LBAPin>;
+// Abstract interface for the lba mapping attached to a
+// LogicalCachedExtent. Within this header it is used as follows:
+// set_pin() reads get_laddr() and calls link_extent() with the owning
+// extent, and on_delta_write() calls take_pin() with the prior
+// instance's pin. The precise semantics are defined by the implementers,
+// which are not visible here.
+class LBAPin {
+public:
+ virtual void link_extent(LogicalCachedExtent *ref) = 0;
+ virtual void take_pin(LBAPin &pin) = 0;
+ virtual extent_len_t get_length() const = 0;
+ virtual paddr_t get_paddr() const = 0;
+ virtual laddr_t get_laddr() const = 0;
+ // Returns a separately owned LBAPinRef
+ virtual LBAPinRef duplicate() const = 0;
+
+ virtual ~LBAPin() {}
+};
+std::ostream &operator<<(std::ostream &out, const LBAPin &rhs);
+
+using lba_pin_list_t = std::list<LBAPinRef>;
+
+std::ostream &operator<<(std::ostream &out, const lba_pin_list_t &rhs);
+
+
+/**
+ * LogicalCachedExtent
+ *
+ * CachedExtent with associated lba mapping.
+ *
+ * Users of TransactionManager should be using extents derived from
+ * LogicalCachedExtent.
+ *
+ * Implementers provide apply_delta() and may override print_detail_l()
+ * and logical_on_delta_write(); the corresponding CachedExtent virtuals
+ * are marked final here.
+ */
+class LogicalCachedExtent : public CachedExtent {
+public:
+ /// Forwards all arguments to the CachedExtent constructor
+ template <typename... T>
+ LogicalCachedExtent(T&&... t) : CachedExtent(std::forward<T>(t)...) {}
+
+ /// Takes ownership of npin (no pin may be set yet), copies its laddr
+ /// and links the pin back to this extent
+ void set_pin(LBAPinRef &&npin) {
+ assert(!pin);
+ pin = std::move(npin);
+ laddr = pin->get_laddr();
+ pin->link_extent(this);
+ }
+
+ bool has_pin() const {
+ return !!pin;
+ }
+
+ /// Returns the pin; a pin must have been set
+ LBAPin &get_pin() {
+ assert(pin);
+ return *pin;
+ }
+
+ /// Returns the logical address; must not be L_ADDR_NULL
+ laddr_t get_laddr() const {
+ assert(laddr != L_ADDR_NULL);
+ return laddr;
+ }
+
+ void set_laddr(laddr_t nladdr) {
+ laddr = nladdr;
+ }
+
+ /// Applies bl through apply_delta() and records the resulting buffer
+ /// crc as last committed; base is not used here
+ void apply_delta_and_adjust_crc(
+ paddr_t base, const ceph::bufferlist &bl) final {
+ apply_delta(bl);
+ set_last_committed_crc(get_crc32c());
+ }
+
+ bool is_logical() const final {
+ return true;
+ }
+
+ std::ostream &print_detail(std::ostream &out) const final;
+protected:
+ /// Implementer hook: mutate the buffer according to delta bl
+ virtual void apply_delta(const ceph::bufferlist &bl) = 0;
+ /// Implementer hook: extra fields appended by print_detail()
+ virtual std::ostream &print_detail_l(std::ostream &out) const {
+ return out;
+ }
+
+ /// Implementer hook invoked at the end of on_delta_write()
+ virtual void logical_on_delta_write() {}
+
+ /// Hands the prior instance's pin to this extent's pin via take_pin(),
+ /// then calls logical_on_delta_write(); record_block_offset is not
+ /// used here.
+ /// NOTE(review): both pins are dereferenced without a null check --
+ /// presumably both extents always have a pin at this point; confirm.
+ void on_delta_write(paddr_t record_block_offset) final {
+ assert(get_prior_instance());
+ pin->take_pin(*(get_prior_instance()->cast<LogicalCachedExtent>()->pin));
+ logical_on_delta_write();
+ }
+
+private:
+ laddr_t laddr = L_ADDR_NULL;
+ LBAPinRef pin;
+};
+
+using LogicalCachedExtentRef = TCachedExtentRef<LogicalCachedExtent>;
+/// Compare logical extent refs by laddr
+///
+/// Declares is_transparent and provides ref/ref, laddr/ref and ref/laddr
+/// overloads, so an ordered container using it (see lextent_set_t) can be
+/// searched with a bare laddr_t. Each ref is read through get_laddr(),
+/// which asserts that the laddr is not L_ADDR_NULL.
+struct ref_laddr_cmp {
+ using is_transparent = laddr_t;
+ bool operator()(const LogicalCachedExtentRef &lhs,
+ const LogicalCachedExtentRef &rhs) const {
+ return lhs->get_laddr() < rhs->get_laddr();
+ }
+ bool operator()(const laddr_t &lhs,
+ const LogicalCachedExtentRef &rhs) const {
+ return lhs < rhs->get_laddr();
+ }
+ bool operator()(const LogicalCachedExtentRef &lhs,
+ const laddr_t &rhs) const {
+ return lhs->get_laddr() < rhs;
+ }
+};
+
+using lextent_set_t = addr_extent_set_base_t<
+ laddr_t,
+ LogicalCachedExtentRef,
+ ref_laddr_cmp
+ >;
+
+template <typename T>
+using lextent_list_t = addr_extent_list_base_t<
+ laddr_t, TCachedExtentRef<T>>;
+
+}
diff --git a/src/crimson/os/seastore/extentmap_manager.cc b/src/crimson/os/seastore/extentmap_manager.cc
new file mode 100644
index 000000000..32de3a6ed
--- /dev/null
+++ b/src/crimson/os/seastore/extentmap_manager.cc
@@ -0,0 +1,32 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include <experimental/iterator>
+#include <iostream>
+
+#include "crimson/os/seastore/transaction_manager.h"
+#include "crimson/os/seastore/extentmap_manager.h"
+#include "crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.h"
+namespace crimson::os::seastore::extentmap_manager {
+
+ExtentMapManagerRef create_extentmap_manager(TransactionManager &trans_manager) {
+  // Builds the btree-backed implementation and hands ownership to the caller.
+  return std::make_unique<BtreeExtentMapManager>(trans_manager);
+}
+
+}
+
+namespace crimson::os::seastore {
+
+std::ostream &operator<<(std::ostream &out, const extent_mapping_t &rhs)
+{
+  // Rendered as: extent_mapping_t (<offset>~<length>-><laddr>)
+  out << "extent_mapping_t (";
+  out << rhs.logical_offset << "~" << rhs.length;
+  out << "->" << rhs.laddr << ")";
+  return out;
+}
+
+std::ostream &operator<<(std::ostream &out, const extent_map_list_t &rhs)
+{
+  // Prints the mappings as a bracketed list separated by ", ".
+  out << '[';
+  bool first = true;
+  for (const auto &mapping : rhs) {
+    if (!first) {
+      out << ", ";
+    }
+    out << mapping;
+    first = false;
+  }
+  return out << ']';
+}
+
+}
diff --git a/src/crimson/os/seastore/extentmap_manager.h b/src/crimson/os/seastore/extentmap_manager.h
new file mode 100644
index 000000000..7d5223b94
--- /dev/null
+++ b/src/crimson/os/seastore/extentmap_manager.h
@@ -0,0 +1,124 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iosfwd>
+#include <list>
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include <seastar/core/future.hh>
+
+#include "crimson/osd/exceptions.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/transaction_manager.h"
+
+#define PAGE_SIZE 4096
+#define EXTMAP_BLOCK_SIZE 4096
+
+namespace crimson::os::seastore {
+
+struct lext_map_val_t { // value half of an extent-map entry: a logical address plus a length
+ laddr_t laddr;
+ extent_len_t length = 0;
+
+ lext_map_val_t(
+ laddr_t laddr,
+ extent_len_t length)
+ : laddr(laddr), length(length) {} // both fields are always supplied by the caller
+
+};
+
+/**
+ * extent_mapping_t
+ *
+ * One extent-map entry: `length` bytes starting at object offset
+ * `logical_offset` are mapped to the logical address `laddr`.
+ */
+class extent_mapping_t
+{
+public:
+  objaddr_t logical_offset = 0;  // offset in object
+  // lextent start address aligned with block size.  Defaults to L_ADDR_NULL
+  // so an entry built from an offset alone never exposes an indeterminate
+  // address (operator<< reads this field unconditionally).
+  laddr_t laddr = L_ADDR_NULL;
+  extent_len_t length = 0;
+
+  // Entry that only carries an offset; laddr and length keep their defaults.
+  explicit extent_mapping_t(objaddr_t lo) : logical_offset(lo) { }
+
+  extent_mapping_t(
+    objaddr_t lo,
+    laddr_t laddr,
+    extent_len_t length)
+    : logical_offset(lo), laddr(laddr), length(length) {}
+
+  ~extent_mapping_t() {}
+};
+
+enum class extmap_root_state_t : uint8_t { // bookkeeping state carried in extmap_root_t
+ INITIAL = 0, // value assigned by the extmap_root_t constructor
+ MUTATED = 1, // set by BtreeExtentMapManager::insert_lextent when the root node is replaced
+ NONE = 0xFF // NOTE(review): not used in the visible code — presumably "no root"; confirm
+};
+
+using extent_map_list_t = std::list<extent_mapping_t>;
+std::ostream &operator<<(std::ostream &out, const extent_mapping_t &rhs);
+std::ostream &operator<<(std::ostream &out, const extent_map_list_t &rhs);
+
+struct extmap_root_t {
+  depth_t depth = 0;
+  extmap_root_state_t state;
+  laddr_t extmap_root_laddr;
+
+  // Describes a tree whose root node lives at laddr and which is dep levels
+  // deep; a newly built descriptor always starts in the INITIAL state.
+  extmap_root_t(depth_t dep, laddr_t laddr)
+    : depth(dep),
+      state(extmap_root_state_t::INITIAL),
+      extmap_root_laddr(laddr) {}
+};
+
+/**
+ * Abstract interface for managing the mapping from offsets inside an object
+ * to logical addresses.  Each onode has its own extentmap tree, described by
+ * the extmap_root_t passed to every operation.
+ */
+class ExtentMapManager {
+public:
+ using initialize_extmap_ertr = TransactionManager::alloc_extent_ertr;
+ using initialize_extmap_ret = initialize_extmap_ertr::future<extmap_root_t>;
+ virtual initialize_extmap_ret initialize_extmap(Transaction &t) = 0; // creates a new extent map and yields its root descriptor
+
+ /* find_lextent
+ *
+ * Return a list of all extent_mapping_t overlapping any portion of lo~len.
+ * If no mapping overlaps the range, the next extent after the range is
+ * returned instead.
+ */
+ using find_lextent_ertr = TransactionManager::read_extent_ertr;
+ using find_lextent_ret = find_lextent_ertr::future<extent_map_list_t>;
+ virtual find_lextent_ret
+ find_lextent(const extmap_root_t &extmap_root, Transaction &t, objaddr_t lo, extent_len_t len) = 0;
+
+ /* add_lextent
+ *
+ * Add a new mapping (object offset -> laddr, length) to the extent map.
+ * Returns the added extent_mapping_t.  extmap_root is taken by non-const
+ * reference because an implementation may update it.
+ */
+ using add_lextent_ertr = TransactionManager::read_extent_ertr;
+ using add_lextent_ret = add_lextent_ertr::future<extent_mapping_t>;
+ virtual add_lextent_ret
+ add_lextent(extmap_root_t &extmap_root, Transaction &t, objaddr_t lo, lext_map_val_t val) = 0;
+
+ /* rm_lextent
+ *
+ * Remove an existing extent mapping from the extent map.
+ * Returns true if the extent mapping was removed, otherwise false.
+ */
+ using rm_lextent_ertr = TransactionManager::read_extent_ertr;
+ using rm_lextent_ret = rm_lextent_ertr::future<bool>;
+ virtual rm_lextent_ret rm_lextent(extmap_root_t &extmap_root, Transaction &t, objaddr_t lo, lext_map_val_t val) = 0;
+
+ virtual ~ExtentMapManager() {}
+};
+using ExtentMapManagerRef = std::unique_ptr<ExtentMapManager>;
+
+namespace extentmap_manager {
+/* Create an ExtentMapManager for an extentmap.
+ * For a new extmap, call initialize_extmap after create_extentmap_manager
+ * to initialize the extent map before using it.
+ * For an existing extmap, initialize_extmap is not needed.
+ */
+ExtentMapManagerRef create_extentmap_manager(
+ TransactionManager &trans_manager);
+
+}
+
+}
diff --git a/src/crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.cc b/src/crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.cc
new file mode 100644
index 000000000..f7609d3e8
--- /dev/null
+++ b/src/crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.cc
@@ -0,0 +1,118 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include <sys/mman.h>
+#include <string.h>
+
+#include "crimson/common/log.h"
+
+#include "include/buffer.h"
+#include "crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.h"
+#include "crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.h"
+
+namespace {
+ seastar::logger& logger() { // file-local shortcut to the logger used throughout this translation unit
+ return crimson::get_logger(ceph_subsys_filestore); // NOTE(review): filestore subsystem is used for seastore code — confirm this is intended
+ }
+}
+
+namespace crimson::os::seastore::extentmap_manager {
+
+BtreeExtentMapManager::BtreeExtentMapManager(
+ TransactionManager &tm)
+ : tm(tm) {} // only binds the reference; the TransactionManager must outlive this object
+
+BtreeExtentMapManager::initialize_extmap_ret
+BtreeExtentMapManager::initialize_extmap(Transaction &t)
+{
+  logger().debug("{}", __func__);
+  // A fresh tree consists of one empty leaf node, i.e. depth 1.
+  return tm.alloc_extent<ExtMapLeafNode>(
+    t, L_ADDR_MIN, EXTMAP_BLOCK_SIZE
+  ).safe_then([](auto&& leaf) {
+    leaf->set_size(0);
+    leaf->set_meta(extmap_node_meta_t{1});
+    return initialize_extmap_ertr::make_ready_future<extmap_root_t>(
+      extmap_root_t(1, leaf->get_laddr()));
+  });
+}
+
+BtreeExtentMapManager::get_root_ret
+BtreeExtentMapManager::get_extmap_root(const extmap_root_t &extmap_root, Transaction &t)
+{
+  // The descriptor must reference a real root node; its depth decides which
+  // node type extmap_load_extent reads.
+  const laddr_t root_laddr = extmap_root.extmap_root_laddr;
+  assert(root_laddr != L_ADDR_NULL);
+  return extmap_load_extent(get_ext_context(t), root_laddr, extmap_root.depth);
+}
+
+/**
+ * find_lextent
+ *
+ * Loads the root node described by extmap_root and forwards the lookup for
+ * the range lo~len to it.  The resulting mapping list is logged and passed
+ * through unchanged.  `t` is captured by reference and must stay alive until
+ * the returned future resolves.
+ */
+BtreeExtentMapManager::find_lextent_ret
+BtreeExtentMapManager::find_lextent(const extmap_root_t &extmap_root, Transaction &t,
+                                    objaddr_t lo, extent_len_t len)
+{
+  logger().debug("{}: {}, {}", __func__, lo, len);
+  return get_extmap_root(extmap_root, t).safe_then([this, &t, lo, len](auto&& extent) {
+    return extent->find_lextent(get_ext_context(t), lo, len);
+  }).safe_then([](auto &&e) {
+    // __func__ inside a lambda names the closure's operator(), so the
+    // function name is spelled out to keep the log line attributable.
+    logger().debug("{}: found_lextent {}", "find_lextent", e);
+    return find_lextent_ret(
+      find_lextent_ertr::ready_future_marker{},
+      std::move(e));
+  });
+}
+
+/**
+ * add_lextent
+ *
+ * Loads the root node and inserts the mapping lo -> (val.laddr, val.length)
+ * through insert_lextent, which may replace the root and update extmap_root.
+ * Returns the inserted mapping.  Both `extmap_root` and `t` are captured by
+ * reference and must stay alive until the returned future resolves.
+ */
+BtreeExtentMapManager::add_lextent_ret
+BtreeExtentMapManager::add_lextent(extmap_root_t &extmap_root, Transaction &t,
+                                   objaddr_t lo, lext_map_val_t val)
+{
+  logger().debug("{}: {}, {}, {}", __func__, lo, val.laddr, val.length);
+  return get_extmap_root(extmap_root, t).safe_then([this, &extmap_root, &t, lo, val](auto &&root) {
+    return insert_lextent(extmap_root, t, root, lo, val);
+  }).safe_then([](auto ret) {
+    // __func__ inside a lambda names the closure's operator(), so the
+    // function name is spelled out to keep the log line attributable.
+    logger().debug("{}: {}", "add_lextent", ret);
+    return add_lextent_ret(
+      add_lextent_ertr::ready_future_marker{},
+      std::move(ret));
+  });
+}
+
+BtreeExtentMapManager::insert_lextent_ret
+BtreeExtentMapManager::insert_lextent(extmap_root_t &extmap_root, Transaction &t,
+ ExtMapNodeRef root, objaddr_t logical_offset, lext_map_val_t val)
+{
+ auto split = insert_lextent_ertr::make_ready_future<ExtMapNodeRef>(root); // default: insert straight into the current root
+ if (root->at_max_capacity()) { // a full root cannot take another entry: grow the tree by one level first
+ logger().debug("{}::splitting root {}", __func__, *root);
+ split = root->extmap_alloc_extent<ExtMapInnerNode>(get_ext_context(t), EXTMAP_BLOCK_SIZE)
+ .safe_then([this, &extmap_root, root, &t, logical_offset](auto&& nroot) {
+ extmap_node_meta_t meta{root->get_node_meta().depth + 1};
+ nroot->set_meta(meta);
+ nroot->journal_insert(nroot->begin(), OBJ_ADDR_MIN, // the old root becomes the new root's first entry
+ root->get_laddr(), nullptr);
+ extmap_root.extmap_root_laddr = nroot->get_laddr(); // publish the new root through the caller's descriptor
+ extmap_root.depth = root->get_node_meta().depth + 1;
+ extmap_root.state = extmap_root_state_t::MUTATED; // records that the descriptor was changed
+ return nroot->split_entry(get_ext_context(t), logical_offset, nroot->begin(), root); // split the old root underneath the new one
+ });
+ }
+ return split.safe_then([this, &t, logical_offset, val](ExtMapNodeRef node) { // node: original root, or the node split_entry selected
+ return node->insert(get_ext_context(t), logical_offset, val);
+ });
+}
+
+/**
+ * rm_lextent
+ *
+ * Loads the root node and asks it to remove the mapping lo -> val.
+ * Resolves to the boolean reported by the node.  `t` is captured by
+ * reference and must stay alive until the returned future resolves.
+ */
+BtreeExtentMapManager::rm_lextent_ret
+BtreeExtentMapManager::rm_lextent(extmap_root_t &extmap_root, Transaction &t, objaddr_t lo, lext_map_val_t val)
+{
+  logger().debug("{}: {}, {}, {}", __func__, lo, val.laddr, val.length);
+  return get_extmap_root(extmap_root, t).safe_then([this, &t, lo, val](auto extent) {
+    return extent->rm_lextent(get_ext_context(t), lo, val);
+  }).safe_then([](auto removed) {
+    // __func__ inside a lambda names the closure's operator(), so the
+    // function name is spelled out to keep the log line attributable.
+    logger().debug("{}: {}", "rm_lextent", removed);
+    return rm_lextent_ret(
+      rm_lextent_ertr::ready_future_marker{},
+      removed);
+  });
+}
+
+
+}
diff --git a/src/crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.h b/src/crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.h
new file mode 100644
index 000000000..db676f41d
--- /dev/null
+++ b/src/crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.h
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <seastar/core/future.hh>
+
+#include "include/ceph_assert.h"
+#include "crimson/osd/exceptions.h"
+
+#include "crimson/os/seastore/extentmap_manager.h"
+#include "crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/transaction_manager.h"
+
+namespace crimson::os::seastore::extentmap_manager {
+/**
+ * BtreeExtentMapManager
+ *
+ * Uses a btree to track the objaddr_t -> laddr_t mapping of each onode
+ * extentmap.
+ */
+
+class BtreeExtentMapManager : public ExtentMapManager {
+ TransactionManager &tm; // non-owning; supplied at construction
+
+ ext_context_t get_ext_context(Transaction &t) { // bundles tm with the caller's transaction for node-level calls
+ return ext_context_t{tm,t};
+ }
+
+ /* get_extmap_root
+ *
+ * Load the root node of the extent map tree described by extmap_root.
+ */
+ using get_root_ertr = TransactionManager::read_extent_ertr;
+ using get_root_ret = get_root_ertr::future<ExtMapNodeRef>;
+ get_root_ret get_extmap_root(const extmap_root_t &extmap_root, Transaction &t);
+
+ using insert_lextent_ertr = TransactionManager::read_extent_ertr;
+ using insert_lextent_ret = insert_lextent_ertr::future<extent_mapping_t >;
+ insert_lextent_ret insert_lextent(extmap_root_t &extmap_root, Transaction &t, // may update extmap_root when the root node is split
+ ExtMapNodeRef extent, objaddr_t lo,
+ lext_map_val_t val);
+
+public:
+ explicit BtreeExtentMapManager(TransactionManager &tm);
+
+ initialize_extmap_ret initialize_extmap(Transaction &t) final;
+
+ find_lextent_ret find_lextent(const extmap_root_t &extmap_root, Transaction &t, objaddr_t lo, extent_len_t len) final;
+
+ add_lextent_ret add_lextent(extmap_root_t &extmap_root, Transaction &t, objaddr_t lo, lext_map_val_t val) final;
+
+ rm_lextent_ret rm_lextent(extmap_root_t &extmap_root, Transaction &t, objaddr_t lo, lext_map_val_t val) final;
+
+
+};
+using BtreeExtentMapManagerRef = std::unique_ptr<BtreeExtentMapManager>;
+
+}
diff --git a/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node.h b/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node.h
new file mode 100644
index 000000000..3937bd049
--- /dev/null
+++ b/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node.h
@@ -0,0 +1,145 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+
+#pragma once
+
+#include <boost/iterator/counting_iterator.hpp>
+
+#include "crimson/common/log.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/transaction_manager.h"
+#include "crimson/os/seastore/extentmap_manager.h"
+
+namespace crimson::os::seastore::extentmap_manager{
+
+struct ext_context_t { // pair of references handed to every node operation; holds no ownership
+ TransactionManager &tm;
+ Transaction &t;
+};
+
+/**
+ * extmap_node_meta_t
+ *
+ * Per-node metadata of the extent-map btree; currently only the node depth.
+ * split_into / merge_from / rebalance produce the metadata of the nodes that
+ * result from the corresponding structural operation.  Depth is carried over
+ * unchanged in all three, so the pivot key is accepted but not consulted.
+ */
+struct extmap_node_meta_t {
+  depth_t depth = 0;
+
+  // Metadata for the two halves of a split; both keep this node's depth.
+  std::pair<extmap_node_meta_t, extmap_node_meta_t> split_into(objaddr_t pivot) const {
+    return std::make_pair(
+      extmap_node_meta_t{depth},
+      extmap_node_meta_t{depth});
+  }
+
+  // Metadata for the node replacing lhs and rhs, which must be siblings
+  // (same depth).
+  static extmap_node_meta_t merge_from(
+    const extmap_node_meta_t &lhs, const extmap_node_meta_t &rhs) {
+    assert(lhs.depth == rhs.depth);
+    return extmap_node_meta_t{lhs.depth};
+  }
+
+  // Metadata for the two nodes replacing lhs and rhs after a rebalance.
+  // The pivot is a key of the tree and therefore an objaddr_t, matching
+  // split_into().
+  static std::pair<extmap_node_meta_t, extmap_node_meta_t>
+  rebalance(const extmap_node_meta_t &lhs, const extmap_node_meta_t &rhs, objaddr_t pivot) {
+    assert(lhs.depth == rhs.depth);
+    return std::make_pair(
+      extmap_node_meta_t{lhs.depth},
+      extmap_node_meta_t{lhs.depth});
+  }
+};
+
+struct ExtMapNode : LogicalCachedExtent { // common interface of inner and leaf nodes of the extent-map btree
+ using ExtMapNodeRef = TCachedExtentRef<ExtMapNode>;
+
+ ExtMapNode(ceph::bufferptr &&ptr) : LogicalCachedExtent(std::move(ptr)) {}
+ ExtMapNode(const ExtMapNode &other)
+ : LogicalCachedExtent(other) {}
+
+ using find_lextent_ertr = ExtentMapManager::find_lextent_ertr;
+ using find_lextent_ret = ExtentMapManager::find_lextent_ret;
+ virtual find_lextent_ret find_lextent(ext_context_t ec, // look up mappings for the range lo~len in this subtree
+ objaddr_t lo, extent_len_t len) = 0;
+
+ using insert_ertr = TransactionManager::read_extent_ertr;
+ using insert_ret = insert_ertr::future<extent_mapping_t>;
+ virtual insert_ret insert(ext_context_t ec, objaddr_t lo, lext_map_val_t val) = 0; // add lo -> val; resolves to the inserted mapping
+
+ using rm_lextent_ertr = TransactionManager::read_extent_ertr;
+ using rm_lextent_ret = rm_lextent_ertr::future<bool>;
+ virtual rm_lextent_ret rm_lextent(ext_context_t ec, objaddr_t lo, lext_map_val_t val) = 0; // resolves to whether a mapping was removed
+
+ using split_children_ertr = TransactionManager::alloc_extent_ertr;
+ using split_children_ret = split_children_ertr::future
+ <std::tuple<ExtMapNodeRef, ExtMapNodeRef, uint32_t>>; // (left, right, pivot key)
+ virtual split_children_ret make_split_children(ext_context_t ec) = 0;
+
+ using full_merge_ertr = TransactionManager::alloc_extent_ertr;
+ using full_merge_ret = full_merge_ertr::future<ExtMapNodeRef>;
+ virtual full_merge_ret make_full_merge(ext_context_t ec, ExtMapNodeRef right) = 0; // one new node replacing *this and right
+
+ using make_balanced_ertr = TransactionManager::alloc_extent_ertr;
+ using make_balanced_ret = make_balanced_ertr::future
+ <std::tuple<ExtMapNodeRef, ExtMapNodeRef, uint32_t>>; // (new left, new right, pivot key)
+ virtual make_balanced_ret
+ make_balanced(ext_context_t ec, ExtMapNodeRef right, bool prefer_left) = 0;
+
+ virtual extmap_node_meta_t get_node_meta() const = 0;
+
+ virtual bool at_max_capacity() const = 0;
+ virtual bool at_min_capacity() const = 0;
+ virtual unsigned get_node_size() const = 0;
+ virtual ~ExtMapNode() = default;
+
+ using alloc_ertr = TransactionManager::alloc_extent_ertr;
+ template<class T>
+ alloc_ertr::future<TCachedExtentRef<T>>
+ extmap_alloc_extent(ext_context_t ec, extent_len_t len) { // allocate one extent of node type T and length len within ec.t
+ return ec.tm.alloc_extent<T>(ec.t, L_ADDR_MIN, len).safe_then(
+ [](auto&& extent) {
+ return alloc_ertr::make_ready_future<TCachedExtentRef<T>>(std::move(extent));
+ });
+ }
+
+ template<class T>
+ alloc_ertr::future<std::pair<TCachedExtentRef<T>, TCachedExtentRef<T>>>
+ extmap_alloc_2extents(ext_context_t ec, extent_len_t len) { // allocate two extents of type T, one after the other
+ return seastar::do_with(std::pair<TCachedExtentRef<T>, TCachedExtentRef<T>>(), // do_with keeps the pair alive across the continuations
+ [ec, len] (auto &extents) {
+ return crimson::do_for_each(boost::make_counting_iterator(0),
+ boost::make_counting_iterator(2),
+ [ec, len, &extents] (auto i) {
+ return ec.tm.alloc_extent<T>(ec.t, L_ADDR_MIN, len).safe_then(
+ [i, &extents](auto &&node) {
+ if (i == 0) // first allocation fills .first
+ extents.first = node;
+ if (i == 1) // second allocation fills .second
+ extents.second = node;
+ });
+ }).safe_then([&extents] {
+ return alloc_ertr::make_ready_future
+ <std::pair<TCachedExtentRef<T>, TCachedExtentRef<T>>>(std::move(extents));
+ });
+ });
+ }
+
+ using retire_ertr = crimson::errorator<
+ crimson::ct_error::enoent,
+ crimson::ct_error::input_output_error>;
+ using retire_ret = retire_ertr::future<std::list<unsigned>>;
+ retire_ret
+ extmap_retire_node(ext_context_t ec, std::list<laddr_t> dec_laddrs) { // dec_ref every address in order; resolves to the values dec_ref reported
+ return seastar::do_with(std::move(dec_laddrs), std::list<unsigned>(),
+ [ec] (auto &&dec_laddrs, auto &refcnt) {
+ return crimson::do_for_each(dec_laddrs.begin(), dec_laddrs.end(),
+ [ec, &refcnt] (auto &laddr) {
+ return ec.tm.dec_ref(ec.t, laddr).safe_then([&refcnt] (auto ref) {
+ refcnt.push_back(ref);
+ });
+ }).safe_then([&refcnt] {
+ return retire_ertr::make_ready_future<std::list<unsigned>>(std::move(refcnt));
+ });
+ });
+ }
+
+};
+
+using ExtMapNodeRef = ExtMapNode::ExtMapNodeRef;
+
+TransactionManager::read_extent_ertr::future<ExtMapNodeRef>
+extmap_load_extent(ext_context_t ec, laddr_t laddr, depth_t depth);
+
+}
diff --git a/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.cc b/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.cc
new file mode 100644
index 000000000..7bf8680a5
--- /dev/null
+++ b/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.cc
@@ -0,0 +1,373 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <sys/mman.h>
+#include <string.h>
+
+#include <memory>
+#include <string.h>
+
+#include "include/buffer.h"
+#include "include/byteorder.h"
+#include "crimson/os/seastore/transaction_manager.h"
+#include "crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node.h"
+#include "crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.h"
+
+namespace {
+ seastar::logger& logger() { // file-local shortcut to the logger used throughout this translation unit
+ return crimson::get_logger(ceph_subsys_filestore); // NOTE(review): filestore subsystem is used for seastore code — confirm this is intended
+ }
+}
+
+namespace crimson::os::seastore::extentmap_manager {
+
+std::ostream &ExtMapInnerNode::print_detail_l(std::ostream &out) const
+{
+  // Appends the entry count and the depth stored in this node's metadata.
+  out << ", size=" << get_size();
+  out << ", depth=" << get_meta().depth;
+  return out;
+}
+
+ExtMapInnerNode::find_lextent_ret
+ExtMapInnerNode::find_lextent(ext_context_t ec, objaddr_t lo, extent_len_t len)
+{
+ auto [begin, end] = bound(lo, lo + len); // child entries selected by bound() for the requested range
+ auto result_up = std::make_unique<extent_map_list_t>(); // heap-allocated so the references captured below stay valid across continuations
+ auto &result = *result_up;
+ return crimson::do_for_each(
+ std::move(begin),
+ std::move(end),
+ [this, ec, &result, lo, len](const auto &val) mutable {
+ return extmap_load_extent(ec, val.get_val(), get_meta().depth - 1).safe_then( // children sit one level below this node
+ [ec, &result, lo, len](auto extent) mutable {
+ return extent->find_lextent(ec, lo, len).safe_then(
+ [&result](auto item_list) mutable {
+ result.splice(result.end(), item_list, // append this child's mappings in visit order
+ item_list.begin(), item_list.end());
+ });
+ });
+ }).safe_then([result=std::move(result_up)] { // ownership of the list moves into the final continuation
+ return find_lextent_ret(
+ find_lextent_ertr::ready_future_marker{},
+ std::move(*result));
+ });
+}
+
+ExtMapInnerNode::insert_ret
+ExtMapInnerNode::insert(ext_context_t ec, objaddr_t lo, lext_map_val_t val)
+{
+ auto insertion_pt = get_containing_child(lo); // entry of the child responsible for lo
+ assert(insertion_pt != end());
+ return extmap_load_extent(ec, insertion_pt->get_val(), get_meta().depth - 1).safe_then(
+ [this, ec, insertion_pt, lo, val=std::move(val)](auto extent) mutable {
+ return extent->at_max_capacity() ? // a full child is split first; split_entry yields the half that should receive lo
+ split_entry(ec, lo, insertion_pt, extent) :
+ insert_ertr::make_ready_future<ExtMapNodeRef>(std::move(extent));
+ }).safe_then([ec, lo, val=std::move(val)](ExtMapNodeRef extent) mutable {
+ return extent->insert(ec, lo, val); // recurse into the chosen child
+ });
+}
+
+ExtMapInnerNode::rm_lextent_ret
+ExtMapInnerNode::rm_lextent(ext_context_t ec, objaddr_t lo, lext_map_val_t val)
+{
+ auto rm_pt = get_containing_child(lo); // entry of the child responsible for lo
+ return extmap_load_extent(ec, rm_pt->get_val(), get_meta().depth - 1).safe_then(
+ [this, ec, rm_pt, lo, val=std::move(val)](auto extent) mutable {
+ if (extent->at_min_capacity() && get_node_size() > 1) { // merge_entry needs a neighbouring entry, hence more than one entry
+ return merge_entry(ec, lo, rm_pt, extent);
+ } else {
+ return merge_entry_ertr::make_ready_future<ExtMapNodeRef>(std::move(extent));
+ }
+ }).safe_then([ec, lo, val](ExtMapNodeRef extent) mutable {
+ return extent->rm_lextent(ec, lo, val); // recurse into the (possibly replaced) child
+ });
+}
+
+ExtMapInnerNode::split_children_ret
+ExtMapInnerNode::make_split_children(ext_context_t ec)
+{
+ logger().debug("{}: {}", "ExtMapInnerNode", __func__);
+ return extmap_alloc_2extents<ExtMapInnerNode>(ec, EXTMAP_BLOCK_SIZE) // two fresh inner nodes to receive this node's content
+ .safe_then([this] (auto &&ext_pair) {
+ auto [left, right] = ext_pair;
+ return split_children_ret(
+ split_children_ertr::ready_future_marker{},
+ std::make_tuple(left, right, split_into(*left, *right))); // third element: value returned by split_into, used by split_entry as right's key
+ });
+}
+
+ExtMapInnerNode::full_merge_ret
+ExtMapInnerNode::make_full_merge(ext_context_t ec, ExtMapNodeRef right)
+{
+ logger().debug("{}: {}", "ExtMapInnerNode", __func__);
+ return extmap_alloc_extent<ExtMapInnerNode>(ec, EXTMAP_BLOCK_SIZE) // one fresh inner node to replace both inputs
+ .safe_then([this, right] (auto &&replacement) {
+ replacement->merge_from(*this, *right->cast<ExtMapInnerNode>()); // *this is the left input, right the right input
+ return full_merge_ret(
+ full_merge_ertr::ready_future_marker{},
+ std::move(replacement));
+ });
+}
+
+ExtMapInnerNode::make_balanced_ret
+ExtMapInnerNode::make_balanced(ext_context_t ec, ExtMapNodeRef _right, bool prefer_left)
+{
+ logger().debug("{}: {}", "ExtMapInnerNode", __func__);
+ ceph_assert(_right->get_type() == type); // both inputs must be inner nodes
+ return extmap_alloc_2extents<ExtMapInnerNode>(ec, EXTMAP_BLOCK_SIZE) // two fresh nodes receive the rebalanced content
+ .safe_then([this, _right, prefer_left] (auto &&replacement_pair){
+ auto [replacement_left, replacement_right] = replacement_pair;
+ auto &right = *_right->cast<ExtMapInnerNode>();
+ return make_balanced_ret(
+ make_balanced_ertr::ready_future_marker{},
+ std::make_tuple(replacement_left, replacement_right,
+ balance_into_new_nodes(*this, right, prefer_left, // its return value becomes the tuple's pivot element
+ *replacement_left, *replacement_right)));
+ });
+}
+
+ExtMapInnerNode::split_entry_ret
+ExtMapInnerNode::split_entry(ext_context_t ec, objaddr_t lo,
+ internal_iterator_t iter, ExtMapNodeRef entry)
+{
+ logger().debug("{}: {}", "ExtMapInnerNode", __func__);
+ if (!is_pending()) { // not yet writable in this transaction: redo the call on the mutable extent
+ auto mut = ec.tm.get_mutable_extent(ec.t, this)->cast<ExtMapInnerNode>();
+ auto mut_iter = mut->iter_idx(iter->get_offset()); // same slot, re-derived inside the mutable extent
+ return mut->split_entry(ec, lo, mut_iter, entry);
+ }
+ ceph_assert(!at_max_capacity()); // this node gains one entry below
+ return entry->make_split_children(ec)
+ .safe_then([this, ec, lo, iter, entry] (auto tuple){
+ auto [left, right, pivot] = tuple;
+ journal_update(iter, left->get_laddr(), maybe_get_delta_buffer()); // entry's slot now refers to left
+ journal_insert(iter + 1, pivot, right->get_laddr(), maybe_get_delta_buffer()); // right goes directly after it, keyed by pivot
+ logger().debug(
+ "ExtMapInnerNode::split_entry *this {} entry {} into left {} right {}",
+ *this, *entry, *left, *right);
+ //retire extent
+ return ec.tm.dec_ref(ec.t, entry->get_laddr()) // drop the reference on the node that was split
+ .safe_then([lo, left = left, right = right, pivot = pivot] (auto ret) {
+ return split_entry_ertr::make_ready_future<ExtMapNodeRef>(
+ pivot > lo ? left : right); // hand back the half on whose side of the pivot lo falls
+ });
+ });
+}
+
+ExtMapInnerNode::merge_entry_ret
+ExtMapInnerNode::merge_entry(ext_context_t ec, objaddr_t lo,
+ internal_iterator_t iter, ExtMapNodeRef entry)
+{
+ if (!is_pending()) { // not yet writable in this transaction: redo the call on the mutable extent
+ auto mut = ec.tm.get_mutable_extent(ec.t, this)->cast<ExtMapInnerNode>();
+ auto mut_iter = mut->iter_idx(iter->get_offset()); // same slot, re-derived inside the mutable extent
+ return mut->merge_entry(ec, lo, mut_iter, entry);
+ }
+ logger().debug("ExtMapInnerNode: merge_entry: {}, {}", *this, *entry);
+ auto is_left = (iter + 1) == end(); // entry is the last one, so the donor must be its left neighbour
+ auto donor_iter = is_left ? iter - 1 : iter + 1; // otherwise the right neighbour donates
+ return extmap_load_extent(ec, donor_iter->get_val(), get_meta().depth - 1)
+ .safe_then([this, ec, lo, iter, entry, donor_iter, is_left]
+ (auto &&donor) mutable {
+ auto [l, r] = is_left ? // order the two nodes left-to-right
+ std::make_pair(donor, entry) : std::make_pair(entry, donor);
+ auto [liter, riter] = is_left ?
+ std::make_pair(donor_iter, iter) : std::make_pair(iter, donor_iter);
+ if (donor->at_min_capacity()) { // donor has nothing to spare: fold both nodes into one
+ return l->make_full_merge(ec, r)
+ .safe_then([this, ec, entry, l = l, r = r, liter = liter, riter = riter]
+ (auto &&replacement){
+ journal_update(liter, replacement->get_laddr(), maybe_get_delta_buffer()); // left slot now refers to the merged node
+ journal_remove(riter, maybe_get_delta_buffer()); // right slot disappears
+ //retire extent
+ std::list<laddr_t> dec_laddrs;
+ dec_laddrs.push_back(l->get_laddr());
+ dec_laddrs.push_back(r->get_laddr());
+ return extmap_retire_node(ec, dec_laddrs)
+ .safe_then([replacement] (auto &&ret) {
+ return merge_entry_ertr::make_ready_future<ExtMapNodeRef>(replacement);
+ });
+ });
+ } else { // donor can spare entries: redistribute between two fresh nodes
+ logger().debug("ExtMapInnerNode::merge_entry balanced l {} r {}",
+ *l, *r);
+ return l->make_balanced(ec, r, !is_left)
+ .safe_then([this, ec, lo, entry, l = l, r = r, liter = liter, riter = riter]
+ (auto tuple) {
+ auto [replacement_l, replacement_r, pivot] = tuple;
+ journal_update(liter, replacement_l->get_laddr(), maybe_get_delta_buffer());
+ journal_replace(riter, pivot, replacement_r->get_laddr(), // right slot gets both a new key (pivot) and a new address
+ maybe_get_delta_buffer());
+ // retire extent
+ std::list<laddr_t> dec_laddrs;
+ dec_laddrs.push_back(l->get_laddr());
+ dec_laddrs.push_back(r->get_laddr());
+ return extmap_retire_node(ec, dec_laddrs)
+ .safe_then([lo, pivot = pivot, replacement_l = replacement_l, replacement_r = replacement_r]
+ (auto &&ret) {
+ return merge_entry_ertr::make_ready_future<ExtMapNodeRef>(
+ lo >= pivot ? replacement_r : replacement_l); // hand back the node on whose side of the pivot lo falls
+ });
+ });
+ }
+ });
+}
+
+
+ExtMapInnerNode::internal_iterator_t
+ExtMapInnerNode::get_containing_child(objaddr_t lo)
+{
+ // TODO: binary search
+ for (auto i = begin(); i != end(); ++i) { // linear scan for the first entry whose range contains lo
+ if (i.contains(lo))
+ return i;
+ }
+ ceph_assert(0 == "invalid"); // reaching this point means no entry contains lo
+ return end();
+}
+
+std::ostream &ExtMapLeafNode::print_detail_l(std::ostream &out) const
+{
+  // Appends the entry count and the depth stored in this node's metadata.
+  out << ", size=" << get_size();
+  out << ", depth=" << get_meta().depth;
+  return out;
+}
+
+ExtMapLeafNode::find_lextent_ret
+ExtMapLeafNode::find_lextent(ext_context_t ec, objaddr_t lo, extent_len_t len)
+{
+  logger().debug(
+    "ExtMapLeafNode::find_lextent {}~{}", lo, len);
+  extent_map_list_t mappings;
+  auto [cur, last] = get_leaf_entries(lo, len);
+  // An empty range that does not sit at the end of the node is widened by
+  // one entry, so the entry following the requested range is reported.
+  if (cur == last && last != end()) {
+    ++last;
+  }
+  while (cur != last) {
+    auto val = cur->get_val();
+    mappings.emplace_back(cur->get_key(), val.laddr, val.length);
+    logger().debug("ExtMapLeafNode::find_lextent find {}~{}", lo, val.laddr);
+    ++cur;
+  }
+  return find_lextent_ertr::make_ready_future<extent_map_list_t>(
+    std::move(mappings));
+}
+
+ExtMapLeafNode::insert_ret
+ExtMapLeafNode::insert(ext_context_t ec, objaddr_t lo, lext_map_val_t val)
+{
+ ceph_assert(!at_max_capacity()); // callers split a full leaf before inserting into it
+ if (!is_pending()) { // not yet writable in this transaction: redo the call on the mutable extent
+ auto mut = ec.tm.get_mutable_extent(ec.t, this)->cast<ExtMapLeafNode>();
+ return mut->insert(ec, lo, val);
+ }
+ auto insert_pt = lower_bound(lo); // slot at which the new entry is placed
+ journal_insert(insert_pt, lo, val, maybe_get_delta_buffer());
+
+ logger().debug(
+ "ExtMapLeafNode::insert: inserted {}->{} {}",
+ insert_pt.get_key(),
+ insert_pt.get_val().laddr,
+ insert_pt.get_val().length);
+ return insert_ertr::make_ready_future<extent_mapping_t>(
+ extent_mapping_t(lo, val.laddr, val.length)); // echo the inserted mapping back to the caller
+}
+
+/**
+ * rm_lextent
+ *
+ * Removes the mapping lo -> (val.laddr, val.length) from this leaf.  The
+ * entry is removed only when key, address and length all match exactly.
+ * Resolves to true when an entry was removed, false otherwise (including
+ * when the lookup yields no entry at all).
+ */
+ExtMapLeafNode::rm_lextent_ret
+ExtMapLeafNode::rm_lextent(ext_context_t ec, objaddr_t lo, lext_map_val_t val)
+{
+  if (!is_pending()) {
+    // Not yet writable in this transaction: redo the call on the mutable
+    // extent.
+    auto mut = ec.tm.get_mutable_extent(ec.t, this)->cast<ExtMapLeafNode>();
+    return mut->rm_lextent(ec, lo, val);
+  }
+
+  auto [rm_pt, rm_end] = get_leaf_entries(lo, val.length);
+  if (rm_pt == end()) {
+    // Nothing at or after lo (e.g. an empty leaf): the past-the-end iterator
+    // must not be dereferenced.
+    return rm_lextent_ertr::make_ready_future<bool>(false);
+  }
+  if (lo == rm_pt->get_key() && val.laddr == rm_pt->get_val().laddr
+      && val.length == rm_pt->get_val().length) {
+    journal_remove(rm_pt, maybe_get_delta_buffer());
+    logger().debug(
+      "ExtMapLeafNode::rm_lextent: removed {}->{} {}",
+      rm_pt.get_key(),
+      rm_pt.get_val().laddr,
+      rm_pt.get_val().length);
+    return rm_lextent_ertr::make_ready_future<bool>(true);
+  } else {
+    return rm_lextent_ertr::make_ready_future<bool>(false);
+  }
+}
+
+ExtMapLeafNode::split_children_ret
+ExtMapLeafNode::make_split_children(ext_context_t ec)
+{
+ logger().debug("{}: {}", "ExtMapLeafNode", __func__);
+ return extmap_alloc_2extents<ExtMapLeafNode>(ec, EXTMAP_BLOCK_SIZE) // two fresh leaves to receive this node's content
+ .safe_then([this] (auto &&ext_pair) {
+ auto [left, right] = ext_pair;
+ return split_children_ret(
+ split_children_ertr::ready_future_marker{},
+ std::make_tuple(left, right, split_into(*left, *right))); // third element: value returned by split_into (the pivot)
+ });
+}
+
+ExtMapLeafNode::full_merge_ret
+ExtMapLeafNode::make_full_merge(ext_context_t ec, ExtMapNodeRef right)
+{
+ logger().debug("{}: {}", "ExtMapLeafNode", __func__);
+ return extmap_alloc_extent<ExtMapLeafNode>(ec, EXTMAP_BLOCK_SIZE) // one fresh leaf to replace both inputs
+ .safe_then([this, right] (auto &&replacement) {
+ replacement->merge_from(*this, *right->cast<ExtMapLeafNode>()); // *this is the left input, right the right input
+ return full_merge_ret(
+ full_merge_ertr::ready_future_marker{},
+ std::move(replacement));
+ });
+}
+ExtMapLeafNode::make_balanced_ret
+ExtMapLeafNode::make_balanced(ext_context_t ec, ExtMapNodeRef _right, bool prefer_left)
+{
+ logger().debug("{}: {}", "ExtMapLeafNode", __func__);
+ ceph_assert(_right->get_type() == type); // both inputs must be leaf nodes
+ return extmap_alloc_2extents<ExtMapLeafNode>(ec, EXTMAP_BLOCK_SIZE) // two fresh leaves receive the rebalanced content
+ .safe_then([this, _right, prefer_left] (auto &&replacement_pair) {
+ auto [replacement_left, replacement_right] = replacement_pair;
+ auto &right = *_right->cast<ExtMapLeafNode>();
+ return make_balanced_ret(
+ make_balanced_ertr::ready_future_marker{},
+ std::make_tuple(
+ replacement_left, replacement_right,
+ balance_into_new_nodes( // its return value becomes the tuple's pivot element
+ *this, right, prefer_left,
+ *replacement_left, *replacement_right)));
+ });
+}
+
+
+std::pair<ExtMapLeafNode::internal_iterator_t, ExtMapLeafNode::internal_iterator_t>
+ExtMapLeafNode::get_leaf_entries(objaddr_t addr, extent_len_t len)
+{
+ return bound(addr, addr + len); // iterator pair from bound() for [addr, addr + len); NOTE(review): addr + len is not checked for wrap-around — confirm callers
+}
+
+
+namespace {
+
+// Reads the single EXTMAP_BLOCK_SIZE extent at laddr as NodeT and returns it
+// through the generic node reference type.
+template <typename NodeT>
+TransactionManager::read_extent_ertr::future<ExtMapNodeRef>
+load_node_as(ext_context_t ec, laddr_t laddr)
+{
+  return ec.tm.read_extents<NodeT>(ec.t, laddr, EXTMAP_BLOCK_SIZE).safe_then(
+    [](auto&& extents) {
+      assert(extents.size() == 1);
+      [[maybe_unused]] auto [addr, node] = extents.front();
+      return TransactionManager::read_extent_ertr::make_ready_future<ExtMapNodeRef>(std::move(node));
+    });
+}
+
+}
+
+TransactionManager::read_extent_ertr::future<ExtMapNodeRef>
+extmap_load_extent(ext_context_t ec, laddr_t laddr, depth_t depth)
+{
+  ceph_assert(depth > 0);
+  // Depth 1 is the leaf level; anything deeper is an inner node.
+  return depth > 1
+    ? load_node_as<ExtMapInnerNode>(ec, laddr)
+    : load_node_as<ExtMapLeafNode>(ec, laddr);
+}
+
+}
diff --git a/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.h b/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.h
new file mode 100644
index 000000000..f5da8cdc2
--- /dev/null
+++ b/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.h
@@ -0,0 +1,281 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+#include "include/buffer.h"
+
+#include "crimson/common/fixed_kv_node_layout.h"
+#include "crimson/common/errorator.h"
+#include "crimson/os/seastore/extentmap_manager.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node.h"
+
+namespace crimson::os::seastore::extentmap_manager {
+
+// Stored counterpart of extmap_node_meta_t (judging by the _le naming and
+// init_les32, a little-endian fixed-width form). It is passed as the
+// "stored meta" template argument of the FixedKVNodeLayout instantiations
+// in this header.
+struct extmap_node_meta_le_t {
+ depth_le_t depth = init_les32(0);
+
+ extmap_node_meta_le_t() = default;
+ extmap_node_meta_le_t(const extmap_node_meta_le_t &) = default;
+ // Build the stored form from the in-memory metadata.
+ explicit extmap_node_meta_le_t(const extmap_node_meta_t &val)
+ : depth(init_les32(val.depth)) {}
+
+ // Convert back to the in-memory metadata.
+ operator extmap_node_meta_t() const {
+ return extmap_node_meta_t{ depth };
+ }
+};
+
+/**
+ * ExtMapInnerNode
+ *
+ * Abstracts operations on and layout of internal nodes for the
+ * Extentmap Tree.
+ *
+ * Layout (4k):
+ *   num_entries: uint32_t           4b
+ *   meta       : depth              4b
+ *   (padding)  :                    8b
+ *   keys       : objaddr_t[340]     (340*4)b
+ *   values     : laddr_t[340]       (340*8)b
+ *                                   = 4096
+ *
+ * INNER_NODE_CAPACITY is the number of (key, value) pairs that fit in one
+ * block after the entry count and the node meta have been subtracted.
+ */
+constexpr size_t INNER_NODE_CAPACITY =
+  (EXTMAP_BLOCK_SIZE - sizeof(uint32_t) - sizeof(extmap_node_meta_t))
+  / (sizeof (objaddr_t) + sizeof(laddr_t));
+
+struct ExtMapInnerNode
+  : ExtMapNode,
+    common::FixedKVNodeLayout<
+      INNER_NODE_CAPACITY,
+      extmap_node_meta_t, extmap_node_meta_le_t,
+      objaddr_t, ceph_le32,
+      laddr_t, laddr_le_t> {
+  using internal_iterator_t = const_iterator;
+
+  // Forward all arguments to ExtMapNode, then lay the fixed KV layout over
+  // this extent's buffer.
+  template <typename... T>
+  ExtMapInnerNode(T&&... t) :
+    ExtMapNode(std::forward<T>(t)...),
+    FixedKVNodeLayout(get_bptr().c_str()) {}
+
+  static constexpr extent_types_t type = extent_types_t::EXTMAP_INNER;
+
+  extmap_node_meta_t get_node_meta() const final {return get_meta();}
+
+  // Copy this node for mutation; the source must not carry pending deltas.
+  CachedExtentRef duplicate_for_write() final {
+    assert(delta_buffer.empty());
+    return CachedExtentRef(new ExtMapInnerNode(*this));
+  }
+
+  delta_buffer_t delta_buffer;
+  // Deltas are only recorded while the extent is mutation-pending; otherwise
+  // callers get nullptr.
+  delta_buffer_t *maybe_get_delta_buffer() {
+    return is_mutation_pending() ? &delta_buffer : nullptr;
+  }
+
+  find_lextent_ret find_lextent(ext_context_t ec, objaddr_t lo, extent_len_t len) final;
+
+  insert_ret insert(ext_context_t ec, objaddr_t lo, lext_map_val_t val) final;
+
+  rm_lextent_ret rm_lextent(ext_context_t ec, objaddr_t lo, lext_map_val_t val) final;
+
+  split_children_ret make_split_children(ext_context_t ec) final;
+
+  full_merge_ret make_full_merge(ext_context_t ec, ExtMapNodeRef right) final;
+
+  make_balanced_ret make_balanced(ext_context_t ec, ExtMapNodeRef _right, bool prefer_left) final;
+
+  std::ostream &print_detail_l(std::ostream &out) const final;
+
+  extent_types_t get_type() const final {
+    return type;
+  }
+
+  // Copy the accumulated delta_buffer bytes into a fresh bufferlist.
+  ceph::bufferlist get_delta() final {
+    assert(!delta_buffer.empty());
+    ceph::buffer::ptr bptr(delta_buffer.get_bytes());
+    delta_buffer.copy_out(bptr.c_str(), bptr.length());
+    ceph::bufferlist bl;
+    bl.push_back(bptr);
+    return bl;
+  }
+
+  // Rebuild a delta buffer from _bl and replay it onto this node. The list
+  // is copied and made contiguous first so front() covers all of it.
+  void apply_delta(const ceph::bufferlist &_bl) final {
+    assert(_bl.length());
+    ceph::bufferlist bl = _bl;
+    bl.rebuild();
+    delta_buffer_t buffer;
+    buffer.copy_in(bl.front().c_str(), bl.front().length());
+    buffer.replay(*this);
+  }
+
+  bool at_max_capacity() const final {
+    return get_size() == get_capacity();
+  }
+
+  // Marked final to match ExtMapLeafNode's declaration of the same override.
+  bool at_min_capacity() const final {
+    return get_size() == get_capacity() / 2;
+  }
+
+  unsigned get_node_size() const {
+    return get_size();
+  }
+
+  /* Get the half-open iterator range [first, last) of entries relevant to
+   * the key range [l, r):
+   *   first - first entry whose next key (or max) is greater than l
+   *   last  - first entry at or after `first` whose key is >= r
+   */
+  std::pair<internal_iterator_t, internal_iterator_t> bound(
+    objaddr_t l, objaddr_t r) {
+    auto retl = begin();
+    for (; retl != end(); ++retl) {
+      if (retl->get_next_key_or_max() > l)
+        break;
+    }
+    auto retr = retl;
+    for (; retr != end(); ++retr) {
+      if (retr->get_key() >= r)
+        break;
+    }
+    return {retl, retr};
+  }
+
+  using split_entry_ertr = TransactionManager::read_extent_ertr;
+  using split_entry_ret = split_entry_ertr::future<ExtMapNodeRef>;
+  split_entry_ret split_entry(ext_context_t ec, objaddr_t lo,
+                              internal_iterator_t, ExtMapNodeRef entry);
+  using merge_entry_ertr = TransactionManager::read_extent_ertr;
+  using merge_entry_ret = merge_entry_ertr::future<ExtMapNodeRef>;
+  merge_entry_ret merge_entry(ext_context_t ec, objaddr_t lo,
+                              internal_iterator_t iter, ExtMapNodeRef entry);
+  internal_iterator_t get_containing_child(objaddr_t lo);
+
+};
+
+/**
+ * ExtMapLeafNode
+ *
+ * Abstracts operations on and layout of leaf nodes for the
+ * ExtentMap Tree.
+ *
+ * Layout (4k):
+ * num_entries: uint32_t 4b
+ * meta : depth 4b
+ * (padding) : 8b
+ * keys : objaddr_t[204] (204*4)b
+ * values : lext_map_val_t[204] (204*16)b
+ * = 4096
+ *
+ * LEAF_NODE_CAPACITY below is the number of (key, value) pairs that fit in
+ * one block once the uint32_t entry count and the node meta are subtracted.
+ */
+constexpr size_t LEAF_NODE_CAPACITY =
+ (EXTMAP_BLOCK_SIZE - sizeof(uint32_t) - sizeof(extmap_node_meta_t))
+ / (sizeof(objaddr_t) + sizeof(lext_map_val_t));
+
+// Stored counterpart of lext_map_val_t (laddr + length), used as the
+// "stored value" template argument of the leaf node's FixedKVNodeLayout.
+struct lext_map_val_le_t {
+ laddr_le_t laddr;
+ extent_len_le_t length = init_extent_len_le_t(0);
+
+ lext_map_val_le_t() = default;
+ lext_map_val_le_t(const lext_map_val_le_t &) = default;
+ // Build the stored form from the in-memory value.
+ explicit lext_map_val_le_t(const lext_map_val_t &val)
+ : laddr(laddr_le_t(val.laddr)),
+ length(init_extent_len_le_t(val.length)) {}
+
+ // Convert back to the in-memory value.
+ operator lext_map_val_t() const {
+ return lext_map_val_t{laddr, length};
+ }
+};
+
+struct ExtMapLeafNode
+  : ExtMapNode,
+    common::FixedKVNodeLayout<
+      LEAF_NODE_CAPACITY,
+      extmap_node_meta_t, extmap_node_meta_le_t,
+      objaddr_t, ceph_le32,
+      lext_map_val_t, lext_map_val_le_t> {
+  using internal_iterator_t = const_iterator;
+
+  // Forward everything to ExtMapNode, then lay the fixed KV layout over this
+  // extent's buffer.
+  template <typename... T>
+  ExtMapLeafNode(T&&... t) :
+    ExtMapNode(std::forward<T>(t)...),
+    FixedKVNodeLayout(get_bptr().c_str()) {}
+
+  static constexpr extent_types_t type = extent_types_t::EXTMAP_LEAF;
+
+  extmap_node_meta_t get_node_meta() const final { return get_meta(); }
+
+  // Copy this node for mutation; no deltas may be pending on the source.
+  CachedExtentRef duplicate_for_write() final {
+    assert(delta_buffer.empty());
+    return CachedExtentRef(new ExtMapLeafNode(*this));
+  }
+
+  delta_buffer_t delta_buffer;
+  // nullptr unless the extent is mutation-pending.
+  delta_buffer_t *maybe_get_delta_buffer() {
+    if (!is_mutation_pending()) {
+      return nullptr;
+    }
+    return &delta_buffer;
+  }
+
+  find_lextent_ret find_lextent(ext_context_t ec, objaddr_t lo, extent_len_t len) final;
+
+  insert_ret insert(ext_context_t ec, objaddr_t lo, lext_map_val_t val) final;
+
+  rm_lextent_ret rm_lextent(ext_context_t ec, objaddr_t lo, lext_map_val_t val) final;
+
+  split_children_ret make_split_children(ext_context_t ec) final;
+
+  full_merge_ret make_full_merge(ext_context_t ec, ExtMapNodeRef right) final;
+
+  make_balanced_ret make_balanced(ext_context_t ec, ExtMapNodeRef _right, bool prefer_left) final;
+
+  extent_types_t get_type() const final {
+    return type;
+  }
+
+  // Copy the accumulated delta_buffer bytes into a fresh bufferlist.
+  ceph::bufferlist get_delta() final {
+    assert(!delta_buffer.empty());
+    ceph::buffer::ptr encoded(delta_buffer.get_bytes());
+    delta_buffer.copy_out(encoded.c_str(), encoded.length());
+    ceph::bufferlist out;
+    out.push_back(encoded);
+    return out;
+  }
+
+  // Rebuild a delta buffer from _bl (made contiguous first) and replay it
+  // onto this node.
+  void apply_delta(const ceph::bufferlist &_bl) final {
+    assert(_bl.length());
+    ceph::bufferlist contiguous = _bl;
+    contiguous.rebuild();
+    delta_buffer_t replayed;
+    replayed.copy_in(contiguous.front().c_str(), contiguous.front().length());
+    replayed.replay(*this);
+  }
+
+  std::ostream &print_detail_l(std::ostream &out) const final;
+
+  bool at_max_capacity() const final {
+    return get_size() == get_capacity();
+  }
+
+  bool at_min_capacity() const final {
+    return get_size() == get_capacity() / 2;
+  }
+
+  unsigned get_node_size() const {
+    return get_size();
+  }
+
+  /* Iterator pair for the key range (l, r):
+   *   first - first entry that starts at/after l or whose extent reaches past l
+   *   last  - first entry at or after `first` whose key is >= r
+   */
+  std::pair<internal_iterator_t, internal_iterator_t> bound(
+    objaddr_t l, objaddr_t r) {
+    auto first = begin();
+    while (first != end() &&
+           first->get_key() < l &&
+           (first->get_key() + first->get_val().length) <= l) {
+      ++first;
+    }
+    auto last = first;
+    while (last != end() && last->get_key() < r) {
+      ++last;
+    }
+    return {first, last};
+  }
+
+  std::pair<internal_iterator_t, internal_iterator_t>
+  get_leaf_entries(objaddr_t lo, extent_len_t len);
+
+};
+using ExtentMapLeafNodeRef = TCachedExtentRef<ExtMapLeafNode>;
+
+}
diff --git a/src/crimson/os/seastore/journal.cc b/src/crimson/os/seastore/journal.cc
new file mode 100644
index 000000000..39875fb56
--- /dev/null
+++ b/src/crimson/os/seastore/journal.cc
@@ -0,0 +1,756 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <iostream>
+
+#include <boost/iterator/counting_iterator.hpp>
+
+#include "crimson/os/seastore/journal.h"
+
+#include "include/intarith.h"
+#include "crimson/os/seastore/segment_manager.h"
+
+namespace {
+ // File-local shorthand for the logger registered under ceph_subsys_filestore.
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+}
+
+namespace crimson::os::seastore {
+
+std::ostream &operator<<(std::ostream &out, const segment_header_t &header)
+{
+  // One "name=value" pair per header field, wrapped in segment_header_t(...).
+  out << "segment_header_t(";
+  out << "segment_seq=" << header.journal_segment_seq;
+  out << ", physical_segment_id=" << header.physical_segment_id;
+  out << ", journal_tail=" << header.journal_tail;
+  out << ", segment_nonce=" << header.segment_nonce;
+  out << ")";
+  return out;
+}
+
+segment_nonce_t generate_nonce(
+  segment_seq_t seq,
+  const seastore_meta_t &meta)
+{
+  // crc32c over the raw bytes of the store id, seeded with the segment
+  // sequence number.
+  const auto *id_bytes =
+    reinterpret_cast<const unsigned char *>(meta.seastore_id.bytes());
+  const auto id_length = sizeof(meta.seastore_id.uuid);
+  return ceph_crc32c(seq, id_bytes, id_length);
+}
+
+// Cache the block size and derive the largest record a segment can hold.
+//
+// initialize_segment() writes the segment header as one whole block, so the
+// space it consumes is the encoded header size rounded UP to a block
+// boundary. Rounding down (p2align) yields 0 for any header smaller than a
+// block, which made max_record_length equal to the full segment size and
+// admitted records that cannot fit behind the header.
+Journal::Journal(SegmentManager &segment_manager)
+  : block_size(segment_manager.get_block_size()),
+    max_record_length(
+      segment_manager.get_segment_size() -
+      p2roundup(ceph::encoded_sizeof_bounded<segment_header_t>(),
+                size_t(block_size))),
+    segment_manager(segment_manager) {}
+
+
+// Write the segment header into the first block of a freshly opened segment.
+//
+// Allocates the next journal segment sequence number, derives the segment
+// nonce from it, encodes a segment_header_t into a zero-padded, page-aligned
+// block and writes it at offset 0. written_to is advanced past the header
+// block and committed_to is reset. Once the write succeeds, the journal tail
+// recorded in the header is reported as committed and the sequence number is
+// returned.
+Journal::initialize_segment_ertr::future<segment_seq_t>
+Journal::initialize_segment(Segment &segment)
+{
+  // Take one snapshot of the tail target: the same value is logged, stored
+  // in the header and later reported as committed, so the three cannot
+  // diverge.
+  auto new_tail = segment_provider->get_journal_tail_target();
+  logger().debug(
+    "initialize_segment {} journal_tail_target {}",
+    segment.get_segment_id(),
+    new_tail);
+  // write out header
+  ceph_assert(segment.get_write_ptr() == 0);
+  bufferlist bl;
+
+  segment_seq_t seq = next_journal_segment_seq++;
+  current_segment_nonce = generate_nonce(
+    seq, segment_manager.get_meta());
+  auto header = segment_header_t{
+    seq,
+    segment.get_segment_id(),
+    new_tail,
+    current_segment_nonce};
+  encode(header, bl);
+
+  // Copy the encoded header into a zeroed block-sized buffer so that exactly
+  // one full block is written.
+  bufferptr bp(
+    ceph::buffer::create_page_aligned(
+      segment_manager.get_block_size()));
+  bp.zero();
+  auto iter = bl.cbegin();
+  iter.copy(bl.length(), bp.c_str());
+  bl.clear();
+  bl.append(bp);
+
+  written_to = segment_manager.get_block_size();
+  committed_to = 0;
+  return segment.write(0, bl).safe_then(
+    [this, new_tail, seq] {
+      segment_provider->update_journal_tail_committed(new_tail);
+      return seq;
+    },
+    initialize_segment_ertr::pass_further{},
+    crimson::ct_error::assert_all{ "TODO" });
+}
+
+// Serialize a record into a single bufferlist.
+//
+// Resulting layout: record_header_t, a checksum slot for the metadata crc,
+// one extent_info_t per extent, the deltas, zero padding up to a block_size
+// multiple (rsize.mdlength bytes in total), then the concatenated extent
+// data (rsize.dlength bytes). rsize must agree with what is actually
+// encoded; both lengths are asserted.
+ceph::bufferlist Journal::encode_record(
+ record_size_t rsize,
+ record_t &&record)
+{
+ // Gather the data payload first: its crc goes into the header.
+ bufferlist data_bl;
+ for (auto &i: record.extents) {
+ data_bl.append(i.bl);
+ }
+
+ bufferlist bl;
+ record_header_t header{
+ rsize.mdlength,
+ rsize.dlength,
+ (uint32_t)record.deltas.size(),
+ (uint32_t)record.extents.size(),
+ current_segment_nonce,
+ committed_to,
+ data_bl.crc32c(-1)
+ };
+ encode(header, bl);
+
+ // Reserve room for the metadata crc; it is filled in once the rest of the
+ // metadata has been encoded.
+ auto metadata_crc_filler = bl.append_hole(sizeof(uint32_t));
+
+ for (const auto &i: record.extents) {
+ encode(extent_info_t(i), bl);
+ }
+ for (const auto &i: record.deltas) {
+ encode(i, bl);
+ }
+ // Pad the metadata out to a whole number of blocks.
+ if (bl.length() % block_size != 0) {
+ bl.append_zero(
+ block_size - (bl.length() % block_size));
+ }
+ ceph_assert(bl.length() == rsize.mdlength);
+
+
+ // The metadata crc covers the encoded header plus everything after the crc
+ // slot; the slot itself is skipped. validate_metadata() performs the same
+ // walk on the read side.
+ auto bliter = bl.cbegin();
+ auto metadata_crc = bliter.crc32c(
+ ceph::encoded_sizeof_bounded<record_header_t>(),
+ -1);
+ bliter += sizeof(checksum_t); /* crc hole again */
+ metadata_crc = bliter.crc32c(
+ bliter.get_remaining(),
+ metadata_crc);
+ ceph_le32 metadata_crc_le;
+ metadata_crc_le = metadata_crc;
+ metadata_crc_filler.copy_in(
+ sizeof(checksum_t),
+ reinterpret_cast<const char *>(&metadata_crc_le));
+
+ // Data payload follows the (block aligned) metadata.
+ bl.claim_append(data_bl);
+ ceph_assert(bl.length() == (rsize.dlength + rsize.mdlength));
+
+ return bl;
+}
+
+bool Journal::validate_metadata(const bufferlist &bl)
+{
+  // Recompute the metadata crc the way encode_record() produced it: header
+  // bytes first, skip the stored crc, then everything that follows.
+  auto cursor = bl.cbegin();
+  auto computed_crc = cursor.crc32c(
+    ceph::encoded_sizeof_bounded<record_header_t>(),
+    -1);
+
+  // The stored crc sits right after the header; decoding it moves the
+  // cursor past the slot.
+  ceph_le32 stored_crc_le;
+  ::decode(stored_crc_le, cursor);
+  uint32_t stored_crc = stored_crc_le;
+
+  computed_crc = cursor.crc32c(
+    cursor.get_remaining(),
+    computed_crc);
+  return computed_crc == stored_crc;
+}
+
+Journal::read_validate_data_ret Journal::read_validate_data(
+  paddr_t record_base,
+  const record_header_t &header)
+{
+  // The data payload starts mdlength bytes after the record base and is
+  // dlength bytes long; it is valid when its crc matches header.data_crc.
+  // header is captured by reference and must outlive the returned future.
+  const auto data_start = record_base.add_offset(header.mdlength);
+  return segment_manager.read(
+    data_start,
+    header.dlength
+  ).safe_then([&header](auto data_ptr) {
+    bufferlist data;
+    data.append(data_ptr);
+    return data.crc32c(-1) == header.data_crc;
+  });
+}
+
+Journal::write_record_ret Journal::write_record(
+  record_size_t rsize,
+  record_t &&record)
+{
+  // Encode the record, reserve its range by advancing written_to, then
+  // issue the write. committed_to is only moved once the write completes.
+  ceph::bufferlist encoded = encode_record(
+    rsize, std::move(record));
+  const auto write_offset = written_to;
+  assert((encoded.length() % block_size) == 0);
+  written_to += encoded.length();
+  logger().debug(
+    "write_record, mdlength {}, dlength {}, target {}",
+    rsize.mdlength,
+    rsize.dlength,
+    write_offset);
+  return current_journal_segment->write(write_offset, encoded).handle_error(
+    write_record_ertr::pass_further{},
+    crimson::ct_error::assert_all{ "TODO" }
+  ).safe_then([this, write_offset] {
+    committed_to = write_offset;
+    // Resolve to the physical address the record was written at.
+    return write_record_ret(
+      write_record_ertr::ready_future_marker{},
+      paddr_t{
+        current_journal_segment->get_segment_id(),
+        write_offset});
+  });
+}
+
+Journal::record_size_t Journal::get_encoded_record_length(
+  const record_t &record) const {
+  // Fixed-size part of the metadata: header, crc slot, one info per extent.
+  extent_len_t md_length =
+    (extent_len_t)ceph::encoded_sizeof_bounded<record_header_t>();
+  md_length += sizeof(checksum_t) /* crc */;
+  md_length += record.extents.size() *
+    ceph::encoded_sizeof_bounded<extent_info_t>();
+
+  // Data payload: the extent buffers back to back.
+  extent_len_t data_length = 0;
+  for (const auto &extent: record.extents) {
+    data_length += extent.bl.length();
+  }
+
+  // Variable-size part of the metadata: the encoded deltas.
+  for (const auto &delta: record.deltas) {
+    md_length += ceph::encoded_sizeof(delta);
+  }
+
+  // Metadata is padded to whole blocks on disk.
+  md_length = p2roundup(md_length, block_size);
+  return record_size_t{md_length, data_length};
+}
+
+bool Journal::needs_roll(segment_off_t length) const
+{
+  // A roll is needed when appending `length` bytes at written_to would run
+  // past the current segment's write capacity.
+  const auto end_of_write = length + written_to;
+  const auto capacity = current_journal_segment->get_write_capacity();
+  return end_of_write > capacity;
+}
+
+Journal::roll_journal_segment_ertr::future<segment_seq_t>
+Journal::roll_journal_segment()
+{
+  // Remember which segment (if any) is being retired; it is reported to the
+  // segment provider only after the replacement has been initialized.
+  auto old_segment_id = current_journal_segment ?
+    current_journal_segment->get_segment_id() :
+    NULL_SEG_ID;
+
+  // close current -> obtain a new segment id -> open it -> write its header
+  return (current_journal_segment ?
+          current_journal_segment->close() :
+          Segment::close_ertr::now()
+  ).safe_then([this] {
+    return segment_provider->get_segment();
+  }).safe_then([this](auto new_segment_id) {
+    return segment_manager.open(new_segment_id);
+  }).safe_then([this](auto opened) {
+    current_journal_segment = opened;
+    written_to = 0;
+    return initialize_segment(*current_journal_segment);
+  }).safe_then([this, old_segment_id](auto seq) {
+    if (old_segment_id != NULL_SEG_ID) {
+      segment_provider->close_segment(old_segment_id);
+    }
+    segment_provider->set_journal_segment(
+      current_journal_segment->get_segment_id(),
+      seq);
+    return seq;
+  }).handle_error(
+    roll_journal_segment_ertr::pass_further{},
+    crimson::ct_error::all_same_way([] { ceph_assert(0 == "TODO"); })
+  );
+}
+
+Journal::read_segment_header_ret
+Journal::read_segment_header(segment_id_t segment)
+{
+  // Read the first block of the segment and try to decode a
+  // segment_header_t from it; an undecodable block is reported as enodata.
+  return segment_manager.read(paddr_t{segment, 0}, block_size
+  ).handle_error(
+    read_segment_header_ertr::pass_further{},
+    crimson::ct_error::assert_all{}
+  ).safe_then([this, segment](bufferptr block) -> read_segment_header_ret {
+    logger().debug("segment {} bptr size {}", segment, block.length());
+
+    bufferlist header_bl;
+    header_bl.push_back(block);
+
+    logger().debug(
+      "Journal::read_segment_header: segment {} block crc {}",
+      segment,
+      header_bl.begin().crc32c(block_size, 0));
+
+    segment_header_t header;
+    auto cursor = header_bl.cbegin();
+    try {
+      decode(header, cursor);
+    } catch (ceph::buffer::error &e) {
+      logger().debug(
+        "Journal::read_segment_header: segment {} unable to decode "
+        "header, skipping",
+        segment);
+      return crimson::ct_error::enodata::make();
+    }
+
+    logger().debug(
+      "Journal::read_segment_header: segment {} header {}",
+      segment,
+      header);
+    return read_segment_header_ret(
+      read_segment_header_ertr::ready_future_marker{},
+      header);
+  });
+}
+
+Journal::open_for_write_ret Journal::open_for_write()
+{
+  // Roll onto a fresh segment and report the position one block into it
+  // (the header occupies the first block) together with its sequence.
+  return roll_journal_segment().safe_then([this](auto seq) {
+    const paddr_t first_record{
+      current_journal_segment->get_segment_id(),
+      static_cast<segment_off_t>(block_size)};
+    return open_for_write_ret(
+      open_for_write_ertr::ready_future_marker{},
+      journal_seq_t{seq, first_record});
+  });
+}
+
+// Determine which segments must be replayed, and from where.
+//
+// Reads the header of every segment known to the segment manager, keeping
+// those that decode (segments that are unreadable or undecodable are
+// skipped). The survivors are sorted by journal_segment_seq, every one of
+// them is registered with the segment provider as closed, and
+// next_journal_segment_seq is set past the newest. Replay starts at the
+// journal_tail recorded in the newest header (or at the first record of the
+// oldest segment when that tail is null) and covers every later segment from
+// its first record.
+//
+// Fails with input_output_error when no segment header could be read, or
+// when the newest header's journal_tail names a segment whose header was not
+// among the readable ones.
+Journal::find_replay_segments_fut Journal::find_replay_segments()
+{
+  return seastar::do_with(
+    std::vector<std::pair<segment_id_t, segment_header_t>>(),
+    [this](auto &&segments) mutable {
+      return crimson::do_for_each(
+        boost::make_counting_iterator(segment_id_t{0}),
+        boost::make_counting_iterator(segment_manager.get_num_segments()),
+        [this, &segments](auto i) {
+          return read_segment_header(i
+          ).safe_then([this, &segments, i](auto header) mutable {
+            if (generate_nonce(
+                  header.journal_segment_seq,
+                  segment_manager.get_meta()) != header.segment_nonce) {
+              logger().debug(
+                "find_replay_segments: nonce mismatch segment {} header {}",
+                i,
+                header);
+              assert(0 == "impossible");
+              return find_replay_segments_ertr::now();
+            }
+
+            segments.emplace_back(i, std::move(header));
+            return find_replay_segments_ertr::now();
+          }).handle_error(
+            crimson::ct_error::enoent::handle([i](auto) {
+              logger().debug(
+                "find_replay_segments: segment {} not available for read",
+                i);
+              return find_replay_segments_ertr::now();
+            }),
+            crimson::ct_error::enodata::handle([i](auto) {
+              logger().debug(
+                "find_replay_segments: segment {} header undecodable",
+                i);
+              return find_replay_segments_ertr::now();
+            }),
+            find_replay_segments_ertr::pass_further{},
+            crimson::ct_error::assert_all{}
+          );
+        }).safe_then([this, &segments]() mutable -> find_replay_segments_fut {
+          logger().debug(
+            "find_replay_segments: have {} segments",
+            segments.size());
+          if (segments.empty()) {
+            return crimson::ct_error::input_output_error::make();
+          }
+          // oldest first, newest last
+          std::sort(
+            segments.begin(),
+            segments.end(),
+            [](const auto &lt, const auto &rt) {
+              return lt.second.journal_segment_seq <
+                rt.second.journal_segment_seq;
+            });
+
+          next_journal_segment_seq =
+            segments.rbegin()->second.journal_segment_seq + 1;
+          std::for_each(
+            segments.begin(),
+            segments.end(),
+            [this](auto &seg) {
+              segment_provider->init_mark_segment_closed(
+                seg.first,
+                seg.second.journal_segment_seq);
+            });
+
+          // The newest header carries the authoritative journal tail.
+          auto journal_tail = segments.rbegin()->second.journal_tail;
+          segment_provider->update_journal_tail_committed(journal_tail);
+          auto replay_from = journal_tail.offset;
+          logger().debug(
+            "Journal::find_replay_segments: journal_tail={}",
+            journal_tail);
+          auto from = segments.begin();
+          if (replay_from != P_ADDR_NULL) {
+            from = std::find_if(
+              segments.begin(),
+              segments.end(),
+              [&replay_from](const auto &seg) -> bool {
+                return seg.first == replay_from.segment;
+              });
+            if (from == segments.end()) {
+              // The tail's segment was skipped above (unreadable or
+              // undecodable header); dereferencing `from` would be
+              // undefined behaviour, so report the journal as unusable.
+              logger().error(
+                "find_replay_segments: journal_tail {} refers to a segment "
+                "without a readable header",
+                journal_tail);
+              return crimson::ct_error::input_output_error::make();
+            }
+            if (from->second.journal_segment_seq != journal_tail.segment_seq) {
+              logger().error(
+                "find_replay_segments: journal_tail {} does not match {}",
+                journal_tail,
+                from->second);
+              assert(0 == "invalid");
+            }
+          } else {
+            replay_from = paddr_t{from->first, (segment_off_t)block_size};
+          }
+          auto ret = replay_segments_t(segments.end() - from);
+          std::transform(
+            from, segments.end(), ret.begin(),
+            [this](const auto &p) {
+              auto ret = journal_seq_t{
+                p.second.journal_segment_seq,
+                paddr_t{p.first, (segment_off_t)block_size}};
+              logger().debug(
+                "Journal::find_replay_segments: replaying from {}",
+                ret);
+              return std::make_pair(ret, p.second);
+            });
+          // Only the first segment starts mid-way (at the journal tail).
+          ret[0].first.offset = replay_from;
+          return find_replay_segments_fut(
+            find_replay_segments_ertr::ready_future_marker{},
+            std::move(ret));
+        });
+    });
+}
+
+// Read and validate the metadata of the record starting at `start`.
+//
+// Resolves to std::nullopt when no valid record is found there: the first
+// block would run past the end of the segment, the header cannot be decoded,
+// the header's nonce differs from `nonce`, or the metadata crc check
+// (validate_metadata) fails. Otherwise resolves to the decoded header paired
+// with a bufferlist holding the metadata that was read.
+// Fails with input_output_error when the header claims a metadata length
+// that would extend past the end of the segment.
+Journal::read_validate_record_metadata_ret Journal::read_validate_record_metadata(
+ paddr_t start,
+ segment_nonce_t nonce)
+{
+ if (start.offset + block_size > (int64_t)segment_manager.get_segment_size()) {
+ return read_validate_record_metadata_ret(
+ read_validate_record_metadata_ertr::ready_future_marker{},
+ std::nullopt);
+ }
+ // The header lives in the first block; read that before anything else.
+ return segment_manager.read(start, block_size
+ ).safe_then(
+ [=](bufferptr bptr) mutable
+ -> read_validate_record_metadata_ret {
+ logger().debug("read_validate_record_metadata: reading {}", start);
+ bufferlist bl;
+ bl.append(bptr);
+ auto bp = bl.cbegin();
+ record_header_t header;
+ try {
+ decode(header, bp);
+ } catch (ceph::buffer::error &e) {
+ return read_validate_record_metadata_ret(
+ read_validate_record_metadata_ertr::ready_future_marker{},
+ std::nullopt);
+ }
+ if (header.segment_nonce != nonce) {
+ return read_validate_record_metadata_ret(
+ read_validate_record_metadata_ertr::ready_future_marker{},
+ std::nullopt);
+ }
+ // Metadata longer than one block: fetch the remainder and append it to
+ // the first block.
+ if (header.mdlength > block_size) {
+ if (start.offset + header.mdlength >
+ (int64_t)segment_manager.get_segment_size()) {
+ return crimson::ct_error::input_output_error::make();
+ }
+ return segment_manager.read(
+ {start.segment, start.offset + (segment_off_t)block_size},
+ header.mdlength - block_size).safe_then(
+ [header=std::move(header), bl=std::move(bl)](
+ auto &&bptail) mutable {
+ bl.push_back(bptail);
+ return read_validate_record_metadata_ret(
+ read_validate_record_metadata_ertr::ready_future_marker{},
+ std::make_pair(std::move(header), std::move(bl)));
+ });
+ } else {
+ return read_validate_record_metadata_ret(
+ read_validate_record_metadata_ertr::ready_future_marker{},
+ std::make_pair(std::move(header), std::move(bl))
+ );
+ }
+ }).safe_then([=](auto p) {
+ // Only hand the metadata back if its crc checks out.
+ if (p && validate_metadata(p->second)) {
+ return read_validate_record_metadata_ret(
+ read_validate_record_metadata_ertr::ready_future_marker{},
+ std::move(*p)
+ );
+ } else {
+ return read_validate_record_metadata_ret(
+ read_validate_record_metadata_ertr::ready_future_marker{},
+ std::nullopt);
+ }
+ });
+}
+
+std::optional<std::vector<delta_info_t>> Journal::try_decode_deltas(
+  record_header_t header,
+  const bufferlist &bl)
+{
+  // Deltas come after the header, the crc slot and the extent infos.
+  auto cursor = bl.cbegin();
+  cursor += ceph::encoded_sizeof_bounded<record_header_t>();
+  cursor += sizeof(checksum_t) /* crc */;
+  cursor += header.extents * ceph::encoded_sizeof_bounded<extent_info_t>();
+  logger().debug("{}: decoding {} deltas", __func__, header.deltas);
+
+  // Decode header.deltas entries in place; any decode failure yields nullopt.
+  std::vector<delta_info_t> decoded(header.deltas);
+  try {
+    for (auto &delta : decoded) {
+      decode(delta, cursor);
+    }
+  } catch (ceph::buffer::error &e) {
+    return std::nullopt;
+  }
+  return decoded;
+}
+
+std::optional<std::vector<extent_info_t>> Journal::try_decode_extent_infos(
+  record_header_t header,
+  const bufferlist &bl)
+{
+  // Extent infos come right after the header and the crc slot.
+  auto cursor = bl.cbegin();
+  cursor += ceph::encoded_sizeof_bounded<record_header_t>();
+  cursor += sizeof(checksum_t) /* crc */;
+  logger().debug("{}: decoding {} extents", __func__, header.extents);
+
+  // Decode header.extents entries in place; any decode failure yields nullopt.
+  std::vector<extent_info_t> decoded(header.extents);
+  try {
+    for (auto &info : decoded) {
+      decode(info, cursor);
+    }
+  } catch (ceph::buffer::error &e) {
+    return std::nullopt;
+  }
+  return decoded;
+}
+
+// Replay one segment: scan its valid records starting at seq.offset and feed
+// every delta that still applies to `handler`.
+//
+// seq     - segment sequence plus the address to start scanning from
+// header  - the segment's header; its nonce is used to recognise records
+// handler - invoked per delta with (journal_seq of the record, address just
+//           past the record's metadata, the delta)
+Journal::replay_ertr::future<>
+Journal::replay_segment(
+ journal_seq_t seq,
+ segment_header_t header,
+ delta_handler_t &handler)
+{
+ logger().debug("replay_segment: starting at {}", seq);
+ return seastar::do_with(
+ scan_valid_records_cursor(seq.offset),
+ found_record_handler_t(
+ // NB: the `header` parameter of this lambda is the record header and
+ // shadows the segment header of the enclosing function.
+ [=, &handler](paddr_t base,
+ const record_header_t &header,
+ const bufferlist &mdbuf) {
+ auto deltas = try_decode_deltas(
+ header,
+ mdbuf);
+ if (!deltas) {
+ // This should be impossible, we did check the crc on the mdbuf
+ logger().error(
+ "Journal::replay_segment unable to decode deltas for record {}",
+ base);
+ assert(deltas);
+ }
+
+ // Keep the decoded deltas alive until all of them have been handled.
+ return seastar::do_with(
+ std::move(*deltas),
+ [=](auto &deltas) {
+ return crimson::do_for_each(
+ deltas,
+ [=](auto &delta) {
+ /* The journal may validly contain deltas for extents in
+ * since released segments. We can detect those cases by
+ * checking whether the segment in question currently has a
+ * sequence number > the current journal segment seq. We can
+ * safely skip these deltas because the extent must already
+ * have been rewritten.
+ *
+ * Note, this comparison exploits the fact that
+ * SEGMENT_SEQ_NULL is a large number.
+ */
+ if (delta.paddr != P_ADDR_NULL &&
+ (segment_provider->get_seq(delta.paddr.segment) >
+ seq.segment_seq)) {
+ return replay_ertr::now();
+ } else {
+ return handler(
+ journal_seq_t{seq.segment_seq, base},
+ base.add_offset(header.mdlength),
+ delta);
+ }
+ });
+ });
+ }),
+ // Here `header` is the segment header again; scan without a byte budget
+ // and discard the returned byte count.
+ [=](auto &cursor, auto &dhandler) {
+ return scan_valid_records(
+ cursor,
+ header.segment_nonce,
+ std::numeric_limits<size_t>::max(),
+ dhandler).safe_then([](auto){});
+ });
+}
+
+Journal::replay_ret Journal::replay(delta_handler_t &&delta_handler)
+{
+  // The handler and the list of segments are parked in do_with so they stay
+  // alive while the segments are replayed one after another.
+  return seastar::do_with(
+    std::move(delta_handler), replay_segments_t(),
+    [this](auto &handler, auto &segments) mutable -> replay_ret {
+      return find_replay_segments().safe_then(
+        [this, &handler, &segments](auto found) mutable {
+          logger().debug("replay: found {} segments", found.size());
+          segments = std::move(found);
+          return crimson::do_for_each(segments, [this, &handler](auto entry) mutable {
+            auto &[start, segment_header] = entry;
+            return replay_segment(start, segment_header, handler);
+          });
+        });
+    });
+}
+
+// Collect (address, extent_info_t) for every extent found in the valid
+// records of the segment `cursor` points into.
+//
+// The segment header is read first to obtain the nonce that valid records
+// must carry. The result vector lives on the heap (`ret`) so the
+// continuations can fill it through `retref`; it is moved out once the scan
+// has finished.
+// NOTE(review): bytes_to_read is not used in this body; the scan is run with
+// an unlimited budget.
+Journal::scan_extents_ret Journal::scan_extents(
+ scan_extents_cursor &cursor,
+ extent_len_t bytes_to_read)
+{
+ auto ret = std::make_unique<scan_extents_ret_bare>();
+ auto &retref = *ret;
+ return read_segment_header(cursor.get_offset().segment
+ ).handle_error(
+ scan_extents_ertr::pass_further{},
+ crimson::ct_error::assert_all{}
+ ).safe_then([&](auto segment_header) {
+ auto segment_nonce = segment_header.segment_nonce;
+ return seastar::do_with(
+ found_record_handler_t(
+ [&](
+ paddr_t base,
+ const record_header_t &header,
+ const bufferlist &mdbuf) mutable {
+
+ auto infos = try_decode_extent_infos(
+ header,
+ mdbuf);
+ if (!infos) {
+ // This should be impossible, we did check the crc on the mdbuf
+ logger().error(
+ "Journal::scan_extents unable to decode extents for record {}",
+ base);
+ assert(infos);
+ }
+
+ // Extents are laid out back to back right after the record metadata.
+ paddr_t extent_offset = base.add_offset(header.mdlength);
+ for (const auto &i : *infos) {
+ retref.emplace_back(extent_offset, i);
+ extent_offset.offset += i.len;
+ }
+ return scan_extents_ertr::now();
+ }),
+ [=, &cursor](auto &dhandler) {
+ return scan_valid_records(
+ cursor,
+ segment_nonce,
+ std::numeric_limits<size_t>::max(),
+ dhandler).safe_then([](auto){});
+ });
+ }).safe_then([ret=std::move(ret)] {
+ return std::move(*ret);
+ });
+}
+
+// Scan records starting at cursor.offset and pass each committed, valid one
+// to `handler`, until the cursor reports completion or at least `budget`
+// bytes (metadata + data) have been handed over.
+//
+// cursor  - scan position and queue of read-but-not-yet-delivered records;
+//           updated in place so a later call can resume
+// nonce   - nonce a record header must carry to belong to this segment
+// budget  - soft limit on the number of record bytes to process in this call
+// handler - invoked with (record address, record header, metadata buffer)
+//
+// Resolves to the number of record bytes handed to `handler`.
+Journal::scan_valid_records_ret Journal::scan_valid_records(
+  scan_valid_records_cursor &cursor,
+  segment_nonce_t nonce,
+  size_t budget,
+  found_record_handler_t &handler)
+{
+  if (cursor.offset.offset == 0) {
+    // Block 0 holds the segment header; records start in the next block.
+    cursor.offset.offset = block_size;
+  }
+  // The running byte count lives on the heap: the continuations below refer
+  // to it by reference and may run after this function has returned.
+  // budget_used must alias that heap value; a by-value copy would leave the
+  // lambdas with a dangling reference and the result stuck at 0.
+  auto retref = std::make_unique<size_t>(0);
+  auto &budget_used = *retref;
+  return crimson::do_until(
+    [=, &cursor, &budget_used, &handler]() mutable
+    -> scan_valid_records_ertr::future<bool> {
+      return [=, &handler, &cursor, &budget_used] {
+        if (!cursor.last_valid_header_found) {
+          // Phase 1: keep reading record headers; deliver the queued
+          // records that a later header reports as committed.
+          return read_validate_record_metadata(cursor.offset, nonce
+          ).safe_then([=, &cursor](auto md) {
+            logger().debug(
+              "Journal::scan_valid_records: read complete {}",
+              cursor.offset);
+            if (!md) {
+              logger().debug(
+                "Journal::scan_valid_records: found invalid header at {}, presumably at end",
+                cursor.offset);
+              cursor.last_valid_header_found = true;
+              return scan_valid_records_ertr::now();
+            } else {
+              logger().debug(
+                "Journal::scan_valid_records: valid record read at {}",
+                cursor.offset);
+              cursor.last_committed = paddr_t{
+                cursor.offset.segment,
+                md->first.committed_to};
+              cursor.pending_records.emplace_back(
+                cursor.offset,
+                md->first,
+                md->second);
+              cursor.offset.offset +=
+                md->first.dlength + md->first.mdlength;
+              return scan_valid_records_ertr::now();
+            }
+          }).safe_then([=, &cursor, &budget_used, &handler] {
+            return crimson::do_until(
+              [=, &budget_used, &cursor, &handler] {
+                logger().debug(
+                  "Journal::scan_valid_records: valid record read, processing queue");
+                if (cursor.pending_records.empty()) {
+                  /* This is only possible if the segment is empty.
+                   * A record's last_committed must be prior to its own
+                   * location since it itself cannot yet have been committed
+                   * at its own time of submission.  Thus, the most recently
+                   * read record must always fall after cursor.last_committed */
+                  return scan_valid_records_ertr::make_ready_future<bool>(true);
+                }
+                auto &next = cursor.pending_records.front();
+                if (next.offset > cursor.last_committed) {
+                  return scan_valid_records_ertr::make_ready_future<bool>(true);
+                }
+                budget_used +=
+                  next.header.dlength + next.header.mdlength;
+                return handler(
+                  next.offset,
+                  next.header,
+                  next.mdbuffer
+                ).safe_then([&cursor] {
+                  cursor.pending_records.pop_front();
+                  return scan_valid_records_ertr::make_ready_future<bool>(false);
+                });
+              });
+          });
+        } else {
+          // Phase 2: no more headers. The remaining queued records were
+          // never confirmed by a later header, so their data crc is checked
+          // before delivery; the first bad one discards the rest.
+          assert(!cursor.pending_records.empty());
+          auto &next = cursor.pending_records.front();
+          return read_validate_data(next.offset, next.header
+          ).safe_then([=, &budget_used, &next, &cursor, &handler](auto valid) {
+            if (!valid) {
+              cursor.pending_records.clear();
+              return scan_valid_records_ertr::now();
+            }
+            budget_used +=
+              next.header.dlength + next.header.mdlength;
+            return handler(
+              next.offset,
+              next.header,
+              next.mdbuffer
+            ).safe_then([&cursor] {
+              cursor.pending_records.pop_front();
+              return scan_valid_records_ertr::now();
+            });
+          });
+        }
+      }().safe_then([=, &budget_used, &cursor] {
+        return scan_valid_records_ertr::make_ready_future<bool>(
+          cursor.is_complete() || budget_used >= budget);
+      });
+    }).safe_then([retref=std::move(retref)]() mutable -> scan_valid_records_ret {
+      return scan_valid_records_ret(
+        scan_valid_records_ertr::ready_future_marker{},
+        std::move(*retref));
+    });
+}
+
+
+}
diff --git a/src/crimson/os/seastore/journal.h b/src/crimson/os/seastore/journal.h
new file mode 100644
index 000000000..7424d78b3
--- /dev/null
+++ b/src/crimson/os/seastore/journal.h
@@ -0,0 +1,405 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/common/log.h"
+
+#include <boost/intrusive_ptr.hpp>
+
+#include <seastar/core/future.hh>
+
+#include "include/ceph_assert.h"
+#include "include/buffer.h"
+#include "include/denc.h"
+
+#include "crimson/os/seastore/segment_manager.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/osd/exceptions.h"
+
+namespace crimson::os::seastore {
+
+using segment_nonce_t = uint32_t;
+
+
+/**
+ * Segment header
+ *
+ * Every segment contains and encode segment_header_t in the first block.
+ * Our strategy for finding the journal replay point is:
+ * 1) Find the segment with the highest journal_segment_seq
+ * 2) Replay starting at record located at that segment's journal_tail
+ */
+struct segment_header_t {
+ segment_seq_t journal_segment_seq;
+ segment_id_t physical_segment_id; // debugging
+
+ journal_seq_t journal_tail;
+ segment_nonce_t segment_nonce;
+
+ DENC(segment_header_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.journal_segment_seq, p);
+ denc(v.physical_segment_id, p);
+ denc(v.journal_tail, p);
+ denc(v.segment_nonce, p);
+ DENC_FINISH(p);
+ }
+};
+std::ostream &operator<<(std::ostream &out, const segment_header_t &header);
+
+ /// Per-record header; the DENC block below encodes the fields in
+ /// declaration order.
+ struct record_header_t {
+ // Fixed portion
+ extent_len_t mdlength; // block aligned, length of metadata
+ extent_len_t dlength; // block aligned, length of data
+ uint32_t deltas; // number of deltas
+ uint32_t extents; // number of extents
+ segment_nonce_t segment_nonce;// nonce of containing segment
+ segment_off_t committed_to; // records in this segment prior to committed_to
+ // have been fully written
+ checksum_t data_crc; // crc of data payload
+
+
+ DENC(record_header_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.mdlength, p);
+ denc(v.dlength, p);
+ denc(v.deltas, p);
+ denc(v.extents, p);
+ denc(v.segment_nonce, p);
+ denc(v.committed_to, p);
+ denc(v.data_crc, p);
+ DENC_FINISH(p);
+ }
+ };
+
+ /// Summary of one extent: its type, logical address and length.
+ struct extent_info_t {
+ extent_types_t type = extent_types_t::NONE;
+ laddr_t addr = L_ADDR_NULL;
+ extent_len_t len = 0;
+
+ extent_info_t() = default;
+ // Builds the summary from an extent_t; len is taken from the length of
+ // the extent's buffer (et.bl).
+ extent_info_t(const extent_t &et)
+ : type(et.type), addr(et.addr), len(et.bl.length()) {}
+
+ DENC(extent_info_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.type, p);
+ denc(v.addr, p);
+ denc(v.len, p);
+ DENC_FINISH(p);
+ }
+ };
+
+/**
+ * Callback interface for managing available segments
+ */
+class JournalSegmentProvider {
+public:
+ using get_segment_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using get_segment_ret = get_segment_ertr::future<segment_id_t>;
+ virtual get_segment_ret get_segment() = 0;
+
+ virtual void close_segment(segment_id_t) {}
+
+ virtual void set_journal_segment(
+ segment_id_t segment,
+ segment_seq_t seq) {}
+
+ virtual journal_seq_t get_journal_tail_target() const = 0;
+ virtual void update_journal_tail_committed(journal_seq_t tail_committed) = 0;
+
+ virtual void init_mark_segment_closed(
+ segment_id_t segment, segment_seq_t seq) {}
+
+ virtual segment_seq_t get_seq(segment_id_t id) { return 0; }
+
+ virtual ~JournalSegmentProvider() {}
+};
+
+/**
+ * Manages stream of atomically written records to a SegmentManager.
+ */
+class Journal {
+public:
+ Journal(SegmentManager &segment_manager);
+
+ /**
+ * Sets the JournalSegmentProvider.
+ *
+ * Not provided in constructor to allow the provider to not own
+ * or construct the Journal (TransactionManager).
+ *
+ * Note, Journal does not own this ptr, user must ensure that
+ * *provider outlives Journal.
+ */
+ void set_segment_provider(JournalSegmentProvider *provider) {
+ segment_provider = provider;
+ }
+
+ /**
+ * initializes journal for new writes -- must run prior to calls
+ * to submit_record. Should be called after replay if not a new
+ * Journal.
+ */
+ using open_for_write_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error
+ >;
+ using open_for_write_ret = open_for_write_ertr::future<journal_seq_t>;
+ open_for_write_ret open_for_write();
+
+ /**
+ * close journal
+ *
+ * TODO: should probably flush and disallow further writes
+ */
+ using close_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ close_ertr::future<> close() { return close_ertr::now(); }
+
+ /**
+  * submit_record
+  *
+  * Writes record to the journal, rolling to a new journal segment first
+  * when needs_roll() reports the current one cannot take it.
+  *
+  * @param record record to write
+  * @return pair of (address returned by write_record advanced by the
+  *         record's metadata length, journal_seq_t for the record);
+  *         fails with erange if the encoded record exceeds
+  *         max_record_length
+  */
+ using submit_record_ertr = crimson::errorator<
+   crimson::ct_error::erange,
+   crimson::ct_error::input_output_error
+   >;
+ using submit_record_ret = submit_record_ertr::future<
+   std::pair<paddr_t, journal_seq_t>
+   >;
+ submit_record_ret submit_record(record_t &&record) {
+   auto sizes = get_encoded_record_length(record);
+   auto record_len = sizes.mdlength + sizes.dlength;
+   if (record_len > max_record_length) {
+     return crimson::ct_error::erange::make();
+   }
+   auto rolled = needs_roll(record_len)
+     ? roll_journal_segment().safe_then([](auto){})
+     : roll_journal_segment_ertr::now();
+   return rolled.safe_then(
+     [this, sizes, record=std::move(record)]() mutable {
+       return write_record(sizes, std::move(record)
+       ).safe_then([this, sizes](auto record_start) {
+         // skip over the metadata to the first data block
+         auto first_data_block = record_start.add_offset(sizes.mdlength);
+         return std::make_pair(
+           first_data_block,
+           get_journal_seq(record_start));
+       });
+     });
+ }
+
+ /**
+ * Read deltas and pass to delta_handler
+ *
+ * record_block_base (argument to delta_handler) is the start of the
+ * first block in the record
+ */
+ using replay_ertr = SegmentManager::read_ertr;
+ using replay_ret = replay_ertr::future<>;
+ using delta_handler_t = std::function<
+ replay_ret(journal_seq_t seq,
+ paddr_t record_block_base,
+ const delta_info_t&)>;
+ replay_ret replay(delta_handler_t &&delta_handler);
+
+ /**
+  * scan_extents
+  *
+  * Scans records beginning at the cursor position until the first record
+  * boundary after that position + bytes_to_read.
+  *
+  * Returns list<pair<paddr_t, extent_info_t>>
+  * cursor.is_complete() will be true when no further extents exist in segment.
+  */
+ // Forward declaration uses `struct` to match the definition further down
+ // (avoids mismatched class/struct tags).
+ struct scan_valid_records_cursor;
+ using scan_extents_cursor = scan_valid_records_cursor;
+ using scan_extents_ertr = SegmentManager::read_ertr;
+ using scan_extents_ret_bare = std::list<std::pair<paddr_t, extent_info_t>>;
+ using scan_extents_ret = scan_extents_ertr::future<scan_extents_ret_bare>;
+ scan_extents_ret scan_extents(
+   scan_extents_cursor &cursor,
+   extent_len_t bytes_to_read
+ );
+
+
+private:
+ const extent_len_t block_size;
+ const extent_len_t max_record_length;
+
+ JournalSegmentProvider *segment_provider = nullptr;
+ SegmentManager &segment_manager;
+
+ segment_seq_t next_journal_segment_seq = 0;
+ segment_nonce_t current_segment_nonce = 0;
+
+ SegmentRef current_journal_segment;
+ segment_off_t written_to = 0;
+ segment_off_t committed_to = 0;
+
+ journal_seq_t get_journal_seq(paddr_t addr) {
+ return journal_seq_t{next_journal_segment_seq-1, addr};
+ }
+
+ /// prepare segment for writes, writes out segment header
+ using initialize_segment_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ initialize_segment_ertr::future<segment_seq_t> initialize_segment(
+ Segment &segment);
+
+ struct record_size_t {
+ extent_len_t mdlength = 0;
+ extent_len_t dlength = 0;
+
+ record_size_t(
+ extent_len_t mdlength,
+ extent_len_t dlength)
+ : mdlength(mdlength), dlength(dlength) {}
+ };
+
+ /**
+ * Return <mdlength, dlength> pair denoting length of
+ * metadata and blocks respectively.
+ */
+ record_size_t get_encoded_record_length(
+ const record_t &record) const;
+
+ /// create encoded record bl
+ ceph::bufferlist encode_record(
+ record_size_t rsize,
+ record_t &&record);
+
+ /// validate embedded metadata checksum
+ static bool validate_metadata(const bufferlist &bl);
+
+ /// read and validate data
+ using read_validate_data_ertr = SegmentManager::read_ertr;
+ using read_validate_data_ret = read_validate_data_ertr::future<bool>;
+ read_validate_data_ret read_validate_data(
+ paddr_t record_base,
+ const record_header_t &header ///< caller must ensure lifetime through
+ /// future resolution
+ );
+
+
+ /// do record write
+ using write_record_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using write_record_ret = write_record_ertr::future<paddr_t>;
+ write_record_ret write_record(
+ record_size_t rsize,
+ record_t &&record);
+
+ /// close current segment and initialize next one
+ using roll_journal_segment_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ roll_journal_segment_ertr::future<segment_seq_t> roll_journal_segment();
+
+ /// returns true iff current segment has insufficient space
+ bool needs_roll(segment_off_t length) const;
+
+ using read_segment_header_ertr = crimson::errorator<
+ crimson::ct_error::enoent,
+ crimson::ct_error::enodata,
+ crimson::ct_error::input_output_error
+ >;
+ using read_segment_header_ret = read_segment_header_ertr::future<
+ segment_header_t>;
+ read_segment_header_ret read_segment_header(segment_id_t segment);
+
+ /// return ordered vector of segments to replay
+ using replay_segments_t = std::vector<
+ std::pair<journal_seq_t, segment_header_t>>;
+ using find_replay_segments_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error
+ >;
+ using find_replay_segments_fut = find_replay_segments_ertr::future<
+ replay_segments_t>;
+ find_replay_segments_fut find_replay_segments();
+
+ /// attempts to decode deltas from bl, return nullopt if unsuccessful
+ std::optional<std::vector<delta_info_t>> try_decode_deltas(
+ record_header_t header,
+ const bufferlist &bl);
+
+ /// attempts to decode extent infos from bl, return nullopt if unsuccessful
+ std::optional<std::vector<extent_info_t>> try_decode_extent_infos(
+ record_header_t header,
+ const bufferlist &bl);
+
+ /// read record metadata for record starting at start
+ using read_validate_record_metadata_ertr = replay_ertr;
+ using read_validate_record_metadata_ret =
+ read_validate_record_metadata_ertr::future<
+ std::optional<std::pair<record_header_t, bufferlist>>
+ >;
+ read_validate_record_metadata_ret read_validate_record_metadata(
+ paddr_t start,
+ segment_nonce_t nonce);
+
+public:
+ /// scan segment for end incrementally
+ struct scan_valid_records_cursor {
+ bool last_valid_header_found = false; // NOTE(review): presumably set by scan_valid_records once no further valid header follows -- confirm
+ paddr_t offset; // position reported by get_offset()
+ paddr_t last_committed;
+
+ // A record located during the scan, queued in pending_records.
+ struct found_record_t {
+ paddr_t offset;
+ record_header_t header;
+ bufferlist mdbuffer;
+
+ found_record_t(
+ paddr_t offset,
+ const record_header_t &header,
+ const bufferlist &mdbuffer)
+ : offset(offset), header(header), mdbuffer(mdbuffer) {}
+ };
+ std::deque<found_record_t> pending_records;
+
+ // True once the last valid header has been found and no queued records remain.
+ bool is_complete() const {
+ return last_valid_header_found && pending_records.empty();
+ }
+
+ paddr_t get_offset() const {
+ return offset;
+ }
+
+ scan_valid_records_cursor(
+ paddr_t offset)
+ : offset(offset) {}
+ };
+private:
+
+ using scan_valid_records_ertr = SegmentManager::read_ertr;
+ using scan_valid_records_ret = scan_valid_records_ertr::future<
+ size_t>;
+ using found_record_handler_t = std::function<
+ scan_valid_records_ertr::future<>(
+ paddr_t record_block_base,
+ // callee may assume header and bl will remain valid until
+ // returned future resolves
+ const record_header_t &header,
+ const bufferlist &bl)>;
+ scan_valid_records_ret scan_valid_records(
+ scan_valid_records_cursor &cursor, ///< [in, out] cursor, updated during call
+ segment_nonce_t nonce, ///< [in] nonce for segment
+ size_t budget, ///< [in] max budget to use
+ found_record_handler_t &handler ///< [in] handler for records
+ ); ///< @return used budget
+
+ /// replays records starting at start through end of segment
+ replay_ertr::future<>
+ replay_segment(
+ journal_seq_t start, ///< [in] starting addr, seq
+ segment_header_t header, ///< [in] segment header
+ delta_handler_t &delta_handler ///< [in] processes deltas in order
+ );
+
+};
+
+}
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::segment_header_t)
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::record_header_t)
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::extent_info_t)
diff --git a/src/crimson/os/seastore/lba_manager.cc b/src/crimson/os/seastore/lba_manager.cc
new file mode 100644
index 000000000..73411dcf7
--- /dev/null
+++ b/src/crimson/os/seastore/lba_manager.cc
@@ -0,0 +1,17 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/os/seastore/segment_manager.h"
+#include "crimson/os/seastore/cache.h"
+#include "crimson/os/seastore/lba_manager.h"
+#include "crimson/os/seastore/lba_manager/btree/btree_lba_manager.h"
+
+namespace crimson::os::seastore::lba_manager {
+
+ /// Constructs the btree-backed LBAManager over the given segment manager
+ /// and cache and returns it as an owning LBAManagerRef.
+ LBAManagerRef create_lba_manager(
+   SegmentManager &segment_manager,
+   Cache &cache) {
+   // make_unique instead of a naked new; the unique_ptr<BtreeLBAManager>
+   // converts to LBAManagerRef on return.
+   return std::make_unique<btree::BtreeLBAManager>(segment_manager, cache);
+ }
+
+}
diff --git a/src/crimson/os/seastore/lba_manager.h b/src/crimson/os/seastore/lba_manager.h
new file mode 100644
index 000000000..ad90f4c4f
--- /dev/null
+++ b/src/crimson/os/seastore/lba_manager.h
@@ -0,0 +1,207 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include <seastar/core/future.hh>
+
+#include "include/ceph_assert.h"
+#include "include/buffer_fwd.h"
+#include "include/interval_set.h"
+#include "common/interval_map.h"
+
+#include "crimson/osd/exceptions.h"
+
+#include "crimson/os/seastore/cache.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/segment_manager.h"
+
+namespace crimson::os::seastore {
+
+/**
+ * Abstract interface for managing the logical to physical mapping
+ */
+class LBAManager {
+public:
+ using mkfs_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using mkfs_ret = mkfs_ertr::future<>;
+ virtual mkfs_ret mkfs(
+ Transaction &t
+ ) = 0;
+
+ /**
+ * Fetches mappings for laddr_t in range [offset, offset + len)
+ *
+ * Future will not resolve until all pins have resolved (set_paddr called)
+ */
+ using get_mapping_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using get_mapping_ret = get_mapping_ertr::future<lba_pin_list_t>;
+ virtual get_mapping_ret get_mapping(
+ Transaction &t,
+ laddr_t offset, extent_len_t length) = 0;
+
+ /**
+  * Fetches mappings for every laddr_t range in extent_list
+  *
+  * Future will not resolve until all pins have resolved (set_paddr called)
+  */
+ using get_mappings_ertr = crimson::errorator<
+   crimson::ct_error::input_output_error>;
+ // Built from get_mappings_ertr (not get_mapping_ertr) so the alias follows
+ // its own errorator; the two are currently the same type.
+ using get_mappings_ret = get_mappings_ertr::future<lba_pin_list_t>;
+ virtual get_mappings_ret get_mappings(
+   Transaction &t,
+   laddr_list_t &&extent_list) = 0;
+
+ /**
+ * Allocates a new mapping referenced by LBARef
+ *
+ * Offset will be relative to the block offset of the record
+ * This mapping will block from transaction submission until set_paddr
+ * is called on the LBAPin.
+ */
+ using alloc_extent_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using alloc_extent_ret = alloc_extent_ertr::future<LBAPinRef>;
+ virtual alloc_extent_ret alloc_extent(
+ Transaction &t,
+ laddr_t hint,
+ extent_len_t len,
+ paddr_t addr) = 0;
+
+ /**
+ * Creates a new absolute mapping.
+ *
+ * off~len must be unreferenced
+ */
+ using set_extent_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg>;
+ using set_extent_ret = set_extent_ertr::future<LBAPinRef>;
+ virtual set_extent_ret set_extent(
+ Transaction &t,
+ laddr_t off, extent_len_t len, paddr_t addr) = 0;
+
+
+ struct ref_update_result_t {
+ unsigned refcount = 0;
+ paddr_t addr;
+ };
+ using ref_ertr = crimson::errorator<
+ crimson::ct_error::enoent,
+ crimson::ct_error::input_output_error>;
+ using ref_ret = ref_ertr::future<ref_update_result_t>;
+
+ /**
+ * Decrements ref count on extent
+ *
+ * @return returns resulting refcount
+ */
+ virtual ref_ret decref_extent(
+ Transaction &t,
+ laddr_t addr) = 0;
+
+ /**
+ * Increments ref count on extent
+ *
+ * @return returns resulting refcount
+ */
+ virtual ref_ret incref_extent(
+ Transaction &t,
+ laddr_t addr) = 0;
+
+ using complete_transaction_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using complete_transaction_ret = complete_transaction_ertr::future<>;
+ virtual complete_transaction_ret complete_transaction(
+ Transaction &t) = 0;
+
+ /**
+ * Should be called after replay on each cached extent.
+ * Implementation must initialize the LBAPin on any
+ * LogicalCachedExtent's and may also read in any dependent
+ * structures, etc.
+ */
+ using init_cached_extent_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using init_cached_extent_ret = init_cached_extent_ertr::future<>;
+ virtual init_cached_extent_ret init_cached_extent(
+ Transaction &t,
+ CachedExtentRef e) = 0;
+
+ /**
+ * Calls f for each mapping in [begin, end)
+ */
+ using scan_mappings_ertr = SegmentManager::read_ertr;
+ using scan_mappings_ret = scan_mappings_ertr::future<>;
+ using scan_mappings_func_t = std::function<
+ void(laddr_t, paddr_t, extent_len_t)>;
+ virtual scan_mappings_ret scan_mappings(
+ Transaction &t,
+ laddr_t begin,
+ laddr_t end,
+ scan_mappings_func_t &&f) = 0;
+
+ /**
+ * Calls f for each mapped space usage in [begin, end)
+ */
+ using scan_mapped_space_ertr = SegmentManager::read_ertr;
+ using scan_mapped_space_ret = scan_mapped_space_ertr::future<>;
+ using scan_mapped_space_func_t = std::function<
+ void(paddr_t, extent_len_t)>;
+ virtual scan_mapped_space_ret scan_mapped_space(
+ Transaction &t,
+ scan_mapped_space_func_t &&f) = 0;
+
+ /**
+ * rewrite_extent
+ *
+ * rewrite extent into passed transaction
+ */
+ using rewrite_extent_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using rewrite_extent_ret = rewrite_extent_ertr::future<>;
+ virtual rewrite_extent_ret rewrite_extent(
+ Transaction &t,
+ CachedExtentRef extent) = 0;
+
+ /**
+ * get_physical_extent_if_live
+ *
+ * Returns extent at addr/laddr if still live (if laddr
+ * still points at addr). Extent must be an internal, physical
+ * extent.
+ *
+ * Returns a null CachedExtentRef if extent is not live.
+ */
+ using get_physical_extent_if_live_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using get_physical_extent_if_live_ret =
+ get_physical_extent_if_live_ertr::future<CachedExtentRef>;
+ virtual get_physical_extent_if_live_ret get_physical_extent_if_live(
+ Transaction &t,
+ extent_types_t type,
+ paddr_t addr,
+ laddr_t laddr,
+ segment_off_t len) = 0;
+
+ virtual void add_pin(LBAPin &pin) = 0;
+
+ virtual ~LBAManager() {}
+};
+using LBAManagerRef = std::unique_ptr<LBAManager>;
+
+class Cache;
+namespace lba_manager {
+LBAManagerRef create_lba_manager(
+ SegmentManager &segment_manager,
+ Cache &cache);
+}
+
+}
diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc
new file mode 100644
index 000000000..a837ae37e
--- /dev/null
+++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc
@@ -0,0 +1,580 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <sys/mman.h>
+#include <string.h>
+
+#include "crimson/common/log.h"
+
+#include "include/buffer.h"
+#include "crimson/os/seastore/lba_manager/btree/btree_lba_manager.h"
+#include "crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h"
+
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+}
+
+namespace crimson::os::seastore::lba_manager::btree {
+
+ /// Initializes an empty LBA tree: allocates one empty leaf node in t and
+ /// stores a fresh root_t referring to it in the root block.
+ BtreeLBAManager::mkfs_ret BtreeLBAManager::mkfs(
+   Transaction &t)
+ {
+   logger().debug("BtreeLBAManager::mkfs");
+   return cache.get_root(t).safe_then([this, &t](auto croot) {
+     lba_node_meta_t leaf_meta{0, L_ADDR_MAX, 1};
+     auto leaf = cache.alloc_new_extent<LBALeafNode>(t, LBA_BLOCK_SIZE);
+     leaf->set_size(0);
+     leaf->set_meta(leaf_meta);
+     leaf->pin.set_range(leaf_meta);
+
+     croot->get_root() = root_t{
+       1,
+       0,
+       leaf->get_paddr(),
+       make_record_relative_paddr(0),
+       L_ADDR_NULL};
+     return mkfs_ertr::now();
+   });
+ }
+
+ /// Reads the root block and loads the LBA tree node it names
+ /// (lba_root_addr at lba_depth).
+ BtreeLBAManager::get_root_ret
+ BtreeLBAManager::get_root(Transaction &t)
+ {
+   return cache.get_root(t).safe_then([this, &t](auto croot) {
+     const auto &root_info = croot->get_root();
+     logger().debug(
+       "BtreeLBAManager::get_root: reading root at {} depth {}",
+       paddr_t{root_info.lba_root_addr},
+       unsigned(root_info.lba_depth));
+     return get_lba_btree_extent(
+       get_context(t),
+       root_info.lba_depth,
+       root_info.lba_root_addr,
+       paddr_t());
+   });
+ }
+
+ /// Looks up the pins for the range offset~length starting from the tree
+ /// root.
+ BtreeLBAManager::get_mapping_ret
+ BtreeLBAManager::get_mapping(
+   Transaction &t,
+   laddr_t offset, extent_len_t length)
+ {
+   logger().debug("BtreeLBAManager::get_mapping: {}, {}", offset, length);
+   return get_root(t).safe_then([this, &t, offset, length](auto root) {
+     // root is captured by the inner continuation so it stays referenced
+     // until lookup_range has resolved
+     return root->lookup_range(
+       get_context(t),
+       offset, length
+     ).safe_then([root](auto pins) { return pins; });
+   }).safe_then([](auto &&pins) {
+     logger().debug("BtreeLBAManager::get_mapping: got mapping {}", pins);
+     return get_mapping_ret(
+       get_mapping_ertr::ready_future_marker{},
+       std::move(pins));
+   });
+ }
+
+
+ /// Runs get_mapping for each (first, second) entry of list in order and
+ /// concatenates the resulting pins into one list.
+ BtreeLBAManager::get_mappings_ret
+ BtreeLBAManager::get_mappings(
+   Transaction &t,
+   laddr_list_t &&list)
+ {
+   logger().debug("BtreeLBAManager::get_mappings: {}", list);
+   // Both lists live on the heap and are moved into the final continuation
+   // so they stay valid while the loop is in flight.
+   auto ranges = std::make_unique<laddr_list_t>(std::move(list));
+   auto collected = std::make_unique<lba_pin_list_t>();
+   auto &out = *collected;
+   return crimson::do_for_each(
+     ranges->begin(),
+     ranges->end(),
+     [this, &t, &out](const auto &range) {
+       return get_mapping(t, range.first, range.second).safe_then(
+         [&out](auto pins) {
+           out.splice(out.end(), pins, pins.begin(), pins.end());
+         });
+     }).safe_then(
+       [ranges=std::move(ranges), collected=std::move(collected)]() mutable {
+         return std::move(*collected);
+       });
+ }
+
+ /// Finds a hole of len at or after hint (via find_hole) and inserts a
+ /// mapping there for addr, returning the new pin.
+ BtreeLBAManager::alloc_extent_ret
+ BtreeLBAManager::alloc_extent(
+   Transaction &t,
+   laddr_t hint,
+   extent_len_t len,
+   paddr_t addr)
+ {
+   // TODO: we can certainly combine the lookup and the insert.
+   return get_root(t).safe_then([this, &t, hint, len](auto root) {
+     logger().debug(
+       "BtreeLBAManager::alloc_extent: beginning search at {}",
+       *root);
+     return root->find_hole(
+       get_context(t),
+       hint,
+       L_ADDR_MAX,
+       len
+     ).safe_then([root](auto hole) {
+       // hand the root along with the hole so the insert can reuse it
+       return std::make_pair(hole, root);
+     });
+   }).safe_then([this, &t, len, addr](auto found) {
+     auto &[laddr, root] = found;
+     ceph_assert(laddr != L_ADDR_MAX);
+     return insert_mapping(
+       t,
+       root,
+       laddr,
+       { len, addr, 1, 0 }
+     ).safe_then([laddr=laddr, addr, len](auto pin) {
+       logger().debug(
+         "BtreeLBAManager::alloc_extent: alloc {}~{} for {}",
+         laddr,
+         len,
+         addr);
+       return alloc_extent_ret(
+         alloc_extent_ertr::ready_future_marker{},
+         LBAPinRef(pin.release()));
+     });
+   });
+ }
+
+ /// Inserts a mapping for off~len -> addr at exactly the given laddr and
+ /// returns the resulting pin.
+ BtreeLBAManager::set_extent_ret
+ BtreeLBAManager::set_extent(
+   Transaction &t,
+   laddr_t off, extent_len_t len, paddr_t addr)
+ {
+   return get_root(t).safe_then([this, &t, off, len, addr](auto lba_root) {
+     return insert_mapping(t, lba_root, off, { len, addr, 1, 0 });
+   }).safe_then([](auto pin) {
+     return set_extent_ret(
+       set_extent_ertr::ready_future_marker{},
+       LBAPinRef(pin.release()));
+   });
+ }
+
+ /// True for the two LBA tree node types (internal and leaf).
+ static bool is_lba_node(extent_types_t type)
+ {
+   switch (type) {
+   case extent_types_t::LADDR_INTERNAL:
+   case extent_types_t::LADDR_LEAF:
+     return true;
+   default:
+     return false;
+   }
+ }
+
+ /// Convenience overload: classifies an extent by its get_type().
+ static bool is_lba_node(const CachedExtent &e)
+ {
+   const extent_types_t type = e.get_type();
+   return is_lba_node(type);
+ }
+
+ /// Returns the btree_range_pin_t of e: the node's own pin for an LBA node,
+ /// or the pin inside the extent's BtreeLBAPin for a logical extent.
+ /// Aborts for any other extent.
+ btree_range_pin_t &BtreeLBAManager::get_pin(CachedExtent &e)
+ {
+   if (is_lba_node(e)) {
+     return e.cast<LBANode>()->pin;
+   }
+   if (e.is_logical()) {
+     auto &lba_pin = static_cast<BtreeLBAPin &>(
+       e.cast<LogicalCachedExtent>()->get_pin());
+     return lba_pin.pin;
+   }
+   ceph_abort_msg("impossible");
+ }
+
+ /// Depth used to order extents in complete_transaction: an LBA node
+ /// reports the depth from its node meta, a logical extent counts as 0.
+ /// Aborts for any other extent, matching get_pin().
+ static depth_t get_depth(const CachedExtent &e)
+ {
+   if (is_lba_node(e)) {
+     return e.cast<LBANode>()->get_node_meta().depth;
+   } else if (e.is_logical()) {
+     return 0;
+   } else {
+     // same abort style as get_pin(); replaces ceph_assert(0 == "...") and
+     // the unreachable return that followed it
+     ceph_abort_msg("currently impossible");
+   }
+ }
+
+ // Brings pin_set up to date for transaction t. In order: retires the pins
+ // of retired extents, adds the pins of valid fresh extents, then runs
+ // check_parent on the retired pins. Only logical extents and LBA nodes
+ // are considered.
+ BtreeLBAManager::complete_transaction_ret
+ BtreeLBAManager::complete_transaction(
+ Transaction &t)
+ {
+ std::vector<CachedExtentRef> to_clear;
+ to_clear.reserve(t.get_retired_set().size());
+ for (auto &e: t.get_retired_set()) {
+ if (e->is_logical() || is_lba_node(*e))
+ to_clear.push_back(e);
+ }
+ // need to call check_parent from leaf->parent
+ // (ascending depth; logical extents count as depth 0)
+ std::sort(
+ to_clear.begin(), to_clear.end(),
+ [](auto &l, auto &r) { return get_depth(*l) < get_depth(*r); });
+
+ for (auto &e: to_clear) {
+ auto &pin = get_pin(*e);
+ logger().debug("{}: retiring {}, {}", __func__, *e, pin);
+ pin_set.retire(pin);
+ }
+
+ // ...but add_pin from parent->leaf
+ // (descending depth)
+ std::vector<CachedExtentRef> to_link;
+ to_link.reserve(t.get_fresh_block_list().size());
+ for (auto &e: t.get_fresh_block_list()) {
+ if (e->is_valid() && (is_lba_node(*e) || e->is_logical()))
+ to_link.push_back(e);
+ }
+ std::sort(
+ to_link.begin(), to_link.end(),
+ [](auto &l, auto &r) -> bool { return get_depth(*l) > get_depth(*r); });
+
+ for (auto &e : to_link) {
+ logger().debug("{}: linking {}", __func__, *e);
+ pin_set.add_pin(get_pin(*e));
+ }
+
+ // second pass over the retired extents, after the fresh pins are linked
+ for (auto &e: to_clear) {
+ auto &pin = get_pin(*e);
+ logger().debug("{}: checking {}, {}", __func__, *e, pin);
+ pin_set.check_parent(pin);
+ }
+ return complete_transaction_ertr::now();
+ }
+
+ // Checks cached extent e against the current LBA tree and either
+ // initializes it or drops it from the cache as obsolete.
+ //  - LBA node: kept if the node found for its (begin, depth) has the same
+ //    paddr.
+ //  - logical extent: kept if exactly one pin covers its laddr~length and
+ //    that pin's paddr matches; the pin is then set on the extent and added
+ //    to pin_set.
+ //  - anything else: skipped.
+ BtreeLBAManager::init_cached_extent_ret BtreeLBAManager::init_cached_extent(
+ Transaction &t,
+ CachedExtentRef e)
+ {
+ logger().debug("{}: {}", __func__, *e);
+ return get_root(t).safe_then(
+ [this, &t, e=std::move(e)](LBANodeRef root) mutable {
+ if (is_lba_node(*e)) {
+ auto lban = e->cast<LBANode>();
+ logger().debug("init_cached_extent: lba node, getting root");
+ return root->lookup(
+ op_context_t{cache, pin_set, t},
+ lban->get_node_meta().begin,
+ lban->get_node_meta().depth
+ ).safe_then([this, e=std::move(e)](LBANodeRef c) {
+ if (c->get_paddr() == e->get_paddr()) {
+ // same address: the lookup is expected to return e itself
+ assert(&*c == &*e);
+ logger().debug("init_cached_extent: {} initialized", *e);
+ } else {
+ // e is obsolete
+ logger().debug("init_cached_extent: {} obsolete", *e);
+ cache.drop_from_cache(e);
+ }
+ return init_cached_extent_ertr::now();
+ });
+ } else if (e->is_logical()) {
+ auto logn = e->cast<LogicalCachedExtent>();
+ return root->lookup_range(
+ op_context_t{cache, pin_set, t},
+ logn->get_laddr(),
+ logn->get_length()).safe_then(
+ [this, logn=std::move(logn)](auto pins) {
+ if (pins.size() == 1) {
+ auto pin = std::move(pins.front());
+ pins.pop_front();
+ if (pin->get_paddr() == logn->get_paddr()) {
+ logn->set_pin(std::move(pin));
+ pin_set.add_pin(
+ static_cast<BtreeLBAPin&>(logn->get_pin()).pin);
+ logger().debug("init_cached_extent: {} initialized", *logn);
+ } else {
+ // paddr doesn't match, remapped, obsolete
+ logger().debug("init_cached_extent: {} obsolete", *logn);
+ cache.drop_from_cache(logn);
+ }
+ } else {
+ // set of extents changed, obsolete
+ logger().debug("init_cached_extent: {} obsolete", *logn);
+ cache.drop_from_cache(logn);
+ }
+ return init_cached_extent_ertr::now();
+ });
+ } else {
+ logger().debug("init_cached_extent: {} skipped", *e);
+ return init_cached_extent_ertr::now();
+ }
+ });
+ }
+
+ /// Calls f for each mapping in [begin, end), starting from the tree root.
+ ///
+ /// f and a reference to the root node are held by do_with until the
+ /// returned future resolves.
+ BtreeLBAManager::scan_mappings_ret BtreeLBAManager::scan_mappings(
+   Transaction &t,
+   laddr_t begin,
+   laddr_t end,
+   scan_mappings_func_t &&f)
+ {
+   return seastar::do_with(
+     std::move(f),
+     LBANodeRef(),
+     [=, &t](auto &f, auto &lbarootref) {
+       return get_root(t).safe_then(
+         // lbarootref must be captured by reference: a by-copy capture
+         // would assign to a closure-local copy and leave the do_with-held
+         // ref null, so the root would not be kept alive for the scan.
+         [=, &t, &f, &lbarootref](LBANodeRef lbaroot) {
+           lbarootref = lbaroot;
+           return lbaroot->scan_mappings(
+             get_context(t),
+             begin,
+             end,
+             f);
+         });
+     });
+ }
+
+ /// Calls f for each mapped space usage reported by the tree, starting
+ /// from the root.
+ ///
+ /// f and a reference to the root node are held by do_with until the
+ /// returned future resolves.
+ BtreeLBAManager::scan_mapped_space_ret BtreeLBAManager::scan_mapped_space(
+   Transaction &t,
+   scan_mapped_space_func_t &&f)
+ {
+   return seastar::do_with(
+     std::move(f),
+     LBANodeRef(),
+     [=, &t](auto &f, auto &lbarootref) {
+       return get_root(t).safe_then(
+         // lbarootref must be captured by reference: a by-copy capture
+         // would assign to a closure-local copy and leave the do_with-held
+         // ref null, so the root would not be kept alive for the scan.
+         [=, &t, &f, &lbarootref](LBANodeRef lbaroot) {
+           lbarootref = lbaroot;
+           return lbaroot->scan_mapped_space(
+             get_context(t),
+             f);
+         });
+     });
+ }
+
+ // Rewrites extent into transaction t: retires it, allocates a new extent
+ // of the same type and length, and copies the contents across.
+ //  - logical extent: the mapping at its laddr is updated to the new paddr.
+ //  - LBA node: relative addresses are adjusted and the reference to the
+ //    node is updated through update_internal_mapping.
+ //  - any other extent: nothing is done.
+ BtreeLBAManager::rewrite_extent_ret BtreeLBAManager::rewrite_extent(
+ Transaction &t,
+ CachedExtentRef extent)
+ {
+ if (extent->is_logical()) {
+ auto lextent = extent->cast<LogicalCachedExtent>();
+ cache.retire_extent(t, extent);
+ auto nlextent = cache.alloc_new_extent_by_type(
+ t,
+ lextent->get_type(),
+ lextent->get_length())->cast<LogicalCachedExtent>();
+ lextent->get_bptr().copy_out(
+ 0,
+ lextent->get_length(),
+ nlextent->get_bptr().c_str());
+ nlextent->set_laddr(lextent->get_laddr());
+ nlextent->set_pin(lextent->get_pin().duplicate());
+
+ logger().debug(
+ "{}: rewriting {} into {}",
+ __func__,
+ *lextent,
+ *nlextent);
+
+ return update_mapping(
+ t,
+ lextent->get_laddr(),
+ [prev_addr = lextent->get_paddr(), addr = nlextent->get_paddr()](
+ const lba_map_val_t &in) {
+ lba_map_val_t ret = in;
+ // the existing mapping must still point at the extent being rewritten
+ ceph_assert(in.paddr == prev_addr);
+ ret.paddr = addr;
+ return ret;
+ }).safe_then([nlextent](auto e) {}).handle_error(
+ rewrite_extent_ertr::pass_further{},
+ /* ENOENT in particular should be impossible */
+ crimson::ct_error::assert_all{}
+ );
+ } else if (is_lba_node(*extent)) {
+ auto lba_extent = extent->cast<LBANode>();
+ cache.retire_extent(t, extent);
+ auto nlba_extent = cache.alloc_new_extent_by_type(
+ t,
+ lba_extent->get_type(),
+ lba_extent->get_length())->cast<LBANode>();
+ lba_extent->get_bptr().copy_out(
+ 0,
+ lba_extent->get_length(),
+ nlba_extent->get_bptr().c_str());
+ nlba_extent->pin.set_range(nlba_extent->get_node_meta());
+
+ /* This is a bit underhanded. Any relative addrs here must necessarily
+ * be record relative as we are rewriting a dirty extent. Thus, we
+ * are using resolve_relative_addrs with a (likely negative) block
+ * relative offset to correct them to block-relative offsets adjusted
+ * for our new transaction location.
+ *
+ * Upon commit, these now block relative addresses will be interpreted
+ * against the real final address.
+ */
+ nlba_extent->resolve_relative_addrs(
+ make_record_relative_paddr(0) - nlba_extent->get_paddr());
+
+ return update_internal_mapping(
+ t,
+ nlba_extent->get_node_meta().depth,
+ nlba_extent->get_node_meta().begin,
+ nlba_extent->get_paddr()).safe_then(
+ [](auto) {},
+ rewrite_extent_ertr::pass_further {},
+ crimson::ct_error::assert_all{});
+ } else {
+ return rewrite_extent_ertr::now();
+ }
+ }
+
+ // Loads the LBA node extent described by (type, addr, laddr, len) and
+ // checks whether the tree still refers to it: the node found for its
+ // (begin, depth) must have the same paddr. Returns the extent if so;
+ // otherwise drops it from the cache and returns a null CachedExtentRef.
+ // type must be an LBA node type.
+ BtreeLBAManager::get_physical_extent_if_live_ret
+ BtreeLBAManager::get_physical_extent_if_live(
+ Transaction &t,
+ extent_types_t type,
+ paddr_t addr,
+ laddr_t laddr,
+ segment_off_t len)
+ {
+ ceph_assert(is_lba_node(type));
+ return cache.get_extent_by_type(
+ t,
+ type,
+ addr,
+ laddr,
+ len
+ ).safe_then([=, &t](CachedExtentRef extent) {
+ return get_root(t).safe_then([=, &t](LBANodeRef root) {
+ auto lba_node = extent->cast<LBANode>();
+ return root->lookup(
+ op_context_t{cache, pin_set, t},
+ lba_node->get_node_meta().begin,
+ lba_node->get_node_meta().depth).safe_then([=](LBANodeRef c) {
+ if (c->get_paddr() == lba_node->get_paddr()) {
+ // still referenced by the tree: live
+ return get_physical_extent_if_live_ret(
+ get_physical_extent_if_live_ertr::ready_future_marker{},
+ lba_node);
+ } else {
+ // the tree points elsewhere: not live
+ cache.drop_from_cache(lba_node);
+ return get_physical_extent_if_live_ret(
+ get_physical_extent_if_live_ertr::ready_future_marker{},
+ CachedExtentRef());
+ }
+ });
+ });
+ });
+ }
+
+BtreeLBAManager::BtreeLBAManager(
+ SegmentManager &segment_manager,
+ Cache &cache)
+ : segment_manager(segment_manager),
+ cache(cache) {}
+
+ // Inserts the mapping laddr -> val into the tree rooted at root.
+ //
+ // If root is at max capacity, a new internal node one level deeper is
+ // allocated first, the root block is pointed at it, and the old root is
+ // split beneath it; the insert then proceeds from the node returned by
+ // split_entry.
+ BtreeLBAManager::insert_mapping_ret BtreeLBAManager::insert_mapping(
+ Transaction &t,
+ LBANodeRef root,
+ laddr_t laddr,
+ lba_map_val_t val)
+ {
+ // default: no split needed, insert directly from root
+ auto split = insert_mapping_ertr::future<LBANodeRef>(
+ insert_mapping_ertr::ready_future_marker{},
+ root);
+ if (root->at_max_capacity()) {
+ split = cache.get_root(t).safe_then(
+ [this, root, laddr, &t](RootBlockRef croot) {
+ logger().debug(
+ "BtreeLBAManager::insert_mapping: splitting root {}",
+ *croot);
+ {
+ // work on the copy of the root block returned by duplicate_for_write
+ auto mut_croot = cache.duplicate_for_write(t, croot);
+ croot = mut_croot->cast<RootBlock>();
+ }
+ auto nroot = cache.alloc_new_extent<LBAInternalNode>(t, LBA_BLOCK_SIZE);
+ lba_node_meta_t meta{0, L_ADDR_MAX, root->get_node_meta().depth + 1};
+ nroot->set_meta(meta);
+ nroot->pin.set_range(meta);
+ // the new root starts with a single entry referring to the old root
+ nroot->journal_insert(
+ nroot->begin(),
+ L_ADDR_MIN,
+ root->get_paddr(),
+ nullptr);
+ croot->get_root().lba_root_addr = nroot->get_paddr();
+ croot->get_root().lba_depth = root->get_node_meta().depth + 1;
+ return nroot->split_entry(
+ get_context(t),
+ laddr, nroot->begin(), root);
+ });
+ }
+ return split.safe_then([this, &t, laddr, val](LBANodeRef node) {
+ return node->insert(
+ get_context(t),
+ laddr, val);
+ });
+ }
+
+ /// Adds delta to the refcount of the mapping at addr (asserting it does
+ /// not go negative) and returns the resulting refcount and paddr.
+ BtreeLBAManager::update_refcount_ret BtreeLBAManager::update_refcount(
+   Transaction &t,
+   laddr_t addr,
+   int delta)
+ {
+   return update_mapping(
+     t,
+     addr,
+     [delta](const lba_map_val_t &current) {
+       lba_map_val_t updated = current;
+       ceph_assert(static_cast<int>(updated.refcount) + delta >= 0);
+       updated.refcount += delta;
+       return updated;
+     }
+   ).safe_then([](auto mapping) {
+     return ref_update_result_t{mapping.refcount, mapping.paddr};
+   });
+ }
+
+ /// Applies f to the mapping at addr via mutate_mapping on the tree root.
+ BtreeLBAManager::update_mapping_ret BtreeLBAManager::update_mapping(
+   Transaction &t,
+   laddr_t addr,
+   update_func_t &&f)
+ {
+   return get_root(t).safe_then(
+     [this, &t, addr, mutate=std::move(f)](LBANodeRef lba_root) mutable {
+       return lba_root->mutate_mapping(
+         get_context(t),
+         addr,
+         std::move(mutate));
+     });
+ }
+
+ // Points the reference to the LBA node identified by (depth, laddr) at
+ // paddr.
+ //
+ // If depth equals the tree depth the node is the root, so the root block
+ // is updated and the previous root address returned. Otherwise the change
+ // is delegated to mutate_internal_address on the tree root.
+ BtreeLBAManager::update_internal_mapping_ret
+ BtreeLBAManager::update_internal_mapping(
+ Transaction &t,
+ depth_t depth,
+ laddr_t laddr,
+ paddr_t paddr)
+ {
+ return cache.get_root(t).safe_then([=, &t](RootBlockRef croot) {
+ if (depth == croot->get_root().lba_depth) {
+ logger().debug(
+ "update_internal_mapping: updating lba root to: {}->{}",
+ laddr,
+ paddr);
+ {
+ // work on the copy of the root block returned by duplicate_for_write
+ auto mut_croot = cache.duplicate_for_write(t, croot);
+ croot = mut_croot->cast<RootBlock>();
+ }
+ // the root node is expected to begin at laddr 0
+ ceph_assert(laddr == 0);
+ auto old_paddr = croot->get_root().lba_root_addr;
+ croot->get_root().lba_root_addr = paddr;
+ return update_internal_mapping_ret(
+ update_internal_mapping_ertr::ready_future_marker{},
+ old_paddr);
+ } else {
+ logger().debug(
+ "update_internal_mapping: updating lba node at depth {} to: {}->{}",
+ depth,
+ laddr,
+ paddr);
+ return get_lba_btree_extent(
+ get_context(t),
+ croot->get_root().lba_depth,
+ croot->get_root().lba_root_addr,
+ paddr_t()).safe_then([=, &t](LBANodeRef broot) {
+ return broot->mutate_internal_address(
+ get_context(t),
+ depth,
+ laddr,
+ paddr);
+ });
+ }
+ });
+ }
+
+}
diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h
new file mode 100644
index 000000000..640d56734
--- /dev/null
+++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h
@@ -0,0 +1,188 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <seastar/core/future.hh>
+
+#include "include/ceph_assert.h"
+#include "include/buffer_fwd.h"
+#include "include/interval_set.h"
+#include "common/interval_map.h"
+#include "crimson/osd/exceptions.h"
+
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/lba_manager.h"
+#include "crimson/os/seastore/cache.h"
+#include "crimson/os/seastore/segment_manager.h"
+
+#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h"
+
+namespace crimson::os::seastore::lba_manager::btree {
+
+/**
+ * BtreeLBAManager
+ *
+ * Uses a wandering btree to track two things:
+ * 1) lba state including laddr_t -> paddr_t mapping
+ * 2) reverse paddr_t -> laddr_t mapping for gc (TODO)
+ *
+ * Generally, any transaction will involve
+ * 1) deltas against lba tree nodes
+ * 2) new lba tree nodes
+ * - Note, there must necessarily be a delta linking
+ * these new nodes into the tree -- might be a
+ * bootstrap_state_t delta if new root
+ *
+ * get_mappings, alloc_extent_*, etc populate a Transaction
+ * which then gets submitted
+ */
+class BtreeLBAManager : public LBAManager {
+public:
+  BtreeLBAManager(
+    SegmentManager &segment_manager,
+    Cache &cache);
+
+  mkfs_ret mkfs(
+    Transaction &t) final;
+
+  get_mapping_ret get_mapping(
+    Transaction &t,
+    laddr_t offset, extent_len_t length) final;
+
+  get_mappings_ret get_mappings(
+    Transaction &t,
+    laddr_list_t &&list) final;
+
+  alloc_extent_ret alloc_extent(
+    Transaction &t,
+    laddr_t hint,
+    extent_len_t len,
+    paddr_t addr) final;
+
+  set_extent_ret set_extent(
+    Transaction &t,
+    laddr_t off, extent_len_t len, paddr_t addr) final;
+
+  /// Drops one reference from the mapping at addr (update_refcount with -1).
+  ref_ret decref_extent(
+    Transaction &t,
+    laddr_t addr) final {
+    return update_refcount(t, addr, -1);
+  }
+
+  /// Adds one reference to the mapping at addr (update_refcount with +1).
+  ref_ret incref_extent(
+    Transaction &t,
+    laddr_t addr) final {
+    return update_refcount(t, addr, 1);
+  }
+
+  complete_transaction_ret complete_transaction(
+    Transaction &t) final;
+
+  init_cached_extent_ret init_cached_extent(
+    Transaction &t,
+    CachedExtentRef e) final;
+
+  scan_mappings_ret scan_mappings(
+    Transaction &t,
+    laddr_t begin,
+    laddr_t end,
+    scan_mappings_func_t &&f) final;
+
+  scan_mapped_space_ret scan_mapped_space(
+    Transaction &t,
+    scan_mapped_space_func_t &&f) final;
+
+  rewrite_extent_ret rewrite_extent(
+    Transaction &t,
+    CachedExtentRef extent);
+
+  get_physical_extent_if_live_ret get_physical_extent_if_live(
+    Transaction &t,
+    extent_types_t type,
+    paddr_t addr,
+    laddr_t laddr,
+    segment_off_t len) final;
+
+  /**
+   * add_pin
+   *
+   * Registers the range pin embedded in pin with pin_set and then clears
+   * the pin's parent reference.
+   */
+  void add_pin(LBAPin &pin) final {
+    // BtreeLBAPin publicly derives from LBAPin, so static_cast is the
+    // correct downcast (BtreeLBAPin::take_pin downcasts the same way);
+    // reinterpret_cast would skip any base-to-derived pointer adjustment.
+    auto *bpin = static_cast<BtreeLBAPin*>(&pin);
+    pin_set.add_pin(bpin->pin);
+    bpin->parent = nullptr;
+  }
+
+private:
+  SegmentManager &segment_manager;
+  Cache &cache;
+
+  /// Set of range pins for extents tracked by this manager
+  btree_pin_set_t pin_set;
+
+  /// Bundles cache, pin_set and t into the context passed to LBANode calls
+  op_context_t get_context(Transaction &t) {
+    return op_context_t{cache, pin_set, t};
+  }
+
+  static btree_range_pin_t &get_pin(CachedExtent &e);
+
+
+  /**
+   * get_root
+   *
+   * Get a reference to the root LBANode.
+   */
+  using get_root_ertr = Cache::get_extent_ertr;
+  using get_root_ret = get_root_ertr::future<LBANodeRef>;
+  get_root_ret get_root(Transaction &);
+
+  /**
+   * insert_mapping
+   *
+   * Insert a lba mapping into the tree
+   */
+  using insert_mapping_ertr = crimson::errorator<
+    crimson::ct_error::input_output_error>;
+  using insert_mapping_ret = insert_mapping_ertr::future<LBAPinRef>;
+  insert_mapping_ret insert_mapping(
+    Transaction &t,   ///< [in,out] transaction
+    LBANodeRef root,  ///< [in] root node
+    laddr_t laddr,    ///< [in] logical addr to insert
+    lba_map_val_t val ///< [in] mapping to insert
+    );
+
+  /**
+   * update_refcount
+   *
+   * Updates refcount, returns resulting refcount
+   */
+  using update_refcount_ret = ref_ret;
+  update_refcount_ret update_refcount(
+    Transaction &t,
+    laddr_t addr,
+    int delta);
+
+  /**
+   * update_mapping
+   *
+   * Applies f to the mapping at addr and resolves to the resulting value.
+   *
+   * NOTE(review): this comment used to say the mapping is removed when f
+   * returns nullopt, but update_func_t (LBANode::mutate_func_t) returns a
+   * plain lba_map_val_t. Any removal policy would live in the node
+   * implementation -- confirm there.
+   */
+  using update_mapping_ertr = ref_ertr;
+  using update_mapping_ret = ref_ertr::future<lba_map_val_t>;
+  using update_func_t = LBANode::mutate_func_t;
+  update_mapping_ret update_mapping(
+    Transaction &t,
+    laddr_t addr,
+    update_func_t &&f);
+
+  /**
+   * update_internal_mapping
+   *
+   * Points the entry for the node at (depth, laddr) to paddr and resolves
+   * to the paddr previously recorded for it.
+   */
+  using update_internal_mapping_ertr = LBANode::mutate_internal_address_ertr;
+  using update_internal_mapping_ret = LBANode::mutate_internal_address_ret;
+  update_internal_mapping_ret update_internal_mapping(
+    Transaction &t,
+    depth_t depth,
+    laddr_t laddr,
+    paddr_t paddr);
+};
+using BtreeLBAManagerRef = std::unique_ptr<BtreeLBAManager>;
+
+}
diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.cc b/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.cc
new file mode 100644
index 000000000..a86c3cc57
--- /dev/null
+++ b/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.cc
@@ -0,0 +1,153 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/common/log.h"
+
+#include "crimson/os/seastore/lba_manager/btree/btree_range_pin.h"
+
+namespace {
+  /// File-local accessor for the logger used by the pin code in this file.
+  seastar::logger& logger() {
+    return crimson::get_logger(ceph_subsys_filestore);
+  }
+}
+
+namespace crimson::os::seastore::lba_manager::btree {
+
+/**
+ * Takes over other's slot in its pin set. If other was holding a reference
+ * on its extent, other releases it and *this acquires one on its own extent.
+ */
+void btree_range_pin_t::take_pin(btree_range_pin_t &other)
+{
+  assert(other.extent);
+  assert(other.pins);
+
+  // Swap ourselves into the index in place of other, then adopt its set.
+  auto *owner = other.pins;
+  owner->replace_pin(*this, other);
+  pins = owner;
+  other.pins = nullptr;
+
+  if (!other.has_ref()) {
+    return;
+  }
+  other.drop_ref();
+  acquire_ref();
+}
+
+/**
+ * Unlinks the pin from its set (if it is in one) and lets the set re-check
+ * the parent. The pin must not be holding a reference at this point.
+ */
+btree_range_pin_t::~btree_range_pin_t()
+{
+  // Being linked and having a set pointer must go together.
+  assert(!pins == !is_linked());
+  assert(!ref);
+  if (pins != nullptr) {
+    logger().debug("{}: removing {}", __func__, *this);
+    pins->remove_pin(*this, true);
+  }
+  extent = nullptr;
+}
+
+/// Puts `to` into the index at the position currently occupied by `from`.
+void btree_pin_set_t::replace_pin(btree_range_pin_t &to, btree_range_pin_t &from)
+{
+  auto position = pins.iterator_to(from);
+  pins.replace_node(position, to);
+}
+
+/**
+ * Erases pin from the index and clears its set pointer. When
+ * do_check_parent is set, the parent is re-evaluated via check_parent.
+ */
+void btree_pin_set_t::remove_pin(btree_range_pin_t &pin, bool do_check_parent)
+{
+  logger().debug("{}: {}", __func__, pin);
+  assert(pin.is_linked());
+  assert(pin.pins);
+  assert(!pin.ref);
+
+  pins.erase(pin);
+  pin.pins = nullptr;
+
+  if (!do_check_parent) {
+    return;
+  }
+  check_parent(pin);
+}
+
+/**
+ * Returns the pin one level above meta whose range is_parent_of(meta),
+ * or nullptr when the candidate found in the index is not such a pin.
+ */
+btree_range_pin_t *btree_pin_set_t::maybe_get_parent(
+  const lba_node_meta_t &meta)
+{
+  // Probe with the same range, one level up.
+  auto probe = meta;
+  probe.depth++;
+
+  auto candidate = pins.upper_bound(probe, btree_range_pin_t::meta_cmp_t());
+  if (candidate == pins.begin()) {
+    return nullptr;
+  }
+  // The entry just before the upper bound is the only possible parent.
+  --candidate;
+  return candidate->range.is_parent_of(meta) ? &*candidate : nullptr;
+}
+
+/**
+ * Returns the first pin one level below meta that meta is_parent_of,
+ * or nullptr when meta is at depth 0 or no such pin is found.
+ */
+const btree_range_pin_t *btree_pin_set_t::maybe_get_first_child(
+  const lba_node_meta_t &meta) const
+{
+  if (meta.depth == 0) {
+    return nullptr;
+  }
+
+  // Probe with the same range, one level down.
+  auto probe = meta;
+  probe.depth--;
+
+  auto candidate = pins.lower_bound(probe, btree_range_pin_t::meta_cmp_t());
+  if (candidate == pins.end()) {
+    return nullptr;
+  }
+  return meta.is_parent_of(candidate->range) ? &*candidate : nullptr;
+}
+
+/// Drops pin's extent reference when no child pin is left in the set.
+void btree_pin_set_t::release_if_no_children(btree_range_pin_t &pin)
+{
+  assert(pin.is_linked());
+  const bool childless = (maybe_get_first_child(pin.range) == nullptr);
+  if (childless) {
+    pin.drop_ref();
+  }
+}
+
+/**
+ * Inserts pin into the index. For a non-root pin the parent pin acquires
+ * a reference if it does not hold one yet; the new pin acquires one itself
+ * when a child of it is already present in the set.
+ */
+void btree_pin_set_t::add_pin(btree_range_pin_t &pin)
+{
+  assert(!pin.is_linked());
+  assert(!pin.pins);
+  assert(!pin.ref);
+
+  const auto outcome = pins.insert(pin);
+  if (!outcome.second) {
+    // A pin comparing equal is already present.
+    logger().error(
+      "{}: unable to add {}, found {}", __func__, pin, *outcome.first);
+    assert(0 == "impossible");
+    return;
+  }
+  pin.pins = this;
+
+  if (!pin.is_root()) {
+    auto *parent = maybe_get_parent(pin.range);
+    assert(parent);
+    if (parent->has_ref()) {
+      logger().debug("{}: parent has ref {}", __func__,
+                     static_cast<void*>(parent));
+    } else {
+      logger().debug("{}: acquiring parent {}", __func__,
+                     static_cast<void*>(parent));
+      parent->acquire_ref();
+    }
+  }
+
+  const bool has_child = (maybe_get_first_child(pin.range) != nullptr);
+  if (has_child) {
+    logger().debug("{}: acquiring self {}", __func__, pin);
+    pin.acquire_ref();
+  }
+}
+
+/**
+ * Drops pin's reference and removes it from the set without re-checking
+ * its parent.
+ */
+void btree_pin_set_t::retire(btree_range_pin_t &pin)
+{
+  pin.drop_ref();
+  constexpr bool recheck_parent = false;
+  remove_pin(pin, recheck_parent);
+}
+
+/// Finds pin's parent and lets it release its reference if it is childless.
+void btree_pin_set_t::check_parent(btree_range_pin_t &pin)
+{
+  auto *parent = maybe_get_parent(pin.range);
+  if (parent == nullptr) {
+    return;
+  }
+  logger().debug("{}: releasing parent {}", __func__, *parent);
+  release_if_no_children(*parent);
+}
+
+}
diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.h b/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.h
new file mode 100644
index 000000000..3fa218fc8
--- /dev/null
+++ b/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.h
@@ -0,0 +1,274 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/intrusive/set.hpp>
+
+#include "crimson/os/seastore/cached_extent.h"
+#include "crimson/os/seastore/seastore_types.h"
+
+namespace crimson::os::seastore::lba_manager::btree {
+
+class LBANode;
+using LBANodeRef = TCachedExtentRef<LBANode>;
+
+/**
+ * lba_node_meta_t
+ *
+ * Position of a node in the tree: the laddr range it covers plus its depth.
+ */
+struct lba_node_meta_t {
+  laddr_t begin = 0;
+  laddr_t end = 0;
+  depth_t depth = 0;
+
+  /// True when *this sits exactly one level above other and covers its range
+  bool is_parent_of(const lba_node_meta_t &other) const {
+    if (depth != other.depth + 1) {
+      return false;
+    }
+    return (begin <= other.begin) && (end >= other.end);
+  }
+
+  /// Splits the range at pivot: <[begin, pivot), [pivot, end)> at the same depth
+  std::pair<lba_node_meta_t, lba_node_meta_t> split_into(laddr_t pivot) const {
+    lba_node_meta_t lower{begin, pivot, depth};
+    lba_node_meta_t upper{pivot, end, depth};
+    return {lower, upper};
+  }
+
+  /// Range spanning from lhs.begin to rhs.end; both must share a depth
+  static lba_node_meta_t merge_from(
+    const lba_node_meta_t &lhs, const lba_node_meta_t &rhs) {
+    assert(lhs.depth == rhs.depth);
+    lba_node_meta_t merged{lhs.begin, rhs.end, lhs.depth};
+    return merged;
+  }
+
+  /// Re-divides the combined range of lhs and rhs at pivot
+  static std::pair<lba_node_meta_t, lba_node_meta_t>
+  rebalance(const lba_node_meta_t &lhs, const lba_node_meta_t &rhs, laddr_t pivot) {
+    assert(lhs.depth == rhs.depth);
+    lba_node_meta_t lower{lhs.begin, pivot, lhs.depth};
+    lba_node_meta_t upper{pivot, rhs.end, lhs.depth};
+    return {lower, upper};
+  }
+
+  /// A root covers the whole address space, from 0 to L_ADDR_MAX
+  bool is_root() const {
+    if (begin != 0) {
+      return false;
+    }
+    return end == L_ADDR_MAX;
+  }
+};
+
+/// Streams a lba_node_meta_t as "btree_node_meta_t(begin=..., end=..., depth=...)"
+inline std::ostream &operator<<(
+  std::ostream &lhs,
+  const lba_node_meta_t &rhs)
+{
+  lhs << "btree_node_meta_t(";
+  lhs << "begin=" << rhs.begin;
+  lhs << ", end=" << rhs.end;
+  lhs << ", depth=" << rhs.depth;
+  lhs << ")";
+  return lhs;
+}
+
+/**
+ * btree_range_pin_t
+ *
+ * Element tracked by btree_pin_set_t below.  Encapsulates the intrusive_set
+ * hook, the lba_node_meta_t representing the lba range covered by a node,
+ * and extent and ref members intended to hold a reference when the extent
+ * should be pinned.
+ */
+class btree_pin_set_t;
+class btree_range_pin_t : public boost::intrusive::set_base_hook<> {
+  friend class btree_pin_set_t;
+  lba_node_meta_t range;
+
+  /// Set this pin is currently linked into, nullptr when unlinked
+  btree_pin_set_t *pins = nullptr;
+
+  // We need to be able to remember extent without holding a reference,
+  // but we can do it more compactly -- TODO
+  CachedExtent *extent = nullptr;
+  CachedExtentRef ref;
+
+  using index_t = boost::intrusive::set<btree_range_pin_t>;
+
+  /// Sort key: negated depth first, then range start.
+  /// NOTE(review): the negation relies on depth_t being a signed type --
+  /// confirm against seastore_types.h.
+  static auto get_tuple(const lba_node_meta_t &meta) {
+    return std::make_tuple(-meta.depth, meta.begin);
+  }
+
+  /// Starts holding a counted reference on extent
+  void acquire_ref() {
+    ref = CachedExtentRef(extent);
+  }
+
+  /// Releases the counted reference, if any
+  void drop_ref() {
+    ref.reset();
+  }
+
+public:
+  btree_range_pin_t() = default;
+  btree_range_pin_t(CachedExtent *extent)
+    : extent(extent) {}
+  /// Copies only the range of rhs; the new pin starts unlinked and unreferenced
+  btree_range_pin_t(const btree_range_pin_t &rhs, CachedExtent *extent)
+    : range(rhs.range), extent(extent) {}
+
+  bool has_ref() const {
+    return !!ref;
+  }
+
+  bool is_root() const {
+    return range.is_root();
+  }
+
+  void set_range(const lba_node_meta_t &nrange) {
+    range = nrange;
+  }
+  /// The extent may only be set once
+  void set_extent(CachedExtent *nextent) {
+    assert(!extent);
+    extent = nextent;
+  }
+
+  void take_pin(btree_range_pin_t &other);
+
+  friend bool operator<(
+    const btree_range_pin_t &lhs, const btree_range_pin_t &rhs) {
+    return get_tuple(lhs.range) < get_tuple(rhs.range);
+  }
+  friend bool operator>(
+    const btree_range_pin_t &lhs, const btree_range_pin_t &rhs) {
+    return get_tuple(lhs.range) > get_tuple(rhs.range);
+  }
+  friend bool operator==(
+    const btree_range_pin_t &lhs, const btree_range_pin_t &rhs) {
+    // get_tuple is static; call it the same way as operator< and operator>.
+    return get_tuple(lhs.range) == get_tuple(rhs.range);
+  }
+
+  /// Heterogeneous comparator for looking pins up by lba_node_meta_t
+  struct meta_cmp_t {
+    bool operator()(
+      const btree_range_pin_t &lhs, const lba_node_meta_t &rhs) const {
+      return get_tuple(lhs.range) < get_tuple(rhs);
+    }
+    bool operator()(
+      const lba_node_meta_t &lhs, const btree_range_pin_t &rhs) const {
+      return get_tuple(lhs) < get_tuple(rhs.range);
+    }
+  };
+
+  friend std::ostream &operator<<(
+    std::ostream &lhs,
+    const btree_range_pin_t &rhs) {
+    return lhs << "btree_range_pin_t("
+	       << "begin=" << rhs.range.begin
+	       << ", end=" << rhs.range.end
+	       << ", depth=" << rhs.range.depth
+	       << ", extent=" << rhs.extent
+	       << ")";
+  }
+
+  friend class BtreeLBAPin;
+  ~btree_range_pin_t();
+};
+
+/**
+ * btree_pin_set_t
+ *
+ * Ensures that for every cached node, all parent LBANodes required
+ * to map it are present in cache.  Relocating these nodes can
+ * therefore be done without further reads or cache space.
+ *
+ * Contains a btree_range_pin_t for every clean or dirty LBANode
+ * or LogicalCachedExtent instance in cache at any point in time.
+ * For any LBANode, the contained btree_range_pin_t will hold
+ * a reference to that node pinning it in cache as long as that
+ * node has children in the set.  This invariant can be violated
+ * only by calling retire (see below) and is repaired by calling
+ * check_parent synchronously after adding any new extents.
+ */
+class btree_pin_set_t {
+  friend class btree_range_pin_t;
+  using pins_t = btree_range_pin_t::index_t;
+  pins_t pins;
+
+  /// Returns the index iterator for a pin that is already linked
+  pins_t::iterator get_iter(btree_range_pin_t &pin) {
+    return pins_t::s_iterator_to(pin);
+  }
+
+  /// Removes pin from set optionally checking whether parent has other children
+  void remove_pin(btree_range_pin_t &pin, bool check_parent);
+
+  /// Puts to into the set at the position held by from
+  void replace_pin(btree_range_pin_t &to, btree_range_pin_t &from);
+
+  /// Returns parent pin if exists
+  btree_range_pin_t *maybe_get_parent(const lba_node_meta_t &pin);
+
+  /// Returns earliest child pin if exist
+  const btree_range_pin_t *maybe_get_first_child(const lba_node_meta_t &pin) const;
+
+  /// Releases pin if it has no children
+  void release_if_no_children(btree_range_pin_t &pin);
+
+public:
+  /// Adds pin to set, assumes set is consistent
+  void add_pin(btree_range_pin_t &pin);
+
+  /**
+   * retire/check_parent
+   *
+   * See BtreeLBAManager::complete_transaction.
+   * retire removes the specified pin from the set, but does not
+   * check parents.  After any new extents are added to the set,
+   * the caller is required to call check_parent to restore the
+   * invariant.
+   */
+  void retire(btree_range_pin_t &pin);
+  void check_parent(btree_range_pin_t &pin);
+
+  /// Every pin must have been removed before the set is destroyed
+  ~btree_pin_set_t() {
+    assert(pins.empty());
+  }
+};
+
+/**
+ * BtreeLBAPin
+ *
+ * LBAPin implementation carrying the paddr of a mapping and the
+ * btree_range_pin_t describing its laddr range.
+ */
+class BtreeLBAPin : public LBAPin {
+  friend class BtreeLBAManager;
+
+  /**
+   * parent
+   *
+   * Reference handed in at construction. It is held until
+   * BtreeLBAManager::add_pin clears it, i.e. until the pin has been
+   * registered with the pin set.
+   */
+  CachedExtentRef parent;
+
+  paddr_t paddr;
+  btree_range_pin_t pin;
+
+public:
+  BtreeLBAPin() = default;
+
+  BtreeLBAPin(
+    CachedExtentRef parent,
+    paddr_t paddr,
+    lba_node_meta_t &&meta)
+    // parent is a by-value ref-counted handle: move it instead of copying
+    : parent(std::move(parent)), paddr(paddr) {
+    pin.set_range(std::move(meta));
+  }
+
+  /// Records the extent this pin belongs to
+  void link_extent(LogicalCachedExtent *ref) final {
+    pin.set_extent(ref);
+  }
+
+  extent_len_t get_length() const final {
+    assert(pin.range.end > pin.range.begin);
+    return pin.range.end - pin.range.begin;
+  }
+
+  paddr_t get_paddr() const final {
+    return paddr;
+  }
+
+  laddr_t get_laddr() const final {
+    return pin.range.begin;
+  }
+
+  /// Returns a fresh pin with the same range and paddr; parent and extent
+  /// are not carried over
+  LBAPinRef duplicate() const final {
+    auto ret = std::make_unique<BtreeLBAPin>();
+    ret->pin.set_range(pin.range);
+    ret->paddr = paddr;
+    return ret;
+  }
+
+  /// Takes over opin's place in the pin set; opin must be a BtreeLBAPin
+  void take_pin(LBAPin &opin) final {
+    pin.take_pin(static_cast<BtreeLBAPin&>(opin).pin);
+  }
+};
+
+}
diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h
new file mode 100644
index 000000000..b6f33a1ae
--- /dev/null
+++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h
@@ -0,0 +1,269 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <sys/mman.h>
+#include <memory>
+#include <string.h>
+
+#include "crimson/common/log.h"
+#include "crimson/os/seastore/lba_manager/btree/btree_range_pin.h"
+#include "crimson/os/seastore/lba_manager.h"
+
+namespace crimson::os::seastore::lba_manager::btree {
+
+/// Bundle of references handed by value through the LBANode calls: the
+/// cache, the pin set and the transaction the operation runs in.
+struct op_context_t {
+  Cache &cache;
+  btree_pin_set_t &pins;
+  Transaction &trans;
+};
+
+/**
+ * lba_map_val_t
+ *
+ * struct representing a single lba mapping
+ *
+ * Note: because a constructor is declared, the struct has no implicit
+ * default constructor; all four fields must be supplied.
+ */
+struct lba_map_val_t {
+  extent_len_t len = 0;  ///< length of mapping
+  paddr_t paddr;         ///< physical addr of mapping
+  uint32_t refcount = 0; ///< refcount
+  uint32_t checksum = 0; ///< checksum of original block written at paddr (TODO)
+
+  lba_map_val_t(
+    extent_len_t len,
+    paddr_t paddr,
+    uint32_t refcount,
+    uint32_t checksum)
+    : len(len), paddr(paddr), refcount(refcount), checksum(checksum) {}
+};
+
+class BtreeLBAPin;
+using BtreeLBAPinRef = std::unique_ptr<BtreeLBAPin>;
+
+/**
+ * LBANode
+ *
+ * Base class enabling recursive lookup between internal and leaf nodes.
+ */
+struct LBANode : CachedExtent {
+  using LBANodeRef = TCachedExtentRef<LBANode>;
+  using lookup_range_ertr = LBAManager::get_mapping_ertr;
+  using lookup_range_ret = LBAManager::get_mapping_ret;
+
+  /// Range pin for this node, bound to this extent at construction
+  btree_range_pin_t pin;
+
+  LBANode(ceph::bufferptr &&ptr) : CachedExtent(std::move(ptr)), pin(this) {}
+  /// The copy gets its own pin: same range as rhs, bound to the new extent
+  LBANode(const LBANode &rhs)
+    : CachedExtent(rhs), pin(rhs.pin, this) {}
+
+  virtual lba_node_meta_t get_node_meta() const = 0;
+
+  /**
+   * lookup
+   *
+   * Returns the node at the specified depth responsible
+   * for laddr
+   */
+  using lookup_ertr = crimson::errorator<
+    crimson::ct_error::input_output_error>;
+  using lookup_ret = lookup_ertr::future<LBANodeRef>;
+  virtual lookup_ret lookup(
+    op_context_t c,
+    laddr_t addr,
+    depth_t depth) = 0;
+
+  /**
+   * lookup_range
+   *
+   * Returns mappings within range [addr, addr+len)
+   */
+  virtual lookup_range_ret lookup_range(
+    op_context_t c,
+    laddr_t addr,
+    extent_len_t len) = 0;
+
+  /**
+   * insert
+   *
+   * Recursively inserts into subtree rooted at *this.  Caller
+   * must already have handled splitting if at_max_capacity().
+   *
+   * Precondition: !at_max_capacity()
+   */
+  using insert_ertr = crimson::errorator<
+    crimson::ct_error::input_output_error
+    >;
+  using insert_ret = insert_ertr::future<LBAPinRef>;
+  virtual insert_ret insert(
+    op_context_t c,
+    laddr_t laddr,
+    lba_map_val_t val) = 0;
+
+  /**
+   * find_hole
+   *
+   * Finds minimum hole of size len in [min, max)
+   *
+   * @return addr of hole, L_ADDR_NULL if unfound
+   */
+  using find_hole_ertr = crimson::errorator<
+    crimson::ct_error::input_output_error>;
+  using find_hole_ret = find_hole_ertr::future<laddr_t>;
+  virtual find_hole_ret find_hole(
+    op_context_t c,
+    laddr_t min,
+    laddr_t max,
+    extent_len_t len) = 0;
+
+  /**
+   * scan_mappings
+   *
+   * Call f for all mappings in [begin, end)
+   */
+  using scan_mappings_ertr = LBAManager::scan_mappings_ertr;
+  using scan_mappings_ret = LBAManager::scan_mappings_ret;
+  using scan_mappings_func_t = LBAManager::scan_mappings_func_t;
+  virtual scan_mappings_ret scan_mappings(
+    op_context_t c,
+    laddr_t begin,
+    laddr_t end,
+    scan_mappings_func_t &f) = 0;
+
+  /**
+   * scan_mapped_space
+   *
+   * Walks the subtree rooted at *this, passing f along to the nodes.
+   */
+  using scan_mapped_space_ertr = LBAManager::scan_mapped_space_ertr;
+  using scan_mapped_space_ret = LBAManager::scan_mapped_space_ret;
+  using scan_mapped_space_func_t = LBAManager::scan_mapped_space_func_t;
+  virtual scan_mapped_space_ret scan_mapped_space(
+    op_context_t c,
+    scan_mapped_space_func_t &f) = 0;
+
+  /**
+   * mutate_mapping
+   *
+   * Looks up laddr, calls f on value and stores what f returns.
+   * NOTE(review): this comment used to say the value is removed when f
+   * returns nullopt, but mutate_func_t returns a plain lba_map_val_t;
+   * check the leaf implementation for the actual removal rule.
+   * Caller must already have merged if at_min_capacity().
+   *
+   * Recursive calls use mutate_mapping_internal.
+   *
+   * Precondition: !at_min_capacity()
+   */
+  using mutate_mapping_ertr = crimson::errorator<
+    crimson::ct_error::enoent,            ///< mapping does not exist
+    crimson::ct_error::input_output_error
+    >;
+  using mutate_mapping_ret = mutate_mapping_ertr::future<
+    lba_map_val_t>;
+  using mutate_func_t = std::function<
+    lba_map_val_t(const lba_map_val_t &v)
+    >;
+  virtual mutate_mapping_ret mutate_mapping(
+    op_context_t c,
+    laddr_t laddr,
+    mutate_func_t &&f) = 0;
+  virtual mutate_mapping_ret mutate_mapping_internal(
+    op_context_t c,
+    laddr_t laddr,
+    bool is_root,
+    mutate_func_t &&f) = 0;
+
+  /**
+   * mutate_internal_address
+   *
+   * Looks up internal node mapping at laddr, depth and
+   * updates the mapping to paddr.  Returns previous paddr
+   * (for debugging purposes).
+   */
+  using mutate_internal_address_ertr = crimson::errorator<
+    crimson::ct_error::enoent,            ///< mapping does not exist
+    crimson::ct_error::input_output_error
+    >;
+  using mutate_internal_address_ret = mutate_internal_address_ertr::future<
+    paddr_t>;
+  virtual mutate_internal_address_ret mutate_internal_address(
+    op_context_t c,
+    depth_t depth,
+    laddr_t laddr,
+    paddr_t paddr) = 0;
+
+  /**
+   * make_split_children
+   *
+   * Generates appropriately typed left and right nodes formed from the
+   * contents of *this.
+   *
+   * Returns <left, right, pivot> where pivot is the first value of right.
+   */
+  virtual std::tuple<
+    LBANodeRef,
+    LBANodeRef,
+    laddr_t>
+  make_split_children(
+    op_context_t c) = 0;
+
+  /**
+   * make_full_merge
+   *
+   * Returns a single node formed from merging *this and right.
+   * Precondition: at_min_capacity() && right.at_min_capacity()
+   */
+  virtual LBANodeRef make_full_merge(
+    op_context_t c,
+    LBANodeRef &right) = 0;
+
+  /**
+   * make_balanced
+   *
+   * Returns nodes formed by balancing the contents of *this and right.
+   *
+   * Returns <left, right, pivot> where pivot is the first value of right.
+   */
+  virtual std::tuple<
+    LBANodeRef,
+    LBANodeRef,
+    laddr_t>
+  make_balanced(
+    op_context_t c,
+    LBANodeRef &right,
+    bool prefer_left) = 0;
+
+  virtual bool at_max_capacity() const = 0;
+  virtual bool at_min_capacity() const = 0;
+
+  virtual ~LBANode() = default;
+
+  /// Takes over the prior instance's pin, then resolves relative addrs
+  /// against the record block offset
+  void on_delta_write(paddr_t record_block_offset) final {
+    // All in-memory relative addrs are necessarily record-relative
+    assert(get_prior_instance());
+    pin.take_pin(get_prior_instance()->cast<LBANode>()->pin);
+    resolve_relative_addrs(record_block_offset);
+  }
+
+  void on_initial_write() final {
+    // All in-memory relative addrs are necessarily block-relative
+    resolve_relative_addrs(get_paddr());
+  }
+
+  void on_clean_read() final {
+    // From initial write of block, relative addrs are necessarily block-relative
+    resolve_relative_addrs(get_paddr());
+  }
+
+  /// Rewrites relative addresses held by the node against base
+  virtual void resolve_relative_addrs(paddr_t base) = 0;
+};
+using LBANodeRef = LBANode::LBANodeRef;
+
+/**
+ * get_lba_btree_extent
+ *
+ * Fetches node at depth of the appropriate type.
+ */
+Cache::get_extent_ertr::future<LBANodeRef> get_lba_btree_extent(
+ op_context_t c, ///< [in] context structure
+ depth_t depth, ///< [in] depth of node to fetch
+ paddr_t offset, ///< [in] physical addr of node
+ paddr_t base ///< [in] depending on user, block addr or record addr
+ /// in case offset is relative
+);
+
+}
diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.cc b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.cc
new file mode 100644
index 000000000..5e400803b
--- /dev/null
+++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.cc
@@ -0,0 +1,701 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <sys/mman.h>
+#include <string.h>
+
+#include <memory>
+#include <string.h>
+
+#include <seastar/core/do_with.hh>
+
+#include "include/buffer.h"
+#include "include/byteorder.h"
+
+#include "crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h"
+
+namespace {
+  /// File-local accessor for the logger used by the node code in this file.
+  seastar::logger& logger() {
+    return crimson::get_logger(ceph_subsys_filestore);
+  }
+}
+
+namespace crimson::os::seastore::lba_manager::btree {
+
+/// Appends this node's entry count and meta to out.
+std::ostream &LBAInternalNode::print_detail(std::ostream &out) const
+{
+  out << ", size=" << get_size();
+  out << ", meta=" << get_meta();
+  return out;
+}
+
+/**
+ * Returns the node at the requested depth that is responsible for addr.
+ *
+ * Resolves to *this when this node is already at that depth; otherwise
+ * fetches the child covering addr and recurses into it. addr must lie
+ * within this node's range.
+ */
+LBAInternalNode::lookup_ret LBAInternalNode::lookup(
+  op_context_t c,
+  laddr_t addr,
+  depth_t depth)
+{
+  auto meta = get_meta();
+  if (depth == meta.depth) {
+    return lookup_ret(
+      lookup_ertr::ready_future_marker{},
+      this);
+  }
+  assert(meta.begin <= addr);
+  assert(meta.end > addr);
+  // Pick the child with get_containing_child, like insert/mutate_* do.
+  // The previous lower_bound(addr) only selects the covering child when
+  // addr is exactly a child's first key.
+  auto iter = get_containing_child(addr);
+  return get_lba_btree_extent(
+    c,
+    meta.depth - 1,
+    iter->get_val(),
+    get_paddr()).safe_then([c, addr, depth](auto child) {
+      return child->lookup(c, addr, depth);
+    }).finally([ref=LBANodeRef(this)] {
+      // Keeps *this alive until the recursive lookup has finished.
+    });
+}
+
+/**
+ * Collects the pins for [addr, addr+len) by visiting each child entry
+ * selected by bound() and concatenating the children's results in order.
+ */
+LBAInternalNode::lookup_range_ret LBAInternalNode::lookup_range(
+  op_context_t c,
+  laddr_t addr,
+  extent_len_t len)
+{
+  auto [first, last] = bound(addr, addr + len);
+  // The accumulator lives on the heap so the reference captured by the
+  // per-child lambdas stays valid after ownership moves into the final
+  // continuation.
+  auto acc_owner = std::make_unique<lba_pin_list_t>();
+  auto &acc = *acc_owner;
+  return crimson::do_for_each(
+    std::move(first),
+    std::move(last),
+    [this, c, &acc, addr, len](const auto &entry) mutable {
+      return get_lba_btree_extent(
+        c,
+        get_meta().depth - 1,
+        entry.get_val(),
+        get_paddr()
+      ).safe_then([c, &acc, addr, len](auto child) mutable {
+        return child->lookup_range(c, addr, len
+        ).safe_then([&acc](auto child_pins) mutable {
+          acc.splice(
+            acc.end(), child_pins,
+            child_pins.begin(), child_pins.end());
+        });
+      });
+    }).safe_then([acc_owner=std::move(acc_owner), self=LBANodeRef(this)] {
+      return lookup_range_ertr::make_ready_future<lba_pin_list_t>(
+        std::move(*acc_owner));
+    });
+}
+
+/**
+ * Inserts laddr -> val into the subtree below this node.
+ *
+ * Fetches the child covering laddr, splits it first if it is at max
+ * capacity (split_entry yields the half that should receive laddr), then
+ * recurses into the chosen child.
+ */
+LBAInternalNode::insert_ret LBAInternalNode::insert(
+  op_context_t c,
+  laddr_t laddr,
+  lba_map_val_t val)
+{
+  auto insertion_pt = get_containing_child(laddr);
+  return get_lba_btree_extent(
+    c,
+    get_meta().depth - 1,
+    insertion_pt->get_val(),
+    get_paddr()).safe_then(
+      // val is deliberately not captured here: this step never uses it, and
+      // move-capturing it would leave the next continuation capturing a
+      // moved-from value.
+      [this, insertion_pt, c, laddr](auto extent) mutable {
+	return extent->at_max_capacity() ?
+	  split_entry(c, laddr, insertion_pt, extent) :
+	  insert_ertr::make_ready_future<LBANodeRef>(std::move(extent));
+      }).safe_then([c, laddr, val=std::move(val)](
+		     LBANodeRef extent) mutable {
+	return extent->insert(c, laddr, val);
+      });
+}
+
+/// Entry point for mutations: forwards to mutate_mapping_internal with
+/// is_root set.
+LBAInternalNode::mutate_mapping_ret LBAInternalNode::mutate_mapping(
+  op_context_t c,
+  laddr_t laddr,
+  mutate_func_t &&f)
+{
+  constexpr bool is_root = true;
+  return mutate_mapping_internal(c, laddr, is_root, std::move(f));
+}
+
+/**
+ * Applies f to the mapping at laddr somewhere below this node.
+ *
+ * Fetches the child covering laddr. If that child is at min capacity and
+ * this node has more than one entry, the child is merged/rebalanced via
+ * merge_entry first. The mutation then recurses into the resulting child.
+ */
+LBAInternalNode::mutate_mapping_ret LBAInternalNode::mutate_mapping_internal(
+  op_context_t c,
+  laddr_t laddr,
+  bool is_root,
+  mutate_func_t &&f)
+{
+  auto mutation_pt = get_containing_child(laddr);
+  if (mutation_pt == end()) {
+    assert(0 == "impossible");
+    return crimson::ct_error::enoent::make();
+  }
+  return get_lba_btree_extent(
+    c,
+    get_meta().depth - 1,
+    mutation_pt->get_val(),
+    get_paddr()
+  ).safe_then([this, c, laddr, mutation_pt, is_root](LBANodeRef extent) {
+    const bool needs_merge = extent->at_min_capacity() && get_size() > 1;
+    if (!needs_merge) {
+      return merge_ertr::make_ready_future<LBANodeRef>(
+        std::move(extent));
+    }
+    return merge_entry(
+      c,
+      laddr,
+      mutation_pt,
+      extent,
+      is_root);
+  }).safe_then([c, laddr, func=std::move(f)](LBANodeRef extent) mutable {
+    // Below the entry point every node is treated as non-root.
+    return extent->mutate_mapping_internal(c, laddr, false, std::move(func));
+  });
+}
+
+/**
+ * Updates the entry for the node at (depth, laddr) to point at paddr.
+ *
+ * If this node is not the direct parent level (depth + 1), the call is
+ * forwarded to the child covering laddr. Otherwise the entry is rewritten
+ * on a pending (writable) instance of this node and the previous value is
+ * returned; enoent is returned when no entry is keyed exactly at laddr.
+ */
+LBAInternalNode::mutate_internal_address_ret LBAInternalNode::mutate_internal_address(
+  op_context_t c,
+  depth_t depth,
+  laddr_t laddr,
+  paddr_t paddr)
+{
+  if (get_meta().depth != (depth + 1)) {
+    // Not at the parent level yet: keep descending.
+    auto child_iter = get_containing_child(laddr);
+    return get_lba_btree_extent(
+      c,
+      get_meta().depth - 1,
+      child_iter->get_val(),
+      get_paddr()
+    ).safe_then([c, depth, laddr, paddr](auto node) {
+      return node->mutate_internal_address(
+        c,
+        depth,
+        laddr,
+        paddr);
+    });
+  }
+
+  if (!is_pending()) {
+    // Redo the call on a writable duplicate of this node.
+    return c.cache.duplicate_for_write(c.trans, this)->cast<LBAInternalNode>(
+    )->mutate_internal_address(
+      c,
+      depth,
+      laddr,
+      paddr);
+  }
+
+  auto entry = get_containing_child(laddr);
+  if (entry->get_key() != laddr) {
+    return crimson::ct_error::enoent::make();
+  }
+
+  auto previous = entry->get_val();
+
+  journal_update(
+    entry,
+    maybe_generate_relative(paddr),
+    maybe_get_delta_buffer());
+
+  return mutate_internal_address_ret(
+    mutate_internal_address_ertr::ready_future_marker{},
+    previous
+  );
+}
+
+/**
+ * Searches the children selected by bound(min_addr, max_addr), in order,
+ * for a hole of size len and resolves to the first one found, or to
+ * L_ADDR_NULL when every child reports none.
+ */
+LBAInternalNode::find_hole_ret LBAInternalNode::find_hole(
+  op_context_t c,
+  laddr_t min_addr,
+  laddr_t max_addr,
+  extent_len_t len)
+{
+  logger().debug(
+    "LBAInternalNode::find_hole min={}, max={}, len={}, *this={}",
+    min_addr, max_addr, len, *this);
+  auto [begin, end] = bound(min_addr, max_addr);
+  // The cursor is advanced from inside a continuation, so it needs a stable
+  // address for the whole loop. It used to be a by-value member of the
+  // closure given to repeat_until_value and was captured by reference; that
+  // reference dangles once the closure is moved. do_with keeps the cursor
+  // alive and in place until the returned future resolves.
+  return seastar::do_with(
+    std::move(begin),
+    [this, e=end, c, min_addr, len](auto &i) {
+      return seastar::repeat_until_value(
+	[&i, e, c, min_addr, len, this]() mutable {
+	  if (i == e) {
+	    return seastar::make_ready_future<std::optional<laddr_t>>(
+	      std::make_optional<laddr_t>(L_ADDR_NULL));
+	  }
+	  return get_lba_btree_extent(c,
+	    get_meta().depth - 1,
+	    i->get_val(),
+	    get_paddr()).safe_then(
+	      [c, min_addr, len, i](auto extent) mutable {
+		// Restrict the search to this child's share of the range.
+		auto lb = std::max(min_addr, i->get_key());
+		auto ub = i->get_next_key_or_max();
+		logger().debug("LBAInternalNode::find_hole extent {} lb {} ub {}",
+			       *extent, lb, ub);
+		return extent->find_hole(c, lb, ub, len);
+	      }).safe_then([&i](auto addr) mutable -> std::optional<laddr_t> {
+		if (addr == L_ADDR_NULL) {
+		  // Nothing in this child: move on to the next one.
+		  ++i;
+		  return {};
+		} else {
+		  return addr;
+		}
+	      },
+	      // TODO: GCC enters a dead loop if crimson::do_until() is used
+	      //       or erroratorized future is returned
+	      crimson::ct_error::assert_all{ "fix me - APIv6" });
+	});
+    });
+}
+
+/**
+ * Forwards scan_mappings(begin, end, f) to every child selected by
+ * bound(begin, end), one after the other.
+ */
+LBAInternalNode::scan_mappings_ret LBAInternalNode::scan_mappings(
+  op_context_t c,
+  laddr_t begin,
+  laddr_t end,
+  scan_mappings_func_t &f)
+{
+  auto [first, last] = bound(begin, end);
+  return crimson::do_for_each(
+    std::move(first),
+    std::move(last),
+    [this, c, begin, end, &f](auto &viter) {
+      return get_lba_btree_extent(
+        c,
+        get_meta().depth - 1,
+        viter->get_val(),
+        get_paddr()
+      ).safe_then([c, begin, end, &f](auto child) {
+        return child->scan_mappings(c, begin, end, f);
+      });
+    }).safe_then([self=LBANodeRef(this)] {
+      // Holds a reference on *this until the scan has completed.
+    });
+}
+
+/**
+ * Reports this node's own extent (paddr, length) to f, then forwards the
+ * scan to every child in order.
+ */
+LBAInternalNode::scan_mapped_space_ret LBAInternalNode::scan_mapped_space(
+  op_context_t c,
+  scan_mapped_space_func_t &f)
+{
+  f(get_paddr(), get_length());
+  return crimson::do_for_each(
+    begin(), end(),
+    [this, c, &f](auto &viter) {
+      return get_lba_btree_extent(
+        c,
+        get_meta().depth - 1,
+        viter->get_val(),
+        get_paddr()
+      ).safe_then([c, &f](auto child) {
+        return child->scan_mapped_space(c, f);
+      });
+    }).safe_then([self=LBANodeRef(this)] {
+      // Holds a reference on *this until the scan has completed.
+    });
+}
+
+
+/// Replaces every relative child address with base.add_relative(address).
+void LBAInternalNode::resolve_relative_addrs(paddr_t base)
+{
+  for (auto i: *this) {
+    if (!i->get_val().is_relative()) {
+      continue;
+    }
+    auto updated = base.add_relative(i->get_val());
+    logger().debug(
+      "LBAInternalNode::resolve_relative_addrs {} -> {}",
+      i->get_val(),
+      updated);
+    i->set_val(updated);
+  }
+}
+
+
+/**
+ * Splits the child `entry` referenced by iter into two nodes and records
+ * both in this node, operating on a pending instance of *this. The old
+ * child is retired. Resolves to whichever half should receive addr: left
+ * when addr is below the pivot, right otherwise.
+ */
+LBAInternalNode::split_ret
+LBAInternalNode::split_entry(
+  op_context_t c,
+  laddr_t addr,
+  internal_iterator_t iter, LBANodeRef entry)
+{
+  if (!is_pending()) {
+    // Redo the call on a writable duplicate, with iter translated to it.
+    auto writable = c.cache.duplicate_for_write(
+      c.trans, this)->cast<LBAInternalNode>();
+    auto writable_iter = writable->iter_idx(iter->get_offset());
+    return writable->split_entry(c, addr, writable_iter, entry);
+  }
+
+  ceph_assert(!at_max_capacity());
+  auto [left, right, pivot] = entry->make_split_children(c);
+
+  // The existing slot now refers to left; right goes in just after it.
+  journal_update(
+    iter,
+    maybe_generate_relative(left->get_paddr()),
+    maybe_get_delta_buffer());
+  journal_insert(
+    iter + 1,
+    pivot,
+    maybe_generate_relative(right->get_paddr()),
+    maybe_get_delta_buffer());
+
+  c.cache.retire_extent(c.trans, entry);
+
+  logger().debug(
+    "LBAInternalNode::split_entry *this {} entry {} into left {} right {}",
+    *this,
+    *entry,
+    *left,
+    *right);
+
+  if (pivot > addr) {
+    return split_ertr::make_ready_future<LBANodeRef>(left);
+  }
+  return split_ertr::make_ready_future<LBANodeRef>(right);
+}
+
+// Merges or rebalances the child `entry` (referenced by `iter`) with an
+// adjacent sibling ("donor"). If the donor is at minimum capacity the two are
+// fully merged into one replacement node; otherwise their contents are
+// rebalanced into two replacement nodes. Resolves to the replacement node
+// selected for `addr`. When is_root is set and a full merge leaves this node
+// with a single entry, the root block is updated to point at that entry and
+// this node is retired.
+LBAInternalNode::merge_ret
+LBAInternalNode::merge_entry(
+ op_context_t c,
+ laddr_t addr,
+ internal_iterator_t iter,
+ LBANodeRef entry,
+ bool is_root)
+{
+ // Not pending yet: redo the operation on the writable duplicate.
+ if (!is_pending()) {
+ auto mut = c.cache.duplicate_for_write(c.trans, this)->cast<LBAInternalNode>();
+ auto mut_iter = mut->iter_idx(iter->get_offset());
+ return mut->merge_entry(c, addr, mut_iter, entry, is_root);
+ }
+
+ logger().debug(
+ "LBAInternalNode: merge_entry: {}, {}",
+ *this,
+ *entry);
+ // Use the right-hand sibling as donor unless entry is the last element, in
+ // which case the left-hand sibling is used.
+ // NOTE(review): iter - 1 assumes this node has more than one entry when
+ // entry is last -- not checked here; confirm callers guarantee it.
+ auto donor_is_left = (iter + 1) == end();
+ auto donor_iter = donor_is_left ? iter - 1 : iter + 1;
+ return get_lba_btree_extent(
+ c,
+ get_meta().depth - 1,
+ donor_iter->get_val(),
+ get_paddr()
+ ).safe_then([=](auto donor) mutable {
+ // Order the pair (and the matching iterators) left-to-right.
+ auto [l, r] = donor_is_left ?
+ std::make_pair(donor, entry) : std::make_pair(entry, donor);
+ auto [liter, riter] = donor_is_left ?
+ std::make_pair(donor_iter, iter) : std::make_pair(iter, donor_iter);
+ if (donor->at_min_capacity()) {
+ // Full merge: l and r collapse into a single replacement node.
+ auto replacement = l->make_full_merge(
+ c,
+ r);
+
+ // The left slot now references the replacement; the right slot is dropped.
+ journal_update(
+ liter,
+ maybe_generate_relative(replacement->get_paddr()),
+ maybe_get_delta_buffer());
+ journal_remove(riter, maybe_get_delta_buffer());
+
+ c.cache.retire_extent(c.trans, l);
+ c.cache.retire_extent(c.trans, r);
+
+ if (is_root && get_size() == 1) {
+ // Root with a single remaining child: point the root block directly at
+ // that child, reduce the recorded depth by one and retire this node.
+ return c.cache.get_root(c.trans).safe_then([=](RootBlockRef croot) {
+ {
+ auto mut_croot = c.cache.duplicate_for_write(c.trans, croot);
+ croot = mut_croot->cast<RootBlock>();
+ }
+ croot->root.lba_root_addr = begin()->get_val();
+ logger().debug(
+ "LBAInternalNode::merge_entry: collapsing root {} to addr {}",
+ *this,
+ begin()->get_val());
+ croot->root.lba_depth = get_meta().depth - 1;
+ c.cache.retire_extent(c.trans, this);
+ return merge_ertr::make_ready_future<LBANodeRef>(replacement);
+ });
+ } else {
+ return merge_ertr::make_ready_future<LBANodeRef>(replacement);
+ }
+ } else {
+ // Donor has spare entries: rebalance l and r into two new nodes.
+ logger().debug(
+ "LBAInternalEntry::merge_entry balanced l {} r {}",
+ *l,
+ *r);
+ auto [replacement_l, replacement_r, pivot] =
+ l->make_balanced(
+ c,
+ r,
+ !donor_is_left);
+
+ // Left slot keeps its key and gets the new address; the right slot is
+ // replaced with the new pivot key and address.
+ journal_update(
+ liter,
+ maybe_generate_relative(replacement_l->get_paddr()),
+ maybe_get_delta_buffer());
+ journal_replace(
+ riter,
+ pivot,
+ maybe_generate_relative(replacement_r->get_paddr()),
+ maybe_get_delta_buffer());
+
+ c.cache.retire_extent(c.trans, l);
+ c.cache.retire_extent(c.trans, r);
+ // addr at or above the pivot belongs to the right replacement.
+ return merge_ertr::make_ready_future<LBANodeRef>(
+ addr >= pivot ? replacement_r : replacement_l
+ );
+ }
+ });
+}
+
+
+// Linear scan for the entry whose range contains laddr. Asserts if no entry
+// matches.
+LBAInternalNode::internal_iterator_t
+LBAInternalNode::get_containing_child(laddr_t laddr)
+{
+  // TODO: binary search
+  auto cursor = begin();
+  while (cursor != end()) {
+    if (cursor.contains(laddr)) {
+      return cursor;
+    }
+    ++cursor;
+  }
+  ceph_assert(0 == "invalid");
+  return end();
+}
+
+// Appends this leaf's entry count and node metadata to out.
+std::ostream &LBALeafNode::print_detail(std::ostream &out) const
+{
+  out << ", size=" << get_size();
+  out << ", meta=" << get_meta();
+  return out;
+}
+
+// Builds a BtreeLBAPin for every leaf entry selected by
+// get_leaf_entries(addr, len) and resolves to the resulting list.
+LBALeafNode::lookup_range_ret LBALeafNode::lookup_range(
+  op_context_t c,
+  laddr_t addr,
+  extent_len_t len)
+{
+  logger().debug(
+    "LBALeafNode::lookup_range {}~{}",
+    addr,
+    len);
+  lba_pin_list_t pins;
+  auto [cursor, last] = get_leaf_entries(addr, len);
+  while (cursor != last) {
+    auto mapping = cursor->get_val();
+    auto start = cursor->get_key();
+    // Stored paddr may be relative to this node; the pin covers
+    // [start, start + len) at depth 0.
+    pins.emplace_back(
+      std::make_unique<BtreeLBAPin>(
+        this,
+        mapping.paddr.maybe_relative_to(get_paddr()),
+        lba_node_meta_t{ start, start + mapping.len, 0}));
+    ++cursor;
+  }
+  return lookup_range_ertr::make_ready_future<lba_pin_list_t>(
+    std::move(pins));
+}
+
+// Inserts the mapping laddr -> val into this leaf and resolves to a
+// BtreeLBAPin describing the inserted range. The leaf must not be full.
+LBALeafNode::insert_ret LBALeafNode::insert(
+ op_context_t c,
+ laddr_t laddr,
+ lba_map_val_t val)
+{
+ ceph_assert(!at_max_capacity());
+
+ // Not pending yet: perform the insert on the writable duplicate instead.
+ if (!is_pending()) {
+ return c.cache.duplicate_for_write(c.trans, this
+ )->cast<LBALeafNode>()->insert(c, laddr, val);
+ }
+
+ // Convert the address to its stored (possibly relative) form.
+ val.paddr = maybe_generate_relative(val.paddr);
+ logger().debug(
+ "LBALeafNode::insert: inserting {}~{} -> {}",
+ laddr,
+ val.len,
+ val.paddr);
+
+ auto insert_pt = lower_bound(laddr);
+ journal_insert(insert_pt, laddr, val, maybe_get_delta_buffer());
+
+ // Log what is now stored at the insertion point.
+ logger().debug(
+ "LBALeafNode::insert: inserted {}~{} -> {}",
+ insert_pt.get_key(),
+ insert_pt.get_val().len,
+ insert_pt.get_val().paddr);
+ auto begin = insert_pt.get_key();
+ return insert_ret(
+ insert_ertr::ready_future_marker{},
+ std::make_unique<BtreeLBAPin>(
+ this,
+ val.paddr.maybe_relative_to(get_paddr()),
+ lba_node_meta_t{ begin, begin + val.len, 0}));
+}
+
+// Entry point for mutating the mapping at laddr; forwards to
+// mutate_mapping_internal with is_root = true.
+LBALeafNode::mutate_mapping_ret LBALeafNode::mutate_mapping(
+ op_context_t c,
+ laddr_t laddr,
+ mutate_func_t &&f)
+{
+ return mutate_mapping_internal(c, laddr, true, std::move(f));
+}
+
+// Applies f to the mapping stored at laddr. The entry is updated in place
+// when the resulting refcount is positive and removed otherwise; either way
+// the mutated value is returned. Fails with enoent if laddr is not present.
+// is_root is only forwarded, not otherwise used by the leaf.
+LBALeafNode::mutate_mapping_ret LBALeafNode::mutate_mapping_internal(
+  op_context_t c,
+  laddr_t laddr,
+  bool is_root,
+  mutate_func_t &&f)
+{
+  auto target = find(laddr);
+  if (target == end()) {
+    return crimson::ct_error::enoent::make();
+  }
+
+  if (!is_pending()) {
+    // Redo the mutation on the writable duplicate of this leaf.
+    auto writable =
+      c.cache.duplicate_for_write(c.trans, this)->cast<LBALeafNode>();
+    return writable->mutate_mapping_internal(
+      c,
+      laddr,
+      is_root,
+      std::move(f));
+  }
+
+  auto before = target.get_val();
+  auto after = f(before);
+
+  after.paddr = maybe_generate_relative(after.paddr);
+
+  logger().debug(
+    "{}: mutate addr {}: {} -> {}",
+    __func__,
+    laddr,
+    before.paddr,
+    after.paddr);
+
+  if (after.refcount > 0) {
+    journal_update(target, after, maybe_get_delta_buffer());
+  } else {
+    journal_remove(target, maybe_get_delta_buffer());
+  }
+  return mutate_mapping_ret(
+    mutate_mapping_ertr::ready_future_marker{},
+    after);
+}
+
+// Not expected to be called on a leaf: asserts unconditionally. The return
+// statement after the assert just echoes paddr back.
+LBALeafNode::mutate_internal_address_ret LBALeafNode::mutate_internal_address(
+ op_context_t c,
+ depth_t depth,
+ laddr_t laddr,
+ paddr_t paddr)
+{
+ ceph_assert(0 == "Impossible");
+ return mutate_internal_address_ret(
+ mutate_internal_address_ertr::ready_future_marker{},
+ paddr);
+}
+
+// Searches this leaf for an unmapped range of at least len starting at or
+// after min and ending at or before max. Resolves to the start of the first
+// such range, or L_ADDR_MAX if none is found in this leaf.
+LBALeafNode::find_hole_ret LBALeafNode::find_hole(
+  op_context_t c,
+  laddr_t min,
+  laddr_t max,
+  extent_len_t len)
+{
+  logger().debug(
+    "LBALeafNode::find_hole min={} max={}, len={}, *this={}",
+    min, max, len, *this);
+  auto [first, last] = bound(min, max);
+  auto cursor = first;
+  while (cursor != last) {
+    // Enough room between the current candidate and this mapping's start.
+    if (min + len <= cursor->get_key()) {
+      return find_hole_ret(
+        find_hole_ertr::ready_future_marker{},
+        min);
+    }
+    // Otherwise the next candidate starts right after this mapping.
+    min = cursor->get_key() + cursor->get_val().len;
+    ++cursor;
+  }
+  // No gap between mappings; check the space left before max.
+  const bool fits = (min + len <= max);
+  return find_hole_ret(
+    find_hole_ertr::ready_future_marker{},
+    fits ? min : L_ADDR_MAX);
+}
+
+// Invokes f(key, paddr, len) for each entry returned by bound(begin, end).
+LBALeafNode::scan_mappings_ret LBALeafNode::scan_mappings(
+  op_context_t c,
+  laddr_t begin,
+  laddr_t end,
+  scan_mappings_func_t &f)
+{
+  auto [first, last] = bound(begin, end);
+  auto cursor = first;
+  while (cursor != last) {
+    auto mapping = cursor->get_val();
+    f(cursor->get_key(), mapping.paddr, mapping.len);
+    ++cursor;
+  }
+  return scan_mappings_ertr::now();
+}
+
+// Reports this leaf's own extent to f, followed by the (paddr, len) of every
+// mapping stored in the leaf.
+LBALeafNode::scan_mapped_space_ret LBALeafNode::scan_mapped_space(
+ op_context_t c,
+ scan_mapped_space_func_t &f)
+{
+ // The extent occupied by this node itself.
+ f(get_paddr(), get_length());
+ for (auto i = begin(); i != end(); ++i) {
+ auto val = i->get_val();
+ // NOTE(review): paddr is passed as stored, without maybe_relative_to();
+ // confirm callers expect that.
+ f(val.paddr, val.len);
+ }
+ // NOTE(review): uses scan_mappings_ertr although the declared return type is
+ // scan_mapped_space_ret -- presumably the two are compatible; confirm.
+ return scan_mappings_ertr::now();
+}
+
+
+// Replaces every relative paddr stored in this leaf's mappings with
+// base.add_relative(paddr), logging each rewrite.
+void LBALeafNode::resolve_relative_addrs(paddr_t base)
+{
+  for (auto entry: *this) {
+    auto mapping = entry->get_val();
+    if (!mapping.paddr.is_relative()) {
+      continue;  // nothing to resolve for this entry
+    }
+    auto stored = mapping.paddr;
+    mapping.paddr = base.add_relative(stored);
+    logger().debug(
+      "LBALeafNode::resolve_relative_addrs {} -> {}",
+      stored,
+      mapping.paddr);
+    entry->set_val(mapping);
+  }
+}
+
+// Returns the iterator pair produced by bound() for [addr, addr + len).
+std::pair<LBALeafNode::internal_iterator_t, LBALeafNode::internal_iterator_t>
+LBALeafNode::get_leaf_entries(laddr_t addr, extent_len_t len)
+{
+ return bound(addr, addr + len);
+}
+
+// Loads the LBA tree node at `offset` (interpreted relative to `base` when it
+// is a relative address) through the cache. depth > 1 yields an
+// LBAInternalNode, depth == 1 an LBALeafNode. After loading, the node's
+// keys are checked against its meta range and, if the node is not pending
+// and its pin is not yet linked, the pin is registered in c.pins.
+Cache::get_extent_ertr::future<LBANodeRef> get_lba_btree_extent(
+ op_context_t c,
+ depth_t depth,
+ paddr_t offset,
+ paddr_t base)
+{
+ offset = offset.maybe_relative_to(base);
+ ceph_assert(depth > 0);
+ if (depth > 1) {
+ logger().debug(
+ "get_lba_btree_extent: reading internal at offset {}, depth {}",
+ offset,
+ depth);
+ return c.cache.get_extent<LBAInternalNode>(
+ c.trans,
+ offset,
+ LBA_BLOCK_SIZE).safe_then([c](auto ret) {
+ auto meta = ret->get_meta();
+ // Sanity check: first and last keys must lie within [meta.begin, meta.end).
+ if (ret->get_size()) {
+ ceph_assert(meta.begin <= ret->begin()->get_key());
+ ceph_assert(meta.end > (ret->end() - 1)->get_key());
+ }
+ if (!ret->is_pending() && !ret->pin.is_linked()) {
+ ret->pin.set_range(meta);
+ c.pins.add_pin(ret->pin);
+ }
+ // Hand the existing reference over to an LBANodeRef without bumping the
+ // count again.
+ return LBANodeRef(ret.detach(), /* add_ref = */ false);
+ });
+ } else {
+ logger().debug(
+ "get_lba_btree_extent: reading leaf at offset {}, depth {}",
+ offset,
+ depth);
+ return c.cache.get_extent<LBALeafNode>(
+ c.trans,
+ offset,
+ LBA_BLOCK_SIZE).safe_then([offset, c](auto ret) {
+ logger().debug(
+ "get_lba_btree_extent: read leaf at offset {} {}",
+ offset,
+ *ret);
+ auto meta = ret->get_meta();
+ // Same range check and pin registration as for internal nodes.
+ if (ret->get_size()) {
+ ceph_assert(meta.begin <= ret->begin()->get_key());
+ ceph_assert(meta.end > (ret->end() - 1)->get_key());
+ }
+ if (!ret->is_pending() && !ret->pin.is_linked()) {
+ ret->pin.set_range(meta);
+ c.pins.add_pin(ret->pin);
+ }
+ return LBANodeRef(ret.detach(), /* add_ref = */ false);
+ });
+ }
+}
+
+}
diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h
new file mode 100644
index 000000000..230eef682
--- /dev/null
+++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h
@@ -0,0 +1,555 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <sys/mman.h>
+#include <string.h>
+
+#include <memory>
+#include <string.h>
+
+#include "include/buffer.h"
+
+#include "crimson/common/fixed_kv_node_layout.h"
+#include "crimson/common/errorator.h"
+#include "crimson/os/seastore/lba_manager.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/cache.h"
+#include "crimson/os/seastore/cached_extent.h"
+#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h"
+#include "crimson/os/seastore/lba_manager/btree/btree_range_pin.h"
+
+namespace crimson::os::seastore::lba_manager::btree {
+
+constexpr size_t LBA_BLOCK_SIZE = 4096;
+
+/**
+ * lba_node_meta_le_t
+ *
+ * On disk layout for lba_node_meta_t
+ *
+ * Holds begin, end and depth in their little-endian wrapper types and
+ * converts to and from the in-memory lba_node_meta_t.
+ */
+struct lba_node_meta_le_t {
+ laddr_le_t begin = laddr_le_t(0);
+ laddr_le_t end = laddr_le_t(0);
+ depth_le_t depth = init_les32(0);
+
+ lba_node_meta_le_t() = default;
+ lba_node_meta_le_t(const lba_node_meta_le_t &) = default;
+ // Builds the on-disk form from the in-memory value.
+ explicit lba_node_meta_le_t(const lba_node_meta_t &val)
+ : begin(init_le64(val.begin)),
+ end(init_le64(val.end)),
+ depth(init_les32(val.depth)) {}
+
+ // Implicit conversion back to the in-memory value.
+ operator lba_node_meta_t() const {
+ return lba_node_meta_t{ begin, end, depth };
+ }
+};
+
+
+/**
+ * LBAInternalNode
+ *
+ * Abstracts operations on and layout of internal nodes for the
+ * LBA Tree.
+ *
+ * Layout (4k):
+ * size : uint32_t[1] 4b
+ * (padding) : 4b
+ * meta : lba_node_meta_le_t[3] (1*24)b
+ * keys : laddr_t[254] (254*8)b
+ * values : paddr_t[254] (254*8)b
+ * = 4096
+
+ * TODO: make the above capacity calculation part of FixedKVNodeLayout
+ * TODO: the above alignment probably isn't portable without further work
+ */
+constexpr size_t INTERNAL_NODE_CAPACITY = 254;
+struct LBAInternalNode
+ : LBANode,
+ common::FixedKVNodeLayout<
+ INTERNAL_NODE_CAPACITY,
+ lba_node_meta_t, lba_node_meta_le_t,
+ laddr_t, laddr_le_t,
+ paddr_t, paddr_le_t> {
+ using internal_iterator_t = const_iterator;
+ // Forwards all arguments to LBANode and points the fixed key/value layout
+ // at the buffer returned by get_bptr().
+ template <typename... T>
+ LBAInternalNode(T&&... t) :
+ LBANode(std::forward<T>(t)...),
+ FixedKVNodeLayout(get_bptr().c_str()) {}
+
+ static constexpr extent_types_t type = extent_types_t::LADDR_INTERNAL;
+
+ lba_node_meta_t get_node_meta() const final { return get_meta(); }
+
+ // Returns a copy of this node for mutation; the source node must have no
+ // buffered deltas.
+ CachedExtentRef duplicate_for_write() final {
+ assert(delta_buffer.empty());
+ return CachedExtentRef(new LBAInternalNode(*this));
+ };
+
+ // Buffer of deltas recorded for this node.
+ delta_buffer_t delta_buffer;
+ // Returns the delta buffer only while is_mutation_pending(); otherwise
+ // nullptr.
+ delta_buffer_t *maybe_get_delta_buffer() {
+ return is_mutation_pending() ? &delta_buffer : nullptr;
+ }
+
+ lookup_ret lookup(op_context_t c, laddr_t addr, depth_t depth) final;
+
+ lookup_range_ret lookup_range(
+ op_context_t c,
+ laddr_t addr,
+ extent_len_t len) final;
+
+ insert_ret insert(
+ op_context_t c,
+ laddr_t laddr,
+ lba_map_val_t val) final;
+
+ mutate_mapping_ret mutate_mapping(
+ op_context_t c,
+ laddr_t laddr,
+ mutate_func_t &&f) final;
+ mutate_mapping_ret mutate_mapping_internal(
+ op_context_t c,
+ laddr_t laddr,
+ bool is_root,
+ mutate_func_t &&f) final;
+
+ mutate_internal_address_ret mutate_internal_address(
+ op_context_t c,
+ depth_t depth,
+ laddr_t laddr,
+ paddr_t paddr) final;
+
+ find_hole_ret find_hole(
+ op_context_t c,
+ laddr_t min,
+ laddr_t max,
+ extent_len_t len) final;
+
+ scan_mappings_ret scan_mappings(
+ op_context_t c,
+ laddr_t begin,
+ laddr_t end,
+ scan_mappings_func_t &f) final;
+
+ scan_mapped_space_ret scan_mapped_space(
+ op_context_t c,
+ scan_mapped_space_func_t &f) final;
+
+ // Allocates two new internal nodes, splits this node's contents between
+ // them and returns (left, right, pivot key).
+ std::tuple<LBANodeRef, LBANodeRef, laddr_t>
+ make_split_children(op_context_t c) final {
+ auto left = c.cache.alloc_new_extent<LBAInternalNode>(
+ c.trans, LBA_BLOCK_SIZE);
+ auto right = c.cache.alloc_new_extent<LBAInternalNode>(
+ c.trans, LBA_BLOCK_SIZE);
+ auto pivot = split_into(*left, *right);
+ // Each new node's pin takes the range from its freshly written meta.
+ left->pin.set_range(left->get_meta());
+ right->pin.set_range(right->get_meta());
+ return std::make_tuple(
+ left,
+ right,
+ pivot);
+ }
+
+ // Allocates a new internal node holding the merged contents of this node
+ // and `right` (which is cast to LBAInternalNode).
+ LBANodeRef make_full_merge(
+ op_context_t c,
+ LBANodeRef &right) final {
+ auto replacement = c.cache.alloc_new_extent<LBAInternalNode>(
+ c.trans, LBA_BLOCK_SIZE);
+ replacement->merge_from(*this, *right->cast<LBAInternalNode>());
+ replacement->pin.set_range(replacement->get_meta());
+ return replacement;
+ }
+
+ // Redistributes the entries of this node and `_right` into two newly
+ // allocated internal nodes; returns (new left, new right, pivot key).
+ // prefer_left is forwarded to balance_into_new_nodes.
+ std::tuple<LBANodeRef, LBANodeRef, laddr_t>
+ make_balanced(
+ op_context_t c,
+ LBANodeRef &_right,
+ bool prefer_left) final {
+ // Both sides must be internal nodes.
+ ceph_assert(_right->get_type() == type);
+ auto &right = *_right->cast<LBAInternalNode>();
+ auto replacement_left = c.cache.alloc_new_extent<LBAInternalNode>(
+ c.trans, LBA_BLOCK_SIZE);
+ auto replacement_right = c.cache.alloc_new_extent<LBAInternalNode>(
+ c.trans, LBA_BLOCK_SIZE);
+
+ auto pivot = balance_into_new_nodes(
+ *this,
+ right,
+ prefer_left,
+ *replacement_left,
+ *replacement_right);
+
+ replacement_left->pin.set_range(replacement_left->get_meta());
+ replacement_right->pin.set_range(replacement_right->get_meta());
+ return std::make_tuple(
+ replacement_left,
+ replacement_right,
+ pivot);
+ }
+
+ /**
+ * Internal relative addresses on read or in memory prior to commit
+ * are either record or block relative depending on whether this
+ * physical node is is_initial_pending() or just is_pending().
+ *
+ * User passes appropriate base depending on lifecycle and
+ * resolve_relative_addrs fixes up relative internal references
+ * based on base.
+ */
+ void resolve_relative_addrs(paddr_t base) final;
+ // For an initial-pending node, converts relative (asserted block-relative)
+ // values in [from, to) by adding them to this node's paddr.
+ void node_resolve_vals(iterator from, iterator to) const final {
+ if (is_initial_pending()) {
+ for (auto i = from; i != to; ++i) {
+ if (i->get_val().is_relative()) {
+ assert(i->get_val().is_block_relative());
+ i->set_val(get_paddr().add_relative(i->get_val()));
+ }
+ }
+ }
+ }
+ // For an initial-pending node, converts relative (asserted record-relative)
+ // values in [from, to) by subtracting this node's paddr from them.
+ void node_unresolve_vals(iterator from, iterator to) const final {
+ if (is_initial_pending()) {
+ for (auto i = from; i != to; ++i) {
+ if (i->get_val().is_relative()) {
+ assert(i->get_val().is_record_relative());
+ i->set_val(i->get_val() - get_paddr());
+ }
+ }
+ }
+ }
+
+ extent_types_t get_type() const final {
+ return type;
+ }
+
+ std::ostream &print_detail(std::ostream &out) const final;
+
+ // Serializes the buffered deltas into a single-segment bufferlist. The
+ // delta buffer must be non-empty.
+ ceph::bufferlist get_delta() final {
+ assert(!delta_buffer.empty());
+ ceph::buffer::ptr bptr(delta_buffer.get_bytes());
+ delta_buffer.copy_out(bptr.c_str(), bptr.length());
+ ceph::bufferlist bl;
+ bl.push_back(bptr);
+ return bl;
+ }
+
+ // Replays the encoded deltas in _bl onto this node, records the resulting
+ // crc as last committed, then resolves relative addresses against base.
+ void apply_delta_and_adjust_crc(
+ paddr_t base, const ceph::bufferlist &_bl) final {
+ assert(_bl.length());
+ // Work on a copy rebuilt into one contiguous segment so front() covers it.
+ ceph::bufferlist bl = _bl;
+ bl.rebuild();
+ delta_buffer_t buffer;
+ buffer.copy_in(bl.front().c_str(), bl.front().length());
+ buffer.replay(*this);
+ set_last_committed_crc(get_crc32c());
+ resolve_relative_addrs(base);
+ }
+
+ // True when the node holds get_capacity() entries.
+ bool at_max_capacity() const final {
+ return get_size() == get_capacity();
+ }
+
+ // True when the node holds exactly half of get_capacity() entries.
+ // Marked final to match the other overrides in this struct and the
+ // LBALeafNode counterpart.
+ bool at_min_capacity() const final {
+   return get_size() == (get_capacity() / 2);
+ }
+
+ /// returns iterators containing [l, r)
+ std::pair<internal_iterator_t, internal_iterator_t> bound(
+   laddr_t l, laddr_t r) {
+   // TODO: inefficient
+   // First entry whose next key (or max) lies beyond l.
+   auto lower = begin();
+   while (lower != end() && !(lower->get_next_key_or_max() > l)) {
+     ++lower;
+   }
+   // First entry at or after lower whose key is >= r.
+   auto upper = lower;
+   while (upper != end() && !(upper->get_key() >= r)) {
+     ++upper;
+   }
+   return std::make_pair(lower, upper);
+ }
+
+ using split_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error
+ >;
+ using split_ret = split_ertr::future<LBANodeRef>;
+ split_ret split_entry(
+ op_context_t c,
+ laddr_t addr,
+ internal_iterator_t,
+ LBANodeRef entry);
+
+ using merge_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error
+ >;
+ using merge_ret = merge_ertr::future<LBANodeRef>;
+ merge_ret merge_entry(
+ op_context_t c,
+ laddr_t addr,
+ internal_iterator_t,
+ LBANodeRef entry,
+ bool is_root);
+
+ /// returns iterator for subtree containing laddr
+ internal_iterator_t get_containing_child(laddr_t laddr);
+};
+
+/**
+ * LBALeafNode
+ *
+ * Abstracts operations on and layout of leaf nodes for the
+ * LBA Tree.
+ *
+ * Layout (4k):
+ * size : uint32_t[1] 4b
+ * (padding) : 4b
+ * meta : lba_node_meta_le_t[3] (1*24)b
+ * keys : laddr_t[145] (145*8)b
+ * values : lba_map_val_t[145] (145*20)b
+ * = 4092
+ *
+ * TODO: update FixedKVNodeLayout to handle the above calculation
+ * TODO: the above alignment probably isn't portable without further work
+ */
+constexpr size_t LEAF_NODE_CAPACITY = 145;
+
+/**
+ * lba_map_val_le_t
+ *
+ * On disk layout for lba_map_val_t.
+ *
+ * Holds len, paddr, refcount and checksum in their little-endian wrapper
+ * types and converts to and from the in-memory lba_map_val_t.
+ */
+struct lba_map_val_le_t {
+ extent_len_le_t len = init_extent_len_le_t(0);
+ paddr_le_t paddr;
+ ceph_le32 refcount = init_le32(0);
+ ceph_le32 checksum = init_le32(0);
+
+ lba_map_val_le_t() = default;
+ lba_map_val_le_t(const lba_map_val_le_t &) = default;
+ // Builds the on-disk form from the in-memory value.
+ explicit lba_map_val_le_t(const lba_map_val_t &val)
+ : len(init_extent_len_le_t(val.len)),
+ paddr(paddr_le_t(val.paddr)),
+ refcount(init_le32(val.refcount)),
+ checksum(init_le32(val.checksum)) {}
+
+ // Implicit conversion back to the in-memory value.
+ operator lba_map_val_t() const {
+ return lba_map_val_t{ len, paddr, refcount, checksum };
+ }
+};
+
+struct LBALeafNode
+ : LBANode,
+ common::FixedKVNodeLayout<
+ LEAF_NODE_CAPACITY,
+ lba_node_meta_t, lba_node_meta_le_t,
+ laddr_t, laddr_le_t,
+ lba_map_val_t, lba_map_val_le_t> {
+ using internal_iterator_t = const_iterator;
+ // Forwards all arguments to LBANode and points the fixed key/value layout
+ // at the buffer returned by get_bptr().
+ template <typename... T>
+ LBALeafNode(T&&... t) :
+ LBANode(std::forward<T>(t)...),
+ FixedKVNodeLayout(get_bptr().c_str()) {}
+
+ static constexpr extent_types_t type = extent_types_t::LADDR_LEAF;
+
+ lba_node_meta_t get_node_meta() const final { return get_meta(); }
+
+ // Returns a copy of this leaf for mutation; the source leaf must have no
+ // buffered deltas.
+ CachedExtentRef duplicate_for_write() final {
+ assert(delta_buffer.empty());
+ return CachedExtentRef(new LBALeafNode(*this));
+ };
+
+ // Buffer of deltas recorded for this leaf.
+ delta_buffer_t delta_buffer;
+ // Returns the delta buffer only while is_mutation_pending(); otherwise
+ // nullptr.
+ delta_buffer_t *maybe_get_delta_buffer() {
+ return is_mutation_pending() ? &delta_buffer : nullptr;
+ }
+
+ // A leaf is the end of the descent: resolves immediately to this node,
+ // ignoring addr and depth.
+ lookup_ret lookup(op_context_t c, laddr_t addr, depth_t depth) final
+ {
+ return lookup_ret(
+ lookup_ertr::ready_future_marker{},
+ this);
+ }
+
+ lookup_range_ret lookup_range(
+ op_context_t c,
+ laddr_t addr,
+ extent_len_t len) final;
+
+ insert_ret insert(
+ op_context_t c,
+ laddr_t laddr,
+ lba_map_val_t val) final;
+
+ mutate_mapping_ret mutate_mapping(
+ op_context_t c,
+ laddr_t laddr,
+ mutate_func_t &&f) final;
+ mutate_mapping_ret mutate_mapping_internal(
+ op_context_t c,
+ laddr_t laddr,
+ bool is_root,
+ mutate_func_t &&f) final;
+
+ mutate_internal_address_ret mutate_internal_address(
+ op_context_t c,
+ depth_t depth,
+ laddr_t laddr,
+ paddr_t paddr) final;
+
+ find_hole_ret find_hole(
+ op_context_t c,
+ laddr_t min,
+ laddr_t max,
+ extent_len_t len) final;
+
+ scan_mappings_ret scan_mappings(
+ op_context_t c,
+ laddr_t begin,
+ laddr_t end,
+ scan_mappings_func_t &f) final;
+
+ scan_mapped_space_ret scan_mapped_space(
+ op_context_t c,
+ scan_mapped_space_func_t &f) final;
+
+ // Allocates two new leaves, splits this leaf's contents between them and
+ // returns (left, right, pivot key).
+ std::tuple<LBANodeRef, LBANodeRef, laddr_t>
+ make_split_children(op_context_t c) final {
+ auto left = c.cache.alloc_new_extent<LBALeafNode>(
+ c.trans, LBA_BLOCK_SIZE);
+ auto right = c.cache.alloc_new_extent<LBALeafNode>(
+ c.trans, LBA_BLOCK_SIZE);
+ auto pivot = split_into(*left, *right);
+ // Each new leaf's pin takes the range from its freshly written meta.
+ left->pin.set_range(left->get_meta());
+ right->pin.set_range(right->get_meta());
+ return std::make_tuple(
+ left,
+ right,
+ pivot);
+ }
+
+ // Allocates a new leaf holding the merged contents of this leaf and
+ // `right` (which is cast to LBALeafNode).
+ LBANodeRef make_full_merge(
+ op_context_t c,
+ LBANodeRef &right) final {
+ auto replacement = c.cache.alloc_new_extent<LBALeafNode>(
+ c.trans, LBA_BLOCK_SIZE);
+ replacement->merge_from(*this, *right->cast<LBALeafNode>());
+ replacement->pin.set_range(replacement->get_meta());
+ return replacement;
+ }
+
+ // Redistributes the entries of this leaf and `_right` into two newly
+ // allocated leaves; returns (new left, new right, pivot key).
+ // prefer_left is forwarded to balance_into_new_nodes.
+ std::tuple<LBANodeRef, LBANodeRef, laddr_t>
+ make_balanced(
+ op_context_t c,
+ LBANodeRef &_right,
+ bool prefer_left) final {
+ // Both sides must be leaves.
+ ceph_assert(_right->get_type() == type);
+ auto &right = *_right->cast<LBALeafNode>();
+ auto replacement_left = c.cache.alloc_new_extent<LBALeafNode>(
+ c.trans, LBA_BLOCK_SIZE);
+ auto replacement_right = c.cache.alloc_new_extent<LBALeafNode>(
+ c.trans, LBA_BLOCK_SIZE);
+
+ auto pivot = balance_into_new_nodes(
+ *this,
+ right,
+ prefer_left,
+ *replacement_left,
+ *replacement_right);
+
+ replacement_left->pin.set_range(replacement_left->get_meta());
+ replacement_right->pin.set_range(replacement_right->get_meta());
+ return std::make_tuple(
+ replacement_left,
+ replacement_right,
+ pivot);
+ }
+
+ // See LBAInternalNode, same concept
+ void resolve_relative_addrs(paddr_t base) final;
+ // For an initial-pending leaf, converts relative (asserted block-relative)
+ // paddrs of the mappings in [from, to) by adding them to this node's paddr.
+ void node_resolve_vals(iterator from, iterator to) const final {
+ if (is_initial_pending()) {
+ for (auto i = from; i != to; ++i) {
+ auto val = i->get_val();
+ if (val.paddr.is_relative()) {
+ assert(val.paddr.is_block_relative());
+ val.paddr = get_paddr().add_relative(val.paddr);
+ i->set_val(val);
+ }
+ }
+ }
+ }
+ // For an initial-pending leaf, converts relative (asserted record-relative)
+ // paddrs of the mappings in [from, to) by subtracting this node's paddr.
+ void node_unresolve_vals(iterator from, iterator to) const final {
+   if (is_initial_pending()) {
+     for (auto i = from; i != to; ++i) {
+       // Read the mapping once; the original re-declared a shadowing copy
+       // of val inside the branch.
+       auto val = i->get_val();
+       if (val.paddr.is_relative()) {
+         assert(val.paddr.is_record_relative());
+         val.paddr = val.paddr - get_paddr();
+         i->set_val(val);
+       }
+     }
+   }
+ }
+
+ // Serializes the buffered deltas into a single-segment bufferlist. The
+ // delta buffer must be non-empty.
+ ceph::bufferlist get_delta() final {
+ assert(!delta_buffer.empty());
+ ceph::buffer::ptr bptr(delta_buffer.get_bytes());
+ delta_buffer.copy_out(bptr.c_str(), bptr.length());
+ ceph::bufferlist bl;
+ bl.push_back(bptr);
+ return bl;
+ }
+
+ // Replays the encoded deltas in _bl onto this leaf, records the resulting
+ // crc as last committed, then resolves relative addresses against base.
+ void apply_delta_and_adjust_crc(
+ paddr_t base, const ceph::bufferlist &_bl) final {
+ assert(_bl.length());
+ // Work on a copy rebuilt into one contiguous segment so front() covers it.
+ ceph::bufferlist bl = _bl;
+ bl.rebuild();
+ delta_buffer_t buffer;
+ buffer.copy_in(bl.front().c_str(), bl.front().length());
+ buffer.replay(*this);
+ set_last_committed_crc(get_crc32c());
+ resolve_relative_addrs(base);
+ }
+
+ extent_types_t get_type() const final {
+ return type;
+ }
+
+ std::ostream &print_detail(std::ostream &out) const final;
+
+ // True when the leaf holds get_capacity() entries.
+ bool at_max_capacity() const final {
+ return get_size() == get_capacity();
+ }
+
+ // True when the leaf holds exactly half of get_capacity() entries.
+ bool at_min_capacity() const final {
+ return get_size() == (get_capacity() / 2);
+ }
+
+ /// returns iterators <lb, ub> containing addresses [l, r)
+ std::pair<internal_iterator_t, internal_iterator_t> bound(
+   laddr_t l, laddr_t r) {
+   // TODO: inefficient
+   // First mapping that starts at/after l or whose extent reaches past l.
+   auto lower = begin();
+   while (lower != end()) {
+     if (lower->get_key() >= l ||
+         (lower->get_key() + lower->get_val().len) > l) {
+       break;
+     }
+     ++lower;
+   }
+   // First mapping at or after lower whose key is >= r.
+   auto upper = lower;
+   while (upper != end() && !(upper->get_key() >= r)) {
+     ++upper;
+   }
+   return std::make_pair(lower, upper);
+ }
+
+ std::pair<internal_iterator_t, internal_iterator_t>
+ get_leaf_entries(laddr_t addr, extent_len_t len);
+};
+using LBALeafNodeRef = TCachedExtentRef<LBALeafNode>;
+
+}
diff --git a/src/crimson/os/seastore/onode.cc b/src/crimson/os/seastore/onode.cc
new file mode 100644
index 000000000..a8b925b70
--- /dev/null
+++ b/src/crimson/os/seastore/onode.cc
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "onode.h"
+#include "include/encoding.h"
+
+namespace crimson::os::seastore {
+
+// Size of this onode as reported by ceph::encoded_sizeof.
+size_t Onode::size() const
+{
+ return ceph::encoded_sizeof(*this);
+}
+
+// Writes this onode into the caller-provided buffer using a packed header
+// (struct_v, struct_compat, struct_len, len) followed by the payload bytes.
+// `len` is the buffer capacity and is only checked by assert.
+void Onode::encode(void* buffer, size_t len)
+{
+ // Packed so the fields are laid out back to back with no padding.
+ struct [[gnu::packed]] encoded_t {
+ uint8_t struct_v;
+ uint8_t struct_compat;
+ uint32_t struct_len;
+ uint32_t len;
+ char data[];
+ };
+ auto p = reinterpret_cast<encoded_t*>(buffer);
+ assert(std::numeric_limits<uint16_t>::max() >= size());
+ assert(len >= size());
+ p->struct_v = 1;
+ p->struct_compat = 1;
+ // NOTE(review): struct_len includes the header size here; confirm this
+ // matches what the DENC_START/DENC_FINISH decoder expects for struct_len.
+ p->struct_len = sizeof(encoded_t) + payload.size();
+ p->len = payload.size();
+ std::memcpy(p->data, payload.data(), payload.size());
+}
+
+// Two onodes are equal when their payload strings are equal.
+bool operator==(const Onode& lhs, const Onode& rhs)
+{
+ return lhs.get() == rhs.get();
+}
+
+// Streams the onode's payload string.
+std::ostream& operator<<(std::ostream &out, const Onode &rhs)
+{
+ return out << rhs.get();
+}
+
+}
+
diff --git a/src/crimson/os/seastore/onode.h b/src/crimson/os/seastore/onode.h
new file mode 100644
index 000000000..4d7783028
--- /dev/null
+++ b/src/crimson/os/seastore/onode.h
@@ -0,0 +1,48 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+#include <limits>
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include "include/buffer.h"
+#include "include/denc.h"
+
+namespace crimson::os::seastore {
+
+// in-memory onode, in addition to the stuff that should be persisted to disk,
+// it may contain intrusive hooks for LRU, rw locks etc
+//
+// Reference counted through boost::intrusive_ref_counter with the
+// thread_unsafe_counter policy (see OnodeRef).
+class Onode : public boost::intrusive_ref_counter<
+ Onode,
+ boost::thread_unsafe_counter>
+{
+public:
+ // Copies s into the payload.
+ Onode(std::string_view s)
+ : payload{s}
+ {}
+ // Encoded size of this onode.
+ size_t size() const;
+ // Read-only access to the payload.
+ const std::string& get() const {
+ return payload;
+ }
+ // Writes the encoded form into buffer, which has room for len bytes.
+ void encode(void* buffer, size_t len);
+ // denc description: version 1 / compat 1, containing only the payload.
+ DENC(Onode, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.payload, p);
+ DENC_FINISH(p);
+ }
+
+private:
+ // dummy payload
+ std::string payload;
+};
+
+bool operator==(const Onode& lhs, const Onode& rhs);
+std::ostream& operator<<(std::ostream &out, const Onode &rhs);
+using OnodeRef = boost::intrusive_ptr<Onode>;
+}
+
+WRITE_CLASS_DENC(crimson::os::seastore::Onode)
diff --git a/src/crimson/os/seastore/onode_manager.h b/src/crimson/os/seastore/onode_manager.h
new file mode 100644
index 000000000..0a03b7fdf
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager.h
@@ -0,0 +1,57 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <seastar/core/future.hh>
+
+#include "include/buffer_fwd.h"
+#include "include/ceph_assert.h"
+#include "common/hobject.h"
+
+#include "crimson/os/seastore/onode.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/transaction_manager.h"
+#include "crimson/osd/exceptions.h"
+
+namespace crimson::os::seastore {
+
+// Interface for looking up / creating onodes and writing back dirty ones.
+// The default implementations in this base class do no work: the getters
+// resolve to default-constructed results and write_dirty resolves
+// immediately.
+class OnodeManager {
+public:
+ using open_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ // Default: resolves to a default-constructed (null) OnodeRef.
+ virtual open_ertr::future<OnodeRef> get_or_create_onode(
+ Transaction &trans,
+ const ghobject_t &hoid) {
+ return open_ertr::make_ready_future<OnodeRef>();
+ }
+ // Default: resolves to an empty vector regardless of hoids.
+ virtual open_ertr::future<std::vector<OnodeRef>> get_or_create_onodes(
+ Transaction &trans,
+ const std::vector<ghobject_t> &hoids) {
+ return open_ertr::make_ready_future<std::vector<OnodeRef>>();
+ }
+
+ using write_ertr= crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ // Default: nothing to write; resolves immediately.
+ virtual write_ertr::future<> write_dirty(
+ Transaction &trans,
+ const std::vector<OnodeRef> &onodes) {
+ return write_ertr::now();
+ }
+ virtual ~OnodeManager() {}
+};
+using OnodeManagerRef = std::unique_ptr<OnodeManager>;
+
+namespace onode_manager {
+
+// Returns an empty (null) OnodeManagerRef.
+// Declared inline because this definition lives in a header: without it,
+// including the header from more than one translation unit produces
+// duplicate definitions at link time.
+inline OnodeManagerRef create_ephemeral() {
+  return OnodeManagerRef();
+}
+
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.cc b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.cc
new file mode 100644
index 000000000..b05ea76a3
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.cc
@@ -0,0 +1,71 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "onode_block.h"
+
+namespace crimson::os::seastore {
+
+// Encodes the stashed deltas as a one-byte count followed by each delta's
+// own encoding.
+ceph::bufferlist OnodeBlock::get_delta()
+{
+ bufferlist bl;
+ // The count is stored in a uint8_t, so at most 255 deltas can be encoded.
+ assert(deltas.size() <= std::numeric_limits<uint8_t>::max());
+ uint8_t n_deltas = deltas.size();
+ ceph::encode(n_deltas, bl);
+ for (auto& delta : deltas) {
+ delta->encode(bl);
+ }
+ return bl;
+}
+
+// Called once the delta has been written: applies the stashed deltas to the
+// buffer and releases them.
+void OnodeBlock::logical_on_delta_write()
+{
+ // journal submitted to disk, now update the memory
+ apply_pending_changes(true);
+}
+
+// Decodes the deltas produced by get_delta() (one-byte count, then each
+// delta), feeds each through mutate(), and finally applies them with cleanup.
+// Requires that no deltas are currently stashed.
+void OnodeBlock::apply_delta(const ceph::bufferlist &bl)
+{
+ assert(deltas.empty());
+
+ auto p = bl.cbegin();
+ uint8_t n_deltas = 0;
+ ceph::decode(n_deltas, p);
+ for (uint8_t i = 0; i < n_deltas; i++) {
+ delta_t delta;
+ delta.decode(p);
+ mutate(std::move(delta));
+ }
+ // NOTE(review): apply_pending_changes() returns early unless
+ // is_mutation_pending(); confirm that holds on this path.
+ apply_pending_changes(true);
+}
+
+// Records delta d. For an initial-pending block the delta is also applied to
+// the buffer right away via mutate_func; in every case it is appended to
+// deltas.
+void OnodeBlock::mutate(delta_t&& d)
+{
+ if (is_initial_pending()) {
+ char* const p = get_bptr().c_str();
+ mutate_func(p, d);
+ }
+ deltas.push_back(std::make_unique<delta_t>(std::move(d)));
+}
+
+// Applies every stashed delta to this block's buffer via mutate_func. Does
+// nothing unless is_mutation_pending(). If the buffer is still shared with
+// the block this one was duplicated from, it is deep-copied first. With
+// do_cleanup the stashed deltas are discarded afterwards.
+void OnodeBlock::apply_pending_changes(bool do_cleanup)
+{
+  if (!is_mutation_pending()) {
+    return;
+  }
+  if (share_buffer) {
+    // do a deep copy so i can change my own copy
+    get_bptr() = ceph::bufferptr{get_bptr().c_str(),
+                                 get_bptr().length()};
+    share_buffer = false;
+  }
+  assert(mutate_func);
+  char* const p = get_bptr().c_str();
+  for (auto& delta : deltas) {
+    mutate_func(p, *delta);
+  }
+  if (do_cleanup) {
+    // Drop the entries themselves rather than leaving null pointers in the
+    // container: later passes over deltas dereference every element.
+    deltas.clear();
+  }
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.h b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.h
new file mode 100644
index 000000000..0025d9847
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.h
@@ -0,0 +1,65 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <cstdint>
+#include <boost/container/small_vector.hpp>
+
+#include "crimson/os/seastore/transaction_manager.h"
+#include "onode_delta.h"
+
+namespace crimson::os::seastore {
+
+// TODO s/CachedExtent/LogicalCachedExtent/
+// Logical extent that stashes mutations as a list of delta_t and applies
+// them to its buffer through a caller-supplied mutate_func.
+struct OnodeBlock final : LogicalCachedExtent {
+ using Ref = TCachedExtentRef<OnodeBlock>;
+
+ // Forwards all arguments to LogicalCachedExtent.
+ template <typename... T>
+ OnodeBlock(T&&... t) : LogicalCachedExtent(std::forward<T>(t)...) {}
+ OnodeBlock(OnodeBlock&& block) = delete;
+ // Duplicating constructor: the new block starts out sharing the source
+ // block's buffer (see share_buffer).
+ OnodeBlock(const OnodeBlock& block, CachedExtent::share_buffer_t tag) noexcept
+ : LogicalCachedExtent{block, tag},
+ share_buffer{true}
+ {}
+
+ // Returns a buffer-sharing duplicate of this block.
+ CachedExtentRef duplicate_for_write() final {
+ return new OnodeBlock{*this, CachedExtent::share_buffer_t{}};
+ }
+
+ // could materialize the pending changes to the underlying buffer here,
+ // but since we write the change to the buffer immediately, let skip
+ // this for now.
+ void prepare_write() final {}
+
+ // queries
+ static constexpr extent_types_t TYPE = extent_types_t::ONODE_BLOCK;
+ extent_types_t get_type() const final {
+ return TYPE;
+ }
+
+ // have to stash all the changes before on_delta_write() is called,
+ // otherwise we could pollute the extent with pending mutations
+ // before the transaction carrying these mutations is committed to
+ // disk
+ ceph::bufferlist get_delta() final;
+ void logical_on_delta_write() final;
+ void apply_delta(const ceph::bufferlist &bl) final;
+
+ // Applies the stashed deltas to the buffer but keeps them stashed.
+ void sync() {
+ apply_pending_changes(false);
+ }
+ // Records (and, for an initial-pending block, immediately applies) d.
+ void mutate(delta_t&& d);
+ // Callback that applies one delta to a raw block buffer.
+ using mutate_func_t = std::function<void (char*, const delta_t&)>;
+ void set_delta_applier(mutate_func_t&& func) {
+ mutate_func = std::move(func);
+ }
+private:
+ // before looking at the extent, we need to make sure the content is up to date
+ void apply_pending_changes(bool do_cleanup);
+ // assuming we don't stash too many deltas to a single block
+ // otherwise a fullwrite op is necessary
+ boost::container::small_vector<std::unique_ptr<delta_t>, 2> deltas;
+ mutate_func_t mutate_func;
+ // True while the buffer is still shared with the block this one was
+ // duplicated from.
+ bool share_buffer = false;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.cc b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.cc
new file mode 100644
index 000000000..869685d45
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.cc
@@ -0,0 +1,188 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "onode_delta.h"
+
+// Move constructor: takes over every field of @c delta and resets the source
+// to a nop so that it no longer describes an operation.
+delta_t::delta_t(delta_t&& delta)
+{
+  // op starts out as nop thanks to its in-class initializer
+  assert(op == op_t::nop);
+  op = delta.op;
+  n = delta.n;
+  oid = std::move(delta.oid);
+  // addr is the payload of insert_child; it has to travel with the delta,
+  // otherwise the moved-to object keeps the default address
+  addr = delta.addr;
+  onode = std::move(delta.onode);
+  keys = std::move(delta.keys);
+  cells = std::move(delta.cells);
+  delta.op = op_t::nop;
+}
+
+// Move assignment: only valid when this delta is still a nop (asserted).
+// Takes over every field of @c delta and resets the source to a nop.
+// @return *this
+delta_t& delta_t::operator=(delta_t&& delta)
+{
+  assert(op == op_t::nop);
+  op = delta.op;
+  n = delta.n;
+  oid = std::move(delta.oid);
+  // addr is the payload of insert_child; it has to travel with the delta,
+  // otherwise the assigned-to object keeps its old address
+  addr = delta.addr;
+  onode = std::move(delta.onode);
+  keys = std::move(delta.keys);
+  cells = std::move(delta.cells);
+  delta.op = op_t::nop;
+  return *this;
+}
+
+// Builds a delta whose op is nop; all other fields keep their defaults.
+delta_t delta_t::nop()
+{
+ return delta_t{op_t::nop};
+}
+
+// Builds an insert_onode delta carrying the slot, the object id and the onode.
+delta_t delta_t::insert_onode(unsigned slot, const ghobject_t& oid, OnodeRef onode)
+{
+  delta_t result{op_t::insert_onode};
+  result.n = slot;
+  result.oid = oid;
+  result.onode = onode;
+  return result;
+}
+
+// Builds an update_onode delta carrying the slot, the object id and the onode.
+delta_t delta_t::update_onode(unsigned slot, const ghobject_t& oid, OnodeRef onode)
+{
+  delta_t result{op_t::update_onode};
+  result.n = slot;
+  result.oid = oid;
+  result.onode = onode;
+  return result;
+}
+
+// Builds an insert_child delta carrying the slot, the object id and the
+// logical address of the child.
+delta_t delta_t::insert_child(unsigned slot,
+                              const ghobject_t& oid,
+                              crimson::os::seastore::laddr_t addr)
+{
+  delta_t result{op_t::insert_child};
+  result.n = slot;
+  result.oid = oid;
+  result.addr = addr;
+  return result;
+}
+
+// Builds an update_key delta carrying the slot and the new object id.
+delta_t delta_t::update_key(unsigned slot, const ghobject_t& oid)
+{
+  delta_t result{op_t::update_key};
+  result.n = slot;
+  result.oid = oid;
+  return result;
+}
+
+// Builds a shift_left delta; @c n is stored in the delta's n field.
+delta_t delta_t::shift_left(unsigned n)
+{
+  delta_t result{op_t::shift_left};
+  result.n = n;
+  return result;
+}
+
+// Builds a trim_right delta; @c n is stored in the delta's n field.
+delta_t delta_t::trim_right(unsigned n)
+{
+  delta_t result{op_t::trim_right};
+  result.n = n;
+  return result;
+}
+
+// Builds an insert_front delta that takes ownership of the serialized keys
+// and cells buffers.
+delta_t delta_t::insert_front(ceph::buffer::ptr keys,
+                              ceph::buffer::ptr cells)
+{
+  delta_t result{op_t::insert_front};
+  result.keys = std::move(keys);
+  result.cells = std::move(cells);
+  return result;
+}
+
+// Builds an insert_back delta that takes ownership of the serialized keys
+// and cells buffers.
+delta_t delta_t::insert_back(ceph::buffer::ptr keys,
+                             ceph::buffer::ptr cells)
+{
+  delta_t result{op_t::insert_back};
+  result.keys = std::move(keys);
+  result.cells = std::move(cells);
+  return result;
+}
+
+// Builds a remove_from delta; @c slot is stored in the delta's n field.
+delta_t delta_t::remove_from(unsigned slot)
+{
+  delta_t result{op_t::remove_from};
+  result.n = slot;
+  return result;
+}
+
+// Serializes this delta into @c bl: the op code first, then the op-specific
+// payload. decode() below reads exactly the same fields in the same order.
+// NOTE(review): decode() always read the op code but encode() never wrote
+// it; confirm that no caller writes the op itself before calling encode().
+void delta_t::encode(ceph::bufferlist& bl)
+{
+  using ceph::encode;
+  encode(op, bl);
+  switch (op) {
+  case op_t::insert_onode:
+    [[fallthrough]];
+  case op_t::update_onode:
+    // the slot # is not encoded, because we can always figure it out
+    // when we have to replay the delta by looking the oid up in the
+    // node block
+    encode(oid, bl);
+    encode(*onode, bl);
+    break;
+  case op_t::insert_child:
+    // replay looks the slot up by oid as well, so only oid and the child
+    // address are needed; must not fall through into update_key
+    encode(oid, bl);
+    encode(addr, bl);
+    break;
+  case op_t::update_key:
+    encode(n, bl);
+    encode(oid, bl);
+    break;
+  case op_t::shift_left:
+    encode(n, bl);
+    break;
+  case op_t::trim_right:
+    encode(n, bl);
+    break;
+  case op_t::insert_front:
+    [[fallthrough]];
+  case op_t::insert_back:
+    encode(n, bl);
+    encode(keys, bl);
+    encode(cells, bl);
+    break;
+  case op_t::remove_from:
+    encode(n, bl);
+    break;
+  default:
+    assert(0 == "unknown onode op");
+  }
+}
+
+// Rebuilds this delta from the bytes at @c p, which is advanced past the
+// consumed data. Mirrors encode(): op code first, then the op-specific fields.
+void delta_t::decode(ceph::bufferlist::const_iterator& p) {
+  using ceph::decode;
+  decode(op, p);
+  switch (op) {
+  case op_t::insert_onode:
+    [[fallthrough]];
+  case op_t::update_onode:
+    decode(oid, p);
+    // a default-constructed delta_t has a null onode; give decode() an
+    // object to fill in instead of dereferencing null
+    if (!onode) {
+      onode = new crimson::os::seastore::Onode(std::string_view{});
+    }
+    decode(*onode, p);
+    break;
+  case op_t::insert_child:
+    decode(oid, p);
+    decode(addr, p);
+    break;
+  case op_t::update_key:
+    decode(n, p);
+    decode(oid, p);
+    break;
+  case op_t::shift_left:
+    decode(n, p);
+    break;
+  case op_t::trim_right:
+    decode(n, p);
+    break;
+  case op_t::insert_front:
+    [[fallthrough]];
+  case op_t::insert_back:
+    decode(n, p);
+    decode(keys, p);
+    decode(cells, p);
+    break;
+  case op_t::remove_from:
+    decode(n, p);
+    break;
+  default:
+    assert(0 == "unknown onode op");
+  }
+}
diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.h b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.h
new file mode 100644
index 000000000..3e7e7315e
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.h
@@ -0,0 +1,70 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cstdint>
+
+#include "common/hobject.h"
+#include "include/buffer_fwd.h"
+
+#include "crimson/os/seastore/onode.h"
+#include "crimson/os/seastore/seastore_types.h"
+
+using crimson::os::seastore::OnodeRef;
+
+// A single mutation of an onode tree node. The op selects the operation;
+// which of the remaining fields are meaningful depends on the op (see the
+// factory functions below for the fields each op sets).
+struct delta_t {
+ enum class op_t : uint8_t {
+ nop,
+ insert_onode,
+ update_onode,
+ insert_child,
+ update_key,
+ shift_left,
+ trim_right,
+ insert_front,
+ insert_back,
+ remove_from,
+ // finer grained op?
+ // - changing the embedded extent map of given oid
+ // - mutating the embedded xattrs of given oid
+ } op = op_t::nop;
+
+ // slot number or element count, depending on the op
+ unsigned n = 0;
+ // object id the op refers to
+ ghobject_t oid;
+ // child address, set by insert_child
+ crimson::os::seastore::laddr_t addr = 0;
+ // onode payload, set by insert_onode / update_onode
+ OnodeRef onode;
+ // serialized keys and cells, set by insert_front / insert_back
+ ceph::bufferptr keys;
+ ceph::bufferptr cells;
+
+ delta_t() = default;
+ delta_t(op_t op)
+ : op{op}
+ {}
+ // movable only; the moved-from delta is reset to nop
+ delta_t(delta_t&& delta);
+ delta_t& operator=(delta_t&& delta);
+
+ // factories, one per op
+ static delta_t nop();
+ static delta_t insert_onode(unsigned slot, const ghobject_t& oid, OnodeRef onode);
+ static delta_t update_onode(unsigned slot, const ghobject_t& oid, OnodeRef onode);
+ static delta_t insert_child(unsigned slot, const ghobject_t& oid, crimson::os::seastore::laddr_t addr);
+ static delta_t update_key(unsigned slot, const ghobject_t& oid);
+ static delta_t shift_left(unsigned n);
+ static delta_t trim_right(unsigned n);
+ static delta_t insert_front(ceph::buffer::ptr keys,
+ ceph::buffer::ptr cells);
+ static delta_t insert_back(ceph::buffer::ptr keys,
+ ceph::buffer::ptr cells);
+ static delta_t remove_from(unsigned slot);
+
+ // shortcuts: overloads that pick insert_onode or insert_child from the
+ // type of the item argument
+ static delta_t insert_item(unsigned slot, const ghobject_t& oid, OnodeRef onode) {
+ return insert_onode(slot, oid, onode);
+ }
+ static delta_t insert_item(unsigned slot, const ghobject_t& oid, crimson::os::seastore::laddr_t addr) {
+ return insert_child(slot, oid, addr);
+ }
+
+ // serialization to / from a bufferlist
+ void encode(ceph::bufferlist& bl);
+ void decode(ceph::bufferlist::const_iterator& p);
+};
diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.cc b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.cc
new file mode 100644
index 000000000..fdcaa2fcb
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.cc
@@ -0,0 +1,567 @@
+#include "onode_node.h"
+
+// Returns references to the two halves of the key stored at @c slot: the
+// fixed prefix kept in the keys array and the suffix.
+// When the item lives inside the key there is no stored suffix, so a
+// shared placeholder object is returned for the second half.
+template<size_t BlockSize, int N, ntype_t NodeType>
+auto node_t<BlockSize, N, NodeType>::key_at(unsigned slot) const
+  -> std::pair<const key_prefix_t&, const key_suffix_t&>
+{
+  auto& key = keys[slot];
+  if constexpr (item_in_key) {
+    // the pair holds references: binding the second one to a temporary
+    // would leave it dangling as soon as this function returns, so hand
+    // out an object with static storage duration instead
+    static const key_suffix_t no_suffix{};
+    return {key, no_suffix};
+  } else {
+    // the suffix is the first thing stored in the cell of this slot
+    auto p = from_end(key.offset);
+    return {key, *reinterpret_cast<const key_suffix_t*>(p)};
+  }
+}
+
+// Returns a copy of @c oid in which the fields held by the key at @c slot
+// have been overwritten: first those of the prefix, then those of the suffix.
+template<size_t BlockSize, int N, ntype_t NodeType>
+ghobject_t
+node_t<BlockSize, N, NodeType>::get_oid_at(unsigned slot,
+                                           const ghobject_t& oid) const
+{
+  ghobject_t result = oid;
+  auto [fixed_part, variable_part] = key_at(slot);
+  fixed_part.update_oid(result);
+  variable_part.update_oid(result);
+  return result;
+}
+
+// Returns the item that belongs to @c key: either the child address stored
+// in the key itself, or the item located right after the key suffix in the
+// cell the key points at.
+template<size_t BlockSize, int N, ntype_t NodeType>
+auto node_t<BlockSize, N, NodeType>::item_at(const key_prefix_t& key) const
+  -> const_item_t
+{
+  if constexpr (!item_in_key) {
+    assert(key.offset < BlockSize);
+    const char* cell = from_end(key.offset);
+    const auto* suffix = reinterpret_cast<const key_suffix_t*>(cell);
+    // the item follows the suffix inside the cell
+    return *reinterpret_cast<const item_t*>(cell + size_of(*suffix));
+  } else {
+    return key.child_addr;
+  }
+}
+
+// Writes a textual description of every slot (index, key prefix, key suffix
+// and item or child address) to @c os.
+template<size_t BlockSize, int N, ntype_t NodeType>
+void node_t<BlockSize, N, NodeType>::dump(std::ostream& os) const
+{
+  for (uint16_t slot = 0; slot < count; ++slot) {
+    const auto& [prefix, suffix] = key_at(slot);
+    os << " [" << slot << '/' << count - 1 << "]\n"
+       << " key1 = (" << prefix << ")\n"
+       << " key2 = (" << suffix << ")\n";
+    const auto& item = item_at(prefix);
+    if (!_is_leaf()) {
+      // child addresses are printed in hex
+      os << " child = " << std::hex << item << std::dec << "\n";
+    } else {
+      os << " item = " << item << "\n";
+    }
+  }
+}
+
+// Returns the address located @c offset bytes before the end of the block
+// that starts at this node.
+template<size_t BlockSize, int N, ntype_t NodeType>
+char* node_t<BlockSize, N, NodeType>::from_end(uint16_t offset)
+{
+  char* const block_end = reinterpret_cast<char*>(this) + BlockSize;
+  const int distance = static_cast<int>(offset);
+  return block_end - distance;
+}
+
+// Const overload: returns the address located @c offset bytes before the end
+// of the block that starts at this node.
+template<size_t BlockSize, int N, ntype_t NodeType>
+const char* node_t<BlockSize, N, NodeType>::from_end(uint16_t offset) const
+{
+  const char* const block_end = reinterpret_cast<const char*>(this) + BlockSize;
+  const int distance = static_cast<int>(offset);
+  return block_end - distance;
+}
+
+// Returns the number of bytes taken by the stored entries: the key prefixes
+// plus, when items are not embedded in the keys, the cell area, whose size is
+// the offset of the last key.
+template<size_t BlockSize, int N, ntype_t NodeType>
+uint16_t node_t<BlockSize, N, NodeType>::used_space() const
+{
+  if constexpr (item_in_key) {
+    return count * sizeof(key_prefix_t);
+  } else {
+    if (!count) {
+      return 0;
+    }
+    const auto cell_bytes = keys[count - 1].offset;
+    return cell_bytes + count * sizeof(key_prefix_t);
+  }
+}
+
+// Returns the number of bytes available for keys and cells: BlockSize minus
+// the distance between the start of the node and its keys member.
+// The distance is computed on a null node pointer (an offsetof-style trick);
+// no node is actually read.
+template<size_t BlockSize, int N, ntype_t NodeType>
+uint16_t node_t<BlockSize, N, NodeType>::capacity()
+{
+ auto p = reinterpret_cast<node_t*>(0);
+ return BlockSize - (reinterpret_cast<char*>(p->keys) -
+ reinterpret_cast<char*>(p));
+}
+
+// TODO: if it's allowed to update 2 siblings at the same time, we can have
+// B* tree
+// Returns half of capacity(); size_state() reports underflow below this.
+template<size_t BlockSize, int N, ntype_t NodeType>
+constexpr uint16_t node_t<BlockSize, N, NodeType>::min_size()
+{
+ return capacity() / 2;
+}
+
+// For a node of @c size bytes that is below min_size(), returns the pair
+// {bytes missing to reach min_size(), bytes missing to reach capacity()}.
+template<size_t BlockSize, int N, ntype_t NodeType>
+constexpr std::pair<int16_t, int16_t>
+node_t<BlockSize, N, NodeType>::bytes_to_add(uint16_t size)
+{
+  assert(size < min_size());
+  const auto at_least = min_size() - size;
+  const auto at_most = capacity() - size;
+  return {at_least, at_most};
+}
+
+// For a node of @c size bytes that exceeds capacity(), returns the pair
+// {bytes above capacity(), bytes above min_size()}.
+template<size_t BlockSize, int N, ntype_t NodeType>
+constexpr std::pair<int16_t, int16_t>
+node_t<BlockSize, N, NodeType>::bytes_to_remove(uint16_t size)
+{
+  assert(size > capacity());
+  const auto at_least = size - capacity();
+  const auto at_most = size - min_size();
+  return {at_least, at_most};
+}
+
+// Classifies @c size: overflow above capacity(), underflow below half of
+// capacity(), okay otherwise.
+template<size_t BlockSize, int N, ntype_t NodeType>
+size_state_t node_t<BlockSize, N, NodeType>::size_state(uint16_t size) const
+{
+  if (size > capacity()) {
+    return size_state_t::overflow;
+  }
+  if (size < capacity() / 2) {
+    return size_state_t::underflow;
+  }
+  return size_state_t::okay;
+}
+
+// Tells whether @c size is an underflow. The size must not be an overflow;
+// that case trips the assert.
+template<size_t BlockSize, int N, ntype_t NodeType>
+bool node_t<BlockSize, N, NodeType>::is_underflow(uint16_t size) const
+{
+  const auto state = size_state(size);
+  if (state == size_state_t::underflow) {
+    return true;
+  }
+  if (state == size_state_t::okay) {
+    return false;
+  }
+  assert(0);
+  return false;
+}
+
+// Returns capacity() adjusted by the change in suffix size that replacing
+// the key at @c slot with one built from @c oid would cause; capacity()
+// unchanged when items are embedded in the keys.
+// NOTE(review): the base value is capacity(), not used_space() - confirm
+// this is what callers expect.
+template<size_t BlockSize, int N, ntype_t NodeType>
+int16_t node_t<BlockSize, N, NodeType>::size_with_key(unsigned slot,
+ const ghobject_t& oid) const
+{
+ if constexpr (item_in_key) {
+ return capacity();
+ } else {
+ // the size of fixed key does not change
+ [[maybe_unused]] const auto& [prefix, suffix] = key_at(slot);
+ return capacity() + key_suffix_t::size_from(oid) - suffix.size();
+ }
+}
+
+// Compares the key stored at @c slot with @c oid: the prefix decides first,
+// the suffix is only consulted when the prefixes are equivalent.
+template<size_t BlockSize, int N, ntype_t NodeType>
+ordering_t node_t<BlockSize, N, NodeType>::compare_with_slot(unsigned slot,
+                                                             const ghobject_t& oid) const
+{
+  const auto& [prefix, suffix] = key_at(slot);
+  const auto by_prefix = prefix.compare(oid);
+  if (by_prefix == ordering_t::equivalent) {
+    return suffix.compare(oid);
+  }
+  return by_prefix;
+}
+
+/// binary search for @c oid among the stored keys
+/// @return {slot, true} when the key at slot is equivalent to @c oid,
+///         otherwise {first slot whose key is greater than @c oid, false}
+template<size_t BlockSize, int N, ntype_t NodeType>
+std::pair<unsigned, bool> node_t<BlockSize, N, NodeType>::lower_bound(const ghobject_t& oid) const
+{
+  unsigned lo = 0;
+  unsigned hi = count;
+  while (lo != hi) {
+    const unsigned mid = (lo + hi) / 2;
+    const auto order = compare_with_slot(mid, oid);
+    if (order == ordering_t::equivalent) {
+      assert(mid == 0 || mid < count);
+      return {mid, true};
+    } else if (order == ordering_t::less) {
+      // the key at mid is smaller: continue in the upper half
+      lo = mid + 1;
+    } else {
+      hi = mid;
+    }
+  }
+  return {lo, false};
+}
+
+// Returns the bytes an entry for @c oid / @c item takes: the key prefix,
+// plus the key suffix and the item when items are not embedded in the keys.
+template<size_t BlockSize, int N, ntype_t NodeType>
+uint16_t node_t<BlockSize, N, NodeType>::size_of_item(const ghobject_t& oid,
+                                                      const item_t& item)
+{
+  if constexpr (!item_in_key) {
+    const auto cell_bytes = key_suffix_t::size_from(oid) + size_of(item);
+    return sizeof(key_prefix_t) + cell_bytes;
+  } else {
+    return sizeof(key_prefix_t);
+  }
+}
+
+// Tells whether the free space is too small for an entry made of @c oid and
+// @c item.
+template<size_t BlockSize, int N, ntype_t NodeType>
+bool node_t<BlockSize, N, NodeType>::is_overflow(const ghobject_t& oid,
+                                                 const item_t& item) const
+{
+  const auto available = free_space();
+  return available < size_of_item(oid, item);
+}
+
+// Tells whether the free space is too small for an entry made of a key
+// prefix, the suffix built from @c oid and the onode @c item.
+template<size_t BlockSize, int N, ntype_t NodeType>
+bool node_t<BlockSize, N, NodeType>::is_overflow(const ghobject_t& oid,
+                                                 const OnodeRef& item) const
+{
+  const auto needed =
+    sizeof(key_prefix_t) + key_suffix_t::size_from(oid) + item->size();
+  return free_space() < needed;
+}
+
+// inserts an item into the given slot, pushing all subsequent keys forward
+// @note if the item is not embedded in key, shift the right half as well
+//
+// @param slot index the new entry will occupy, in [0, count]
+// @param oid  object id the key prefix / suffix are built from
+// @param item payload stored in the key (item_in_key) or in the new cell
+template<size_t BlockSize, int N, ntype_t NodeType>
+void node_t<BlockSize, N, NodeType>::insert_at(unsigned slot,
+                                               const ghobject_t& oid,
+                                               const item_t& item)
+{
+  assert(!is_overflow(oid, item));
+  assert(slot <= count);
+  if constexpr (item_in_key) {
+    // shift the keys right
+    key_prefix_t* key = keys + slot;
+    key_prefix_t* last_key = keys + count;
+    std::copy_backward(key, last_key, last_key + 1);
+    key->set(oid, item);
+  } else {
+    const uint16_t size = key_suffix_t::size_from(oid) + size_of(item);
+    // offset of the cell preceding the insertion point; there is none when
+    // inserting at slot 0, and keys[slot - 1] must not be touched then
+    const uint16_t first = slot > 0 ? keys[slot - 1].offset : 0;
+    const uint16_t offset = first + size;
+    if (slot < count) {
+      //                                 V
+      // |         |... //    ...|//////||    |
+      // |         |... // ...|//////|   |    |
+      // shift the partial keys and items left
+      const uint16_t last = keys[count - 1].offset;
+      std::memmove(from_end(last + size), from_end(last), last - first);
+      // shift the keys right and update the pointers
+      for (key_prefix_t* dst = keys + count; dst > keys + slot; dst--) {
+        key_prefix_t* src = dst - 1;
+        *dst = *src;
+        dst->offset += size;
+      }
+    }
+    keys[slot].set(oid, offset);
+    // the new cell holds the key suffix followed by the item
+    auto p = from_end(offset);
+    auto partial_key = reinterpret_cast<key_suffix_t*>(p);
+    partial_key->set(oid);
+    p += size_of(*partial_key);
+    auto item_ptr = reinterpret_cast<item_t*>(p);
+    *item_ptr = item;
+  }
+  count++;
+  assert(used_space() <= capacity());
+}
+
+// used by InnerNode for updating the keys indexing its children when their
+// lower boundaries are updated
+//
+// Replaces the key at @c slot with one built from @c oid; the item is left
+// untouched. When the new suffix has a different size, the cells of the
+// following slots are moved so that the item still sits right after the
+// suffix of this slot.
+template<size_t BlockSize, int N, ntype_t NodeType>
+void node_t<BlockSize, N, NodeType>::update_key_at(unsigned slot, const ghobject_t& oid)
+{
+  if constexpr (is_leaf()) {
+    assert(0);
+  } else if constexpr (item_in_key) {
+    keys[slot].update(oid);
+  } else {
+    const auto& [prefix, suffix] = key_at(slot);
+    int16_t delta = key_suffix_t::size_from(oid) - suffix.size();
+    // the item of this slot stays where it is; the start of its cell moves
+    // by delta instead. A shrinking suffix (delta < 0) has to be handled as
+    // well, otherwise the item would no longer be found behind the suffix.
+    if (delta != 0) {
+      // shift the cells sitting at its left side
+      auto first = keys[slot].offset;
+      auto last = keys[count - 1].offset;
+      std::memmove(from_end(last + delta), from_end(last), last - first);
+      // update the pointers
+      for (key_prefix_t* key = keys + slot; key < keys + count; key++) {
+        key->offset += delta;
+      }
+    }
+    keys[slot].update(oid);
+    auto p = from_end(keys[slot].offset);
+    auto partial_key = reinterpret_cast<key_suffix_t*>(p);
+    partial_key->set(oid);
+    // we don't update item here
+  }
+}
+
+// Works out how many leading entries can be given away.
+// Entries are taken from slot 0 onwards for as long as the accumulated size
+// stays within @c max_grab.
+// @return {number of entries, their size in bytes} when at least @c min_grab
+//         bytes were collected and either every entry was taken or the
+//         remaining size is not an underflow; {0, 0} otherwise
+template<size_t BlockSize, int N, ntype_t NodeType>
+std::pair<unsigned, uint16_t>
+node_t<BlockSize, N, NodeType>::calc_grab_front(uint16_t min_grab,
+ uint16_t max_grab) const
+{
+ // TODO: split by likeness
+ uint16_t grabbed = 0;
+ uint16_t used = used_space();
+ int n = 0;
+ for (; n < count; n++) {
+ const auto& [prefix, suffix] = key_at(n);
+ // size of this entry: prefix + suffix (+ item when it is not in the key)
+ uint16_t to_grab = sizeof(prefix) + size_of(suffix);
+ if constexpr (!item_in_key) {
+ const auto& item = item_at(prefix);
+ to_grab += size_of(item);
+ }
+ if (grabbed + to_grab > max_grab) {
+ break;
+ }
+ grabbed += to_grab;
+ }
+ if (grabbed >= min_grab) {
+ if (n == count) {
+ return {n, grabbed};
+ } else if (!is_underflow(used - grabbed)) {
+ return {n, grabbed};
+ }
+ }
+ return {0, 0};
+}
+
+// Works out how many trailing entries can be given away.
+// Entries are taken from the last slot backwards; the search gives up with
+// {0, 0} as soon as the remaining size would underflow or more than
+// @c max_grab bytes were collected, and succeeds once at least @c min_grab
+// bytes were collected.
+// NOTE(review): on success the first element is i + 1, not count - i (the
+// number of entries taken) - confirm what callers expect.
+template<size_t BlockSize, int N, ntype_t NodeType>
+std::pair<unsigned, uint16_t>
+node_t<BlockSize, N, NodeType>::calc_grab_back(uint16_t min_grab,
+ uint16_t max_grab) const
+{
+ // TODO: split by likeness
+ uint16_t grabbed = 0;
+ uint16_t used = used_space();
+ for (int i = count - 1; i >= 0; i--) {
+ const auto& [prefix, suffix] = key_at(i);
+ // size of this entry: prefix + suffix (+ item when it is not in the key)
+ uint16_t to_grab = sizeof(prefix) + size_of(suffix);
+ if constexpr (!item_in_key) {
+ const auto& item = item_at(prefix);
+ to_grab += size_of(item);
+ }
+ grabbed += to_grab;
+ if (is_underflow(used - grabbed)) {
+ return {0, 0};
+ } else if (grabbed > max_grab) {
+ return {0, 0};
+ } else if (grabbed >= min_grab) {
+ return {i + 1, grabbed};
+ }
+ }
+ return {0, 0};
+}
+
+// Takes the last @c n entries (@c bytes bytes of cells) of @c left: makes
+// room at the front of this node with shift_right(), then asks @c mover to
+// move the entries starting at left.count - n into slot 0.
+template<size_t BlockSize, int N, ntype_t NodeType>
+template<int LeftN, class Mover>
+void node_t<BlockSize, N, NodeType>::grab_from_left(node_t<BlockSize, LeftN, NodeType>& left,
+ unsigned n, uint16_t bytes,
+ Mover& mover)
+{
+ // TODO: rebuild keys if moving across different layouts
+ // group by likeness
+ shift_right(n, bytes);
+ mover.move_from(left.count - n, 0, n);
+}
+
+// Appends every entry of @c right to this node through @c mover and returns
+// the delta produced by the mover. @c whoami is not used here.
+template<size_t BlockSize, int N, ntype_t NodeType>
+template<int RightN, class Mover>
+delta_t node_t<BlockSize, N, NodeType>::acquire_right(node_t<BlockSize, RightN, NodeType>& right,
+ unsigned whoami, Mover& mover)
+{
+ mover.move_from(0, count, right.count);
+ return mover.to_delta();
+}
+
+// Takes the first @c n entries of @c right: @c mover appends them to this
+// node, then @c right drops them with shift_left(). @c bytes is not used here.
+template<size_t BlockSize, int N, ntype_t NodeType>
+template<int RightN, class Mover>
+void node_t<BlockSize, N, NodeType>::grab_from_right(node_t<BlockSize, RightN, NodeType>& right,
+ unsigned n, uint16_t bytes,
+ Mover& mover)
+{
+ mover.move_from(0, count, n);
+ right.shift_left(n, 0);
+}
+
+// Gives entries to @c left; implemented as left.grab_from_right(*this, ...).
+template<size_t BlockSize, int N, ntype_t NodeType>
+template<int LeftN, class Mover>
+void node_t<BlockSize, N, NodeType>::push_to_left(node_t<BlockSize, LeftN, NodeType>& left,
+ unsigned n, uint16_t bytes,
+ Mover& mover)
+{
+ left.grab_from_right(*this, n, bytes, mover);
+}
+
+// Gives entries to @c right; implemented as right.grab_from_left(*this, ...).
+template<size_t BlockSize, int N, ntype_t NodeType>
+template<int RightN, class Mover>
+void node_t<BlockSize, N, NodeType>::push_to_right(node_t<BlockSize, RightN, NodeType>& right,
+ unsigned n, uint16_t bytes,
+ Mover& mover)
+{
+ right.grab_from_left(*this, n, bytes, mover);
+}
+
+// [to, from) are removed, so we need to shift left
+// actually there are only two use cases:
+// - to = 0: for giving elements in bulk
+// - to = from - 1: for removing a single element
+// old: |////|.....| |.....|/|........|
+// new: |.....| |.....||........|
+//
+// @param from first slot that is kept; must be < count
+// @param to   first slot that is removed; must be < from
+template<size_t BlockSize, int N, ntype_t NodeType>
+void node_t<BlockSize, N, NodeType>::shift_left(unsigned from, unsigned to)
+{
+  assert(from < count);
+  assert(to < from);
+  if constexpr (item_in_key) {
+    std::copy(keys + from, keys + count, keys + to);
+  } else {
+    const uint16_t cell_hi = keys[count - 1].offset;
+    const uint16_t cell_lo = keys[from - 1].offset;
+    // bytes taken by the removed cells [to, from): they span from the offset
+    // of the cell preceding `to` (0 when to is the first slot) up to the
+    // offset of cell from - 1. Cells have variable sizes, so this cannot be
+    // derived from the sizes of the surviving cells.
+    const uint16_t kept_lo = to > 0 ? keys[to - 1].offset : 0;
+    const uint16_t offset_delta = cell_lo - kept_lo;
+    for (auto src_key = keys + from, dst_key = keys + to;
+         src_key != keys + count;
+         ++src_key, ++dst_key) {
+      // shift the keys left
+      *dst_key = *src_key;
+      // update the pointers
+      dst_key->offset -= offset_delta;
+    }
+    // and cells: the surviving cells move towards the end of the block
+    auto dst = from_end(cell_hi);
+    std::memmove(dst + offset_delta, dst, cell_hi - cell_lo);
+  }
+  count -= (from - to);
+}
+
+// Inserts serialized entries in front of the existing ones.
+// @param keys_buf  array of key_prefix_t; its length determines the number
+//                  of entries
+// @param cells_buf cells of these entries; must be empty when items are
+//                  embedded in the keys
+// NOTE(review): the offsets inside keys_buf are copied verbatim, so they are
+// presumably already valid for this node - confirm against the producer.
+template<size_t BlockSize, int N, ntype_t NodeType>
+void node_t<BlockSize, N, NodeType>::insert_front(const ceph::bufferptr& keys_buf,
+ const ceph::bufferptr& cells_buf)
+{
+ unsigned n = keys_buf.length() / sizeof(key_prefix_t);
+ // make room for n keys and the cell bytes
+ shift_right(n, cells_buf.length());
+ keys_buf.copy_out(0, keys_buf.length(), reinterpret_cast<char*>(keys));
+ if constexpr (item_in_key) {
+ assert(cells_buf.length() == 0);
+ } else {
+ // the new cells start at the offset of the last inserted key
+ cells_buf.copy_out(0, cells_buf.length(), from_end(keys[n - 1].offset));
+ }
+}
+
+// Appends serialized entries after the existing ones.
+// @param keys_buf  array of key_prefix_t; its length determines the number
+//                  of entries
+// @param cells_buf cells of these entries; must be empty when items are
+//                  embedded in the keys
+// NOTE(review): the offsets inside keys_buf are copied verbatim, so they are
+// presumably already valid for this node - confirm against the producer.
+template<size_t BlockSize, int N, ntype_t NodeType>
+void node_t<BlockSize, N, NodeType>::insert_back(const ceph::bufferptr& keys_buf,
+ const ceph::bufferptr& cells_buf)
+{
+ keys_buf.copy_out(0, keys_buf.length(), reinterpret_cast<char*>(keys + count));
+ count += keys_buf.length() / sizeof(key_prefix_t);
+ if constexpr (item_in_key) {
+ assert(cells_buf.length() == 0);
+ } else {
+ // the new cells start at the offset of the last appended key
+ cells_buf.copy_out(0, cells_buf.length(), from_end(keys[count - 1].offset));
+ }
+}
+
+// one or more elements are inserted, so we need to shift the elements right
+// actually there are only two use cases:
+// - bytes != 0: for inserting bytes before from
+// - bytes = 0: for inserting a single element before from
+// old: ||.....|
+// new: |/////|.....|
+//
+// Opens a gap of @c n keys at the front of the keys array and of @c bytes
+// bytes in the cell area; count grows by @c n. The gap itself is left for the
+// caller to fill.
+template<size_t BlockSize, int N, ntype_t NodeType>
+void node_t<BlockSize, N, NodeType>::shift_right(unsigned n, unsigned bytes)
+{
+ assert(bytes + used_space() < capacity());
+ // shift the keys right
+ std::copy_backward(keys, keys + count, keys + count + n);
+ count += n;
+ if constexpr (!item_in_key) {
+ // total size of the existing cells: offset of the (moved) last key
+ uint16_t cells = keys[count - 1].offset;
+ // copy the partial keys and items
+ std::memmove(from_end(cells + bytes), from_end(cells), cells);
+ // update the pointers
+ for (auto key = keys + n; key < keys + count; ++key) {
+ key->offset += bytes;
+ }
+ }
+}
+
+// Removes the entry at @c slot. Unless it is the last one, the entries
+// behind it are moved down with shift_left().
+// @note if the item is not embedded in key, all items sitting at the left
+// side of it will be shifted right
+template<size_t BlockSize, int N, ntype_t NodeType>
+void node_t<BlockSize, N, NodeType>::remove_from(unsigned slot)
+{
+  assert(slot < count);
+  const unsigned successor = slot + 1;
+  if (successor >= count) {
+    // slot is the last one
+    count--;
+    return;
+  }
+  shift_left(successor, slot);
+}
+
+// Keeps only the first @c n entries by setting count to @c n; nothing else
+// in the block is modified.
+template<size_t BlockSize, int N, ntype_t NodeType>
+void node_t<BlockSize, N, NodeType>::trim_right(unsigned n)
+{
+ count = n;
+}
+
+// Applies @c delta to this node by dispatching on its op.
+// @throws std::invalid_argument when the op does not fit the node type
+//         (insert_onode on an inner node; insert_child / update_key on a leaf)
+template<size_t BlockSize, int N, ntype_t NodeType>
+void node_t<BlockSize, N, NodeType>::play_delta(const delta_t& delta)
+{
+  switch (delta.op) {
+  case delta_t::op_t::insert_onode:
+    if constexpr (is_leaf()) {
+      // the slot is looked up by oid instead of being taken from the delta
+      auto [slot, found] = lower_bound(delta.oid);
+      assert(!found);
+      assert(delta.onode->size() <= std::numeric_limits<unsigned>::max());
+      // encode the onode into a scratch buffer and insert it from there
+      ceph::bufferptr buf{static_cast<unsigned>(delta.onode->size())};
+      delta.onode->encode(buf.c_str(), buf.length());
+      auto onode = reinterpret_cast<const onode_t*>(buf.c_str());
+      return insert_at(slot, delta.oid, *onode);
+    } else {
+      throw std::invalid_argument("wrong node type");
+    }
+  case delta_t::op_t::update_onode:
+    // TODO
+    assert(0 == "not implemented");
+    break;
+  case delta_t::op_t::insert_child:
+    if constexpr (is_leaf()) {
+      throw std::invalid_argument("wrong node type");
+    } else {
+      auto [slot, found] = lower_bound(delta.oid);
+      assert(!found);
+      // return here: falling through would also run update_key on the
+      // freshly inserted child
+      return insert_at(slot, delta.oid, delta.addr);
+    }
+  case delta_t::op_t::update_key:
+    if constexpr (is_leaf()) {
+      throw std::invalid_argument("wrong node type");
+    } else {
+      return update_key_at(delta.n, delta.oid);
+    }
+  case delta_t::op_t::shift_left:
+    return shift_left(delta.n, 0);
+  case delta_t::op_t::trim_right:
+    return trim_right(delta.n);
+  case delta_t::op_t::insert_front:
+    return insert_front(delta.keys, delta.cells);
+  case delta_t::op_t::insert_back:
+    return insert_back(delta.keys, delta.cells);
+  case delta_t::op_t::remove_from:
+    return remove_from(delta.n);
+  default:
+    assert(0 == "unknown onode delta");
+  }
+}
+
+// explicitly instantiate the node_t classes used by test_node.cc
+template class node_t<512, 0, ntype_t::inner>;
+template class node_t<512, 0, ntype_t::leaf>;
+template class node_t<512, 1, ntype_t::inner>;
+template class node_t<512, 1, ntype_t::leaf>;
+template class node_t<512, 2, ntype_t::inner>;
+template class node_t<512, 2, ntype_t::leaf>;
+template class node_t<512, 3, ntype_t::inner>;
+template class node_t<512, 3, ntype_t::leaf>;
diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.h b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.h
new file mode 100644
index 000000000..d833a6682
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.h
@@ -0,0 +1,942 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <algorithm>
+#include <cstdint>
+#include <type_traits>
+#include <variant>
+
+#include "common/hobject.h"
+#include "crimson/common/layout.h"
+#include "crimson/os/seastore/onode.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "onode_delta.h"
+
+namespace asci = absl::container_internal;
+
+// Element-wise equality for boost::beast::span: true when both spans have
+// the same length and equal elements.
+namespace boost::beast {
+ template<class T>
+ bool operator==(const span<T>& lhs, const span<T>& rhs) {
+ return std::equal(
+ lhs.begin(), lhs.end(),
+ rhs.begin(), rhs.end());
+ }
+}
+
+// on-disk onode
+// it only keeps the bits necessary to rebuild an in-memory onode
+// The payload follows the fixed header as a flexible array of len bytes.
+struct [[gnu::packed]] onode_t {
+ // Copies len and the payload bytes of @c onode; the destination must have
+ // room for them behind its header. struct_v, struct_compat and struct_len
+ // are not copied.
+ onode_t& operator=(const onode_t& onode) {
+ len = onode.len;
+ std::memcpy(data, onode.data, len);
+ return *this;
+ }
+ // header plus payload, in bytes
+ size_t size() const {
+ return sizeof(*this) + len;
+ }
+ // builds an in-memory Onode from the payload bytes
+ OnodeRef decode() const {
+ return new crimson::os::seastore::Onode(std::string_view{data, len});
+ }
+ uint8_t struct_v = 1;
+ uint8_t struct_compat = 1;
+ // TODO:
+ // - use uint16_t for length, as the size of an onode should be less
+ // than a block (16K for now)
+ // - drop struct_len
+ uint32_t struct_len = 0;
+ uint32_t len;
+ char data[];
+};
+
+// Prints the in-memory Onode decoded from @c onode.
+static inline std::ostream& operator<<(std::ostream& os, const onode_t& onode) {
+ return os << *onode.decode();
+}
+
+using crimson::os::seastore::laddr_t;
+
+// Packed wrapper around the laddr_t of a child node; converts implicitly
+// from and to laddr_t.
+struct [[gnu::packed]] child_addr_t {
+ laddr_t data;
+ child_addr_t(laddr_t data)
+ : data{data}
+ {}
+ child_addr_t& operator=(laddr_t addr) {
+ data = addr;
+ return *this;
+ }
+ laddr_t get() const {
+ return data;
+ }
+ operator laddr_t() const {
+ return data;
+ }
+ // size in bytes of the stored address
+ size_t size() const {
+ return sizeof(laddr_t);
+ }
+};
+
+// poor man's operator<=>
+// three-way comparison result returned by the compare helpers below
+enum class ordering_t {
+ less,
+ equivalent,
+ greater,
+};
+
+// Three-way comparison of two values. Arithmetic operands are compared with
+// < and >; anything else must offer x.compare(y) returning a negative, zero
+// or positive number.
+template<class L, class R>
+ordering_t compare_element(const L& x, const R& y)
+{
+  if constexpr (!std::is_arithmetic_v<L>) {
+    // string_view::compare(), string::compare(), ...
+    const auto sign = x.compare(y);
+    if (sign < 0) {
+      return ordering_t::less;
+    }
+    if (sign > 0) {
+      return ordering_t::greater;
+    }
+    return ordering_t::equivalent;
+  } else {
+    static_assert(std::is_arithmetic_v<R>);
+    if (x < y) {
+      return ordering_t::less;
+    }
+    if (x > y) {
+      return ordering_t::greater;
+    }
+    return ordering_t::equivalent;
+  }
+}
+
+// End of the recursion: no element left to compare, so the tuples are
+// equivalent.
+template<typename L, typename R>
+constexpr ordering_t tuple_cmp(const L&, const R&, std::index_sequence<>)
+{
+ return ordering_t::equivalent;
+}
+
+// Lexicographic comparison of two tuples over the given indices: the element
+// at Head decides unless it is equivalent, in which case the remaining
+// indices are compared.
+template<typename L, typename R,
+         size_t Head, size_t... Tail>
+constexpr ordering_t tuple_cmp(const L& x, const R& y,
+                               std::index_sequence<Head, Tail...>)
+{
+  const auto head = compare_element(std::get<Head>(x), std::get<Head>(y));
+  if (head == ordering_t::equivalent) {
+    return tuple_cmp(x, y, std::index_sequence<Tail...>());
+  }
+  return head;
+}
+
+// Lexicographic three-way comparison of two tuples with the same number of
+// elements.
+template<typename... Ls, typename... Rs>
+constexpr ordering_t cmp(const std::tuple<Ls...>& x,
+ const std::tuple<Rs...>& y)
+{
+ static_assert(sizeof...(Ls) == sizeof...(Rs));
+ return tuple_cmp(x, y, std::index_sequence_for<Ls...>());
+}
+
+// Result of the likes() checks on key prefixes.
+enum class likes_t {
+ yes,
+ no,
+ maybe,
+};
+
+// Variable-sized part of a key: snap, generation, and the namespace and name
+// strings, which are stored back to back in the flexible array that follows
+// the fixed header.
+struct [[gnu::packed]] variable_key_suffix {
+  uint64_t snap;
+  uint64_t gen;
+  uint8_t nspace_len;
+  uint8_t name_len;
+  char data[];
+  struct index_t {
+    enum {
+      nspace_data = 0,
+      name_data = 1,
+    };
+  };
+  using layout_type = asci::Layout<char, char>;
+  // layout of data[] according to the lengths currently stored in the header
+  layout_type cell_layout() const {
+    return layout_type{nspace_len, name_len};
+  }
+  // Fills the header and the strings from @c oid. The lengths are stored in
+  // uint8_t fields, and the memory behind the header must be large enough.
+  void set(const ghobject_t& oid) {
+    snap = oid.hobj.snap;
+    gen = oid.generation;
+    nspace_len = oid.hobj.nspace.size();
+    name_len = oid.hobj.oid.name.size();
+    auto layout = cell_layout();
+    std::memcpy(layout.Pointer<index_t::nspace_data>(data),
+                oid.hobj.nspace.data(), oid.hobj.nspace.size());
+    std::memcpy(layout.Pointer<index_t::name_data>(data),
+                oid.hobj.oid.name.data(), oid.hobj.oid.name.size());
+  }
+
+  // Writes the fields held by this suffix into @c oid.
+  void update_oid(ghobject_t& oid) const {
+    oid.hobj.snap = snap;
+    oid.generation = gen;
+    oid.hobj.nspace = nspace();
+    oid.hobj.oid.name = name();
+  }
+
+  // Copies header and strings of @c key. The memory behind this header must
+  // be large enough for the strings of @c key.
+  variable_key_suffix& operator=(const variable_key_suffix& key) {
+    if (this == &key) {
+      return *this;
+    }
+    snap = key.snap;
+    gen = key.gen;
+    // the lengths have to be taken over before the layout is computed,
+    // otherwise the name is placed according to the stale nspace_len and
+    // the copy reports the wrong sizes afterwards
+    nspace_len = key.nspace_len;
+    name_len = key.name_len;
+    auto layout = cell_layout();
+    auto nspace = key.nspace();
+    std::copy_n(nspace.data(), nspace.size(),
+                layout.Pointer<index_t::nspace_data>(data));
+    auto name = key.name();
+    std::copy_n(name.data(), name.size(),
+                layout.Pointer<index_t::name_data>(data));
+    return *this;
+  }
+  // view of the stored namespace
+  const std::string_view nspace() const {
+    auto layout = cell_layout();
+    auto nspace = layout.Slice<index_t::nspace_data>(data);
+    return {nspace.data(), nspace.size()};
+  }
+  // view of the stored name
+  const std::string_view name() const {
+    auto layout = cell_layout();
+    auto name = layout.Slice<index_t::name_data>(data);
+    return {name.data(), name.size()};
+  }
+  // header plus both strings, in bytes
+  size_t size() const {
+    return sizeof(*this) + nspace_len + name_len;
+  }
+  // size a suffix built from @c oid would have
+  static size_t size_from(const ghobject_t& oid) {
+    return (sizeof(variable_key_suffix) +
+            oid.hobj.nspace.size() +
+            oid.hobj.oid.name.size());
+  }
+  // three-way comparison with @c oid by (nspace, name, snap, generation)
+  ordering_t compare(const ghobject_t& oid) const {
+    return cmp(std::tie(nspace(), name(), snap, gen),
+               std::tie(oid.hobj.nspace, oid.hobj.oid.name, oid.hobj.snap.val,
+                        oid.generation));
+  }
+  // true when both suffixes have the same namespace and name
+  bool likes(const variable_key_suffix& key) const {
+    return nspace() == key.nspace() && name() == key.name();
+  }
+};
+
+// Prints "s<snap>," and "g<gen>," when they differ from CEPH_NOSNAP and
+// ghobject_t::NO_GEN respectively, followed by "<nspace>/<name>".
+static inline std::ostream& operator<<(std::ostream& os, const variable_key_suffix& k) {
+ if (k.snap != CEPH_NOSNAP) {
+ os << "s" << k.snap << ",";
+ }
+ if (k.gen != ghobject_t::NO_GEN) {
+ os << "g" << k.gen << ",";
+ }
+ return os << k.nspace() << "/" << k.name();
+}
+
+// should use [[no_unique_address]] in C++20
+// Key suffix that stores nothing: it compares equivalent to every oid, has
+// size 0 and leaves oids untouched.
+struct empty_key_suffix {
+ static constexpr ordering_t compare(const ghobject_t&) {
+ return ordering_t::equivalent;
+ }
+ static void set(const ghobject_t&) {}
+ static constexpr size_t size() {
+ return 0;
+ }
+ static size_t size_from(const ghobject_t&) {
+ return 0;
+ }
+ static void update_oid(ghobject_t&) {}
+};
+
+// Prints nothing; returns @c os unchanged.
+static inline std::ostream& operator<<(std::ostream& os, const empty_key_suffix&)
+{
+ return os;
+}
+
+// Kind of a tree node.
+enum class ntype_t : uint8_t {
+ leaf = 0u,
+ inner,
+};
+
+// Returns the other node type: inner for leaf, leaf for anything else.
+constexpr ntype_t flip_ntype(ntype_t ntype) noexcept
+{
+  return ntype == ntype_t::leaf ? ntype_t::inner : ntype_t::leaf;
+}
+
+// Fixed-size part of a key; the primary template is empty and the layouts
+// are provided by the specializations on N below.
+template<int N, ntype_t NodeType>
+struct FixedKeyPrefix {};
+
+// Key prefix for N = 0: keeps shard, pool and hash of the object, plus the
+// offset of the cell that belongs to the key.
+template<ntype_t NodeType>
+struct FixedKeyPrefix<0, NodeType>
+{
+ static constexpr bool item_in_key = false;
+ int8_t shard = -1;
+ int64_t pool = -1;
+ uint32_t hash = 0;
+ uint16_t offset = 0;
+
+ FixedKeyPrefix() = default;
+ FixedKeyPrefix(const ghobject_t& oid, uint16_t offset)
+ : shard{oid.shard_id},
+ pool{oid.hobj.pool},
+ hash{oid.hobj.get_hash()},
+ offset{offset}
+ {}
+
+ // sets every field: the key fields from @c oid, the offset from @c new_offset
+ void set(const ghobject_t& oid, uint16_t new_offset) {
+ shard = oid.shard_id;
+ pool = oid.hobj.pool;
+ hash = oid.hobj.get_hash();
+ offset = new_offset;
+ }
+
+ // same, taking the key fields from another prefix
+ void set(const FixedKeyPrefix& k, uint16_t new_offset) {
+ shard = k.shard;
+ pool = k.pool;
+ hash = k.hash;
+ offset = new_offset;
+ }
+
+ // replaces the key fields only; the offset is kept
+ void update(const ghobject_t& oid) {
+ shard = oid.shard_id;
+ pool = oid.hobj.pool;
+ hash = oid.hobj.get_hash();
+ }
+
+ // writes shard, pool and hash into @c oid
+ void update_oid(ghobject_t& oid) const {
+ oid.set_shard(shard_id_t{shard});
+ oid.hobj.pool = pool;
+ oid.hobj.set_hash(hash);
+ }
+
+ // three-way comparison with @c oid by (shard, pool, hash)
+ ordering_t compare(const ghobject_t& oid) const {
+ // so std::tie() can bind them by reference
+ int8_t rhs_shard = oid.shard_id;
+ uint32_t rhs_hash = oid.hobj.get_hash();
+ return cmp(std::tie(shard, pool, hash),
+ std::tie(rhs_shard, oid.hobj.pool, rhs_hash));
+ }
+ // @return likes_t::yes if this key and @c k share the same <shard, pool>,
+ // likes_t::no otherwise
+ likes_t likes(const FixedKeyPrefix& k) const {
+ if (shard == k.shard && pool == k.pool) {
+ return likes_t::yes;
+ } else {
+ return likes_t::no;
+ }
+ }
+};
+
+// Prints the shard (unless it is NO_SHARD), the pool, the hash in hex and
+// the cell offset.
+template<ntype_t NodeType>
+std::ostream& operator<<(std::ostream& os, const FixedKeyPrefix<0, NodeType>& k) {
+ if (k.shard != shard_id_t::NO_SHARD) {
+ os << "s" << k.shard;
+ }
+ return os << "p=" << k.pool << ","
+ << "h=" << std::hex << k.hash << std::dec << ","
+ << ">" << k.offset;
+}
+
+// Layout 1: all elements in this node share the same <shard, pool>, so a key
+// only stores the hash and an offset.
+template<ntype_t NodeType>
+struct FixedKeyPrefix<1, NodeType> {
+  static constexpr bool item_in_key = false;
+  uint32_t hash = 0;
+  uint16_t offset = 0;
+
+  FixedKeyPrefix() = default;
+  FixedKeyPrefix(uint32_t hash, uint16_t offset)
+    : hash{hash},
+      offset{offset}
+  {}
+  FixedKeyPrefix(const ghobject_t& oid, uint16_t offset)
+    : FixedKeyPrefix(oid.hobj.get_hash(), offset)
+  {}
+
+  // take the hash from @c oid, leaving the offset as it is
+  void update(const ghobject_t& oid) {
+    hash = oid.hobj.get_hash();
+  }
+  // take the hash from @c oid and store @c new_offset
+  void set(const ghobject_t& oid, uint16_t new_offset) {
+    update(oid);
+    offset = new_offset;
+  }
+  // copy the hash of a layout-0 or layout-1 key and store @c new_offset
+  template<int N>
+  void set(const FixedKeyPrefix<N, NodeType>& k, uint16_t new_offset) {
+    static_assert(N < 2, "only N0, N1 have hash");
+    hash = k.hash;
+    offset = new_offset;
+  }
+  // write the hash of this key back into @c oid
+  void update_oid(ghobject_t& oid) const {
+    oid.hobj.set_hash(hash);
+  }
+  // order this key against @c oid by hash
+  ordering_t compare(const ghobject_t& oid) const {
+    return compare_element(hash, oid.hobj.get_hash());
+  }
+  // two keys like each other when their hashes are equal
+  likes_t likes(const FixedKeyPrefix& k) const {
+    if (hash == k.hash) {
+      return likes_t::yes;
+    }
+    return likes_t::no;
+  }
+};
+
+// Print a layout-1 key as "0x<hash in hex>,><offset>".
+template<ntype_t NodeType>
+std::ostream& operator<<(std::ostream& os, const FixedKeyPrefix<1, NodeType>& k) {
+  os << "0x" << std::hex << k.hash << std::dec << ",";
+  os << ">" << k.offset;
+  return os;
+}
+
+// Layout 2: all elements in this node must share the same
+// <shard, pool, hash>, so a key only stores an offset.
+template<ntype_t NodeType>
+struct FixedKeyPrefix<2, NodeType> {
+  static constexpr bool item_in_key = false;
+  uint16_t offset = 0;
+
+  FixedKeyPrefix() = default;
+
+  // nothing of the oid is kept in this key, only the offset changes
+  void set(const ghobject_t&, uint16_t new_offset) {
+    offset = new_offset;
+  }
+  template<int N>
+  void set(const FixedKeyPrefix<N, NodeType>&, uint16_t new_offset) {
+    offset = new_offset;
+  }
+  void update(const ghobject_t&) {}
+  void update_oid(ghobject_t&) const {}
+
+  // the prefix cannot tell keys apart, the cell needs to be compared
+  static constexpr ordering_t compare(const ghobject_t&) {
+    return ordering_t::equivalent;
+  }
+  // always defer to my cell for likeness
+  constexpr likes_t likes(const FixedKeyPrefix&) const {
+    return likes_t::maybe;
+  }
+};
+
+// Print a layout-2 key as "><offset>".
+template<ntype_t NodeType>
+std::ostream& operator<<(std::ostream& os, const FixedKeyPrefix<2, NodeType>& k) {
+  os << ">" << k.offset;
+  return os;
+}
+
+// The <snap, gen> pair shared by both flavours of the layout-3 key.
+struct fixed_key_3 {
+  uint64_t snap = 0;
+  uint64_t gen = 0;
+
+  fixed_key_3() = default;
+  fixed_key_3(const ghobject_t& oid)
+    : snap{oid.hobj.snap},
+      gen{oid.generation}
+  {}
+
+  // take <snap, gen> from @c oid
+  void update_with_oid(const ghobject_t& oid) {
+    snap = oid.hobj.snap;
+    gen = oid.generation;
+  }
+  // write <snap, gen> of this key back into @c oid
+  void update_oid(ghobject_t& oid) const {
+    oid.hobj.snap = snap;
+    oid.generation = gen;
+  }
+  // order this key against @c oid by <snap, gen>
+  ordering_t compare(const ghobject_t& oid) const {
+    return cmp(std::tie(snap, gen),
+               std::tie(oid.hobj.snap.val, oid.generation));
+  }
+  // no object likes each other at this level
+  constexpr likes_t likes(const fixed_key_3&) const {
+    return likes_t::no;
+  }
+};
+
+// Print "s<snap>," unless snap is CEPH_NOSNAP, then "g<gen>," unless gen is
+// ghobject_t::NO_GEN.
+static inline std::ostream& operator<<(std::ostream& os, const fixed_key_3& k) {
+  const bool has_snap = (k.snap != CEPH_NOSNAP);
+  const bool has_gen = (k.gen != ghobject_t::NO_GEN);
+  if (has_snap) {
+    os << "s" << k.snap << ",";
+  }
+  if (has_gen) {
+    os << "g" << k.gen << ",";
+  }
+  return os;
+}
+
+// All elements in this node must share the same
+// <shard, pool, hash, namespace, oid>. Unlike the other FixedKeyPrefix<>
+// layouts, a node with FixedKeyPrefix<3> does not have a variable_sized_key,
+// so if it is an inner node, the child address can be embedded right in the
+// key.
+template<>
+struct FixedKeyPrefix<3, ntype_t::inner> : public fixed_key_3 {
+  // the item is embedded in the key
+  static constexpr bool item_in_key = true;
+  laddr_t child_addr = 0;
+
+  FixedKeyPrefix() = default;
+
+  // take <snap, gen> from @c oid and store the child address
+  void set(const ghobject_t& oid, laddr_t new_child_addr) {
+    update_with_oid(oid);
+    child_addr = new_child_addr;
+  }
+  // copy <snap, gen> of another layout-3 inner key
+  void set(const FixedKeyPrefix& k, laddr_t new_child_addr) {
+    snap = k.snap;
+    gen = k.gen;
+    child_addr = new_child_addr;
+  }
+  // copy <snap, gen> held by a variable-sized key suffix
+  void set(const variable_key_suffix& k, laddr_t new_child_addr) {
+    snap = k.snap;
+    gen = k.gen;
+    child_addr = new_child_addr;
+  }
+  // keys of the lower layouts carry no <snap, gen>; only the address is set
+  template<int N>
+  std::enable_if_t<(N < 3)> set(const FixedKeyPrefix<N, ntype_t::inner>&,
+                                laddr_t new_child_addr) {
+    child_addr = new_child_addr;
+  }
+  // unlikely get called, though..
+  void update(const ghobject_t&) {}
+};
+
+// Layout 3 key of a leaf node: <snap, gen> plus an offset.
+template<>
+struct FixedKeyPrefix<3, ntype_t::leaf> : public fixed_key_3 {
+  static constexpr bool item_in_key = false;
+  uint16_t offset = 0;
+
+  FixedKeyPrefix() = default;
+
+  // take <snap, gen> from @c oid and store @c new_offset
+  void set(const ghobject_t& oid, uint16_t new_offset) {
+    update_with_oid(oid);
+    offset = new_offset;
+  }
+  // copy <snap, gen> of another layout-3 leaf key and store @c new_offset
+  void set(const FixedKeyPrefix& k, uint16_t new_offset) {
+    snap = k.snap;
+    gen = k.gen;
+    offset = new_offset;
+  }
+  // keys of the lower layouts carry no <snap, gen>; only the offset is set
+  template<int N>
+  std::enable_if_t<(N < 3)> set(const FixedKeyPrefix<N, ntype_t::leaf>&,
+                                uint16_t new_offset) {
+    offset = new_offset;
+  }
+};
+
+// Node tag: the layout number (0..3) and the node kind, 4 bits each.
+struct tag_t {
+  // build the tag for layout @c N and kind @c node_type at compile time
+  template<int N, ntype_t node_type>
+  static constexpr tag_t create() {
+    static_assert(N >= 0 && N <= 3);
+    return tag_t{N, static_cast<uint8_t>(node_type)};
+  }
+  ntype_t type() const {
+    return ntype_t{node_type};
+  }
+  int layout() const {
+    return layout_type;
+  }
+  bool is_leaf() const {
+    return ntype_t::leaf == type();
+  }
+  int layout_type : 4;
+  uint8_t node_type : 4;
+};
+
+// Print a tag as "n=<layout>, leaf=<is_leaf>".
+static inline std::ostream& operator<<(std::ostream& os, const tag_t& tag) {
+  os << "n=" << tag.layout();
+  os << ", leaf=" << tag.is_leaf();
+  return os;
+}
+
+// Size in bytes of an item or key that may be variable-sized: scalars report
+// their sizeof(), anything else is asked through its size() member.
+template<class T>
+size_t size_of(const T& t) {
+  using value_t = std::decay_t<T>;
+  if constexpr (!std::is_scalar_v<value_t>) {
+    return t.size();
+  } else {
+    return sizeof(value_t);
+  }
+}
+
+// Classification of a node size, as returned by node_t::size_state().
+// NOTE(review): the thresholds are presumably node_t::min_size() and
+// node_t::capacity(); confirm against the definition of size_state().
+enum class size_state_t {
+ okay,
+ underflow,
+ overflow,
+};
+
+// layout of a node of the B+ tree
+//
+// it is different from a typical B+ tree in the following ways
+// - the size of keys is not necessarily fixed, neither is the size of values.
+// - the max number of elements in a node is determined by the total size of
+// the keys and values in the node
+// - in internal nodes, each key maps to the logical address of the child
+// node whose minimum key is greater than or equal to that key.
+//
+// @tparam BlockSize size parameter of the node
+// @tparam N layout number, 0..3, selecting the FixedKeyPrefix<> in use
+// @tparam NodeType whether this is a leaf or an inner node
+template<size_t BlockSize,
+ int N,
+ ntype_t NodeType>
+struct node_t {
+ static_assert(std::clamp(N, 0, 3) == N);
+ constexpr static ntype_t node_type = NodeType;
+ constexpr static int node_n = N;
+
+ using key_prefix_t = FixedKeyPrefix<N, NodeType>;
+ // leaf nodes hold onodes, inner nodes hold child addresses
+ using item_t = std::conditional_t<NodeType == ntype_t::leaf,
+ onode_t,
+ child_addr_t>;
+ using const_item_t = std::conditional_t<NodeType == ntype_t::leaf,
+ const onode_t&,
+ child_addr_t>;
+ static constexpr bool item_in_key = key_prefix_t::item_in_key;
+ // only layouts 0..2 have a variable-sized key suffix
+ using key_suffix_t = std::conditional_t<N < 3,
+ variable_key_suffix,
+ empty_key_suffix>;
+
+ // @return the fixed prefix and the suffix of the key at @c slot
+ std::pair<const key_prefix_t&, const key_suffix_t&>
+ key_at(unsigned slot) const;
+
+ // NOTE(review): presumably returns a copy of @c oid updated with the key
+ // stored at @c slot; confirm against the definition
+ ghobject_t get_oid_at(unsigned slot, const ghobject_t& oid) const;
+ const_item_t item_at(const key_prefix_t& key) const;
+ void dump(std::ostream& os) const;
+
+ // for debugging only.
+ static constexpr bool is_leaf() {
+ return node_type == ntype_t::leaf;
+ }
+
+ // same answer as is_leaf(), but read from the stored tag
+ bool _is_leaf() const {
+ return tag.is_leaf();
+ }
+
+ char* from_end(uint16_t offset);
+ const char* from_end(uint16_t offset) const;
+ uint16_t used_space() const;
+ uint16_t free_space() const {
+ return capacity() - used_space();
+ }
+ static uint16_t capacity();
+ // TODO: if it's allowed to update 2 siblings at the same time, we can have
+ // B* tree
+ static constexpr uint16_t min_size();
+
+
+ // calculate the allowable bounds on bytes to remove from an overflow node
+ // with specified size
+ // @param size the overflowed size
+ // @return <minimum bytes to grab, maximum bytes to grab>
+ static constexpr std::pair<int16_t, int16_t> bytes_to_remove(uint16_t size);
+
+ // calculate the allowable bounds on bytes to add to an underflow node
+ // with specified size
+ // @param size the underflowed size
+ // @return <minimum bytes to push, maximum bytes to push>
+ static constexpr std::pair<int16_t, int16_t> bytes_to_add(uint16_t size);
+
+ size_state_t size_state(uint16_t size) const;
+ bool is_underflow(uint16_t size) const;
+ int16_t size_with_key(unsigned slot, const ghobject_t& oid) const;
+ ordering_t compare_with_slot(unsigned slot, const ghobject_t& oid) const;
+ /// return the slot number of the first slot that is greater than or equal
+ /// to the key
+ std::pair<unsigned, bool> lower_bound(const ghobject_t& oid) const;
+ static uint16_t size_of_item(const ghobject_t& oid, const item_t& item);
+ bool is_overflow(const ghobject_t& oid, const item_t& item) const;
+ bool is_overflow(const ghobject_t& oid, const OnodeRef& item) const;
+
+ // inserts an item into the given slot, pushing all subsequent keys forward
+ // @note if the item is not embedded in key, shift the right half as well
+ void insert_at(unsigned slot, const ghobject_t& oid, const item_t& item);
+ // used by InnerNode for updating the keys indexing its children when their
+ // lower boundaries are updated
+ void update_key_at(unsigned slot, const ghobject_t& oid);
+ // try to figure out the number of elements and total size when trying to
+ // rebalance by moving the elements from the front of this node when its
+ // left sibling node is underflow
+ //
+ // @param min_grab lower bound of the number of bytes to move
+ // @param max_grab upper bound of the number of bytes to move
+ // @return <the number of elements to grab, their total size>
+ // @note return {0, 0} if current node would be underflow if
+ // @c min_grab bytes of elements are taken from it
+ std::pair<unsigned, uint16_t> calc_grab_front(uint16_t min_grab, uint16_t max_grab) const;
+ // try to figure out the number of elements and their total size when trying to
+ // rebalance by moving the elements from the end of this node when its right
+ // sibling node is underflow
+ //
+ // @param min_grab lower bound of the number of bytes to move
+ // @param max_grab upper bound of the number of bytes to move
+ // @return <the number of elements to grab, their total size>
+ // @note return {0, 0} if current node would be underflow if
+ // @c min_grab bytes of elements are taken from it
+ std::pair<unsigned, uint16_t> calc_grab_back(uint16_t min_grab, uint16_t max_grab) const;
+ template<int LeftN, class Mover> void grab_from_left(
+ node_t<BlockSize, LeftN, NodeType>& left,
+ unsigned n, uint16_t bytes,
+ Mover& mover);
+ template<int RightN, class Mover>
+ delta_t acquire_right(node_t<BlockSize, RightN, NodeType>& right,
+ unsigned whoami, Mover& mover);
+ // transfer n elements at the front of given node to me
+ template<int RightN, class Mover>
+ void grab_from_right(node_t<BlockSize, RightN, NodeType>& right,
+ unsigned n, uint16_t bytes,
+ Mover& mover);
+ template<int LeftN, class Mover>
+ void push_to_left(node_t<BlockSize, LeftN, NodeType>& left,
+ unsigned n, uint16_t bytes,
+ Mover& mover);
+ template<int RightN, class Mover>
+ void push_to_right(node_t<BlockSize, RightN, NodeType>& right,
+ unsigned n, uint16_t bytes,
+ Mover& mover);
+ // [to, from) are removed, so we need to shift left
+ // actually there are only two use cases:
+ // - to = 0: for giving elements in bulk
+ // - to = from - 1: for removing a single element
+ // old: |////|.....| |.....|/|........|
+ // new: |.....| |.....||........|
+ void shift_left(unsigned from, unsigned to);
+ void insert_front(const ceph::bufferptr& keys_buf, const ceph::bufferptr& cells_buf);
+ void insert_back(const ceph::bufferptr& keys_buf, const ceph::bufferptr& cells_buf);
+ // one or more elements are inserted, so we need to shift the elements right
+ // actually there are only two use cases:
+ // - bytes != 0: for inserting bytes before from
+ // - bytes = 0: for inserting a single element before from
+ // old: ||.....|
+ // new: |/////|.....|
+ void shift_right(unsigned n, unsigned bytes);
+ // shift all keys after slot is removed.
+ // @note if the item is not embedded in key, all items sitting at the left
+ // side of it will be shifted right
+ void remove_from(unsigned slot);
+ void trim_right(unsigned n);
+ void play_delta(const delta_t& delta);
+ // /-------------------------------|
+ // | V
+ // |header|k0|k1|k2|... | / / |k2'v2|k1'v1|k0'.v0| v_m |
+ // |<-- count -->|
+ tag_t tag = tag_t::create<N, NodeType>();
+ // the count of values in the node
+ uint16_t count = 0;
+ // the keys follow the header directly (flexible array member)
+ key_prefix_t keys[];
+};
+
+// Fallback mover, picked when none of the specializations matches. It is a
+// "trap": constructing or using it trips an assertion.
+template<class parent_t,
+         class from_t,
+         class to_t,
+         typename=void>
+class EntryMover {
+public:
+  EntryMover(const parent_t&, from_t&, to_t&, unsigned) {
+    assert(0);
+  }
+  void move_from(unsigned, unsigned, unsigned) {
+    assert(0);
+  }
+  // nothing is ever moved by the trap, so there is no delta to report
+  delta_t get_delta() {
+    return delta_t::nop();
+  }
+};
+
+// lower the layout, for instance, from L0 to L1, no reference oid is used
+//
+// move_from() only reads the two nodes: it packs the moved keys (and cells)
+// into buffers and records the deltas to be applied to the destination and
+// to the source, which are then handed out by to_delta() / from_delta().
+template<class parent_t,
+         class from_t,
+         class to_t>
+class EntryMover<parent_t,
+                 from_t,
+                 to_t,
+                 std::enable_if_t<(from_t::node_n < to_t::node_n)>>
+{
+public:
+  EntryMover(const parent_t&, from_t& src, to_t& dst, unsigned)
+    : src{src}, dst{dst}
+  {}
+  // @param src_first slot in src of the first element to move
+  // @param dst_first slot in dst the elements are going to be placed at
+  // @param n the number of elements to move
+  void move_from(unsigned src_first, unsigned dst_first, unsigned n)
+  {
+    // to_t is a dependent type, so "typename" is needed inside sizeof();
+    // the cast avoids narrowing size_t to unsigned in the braced initializer
+    ceph::bufferptr keys_buf{static_cast<unsigned>(n * sizeof(typename to_t::key_prefix_t))};
+    ceph::bufferptr cells_buf;
+    auto dst_keys = reinterpret_cast<typename to_t::key_prefix_t*>(keys_buf.c_str());
+    if constexpr (to_t::item_in_key) {
+      // the item lives in the destination key, there are no cells to copy
+      for (unsigned i = 0; i < n; i++) {
+        const auto& [prefix, suffix] = src.key_at(src_first + i);
+        dst_keys[i].set(suffix, src.item_at(prefix));
+      }
+    } else {
+      // copy keys
+      uint16_t src_offset = src_first > 0 ? src.keys[src_first - 1].offset : 0;
+      uint16_t dst_offset = dst_first > 0 ? dst.keys[dst_first - 1].offset : 0;
+      for (unsigned i = 0; i < n; i++) {
+        auto& src_key = src.keys[src_first + i];
+        // rebase the offset from the source node onto the destination node
+        uint16_t offset = src_key.offset - src_offset + dst_offset;
+        dst_keys[i].set(src_key, offset);
+      }
+      // copy cells in bulk, yay!
+      auto src_end = src.keys[src_first + n - 1].offset;
+      uint16_t total_cell_size = src_end - src_offset;
+      cells_buf = ceph::bufferptr{total_cell_size};
+      cells_buf.copy_in(0, total_cell_size, src.from_end(src_end));
+    }
+    if (dst_first == dst.count) {
+      dst_delta = delta_t::insert_back(keys_buf, cells_buf);
+    } else {
+      dst_delta = delta_t::insert_front(keys_buf, cells_buf);
+    }
+    if (src_first > 0 && src_first + n == src.count) {
+      src_delta = delta_t::trim_right(src_first);
+    } else if (src_first == 0 && n < src.count) {
+      src_delta = delta_t::shift_left(n);
+    } else if (src_first == 0 && n == src.count) {
+      // the caller will retire the src extent
+    } else {
+      // grab in the middle?
+      assert(0);
+    }
+  }
+
+  // delta to be applied to the source node
+  delta_t from_delta() {
+    return std::move(src_delta);
+  }
+  // delta to be applied to the destination node
+  delta_t to_delta() {
+    return std::move(dst_delta);
+  }
+private:
+  const from_t& src;
+  const to_t& dst;
+  delta_t dst_delta;
+  delta_t src_delta;
+};
+
+// lift the layout, for instance, from L2 to L0, need a reference oid
+//
+// The destination keys carry more fields than the source keys, so the
+// missing ones are taken from a reference oid obtained from the parent.
+// move_from() only reads the two nodes: it packs the moved keys and cells
+// into buffers and records the deltas to be applied to the destination and
+// to the source, which are then handed out by to_delta() / from_delta().
+template<class parent_t,
+         class from_t,
+         class to_t>
+class EntryMover<parent_t, from_t, to_t,
+                 std::enable_if_t<(from_t::node_n > to_t::node_n)>>
+{
+public:
+  EntryMover(const parent_t& parent, from_t& src, to_t& dst, unsigned from_slot)
+    : src{src}, dst{dst}, ref_oid{parent->get_oid_at(from_slot, {})}
+  {}
+  // @param src_first slot in src of the first element to move
+  // @param dst_first slot in dst the elements are going to be placed at
+  // @param n the number of elements to move
+  void move_from(unsigned src_first, unsigned dst_first, unsigned n)
+  {
+    // to_t is a dependent type, so "typename" is needed inside sizeof();
+    // the cast avoids narrowing size_t to unsigned in the braced initializer
+    ceph::bufferptr keys_buf{static_cast<unsigned>(n * sizeof(typename to_t::key_prefix_t))};
+    ceph::bufferptr cells_buf;
+    auto dst_keys = reinterpret_cast<typename to_t::key_prefix_t*>(keys_buf.c_str());
+    static_assert(!std::is_same_v<typename to_t::key_suffix_t, empty_key_suffix>);
+    // offset of the last cell preceding the inserted ones in the destination
+    const uint16_t dst_offset = dst_first > 0 ? dst.keys[dst_first - 1].offset : 0;
+    if constexpr (std::is_same_v<typename from_t::key_suffix_t, empty_key_suffix>) {
+      // heterogeneous partial key, have to rebuild dst partial key from oid.
+      // the cells are written one by one counting from the end of the
+      // buffer, so the buffer has to be sized before anything is written
+      size_t total_cell_size = 0;
+      for (unsigned i = 0; i < n; i++) {
+        auto& src_key = src.keys[src_first + i];
+        src_key.update_oid(ref_oid);
+        total_cell_size += to_t::key_suffix_t::size_from(ref_oid) +
+                           size_of(src.item_at(src_key));
+      }
+      cells_buf = ceph::bufferptr{static_cast<unsigned>(total_cell_size)};
+      uint16_t buf_offset = 0;
+      for (unsigned i = 0; i < n; i++) {
+        auto& src_key = src.keys[src_first + i];
+        src_key.update_oid(ref_oid);
+        const auto& src_item = src.item_at(src_key);
+        size_t key2_size = to_t::key_suffix_t::size_from(ref_oid);
+        buf_offset += key2_size + size_of(src_item);
+        dst_keys[i].set(ref_oid, dst_offset + buf_offset);
+        auto p = from_end(cells_buf, buf_offset);
+        auto partial_key = reinterpret_cast<typename to_t::key_suffix_t*>(p);
+        partial_key->set(ref_oid);
+        p += key2_size;
+        auto dst_item = reinterpret_cast<typename to_t::item_t*>(p);
+        *dst_item = src_item;
+      }
+    } else {
+      // homogeneous partial key, just update the pointers
+      const uint16_t src_offset = src_first > 0 ? src.keys[src_first - 1].offset : 0;
+      for (unsigned i = 0; i < n; i++) {
+        auto& src_key = src.keys[src_first + i];
+        // apply the fields held by the source key (for instance the hash of
+        // a layout-1 key) on top of the reference oid, so each destination
+        // key gets its own values rather than those of the reference
+        src_key.update_oid(ref_oid);
+        // rebase the offset onto the destination; dst_offset is added
+        // exactly once here
+        const uint16_t offset = src_key.offset - src_offset + dst_offset;
+        dst_keys[i].set(ref_oid, offset);
+      }
+      // copy cells in bulk, yay!
+      const uint16_t src_end = src.keys[src_first + n - 1].offset;
+      const uint16_t total_cell_size = src_end - src_offset;
+      // the buffer has to be allocated before it can be copied into
+      cells_buf = ceph::bufferptr{total_cell_size};
+      cells_buf.copy_in(0, total_cell_size, src.from_end(src_end));
+    }
+    if (dst_first == dst.count) {
+      dst_delta = delta_t::insert_back(keys_buf, cells_buf);
+    } else {
+      dst_delta = delta_t::insert_front(keys_buf, cells_buf);
+    }
+    if (src_first + n == src.count && src_first > 0) {
+      src_delta = delta_t::trim_right(src_first);
+    } else if (src_first == 0 && n < src.count) {
+      // only the front of src is taken, the rest has to be shifted left,
+      // as the other movers do
+      src_delta = delta_t::shift_left(n);
+    } else {
+      // the caller will retire the src extent
+      assert(src_first == 0);
+    }
+  }
+
+  // delta to be applied to the source node
+  delta_t from_delta() {
+    return std::move(src_delta);
+  }
+  // delta to be applied to the destination node
+  delta_t to_delta() {
+    return std::move(dst_delta);
+  }
+private:
+  // @return the address @c offset bytes before the end of @c ptr
+  char* from_end(ceph::bufferptr& ptr, uint16_t offset) {
+    return ptr.end_c_str() - static_cast<int>(offset);
+  }
+private:
+  const from_t& src;
+  const to_t& dst;
+  delta_t dst_delta;
+  delta_t src_delta;
+  ghobject_t ref_oid;
+};
+
+// identical layout, yay!
+//
+// Source and destination use the same node type, so keys are copied as they
+// are and only their offsets need to be rebased. move_from() only reads the
+// two nodes and records the deltas handed out by to_delta() / from_delta().
+template<class parent_t,
+ class child_t>
+class EntryMover<parent_t, child_t, child_t>
+{
+public:
+ EntryMover(const parent_t&, child_t& src, child_t& dst, unsigned)
+ : src{src}, dst{dst}
+ {}
+
+ // @param src_first slot in src of the first element to move
+ // @param dst_first slot in dst the elements are going to be placed at
+ // @param n the number of elements to move
+ void move_from(unsigned src_first, unsigned dst_first, unsigned n)
+ {
+ ceph::bufferptr keys_buf{static_cast<unsigned>(n * sizeof(typename child_t::key_prefix_t))};
+ ceph::bufferptr cells_buf;
+ auto dst_keys = reinterpret_cast<typename child_t::key_prefix_t*>(keys_buf.c_str());
+
+ // copy keys
+ std::copy(src.keys + src_first, src.keys + src_first + n,
+ dst_keys);
+ if constexpr (!child_t::item_in_key) {
+ uint16_t src_offset = src_first > 0 ? src.keys[src_first - 1].offset : 0;
+ uint16_t dst_offset = dst_first > 0 ? dst.keys[dst_first - 1].offset : 0;
+ // may be negative, hence the signed type
+ const int offset_delta = dst_offset - src_offset;
+ // update the pointers
+ for (unsigned i = 0; i < n; i++) {
+ dst_keys[i].offset += offset_delta;
+ }
+ // copy cells in bulk, yay!
+ auto src_end = src.keys[src_first + n - 1].offset;
+ uint16_t total_cell_size = src_end - src_offset;
+ cells_buf = ceph::bufferptr{total_cell_size};
+ cells_buf.copy_in(0, total_cell_size, src.from_end(src_end));
+ }
+ if (dst_first == dst.count) {
+ dst_delta = delta_t::insert_back(std::move(keys_buf), std::move(cells_buf));
+ } else {
+ dst_delta = delta_t::insert_front(std::move(keys_buf), std::move(cells_buf));
+ }
+ if (src_first + n == src.count && src_first > 0) {
+ // NOTE(review): the layout-lowering and layout-lifting movers pass
+ // src_first to trim_right() here, not n; the two only agree when
+ // src_first == n. Confirm which one trim_right() expects.
+ src_delta = delta_t::trim_right(n);
+ } else if (src_first == 0 && n < src.count) {
+ src_delta = delta_t::shift_left(n);
+ } else if (src_first == 0 && n == src.count) {
+ // the caller will retire the src extent
+ } else {
+ // grab in the middle?
+ assert(0);
+ }
+ }
+
+ // delta to be applied to the source node
+ delta_t from_delta() {
+ return std::move(src_delta);
+ }
+
+ // delta to be applied to the destination node
+ delta_t to_delta() {
+ return std::move(dst_delta);
+ }
+private:
+ // @return the address @c offset bytes before the end of @c ptr
+ // (not called from within this class)
+ char* from_end(ceph::bufferptr& ptr, uint16_t offset) {
+ return ptr.end_c_str() - static_cast<int>(offset);
+ }
+private:
+ const child_t& src;
+ const child_t& dst;
+ delta_t src_delta;
+ delta_t dst_delta;
+};
+
+// Build the EntryMover specialization matching the layouts of @c src and
+// @c dst, so callers do not need to spell out the template arguments.
+template<class parent_t, class from_t, class to_t>
+EntryMover<parent_t, from_t, to_t>
+make_mover(const parent_t& parent, from_t& src, to_t& dst, unsigned from_slot) {
+  using mover_t = EntryMover<parent_t, from_t, to_t>;
+  return mover_t(parent, src, dst, from_slot);
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/fwd.h b/src/crimson/os/seastore/onode_manager/staged-fltree/fwd.h
new file mode 100644
index 000000000..4908c691f
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/fwd.h
@@ -0,0 +1,93 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <algorithm>
+#include <cstring>
+#include <limits>
+#include <memory>
+#include <string>
+
+#include "crimson/common/errorator.h"
+#include "crimson/os/seastore/cached_extent.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/transaction.h"
+
+namespace crimson::os::seastore::onode {
+
+// names of the enclosing seastore namespace that are used throughout the
+// onode tree code
+using crimson::os::seastore::Transaction;
+using crimson::os::seastore::TransactionRef;
+using crimson::os::seastore::make_transaction;
+using crimson::os::seastore::laddr_t;
+using crimson::os::seastore::L_ADDR_MIN;
+using crimson::os::seastore::L_ADDR_NULL;
+using crimson::os::seastore::extent_len_t;
+
+// forward declarations and the owning / ref-counted pointer aliases for them
+class DeltaRecorder;
+class NodeExtent;
+class NodeExtentManager;
+class RootNodeTracker;
+using DeltaRecorderURef = std::unique_ptr<DeltaRecorder>;
+using NodeExtentRef = crimson::os::seastore::TCachedExtentRef<NodeExtent>;
+using NodeExtentManagerURef = std::unique_ptr<NodeExtentManager>;
+using RootNodeTrackerURef = std::unique_ptr<RootNodeTracker>;
+// the node extent manager and the transaction an operation works with; both
+// are held by reference, so a context_t does not own them
+struct context_t {
+ NodeExtentManager& nm;
+ Transaction& t;
+};
+
+// forward declarations of the node implementations and their owning pointers
+class LeafNodeImpl;
+class InternalNodeImpl;
+class NodeImpl;
+using LeafNodeImplURef = std::unique_ptr<LeafNodeImpl>;
+using InternalNodeImplURef = std::unique_ptr<InternalNodeImpl>;
+using NodeImplURef = std::unique_ptr<NodeImpl>;
+
+using level_t = uint8_t;
+// index of an entry inside a single node, 32 bits should be enough
+using index_t = uint32_t;
+// the topmost index values are reserved as markers; anything below
+// INDEX_UPPER_BOUND is a regular index
+constexpr auto INDEX_END = std::numeric_limits<index_t>::max();
+constexpr auto INDEX_LAST = INDEX_END - 0x4;
+constexpr auto INDEX_UPPER_BOUND = INDEX_END - 0x8;
+inline bool is_valid_index(index_t index) {
+  return index < INDEX_UPPER_BOUND;
+}
+
+// TODO: decide by NODE_BLOCK_SIZE
+// offsets within a node are 16 bits wide; a disk block is 4KiB (1 << 12) and
+// a node block currently spans exactly one disk block
+using node_offset_t = uint16_t;
+constexpr node_offset_t DISK_BLOCK_SIZE = 1u << 12;
+constexpr node_offset_t NODE_BLOCK_SIZE = DISK_BLOCK_SIZE * 1u;
+
+// two-state match result: not equal (NE) or equal (EQ)
+// NOTE(review): "BS" presumably stands for binary search -- confirm
+enum class MatchKindBS : int8_t { NE = -1, EQ = 0 };
+
+// three-way comparison result
+enum class MatchKindCMP : int8_t { LT = -1, EQ = 0, GT };
+// map the sign of @c value to a comparison result
+// @return GT if value > 0, LT if value < 0, EQ if value == 0
+inline MatchKindCMP toMatchKindCMP(int value) {
+  if (value > 0) {
+    return MatchKindCMP::GT;
+  } else if (value < 0) {
+    return MatchKindCMP::LT;
+  } else {
+    return MatchKindCMP::EQ;
+  }
+}
+// compare two values of the same type
+//
+// The operands are compared directly instead of looking at the sign of
+// (l - r): that difference wraps around for unsigned types and is truncated
+// for types wider than int, both of which give a wrong ordering.
+// @return LT if l < r, GT if r < l, EQ otherwise
+template <typename Type>
+MatchKindCMP toMatchKindCMP(const Type& l, const Type& r) {
+  if (l < r) {
+    return MatchKindCMP::LT;
+  } else if (r < l) {
+    return MatchKindCMP::GT;
+  } else {
+    return MatchKindCMP::EQ;
+  }
+}
+
+// compare two strings, mapping the sign of string_view::compare() to a
+// comparison result
+inline MatchKindCMP toMatchKindCMP(
+    std::string_view l, std::string_view r) {
+  const int order = l.compare(r);
+  return toMatchKindCMP(order);
+}
+
+// swap LT and GT, anything else is returned as it is
+inline MatchKindCMP reverse(MatchKindCMP cmp) {
+  switch (cmp) {
+  case MatchKindCMP::LT:
+    return MatchKindCMP::GT;
+  case MatchKindCMP::GT:
+    return MatchKindCMP::LT;
+  default:
+    return cmp;
+  }
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc
new file mode 100644
index 000000000..3df458f08
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc
@@ -0,0 +1,809 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "node.h"
+
+#include <cassert>
+#include <exception>
+#include <sstream>
+
+#include "common/likely.h"
+#include "crimson/common/log.h"
+#include "node_extent_manager.h"
+#include "node_impl.h"
+#include "stages/node_stage_layout.h"
+
+namespace {
+ // file-local accessor of the logger used in this translation unit; it logs
+ // under the filestore subsystem
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+}
+
+namespace crimson::os::seastore::onode {
+
+// shorthands for the error and future types declared by Node
+using node_ertr = Node::node_ertr;
+template <class ValueT=void>
+using node_future = Node::node_future<ValueT>;
+
+/*
+ * tree_cursor_t
+ */
+
+// Create a cursor at @c pos of @c node and have the leaf node track it.
+// @c pos must not be the end position. The key and value are not cached yet.
+tree_cursor_t::tree_cursor_t(Ref<LeafNode> node, const search_position_t& pos)
+ : leaf_node{node}, position{pos} {
+ assert(!is_end());
+ leaf_node->do_track_cursor<true>(*this);
+}
+
+// Create a cursor at @c pos of @c node with the key, value pointer and
+// layout version already known, and have the leaf node track it.
+// @c pos must not be the end position.
+tree_cursor_t::tree_cursor_t(
+ Ref<LeafNode> node, const search_position_t& pos,
+ const key_view_t& key, const onode_t* _p_value, layout_version_t v)
+ : leaf_node{node}, position{pos} {
+ assert(!is_end());
+ // cache the key/value before the node starts tracking the cursor
+ update_kv(key, _p_value, v);
+ leaf_node->do_track_cursor<true>(*this);
+}
+
+// Create the end cursor. It is not tracked by the leaf node, which is
+// expected to be the tail of its level.
+tree_cursor_t::tree_cursor_t(Ref<LeafNode> node)
+ : leaf_node{node}, position{search_position_t::end()} {
+ assert(is_end());
+ assert(leaf_node->is_level_tail());
+}
+
+// Stop being tracked by the leaf node; an end cursor was never tracked.
+tree_cursor_t::~tree_cursor_t() {
+  if (is_end()) {
+    return;
+  }
+  leaf_node->do_untrack_cursor(*this);
+}
+
+// @return the key the cursor points to, refreshing the cached key/value
+// first if needed
+const key_view_t& tree_cursor_t::get_key_view() const {
+ ensure_kv();
+ return *key_view;
+}
+
+// @return pointer to the value the cursor points to, refreshing the cached
+// key/value first if needed
+const onode_t* tree_cursor_t::get_p_value() const {
+ ensure_kv();
+ return p_value;
+}
+
+// Move the cursor to @c pos of @c node and have that node track it. The
+// cached key/value are dropped, they get reloaded on the next access.
+// Neither the current nor the new position may be the end position.
+template <bool VALIDATE>
+void tree_cursor_t::update_track(
+ Ref<LeafNode> node, const search_position_t& pos) {
+ // the cursor must be already untracked
+ // track the new node and new pos
+ assert(!pos.is_end());
+ assert(!is_end());
+ leaf_node = node;
+ position = pos;
+ key_view.reset();
+ p_value = nullptr;
+ leaf_node->do_track_cursor<VALIDATE>(*this);
+}
+// the template is defined in this file, so instantiate both variants here
+template void tree_cursor_t::update_track<true>(Ref<LeafNode>, const search_position_t&);
+template void tree_cursor_t::update_track<false>(Ref<LeafNode>, const search_position_t&);
+
+// Cache the key, value pointer and layout version of the entry the cursor
+// points to. In debug builds they are checked against what the leaf node
+// reports for the current position.
+void tree_cursor_t::update_kv(
+ const key_view_t& key, const onode_t* _p_value, layout_version_t v) const {
+ assert(!is_end());
+ assert(_p_value);
+ assert(std::make_tuple(key, _p_value, v) == leaf_node->get_kv(position));
+ key_view = key;
+ p_value = _p_value;
+ node_version = v;
+}
+
+// Make sure the cached key/value are usable: reload them from the leaf node
+// if nothing is cached or if the layout version of the node has changed.
+void tree_cursor_t::ensure_kv() const {
+  assert(!is_end());
+  const bool stale =
+    !p_value || node_version != leaf_node->get_layout_version();
+  if (stale) {
+    // NOTE: the leaf node is always present when we hold its reference.
+    std::tie(key_view, p_value, node_version) = leaf_node->get_kv(position);
+  }
+  assert(p_value);
+}
+
+/*
+ * Node
+ */
+
+// take ownership of the node implementation
+Node::Node(NodeImplURef&& impl) : impl{std::move(impl)} {}
+
+// Unregister the node from whoever tracks it: the super block for the root,
+// the parent node for anything else.
+Node::~Node() {
+  // XXX: tolerate failure between allocate() and as_child()
+  if (!is_root()) {
+    _parent_info->ptr->do_untrack_child(*this);
+    return;
+  }
+  super->do_untrack_root(*this);
+}
+
+// @return the level of this node, as reported by its implementation
+level_t Node::level() const {
+ return impl->level();
+}
+
+// Search for @c key starting from this node. A fresh MatchHistory is kept
+// alive by do_with() for the duration of the search.
+// @note @c key is captured by reference, so it has to outlive the returned
+// future
+node_future<Node::search_result_t> Node::lower_bound(
+ context_t c, const key_hobj_t& key) {
+ return seastar::do_with(
+ MatchHistory(), [this, c, &key](auto& history) {
+ return lower_bound_tracked(c, key, history);
+ }
+ );
+}
+
+// Insert @c value under @c key unless the key is already present.
+// @return <cursor, true> pointing at the newly inserted entry, or
+// <cursor, false> pointing at the existing entry when the key was found
+// @note @c key and @c value are captured by reference, so they have to
+// outlive the returned future
+node_future<std::pair<Ref<tree_cursor_t>, bool>> Node::insert(
+ context_t c, const key_hobj_t& key, const onode_t& value) {
+ return seastar::do_with(
+ MatchHistory(), [this, c, &key, &value](auto& history) {
+ return lower_bound_tracked(c, key, history
+ ).safe_then([c, &key, &value, &history](auto result) {
+ if (result.match() == MatchKindBS::EQ) {
+ // the key exists, nothing gets inserted
+ return node_ertr::make_ready_future<std::pair<Ref<tree_cursor_t>, bool>>(
+ std::make_pair(result.p_cursor, false));
+ } else {
+ // insert into the leaf node at the position found by the search
+ auto leaf_node = result.p_cursor->get_leaf_node();
+ return leaf_node->insert_value(
+ c, key, value, result.p_cursor->get_position(), history, result.mstat
+ ).safe_then([](auto p_cursor) {
+ return node_ertr::make_ready_future<std::pair<Ref<tree_cursor_t>, bool>>(
+ std::make_pair(p_cursor, true));
+ });
+ }
+ });
+ }
+ );
+}
+
+// Collect the statistics starting from this node. The accumulator is kept
+// alive by do_with() while do_get_tree_stats() fills it in, and a copy of it
+// is returned once that is done.
+node_future<tree_stats_t> Node::get_tree_stats(context_t c) {
+ return seastar::do_with(
+ tree_stats_t(), [this, c](auto& stats) {
+ return do_get_tree_stats(c, stats).safe_then([&stats] {
+ return stats;
+ });
+ }
+ );
+}
+
+// dump the node to @c os, delegating to the implementation
+std::ostream& Node::dump(std::ostream& os) const {
+ return impl->dump(os);
+}
+
+// dump the brief form of the node to @c os, delegating to the implementation
+std::ostream& Node::dump_brief(std::ostream& os) const {
+ return impl->dump_brief(os);
+}
+
+// Test helper: mark the node as the tail of its level and make it the root,
+// so that it satisfies what as_root() asserts and can be destructed as a
+// tracked root.
+void Node::test_make_destructable(
+ context_t c, NodeExtentMutable& mut, Super::URef&& _super) {
+ impl->test_set_tail(mut);
+ make_root(c, std::move(_super));
+}
+
+// Create an empty tree by allocating a leaf node as its root; the result of
+// the allocation is dropped.
+node_future<> Node::mkfs(context_t c, RootNodeTracker& root_tracker) {
+ return LeafNode::allocate_root(c, root_tracker
+ ).safe_then([](auto ret) { /* FIXME: discard_result(); */ });
+}
+
+// Load the root node: fetch the super block, read the node at the root
+// address it records and register that node as the root.
+// @note @c root_tracker is captured by reference, so it has to outlive the
+// returned future
+node_future<Ref<Node>> Node::load_root(context_t c, RootNodeTracker& root_tracker) {
+ return c.nm.get_super(c.t, root_tracker
+ ).safe_then([c, &root_tracker](auto&& _super) {
+ auto root_addr = _super->get_root_laddr();
+ assert(root_addr != L_ADDR_NULL);
+ // the root is loaded expecting it to be the tail of its level
+ return Node::load(c, root_addr, true
+ ).safe_then([c, _super = std::move(_super),
+ &root_tracker](auto root) mutable {
+ assert(root->impl->field_type() == field_type_t::N0);
+ root->as_root(std::move(_super));
+ std::ignore = c; // as only used in an assert
+ std::ignore = root_tracker;
+ assert(root == root_tracker.get_root(c.t));
+ return node_ertr::make_ready_future<Ref<Node>>(root);
+ });
+ });
+}
+
+// Record the address of this node as the root address in the super block,
+// then register the node as the root.
+void Node::make_root(context_t c, Super::URef&& _super) {
+ _super->write_root_laddr(c, impl->laddr());
+ as_root(std::move(_super));
+}
+
+// Take ownership of the super block and get tracked by it as the root. The
+// node must have neither a super block nor a parent yet, the super block has
+// to record the address of this node, and the node has to be the tail of its
+// level.
+void Node::as_root(Super::URef&& _super) {
+ assert(!super && !_parent_info);
+ assert(_super->get_root_laddr() == impl->laddr());
+ assert(impl->is_level_tail());
+ super = std::move(_super);
+ super->do_track_root(*this);
+}
+
+// Grow the tree by one level: a new internal root is allocated, the super
+// block is handed over to it, and this node becomes its child at the end
+// position.
+node_future<> Node::upgrade_root(context_t c) {
+ assert(is_root());
+ assert(impl->is_level_tail());
+ assert(impl->field_type() == field_type_t::N0);
+ // untrack before the super block is moved away
+ super->do_untrack_root(*this);
+ return InternalNode::allocate_root(c, impl->level(), impl->laddr(), std::move(super)
+ ).safe_then([this](auto new_root) {
+ as_child(search_position_t::end(), new_root);
+ });
+}
+
+// Record @c parent_node and the position within it as the parent info of
+// this node, and have the parent track this node. The node must not hold a
+// super block, i.e. it must not be the root.
+template <bool VALIDATE>
+void Node::as_child(const search_position_t& pos, Ref<InternalNode> parent_node) {
+ assert(!super);
+ _parent_info = parent_info_t{pos, parent_node};
+ parent_info().ptr->do_track_child<VALIDATE>(*this);
+}
+// the template is defined in this file, so instantiate both variants here
+template void Node::as_child<true>(const search_position_t&, Ref<InternalNode>);
+template void Node::as_child<false>(const search_position_t&, Ref<InternalNode>);
+
+// Tell the parent that this node was split, with @c right_node as the new
+// right sibling. Must not be called on the root.
+node_future<> Node::insert_parent(context_t c, Ref<Node> right_node) {
+ assert(!is_root());
+ // TODO(cross-node string dedup)
+ return parent_info().ptr->apply_child_split(
+ c, parent_info().position, this, right_node);
+}
+
+// Read the extent at @c addr and wrap it into a LeafNode or an InternalNode,
+// depending on the node type recorded in the extent. Any other node type
+// aborts.
+node_future<Ref<Node>> Node::load(
+    context_t c, laddr_t addr, bool expect_is_level_tail) {
+  // NOTE:
+  // *option1: all types of node have the same length;
+  // option2: length is defined by node/field types;
+  // option3: length is totally flexible;
+  return c.nm.read_extent(c.t, addr, NODE_BLOCK_SIZE
+  ).safe_then([expect_is_level_tail](auto extent) {
+    auto [node_type, field_type] = extent->get_types();
+    if (node_type == node_type_t::LEAF) {
+      auto leaf_impl = LeafNodeImpl::load(extent, field_type, expect_is_level_tail);
+      return Ref<Node>(new LeafNode(leaf_impl.get(), std::move(leaf_impl)));
+    }
+    if (node_type == node_type_t::INTERNAL) {
+      auto internal_impl = InternalNodeImpl::load(extent, field_type, expect_is_level_tail);
+      return Ref<Node>(new InternalNode(internal_impl.get(), std::move(internal_impl)));
+    }
+    ceph_abort("impossible path");
+  });
+}
+
+/*
+ * InternalNode
+ */
+
+// @c impl_ref is handed to the Node base, which stores it; @c impl is a
+// separate non-owning pointer typed as InternalNodeImpl. Node::load() passes
+// the same object for both.
+InternalNode::InternalNode(InternalNodeImpl* impl, NodeImplURef&& impl_ref)
+ : Node(std::move(impl_ref)), impl{impl} {}
+
+node_future<> InternalNode::apply_child_split(
+ context_t c, const search_position_t& pos,
+ Ref<Node> left_child, Ref<Node> right_child) { /*
+ * Records in this (parent) node that the child tracked at `pos` has been
+ * split into left_child and right_child:
+ * 1. the existing entry at `pos` is re-pointed from left_child to
+ * right_child, together with the in-memory tracking;
+ * 2. a new entry keyed by left_child's largest key and holding left_child's
+ * address is inserted;
+ * 3. if this node has not enough free space for step 2, this node is split
+ * as well (the root is upgraded first) and the split is propagated
+ * upwards through insert_parent().
+ */
+#ifndef NDEBUG
+ if (pos.is_end()) {
+ assert(impl->is_level_tail());
+ }
+#endif
+ impl->prepare_mutate(c);
+
+ auto left_key = left_child->impl->get_largest_key_view();
+ auto left_child_addr = left_child->impl->laddr();
+ auto left_child_addr_packed = laddr_packed_t{left_child_addr};
+ auto right_key = right_child->impl->get_largest_key_view();
+ auto right_child_addr = right_child->impl->laddr();
+ logger().debug("OTree::Internal::Insert: "
+ "pos({}), left_child({}, {:#x}), right_child({}, {:#x}) ...",
+ pos, left_key, left_child_addr, right_key, right_child_addr);
+ // update pos => left_child to pos => right_child
+ impl->replace_child_addr(pos, right_child_addr, left_child_addr);
+ replace_track(pos, right_child, left_child);
+
+ search_position_t insert_pos = pos; // evaluate_insert() receives this copy, `pos` itself stays untouched
+ auto [insert_stage, insert_size] = impl->evaluate_insert(
+ left_key, left_child_addr, insert_pos);
+ auto free_size = impl->free_size();
+ if (free_size >= insert_size) {
+ // insert
+ [[maybe_unused]] auto p_value = impl->insert(
+ left_key, left_child_addr_packed, insert_pos, insert_stage, insert_size);
+ assert(impl->free_size() == free_size - insert_size);
+ assert(insert_pos <= pos);
+ assert(p_value->value == left_child_addr);
+ track_insert(insert_pos, insert_stage, left_child, right_child);
+ validate_tracked_children();
+ return node_ertr::now();
+ }
+ // split and insert
+ Ref<InternalNode> this_ref = this; // captured below so this node stays referenced while the continuations run
+ return (is_root() ? upgrade_root(c) : node_ertr::now()
+ ).safe_then([this, c] {
+ return InternalNode::allocate(
+ c, impl->field_type(), impl->is_level_tail(), impl->level());
+ }).safe_then([this_ref, this, c, left_key, left_child, right_child,
+ insert_pos, insert_stage=insert_stage, insert_size=insert_size](auto fresh_right) mutable {
+ auto right_node = fresh_right.node;
+ auto left_child_addr = left_child->impl->laddr();
+ auto left_child_addr_packed = laddr_packed_t{left_child_addr};
+ auto [split_pos, is_insert_left, p_value] = impl->split_insert(
+ fresh_right.mut, *right_node->impl, left_key, left_child_addr_packed,
+ insert_pos, insert_stage, insert_size);
+ assert(p_value->value == left_child_addr);
+ track_split(split_pos, right_node); // children tracked at or after split_pos are handed to right_node
+ if (is_insert_left) {
+ track_insert(insert_pos, insert_stage, left_child);
+ } else {
+ right_node->track_insert(insert_pos, insert_stage, left_child);
+ }
+ validate_tracked_children();
+ right_node->validate_tracked_children();
+
+ // propagate index to parent
+ return insert_parent(c, right_node);
+ // TODO (optimize)
+ // try to acquire space from siblings before split... see btrfs
+ });
+}
+
+/**
+ * Allocates a level-tail N0 internal node one level above the old root,
+ * points its end slot at the old root and makes it the root of `super`.
+ */
+node_future<Ref<InternalNode>> InternalNode::allocate_root(
+    context_t c, level_t old_root_level,
+    laddr_t old_root_addr, Super::URef&& super) {
+  return InternalNode::allocate(c, field_type_t::N0, true, old_root_level + 1
+  ).safe_then([c, old_root_addr,
+               super = std::move(super)](auto fresh_node) mutable {
+    Ref<InternalNode> new_root = fresh_node.node;
+    // Write the old root's address into the end slot of the fresh node.
+    const auto* tail_slot =
+        new_root->impl->get_p_value(search_position_t::end());
+    fresh_node.mut.copy_in_absolute(
+        const_cast<laddr_packed_t*>(tail_slot), old_root_addr);
+    // make_root_from() asserts that super still refers to the old root.
+    new_root->make_root_from(c, std::move(super), old_root_addr);
+    return new_root;
+  });
+}
+
+/// Delegates to the child at the first position of this node.
+node_future<Ref<tree_cursor_t>>
+InternalNode::lookup_smallest(context_t c) {
+  const auto first_pos = search_position_t::begin();
+  const laddr_t first_child_addr = impl->get_p_value(first_pos)->value;
+  return get_or_track_child(c, first_pos, first_child_addr
+  ).safe_then([c](auto first_child) {
+    return first_child->lookup_smallest(c);
+  });
+}
+
+/// Delegates to the child stored at the end position of this node.
+node_future<Ref<tree_cursor_t>>
+InternalNode::lookup_largest(context_t c) {
+  // NOTE: unlike LeafNode::lookup_largest(), this only works for the tail
+  // internal node to return the tail child address.
+  const auto tail_pos = search_position_t::end();
+  const laddr_t tail_child_addr = impl->get_p_value(tail_pos)->value;
+  return get_or_track_child(c, tail_pos, tail_child_addr
+  ).safe_then([c](auto tail_child) {
+    return tail_child->lookup_largest(c);
+  });
+}
+
+/**
+ * Searches this node for `key` and continues the search in the child the
+ * result points at. `key` and `history` are captured by reference, so they
+ * must outlive the returned future.
+ */
+node_future<Node::search_result_t>
+InternalNode::lower_bound_tracked(
+    context_t c, const key_hobj_t& key, MatchHistory& history) {
+  auto in_node = impl->lower_bound(key, history);
+  return get_or_track_child(c, in_node.position, in_node.p_value->value
+  ).safe_then([c, &key, &history](auto next_child) {
+    // XXX(multi-type): pass result.mstat to child
+    return next_child->lower_bound_tracked(c, key, history);
+  });
+}
+
+/**
+ * Adds this node's statistics to the *_internal counters of `stats`, then
+ * collects the statistics of every child in position order.
+ *
+ * `stats` is captured by reference and must outlive the returned future.
+ */
+node_future<> InternalNode::do_get_tree_stats(
+    context_t c, tree_stats_t& stats) {
+  auto own = impl->get_stats();
+  stats.size_persistent_internal += own.size_persistent;
+  stats.size_filled_internal += own.size_filled;
+  stats.size_logical_internal += own.size_logical;
+  stats.size_overhead_internal += own.size_overhead;
+  stats.size_value_internal += own.size_value;
+  stats.num_kvs_internal += own.num_kvs;
+  stats.num_nodes_internal += 1;
+
+  Ref<const InternalNode> this_ref = this;
+  return seastar::do_with(
+    search_position_t(), [this, this_ref, c, &stats](auto& pos) {
+      pos = search_position_t::begin();
+      return crimson::do_until(
+          [this, this_ref, c, &stats, &pos]() -> node_future<bool> {
+        auto child_addr = impl->get_p_value(pos)->value;
+        return get_or_track_child(c, pos, child_addr
+        ).safe_then([c, &stats](auto child) {
+          return child->do_get_tree_stats(c, stats);
+        }).safe_then([this, this_ref, &pos] {
+          // The child at the end position is always the last one visited.
+          if (pos.is_end()) {
+            return node_ertr::make_ready_future<bool>(true);
+          }
+          impl->next_position(pos);
+          // Reaching the end position finishes the walk, unless this node is
+          // a level tail: then one more round visits the child stored there.
+          const bool finished = pos.is_end() && !impl->is_level_tail();
+          return node_ertr::make_ready_future<bool>(finished);
+        });
+      });
+    }
+  );
+}
+
+node_future<> InternalNode::test_clone_root(
+ context_t c_other, RootNodeTracker& tracker_other) const { /*
+ * Unit-test helper: clones this root into the tree identified by c_other and
+ * tracker_other. A fresh level-tail N0 internal node of the same level is
+ * allocated through c_other, this node's content is copied into it with
+ * test_copy_to(), and it becomes the root of the super obtained from
+ * tracker_other. Every currently tracked child is then cloned under the new
+ * root with test_clone_non_root().
+ *
+ * tracker_other is captured by reference and must outlive the returned future.
+ */
+ assert(is_root());
+ assert(impl->is_level_tail());
+ assert(impl->field_type() == field_type_t::N0);
+ Ref<const InternalNode> this_ref = this; // captured below so this node stays referenced while the continuations run
+ return InternalNode::allocate(c_other, field_type_t::N0, true, impl->level()
+ ).safe_then([this, c_other, &tracker_other](auto fresh_other) {
+ impl->test_copy_to(fresh_other.mut);
+ auto cloned_root = fresh_other.node;
+ return c_other.nm.get_super(c_other.t, tracker_other
+ ).safe_then([c_other, cloned_root](auto&& super_other) {
+ cloned_root->make_root_new(c_other, std::move(super_other));
+ return cloned_root;
+ });
+ }).safe_then([this_ref, this, c_other](auto cloned_root) {
+ // clone tracked children
+ // In some unit tests, the children are stubbed out that they
+ // don't exist in NodeExtentManager, and are only tracked in memory.
+ return crimson::do_for_each(
+ tracked_child_nodes.begin(),
+ tracked_child_nodes.end(),
+ [this_ref, c_other, cloned_root](auto& kv) {
+ assert(kv.first == kv.second->parent_info().position);
+ return kv.second->test_clone_non_root(c_other, cloned_root);
+ }
+ );
+ });
+}
+
+/**
+ * Returns the child node at `position`.
+ *
+ * If the child is already tracked in memory it is returned directly.
+ * Otherwise it is loaded from `child_addr` and registered as a child of this
+ * node at `position`. A child at the end position is loaded with
+ * expect_is_level_tail set.
+ *
+ * In debug builds the result is checked against `child_addr`, `position` and
+ * validate_child().
+ */
+node_future<Ref<Node>> InternalNode::get_or_track_child(
+    context_t c, const search_position_t& position, laddr_t child_addr) {
+  bool level_tail = position.is_end();
+  auto found = tracked_child_nodes.find(position);
+  // Captured by the final continuation so that this node stays referenced
+  // until the whole chain has run.
+  Ref<InternalNode> this_ref = this;
+  return (found == tracked_child_nodes.end()
+    ? (logger().trace("OTree::Internal: load child untracked at {:#x}, pos({}), level={}",
+                      child_addr, position, level() - 1),
+       Node::load(c, child_addr, level_tail
+       ).safe_then([this, position] (auto child) {
+         child->as_child(position, this);
+         return child;
+       }))
+    : (logger().trace("OTree::Internal: load child tracked at {:#x}, pos({}), level={}",
+                      child_addr, position, level() - 1),
+       node_ertr::make_ready_future<Ref<Node>>(found->second))
+  ).safe_then([this_ref, this, position, child_addr] (auto child) {
+    assert(child_addr == child->impl->laddr());
+    assert(position == child->parent_info().position);
+    // Silence unused-capture warnings when the asserts are compiled out.
+    std::ignore = position;
+    std::ignore = child_addr;
+    validate_child(*child);
+    return child;
+  });
+}
+
+/**
+ * Updates the in-memory child tracking after an entry has been inserted at
+ * `insert_pos` (stage `insert_stage`).
+ *
+ * Tracked children from insert_pos up to the upper bound of the insert stage
+ * get their index at that stage incremented, then insert_child is tracked at
+ * insert_pos. If nxt_child is given, debug builds verify that it is tracked
+ * directly after insert_child.
+ */
+void InternalNode::track_insert(
+    const search_position_t& insert_pos, match_stage_t insert_stage,
+    Ref<Node> insert_child, Ref<Node> nxt_child) {
+  // update tracks
+  auto pos_upper_bound = insert_pos;
+  pos_upper_bound.index_by_stage(insert_stage) = INDEX_UPPER_BOUND;
+  auto first = tracked_child_nodes.lower_bound(insert_pos);
+  auto last = tracked_child_nodes.lower_bound(pos_upper_bound);
+  // Collect first: re-tracking a child re-inserts it into the map.
+  std::vector<Node*> nodes;
+  std::for_each(first, last, [&nodes](auto& kv) {
+    nodes.push_back(kv.second);
+  });
+  tracked_child_nodes.erase(first, last);
+  for (auto& node : nodes) {
+    auto _pos = node->parent_info().position;
+    assert(!_pos.is_end());
+    ++_pos.index_by_stage(insert_stage);
+    node->as_child(_pos, this);
+  }
+  // track insert
+  insert_child->as_child(insert_pos, this);
+
+#ifndef NDEBUG
+  // validate left_child is before right_child
+  if (nxt_child) {
+    auto iter = tracked_child_nodes.find(insert_pos);
+    // Fail with a clear assertion instead of stepping past or dereferencing
+    // end().
+    assert(iter != tracked_child_nodes.end());
+    ++iter;
+    assert(iter != tracked_child_nodes.end());
+    assert(iter->second == nxt_child);
+  }
+#endif
+}
+
+/**
+ * Replaces the child tracked at `position`: old_child is dropped from the
+ * tracking map and new_child is registered at the same position.
+ *
+ * Debug builds verify that old_child was tracked there before and that
+ * new_child is tracked there afterwards.
+ */
+void InternalNode::replace_track(
+    const search_position_t& position, Ref<Node> new_child, Ref<Node> old_child) {
+  // Side-effect-free lookup for the assertions: operator[] would insert a
+  // null entry for a missing key from inside assert().
+  [[maybe_unused]] auto tracked_at =
+      [this](const search_position_t& pos) -> Node* {
+    auto found = tracked_child_nodes.find(pos);
+    return found == tracked_child_nodes.end() ? nullptr : found->second;
+  };
+  assert(tracked_at(position) == old_child);
+  tracked_child_nodes.erase(position);
+  new_child->as_child(position, this);
+  assert(tracked_at(position) == new_child);
+}
+
+/**
+ * Hands every child tracked at or after split_pos over to right_node. Each
+ * position is rebased by subtracting split_pos, and the entries are then
+ * removed from this node's tracking map.
+ */
+void InternalNode::track_split(
+    const search_position_t& split_pos, Ref<InternalNode> right_node) {
+  const auto moved_begin = tracked_child_nodes.lower_bound(split_pos);
+  const auto moved_end = tracked_child_nodes.end();
+  for (auto it = moved_begin; it != moved_end; ++it) {
+    auto pos_in_right = it->first;
+    pos_in_right -= split_pos;
+    it->second->as_child<false>(pos_in_right, right_node);
+  }
+  tracked_child_nodes.erase(moved_begin, moved_end);
+}
+
+/**
+ * Debug-only consistency check between this node and one of its children:
+ * level, parent pointer, stored child address, level-tail flags, index key
+ * and field type. Compiles to nothing when NDEBUG is defined.
+ */
+void InternalNode::validate_child(const Node& child) const {
+#ifndef NDEBUG
+  const auto& link = child.parent_info();
+  assert(impl->level() - 1 == child.impl->level());
+  assert(this == link.ptr);
+  auto& child_pos = link.position;
+  assert(impl->get_p_value(child_pos)->value == child.impl->laddr());
+  if (!child_pos.is_end()) {
+    // A non-tail child is indexed by its own largest key.
+    assert(!child.impl->is_level_tail());
+    assert(impl->get_key_view(child_pos) == child.impl->get_largest_key_view());
+  } else {
+    // Only a level-tail parent has a child at the end position.
+    assert(impl->is_level_tail());
+    assert(child.impl->is_level_tail());
+  }
+  // XXX(multi-type)
+  assert(impl->field_type() <= child.impl->field_type());
+#endif
+}
+
+/**
+ * Allocates a fresh internal node layout and wraps it into an InternalNode.
+ * Returns the node together with the mutable handle of its extent.
+ */
+node_future<InternalNode::fresh_node_t> InternalNode::allocate(
+    context_t c, field_type_t field_type, bool is_level_tail, level_t level) {
+  return InternalNodeImpl::allocate(c, field_type, is_level_tail, level
+  ).safe_then([](auto&& fresh_impl) {
+    // Read the raw pointer before handing over ownership: the evaluation
+    // order of constructor arguments is unspecified, so taking it in the
+    // same argument list as the hand-off would be fragile if that hand-off
+    // involves a converting move.
+    auto* p_impl = fresh_impl.impl.get();
+    auto node = Ref<InternalNode>(new InternalNode(
+          p_impl, std::move(fresh_impl.impl)));
+    return fresh_node_t{node, fresh_impl.mut};
+  });
+}
+
+/*
+ * LeafNode
+ */
+
+LeafNode::LeafNode(LeafNodeImpl* impl, NodeImplURef&& impl_ref) // impl_ref is handed to the Node base; impl is stored as the typed raw pointer
+ : Node(std::move(impl_ref)), impl{impl} {}
+
+bool LeafNode::is_level_tail() const { // forwards the level-tail flag of the leaf layout implementation
+ return impl->is_level_tail();
+}
+
+/// Returns the key view and value pointer stored at `pos`, together with the
+/// current layout version of this node.
+std::tuple<key_view_t, const onode_t*, layout_version_t> LeafNode::get_kv(
+    const search_position_t& pos) const {
+  key_view_t key_at_pos;
+  const onode_t* value_at_pos = impl->get_p_value(pos, &key_at_pos);
+  return {key_at_pos, value_at_pos, layout_version};
+}
+
+/// Returns a cursor to the first slot of this leaf. An empty leaf (which
+/// must be the root) yields a cursor constructed without a position.
+node_future<Ref<tree_cursor_t>>
+LeafNode::lookup_smallest(context_t) {
+  if (unlikely(impl->is_empty())) {
+    assert(is_root());
+    return node_ertr::make_ready_future<Ref<tree_cursor_t>>(
+        new tree_cursor_t(this));
+  }
+  const auto first_pos = search_position_t::begin();
+  key_view_t first_key;
+  const onode_t* first_value = impl->get_p_value(first_pos, &first_key);
+  return node_ertr::make_ready_future<Ref<tree_cursor_t>>(
+      get_or_track_cursor(first_pos, first_key, first_value));
+}
+
+/// Returns a cursor to the largest slot of this leaf. An empty leaf (which
+/// must be the root) yields a cursor constructed without a position.
+node_future<Ref<tree_cursor_t>>
+LeafNode::lookup_largest(context_t) {
+  if (unlikely(impl->is_empty())) {
+    assert(is_root());
+    return node_ertr::make_ready_future<Ref<tree_cursor_t>>(
+        new tree_cursor_t(this));
+  }
+  // get_largest_slot() fills in all three outputs.
+  search_position_t last_pos;
+  key_view_t last_key;
+  const onode_t* last_value = nullptr;
+  impl->get_largest_slot(last_pos, last_key, &last_value);
+  return node_ertr::make_ready_future<Ref<tree_cursor_t>>(
+      get_or_track_cursor(last_pos, last_key, last_value));
+}
+
+/**
+ * Searches this leaf for `key`. When the search ends at the end position, a
+ * cursor constructed without a position is returned; otherwise the cursor of
+ * the found slot is returned. The match status of the search is passed along
+ * in both cases.
+ */
+node_future<Node::search_result_t>
+LeafNode::lower_bound_tracked(
+    context_t c, const key_hobj_t& key, MatchHistory& history) {
+  key_view_t index_key;
+  auto in_leaf = impl->lower_bound(key, history, &index_key);
+  if (in_leaf.position.is_end()) {
+    assert(!in_leaf.p_value);
+    Ref<tree_cursor_t> end_cursor = new tree_cursor_t(this);
+    return node_ertr::make_ready_future<search_result_t>(
+        search_result_t{end_cursor, in_leaf.mstat});
+  }
+  Ref<tree_cursor_t> slot_cursor =
+      get_or_track_cursor(in_leaf.position, index_key, in_leaf.p_value);
+  return node_ertr::make_ready_future<search_result_t>(
+      search_result_t{slot_cursor, in_leaf.mstat});
+}
+
+/// Adds this leaf's statistics to the *_leaf counters of `stats`.
+node_future<> LeafNode::do_get_tree_stats(context_t, tree_stats_t& stats) {
+  auto own = impl->get_stats();
+  stats.size_persistent_leaf += own.size_persistent;
+  stats.size_filled_leaf += own.size_filled;
+  stats.size_logical_leaf += own.size_logical;
+  stats.size_overhead_leaf += own.size_overhead;
+  stats.size_value_leaf += own.size_value;
+  stats.num_kvs_leaf += own.num_kvs;
+  stats.num_nodes_leaf += 1;
+  // A leaf has no children to descend into.
+  return node_ertr::now();
+}
+
+node_future<> LeafNode::test_clone_root(
+ context_t c_other, RootNodeTracker& tracker_other) const { /*
+ * Unit-test helper: clones this root leaf into the tree identified by c_other
+ * and tracker_other. A fresh level-tail N0 leaf is allocated through c_other,
+ * this node's content is copied into it with test_copy_to(), and it becomes
+ * the root of the super obtained from tracker_other.
+ *
+ * tracker_other is captured by reference and must outlive the returned future.
+ */
+ assert(is_root());
+ assert(impl->is_level_tail());
+ assert(impl->field_type() == field_type_t::N0);
+ Ref<const LeafNode> this_ref = this;
+ return LeafNode::allocate(c_other, field_type_t::N0, true
+ ).safe_then([this, c_other, &tracker_other](auto fresh_other) {
+ impl->test_copy_to(fresh_other.mut);
+ auto cloned_root = fresh_other.node;
+ return c_other.nm.get_super(c_other.t, tracker_other
+ ).safe_then([c_other, cloned_root](auto&& super_other) {
+ cloned_root->make_root_new(c_other, std::move(super_other));
+ });
+ }).safe_then([this_ref]{}); // empty continuation: it only holds this_ref until the chain above has run
+}
+
+node_future<Ref<tree_cursor_t>> LeafNode::insert_value(
+ context_t c, const key_hobj_t& key, const onode_t& value,
+ const search_position_t& pos, const MatchHistory& history,
+ match_stat_t mstat) { /*
+ * Inserts key/value into this leaf. `pos`, `history` and `mstat` are fed to
+ * evaluate_insert(), which yields the final insert position, stage and size.
+ *
+ * - If the leaf has enough free space, the pair is inserted in place and a
+ * cursor for the new slot is returned.
+ * - Otherwise the root is upgraded first (if this is the root), a right
+ * sibling is allocated, split_insert() distributes the content, the
+ * tracked cursors are moved accordingly and the split is propagated to
+ * the parent through insert_parent().
+ *
+ * NOTE(review): the split path captures `key` and `value` by reference in a
+ * continuation; presumably the caller keeps both alive until the returned
+ * future resolves -- confirm against the caller.
+ */
+#ifndef NDEBUG
+ if (pos.is_end()) {
+ assert(impl->is_level_tail());
+ }
+#endif
+ logger().debug("OTree::Leaf::Insert: "
+ "pos({}), {}, {}, {}, mstat({}) ...",
+ pos, key, value, history, mstat);
+ search_position_t insert_pos = pos; // evaluate_insert() receives this copy, `pos` itself stays untouched
+ auto [insert_stage, insert_size] = impl->evaluate_insert(
+ key, value, history, mstat, insert_pos);
+ auto free_size = impl->free_size();
+ if (free_size >= insert_size) {
+ // insert
+ on_layout_change(); // bumps layout_version before the layout is modified
+ impl->prepare_mutate(c);
+ auto p_value = impl->insert(key, value, insert_pos, insert_stage, insert_size);
+ assert(impl->free_size() == free_size - insert_size);
+ assert(insert_pos <= pos);
+ assert(p_value->size == value.size);
+ auto ret = track_insert(insert_pos, insert_stage, p_value);
+ validate_tracked_cursors();
+ return node_ertr::make_ready_future<Ref<tree_cursor_t>>(ret);
+ }
+ // split and insert
+ Ref<LeafNode> this_ref = this; // captured below so this node stays referenced while the continuations run
+ return (is_root() ? upgrade_root(c) : node_ertr::now()
+ ).safe_then([this, c] {
+ return LeafNode::allocate(c, impl->field_type(), impl->is_level_tail());
+ }).safe_then([this_ref, this, c, &key, &value,
+ insert_pos, insert_stage=insert_stage, insert_size=insert_size](auto fresh_right) mutable {
+ auto right_node = fresh_right.node;
+ // no need to bump version for right node, as it is fresh
+ on_layout_change();
+ impl->prepare_mutate(c);
+ auto [split_pos, is_insert_left, p_value] = impl->split_insert(
+ fresh_right.mut, *right_node->impl, key, value,
+ insert_pos, insert_stage, insert_size);
+ assert(p_value->size == value.size);
+ track_split(split_pos, right_node); // cursors tracked at or after split_pos are handed to right_node
+ Ref<tree_cursor_t> ret;
+ if (is_insert_left) {
+ ret = track_insert(insert_pos, insert_stage, p_value);
+ } else {
+ ret = right_node->track_insert(insert_pos, insert_stage, p_value);
+ }
+ validate_tracked_cursors();
+ right_node->validate_tracked_cursors();
+
+ // propagate insert to parent
+ return insert_parent(c, right_node).safe_then([ret] {
+ return ret;
+ });
+ // TODO (optimize)
+ // try to acquire space from siblings before split... see btrfs
+ });
+}
+
+/**
+ * Allocates a level-tail N0 leaf and installs it as the root of the super
+ * obtained from root_tracker. root_tracker is captured by reference and must
+ * outlive the returned future.
+ */
+node_future<Ref<LeafNode>> LeafNode::allocate_root(
+    context_t c, RootNodeTracker& root_tracker) {
+  return LeafNode::allocate(c, field_type_t::N0, true
+  ).safe_then([c, &root_tracker](auto fresh_node) {
+    Ref<LeafNode> new_root = fresh_node.node;
+    return c.nm.get_super(c.t, root_tracker
+    ).safe_then([c, new_root](auto&& super) {
+      // make_root_new() asserts that the super has no root address yet.
+      new_root->make_root_new(c, std::move(super));
+      return new_root;
+    });
+  });
+}
+
+/**
+ * Returns the cursor for the slot at `position`. A cursor already tracked
+ * there is reused after refreshing its cached key/value with the current
+ * layout version; otherwise a new cursor is created.
+ */
+Ref<tree_cursor_t> LeafNode::get_or_track_cursor(
+    const search_position_t& position,
+    const key_view_t& key, const onode_t* p_value) {
+  assert(!position.is_end());
+  assert(p_value);
+  auto tracked = tracked_cursors.find(position);
+  if (tracked == tracked_cursors.end()) {
+    return Ref<tree_cursor_t>(
+        new tree_cursor_t(this, position, key, p_value, layout_version));
+  }
+  Ref<tree_cursor_t> existing = tracked->second;
+  assert(existing->get_leaf_node() == this);
+  assert(existing->get_position() == position);
+  existing->update_kv(key, p_value, layout_version);
+  return existing;
+}
+
+/**
+ * Debug-only check that `cursor` belongs to this leaf, is not an end cursor,
+ * and that its key view and value pointer match what the layout holds at its
+ * position. Compiles to nothing when NDEBUG is defined.
+ */
+void LeafNode::validate_cursor(tree_cursor_t& cursor) const {
+#ifndef NDEBUG
+  assert(this == cursor.get_leaf_node().get());
+  assert(!cursor.is_end());
+  auto stored = get_kv(cursor.get_position());
+  assert(std::get<0>(stored) == cursor.get_key_view());
+  assert(std::get<1>(stored) == cursor.get_p_value());
+#endif
+}
+
+/**
+ * Updates the cursor tracking after a slot has been inserted at insert_pos
+ * (stage insert_stage): tracked cursors from insert_pos up to the upper bound
+ * of the insert stage get their index at that stage incremented. Returns a
+ * new cursor for the inserted slot.
+ */
+Ref<tree_cursor_t> LeafNode::track_insert(
+    const search_position_t& insert_pos, match_stage_t insert_stage,
+    const onode_t* p_onode) {
+  // update cursor position
+  auto upper_bound_pos = insert_pos;
+  upper_bound_pos.index_by_stage(insert_stage) = INDEX_UPPER_BOUND;
+  const auto shift_begin = tracked_cursors.lower_bound(insert_pos);
+  const auto shift_end = tracked_cursors.lower_bound(upper_bound_pos);
+  // Collect first: re-tracking a cursor re-inserts it into the map.
+  std::vector<tree_cursor_t*> shifted;
+  for (auto it = shift_begin; it != shift_end; ++it) {
+    shifted.push_back(it->second);
+  }
+  tracked_cursors.erase(shift_begin, shift_end);
+  for (tree_cursor_t* p_cursor : shifted) {
+    auto shifted_pos = p_cursor->get_position();
+    ++shifted_pos.index_by_stage(insert_stage);
+    p_cursor->update_track<true>(this, shifted_pos);
+  }
+
+  // track insert
+  // TODO: getting key_view_t from stage::proceed_insert() and
+  // stage::append_insert() has not supported yet
+  return new tree_cursor_t(this, insert_pos);
+}
+
+/**
+ * Hands every cursor tracked at or after split_pos over to right_node. Each
+ * position is rebased by subtracting split_pos, and the entries are then
+ * removed from this leaf's tracking map.
+ */
+void LeafNode::track_split(
+    const search_position_t& split_pos, Ref<LeafNode> right_node) {
+  // update cursor ownership and position
+  const auto moved_begin = tracked_cursors.lower_bound(split_pos);
+  const auto moved_end = tracked_cursors.end();
+  for (auto it = moved_begin; it != moved_end; ++it) {
+    auto pos_in_right = it->first;
+    pos_in_right -= split_pos;
+    it->second->update_track<false>(right_node, pos_in_right);
+  }
+  tracked_cursors.erase(moved_begin, moved_end);
+}
+
+/**
+ * Allocates a fresh leaf node layout and wraps it into a LeafNode.
+ * Returns the node together with the mutable handle of its extent.
+ */
+node_future<LeafNode::fresh_node_t> LeafNode::allocate(
+    context_t c, field_type_t field_type, bool is_level_tail) {
+  return LeafNodeImpl::allocate(c, field_type, is_level_tail
+  ).safe_then([](auto&& fresh_impl) {
+    // Read the raw pointer before handing over ownership: the evaluation
+    // order of constructor arguments is unspecified, so taking it in the
+    // same argument list as the hand-off would be fragile if that hand-off
+    // involves a converting move.
+    auto* p_impl = fresh_impl.impl.get();
+    auto node = Ref<LeafNode>(new LeafNode(
+          p_impl, std::move(fresh_impl.impl)));
+    return fresh_node_t{node, fresh_impl.mut};
+  });
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node.h
new file mode 100644
index 000000000..d6af489e7
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node.h
@@ -0,0 +1,476 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <ostream>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include "crimson/common/type_helpers.h"
+
+#include "node_extent_mutable.h"
+#include "stages/key_layout.h"
+#include "stages/stage_types.h"
+#include "super.h"
+#include "tree_types.h"
+
+/**
+ * Tree example (2 levels):
+ *
+ * Root node keys: [ 3 7 ]
+ * values: [p1 p2 p3]
+ * / | \
+ * ------- | -------
+ * | | |
+ * V V V
+ * Leaf node keys: [ 1 2 3] [ 4 5 7] [ 9 11 12]
+ * values: [v1 v2 v3] [v4 v5 v6] [v7 v8 v9]
+ *
+ * Tree structure properties:
+ * - As illustrated above, the parent key is strictly equal to its left child's
+ * largest key;
+ * - If a tree is indexing multiple seastore transactions, each transaction
+ * will be mapped to a Super which points to a distinct root node. So the
+ * transactions are isolated at tree level. However, tree nodes from
+ * different transactions can reference the same seastore CachedExtent before
+ * modification;
+ * - The resources of the transactional tree are tracked by tree_cursor_ts held
+ * by users. As long as any cursor is alive, the corresponding tree hierarchy
+ * stays alive and tracked. See the reversed resource management sections
+ * below;
+ */
+
+namespace crimson::os::seastore::onode {
+
+class LeafNode;
+class InternalNode;
+
+/**
+ * tree_cursor_t
+ *
+ * A cursor points to a position (LeafNode and search_position_t) of the tree
+ * where it can find the corresponding key-value pair. The position is updated
+ * by LeafNode insert/split/delete/merge internally and is kept valid. It also
+ * caches the key-value information for a specific node layout version.
+ *
+ * Exposes public interfaces for Btree::Cursor.
+ */
+using layout_version_t = uint32_t;
+class tree_cursor_t final
+ : public boost::intrusive_ref_counter<
+ tree_cursor_t, boost::thread_unsafe_counter> {
+ public:
+ // public to Btree
+ ~tree_cursor_t();
+ tree_cursor_t(const tree_cursor_t&) = delete;
+ tree_cursor_t(tree_cursor_t&&) = delete;
+ tree_cursor_t& operator=(const tree_cursor_t&) = delete;
+ tree_cursor_t& operator=(tree_cursor_t&&) = delete;
+
+ /**
+ * is_end
+ *
+ * Represents one-past-the-last of all the sorted key-value
+ * pairs in the tree. An end cursor won't contain valid key-value
+ * information.
+ */
+ bool is_end() const { return position.is_end(); }
+
+ /// Returns the key view in tree if it is not an end cursor.
+ const key_view_t& get_key_view() const;
+
+ /// Returns the value pointer in tree if it is not an end cursor.
+ const onode_t* get_p_value() const;
+
+ private:
+ tree_cursor_t(Ref<LeafNode>, const search_position_t&);
+ tree_cursor_t(Ref<LeafNode>, const search_position_t&,
+ const key_view_t& key, const onode_t*, layout_version_t);
+ // lookup reaches the end, contains the leaf node for further insert
+ tree_cursor_t(Ref<LeafNode>);
+ const search_position_t& get_position() const { return position; }
+ Ref<LeafNode> get_leaf_node() { return leaf_node; }
+ template <bool VALIDATE>
+ void update_track(Ref<LeafNode>, const search_position_t&);
+ void update_kv(const key_view_t&, const onode_t*, layout_version_t) const;
+ void ensure_kv() const;
+
+ private:
+ /**
+ * Reversed resource management (tree_cursor_t)
+ *
+ * tree_cursor_t holds a reference to the LeafNode, so the LeafNode will be
+ * alive as long as any of its cursors is still referenced by user.
+ */
+ Ref<LeafNode> leaf_node;
+ search_position_t position;
+
+ // cached information (mutable so that const methods such as update_kv() can refresh it)
+ mutable std::optional<key_view_t> key_view;
+ mutable const onode_t* p_value;
+ mutable layout_version_t node_version; // presumably the layout version the cached key/value belong to -- confirm in node.cc
+
+ friend class LeafNode;
+ friend class Node; // get_position(), get_leaf_node()
+};
+
+/**
+ * Node
+ *
+ * An abstracted class for both InternalNode and LeafNode.
+ *
+ * Exposes public interfaces for Btree.
+ */
+class Node
+ : public boost::intrusive_ref_counter<
+ Node, boost::thread_unsafe_counter> {
+ public:
+ // public to Btree
+ using node_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg,
+ crimson::ct_error::enoent,
+ crimson::ct_error::erange>;
+ template <class ValueT=void>
+ using node_future = node_ertr::future<ValueT>;
+
+ struct search_result_t {
+ bool is_end() const { return p_cursor->is_end(); }
+ Ref<tree_cursor_t> p_cursor;
+ match_stat_t mstat;
+
+ MatchKindBS match() const { // collapses mstat into EQ (exact match) or NE (anything else)
+ assert(mstat >= MSTAT_MIN && mstat <= MSTAT_MAX);
+ return (mstat == MSTAT_EQ ? MatchKindBS::EQ : MatchKindBS::NE);
+ }
+ };
+
+ virtual ~Node();
+ Node(const Node&) = delete;
+ Node(Node&&) = delete;
+ Node& operator=(const Node&) = delete;
+ Node& operator=(Node&&) = delete;
+
+ /**
+ * level
+ *
+ * Denotes the level (or height) of this node in the tree.
+ * 0 means LeafNode, positive means InternalNode.
+ */
+ level_t level() const;
+
+ /**
+ * lookup_smallest
+ *
+ * Returns a cursor pointing to the smallest key in the sub-tree formed by
+ * this node.
+ *
+ * Returns an end cursor if it is an empty root node.
+ */
+ virtual node_future<Ref<tree_cursor_t>> lookup_smallest(context_t) = 0;
+
+ /**
+ * lookup_largest
+ *
+ * Returns a cursor pointing to the largest key in the sub-tree formed by
+ * this node.
+ *
+ * Returns an end cursor if it is an empty root node.
+ */
+ virtual node_future<Ref<tree_cursor_t>> lookup_largest(context_t) = 0;
+
+ /**
+ * lower_bound
+ *
+ * Returns a cursor pointing to the first element in the range [first, last)
+ * of the sub-tree which does not compare less than the input key. The
+ * result also denotes whether the pointed key is equal to the input key.
+ *
+ * Returns an end cursor with MatchKindBS::NE if:
+ * - It is an empty root node;
+ * - Or the input key is larger than all the keys in the sub-tree;
+ */
+ node_future<search_result_t> lower_bound(context_t c, const key_hobj_t& key);
+
+ /**
+ * insert
+ *
+ * Try to insert a key-value pair into the sub-tree formed by this node.
+ *
+ * Returns a boolean denoting whether the insertion is successful:
+ * - If true, the returned cursor points to the inserted element in tree;
+ * - If false, the returned cursor points to the conflicting element in tree;
+ */
+ node_future<std::pair<Ref<tree_cursor_t>, bool>> insert(
+ context_t, const key_hobj_t&, const onode_t&);
+
+ /// Recursively collects the statistics of the sub-tree formed by this node
+ node_future<tree_stats_t> get_tree_stats(context_t);
+
+ /// Returns an ostream containing a dump of all the elements in the node.
+ std::ostream& dump(std::ostream&) const;
+
+ /// Returns an ostream containing a one-line summary of this node.
+ std::ostream& dump_brief(std::ostream&) const;
+
+ /// Initializes the tree by allocating an empty root node.
+ static node_future<> mkfs(context_t, RootNodeTracker&);
+
+ /// Loads the tree root. The tree must be initialized.
+ static node_future<Ref<Node>> load_root(context_t, RootNodeTracker&);
+
+ // Only for unit test purposes.
+ void test_make_destructable(context_t, NodeExtentMutable&, Super::URef&&);
+ virtual node_future<> test_clone_root(context_t, RootNodeTracker&) const = 0;
+
+ protected:
+ virtual node_future<> test_clone_non_root(context_t, Ref<InternalNode>) const {
+ ceph_abort("impossible path");
+ }
+ virtual node_future<search_result_t> lower_bound_tracked(
+ context_t, const key_hobj_t&, MatchHistory&) = 0;
+ virtual node_future<> do_get_tree_stats(context_t, tree_stats_t&) = 0;
+
+ protected:
+ Node(NodeImplURef&&);
+ bool is_root() const { // a node is root iff it has no parent info; it must then hold a super
+ assert((super && !_parent_info.has_value()) ||
+ (!super && _parent_info.has_value()));
+ return !_parent_info.has_value();
+ }
+
+ // as root
+ void make_root(context_t c, Super::URef&& _super);
+ void make_root_new(context_t c, Super::URef&& _super) { // for a super that has no root address yet
+ assert(_super->get_root_laddr() == L_ADDR_NULL);
+ make_root(c, std::move(_super));
+ }
+ void make_root_from(context_t c, Super::URef&& _super, laddr_t from_addr) { // for a super whose current root address is from_addr
+ assert(_super->get_root_laddr() == from_addr);
+ make_root(c, std::move(_super));
+ }
+ void as_root(Super::URef&& _super);
+ node_future<> upgrade_root(context_t);
+
+ // as child/non-root
+ template <bool VALIDATE = true>
+ void as_child(const search_position_t&, Ref<InternalNode>);
+ struct parent_info_t {
+ search_position_t position;
+ Ref<InternalNode> ptr;
+ };
+ const parent_info_t& parent_info() const { return *_parent_info; } // only meaningful for a non-root node (dereferences the optional unchecked)
+ node_future<> insert_parent(context_t, Ref<Node> right_node);
+
+ private:
+ /**
+ * Reversed resource management (Node)
+ *
+ * Root Node holds a reference to its parent Super class, so its parent
+ * will be alive as long as this root node is alive.
+ *
+ * Non-root Node holds a reference to its parent Node, so its parent will
+ * be alive as long as any of its children is alive.
+ */
+ // as root
+ Super::URef super;
+ // as child/non-root
+ std::optional<parent_info_t> _parent_info;
+
+ private:
+ static node_future<Ref<Node>> load(context_t, laddr_t, bool expect_is_level_tail);
+
+ NodeImplURef impl;
+ friend class InternalNode;
+};
+inline std::ostream& operator<<(std::ostream& os, const Node& node) { // streams the brief form produced by Node::dump_brief()
+ return node.dump_brief(os);
+}
+
+/**
+ * InternalNode
+ *
+ * A concrete implementation of Node class that represents an internal tree
+ * node. Its level is always positive and its values are logical block
+ * addresses to its child nodes. An internal node cannot be empty.
+ */
+class InternalNode final : public Node {
+ public:
+ // public to Node
+ InternalNode(InternalNodeImpl*, NodeImplURef&&);
+ ~InternalNode() override { assert(tracked_child_nodes.empty()); } // no child may still be tracked at destruction
+ InternalNode(const InternalNode&) = delete;
+ InternalNode(InternalNode&&) = delete;
+ InternalNode& operator=(const InternalNode&) = delete;
+ InternalNode& operator=(InternalNode&&) = delete;
+
+ node_future<> apply_child_split(
+ context_t, const search_position_t&, Ref<Node> left, Ref<Node> right);
+ template <bool VALIDATE>
+ void do_track_child(Node& child) { // registers child under the position stored in its parent info; the slot must be free
+ if constexpr (VALIDATE) {
+ validate_child(child);
+ }
+ auto& child_pos = child.parent_info().position;
+ assert(tracked_child_nodes.find(child_pos) == tracked_child_nodes.end());
+ tracked_child_nodes[child_pos] = &child;
+ }
+ void do_untrack_child(const Node& child) { // removes child from the tracking map; it must be tracked at its position
+ auto& child_pos = child.parent_info().position;
+ assert(tracked_child_nodes.find(child_pos)->second == &child);
+ [[maybe_unused]] auto removed = tracked_child_nodes.erase(child_pos);
+ assert(removed);
+ }
+
+ static node_future<Ref<InternalNode>> allocate_root(
+ context_t, level_t, laddr_t, Super::URef&&);
+
+ protected:
+ node_future<Ref<tree_cursor_t>> lookup_smallest(context_t) override;
+ node_future<Ref<tree_cursor_t>> lookup_largest(context_t) override;
+ node_future<search_result_t> lower_bound_tracked(
+ context_t, const key_hobj_t&, MatchHistory&) override;
+ node_future<> do_get_tree_stats(context_t, tree_stats_t&) override;
+
+ node_future<> test_clone_root(context_t, RootNodeTracker&) const override;
+
+ private:
+ // XXX: extract a common tracker for InternalNode to track Node,
+ // and LeafNode to track tree_cursor_t.
+ node_future<Ref<Node>> get_or_track_child(context_t, const search_position_t&, laddr_t);
+ void track_insert(
+ const search_position_t&, match_stage_t, Ref<Node>, Ref<Node> nxt_child = nullptr);
+ void replace_track(const search_position_t&, Ref<Node> new_child, Ref<Node> old_child);
+ void track_split(const search_position_t&, Ref<InternalNode>);
+ void validate_tracked_children() const { // debug-only: every tracked child must agree with its map key and pass validate_child()
+#ifndef NDEBUG
+ for (auto& kv : tracked_child_nodes) {
+ assert(kv.first == kv.second->parent_info().position);
+ validate_child(*kv.second);
+ }
+#endif
+ }
+ void validate_child(const Node& child) const;
+
+ struct fresh_node_t {
+ Ref<InternalNode> node;
+ NodeExtentMutable mut;
+ std::pair<Ref<Node>, NodeExtentMutable> make_pair() {
+ return std::make_pair(Ref<Node>(node), mut);
+ }
+ };
+ static node_future<fresh_node_t> allocate(context_t, field_type_t, bool, level_t);
+
+ private:
+ /**
+ * Reversed resource management (InternalNode)
+ *
+ * InternalNode keeps track of its child nodes which are still alive in
+ * memory, and their positions will be updated throughout
+ * insert/split/delete/merge operations of this node.
+ */
+ // XXX: leverage intrusive data structure to control memory overhead
+ std::map<search_position_t, Node*> tracked_child_nodes;
+ InternalNodeImpl* impl;
+};
+
+/**
+ * LeafNode
+ *
+ * A concrete implementation of Node class that represents a leaf tree node.
+ * Its level is always 0. A leaf node can only be empty if it is root.
+ */
+class LeafNode final : public Node {
+ public:
+ // public to tree_cursor_t
+ ~LeafNode() override { assert(tracked_cursors.empty()); } // no cursor may still be tracked at destruction
+ LeafNode(const LeafNode&) = delete;
+ LeafNode(LeafNode&&) = delete;
+ LeafNode& operator=(const LeafNode&) = delete;
+ LeafNode& operator=(LeafNode&&) = delete;
+
+ bool is_level_tail() const;
+ layout_version_t get_layout_version() const { return layout_version; }
+ std::tuple<key_view_t, const onode_t*, layout_version_t> get_kv(
+ const search_position_t&) const;
+ template <bool VALIDATE>
+ void do_track_cursor(tree_cursor_t& cursor) { // registers cursor under its own position; the slot must be free
+ if constexpr (VALIDATE) {
+ validate_cursor(cursor);
+ }
+ auto& cursor_pos = cursor.get_position();
+ assert(tracked_cursors.find(cursor_pos) == tracked_cursors.end());
+ tracked_cursors[cursor_pos] = &cursor;
+ }
+ void do_untrack_cursor(tree_cursor_t& cursor) { // removes cursor from the tracking map; it must be tracked at its position
+ validate_cursor(cursor);
+ auto& cursor_pos = cursor.get_position();
+ assert(tracked_cursors.find(cursor_pos)->second == &cursor);
+ [[maybe_unused]] auto removed = tracked_cursors.erase(cursor_pos);
+ assert(removed);
+ }
+
+ protected:
+ node_future<Ref<tree_cursor_t>> lookup_smallest(context_t) override;
+ node_future<Ref<tree_cursor_t>> lookup_largest(context_t) override;
+ node_future<search_result_t> lower_bound_tracked(
+ context_t, const key_hobj_t&, MatchHistory&) override;
+ node_future<> do_get_tree_stats(context_t, tree_stats_t&) override;
+
+ node_future<> test_clone_root(context_t, RootNodeTracker&) const override;
+
+ private:
+ LeafNode(LeafNodeImpl*, NodeImplURef&&);
+ node_future<Ref<tree_cursor_t>> insert_value(
+ context_t, const key_hobj_t&, const onode_t&,
+ const search_position_t&, const MatchHistory&,
+ match_stat_t mstat);
+ static node_future<Ref<LeafNode>> allocate_root(context_t, RootNodeTracker&);
+ friend class Node;
+
+ private:
+ // XXX: extract a common tracker for InternalNode to track Node,
+ // and LeafNode to track tree_cursor_t.
+ Ref<tree_cursor_t> get_or_track_cursor(
+ const search_position_t&, const key_view_t&, const onode_t*);
+ Ref<tree_cursor_t> track_insert(
+ const search_position_t&, match_stage_t, const onode_t*);
+ void track_split(const search_position_t&, Ref<LeafNode>);
+ void validate_tracked_cursors() const { // debug-only: every tracked cursor must agree with its map key and pass validate_cursor()
+#ifndef NDEBUG
+ for (auto& kv : tracked_cursors) {
+ assert(kv.first == kv.second->get_position());
+ validate_cursor(*kv.second);
+ }
+#endif
+ }
+ void validate_cursor(tree_cursor_t& cursor) const;
+ // invalidate p_value pointers in tree_cursor_t
+ void on_layout_change() { ++layout_version; }
+
+ struct fresh_node_t {
+ Ref<LeafNode> node;
+ NodeExtentMutable mut;
+ std::pair<Ref<Node>, NodeExtentMutable> make_pair() {
+ return std::make_pair(Ref<Node>(node), mut);
+ }
+ };
+ static node_future<fresh_node_t> allocate(context_t, field_type_t, bool);
+
+ private:
+ /**
+ * Reversed resource management (LeafNode)
+ *
+ * LeafNode keeps track of the referencing cursors which are still alive in
+ * memory, and their positions will be updated throughout
+ * insert/split/delete/merge operations of this node.
+ */
+ // XXX: leverage intrusive data structure to control memory overhead
+ std::map<search_position_t, tree_cursor_t*> tracked_cursors;
+ LeafNodeImpl* impl;
+ layout_version_t layout_version = 0; // incremented by on_layout_change()
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h
new file mode 100644
index 000000000..d08a99015
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "include/buffer.h"
+#include "node_types.h"
+
+namespace crimson::os::seastore::onode {
+
+/**
+ * DeltaRecorder
+ *
+ * An abstract class that encapsulates the different implementations for
+ * applying a delta to a specific node layout.
+ */
+class DeltaRecorder {
+ public:
+  // A recorder must not be destroyed while it still holds encoded delta.
+  virtual ~DeltaRecorder() { assert(is_empty()); }
+
+  // True when no delta bytes are currently held in `encoded`.
+  bool is_empty() const { return !encoded.length(); }
+
+  // Moves the accumulated delta out to the caller.
+  // Requires that some delta has been encoded.
+  ceph::bufferlist get_delta() {
+    assert(!is_empty());
+    return std::move(encoded);
+  }
+
+  // Node type / field type of the layout this recorder is specialized for.
+  virtual node_type_t node_type() const = 0;
+  virtual field_type_t field_type() const = 0;
+
+  // Decode from the iterator and apply the result to the given node.
+  virtual void apply_delta(
+      ceph::bufferlist::const_iterator&, NodeExtentMutable&) = 0;
+
+ protected:
+  // Only constructible through concrete subclasses.
+  DeltaRecorder() = default;
+
+  // Buffer that the encode_*() methods of subclasses append to.
+  ceph::bufferlist encoded;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h
new file mode 100644
index 000000000..94782f50d
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h
@@ -0,0 +1,413 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/common/log.h"
+#include "node_extent_manager.h"
+#include "node_delta_recorder.h"
+#include "node_layout_replayable.h"
+
+#ifndef NDEBUG
+#include "node_extent_manager/test_replay.h"
+#endif
+
+namespace crimson::os::seastore::onode {
+
+/**
+ * DeltaRecorderT
+ *
+ * Responsible for encoding and decoding deltas, and for applying a delta to
+ * a specific node layout.
+ */
+template <typename FieldType, node_type_t NODE_TYPE>
+class DeltaRecorderT final: public DeltaRecorder {
+ enum class op_t : uint8_t {
+ INSERT,
+ SPLIT,
+ SPLIT_INSERT,
+ UPDATE_CHILD_ADDR,
+ };
+
+ public:
+ using layout_t = NodeLayoutReplayableT<FieldType, NODE_TYPE>;
+ using node_stage_t = typename layout_t::node_stage_t;
+ using position_t = typename layout_t::position_t;
+ using StagedIterator = typename layout_t::StagedIterator;
+ using value_t = typename layout_t::value_t;
+ static constexpr auto FIELD_TYPE = layout_t::FIELD_TYPE;
+
+ ~DeltaRecorderT() override = default;
+
+  /// Append an INSERT record to the pending delta: the op tag followed by
+  /// key, value, insert position, insert stage and insert size, in that order.
+  template <KeyT KT>
+  void encode_insert(const full_key_t<KT>& key, const value_t& value,
+                     const position_t& insert_pos,
+                     const match_stage_t& insert_stage,
+                     const node_offset_t& insert_size) {
+    auto& bl = encoded;
+    ceph::encode(op_t::INSERT, bl);
+    // payload: what to insert ...
+    encode_key<KT>(key, bl);
+    encode_value(value, bl);
+    // ... and where/how to insert it
+    insert_pos.encode(bl);
+    ceph::encode(insert_stage, bl);
+    ceph::encode(insert_size, bl);
+  }
+
+  /// Append a SPLIT record to the pending delta: the op tag followed by the
+  /// split position, which split_at encodes using p_node_start.
+  void encode_split(const StagedIterator& split_at, const char* p_node_start) {
+    auto& bl = encoded;
+    ceph::encode(op_t::SPLIT, bl);
+    split_at.encode(p_node_start, bl);
+  }
+
+  /// Append a SPLIT_INSERT record to the pending delta: the op tag, the split
+  /// position, then the same fields as an INSERT record (key, value, insert
+  /// position, insert stage, insert size).
+  template <KeyT KT>
+  void encode_split_insert(const StagedIterator& split_at,
+                           const full_key_t<KT>& key, const value_t& value,
+                           const position_t& insert_pos,
+                           const match_stage_t& insert_stage,
+                           const node_offset_t& insert_size,
+                           const char* p_node_start) {
+    auto& bl = encoded;
+    ceph::encode(op_t::SPLIT_INSERT, bl);
+    // split part
+    split_at.encode(p_node_start, bl);
+    // insert part
+    encode_key<KT>(key, bl);
+    encode_value(value, bl);
+    insert_pos.encode(bl);
+    ceph::encode(insert_stage, bl);
+    ceph::encode(insert_size, bl);
+  }
+
+  /// Append an UPDATE_CHILD_ADDR record to the pending delta: the op tag, the
+  /// new address, and the offset of the packed address inside the node.
+  ///
+  /// @param new_addr      the address to be written
+  /// @param p_addr        location of the packed address to update; must
+  ///                      point inside the node starting at p_node_start
+  /// @param p_node_start  start of the node memory
+  void encode_update_child_addr(
+      const laddr_t new_addr,
+      const laddr_packed_t* p_addr,
+      const char* p_node_start) {
+    ceph::encode(op_t::UPDATE_CHILD_ADDR, encoded);
+    ceph::encode(new_addr, encoded);
+    int node_offset = reinterpret_cast<const char*>(p_addr) - p_node_start;
+    // The whole packed address has to fit inside the node block: an offset
+    // equal to NODE_BLOCK_SIZE would already be one past the end.
+    assert(node_offset > 0 &&
+           node_offset + static_cast<int>(sizeof(laddr_packed_t)) <=
+             static_cast<int>(NODE_BLOCK_SIZE));
+    ceph::encode(static_cast<node_offset_t>(node_offset), encoded);
+  }
+
+  /// Factory: the constructor is protected, so instances are created here and
+  /// returned as an owning pointer to the DeltaRecorder base.
+  static DeltaRecorderURef create() {
+    std::unique_ptr<DeltaRecorder> owner(new DeltaRecorderT());
+    return owner;
+  }
+
+ protected:
+ DeltaRecorderT() = default;
+  /// Node type this recorder was instantiated for.
+  node_type_t node_type() const override {
+    return NODE_TYPE;
+  }
+  /// Field type of the layout this recorder was instantiated for.
+  field_type_t field_type() const override {
+    return FIELD_TYPE;
+  }
+ // Decode a single op record from `delta` and replay it onto `node`.
+ //
+ // The op tag is decoded first and selects how the rest of the record is
+ // decoded; the matching layout_t function is then called with the decoded
+ // arguments. An unknown op tag or a buffer decode error is logged and
+ // aborts the process.
+ //
+ // Requires that this recorder holds no encoded delta of its own.
+ void apply_delta(ceph::bufferlist::const_iterator& delta,
+ NodeExtentMutable& node) override {
+ assert(is_empty());
+ // read-only view of the node contents, used by the layout functions
+ node_stage_t stage(reinterpret_cast<const FieldType*>(node.get_read()));
+ op_t op;
+ try {
+ ceph::decode(op, delta);
+ switch (op) {
+ case op_t::INSERT: {
+ logger().debug("OTree::Extent::Replay: decoding INSERT ...");
+ // fields are decoded in the same order encode_insert() wrote them
+ auto key = key_hobj_t::decode(delta);
+
+ // decode_value() returns a pointer into one of these two storages,
+ // so both must outlive p_value
+ std::unique_ptr<char[]> value_storage_heap;
+ value_t value_storage_stack;
+ auto p_value = decode_value(delta, value_storage_heap, value_storage_stack);
+
+ auto insert_pos = position_t::decode(delta);
+ match_stage_t insert_stage;
+ ceph::decode(insert_stage, delta);
+ node_offset_t insert_size;
+ ceph::decode(insert_size, delta);
+ logger().debug("OTree::Extent::Replay: apply {}, {}, "
+ "insert_pos({}), insert_stage={}, insert_size={}B ...",
+ key, *p_value, insert_pos, insert_stage, insert_size);
+ layout_t::template insert<KeyT::HOBJ>(
+ node, stage, key, *p_value, insert_pos, insert_stage, insert_size);
+ break;
+ }
+ case op_t::SPLIT: {
+ logger().debug("OTree::Extent::Replay: decoding SPLIT ...");
+ // the split position is decoded against the start of this node
+ auto split_at = StagedIterator::decode(stage.p_start(), delta);
+ logger().debug("OTree::Extent::Replay: apply split_at={} ...", split_at);
+ layout_t::split(node, stage, split_at);
+ break;
+ }
+ case op_t::SPLIT_INSERT: {
+ logger().debug("OTree::Extent::Replay: decoding SPLIT_INSERT ...");
+ // split position first, then the same fields as an INSERT record
+ auto split_at = StagedIterator::decode(stage.p_start(), delta);
+ auto key = key_hobj_t::decode(delta);
+
+ // decode_value() returns a pointer into one of these two storages,
+ // so both must outlive p_value
+ std::unique_ptr<char[]> value_storage_heap;
+ value_t value_storage_stack;
+ auto p_value = decode_value(delta, value_storage_heap, value_storage_stack);
+
+ auto insert_pos = position_t::decode(delta);
+ match_stage_t insert_stage;
+ ceph::decode(insert_stage, delta);
+ node_offset_t insert_size;
+ ceph::decode(insert_size, delta);
+ logger().debug("OTree::Extent::Replay: apply split_at={}, {}, {}, "
+ "insert_pos({}), insert_stage={}, insert_size={}B ...",
+ split_at, key, *p_value, insert_pos, insert_stage, insert_size);
+ layout_t::template split_insert<KeyT::HOBJ>(
+ node, stage, split_at, key, *p_value, insert_pos, insert_stage, insert_size);
+ break;
+ }
+ case op_t::UPDATE_CHILD_ADDR: {
+ logger().debug("OTree::Extent::Replay: decoding UPDATE_CHILD_ADDR ...");
+ laddr_t new_addr;
+ ceph::decode(new_addr, delta);
+ node_offset_t update_offset;
+ ceph::decode(update_offset, delta);
+ // NOTE(review): update_offset is used as decoded, without a range
+ // check against the node length -- confirm the delta is trusted here.
+ auto p_addr = reinterpret_cast<laddr_packed_t*>(
+ node.get_write() + update_offset);
+ logger().debug("OTree::Extent::Replay: apply {:#x} to offset {:#x} ...",
+ new_addr, update_offset);
+ layout_t::update_child_addr(node, new_addr, p_addr);
+ break;
+ }
+ default:
+ logger().error("OTree::Extent::Replay: got unknown op {} when replay {:#x}",
+ op, node.get_laddr());
+ ceph_abort();
+ }
+ } catch (buffer::error& e) {
+ logger().error("OTree::Extent::Replay: got decode error {} when replay {:#x}",
+ e, node.get_laddr());
+ ceph_abort();
+ }
+ }
+
+ private:
+  /// Append `value` to `encoded`, in the form that matches the node type:
+  /// an onode_t encodes itself (leaf), a laddr_packed_t is written as its
+  /// plain address (internal). Any other value type aborts.
+  static void encode_value(const value_t& value, ceph::bufferlist& encoded) {
+    constexpr bool is_leaf_value = std::is_same_v<value_t, onode_t>;
+    constexpr bool is_internal_value = std::is_same_v<value_t, laddr_packed_t>;
+    if constexpr (is_leaf_value) {
+      // NODE_TYPE == node_type_t::LEAF
+      value.encode(encoded);
+    } else if constexpr (is_internal_value) {
+      // NODE_TYPE == node_type_t::INTERNAL
+      ceph::encode(value.value, encoded);
+    } else {
+      ceph_abort("impossible path");
+    }
+  }
+
+  /// Decode one value from `delta` and return a pointer to it.
+  ///
+  /// The returned pointer refers to caller-provided storage, so the caller
+  /// must keep both storages alive while using it:
+  ///  - onode_t (leaf): storage is allocated into value_storage_heap from the
+  ///    decoded config, and the pointer refers to that buffer;
+  ///  - laddr_packed_t (internal): the decoded address is stored into
+  ///    value_storage_stack, and the pointer refers to it.
+  /// Any other value type aborts.
+  static value_t* decode_value(ceph::bufferlist::const_iterator& delta,
+                               std::unique_ptr<char[]>& value_storage_heap,
+                               value_t& value_storage_stack) {
+    if constexpr (std::is_same_v<value_t, onode_t>) {
+      // NODE_TYPE == node_type_t::LEAF
+      auto onode_config = onode_t::decode(delta);
+      value_storage_heap = onode_t::allocate(onode_config);
+      return reinterpret_cast<onode_t*>(value_storage_heap.get());
+    } else if constexpr (std::is_same_v<value_t, laddr_packed_t>) {
+      // NODE_TYPE == node_type_t::INTERNAL
+      laddr_t decoded_addr;
+      ceph::decode(decoded_addr, delta);
+      value_storage_stack.value = decoded_addr;
+      return &value_storage_stack;
+    } else {
+      ceph_abort("impossible path");
+    }
+  }
+
+  /// Logger used for the replay messages of this recorder.
+  static seastar::logger& logger() {
+    auto& log = crimson::get_logger(ceph_subsys_filestore);
+    return log;
+  }
+};
+
+/**
+ * NodeExtentAccessorT
+ *
+ * This component is responsible to reference and mutate the underlying
+ * NodeExtent, record mutation parameters when needed, and apply the recorded
+ * modifications for a specific node layout.
+ */
+template <typename FieldType, node_type_t NODE_TYPE>
+class NodeExtentAccessorT {
+ public:
+ using layout_t = NodeLayoutReplayableT<FieldType, NODE_TYPE>;
+ using node_stage_t = typename layout_t::node_stage_t;
+ using position_t = typename layout_t::position_t;
+ using recorder_t = DeltaRecorderT<FieldType, NODE_TYPE>;
+ using StagedIterator = typename layout_t::StagedIterator;
+ using value_t = typename layout_t::value_t;
+ static constexpr auto FIELD_TYPE = layout_t::FIELD_TYPE;
+
+  /// Wrap `extent` and set up `mut` / `recorder` according to its state:
+  ///  - initial pending:  mutable, and the extent must have no recorder;
+  ///  - mutation pending: mutable, and the extent's recorder (which must
+  ///                      match NODE_TYPE / FIELD_TYPE) is used;
+  ///  - otherwise (not pending): `mut` stays empty and no recorder is used.
+  /// In debug builds a separate test extent and test recorder are created to
+  /// verify replay.
+  NodeExtentAccessorT(NodeExtentRef extent)
+      : extent{extent},
+        node_stage{reinterpret_cast<const FieldType*>(extent->get_read())} {
+    // only the mutation-pending branch below attaches a recorder
+    recorder = nullptr;
+    if (no_recording()) {
+      mut.emplace(extent->get_mutable());
+      assert(extent->get_recorder() == nullptr);
+    } else if (needs_recording()) {
+      mut.emplace(extent->get_mutable());
+      auto* const extent_recorder = extent->get_recorder();
+      assert(extent_recorder != nullptr);
+      assert(extent_recorder->node_type() == NODE_TYPE);
+      assert(extent_recorder->field_type() == FIELD_TYPE);
+      recorder = static_cast<recorder_t*>(extent_recorder);
+    } else if (needs_mutate()) {
+      // mut is empty
+      assert(extent->get_recorder() == nullptr ||
+             extent->get_recorder()->is_empty());
+    } else {
+      ceph_abort("impossible path");
+    }
+#ifndef NDEBUG
+    auto shadow_recorder = recorder_t::create();
+    test_recorder = static_cast<recorder_t*>(shadow_recorder.get());
+    test_extent = TestReplayExtent::create(
+        extent->get_length(), std::move(shadow_recorder));
+#endif
+  }
+ ~NodeExtentAccessorT() = default;
+ NodeExtentAccessorT(const NodeExtentAccessorT&) = delete;
+ NodeExtentAccessorT(NodeExtentAccessorT&&) = delete;
+ NodeExtentAccessorT& operator=(const NodeExtentAccessorT&) = delete;
+ NodeExtentAccessorT& operator=(NodeExtentAccessorT&&) = delete;
+
+  /// Read-only staged view of the node contents.
+  const node_stage_t& read() const {
+    return node_stage;
+  }
+  /// Logical address of the underlying extent.
+  laddr_t get_laddr() const {
+    return extent->get_laddr();
+  }
+
+  // Must be called before any mutation attempt.
+  // For the safety of mixed read and mutate, call it before reading too.
+  //
+  // When the extent is not pending yet, this asks the extent to mutate with a
+  // fresh recorder, then re-targets node_stage, recorder and mut at the extent
+  // returned by mutate(). Otherwise it does nothing.
+  void prepare_mutate(context_t c) {
+    if (!needs_mutate()) {
+      return;
+    }
+    auto fresh_recorder = recorder_t::create();
+    // keep a typed raw pointer; ownership moves into mutate() below
+    recorder = static_cast<recorder_t*>(fresh_recorder.get());
+    extent = extent->mutate(c, std::move(fresh_recorder));
+    assert(needs_recording());
+    const auto* p_fields =
+      reinterpret_cast<const FieldType*>(extent->get_read());
+    node_stage = node_stage_t(p_fields);
+    assert(recorder == static_cast<recorder_t*>(extent->get_recorder()));
+    mut.emplace(extent->get_mutable());
+  }
+
+  /// Insert key/value into the node through layout_t::insert().
+  ///
+  /// The insert parameters are recorded first when the extent needs
+  /// recording. In debug builds the same parameters are also encoded into the
+  /// test recorder and replayed on the test extent to verify the result.
+  /// Requires prepare_mutate() to have made the extent pending.
+  ///
+  /// @return whatever layout_t::insert() returns for the inserted value
+  template <KeyT KT>
+  const value_t* insert_replayable(
+      const full_key_t<KT>& key, const value_t& value,
+      position_t& insert_pos, match_stage_t& insert_stage,
+      node_offset_t& insert_size) {
+    assert(!needs_mutate());
+    if (needs_recording()) {
+      recorder->template encode_insert<KT>(
+          key, value, insert_pos, insert_stage, insert_size);
+    }
+#ifndef NDEBUG
+    // snapshot the pre-insert contents and record the same op for the replay
+    test_extent->prepare_replay(extent);
+    test_recorder->template encode_insert<KT>(
+        key, value, insert_pos, insert_stage, insert_size);
+#endif
+    auto p_inserted = layout_t::template insert<KT>(
+        *mut, read(), key, value, insert_pos, insert_stage, insert_size);
+#ifndef NDEBUG
+    test_extent->replay_and_verify(extent);
+#endif
+    return p_inserted;
+  }
+
+  /// Split the node at `split_at` through layout_t::split().
+  ///
+  /// The split position is recorded first when the extent needs recording.
+  /// In debug builds the same op is also encoded into the test recorder and
+  /// replayed on the test extent to verify the result.
+  /// Requires prepare_mutate() to have made the extent pending.
+  void split_replayable(StagedIterator& split_at) {
+    assert(!needs_mutate());
+    if (needs_recording()) {
+      recorder->encode_split(split_at, read().p_start());
+    }
+#ifndef NDEBUG
+    test_extent->prepare_replay(extent);
+    // encode_split() is not a member template, so it must be called without
+    // the `template` disambiguator (which requires a template-id).
+    test_recorder->encode_split(split_at, read().p_start());
+#endif
+    layout_t::split(*mut, read(), split_at);
+#ifndef NDEBUG
+    test_extent->replay_and_verify(extent);
+#endif
+  }
+
+  /// Split the node at `split_at` and insert key/value, through
+  /// layout_t::split_insert().
+  ///
+  /// The parameters are recorded first when the extent needs recording.
+  /// In debug builds the same op is also encoded into the test recorder and
+  /// replayed on the test extent to verify the result.
+  /// Requires prepare_mutate() to have made the extent pending.
+  ///
+  /// @return whatever layout_t::split_insert() returns for the inserted value
+  template <KeyT KT>
+  const value_t* split_insert_replayable(
+      StagedIterator& split_at,
+      const full_key_t<KT>& key, const value_t& value,
+      position_t& insert_pos, match_stage_t& insert_stage,
+      node_offset_t& insert_size) {
+    assert(!needs_mutate());
+    if (needs_recording()) {
+      recorder->template encode_split_insert<KT>(
+          split_at, key, value, insert_pos, insert_stage, insert_size,
+          read().p_start());
+    }
+#ifndef NDEBUG
+    // snapshot the pre-change contents and record the same op for the replay
+    test_extent->prepare_replay(extent);
+    test_recorder->template encode_split_insert<KT>(
+        split_at, key, value, insert_pos, insert_stage, insert_size,
+        read().p_start());
+#endif
+    auto p_inserted = layout_t::template split_insert<KT>(
+        *mut, read(), split_at, key, value,
+        insert_pos, insert_stage, insert_size);
+#ifndef NDEBUG
+    test_extent->replay_and_verify(extent);
+#endif
+    return p_inserted;
+  }
+
+  /// Overwrite the packed address at `p_addr` with `new_addr`, through
+  /// layout_t::update_child_addr().
+  ///
+  /// The update is recorded first when the extent needs recording. In debug
+  /// builds the same op is also encoded into the test recorder and replayed
+  /// on the test extent to verify the result.
+  /// Requires prepare_mutate() to have made the extent pending.
+  void update_child_addr_replayable(
+      const laddr_t new_addr, laddr_packed_t* p_addr) {
+    assert(!needs_mutate());
+    if (needs_recording()) {
+      recorder->encode_update_child_addr(
+          new_addr, p_addr, read().p_start());
+    }
+#ifndef NDEBUG
+    test_extent->prepare_replay(extent);
+    test_recorder->encode_update_child_addr(
+        new_addr, p_addr, read().p_start());
+#endif
+    layout_t::update_child_addr(*mut, new_addr, p_addr);
+#ifndef NDEBUG
+    test_extent->replay_and_verify(extent);
+#endif
+  }
+
+  /// Copy the raw contents of this extent into `to`, which must have the
+  /// same length.
+  void test_copy_to(NodeExtentMutable& to) const {
+    const auto length = extent->get_length();
+    assert(length == to.get_length());
+    std::memcpy(to.get_write(), extent->get_read(), length);
+  }
+
+ private:
+ /**
+ * Possible states with CachedExtent::extent_state_t:
+ * INITIAL_WRITE_PENDING -- can mutate, no recording
+ * MUTATION_PENDING -- can mutate, needs recording
+ * CLEAN/DIRTY -- pending mutate
+ * INVALID -- impossible
+ */
+  // The extent is in its initial pending state: mutable, nothing to record.
+  bool no_recording() const {
+    const bool initial_pending = extent->is_initial_pending();
+    return initial_pending;
+  }
+  // The extent is mutation pending: mutable, and changes must be recorded.
+  bool needs_recording() const {
+    const bool mutation_pending = extent->is_mutation_pending();
+    return mutation_pending;
+  }
+  // The (valid) extent is not pending yet, so mutate() has to be called
+  // before it can be modified.
+  bool needs_mutate() const {
+    assert(extent->is_valid());
+    const bool pending = extent->is_pending();
+    return !pending;
+  }
+
+ NodeExtentRef extent;
+ node_stage_t node_stage;
+ std::optional<NodeExtentMutable> mut;
+ // owned by extent
+ recorder_t* recorder;
+
+#ifndef NDEBUG
+ // verify record replay using a different memory block
+ TestReplayExtent::Ref test_extent;
+ recorder_t* test_recorder;
+#endif
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.cc
new file mode 100644
index 000000000..bd22d4b67
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.cc
@@ -0,0 +1,35 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "node_extent_manager.h"
+
+#include "node_extent_manager/dummy.h"
+#include "node_extent_manager/seastore.h"
+#include "stages/node_stage_layout.h"
+
+namespace crimson::os::seastore::onode {
+
+// Read the node type and field type from the node header stored at the start
+// of the extent. Throws std::runtime_error if the header carries no valid
+// field type.
+std::pair<node_type_t, field_type_t> NodeExtent::get_types() const {
+  const auto* p_header = reinterpret_cast<const node_header_t*>(get_read());
+  auto node_type = p_header->get_node_type();
+  auto maybe_field_type = p_header->get_field_type();
+  if (maybe_field_type.has_value()) {
+    return {node_type, *maybe_field_type};
+  }
+  throw std::runtime_error("load failed: bad field type");
+}
+
+// Create the dummy backend; is_sync selects the DummyNodeExtentManager<true>
+// or DummyNodeExtentManager<false> instantiation.
+NodeExtentManagerURef NodeExtentManager::create_dummy(bool is_sync) {
+  if (!is_sync) {
+    return NodeExtentManagerURef(new DummyNodeExtentManager<false>());
+  }
+  return NodeExtentManagerURef(new DummyNodeExtentManager<true>());
+}
+
+// Create the seastore backend on top of the given TransactionManager.
+NodeExtentManagerURef NodeExtentManager::create_seastore(
+    TransactionManager& tm, laddr_t min_laddr) {
+  auto* p_manager = new SeastoreNodeExtentManager(tm, min_laddr);
+  return NodeExtentManagerURef(p_manager);
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h
new file mode 100644
index 000000000..77b230e03
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h
@@ -0,0 +1,86 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/common/type_helpers.h"
+#include "crimson/os/seastore/cached_extent.h"
+#include "crimson/os/seastore/transaction_manager.h"
+
+#include "fwd.h"
+#include "super.h"
+#include "node_extent_mutable.h"
+#include "node_types.h"
+
+/**
+ * node_extent_manager.h
+ *
+ * Contains general interfaces for different backends (Dummy and Seastore).
+ */
+
+namespace crimson::os::seastore::onode {
+
+using crimson::os::seastore::LogicalCachedExtent;
+class NodeExtent : public LogicalCachedExtent {
+ public:
+  virtual ~NodeExtent() = default;
+
+  // Node type and field type, as read from the node header in the extent.
+  std::pair<node_type_t, field_type_t> get_types() const;
+
+  // Read-only pointer to the start of the extent contents.
+  const char* get_read() const { return get_bptr().c_str(); }
+
+  // Mutable view of the extent; only allowed while the extent is pending.
+  NodeExtentMutable get_mutable() {
+    assert(is_pending());
+    return do_get_mutable();
+  }
+
+  // The recorder attached to this extent, if any.
+  virtual DeltaRecorder* get_recorder() const = 0;
+
+  // Make the extent mutable with the given recorder, returning the extent
+  // to use from then on.
+  virtual NodeExtentRef mutate(context_t, DeltaRecorderURef&&) = 0;
+
+ protected:
+  // Forwards all arguments to the LogicalCachedExtent constructor.
+  template <typename... T>
+  NodeExtent(T&&... t)
+    : LogicalCachedExtent(std::forward<T>(t)...) {}
+
+  // Unchecked variant of get_mutable(): no pending-state assertion.
+  NodeExtentMutable do_get_mutable() { return NodeExtentMutable(*this); }
+
+  /**
+   * Abstracted interfaces left for the backends to implement:
+   * - CachedExtent::duplicate_for_write() -> CachedExtentRef
+   * - CachedExtent::get_type() -> extent_types_t
+   * - CachedExtent::get_delta() -> ceph::bufferlist
+   * - LogicalCachedExtent::apply_delta(const ceph::bufferlist) -> void
+   */
+
+ private:
+  friend class NodeExtentMutable;
+};
+
+using crimson::os::seastore::TransactionManager;
+class NodeExtentManager {
+ public:
+  virtual ~NodeExtentManager() = default;
+
+  // Errors that the backend operations below may return.
+  using tm_ertr = crimson::errorator<
+      crimson::ct_error::input_output_error,
+      crimson::ct_error::invarg,
+      crimson::ct_error::enoent,
+      crimson::ct_error::erange>;
+  template <class ValueT = void>
+  using tm_future = tm_ertr::future<ValueT>;
+
+  virtual bool is_read_isolated() const = 0;
+
+  // Read the extent of the given length at the given address.
+  virtual tm_future<NodeExtentRef> read_extent(
+      Transaction&, laddr_t, extent_len_t) = 0;
+
+  // Allocate a new extent of the given length.
+  virtual tm_future<NodeExtentRef> alloc_extent(
+      Transaction&, extent_len_t) = 0;
+
+  // Get the Super for the transaction.
+  virtual tm_future<Super::URef> get_super(
+      Transaction&, RootNodeTracker&) = 0;
+
+  // Describe this manager on the given stream.
+  virtual std::ostream& print(std::ostream& os) const = 0;
+
+  // Factories for the two backends (Dummy and Seastore).
+  static NodeExtentManagerURef create_dummy(bool is_sync);
+  static NodeExtentManagerURef create_seastore(
+      TransactionManager& tm, laddr_t min_laddr = L_ADDR_MIN);
+};
+// Stream a NodeExtentManager by delegating to its print().
+inline std::ostream& operator<<(std::ostream& os, const NodeExtentManager& nm) {
+  std::ostream& out = nm.print(os);
+  return out;
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h
new file mode 100644
index 000000000..830ea4a7d
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h
@@ -0,0 +1,156 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <chrono>
+#include <seastar/core/sleep.hh>
+
+#include "include/buffer_raw.h"
+
+#include "crimson/common/log.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h"
+
+/**
+ * dummy.h
+ *
+ * Dummy backend implementations for test purposes.
+ */
+
+namespace crimson::os::seastore::onode {
+
+// Super for the dummy backend: the root address lives in a laddr_t owned by
+// someone else and is read and written through a pointer.
+class DummySuper final: public Super {
+ public:
+  DummySuper(Transaction& t, RootNodeTracker& tracker, laddr_t* p_root_laddr)
+    : Super(t, tracker),
+      p_root_laddr{p_root_laddr} {}
+  ~DummySuper() override = default;
+
+ protected:
+  laddr_t get_root_laddr() const override {
+    return *p_root_laddr;
+  }
+
+  void write_root_laddr(context_t, laddr_t addr) override {
+    logger().info("OTree::Dummy: update root {:#x} ...", addr);
+    *p_root_laddr = addr;
+  }
+
+ private:
+  static seastar::logger& logger() {
+    return crimson::get_logger(ceph_subsys_filestore);
+  }
+
+  // where the root address is stored; not owned
+  laddr_t* p_root_laddr;
+};
+
+// Extent for the dummy backend. It is created in the INITIAL_WRITE_PENDING
+// state and has no recorder; mutate/duplicate/delta operations abort.
+class DummyNodeExtent final: public NodeExtent {
+ public:
+  DummyNodeExtent(ceph::bufferptr &&ptr)
+    : NodeExtent(std::move(ptr)) {
+    state = extent_state_t::INITIAL_WRITE_PENDING;
+  }
+  ~DummyNodeExtent() override = default;
+
+ protected:
+  NodeExtentRef mutate(context_t, DeltaRecorderURef&&) override {
+    ceph_abort("impossible path");
+  }
+
+  DeltaRecorder* get_recorder() const override {
+    return nullptr;
+  }
+
+  CachedExtentRef duplicate_for_write() override {
+    ceph_abort("impossible path");
+  }
+
+  extent_types_t get_type() const override {
+    return extent_types_t::TEST_BLOCK;
+  }
+
+  ceph::bufferlist get_delta() override {
+    ceph_abort("impossible path");
+  }
+
+  void apply_delta(const ceph::bufferlist&) override {
+    ceph_abort("impossible path");
+  }
+};
+
+// In-memory backend. Extents are kept in a map keyed by their address, which
+// is taken from the address of the allocated buffer. With SYNC == false each
+// operation first sleeps for 1us before doing the same work.
+template <bool SYNC>
+class DummyNodeExtentManager final: public NodeExtentManager {
+  static constexpr size_t ALIGNMENT = 4096;
+
+ public:
+  ~DummyNodeExtentManager() override = default;
+
+ protected:
+  bool is_read_isolated() const override {
+    return false;
+  }
+
+  tm_future<NodeExtentRef> read_extent(
+      Transaction& t, laddr_t addr, extent_len_t len) override {
+    logger().trace("OTree::Dummy: reading {}B at {:#x} ...", len, addr);
+    if constexpr (!SYNC) {
+      using namespace std::chrono_literals;
+      return seastar::sleep(1us).then([this, &t, addr, len] {
+        return read_extent_sync(t, addr, len);
+      });
+    } else {
+      return read_extent_sync(t, addr, len);
+    }
+  }
+
+  tm_future<NodeExtentRef> alloc_extent(
+      Transaction& t, extent_len_t len) override {
+    logger().trace("OTree::Dummy: allocating {}B ...", len);
+    if constexpr (!SYNC) {
+      using namespace std::chrono_literals;
+      return seastar::sleep(1us).then([this, &t, len] {
+        return alloc_extent_sync(t, len);
+      });
+    } else {
+      return alloc_extent_sync(t, len);
+    }
+  }
+
+  tm_future<Super::URef> get_super(
+      Transaction& t, RootNodeTracker& tracker) override {
+    logger().trace("OTree::Dummy: get root ...");
+    if constexpr (!SYNC) {
+      using namespace std::chrono_literals;
+      return seastar::sleep(1us).then([this, &t, &tracker] {
+        return get_super_sync(t, tracker);
+      });
+    } else {
+      return get_super_sync(t, tracker);
+    }
+  }
+
+  std::ostream& print(std::ostream& os) const override {
+    os << "DummyNodeExtentManager(sync=" << SYNC << ")";
+    return os;
+  }
+
+ private:
+  // Look up a previously allocated extent; it must exist and match addr/len.
+  tm_future<NodeExtentRef> read_extent_sync(
+      Transaction& t, laddr_t addr, extent_len_t len) {
+    auto found = allocate_map.find(addr);
+    assert(found != allocate_map.end());
+    auto extent = found->second;
+    logger().trace("OTree::Dummy: read {}B at {:#x}",
+                   extent->get_length(), extent->get_laddr());
+    assert(extent->get_laddr() == addr);
+    assert(extent->get_length() == len);
+    return tm_ertr::make_ready_future<NodeExtentRef>(extent);
+  }
+
+  // Allocate an aligned buffer, wrap it in a DummyNodeExtent whose address
+  // is the buffer's data pointer, and remember it in allocate_map.
+  tm_future<NodeExtentRef> alloc_extent_sync(
+      Transaction& t, extent_len_t len) {
+    assert(len % ALIGNMENT == 0);
+    auto raw = ceph::buffer::create_aligned(len, ALIGNMENT);
+    auto addr = reinterpret_cast<laddr_t>(raw->get_data());
+    auto bp = ceph::bufferptr(std::move(raw));
+    auto extent = Ref<DummyNodeExtent>(new DummyNodeExtent(std::move(bp)));
+    extent->set_laddr(addr);
+    assert(allocate_map.find(extent->get_laddr()) == allocate_map.end());
+    allocate_map.insert({extent->get_laddr(), extent});
+    logger().debug("OTree::Dummy: allocated {}B at {:#x}",
+                   extent->get_length(), extent->get_laddr());
+    assert(extent->get_length() == len);
+    return tm_ertr::make_ready_future<NodeExtentRef>(extent);
+  }
+
+  // Build a DummySuper that reads/writes this manager's root_laddr.
+  tm_future<Super::URef> get_super_sync(
+      Transaction& t, RootNodeTracker& tracker) {
+    logger().debug("OTree::Dummy: got root {:#x}", root_laddr);
+    auto super = Super::URef(new DummySuper(t, tracker, &root_laddr));
+    return tm_ertr::make_ready_future<Super::URef>(std::move(super));
+  }
+
+  static seastar::logger& logger() {
+    return crimson::get_logger(ceph_subsys_filestore);
+  }
+
+  // every extent ever allocated, keyed by its address
+  std::map<laddr_t, Ref<DummyNodeExtent>> allocate_map;
+  laddr_t root_laddr = L_ADDR_NULL;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.cc
new file mode 100644
index 000000000..8d88485bf
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.cc
@@ -0,0 +1,88 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "seastore.h"
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h"
+
+namespace {
+
+// File-local logger for the seastore backend.
+seastar::logger& logger() {
+  auto& log = crimson::get_logger(ceph_subsys_filestore);
+  return log;
+}
+
+}
+
+namespace crimson::os::seastore::onode {
+
+// Create the DeltaRecorderT instantiation that matches the given node type
+// and field type. Any other combination aborts.
+static DeltaRecorderURef create_recorder(
+    node_type_t node_type, field_type_t field_type) {
+  switch (node_type) {
+  case node_type_t::LEAF:
+    switch (field_type) {
+    case field_type_t::N0:
+      return DeltaRecorderT<node_fields_0_t, node_type_t::LEAF>::create();
+    case field_type_t::N1:
+      return DeltaRecorderT<node_fields_1_t, node_type_t::LEAF>::create();
+    case field_type_t::N2:
+      return DeltaRecorderT<node_fields_2_t, node_type_t::LEAF>::create();
+    case field_type_t::N3:
+      return DeltaRecorderT<leaf_fields_3_t, node_type_t::LEAF>::create();
+    default:
+      ceph_abort("impossible path");
+    }
+  case node_type_t::INTERNAL:
+    switch (field_type) {
+    case field_type_t::N0:
+      return DeltaRecorderT<node_fields_0_t, node_type_t::INTERNAL>::create();
+    case field_type_t::N1:
+      return DeltaRecorderT<node_fields_1_t, node_type_t::INTERNAL>::create();
+    case field_type_t::N2:
+      return DeltaRecorderT<node_fields_2_t, node_type_t::INTERNAL>::create();
+    case field_type_t::N3:
+      return DeltaRecorderT<internal_fields_3_t, node_type_t::INTERNAL>::create();
+    default:
+      ceph_abort("impossible path");
+    }
+  default:
+    ceph_abort("impossible path");
+  }
+}
+
+// Remember the new root address and write it through the TransactionManager
+// within the transaction of the given context.
+//
+// The TransactionManager this Super was constructed with is used directly,
+// instead of downcasting c.nm to SeastoreNodeExtentManager with an unchecked
+// static_cast just to reach it.
+void SeastoreSuper::write_root_laddr(context_t c, laddr_t addr) {
+  logger().info("OTree::Seastore: update root {:#x} ...", addr);
+  root_addr = addr;
+  tm.write_onode_root(c.t, addr);
+}
+
+// Ask the TransactionManager for the mutable extent of this extent in the
+// context's transaction, attach the given recorder to it and return it.
+// The returned extent must not already hold a non-empty recorder.
+NodeExtentRef SeastoreNodeExtent::mutate(
+    context_t c, DeltaRecorderURef&& _recorder) {
+  logger().debug("OTree::Seastore: mutate {:#x} ...", get_laddr());
+  auto* p_manager = static_cast<SeastoreNodeExtentManager*>(&c.nm);
+  auto mutable_extent = p_manager->get_tm().get_mutable_extent(c.t, this);
+  auto mutated = mutable_extent->cast<SeastoreNodeExtent>();
+  assert(!mutated->recorder || mutated->recorder->is_empty());
+  mutated->recorder = std::move(_recorder);
+  return mutated;
+}
+
+// Replay every record in `bl` onto this extent.
+//
+// A recorder matching the node's types is created on first use; an existing
+// recorder is only checked (in debug builds) to match them. The recorder is
+// then invoked repeatedly until the whole buffer has been consumed.
+void SeastoreNodeExtent::apply_delta(const ceph::bufferlist& bl) {
+  logger().debug("OTree::Seastore: replay {:#x} ...", get_laddr());
+  if (recorder) {
+#ifndef NDEBUG
+    const auto types = get_types();
+    assert(recorder->node_type() == types.first);
+    assert(recorder->field_type() == types.second);
+#endif
+  } else {
+    const auto types = get_types();
+    recorder = create_recorder(types.first, types.second);
+  }
+  assert(is_clean());
+  auto node = do_get_mutable();
+  // each apply_delta() call advances the iterator
+  for (auto p = bl.cbegin(); p != bl.end(); ) {
+    recorder->apply_delta(p, node);
+  }
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h
new file mode 100644
index 000000000..f80b99fab
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h
@@ -0,0 +1,126 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/common/log.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h"
+
+/**
+ * seastore.h
+ *
+ * Seastore backend implementations.
+ */
+
+namespace crimson::os::seastore::onode {
+
+// Super for the seastore backend: keeps the root address it was created with
+// together with a reference to the TransactionManager.
+class SeastoreSuper final: public Super {
+ public:
+  SeastoreSuper(Transaction& t, RootNodeTracker& tracker,
+                laddr_t root_addr, TransactionManager& tm)
+    : Super(t, tracker),
+      root_addr{root_addr},
+      tm{tm} {}
+  ~SeastoreSuper() override = default;
+
+ protected:
+  laddr_t get_root_laddr() const override { return root_addr; }
+
+  // defined out of line
+  void write_root_laddr(context_t c, laddr_t addr) override;
+
+ private:
+  static seastar::logger& logger() {
+    return crimson::get_logger(ceph_subsys_filestore);
+  }
+
+  laddr_t root_addr;
+  TransactionManager& tm;
+};
+
+// Extent for the seastore backend. It owns an optional DeltaRecorder; the
+// copy constructor copies the NodeExtent base only and leaves the recorder
+// of the copy empty.
+class SeastoreNodeExtent final: public NodeExtent {
+ public:
+  SeastoreNodeExtent(ceph::bufferptr &&ptr)
+    : NodeExtent(std::move(ptr)) {}
+  SeastoreNodeExtent(const SeastoreNodeExtent& other)
+    : NodeExtent(other) {}
+  ~SeastoreNodeExtent() override = default;
+
+ protected:
+  // defined out of line
+  NodeExtentRef mutate(context_t, DeltaRecorderURef&&) override;
+
+  DeltaRecorder* get_recorder() const override { return recorder.get(); }
+
+  CachedExtentRef duplicate_for_write() override {
+    auto* p_duplicate = new SeastoreNodeExtent(*this);
+    return CachedExtentRef(p_duplicate);
+  }
+
+  extent_types_t get_type() const override {
+    return extent_types_t::ONODE_BLOCK_STAGED;
+  }
+
+  // Requires a recorder to be attached.
+  ceph::bufferlist get_delta() override {
+    assert(recorder);
+    return recorder->get_delta();
+  }
+
+  // defined out of line
+  void apply_delta(const ceph::bufferlist&) override;
+
+ private:
+  DeltaRecorderURef recorder;
+};
+
+// Seastore backend: every operation is forwarded to the TransactionManager.
+class SeastoreNodeExtentManager final: public NodeExtentManager {
+ public:
+  SeastoreNodeExtentManager(TransactionManager& tm, laddr_t min)
+    : tm{tm},
+      addr_min{min} {}
+  ~SeastoreNodeExtentManager() override = default;
+
+  TransactionManager& get_tm() {
+    return tm;
+  }
+
+ protected:
+  bool is_read_isolated() const override {
+    return true;
+  }
+
+  // Read exactly one SeastoreNodeExtent of `len` bytes at `addr`.
+  tm_future<NodeExtentRef> read_extent(
+      Transaction& t, laddr_t addr, extent_len_t len) override {
+    logger().debug("OTree::Seastore: reading {}B at {:#x} ...", len, addr);
+    return tm.read_extents<SeastoreNodeExtent>(t, addr, len
+    ).safe_then([addr, len](auto&& extents) {
+      assert(extents.size() == 1);
+      [[maybe_unused]] auto [laddr, e] = extents.front();
+      logger().trace("OTree::Seastore: read {}B at {:#x}",
+                     e->get_length(), e->get_laddr());
+      assert(e->get_laddr() == addr);
+      assert(e->get_length() == len);
+      // the captures are only read by the asserts above
+      std::ignore = addr;
+      std::ignore = len;
+      return NodeExtentRef(e);
+    });
+  }
+
+  // Allocate a SeastoreNodeExtent of `len` bytes, passing addr_min to the
+  // TransactionManager.
+  tm_future<NodeExtentRef> alloc_extent(
+      Transaction& t, extent_len_t len) override {
+    logger().debug("OTree::Seastore: allocating {}B ...", len);
+    return tm.alloc_extent<SeastoreNodeExtent>(t, addr_min, len
+    ).safe_then([len](auto extent) {
+      logger().debug("OTree::Seastore: allocated {}B at {:#x}",
+                     extent->get_length(), extent->get_laddr());
+      assert(extent->get_length() == len);
+      // the capture is only read by the assert above
+      std::ignore = len;
+      return NodeExtentRef(extent);
+    });
+  }
+
+  // Read the onode root address and wrap it in a SeastoreSuper.
+  tm_future<Super::URef> get_super(
+      Transaction& t, RootNodeTracker& tracker) override {
+    logger().trace("OTree::Seastore: get root ...");
+    return tm.read_onode_root(t
+    ).safe_then([this, &t, &tracker](auto root_addr) {
+      logger().debug("OTree::Seastore: got root {:#x}", root_addr);
+      return Super::URef(new SeastoreSuper(t, tracker, root_addr, tm));
+    });
+  }
+
+  std::ostream& print(std::ostream& os) const override {
+    os << "SeastoreNodeExtentManager";
+    return os;
+  }
+
+ private:
+  static seastar::logger& logger() {
+    return crimson::get_logger(ceph_subsys_filestore);
+  }
+
+  TransactionManager& tm;
+  const laddr_t addr_min;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h
new file mode 100644
index 000000000..240c88932
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h
@@ -0,0 +1,67 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h"
+
+/** test_replay.h
+ *
+ * A special version of NodeExtent to help verify delta encode, decode and
+ * replay in recorder_t under debug build.
+ */
+
+namespace crimson::os::seastore::onode {
+
+class TestReplayExtent final: public NodeExtent {
+ public:
+  using Ref = crimson::os::seastore::TCachedExtentRef<TestReplayExtent>;
+
+  // Seed this extent with a byte-for-byte copy of `from_extent`; both
+  // extents are asserted to have the same length.
+  void prepare_replay(NodeExtentRef from_extent) {
+    assert(get_length() == from_extent->get_length());
+    auto mut = do_get_mutable();
+    const char* source_bytes = from_extent->get_read();
+    std::memcpy(mut.get_write(), source_bytes, get_length());
+  }
+
+  // Apply the delta held by the recorder to this extent, then abort unless
+  // the resulting bytes are identical to `replayed_extent`.
+  void replay_and_verify(NodeExtentRef replayed_extent) {
+    assert(get_length() == replayed_extent->get_length());
+    auto mut = do_get_mutable();
+
+    // the recorded delta must be non-empty and fully consumed by the replay
+    auto bl = recorder->get_delta();
+    assert(bl.length());
+    auto p = bl.cbegin();
+    recorder->apply_delta(p, mut);
+    assert(p == bl.end());
+
+    const char* expected_bytes = replayed_extent->get_read();
+    auto cmp = std::memcmp(get_read(), expected_bytes, get_length());
+    ceph_assert(cmp == 0 && "replay mismatch!");
+  }
+
+  // Create an extent backed by a new buffer of `length` bytes, requested
+  // from create_aligned() with an alignment argument of 4096; takes
+  // ownership of `recorder`.
+  static Ref create(extent_len_t length, DeltaRecorderURef&& recorder) {
+    ceph::bufferptr bp(ceph::buffer::create_aligned(length, 4096));
+    return Ref(new TestReplayExtent(std::move(bp), std::move(recorder)));
+  }
+
+ protected:
+  // Apart from get_type(), the overrides below abort when called.
+  NodeExtentRef mutate(context_t, DeltaRecorderURef&&) override {
+    ceph_abort("impossible path");
+  }
+  DeltaRecorder* get_recorder() const override {
+    ceph_abort("impossible path");
+  }
+  CachedExtentRef duplicate_for_write() override {
+    ceph_abort("impossible path");
+  }
+  extent_types_t get_type() const override {
+    return extent_types_t::TEST_BLOCK;
+  }
+  ceph::bufferlist get_delta() override {
+    ceph_abort("impossible path");
+  }
+  void apply_delta(const ceph::bufferlist&) override {
+    ceph_abort("impossible path");
+  }
+
+ private:
+  // The extent is put into MUTATION_PENDING state on construction.
+  TestReplayExtent(ceph::bufferptr&& ptr, DeltaRecorderURef&& recorder)
+    : NodeExtent(std::move(ptr)),
+      recorder(std::move(recorder))
+  {
+    state = extent_state_t::MUTATION_PENDING;
+  }
+
+  DeltaRecorderURef recorder;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.cc
new file mode 100644
index 000000000..048c4000d
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.cc
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "node_extent_mutable.h"
+#include "node_extent_manager.h"
+
+namespace crimson::os::seastore::onode {
+
+// Wrap `extent`. Debug builds require it to be either pending or clean.
+NodeExtentMutable::NodeExtentMutable(NodeExtent& extent)
+  : extent{extent}
+{
+  assert(extent.is_pending() ||  // during mutation
+         extent.is_clean());     // during replay
+}
+
+// Read-only pointer to the first byte of the wrapped extent's buffer.
+const char* NodeExtentMutable::get_read() const {
+  assert(extent.is_pending() ||  // during mutation
+         extent.is_clean());     // during replay
+  const char* buffer_start = extent.get_bptr().c_str();
+  return buffer_start;
+}
+
+// Writable pointer to the first byte of the wrapped extent's buffer.
+char* NodeExtentMutable::get_write() {
+  assert(extent.is_pending() ||  // during mutation
+         extent.is_clean());     // during replay
+  char* buffer_start = extent.get_bptr().c_str();
+  return buffer_start;
+}
+
+// Length of the wrapped extent, as reported by the extent itself.
+extent_len_t NodeExtentMutable::get_length() const {
+  const extent_len_t length = extent.get_length();
+  return length;
+}
+
+// Logical address of the wrapped extent, as reported by the extent itself.
+laddr_t NodeExtentMutable::get_laddr() const {
+  const laddr_t addr = extent.get_laddr();
+  return addr;
+}
+
+// One-past-the-end pointer of the buffer: get_read() plus get_length().
+const char* NodeExtentMutable::buf_upper_bound() const {
+  const char* buffer_start = get_read();
+  return buffer_start + get_length();
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h
new file mode 100644
index 000000000..52f10a013
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h
@@ -0,0 +1,80 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <cstring>
+
+#include "fwd.h"
+
+#pragma once
+
+namespace crimson::os::seastore::onode {
+
+class NodeExtent;
+
+/**
+ * NodeExtentMutable
+ *
+ * A thin wrapper of NodeExtent to make sure that only the newly allocated
+ * or the duplicated NodeExtent is mutable, and the memory modifications are
+ * safe within the extent range.
+ */
+class NodeExtentMutable {
+ public:
+  // Copy `len` bytes from `src` to the absolute address `dst`. Debug builds
+  // assert that [dst, dst + len) stays inside the extent buffer.
+  void copy_in_absolute(void* dst, const void* src, extent_len_t len) {
+    assert((char*)dst >= get_write());
+    assert((char*)dst + len <= buf_upper_bound());
+    std::memcpy(dst, src, len);
+  }
+  // Typed overload: copies the sizeof(T) bytes of `src`.
+  template <typename T>
+  void copy_in_absolute(void* dst, const T& src) {
+    copy_in_absolute(dst, &src, sizeof(T));
+  }
+
+  // Like copy_in_absolute(), but the destination is a byte offset from the
+  // start of the buffer. Returns the destination address.
+  const void* copy_in_relative(
+      extent_len_t dst_offset, const void* src, extent_len_t len) {
+    char* const dst = get_write() + dst_offset;
+    copy_in_absolute(dst, src, len);
+    return dst;
+  }
+  // Typed overload: returns the destination viewed as a const T*.
+  template <typename T>
+  const T* copy_in_relative(
+      extent_len_t dst_offset, const T& src) {
+    return static_cast<const T*>(
+      copy_in_relative(dst_offset, &src, sizeof(T)));
+  }
+
+  // Move `len` bytes starting at `src` by `offset` bytes (may be negative)
+  // within the buffer. Both the source and destination ranges are asserted
+  // to be in bounds; ranges may overlap (memmove).
+  void shift_absolute(const void* src, extent_len_t len, int offset) {
+    assert((const char*)src >= get_write());
+    assert((const char*)src + len <= buf_upper_bound());
+    char* to = (char*)src + offset;
+    assert(to >= get_write());
+    assert(to + len <= buf_upper_bound());
+    if (len == 0) {
+      return;
+    }
+    std::memmove(to, src, len);
+  }
+  // Same as shift_absolute(), with the source given as a buffer offset.
+  void shift_relative(extent_len_t src_offset, extent_len_t len, int offset) {
+    char* const src = get_write() + src_offset;
+    shift_absolute(src, len, offset);
+  }
+
+  // Debug-only check that `updated` lives entirely inside the buffer.
+  template <typename T>
+  void validate_inplace_update(const T& updated) {
+    assert((const char*)&updated >= get_write());
+    assert((const char*)&updated + sizeof(T) <= buf_upper_bound());
+  }
+
+  const char* get_read() const;
+  char* get_write();
+  extent_len_t get_length() const;
+  laddr_t get_laddr() const;
+
+ private:
+  // Construction is private; NodeExtent is the only friend.
+  explicit NodeExtentMutable(NodeExtent&);
+  const char* buf_upper_bound() const;
+
+  NodeExtent& extent;
+
+  friend class NodeExtent;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.cc
new file mode 100644
index 000000000..59d792b1a
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.cc
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "node_impl.h"
+#include "node_layout.h"
+
+namespace crimson::os::seastore::onode {
+
+#ifdef UNIT_TESTS_BUILT
+// Definition of the global that node_impl.h declares `extern`; it is only
+// compiled when UNIT_TESTS_BUILT is defined and starts value-initialized.
+last_split_info_t last_split = {};
+#endif
+
+// XXX: branchless allocation
+// Dispatch to the InternalNode{0..3} allocator that matches `type`; any
+// other field type aborts.
+InternalNodeImpl::alloc_ertr::future<InternalNodeImpl::fresh_impl_t>
+InternalNodeImpl::allocate(
+    context_t c, field_type_t type, bool is_level_tail, level_t level) {
+  switch (type) {
+  case field_type_t::N0:
+    return InternalNode0::allocate(c, is_level_tail, level);
+  case field_type_t::N1:
+    return InternalNode1::allocate(c, is_level_tail, level);
+  case field_type_t::N2:
+    return InternalNode2::allocate(c, is_level_tail, level);
+  case field_type_t::N3:
+    return InternalNode3::allocate(c, is_level_tail, level);
+  default:
+    ceph_abort("impossible path");
+  }
+}
+
+// Dispatch to the LeafNode{0..3} allocator that matches `type`, always
+// passing level 0; any other field type aborts.
+LeafNodeImpl::alloc_ertr::future<LeafNodeImpl::fresh_impl_t>
+LeafNodeImpl::allocate(
+    context_t c, field_type_t type, bool is_level_tail) {
+  switch (type) {
+  case field_type_t::N0:
+    return LeafNode0::allocate(c, is_level_tail, 0);
+  case field_type_t::N1:
+    return LeafNode1::allocate(c, is_level_tail, 0);
+  case field_type_t::N2:
+    return LeafNode2::allocate(c, is_level_tail, 0);
+  case field_type_t::N3:
+    return LeafNode3::allocate(c, is_level_tail, 0);
+  default:
+    ceph_abort("impossible path");
+  }
+}
+
+// Build the InternalNode{0..3} implementation that matches `type` on top of
+// `extent`; any other field type aborts.
+InternalNodeImplURef InternalNodeImpl::load(
+    NodeExtentRef extent, field_type_t type, bool expect_is_level_tail) {
+  switch (type) {
+  case field_type_t::N0:
+    return InternalNode0::load(extent, expect_is_level_tail);
+  case field_type_t::N1:
+    return InternalNode1::load(extent, expect_is_level_tail);
+  case field_type_t::N2:
+    return InternalNode2::load(extent, expect_is_level_tail);
+  case field_type_t::N3:
+    return InternalNode3::load(extent, expect_is_level_tail);
+  default:
+    ceph_abort("impossible path");
+  }
+}
+
+// Build the LeafNode{0..3} implementation that matches `type` on top of
+// `extent`; any other field type aborts.
+LeafNodeImplURef LeafNodeImpl::load(
+    NodeExtentRef extent, field_type_t type, bool expect_is_level_tail) {
+  switch (type) {
+  case field_type_t::N0:
+    return LeafNode0::load(extent, expect_is_level_tail);
+  case field_type_t::N1:
+    return LeafNode1::load(extent, expect_is_level_tail);
+  case field_type_t::N2:
+    return LeafNode2::load(extent, expect_is_level_tail);
+  case field_type_t::N3:
+    return LeafNode3::load(extent, expect_is_level_tail);
+  default:
+    ceph_abort("impossible path");
+  }
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h
new file mode 100644
index 000000000..3267cda2b
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h
@@ -0,0 +1,197 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <ostream>
+
+#include "node_extent_mutable.h"
+#include "node_types.h"
+#include "stages/stage_types.h"
+
+namespace crimson::os::seastore::onode {
+
+#ifdef UNIT_TESTS_BUILT
+// Classification of an insert recorded for a split (unit-test builds only).
+// NOTE(review): the names suggest "insert at begin / at last / in the
+// middle" -- confirm against the code that sets insert_type.
+enum class InsertType { BEGIN, LAST, MID };
+// Expected outcome of a split; last_split_info_t::match() compares each of
+// these fields with what was actually recorded.
+struct split_expectation_t {
+ match_stage_t split_stage;
+ match_stage_t insert_stage;
+ bool is_insert_left;
+ InsertType insert_type;
+};
+// Record of a split, checked by unit tests against a split_expectation_t.
+struct last_split_info_t {
+  search_position_t split_pos;
+  match_stage_t insert_stage;
+  bool is_insert_left;
+  InsertType insert_type;
+
+  // True when every field of `e` agrees with this record. The split stage
+  // is derived from split_pos: 2 when both nested indexes are 0, 1 when
+  // only the innermost one is 0, otherwise 0.
+  bool match(const split_expectation_t& e) const {
+    match_stage_t split_stage = 0;
+    if (split_pos.nxt.nxt.index == 0) {
+      split_stage = 1;
+      if (split_pos.nxt.index == 0) {
+        split_stage = 2;
+      }
+    }
+    if (split_stage != e.split_stage) {
+      return false;
+    }
+    if (insert_stage != e.insert_stage) {
+      return false;
+    }
+    if (is_insert_left != e.is_insert_left) {
+      return false;
+    }
+    return insert_type == e.insert_type;
+  }
+
+  // True when the recorded split position equals `pos`.
+  bool match_split_pos(const search_position_t& pos) const {
+    const bool same_position = (split_pos == pos);
+    return same_position;
+  }
+};
+extern last_split_info_t last_split;
+#endif
+
+struct key_hobj_t;
+struct key_view_t;
+class NodeExtentMutable;
+
+/**
+ * NodeImpl
+ *
+ * Hides type specific node layout implementations for Node.
+ */
+class NodeImpl {
+ public:
+ // Error set used for the futures returned by the allocate() factories of
+ // InternalNodeImpl and LeafNodeImpl.
+ using alloc_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg,
+ crimson::ct_error::enoent,
+ crimson::ct_error::erange>;
+ virtual ~NodeImpl() = default;
+
+ // Accessors and position helpers; all pure virtual, implemented by the
+ // layout-specific subclass.
+ virtual field_type_t field_type() const = 0;
+ virtual laddr_t laddr() const = 0;
+ virtual void prepare_mutate(context_t) = 0;
+ virtual bool is_level_tail() const = 0;
+ virtual bool is_empty() const = 0;
+ virtual level_t level() const = 0;
+ virtual node_offset_t free_size() const = 0;
+ virtual key_view_t get_key_view(const search_position_t&) const = 0;
+ virtual key_view_t get_largest_key_view() const = 0;
+ virtual void next_position(search_position_t&) const = 0;
+
+ // Statistics, dumping and validation.
+ virtual node_stats_t get_stats() const = 0;
+ virtual std::ostream& dump(std::ostream&) const = 0;
+ virtual std::ostream& dump_brief(std::ostream&) const = 0;
+ virtual void validate_layout() const = 0;
+
+ // NOTE(review): the test_ prefix suggests these are only meant for unit
+ // tests -- confirm against the callers.
+ virtual void test_copy_to(NodeExtentMutable&) const = 0;
+ virtual void test_set_tail(NodeExtentMutable&) = 0;
+
+ protected:
+ // Only constructible through a derived class.
+ NodeImpl() = default;
+};
+
+/**
+ * InternalNodeImpl
+ *
+ * Hides type specific node layout implementations for InternalNode.
+ */
+class InternalNodeImpl : public NodeImpl {
+ public:
+ // Empty tag type used as a trailing defaulted parameter of get_p_value()
+ // and lower_bound() below.
+ // NOTE(review): presumably this keeps the signatures distinct from the
+ // LeafNodeImpl ones when one class derives from both -- confirm.
+ struct internal_marker_t {};
+ virtual ~InternalNodeImpl() = default;
+
+ // The four virtuals below are not pure: their default bodies abort with
+ // "impossible path", so a subclass must override the ones that get called.
+ // NOTE(review): the -Woverloaded-virtual pragma is never pushed/popped
+ // here, so it stays in effect for the rest of the translation unit --
+ // confirm that is intended before scoping it.
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual const laddr_packed_t* get_p_value(
+ const search_position_t&,
+ key_view_t* = nullptr, internal_marker_t = {}) const {
+ ceph_abort("impossible path");
+ }
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual lookup_result_t<node_type_t::INTERNAL> lower_bound(
+ const key_hobj_t&, MatchHistory&,
+ key_view_t* = nullptr, internal_marker_t = {}) const {
+ ceph_abort("impossible path");
+ }
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual const laddr_packed_t* insert(
+ const key_view_t&, const laddr_packed_t&, search_position_t&, match_stage_t&, node_offset_t&) {
+ ceph_abort("impossible path");
+ }
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual std::tuple<search_position_t, bool, const laddr_packed_t*> split_insert(
+ NodeExtentMutable&, NodeImpl&, const key_view_t&, const laddr_packed_t&,
+ search_position_t&, match_stage_t&, node_offset_t&) {
+ ceph_abort("impossible path");
+ }
+
+ // Internal-node-only operations; pure virtual.
+ virtual void replace_child_addr(const search_position_t&, laddr_t dst, laddr_t src) = 0;
+ virtual std::tuple<match_stage_t, node_offset_t> evaluate_insert(
+ const key_view_t&, const laddr_t&, search_position_t&) const = 0;
+
+ // Result of allocate(): the new implementation object together with the
+ // mutable view of its extent.
+ struct fresh_impl_t {
+ InternalNodeImplURef impl;
+ NodeExtentMutable mut;
+ // Moves `impl` out (as a NodeImplURef) and copies `mut` into a pair.
+ std::pair<NodeImplURef, NodeExtentMutable> make_pair() {
+ return {std::move(impl), mut};
+ }
+ };
+ // Factories that select the concrete layout from the field_type_t argument.
+ static alloc_ertr::future<fresh_impl_t> allocate(context_t, field_type_t, bool, level_t);
+ static InternalNodeImplURef load(NodeExtentRef, field_type_t, bool);
+
+ protected:
+ InternalNodeImpl() = default;
+};
+
+/**
+ * LeafNodeImpl
+ *
+ * Hides type specific node layout implementations for LeafNode.
+ */
+class LeafNodeImpl : public NodeImpl {
+ public:
+ // Empty tag type used as a trailing defaulted parameter of get_p_value()
+ // and lower_bound() below.
+ // NOTE(review): presumably this keeps the signatures distinct from the
+ // InternalNodeImpl ones when one class derives from both -- confirm.
+ struct leaf_marker_t {};
+ virtual ~LeafNodeImpl() = default;
+
+ // The four virtuals below are not pure: their default bodies abort with
+ // "impossible path", so a subclass must override the ones that get called.
+ // NOTE(review): the -Woverloaded-virtual pragma is never pushed/popped
+ // here, so it stays in effect for the rest of the translation unit --
+ // confirm that is intended before scoping it.
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual const onode_t* get_p_value(
+ const search_position_t&,
+ key_view_t* = nullptr, leaf_marker_t={}) const {
+ ceph_abort("impossible path");
+ }
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual lookup_result_t<node_type_t::LEAF> lower_bound(
+ const key_hobj_t&, MatchHistory&,
+ key_view_t* = nullptr, leaf_marker_t = {}) const {
+ ceph_abort("impossible path");
+ }
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual const onode_t* insert(
+ const key_hobj_t&, const onode_t&, search_position_t&, match_stage_t&, node_offset_t&) {
+ ceph_abort("impossible path");
+ }
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual std::tuple<search_position_t, bool, const onode_t*> split_insert(
+ NodeExtentMutable&, NodeImpl&, const key_hobj_t&, const onode_t&,
+ search_position_t&, match_stage_t&, node_offset_t&) {
+ ceph_abort("impossible path");
+ }
+
+ // Leaf-node-only operations; pure virtual.
+ virtual void get_largest_slot(
+ search_position_t&, key_view_t&, const onode_t**) const = 0;
+ virtual std::tuple<match_stage_t, node_offset_t> evaluate_insert(
+ const key_hobj_t&, const onode_t&,
+ const MatchHistory&, match_stat_t, search_position_t&) const = 0;
+
+ // Result of allocate(): the new implementation object together with the
+ // mutable view of its extent.
+ struct fresh_impl_t {
+ LeafNodeImplURef impl;
+ NodeExtentMutable mut;
+ // Moves `impl` out (as a NodeImplURef) and copies `mut` into a pair.
+ std::pair<NodeImplURef, NodeExtentMutable> make_pair() {
+ return {std::move(impl), mut};
+ }
+ };
+ // Factories that select the concrete layout from the field_type_t argument.
+ static alloc_ertr::future<fresh_impl_t> allocate(context_t, field_type_t, bool);
+ static LeafNodeImplURef load(NodeExtentRef, field_type_t, bool);
+
+ protected:
+ LeafNodeImpl() = default;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h
new file mode 100644
index 000000000..916d17424
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h
@@ -0,0 +1,613 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <ostream>
+#include <sstream>
+
+#include "common/likely.h"
+#include "crimson/common/log.h"
+#include "node_extent_accessor.h"
+#include "node_impl.h"
+#include "stages/node_stage_layout.h"
+
+namespace crimson::os::seastore::onode {
+
+// Maps a node type to the key representation used when inserting into it:
+// KeyT::VIEW for internal nodes, KeyT::HOBJ for leaf nodes.
+template <node_type_t NODE_TYPE>
+struct insert_key_type;
+template <>
+struct insert_key_type<node_type_t::INTERNAL> {
+  static constexpr auto type = KeyT::VIEW;
+};
+template <>
+struct insert_key_type<node_type_t::LEAF> {
+  static constexpr auto type = KeyT::HOBJ;
+};
+
+// Maps a node type to the NodeImpl interface it implements.
+template <node_type_t NODE_TYPE>
+struct node_impl_type;
+template <>
+struct node_impl_type<node_type_t::INTERNAL> {
+  using type = InternalNodeImpl;
+};
+template <>
+struct node_impl_type<node_type_t::LEAF> {
+  using type = LeafNodeImpl;
+};
+
+// Maps a node type to the marker tag declared by its NodeImpl interface.
+template <node_type_t NODE_TYPE>
+struct node_marker_type;
+template <>
+struct node_marker_type<node_type_t::INTERNAL> {
+  using type = InternalNodeImpl::internal_marker_t;
+};
+template <>
+struct node_marker_type<node_type_t::LEAF> {
+  using type = LeafNodeImpl::leaf_marker_t;
+};
+
+/**
+ * NodeLayoutT
+ *
+ * Contains templated and concrete implementations for both InternalNodeImpl
+ * and LeafNodeImpl under a specific node layout.
+ */
+template <typename FieldType, node_type_t NODE_TYPE>
+class NodeLayoutT final : public InternalNodeImpl, public LeafNodeImpl {
+ public:
+ using URef = std::unique_ptr<NodeLayoutT>;
+ using extent_t = NodeExtentAccessorT<FieldType, NODE_TYPE>;
+ using parent_t = typename node_impl_type<NODE_TYPE>::type;
+ using marker_t = typename node_marker_type<NODE_TYPE>::type;
+ using node_stage_t = typename extent_t::node_stage_t;
+ using position_t = typename extent_t::position_t;
+ using value_t = typename extent_t::value_t;
+ static constexpr auto FIELD_TYPE = extent_t::FIELD_TYPE;
+ static constexpr auto KEY_TYPE = insert_key_type<NODE_TYPE>::type;
+ static constexpr auto STAGE = STAGE_T::STAGE;
+
+ NodeLayoutT(const NodeLayoutT&) = delete;
+ NodeLayoutT(NodeLayoutT&&) = delete;
+ NodeLayoutT& operator=(const NodeLayoutT&) = delete;
+ NodeLayoutT& operator=(NodeLayoutT&&) = delete;
+ ~NodeLayoutT() override = default;
+
+ // Build a layout object on top of `extent`. Debug builds check the
+ // extent's level-tail flag against the caller's expectation.
+ static URef load(NodeExtentRef extent, bool expect_is_level_tail) {
+   URef ret{new NodeLayoutT(extent)};
+   assert(ret->is_level_tail() == expect_is_level_tail);
+   return ret;
+ }
+
+ using alloc_ertr = NodeExtentManager::tm_ertr;
+ // Allocate an extent of node_stage_t::EXTENT_SIZE, bootstrap it for this
+ // layout and return the new implementation together with the extent's
+ // mutable view.
+ static alloc_ertr::future<typename parent_t::fresh_impl_t> allocate(
+     context_t c, bool is_level_tail, level_t level) {
+   // NOTE: Currently, all the node types have the same size for simplicity.
+   // But depending on the requirement, we may need to make node size
+   // configurable by field_type_t and node_type_t, or totally flexible.
+   return c.nm.alloc_extent(c.t, node_stage_t::EXTENT_SIZE
+   ).safe_then([is_level_tail, level](auto extent) {
+     assert(extent->is_initial_pending());
+     auto mut = extent->get_mutable();
+     node_stage_t::bootstrap_extent(
+       mut, FIELD_TYPE, NODE_TYPE, is_level_tail, level);
+     std::unique_ptr<parent_t> impl(new NodeLayoutT(extent));
+     return typename parent_t::fresh_impl_t{std::move(impl), mut};
+   });
+ }
+
+ protected:
+ /*
+ * NodeImpl
+ */
+ // Thin accessors: each forwards to the extent accessor or to the node
+ // stage read from it.
+ field_type_t field_type() const override {
+   return FIELD_TYPE;
+ }
+ laddr_t laddr() const override {
+   return extent.get_laddr();
+ }
+ void prepare_mutate(context_t c) override {
+   extent.prepare_mutate(c);
+ }
+ bool is_level_tail() const override {
+   return extent.read().is_level_tail();
+ }
+ bool is_empty() const override {
+   return extent.read().keys() == 0;
+ }
+ level_t level() const override {
+   return extent.read().level();
+ }
+ node_offset_t free_size() const override {
+   return extent.read().free_size();
+ }
+
+ // Return the key view stored at `position`.
+ key_view_t get_key_view(const search_position_t& position) const override {
+   auto& node_stage = extent.read();
+   key_view_t view;
+   STAGE_T::get_key_view(node_stage, cast_down<STAGE>(position), view);
+   return view;
+ }
+
+ // Return the key view of the largest slot; only the key output of
+ // lookup_largest_slot() is requested (the other outputs are nullptr).
+ key_view_t get_largest_key_view() const override {
+   auto& node_stage = extent.read();
+   key_view_t largest_key;
+   STAGE_T::template lookup_largest_slot<false, true, false>(
+       node_stage, nullptr, &largest_key, nullptr);
+   return largest_key;
+ }
+
+ // Advance `pos` in place. When STAGE_T::next_position() returns true, `pos`
+ // is replaced by the end position.
+ void next_position(search_position_t& pos) const override {
+   assert(!pos.is_end());
+   auto& node_stage = extent.read();
+   const bool reached_end =
+     STAGE_T::next_position(node_stage, cast_down<STAGE>(pos));
+   if (!reached_end) {
+     return;
+   }
+   pos = search_position_t::end();
+ }
+
+ // Collect statistics for this node. Stage-level stats are gathered only
+ // when the node holds keys; an internal level-tail node additionally
+ // counts one extra value of sizeof(value_t).
+ node_stats_t get_stats() const override {
+   node_stats_t stats;
+   auto& node_stage = extent.read();
+   key_view_t scratch_key;
+   if (node_stage.keys()) {
+     STAGE_T::get_stats(node_stage, stats, scratch_key);
+   }
+   stats.size_persistent = node_stage_t::EXTENT_SIZE;
+   stats.size_filled = filled_size();
+   if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+     const bool has_tail_value = is_level_tail();
+     if (has_tail_value) {
+       stats.size_logical += sizeof(value_t);
+       stats.size_value += sizeof(value_t);
+       stats.num_kvs += 1;
+     }
+   }
+   return stats;
+ }
+
+ // Write a detailed, multi-line description of this node to `os`: the brief
+ // header, the aggregated stats, the header size, the per-stage contents
+ // and (for an internal level-tail node) the tail value. `size` tracks the
+ // bytes accounted for and is asserted to equal filled_size() at the end.
+ std::ostream& dump(std::ostream& os) const override {
+ auto& node_stage = extent.read();
+ auto p_start = node_stage.p_start();
+ dump_brief(os);
+ auto stats = get_stats();
+ os << " num_kvs=" << stats.num_kvs
+ << ", logical=" << stats.size_logical
+ << "B, overhead=" << stats.size_overhead
+ << "B, value=" << stats.size_value << "B";
+ os << ":\n header: " << node_stage_t::header_size() << "B";
+ size_t size = 0u;
+ if (node_stage.keys()) {
+ // NOTE(review): `size` is presumably updated by STAGE_T::dump() -- its
+ // signature is not visible here; confirm.
+ STAGE_T::dump(node_stage, os, " ", size, p_start);
+ } else {
+ // no keys: only the header contributes to the filled size
+ size += node_stage_t::header_size();
+ if (NODE_TYPE == node_type_t::LEAF || !node_stage.is_level_tail()) {
+ os << " empty!";
+ }
+ }
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ if (node_stage.is_level_tail()) {
+ // the tail value of an internal level-tail node is accounted separately
+ size += sizeof(laddr_t);
+ auto value_ptr = node_stage.get_end_p_laddr();
+ // byte offset of the tail value relative to p_start
+ int offset = reinterpret_cast<const char*>(value_ptr) - p_start;
+ os << "\n tail value: 0x"
+ << std::hex << value_ptr->value << std::dec
+ << " " << size << "B"
+ << " @" << offset << "B";
+ }
+ }
+ assert(size == filled_size());
+ return os;
+ }
+
+ // Write a one-line summary to `os`: node/field type, address and extent
+ // size (in hex), a "$" marker for level-tail nodes, then the level, filled
+ // and free sizes.
+ std::ostream& dump_brief(std::ostream& os) const override {
+   auto& node_stage = extent.read();
+   os << "Node" << NODE_TYPE << FIELD_TYPE;
+   os << "@0x" << std::hex << extent.get_laddr()
+      << "+" << node_stage_t::EXTENT_SIZE << std::dec;
+   os << (node_stage.is_level_tail() ? "$" : "");
+   os << "(level=" << (unsigned)node_stage.level();
+   os << ", filled=" << filled_size() << "B";
+   os << ", free=" << node_stage.free_size() << "B";
+   os << ")";
+   return os;
+ }
+
+ // Run the stage validation on the current node contents; the body is
+ // compiled out when NDEBUG is defined.
+ void validate_layout() const override {
+#ifndef NDEBUG
+   auto& node_stage = extent.read();
+   STAGE_T::validate(node_stage);
+#endif
+ }
+
+ // Forward to the extent accessor's test_copy_to() with `to` as target.
+ void test_copy_to(NodeExtentMutable& to) const override {
+   extent.test_copy_to(to);
+ }
+
+ // Set the is_level_tail flag to true through `mut`.
+ void test_set_tail(NodeExtentMutable& mut) override {
+   auto& node_stage = extent.read();
+   node_stage_t::update_is_level_tail(mut, node_stage, true);
+ }
+
+ /*
+ * Common
+ */
+ // Return a pointer to the value at `position`. For internal nodes
+ // `index_key` must be null and the end position yields the tail laddr
+ // (level-tail nodes only); for leaf nodes the end position is not allowed.
+ // When `index_key` is given it is passed on to the stage lookup.
+ const value_t* get_p_value(const search_position_t& position,
+                            key_view_t* index_key=nullptr, marker_t={}) const override {
+   auto& node_stage = extent.read();
+   if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+     assert(!index_key);
+     if (position.is_end()) {
+       assert(is_level_tail());
+       return node_stage.get_end_p_laddr();
+     }
+   } else {
+     assert(!position.is_end());
+   }
+   if (!index_key) {
+     return STAGE_T::get_p_value(node_stage, cast_down<STAGE>(position));
+   }
+   return STAGE_T::template get_p_value<true>(
+       node_stage, cast_down<STAGE>(position), index_key);
+ }
+
+ // Search this node for the lower bound of `key`, recording match results in
+ // `history`. When `index_key` is non-null the lookup variant that also
+ // receives `index_key` is used. The raw stage result is normalized before
+ // being returned; for an end result on an internal node, p_value is set to
+ // the tail laddr.
+ lookup_result_t<NODE_TYPE> lower_bound(
+ const key_hobj_t& key, MatchHistory& history,
+ key_view_t* index_key=nullptr, marker_t={}) const override {
+ auto& node_stage = extent.read();
+ if constexpr (NODE_TYPE == node_type_t::LEAF) {
+ // a leaf without keys: record LT for STAGE_LEFT and report end
+ if (unlikely(node_stage.keys() == 0)) {
+ history.set<STAGE_LEFT>(MatchKindCMP::LT);
+ return lookup_result_t<NODE_TYPE>::end();
+ }
+ }
+
+ typename STAGE_T::result_t result_raw;
+ if (index_key) {
+ result_raw = STAGE_T::template lower_bound<true>(
+ node_stage, key, history, index_key);
+#ifndef NDEBUG
+ // debug check: the returned index_key must equal the key stored at the
+ // returned position
+ if (!result_raw.is_end()) {
+ full_key_t<KeyT::VIEW> index;
+ STAGE_T::get_key_view(node_stage, result_raw.position, index);
+ assert(index == *index_key);
+ }
+#endif
+ } else {
+ result_raw = STAGE_T::lower_bound(node_stage, key, history);
+ }
+#ifndef NDEBUG
+ // debug check: an end result must carry MSTAT_END, otherwise the match
+ // status must be consistent with the key found at the position
+ if (result_raw.is_end()) {
+ assert(result_raw.mstat == MSTAT_END);
+ } else {
+ full_key_t<KeyT::VIEW> index;
+ STAGE_T::get_key_view(node_stage, result_raw.position, index);
+ assert_mstat(key, index, result_raw.mstat);
+ }
+#endif
+
+ // calculate MSTAT_LT3
+ // (only for N0 internal nodes: an LT2 result is refined to LT3 when the
+ // key does not compare EQ with the slot's shard_pool)
+ if constexpr (FIELD_TYPE == field_type_t::N0) {
+ // currently only internal node checks mstat
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ if (result_raw.mstat == MSTAT_LT2) {
+ auto cmp = compare_to<KeyT::HOBJ>(
+ key, node_stage[result_raw.position.index].shard_pool);
+ assert(cmp != MatchKindCMP::GT);
+ if (cmp != MatchKindCMP::EQ) {
+ result_raw.mstat = MSTAT_LT3;
+ }
+ }
+ }
+ }
+
+ auto result = normalize(std::move(result_raw));
+ if (result.is_end()) {
+ // an end result is only expected on a level-tail node
+ assert(node_stage.is_level_tail());
+ assert(result.p_value == nullptr);
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ result.p_value = node_stage.get_end_p_laddr();
+ }
+ } else {
+ assert(result.p_value != nullptr);
+ }
+ return result;
+ }
+
+ // Insert (key, value) at `insert_pos` through the extent accessor and
+ // return a pointer to the inserted value. The node is dumped at trace
+ // level before and after the insert, and the layout is validated
+ // afterwards.
+ const value_t* insert(
+     const full_key_t<KEY_TYPE>& key, const value_t& value,
+     search_position_t& insert_pos, match_stage_t& insert_stage,
+     node_offset_t& insert_size) override {
+   // dumps the whole node, but only if trace logging is enabled
+   auto trace_dump = [this] {
+     if (unlikely(logger().is_enabled(seastar::log_level::trace))) {
+       std::ostringstream sos;
+       dump(sos);
+       logger().trace("OTree::Layout::Insert: -- dump\n{}", sos.str());
+     }
+   };
+
+   logger().debug("OTree::Layout::Insert: begin at "
+                  "insert_pos({}), insert_stage={}, insert_size={}B ...",
+                  insert_pos, insert_stage, insert_size);
+   trace_dump();
+
+   auto ret = extent.template insert_replayable<KEY_TYPE>(
+       key, value, cast_down<STAGE>(insert_pos), insert_stage, insert_size);
+
+   logger().debug("OTree::Layout::Insert: done at "
+                  "insert_pos({}), insert_stage={}, insert_size={}B",
+                  insert_pos, insert_stage, insert_size);
+   trace_dump();
+
+   validate_layout();
+   assert(get_key_view(insert_pos) == key);
+   return ret;
+ }
+
+ std::tuple<search_position_t, bool, const value_t*> split_insert(
+ NodeExtentMutable& right_mut, NodeImpl& right_impl,
+ const full_key_t<KEY_TYPE>& key, const value_t& value,
+ search_position_t& _insert_pos, match_stage_t& insert_stage,
+ node_offset_t& insert_size) override {
+ logger().info("OTree::Layout::Split: begin at "
+ "insert_pos({}), insert_stage={}, insert_size={}B, "
+ "{:#x}=>{:#x} ...",
+ _insert_pos, insert_stage, insert_size,
+ laddr(), right_impl.laddr());
+ if (unlikely(logger().is_enabled(seastar::log_level::debug))) {
+ std::ostringstream sos;
+ dump(sos);
+ logger().debug("OTree::Layout::Split: -- dump\n{}", sos.str());
+ }
+#ifdef UNIT_TESTS_BUILT
+ auto insert_stage_pre = insert_stage;
+#endif
+
+ auto& insert_pos = cast_down<STAGE>(_insert_pos);
+ auto& node_stage = extent.read();
+ typename STAGE_T::StagedIterator split_at;
+ bool is_insert_left;
+ size_t split_size;
+ size_t target_split_size;
+ {
+ size_t empty_size = node_stage.size_before(0);
+ size_t filled_kv_size = filled_size() - empty_size;
+ /** NODE_BLOCK_SIZE considerations
+ *
+ * Generally,
+ * target_split_size = (filled_size + insert_size) / 2
+ * We can have two locate_split() strategies:
+ * A. the simpler one is to locate the largest split position where
+ * the estimated left_node_size <= target_split_size;
+ * B. the fair one takes a further step to calculate the next slot of
+ * P KiB, and if left_node_size + P/2 < target_split_size, compensate
+ * the split position to include the next slot; (TODO)
+ *
+ * Say that the node_block_size = N KiB, the largest allowed
+ * insert_size = 1/I * N KiB (I > 1). We want to identify the minimal 'I'
+ * that won't lead to "double split" effect, meaning after a split,
+ * the right node size is still larger than N KiB and need to split
+ * again. I think "double split" makes split much more complicated and
+ * we can no longer identify whether the node is safe under concurrent
+ * operations.
+ *
+ * We need to evaluate the worst case in order to identify 'I'. This means:
+ * - filled_size ~= N KiB
+ * - insert_size == N/I KiB
+ * - target_split_size ~= (I+1)/2I * N KiB
+ * To simplify the below calculations, node_block_size is normalized to 1.
+ *
+ * With strategy A, the worst case is when left_node_size cannot include
+ * the next slot that will just overflow the target_split_size:
+ * - left_node_size + 1/I ~= (I+1)/2I
+ * - left_node_size ~= (I-1)/2I
+ * - right_node_size ~= 1 + 1/I - left_node_size ~= (I+3)/2I
+ * The right_node_size cannot be larger than the node_block_size in the
+ * worst case, which means (I+3)/2I < 1, so I > 3, meaning the largest
+ * possible insert_size must be smaller than 1/3 of the node_block_size.
+ *
+ * With strategy B, the worst case is when left_node_size cannot include
+ * the next slot that will just overflow the threshold
+ * target_split_size - 1/2I, thus:
+ * - left_node_size ~= (I+1)/2I - 1/2I ~= 1/2
+ * - right_node_size ~= 1 + 1/I - 1/2 ~= (I+2)/2I < node_block_size(1)
+ * - I > 2
+ * This means the largest possible insert_size must be smaller than 1/2 of
+ * the node_block_size, which is better than strategy A.
+
+ * In order to avoid "double split", there is another side-effect we need
+ * to take into consideration: if split happens with snap-gen indexes, the
+ * according ns-oid string needs to be copied to the right node. That is
+ * to say: right_node_size + string_size < node_block_size.
+ *
+ * Say that the largest allowed string size is 1/S of the largest allowed
+ * insert_size N/I KiB. If we go with strategy B, the equation should be
+ * changed to:
+ * - right_node_size ~= (I+2)/2I + 1/(I*S) < 1
+ * - I > 2 + 2/S (S > 1)
+ *
+ * Now back to NODE_BLOCK_SIZE calculation, if we have limits of at most
+ * X KiB ns-oid string and Y KiB of onode_t to store in this BTree, then:
+ * - largest_insert_size ~= X+Y KiB
+ * - 1/S == X/(X+Y)
+ * - I > (4X+2Y)/(X+Y)
+ * - node_block_size(N) == I * insert_size > 4X+2Y KiB
+ *
+ * In conclusion,
+ * (TODO) the current node block size (4 KiB) is too small to
+ * store entire 2 KiB ns-oid string. We need to consider a larger
+ * node_block_size.
+ *
+ * We are setting X = Y = 640 B in order not to break the current
+ * implementations with 4KiB node.
+ *
+ * (TODO) Implement smarter logics to check when "double split" happens.
+ */
+ target_split_size = empty_size + (filled_kv_size + insert_size) / 2;
+ assert(insert_size < (node_stage.total_size() - empty_size) / 2);
+
+ std::optional<bool> _is_insert_left;
+ split_at.set(node_stage);
+ split_size = 0;
+ bool locate_nxt = STAGE_T::recursively_locate_split_inserted(
+ split_size, 0, target_split_size, insert_pos,
+ insert_stage, insert_size, _is_insert_left, split_at);
+ is_insert_left = *_is_insert_left;
+ logger().debug("OTree::Layout::Split: -- located "
+ "split_at({}), insert_pos({}), is_insert_left={}, "
+ "split_size={}B(target={}B, current={}B)",
+ split_at, insert_pos, is_insert_left,
+ split_size, target_split_size, filled_size());
+ // split_size can be larger than target_split_size in strategy B
+ // assert(split_size <= target_split_size);
+ if (locate_nxt) {
+ assert(insert_stage == STAGE);
+ assert(split_at.get().is_last());
+ split_at.set_end();
+ assert(insert_pos.index == split_at.index());
+ }
+ }
+
+ auto append_at = split_at;
+ // TODO(cross-node string dedup)
+ typename STAGE_T::template StagedAppender<KEY_TYPE> right_appender;
+ right_appender.init(&right_mut, right_mut.get_write());
+ const value_t* p_value = nullptr;
+ if (!is_insert_left) {
+ // right node: append [start(append_at), insert_pos)
+ STAGE_T::template append_until<KEY_TYPE>(
+ append_at, right_appender, insert_pos, insert_stage);
+ logger().debug("OTree::Layout::Split: -- right appended until "
+ "insert_pos({}), insert_stage={}, insert/append the rest ...",
+ insert_pos, insert_stage);
+ // right node: append [insert_pos(key, value)]
+ bool is_front_insert = (insert_pos == position_t::begin());
+ [[maybe_unused]] bool is_end = STAGE_T::template append_insert<KEY_TYPE>(
+ key, value, append_at, right_appender,
+ is_front_insert, insert_stage, p_value);
+ assert(append_at.is_end() == is_end);
+ } else {
+ logger().debug("OTree::Layout::Split: -- right appending ...");
+ }
+
+ // right node: append (insert_pos, end)
+ auto pos_end = position_t::end();
+ STAGE_T::template append_until<KEY_TYPE>(
+ append_at, right_appender, pos_end, STAGE);
+ assert(append_at.is_end());
+ right_appender.wrap();
+ if (unlikely(logger().is_enabled(seastar::log_level::debug))) {
+ std::ostringstream sos;
+ right_impl.dump(sos);
+ logger().debug("OTree::Layout::Split: -- right node dump\n{}", sos.str());
+ }
+ right_impl.validate_layout();
+
+ // mutate left node
+ if (is_insert_left) {
+ logger().debug("OTree::Layout::Split: -- left trim/insert at "
+ "insert_pos({}), insert_stage={} ...",
+ insert_pos, insert_stage);
+ p_value = extent.template split_insert_replayable<KEY_TYPE>(
+ split_at, key, value, insert_pos, insert_stage, insert_size);
+ assert(get_key_view(_insert_pos) == key);
+ } else {
+ logger().debug("OTree::Layout::Split: -- left trim ...");
+ assert(right_impl.get_key_view(_insert_pos) == key);
+ extent.split_replayable(split_at);
+ }
+ if (unlikely(logger().is_enabled(seastar::log_level::debug))) {
+ std::ostringstream sos;
+ dump(sos);
+ logger().debug("OTree::Layout::Split: -- left node dump\n{}", sos.str());
+ }
+ validate_layout();
+ assert(p_value);
+
+ auto split_pos = normalize(split_at.get_pos());
+ logger().info("OTree::Layout::Split: done at "
+ "insert_pos({}), insert_stage={}, insert_size={}B, split_at({}), "
+ "is_insert_left={}, split_size={}B(target={}B)",
+ _insert_pos, insert_stage, insert_size, split_pos,
+ is_insert_left, split_size, target_split_size);
+ assert(split_size == filled_size());
+
+#ifdef UNIT_TESTS_BUILT
+ InsertType insert_type;
+ search_position_t last_pos;
+ if (is_insert_left) {
+ STAGE_T::template lookup_largest_slot<true, false, false>(
+ extent.read(), &cast_down_fill_0<STAGE>(last_pos), nullptr, nullptr);
+ } else {
+ node_stage_t right_stage{reinterpret_cast<FieldType*>(right_mut.get_write())};
+ STAGE_T::template lookup_largest_slot<true, false, false>(
+ right_stage, &cast_down_fill_0<STAGE>(last_pos), nullptr, nullptr);
+ }
+ if (_insert_pos == search_position_t::begin()) {
+ insert_type = InsertType::BEGIN;
+ } else if (_insert_pos == last_pos) {
+ insert_type = InsertType::LAST;
+ } else {
+ insert_type = InsertType::MID;
+ }
+ last_split = {split_pos, insert_stage_pre, is_insert_left, insert_type};
+#endif
+ return {split_pos, is_insert_left, p_value};
+ }
+
+ /*
+ * InternalNodeImpl
+ */
+  /**
+   * Internal nodes only: rewrites the child address slot found at pos so
+   * that it holds dst. src is the address expected to be stored there
+   * beforehand (checked by assert only). Aborts if reached on a leaf node.
+   */
+  void replace_child_addr(
+      const search_position_t& pos, laddr_t dst, laddr_t src) override {
+    if constexpr (NODE_TYPE != node_type_t::INTERNAL) {
+      ceph_abort("impossible path");
+    } else {
+      const laddr_packed_t* const p_child_addr = get_p_value(pos);
+      assert(p_child_addr->value == src);
+      // the slot is looked up through a const pointer; the write itself is
+      // delegated to the extent accessor
+      extent.update_child_addr_replayable(
+          dst, const_cast<laddr_packed_t*>(p_child_addr));
+    }
+  }
+
+  /**
+   * Internal nodes only: determines the stage and the size needed to insert
+   * the (key, child address) pair at insert_pos.
+   *
+   * A node without keys requires insert_pos to be end() and inserts at
+   * STAGE; otherwise the decision is delegated to STAGE_T::evaluate_insert().
+   * Aborts if reached on a leaf node.
+   */
+  std::tuple<match_stage_t, node_offset_t> evaluate_insert(
+      const key_view_t& key, const laddr_t& value,
+      search_position_t& insert_pos) const override {
+    if constexpr (NODE_TYPE != node_type_t::INTERNAL) {
+      ceph_abort("impossible path");
+    } else {
+      laddr_packed_t packed_value{value};
+      auto& node_stage = extent.read();
+      if (unlikely(!node_stage.keys())) {
+        assert(insert_pos.is_end());
+        const node_offset_t size_if_empty =
+          STAGE_T::template insert_size<KeyT::VIEW>(key, packed_value);
+        return {STAGE, size_if_empty};
+      }
+      auto [stage, size] = STAGE_T::evaluate_insert(
+          node_stage, key, packed_value, cast_down<STAGE>(insert_pos), false);
+      return {stage, size};
+    }
+  }
+
+ /*
+ * LeafNodeImpl
+ */
+ // Leaf nodes only: forwards to STAGE_T::lookup_largest_slot() on the extent's
+ // current content, passing pos, index_key and pp_value as the outputs.
+ // Aborts if reached on an internal node.
+ void get_largest_slot(search_position_t& pos,
+ key_view_t& index_key, const onode_t** pp_value) const override {
+ if constexpr (NODE_TYPE == node_type_t::LEAF) {
+ STAGE_T::template lookup_largest_slot<true, true, true>(
+ extent.read(), &cast_down_fill_0<STAGE>(pos), &index_key, pp_value);
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+
+  /**
+   * Leaf nodes only: determines the stage and the size needed to insert
+   * (key, value) at insert_pos.
+   *
+   * An empty node requires insert_pos to be end() and inserts at STAGE;
+   * otherwise the decision is delegated to STAGE_T::evaluate_insert() using
+   * the match history and status of the preceding lookup.
+   * Aborts if reached on an internal node.
+   */
+  std::tuple<match_stage_t, node_offset_t> evaluate_insert(
+      const key_hobj_t& key, const onode_t& value,
+      const MatchHistory& history, match_stat_t mstat,
+      search_position_t& insert_pos) const override {
+    if constexpr (NODE_TYPE != node_type_t::LEAF) {
+      ceph_abort("impossible path");
+    } else {
+      if (unlikely(is_empty())) {
+        assert(insert_pos.is_end());
+        return {STAGE, STAGE_T::template insert_size<KeyT::HOBJ>(key, value)};
+      }
+      return STAGE_T::evaluate_insert(
+          key, value, history, mstat, cast_down<STAGE>(insert_pos));
+    }
+  }
+
+ private:
+ // Private constructor: initializes the extent member from the given
+ // NodeExtentRef.
+ NodeLayoutT(NodeExtentRef extent) : extent{extent} {}
+
+  // Returns the size reported by size_before() at index keys(), i.e. past
+  // the last key. The assert cross-checks it against
+  // total_size() - free_size().
+  node_offset_t filled_size() const {
+    auto& stage = extent.read();
+    auto filled = stage.size_before(stage.keys());
+    assert(filled == stage.total_size() - stage.free_size());
+    return filled;
+  }
+
+ // Logger used by this class; obtained from crimson::get_logger() for the
+ // ceph_subsys_filestore subsystem.
+ static seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+
+ extent_t extent;
+};
+
+using InternalNode0 = NodeLayoutT<node_fields_0_t, node_type_t::INTERNAL>;
+using InternalNode1 = NodeLayoutT<node_fields_1_t, node_type_t::INTERNAL>;
+using InternalNode2 = NodeLayoutT<node_fields_2_t, node_type_t::INTERNAL>;
+using InternalNode3 = NodeLayoutT<internal_fields_3_t, node_type_t::INTERNAL>;
+using LeafNode0 = NodeLayoutT<node_fields_0_t, node_type_t::LEAF>;
+using LeafNode1 = NodeLayoutT<node_fields_1_t, node_type_t::LEAF>;
+using LeafNode2 = NodeLayoutT<node_fields_2_t, node_type_t::LEAF>;
+using LeafNode3 = NodeLayoutT<leaf_fields_3_t, node_type_t::LEAF>;
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout_replayable.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout_replayable.h
new file mode 100644
index 000000000..c1499d609
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout_replayable.h
@@ -0,0 +1,75 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "node_extent_mutable.h"
+#include "stages/node_stage.h"
+#include "stages/stage.h"
+
+#define STAGE_T node_to_stage_t<node_stage_t>
+
+namespace crimson::os::seastore::onode {
+
+/**
+ * NodeLayoutReplayableT
+ *
+ * Contains templated logic to modify the layout of a NodeExtent in a way
+ * that is also replayable. Used by NodeExtentAccessorT at runtime and by
+ * DeltaRecorderT during replay.
+ */
+template <typename FieldType, node_type_t NODE_TYPE>
+struct NodeLayoutReplayableT {
+  using node_stage_t = node_extent_t<FieldType, NODE_TYPE>;
+  using position_t = typename STAGE_T::position_t;
+  using StagedIterator = typename STAGE_T::StagedIterator;
+  using value_t = value_type_t<NODE_TYPE>;
+  static constexpr auto FIELD_TYPE = FieldType::FIELD_TYPE;
+
+  /// Plain insert (no preceding split): forwards to
+  /// STAGE_T::proceed_insert<KT, false>() and returns its result.
+  template <KeyT KT>
+  static const value_t* insert(
+      NodeExtentMutable& mut,
+      const node_stage_t& node_stage,
+      const full_key_t<KT>& key,
+      const value_t& value,
+      position_t& insert_pos,
+      match_stage_t& insert_stage,
+      node_offset_t& insert_size) {
+    return STAGE_T::template proceed_insert<KT, false>(
+        mut, node_stage, key, value, insert_pos, insert_stage, insert_size);
+  }
+
+  /// Split without insert: clears the level-tail flag of the node, then
+  /// trims it at split_at.
+  static void split(
+      NodeExtentMutable& mut,
+      const node_stage_t& node_stage,
+      StagedIterator& split_at) {
+    node_stage_t::update_is_level_tail(mut, node_stage, false);
+    STAGE_T::trim(mut, split_at);
+  }
+
+  /// Split followed by an insert into the same node: performs split(), then
+  /// forwards to STAGE_T::proceed_insert<KT, true>() and returns its result.
+  template <KeyT KT>
+  static const value_t* split_insert(
+      NodeExtentMutable& mut,
+      const node_stage_t& node_stage,
+      StagedIterator& split_at,
+      const full_key_t<KT>& key,
+      const value_t& value,
+      position_t& insert_pos,
+      match_stage_t& insert_stage,
+      node_offset_t& insert_size) {
+    split(mut, node_stage, split_at);
+    return STAGE_T::template proceed_insert<KT, true>(
+        mut, node_stage, key, value, insert_pos, insert_stage, insert_size);
+  }
+
+  /// Internal nodes only (asserted): copies new_addr into the slot p_addr.
+  static void update_child_addr(
+      NodeExtentMutable& mut, const laddr_t new_addr, laddr_packed_t* p_addr) {
+    assert(NODE_TYPE == node_type_t::INTERNAL);
+    mut.copy_in_absolute(p_addr, new_addr);
+  }
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h
new file mode 100644
index 000000000..6774544c7
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cassert>
+#include <ostream>
+
+#include "fwd.h"
+
+namespace crimson::os::seastore::onode {
+
+// The field types are numbered upwards from a magic value rather than from 0.
+constexpr uint8_t FIELD_TYPE_MAGIC = 0x25;
+enum class field_type_t : uint8_t {
+  N0 = FIELD_TYPE_MAGIC,
+  N1,
+  N2,
+  N3,
+  _MAX
+};
+
+// Maps a field type to its zero-based ordinal (N0 -> 0 ... N3 -> 3).
+// Values outside [N0, _MAX) are rejected by assert.
+inline uint8_t to_unsigned(field_type_t type) {
+  const auto raw = static_cast<uint8_t>(type);
+  assert(raw >= FIELD_TYPE_MAGIC);
+  assert(raw < static_cast<uint8_t>(field_type_t::_MAX));
+  return raw - FIELD_TYPE_MAGIC;
+}
+
+// Streams the ordinal of the field type as a single digit.
+inline std::ostream& operator<<(std::ostream &os, field_type_t type) {
+  const char* const names[] = {"0", "1", "2", "3"};
+  return os << names[to_unsigned(type)];
+}
+
+enum class node_type_t : uint8_t {
+  LEAF = 0,
+  INTERNAL
+};
+
+// Streams "L" for a leaf node and "I" for an internal node.
+inline std::ostream& operator<<(std::ostream &os, const node_type_t& type) {
+  const auto idx = static_cast<uint8_t>(type);
+  assert(idx <= 1u);
+  const char* const labels[] = {"L", "I"};
+  return os << labels[idx];
+}
+
+// A laddr_t wrapped in a packed struct.
+struct laddr_packed_t {
+ laddr_t value;
+} __attribute__((packed));
+// Prints the address in hexadecimal; the stream is switched back to decimal
+// afterwards.
+inline std::ostream& operator<<(std::ostream& os, const laddr_packed_t& laddr) {
+ return os << "laddr_packed(0x" << std::hex << laddr.value << std::dec << ")";
+}
+
+// Result of matching a key against an index entry. Per the notes on each
+// value below: negative values mean "end" or an exact match, while
+// MSTAT_LT0..MSTAT_LT3 say in which key segment the key first compares less
+// than the index.
+using match_stat_t = int8_t;
+constexpr match_stat_t MSTAT_END = -2; // index is search_position_t::end()
+constexpr match_stat_t MSTAT_EQ = -1; // key == index
+constexpr match_stat_t MSTAT_LT0 = 0; // key == index [pool/shard crush ns/oid]; key < index [snap/gen]
+constexpr match_stat_t MSTAT_LT1 = 1; // key == index [pool/shard crush]; key < index [ns/oid]
+constexpr match_stat_t MSTAT_LT2 = 2; // key < index [pool/shard crush ns/oid] ||
+ // key == index [pool/shard]; key < index [crush]
+constexpr match_stat_t MSTAT_LT3 = 3; // key < index [pool/shard]
+// Smallest and largest valid match_stat_t values.
+constexpr match_stat_t MSTAT_MIN = MSTAT_END;
+constexpr match_stat_t MSTAT_MAX = MSTAT_LT3;
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc
new file mode 100644
index 000000000..443c6cabd
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc
@@ -0,0 +1,165 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "item_iterator_stage.h"
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h"
+
+namespace crimson::os::seastore::onode {
+
+#define ITER_T item_iterator_t<NODE_TYPE>
+#define ITER_INST(NT) item_iterator_t<NT>
+
+/**
+ * Makes room for a new item of `size` bytes and writes its header.
+ *
+ * The insertion point is the start of the item at iter when is_end is set
+ * (iter must then be the last item), otherwise the end of that item.
+ * Everything in [p_left_bound, insertion point) is shifted by -size bytes,
+ * then the back offset and the ns/oid strings of key are written at the top
+ * of the freed space.
+ *
+ * Returns the range from the front of the freed space up to the position
+ * left by ns_oid_view_t::append(); the back offset and the strings sit
+ * above that range.
+ */
+template <node_type_t NODE_TYPE>
+template <KeyT KT>
+memory_range_t ITER_T::insert_prefix(
+    NodeExtentMutable& mut, const ITER_T& iter, const full_key_t<KT>& key,
+    bool is_end, node_offset_t size, const char* p_left_bound) {
+  // 1. insert range
+  if (is_end) {
+    assert(!iter.has_next());
+  }
+  char* p_cursor = const_cast<char*>(is_end ? iter.p_start() : iter.p_end());
+  char* const p_front = p_cursor - size;
+
+  // 2. shift memory
+  mut.shift_absolute(p_left_bound,
+                     p_cursor - p_left_bound,
+                     -static_cast<int>(size));
+
+  // 3. append header: back offset first, then the ns/oid strings, which
+  //    move p_cursor further down
+  p_cursor -= sizeof(node_offset_t);
+  node_offset_t back_offset = (p_cursor - p_front);
+  mut.copy_in_absolute(p_cursor, back_offset);
+  ns_oid_view_t::append<KT>(mut, key, p_cursor);
+
+  return {p_front, p_cursor};
+}
+#define IP_TEMPLATE(NT, KT) \
+ template memory_range_t ITER_INST(NT)::insert_prefix<KT>( \
+ NodeExtentMutable&, const ITER_INST(NT)&, const full_key_t<KT>&, \
+ bool, node_offset_t, const char*)
+IP_TEMPLATE(node_type_t::LEAF, KeyT::VIEW);
+IP_TEMPLATE(node_type_t::INTERNAL, KeyT::VIEW);
+IP_TEMPLATE(node_type_t::LEAF, KeyT::HOBJ);
+IP_TEMPLATE(node_type_t::INTERNAL, KeyT::HOBJ);
+
+// Adjusts the stored back offset of the item at iter by `change` bytes
+// (may be negative). The result must stay within (0, NODE_BLOCK_SIZE).
+template <node_type_t NODE_TYPE>
+void ITER_T::update_size(
+    NodeExtentMutable& mut, const ITER_T& iter, int change) {
+  const int new_size = change + iter.get_back_offset();
+  assert(new_size > 0 && new_size < NODE_BLOCK_SIZE);
+  // the back offset field is located at the end of the item range
+  char* const p_back_offset = const_cast<char*>(iter.get_item_range().p_end);
+  mut.copy_in_absolute(p_back_offset, node_offset_t(new_size));
+}
+
+// Computes the number of bytes between the start of the container and the
+// end of the item at iter. Nothing is written; iter must not be at index 0.
+template <node_type_t NODE_TYPE>
+node_offset_t ITER_T::trim_until(NodeExtentMutable&, const ITER_T& iter) {
+  assert(iter.index() != 0);
+  const size_t trimmed_bytes = iter.p_end() - iter.p_items_start;
+  assert(trimmed_bytes < NODE_BLOCK_SIZE);
+  return trimmed_bytes;
+}
+
+// Shrinks the item at iter by `trimmed` bytes by rewriting its back offset.
+// Returns the bytes between the start of the container and the start of the
+// item, plus `trimmed`.
+template <node_type_t NODE_TYPE>
+node_offset_t ITER_T::trim_at(
+    NodeExtentMutable& mut, const ITER_T& iter, node_offset_t trimmed) {
+  const size_t trim_size = iter.p_start() - iter.p_items_start + trimmed;
+  assert(trim_size < NODE_BLOCK_SIZE);
+  assert(iter.get_back_offset() > trimmed);
+  node_offset_t shrunk_offset = iter.get_back_offset() - trimmed;
+  mut.copy_in_absolute(const_cast<char*>(iter.item_range.p_end), shrunk_offset);
+  return trim_size;
+}
+
+#define ITER_TEMPLATE(NT) template class ITER_INST(NT)
+ITER_TEMPLATE(node_type_t::LEAF);
+ITER_TEMPLATE(node_type_t::INTERNAL);
+
+#define APPEND_T ITER_T::Appender<KT>
+
+// Copies a run of items from src into the space below p_append.
+//
+// The copied bytes always end at src.p_end() as observed on entry; src is
+// then advanced to find where they start. `items` is in/out:
+//  - a valid index: src is advanced up to `items` times; running out of
+//    items is only allowed on the final step, which then switches to
+//    "append till end";
+//  - INDEX_END / INDEX_LAST: src is advanced to its last item and `items`
+//    is overwritten with the number of items covered (INDEX_END also counts
+//    the last item).
+// Returns true when the copy includes the last item of src.
+template <node_type_t NODE_TYPE>
+template <KeyT KT>
+bool APPEND_T::append(const ITER_T& src, index_t& items) {
+ auto p_end = src.p_end();
+ bool append_till_end = false;
+ if (is_valid_index(items)) {
+ for (auto i = 1u; i <= items; ++i) {
+ if (!src.has_next()) {
+ assert(i == items);
+ append_till_end = true;
+ break;
+ }
+ ++src;
+ }
+ } else {
+ if (items == INDEX_END) {
+ append_till_end = true;
+ } else {
+ assert(items == INDEX_LAST);
+ }
+ items = 0;
+ while (src.has_next()) {
+ ++src;
+ ++items;
+ }
+ if (append_till_end) {
+ ++items;
+ }
+ }
+
+ // When appending till the end the item src now points at is included,
+ // otherwise the copy stops right after it.
+ const char* p_start;
+ if (append_till_end) {
+ p_start = src.p_start();
+ } else {
+ p_start = src.p_end();
+ }
+ assert(p_end >= p_start);
+ size_t append_size = p_end - p_start;
+ // the destination is filled downwards from p_append
+ p_append -= append_size;
+ p_mut->copy_in_absolute(p_append, p_start, append_size);
+ return append_till_end;
+}
+
+// Starts a new item from an existing ns/oid view: sets aside the slot for
+// the back offset (written later by wrap_nxt()), appends the strings below
+// it and hands out the append position for the next stage.
+template <node_type_t NODE_TYPE>
+template <KeyT KT>
+std::tuple<NodeExtentMutable*, char*>
+APPEND_T::open_nxt(const key_get_type& partial_key) {
+  p_offset_while_open = (p_append -= sizeof(node_offset_t));
+  ns_oid_view_t::append(*p_mut, partial_key, p_append);
+  return {p_mut, p_append};
+}
+
+// Starts a new item from a full key: sets aside the slot for the back
+// offset (written later by wrap_nxt()), appends the key's ns/oid strings
+// below it and hands out the append position for the next stage.
+template <node_type_t NODE_TYPE>
+template <KeyT KT>
+std::tuple<NodeExtentMutable*, char*>
+APPEND_T::open_nxt(const full_key_t<KT>& key) {
+  p_offset_while_open = (p_append -= sizeof(node_offset_t));
+  ns_oid_view_t::append<KT>(*p_mut, key, p_append);
+  return {p_mut, p_append};
+}
+
+// Closes the item opened by open_nxt(): _p_append is where the next stage
+// stopped writing. The distance from there up to the reserved slot is stored
+// in that slot as the back offset, and appending continues from _p_append.
+template <node_type_t NODE_TYPE>
+template <KeyT KT>
+void APPEND_T::wrap_nxt(char* _p_append) {
+  assert(_p_append < p_append);
+  const auto back_offset =
+    static_cast<node_offset_t>(p_offset_while_open - _p_append);
+  p_mut->copy_in_absolute(p_offset_while_open, back_offset);
+  p_append = _p_append;
+}
+
+#define APPEND_TEMPLATE(NT, KT) template class ITER_INST(NT)::Appender<KT>
+APPEND_TEMPLATE(node_type_t::LEAF, KeyT::VIEW);
+APPEND_TEMPLATE(node_type_t::INTERNAL, KeyT::VIEW);
+APPEND_TEMPLATE(node_type_t::LEAF, KeyT::HOBJ);
+APPEND_TEMPLATE(node_type_t::INTERNAL, KeyT::HOBJ);
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h
new file mode 100644
index 000000000..bb68eec8f
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h
@@ -0,0 +1,180 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h"
+#include "key_layout.h"
+#include "stage_types.h"
+
+namespace crimson::os::seastore::onode {
+
+class NodeExtentMutable;
+
+/**
+ * item_iterator_t
+ *
+ * The STAGE_STRING implementation for node N0/N1, implements staged contract
+ * as an iterative container to resolve crush hash conflicts.
+ *
+ * The layout of the container to index ns, oid strings storing n items:
+ *
+ * # <--------- container range ---------> #
+ * #<~># items [i+1, n) #
+ * # # items [0, i) #<~>#
+ * # # <------ item i -------------> # #
+ * # # <--- item_range ---> | # #
+ * # # | # #
+ * # # next-stage | ns-oid | back_ # #
+ * # # container | strings | offset # #
+ * #...# range | | #...#
+ * ^ ^ | ^
+ * | | | |
+ * | +---------------------------+ |
+ * + p_items_start p_items_end +
+ */
+template <node_type_t NODE_TYPE>
+class item_iterator_t {
+ using value_t = value_type_t<NODE_TYPE>;
+ public:
+ // Starts at index 0, on the item that ends at range.p_end.
+ item_iterator_t(const memory_range_t& range)
+ : p_items_start(range.p_start), p_items_end(range.p_end) {
+ assert(p_items_start < p_items_end);
+ next_item_range(p_items_end);
+ }
+
+ // Bounds of the current item; p_end() also covers the trailing back offset
+ // field, which is not part of item_range.
+ const char* p_start() const { return item_range.p_start; }
+ const char* p_end() const { return item_range.p_end + sizeof(node_offset_t); }
+ const memory_range_t& get_item_range() const { return item_range; }
+ node_offset_t get_back_offset() const { return back_offset; }
+
+ // container type system
+ using key_get_type = const ns_oid_view_t&;
+ static constexpr auto CONTAINER_TYPE = ContainerType::ITERATIVE;
+ index_t index() const { return _index; }
+ // Builds the ns/oid view of the current item on first use and caches it
+ // until the iterator advances.
+ key_get_type get_key() const {
+ if (!key.has_value()) {
+ key = ns_oid_view_t(item_range.p_end);
+ assert(item_range.p_start < (*key).p_start());
+ }
+ return *key;
+ }
+ // Size of the whole current item, including its back offset field.
+ node_offset_t size() const {
+ size_t ret = item_range.p_end - item_range.p_start + sizeof(node_offset_t);
+ assert(ret < NODE_BLOCK_SIZE);
+ return ret;
+ };
+ // Size of the key strings plus the back offset field.
+ node_offset_t size_to_nxt() const {
+ size_t ret = get_key().size() + sizeof(node_offset_t);
+ assert(ret < NODE_BLOCK_SIZE);
+ return ret;
+ }
+ node_offset_t size_overhead() const {
+ return sizeof(node_offset_t) + get_key().size_overhead();
+ }
+ // Range handed to the next stage: from the start of the item up to the
+ // start of the key strings.
+ memory_range_t get_nxt_container() const {
+ return {item_range.p_start, get_key().p_start()};
+ }
+ // True while the current item does not start at p_items_start, i.e. there
+ // is another item below it.
+ bool has_next() const {
+ assert(p_items_start <= item_range.p_start);
+ return p_items_start < item_range.p_start;
+ }
+ // Moves to the item that ends where the current one starts. Declared const
+ // because the iteration state is kept in mutable members.
+ const item_iterator_t<NODE_TYPE>& operator++() const {
+ assert(has_next());
+ next_item_range(item_range.p_start);
+ key.reset();
+ ++_index;
+ return *this;
+ }
+ // Encodes the container bounds as offsets from p_node_start, followed by
+ // the current index.
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const {
+ int start_offset = p_items_start - p_node_start;
+ int end_offset = p_items_end - p_node_start;
+ assert(start_offset > 0 && start_offset < NODE_BLOCK_SIZE);
+ assert(end_offset > 0 && end_offset <= NODE_BLOCK_SIZE);
+ ceph::encode(static_cast<node_offset_t>(start_offset), encoded);
+ ceph::encode(static_cast<node_offset_t>(end_offset), encoded);
+ ceph::encode(_index, encoded);
+ }
+
+ // Counterpart of encode(): rebuilds the iterator over the decoded range
+ // relative to p_node_start and advances it to the decoded index.
+ static item_iterator_t decode(const char* p_node_start,
+ ceph::bufferlist::const_iterator& delta) {
+ node_offset_t start_offset;
+ ceph::decode(start_offset, delta);
+ node_offset_t end_offset;
+ ceph::decode(end_offset, delta);
+ assert(start_offset < end_offset);
+ assert(end_offset <= NODE_BLOCK_SIZE);
+ index_t index;
+ ceph::decode(index, delta);
+
+ item_iterator_t ret({p_node_start + start_offset,
+ p_node_start + end_offset});
+ while (index > 0) {
+ ++ret;
+ --index;
+ }
+ return ret;
+ }
+
+ static node_offset_t header_size() { return 0u; }
+
+ // Estimated bytes added at this stage by a new item for key: the ns/oid
+ // strings plus one back offset field. The value argument is unused.
+ template <KeyT KT>
+ static node_offset_t estimate_insert(
+ const full_key_t<KT>& key, const value_t&) {
+ return ns_oid_view_t::estimate_size<KT>(key) + sizeof(node_offset_t);
+ }
+
+ template <KeyT KT>
+ static memory_range_t insert_prefix(
+ NodeExtentMutable& mut, const item_iterator_t<NODE_TYPE>& iter,
+ const full_key_t<KT>& key, bool is_end,
+ node_offset_t size, const char* p_left_bound);
+
+ static void update_size(
+ NodeExtentMutable& mut, const item_iterator_t<NODE_TYPE>& iter, int change);
+
+ static node_offset_t trim_until(NodeExtentMutable&, const item_iterator_t<NODE_TYPE>&);
+ static node_offset_t trim_at(
+ NodeExtentMutable&, const item_iterator_t<NODE_TYPE>&, node_offset_t trimmed);
+
+ template <KeyT KT>
+ class Appender;
+
+ private:
+ // Loads the item whose back offset field ends at p_end: reads the back
+ // offset stored in the last sizeof(node_offset_t) bytes and derives the
+ // item range from it.
+ void next_item_range(const char* p_end) const {
+ auto p_item_end = p_end - sizeof(node_offset_t);
+ assert(p_items_start < p_item_end);
+ back_offset = reinterpret_cast<const node_offset_packed_t*>(p_item_end)->value;
+ assert(back_offset);
+ const char* p_item_start = p_item_end - back_offset;
+ assert(p_items_start <= p_item_start);
+ item_range = {p_item_start, p_item_end};
+ }
+
+ // bounds of the whole container
+ const char* p_items_start;
+ const char* p_items_end;
+ // state of the current item; mutable so that a const iterator can advance
+ mutable memory_range_t item_range;
+ mutable node_offset_t back_offset;
+ mutable std::optional<ns_oid_view_t> key;
+ mutable index_t _index = 0u;
+};
+
+/**
+ * item_iterator_t::Appender
+ *
+ * Writes items downwards from p_append into the extent behind p_mut.
+ * Whole items are copied with append(); a single item is built piecewise
+ * with open_nxt() ... wrap_nxt().
+ */
+template <node_type_t NODE_TYPE>
+template <KeyT KT>
+class item_iterator_t<NODE_TYPE>::Appender {
+ public:
+  Appender(NodeExtentMutable* p_mut, char* p_append)
+    : p_mut{p_mut}, p_append{p_append} {}
+  bool append(const item_iterator_t<NODE_TYPE>& src, index_t& items);
+  // Finishes appending and returns the final append position.
+  char* wrap() { return p_append; }
+  std::tuple<NodeExtentMutable*, char*> open_nxt(const key_get_type&);
+  std::tuple<NodeExtentMutable*, char*> open_nxt(const full_key_t<KT>&);
+  void wrap_nxt(char* _p_append);
+
+ private:
+  NodeExtentMutable* p_mut;
+  char* p_append;
+  // Slot of the back offset of the item currently open; set by open_nxt()
+  // and consumed by wrap_nxt(). Starts as nullptr so that a wrap_nxt()
+  // without a preceding open_nxt() does not read an indeterminate pointer.
+  char* p_offset_while_open = nullptr;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc
new file mode 100644
index 000000000..d60bb8d09
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc
@@ -0,0 +1,32 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "key_layout.h"
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h"
+
+namespace crimson::os::seastore::onode {
+
+// Appends str below p_append: first the string_size_t length field, then the
+// characters beneath it. p_append is left at the first character.
+void string_key_view_t::append_str(
+    NodeExtentMutable& mut, std::string_view str, char*& p_append) {
+  assert(is_valid_size(str.length()));
+  const string_size_t len = str.length();
+
+  p_append -= sizeof(string_size_t);
+  mut.copy_in_absolute(p_append, len);
+
+  p_append -= len;
+  mut.copy_in_absolute(p_append, str.data(), len);
+}
+
+// Appends a compressed string below p_append: only a string_size_t field
+// holding the MIN or MAX marker is written. Any other type aborts.
+void string_key_view_t::append_dedup(
+    NodeExtentMutable& mut, const Type& dedup_type, char*& p_append) {
+  p_append -= sizeof(string_size_t);
+  switch (dedup_type) {
+  case Type::MIN:
+    mut.copy_in_absolute(p_append, MIN);
+    break;
+  case Type::MAX:
+    mut.copy_in_absolute(p_append, MAX);
+    break;
+  default:
+    ceph_abort("impossible path");
+  }
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h
new file mode 100644
index 000000000..cc1f546c1
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h
@@ -0,0 +1,846 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cassert>
+#include <limits>
+#include <optional>
+#include <ostream>
+
+#include "common/hobject.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/fwd.h"
+
+namespace crimson::os::seastore::onode {
+
+using shard_t = int8_t;
+using pool_t = int64_t;
+using crush_hash_t = uint32_t;
+using snap_t = uint64_t;
+using gen_t = uint64_t;
+static_assert(sizeof(shard_t) == sizeof(ghobject_t().shard_id.id));
+static_assert(sizeof(pool_t) == sizeof(ghobject_t().hobj.pool));
+static_assert(sizeof(crush_hash_t) == sizeof(ghobject_t().hobj.get_hash()));
+static_assert(sizeof(snap_t) == sizeof(ghobject_t().hobj.snap.val));
+static_assert(sizeof(gen_t) == sizeof(ghobject_t().generation));
+
+class NodeExtentMutable;
+class key_view_t;
+class key_hobj_t;
+enum class KeyT { VIEW, HOBJ };
+template <KeyT> struct _full_key_type;
+template<> struct _full_key_type<KeyT::VIEW> { using type = key_view_t; };
+template<> struct _full_key_type<KeyT::HOBJ> { using type = key_hobj_t; };
+template <KeyT type>
+using full_key_t = typename _full_key_type<type>::type;
+
+// A node_offset_t wrapped in a packed struct.
+struct node_offset_packed_t {
+ node_offset_t value;
+} __attribute__((packed));
+
+// TODO: consider alignments
+// Packed (shard, pool) pair.
+struct shard_pool_t {
+ bool operator==(const shard_pool_t& x) const {
+ return (shard == x.shard && pool == x.pool);
+ }
+ bool operator!=(const shard_pool_t& x) const { return !(*this == x); }
+
+ // Builds the pair from a full key of either kind (declared only here).
+ template <KeyT KT>
+ static shard_pool_t from_key(const full_key_t<KT>& key);
+
+ shard_t shard;
+ pool_t pool;
+} __attribute__((packed));
+// Prints "<shard>,<pool>". shard_t is a signed 8-bit integer, so it is
+// widened to int: that keeps it from being streamed as a character, and a
+// negative shard is printed as such instead of wrapping to a huge unsigned
+// number.
+inline std::ostream& operator<<(std::ostream& os, const shard_pool_t& sp) {
+  return os << static_cast<int>(sp.shard) << "," << sp.pool;
+}
+// Orders by shard first; pool decides only when the shards are equal.
+inline MatchKindCMP compare_to(const shard_pool_t& l, const shard_pool_t& r) {
+  const auto shard_cmp = toMatchKindCMP(l.shard, r.shard);
+  return shard_cmp == MatchKindCMP::EQ ? toMatchKindCMP(l.pool, r.pool)
+                                       : shard_cmp;
+}
+
+// Packed wrapper around a crush hash value.
+struct crush_t {
+ bool operator==(const crush_t& x) const { return crush == x.crush; }
+ bool operator!=(const crush_t& x) const { return !(*this == x); }
+
+ // Builds the value from a full key of either kind (declared only here).
+ template <KeyT KT>
+ static crush_t from_key(const full_key_t<KT>& key);
+
+ crush_hash_t crush;
+} __attribute__((packed));
+// Prints the hash value.
+inline std::ostream& operator<<(std::ostream& os, const crush_t& c) {
+ return os << c.crush;
+}
+// Orders by the hash value.
+inline MatchKindCMP compare_to(const crush_t& l, const crush_t& r) {
+ return toMatchKindCMP(l.crush, r.crush);
+}
+
+// Packed concatenation of shard_pool_t and crush_t.
+struct shard_pool_crush_t {
+ bool operator==(const shard_pool_crush_t& x) const {
+ return (shard_pool == x.shard_pool && crush == x.crush);
+ }
+ bool operator!=(const shard_pool_crush_t& x) const { return !(*this == x); }
+
+ // Builds the value from a full key of either kind (declared only here).
+ template <KeyT KT>
+ static shard_pool_crush_t from_key(const full_key_t<KT>& key);
+
+ shard_pool_t shard_pool;
+ crush_t crush;
+} __attribute__((packed));
+// Prints "<shard_pool>,<crush>" using the operators of the two members.
+inline std::ostream& operator<<(std::ostream& os, const shard_pool_crush_t& spc) {
+ return os << spc.shard_pool << "," << spc.crush;
+}
+// Orders by shard_pool first; crush decides only when those are equal.
+inline MatchKindCMP compare_to(
+    const shard_pool_crush_t& l, const shard_pool_crush_t& r) {
+  const auto shard_pool_cmp = compare_to(l.shard_pool, r.shard_pool);
+  return shard_pool_cmp == MatchKindCMP::EQ ? compare_to(l.crush, r.crush)
+                                            : shard_pool_cmp;
+}
+
+// Packed (snap, gen) pair.
+struct snap_gen_t {
+ bool operator==(const snap_gen_t& x) const {
+ return (snap == x.snap && gen == x.gen);
+ }
+ bool operator!=(const snap_gen_t& x) const { return !(*this == x); }
+
+ // Builds the pair from a full key of either kind (declared only here).
+ template <KeyT KT>
+ static snap_gen_t from_key(const full_key_t<KT>& key);
+
+ snap_t snap;
+ gen_t gen;
+} __attribute__((packed));
+// Prints "<snap>,<gen>".
+inline std::ostream& operator<<(std::ostream& os, const snap_gen_t& sg) {
+ return os << sg.snap << "," << sg.gen;
+}
+// Orders by snap first; gen decides only when the snaps are equal.
+inline MatchKindCMP compare_to(const snap_gen_t& l, const snap_gen_t& r) {
+  const auto snap_cmp = toMatchKindCMP(l.snap, r.snap);
+  return snap_cmp == MatchKindCMP::EQ ? toMatchKindCMP(l.gen, r.gen)
+                                      : snap_cmp;
+}
+
+/**
+ * string_key_view_t
+ *
+ * The layout to store char array as an oid or an ns string which may be
+ * compressed.
+ *
+ * If compressed, the physical block only stores an unsigned int of
+ * string_size_t, with value 0 denoting Type::MIN, and value max() denoting
+ * Type::MAX.
+ *
+ * If not compressed (Type::STR), the physical block stores the char array and
+ * a valid string_size_t value.
+ */
+struct string_key_view_t {
+ enum class Type {MIN, STR, MAX};
+ // presumably the maximum string length is 2KiB
+ using string_size_t = uint16_t;
+ // Length values reserved as markers for the compressed MAX / MIN types.
+ static constexpr auto MAX = std::numeric_limits<string_size_t>::max();
+ static constexpr auto MIN = string_size_t(0u);
+ // A real string length lies strictly between the two markers.
+ static auto is_valid_size(size_t size) {
+ return (size > MIN && size < MAX);
+ }
+
+ // Parses the view backwards from p_end: the length field occupies the last
+ // sizeof(string_size_t) bytes and, for a real string, the characters sit
+ // directly below it. The length is read with memcpy into a local copy.
+ string_key_view_t(const char* p_end) {
+ p_length = p_end - sizeof(string_size_t);
+ std::memcpy(&length, p_length, sizeof(string_size_t));
+ if (is_valid_size(length)) {
+ auto _p_key = p_length - length;
+ p_key = static_cast<const char*>(_p_key);
+ } else {
+ assert(length == MAX || length == MIN);
+ p_key = nullptr;
+ }
+ }
+ // Classifies the view by its length value.
+ Type type() const {
+ if (length == MIN) {
+ return Type::MIN;
+ } else if (length == MAX) {
+ return Type::MAX;
+ } else {
+ assert(is_valid_size(length));
+ return Type::STR;
+ }
+ }
+ // Lowest address covered by the view: the first character for a real
+ // string, otherwise the length field.
+ const char* p_start() const {
+ if (p_key) {
+ return p_key;
+ } else {
+ return p_length;
+ }
+ }
+ // NOTE(review): for a real string this is p_start(); for a compressed one
+ // it is the address just past the length field -- confirm the intended
+ // meaning with the callers.
+ const char* p_next_end() const {
+ if (p_key) {
+ return p_start();
+ } else {
+ return p_length + sizeof(string_size_t);
+ }
+ }
+ // length plus the length field itself.
+ node_offset_t size() const {
+ size_t ret = length + sizeof(string_size_t);
+ assert(ret < NODE_BLOCK_SIZE);
+ return ret;
+ }
+ // Number of characters; real strings only.
+ node_offset_t size_logical() const {
+ assert(type() == Type::STR);
+ assert(is_valid_size(length));
+ return length;
+ }
+ // Bytes spent on the length field; real strings only.
+ node_offset_t size_overhead() const {
+ assert(type() == Type::STR);
+ return sizeof(string_size_t);
+ }
+
+ // Non-owning view of the characters; real strings only.
+ std::string_view to_string_view() const {
+ assert(type() == Type::STR);
+ assert(is_valid_size(length));
+ return {p_key, length};
+ }
+ // Two compressed views are equal when their types match; two real strings
+ // when length and bytes match.
+ bool operator==(const string_key_view_t& x) const {
+ if (type() == x.type() && type() != Type::STR)
+ return true;
+ if (type() != x.type())
+ return false;
+ if (length != x.length)
+ return false;
+ return (memcmp(p_key, x.p_key, length) == 0);
+ }
+ bool operator!=(const string_key_view_t& x) const { return !(*this == x); }
+
+ static void append_str(
+ NodeExtentMutable&, std::string_view, char*& p_append);
+
+ // Writes the length field and then the characters below p_append using
+ // memcpy directly on the buffer, without a NodeExtentMutable.
+ static void test_append_str(std::string_view str, char*& p_append) {
+ assert(is_valid_size(str.length()));
+ p_append -= sizeof(string_size_t);
+ string_size_t len = str.length();
+ std::memcpy(p_append, &len, sizeof(string_size_t));
+ p_append -= len;
+ std::memcpy(p_append, str.data(), len);
+ }
+
+ static void append_dedup(
+ NodeExtentMutable&, const Type& dedup_type, char*& p_append);
+
+ // Writes only the MIN or MAX marker below p_append using memcpy directly on
+ // the buffer, without a NodeExtentMutable. Any other type aborts.
+ static void test_append_dedup(const Type& dedup_type, char*& p_append) {
+ p_append -= sizeof(string_size_t);
+ string_size_t len;
+ if (dedup_type == Type::MIN) {
+ len = MIN;
+ } else if (dedup_type == Type::MAX) {
+ len = MAX;
+ } else {
+ ceph_abort("impossible path");
+ }
+ std::memcpy(p_append, &len, sizeof(string_size_t));
+ }
+
+ // first character of a real string, nullptr for a compressed one
+ const char* p_key;
+ // location of the length field
+ const char* p_length;
+ // TODO: remove if p_length is aligned
+ string_size_t length;
+};
+
+/**
+ * string_view_masked_t
+ *
+ * A common class to hide the underlying string implementation regardless of a
+ * string_key_view_t (maybe compressed), a string/string_view, or a compressed
+ * string. This consistent class is then leveraged for compare, print, convert
+ * and append operations.
+ */
+class string_view_masked_t {
+ public:
+ using string_size_t = string_key_view_t::string_size_t;
+ using Type = string_key_view_t::Type;
+ // Takes over the type of an on-disk view; the characters are referenced
+ // only when it is a real string.
+ explicit string_view_masked_t(const string_key_view_t& index)
+ : type{index.type()} {
+ if (type == Type::STR) {
+ view = index.to_string_view();
+ }
+ }
+ // Wraps a real string, whose size must be a valid string size.
+ explicit string_view_masked_t(std::string_view str)
+ : type{Type::STR}, view{str} {
+ assert(string_key_view_t::is_valid_size(view.size()));
+ }
+
+ Type get_type() const { return type; }
+ // Real strings only.
+ std::string_view to_string_view() const {
+ assert(get_type() == Type::STR);
+ return view;
+ }
+ // Real strings only.
+ string_size_t size() const {
+ assert(get_type() == Type::STR);
+ assert(string_key_view_t::is_valid_size(view.size()));
+ return view.size();
+ }
+ // Two MIN or two MAX values are equal; two real strings are equal when
+ // size and bytes match; different types never are.
+ bool operator==(const string_view_masked_t& x) const {
+ if (get_type() == x.get_type() && get_type() != Type::STR)
+ return true;
+ if (get_type() != x.get_type())
+ return false;
+ if (size() != x.size())
+ return false;
+ return (memcmp(view.data(), x.view.data(), size()) == 0);
+ }
+ bool operator!=(const string_view_masked_t& x) const { return !(*this == x); }
+ // MIN / MAX are encoded as their marker value alone; a real string as its
+ // size followed by the characters (encode_nohead).
+ void encode(ceph::bufferlist& bl) const {
+ if (get_type() == Type::MIN) {
+ ceph::encode(string_key_view_t::MIN, bl);
+ } else if (get_type() == Type::MAX) {
+ ceph::encode(string_key_view_t::MAX, bl);
+ } else {
+ ceph::encode(size(), bl);
+ ceph::encode_nohead(view, bl);
+ }
+ }
+ static auto min() { return string_view_masked_t{Type::MIN}; }
+ static auto max() { return string_view_masked_t{Type::MAX}; }
+ // Counterpart of encode(). The characters of a real string are decoded
+ // into str_storage and the returned object only references them, so
+ // str_storage has to outlive it.
+ static string_view_masked_t decode(
+ std::string& str_storage, ceph::bufferlist::const_iterator& delta) {
+ string_size_t size;
+ ceph::decode(size, delta);
+ if (size == string_key_view_t::MIN) {
+ return min();
+ } else if (size == string_key_view_t::MAX) {
+ return max();
+ } else {
+ ceph::decode_nohead(size, str_storage, delta);
+ return string_view_masked_t(str_storage);
+ }
+ }
+
+ private:
+ // Used by min() / max(); leaves the view empty.
+ explicit string_view_masked_t(Type type)
+ : type{type} {}
+
+ Type type;
+ std::string_view view;
+};
+// Three-way compare of two masked strings: MIN sorts before every string,
+// MAX sorts after every string, and two strings are compared by content.
+inline MatchKindCMP compare_to(const string_view_masked_t& l, const string_view_masked_t& r) {
+  using Type = string_view_masked_t::Type;
+  const auto l_type = l.get_type();
+  const auto r_type = r.get_type();
+  if (l_type == r_type) {
+    if (l_type != Type::STR) {
+      // MIN vs MIN, or MAX vs MAX
+      return MatchKindCMP::EQ;
+    }
+    assert(l.size() && r.size());
+    return toMatchKindCMP(l.to_string_view(), r.to_string_view());
+  }
+  // types differ: l is smaller when it is MIN or when r is MAX,
+  // otherwise l is MAX or r is MIN
+  const bool l_is_smaller = (l_type == Type::MIN || r_type == Type::MAX);
+  return l_is_smaller ? MatchKindCMP::LT : MatchKindCMP::GT;
+}
+// Three-way compare of a non-empty plain string against a masked string:
+// any string is greater than MIN and less than MAX.
+inline MatchKindCMP compare_to(std::string_view l, const string_view_masked_t& r) {
+  using Type = string_view_masked_t::Type;
+  assert(l.length());
+  switch (r.get_type()) {
+  case Type::MIN:
+    return MatchKindCMP::GT;
+  case Type::MAX:
+    return MatchKindCMP::LT;
+  default:  // Type::STR
+    assert(r.size());
+    return toMatchKindCMP(l, r.to_string_view());
+  }
+}
+// Mirror of compare_to(std::string_view, const string_view_masked_t&): the
+// operands are swapped for the call and the result is passed through reverse().
+inline MatchKindCMP compare_to(const string_view_masked_t& l, std::string_view r) {
+ return reverse(compare_to(r, l));
+}
+// Print "MIN", "MAX", or the quoted string. Strings longer than 12 bytes are
+// abbreviated to their first 4 and last 2 characters plus the total length.
+inline std::ostream& operator<<(std::ostream& os, const string_view_masked_t& masked) {
+  using Type = string_view_masked_t::Type;
+  switch (masked.get_type()) {
+  case Type::MIN:
+    return os << "MIN";
+  case Type::MAX:
+    return os << "MAX";
+  default:  // Type::STR
+    break;
+  }
+  const auto str = masked.to_string_view();
+  const auto len = str.length();
+  if (len > 12) {
+    os << "\"" << str.substr(0, 4) << ".."
+       << str.substr(len - 2)
+       << "/" << len << "B\"";
+  } else {
+    os << "\"" << str << "\"";
+  }
+  return os;
+}
+
+// View over the namespace and object-id strings of a key as laid out in a
+// node: nspace is constructed from p_end, and oid from nspace.p_next_end().
+struct ns_oid_view_t {
+  using string_size_t = string_key_view_t::string_size_t;
+  using Type = string_key_view_t::Type;
+
+  ns_oid_view_t(const char* p_end) : nspace(p_end), oid(nspace.p_next_end()) {}
+
+  // The type of the pair is taken from the oid part.
+  Type type() const { return oid.type(); }
+  const char* p_start() const { return oid.p_start(); }
+
+  // Size of the view: both strings for Type::STR, otherwise a single
+  // string_size_t.
+  node_offset_t size() const {
+    if (type() != Type::STR) {
+      return sizeof(string_size_t);
+    }
+    size_t total = nspace.size() + oid.size();
+    assert(total < NODE_BLOCK_SIZE);
+    return total;
+  }
+  node_offset_t size_logical() const {
+    assert(type() == Type::STR);
+    return nspace.size_logical() + oid.size_logical();
+  }
+  node_offset_t size_overhead() const {
+    assert(type() == Type::STR);
+    return nspace.size_overhead() + oid.size_overhead();
+  }
+
+  // Equal when both the namespace and the oid parts compare equal.
+  bool operator==(const ns_oid_view_t& x) const {
+    if (string_view_masked_t{nspace} != string_view_masked_t{x.nspace}) {
+      return false;
+    }
+    return string_view_masked_t{oid} == string_view_masked_t{x.oid};
+  }
+  bool operator!=(const ns_oid_view_t& x) const { return !(*this == x); }
+
+  template <KeyT KT>
+  static node_offset_t estimate_size(const full_key_t<KT>& key);
+
+  template <KeyT KT>
+  static void append(NodeExtentMutable&,
+                     const full_key_t<KT>& key,
+                     char*& p_append);
+
+  // Append an existing view: both strings for Type::STR, otherwise the
+  // dedup marker for the view's type.
+  static void append(NodeExtentMutable& mut,
+                     const ns_oid_view_t& view,
+                     char*& p_append) {
+    if (view.type() != Type::STR) {
+      string_key_view_t::append_dedup(mut, view.type(), p_append);
+      return;
+    }
+    string_key_view_t::append_str(mut, view.nspace.to_string_view(), p_append);
+    string_key_view_t::append_str(mut, view.oid.to_string_view(), p_append);
+  }
+
+  template <KeyT KT>
+  static void test_append(const full_key_t<KT>& key, char*& p_append);
+
+  string_key_view_t nspace;
+  string_key_view_t oid;
+};
+// Print the pair as "<nspace>,<oid>" using the masked-string formatting.
+inline std::ostream& operator<<(std::ostream& os, const ns_oid_view_t& ns_oid) {
+  os << string_view_masked_t{ns_oid.nspace};
+  os << ",";
+  os << string_view_masked_t{ns_oid.oid};
+  return os;
+}
+// Compare by namespace first; the oid only decides when namespaces are equal.
+inline MatchKindCMP compare_to(const ns_oid_view_t& l, const ns_oid_view_t& r) {
+  const auto ns_cmp = compare_to(string_view_masked_t{l.nspace},
+                                 string_view_masked_t{r.nspace});
+  if (ns_cmp == MatchKindCMP::EQ) {
+    return compare_to(string_view_masked_t{l.oid},
+                      string_view_masked_t{r.oid});
+  }
+  return ns_cmp;
+}
+
+/**
+ * key_hobj_t
+ *
+ * A specialized implementation of a full_key_t storing a ghobject_t passed
+ * from user.
+ */
+class key_hobj_t {
+ public:
+  explicit key_hobj_t(const ghobject_t& ghobj) : ghobj{ghobj} {}
+
+  /*
+   * common interfaces as a full_key_t
+   */
+  shard_t shard() const { return ghobj.shard_id; }
+  pool_t pool() const { return ghobj.hobj.pool; }
+  crush_hash_t crush() const { return ghobj.hobj.get_hash(); }
+
+  // TODO(cross-node string dedup)
+  std::string_view nspace() const { return ghobj.hobj.nspace; }
+  // TODO(cross-node string dedup)
+  string_view_masked_t nspace_masked() const {
+    return string_view_masked_t{nspace()};
+  }
+
+  // TODO(cross-node string dedup)
+  std::string_view oid() const { return ghobj.hobj.oid.name; }
+  // TODO(cross-node string dedup)
+  string_view_masked_t oid_masked() const {
+    return string_view_masked_t{oid()};
+  }
+
+  ns_oid_view_t::Type dedup_type() const { return _dedup_type; }
+  snap_t snap() const { return ghobj.hobj.snap; }
+  gen_t gen() const { return ghobj.generation; }
+
+  bool operator==(const full_key_t<KeyT::VIEW>& o) const;
+  bool operator==(const full_key_t<KeyT::HOBJ>& o) const;
+  bool operator!=(const full_key_t<KeyT::VIEW>& o) const {
+    return !operator==(o);
+  }
+  bool operator!=(const full_key_t<KeyT::HOBJ>& o) const {
+    return !operator==(o);
+  }
+
+  // Print as "key_hobj(shard,pool,crush; nspace,oid; snap,gen)".
+  std::ostream& dump(std::ostream& os) const {
+    os << "key_hobj(" << (unsigned)shard() << ","
+       << pool() << "," << crush() << "; ";
+    os << nspace_masked() << "," << oid_masked() << "; ";
+    os << snap() << "," << gen() << ")";
+    return os;
+  }
+
+  // Rebuild a key from delta. Fields are read in the order shard, pool,
+  // crush, nspace, oid, snap, gen; both strings must decode as Type::STR.
+  static key_hobj_t decode(ceph::bufferlist::const_iterator& delta) {
+    shard_t shard;
+    ceph::decode(shard, delta);
+    pool_t pool;
+    ceph::decode(pool, delta);
+    crush_hash_t crush;
+    ceph::decode(crush, delta);
+
+    std::string nspace;
+    auto nspace_masked = string_view_masked_t::decode(nspace, delta);
+    // TODO(cross-node string dedup)
+    assert(nspace_masked.get_type() == string_view_masked_t::Type::STR);
+
+    std::string oid;
+    auto oid_masked = string_view_masked_t::decode(oid, delta);
+    // TODO(cross-node string dedup)
+    assert(oid_masked.get_type() == string_view_masked_t::Type::STR);
+
+    snap_t snap;
+    ceph::decode(snap, delta);
+    gen_t gen;
+    ceph::decode(gen, delta);
+
+    return key_hobj_t(ghobject_t(
+        shard_id_t(shard), pool, crush, nspace, oid, snap, gen));
+  }
+
+ private:
+  ns_oid_view_t::Type _dedup_type = ns_oid_view_t::Type::STR;
+  ghobject_t ghobj;
+};
+inline std::ostream& operator<<(std::ostream& os, const key_hobj_t& key) {
+ return key.dump(os);
+}
+
+/**
+ * key_view_t
+ *
+ * A specialized implementation of a full_key_t pointing to the locations
+ * storing the full key in a tree node.
+ */
+class key_view_t {
+ public:
+ /**
+ * common interfaces as a full_key_t
+ */
+ // Each accessor below reads through the matching *_packed()/ns_oid_view()
+ // getter, which asserts that the component has been assigned.
+ shard_t shard() const {
+ return shard_pool_packed().shard;
+ }
+ pool_t pool() const {
+ return shard_pool_packed().pool;
+ }
+ crush_hash_t crush() const {
+ return crush_packed().crush;
+ }
+ std::string_view nspace() const {
+ // TODO(cross-node string dedup)
+ return ns_oid_view().nspace.to_string_view();
+ }
+ string_view_masked_t nspace_masked() const {
+ // TODO(cross-node string dedup)
+ return string_view_masked_t{ns_oid_view().nspace};
+ }
+ std::string_view oid() const {
+ // TODO(cross-node string dedup)
+ return ns_oid_view().oid.to_string_view();
+ }
+ string_view_masked_t oid_masked() const {
+ // TODO(cross-node string dedup)
+ return string_view_masked_t{ns_oid_view().oid};
+ }
+ ns_oid_view_t::Type dedup_type() const {
+ return ns_oid_view().type();
+ }
+ snap_t snap() const {
+ return snap_gen_packed().snap;
+ }
+ gen_t gen() const {
+ return snap_gen_packed().gen;
+ }
+
+ bool operator==(const full_key_t<KeyT::VIEW>& o) const;
+ bool operator==(const full_key_t<KeyT::HOBJ>& o) const;
+ bool operator!=(const full_key_t<KeyT::VIEW>& o) const {
+ return !operator==(o);
+ }
+ bool operator!=(const full_key_t<KeyT::HOBJ>& o) const {
+ return !operator==(o);
+ }
+
+ /**
+ * key_view_t specific interfaces
+ */
+ // has_*(): whether the corresponding component has been assigned.
+ bool has_shard_pool() const {
+ return p_shard_pool != nullptr;
+ }
+ bool has_crush() const {
+ return p_crush != nullptr;
+ }
+ bool has_ns_oid() const {
+ return p_ns_oid.has_value();
+ }
+ bool has_snap_gen() const {
+ return p_snap_gen != nullptr;
+ }
+
+ // *_packed()/ns_oid_view(): access the stored component; asserts it is set.
+ const shard_pool_t& shard_pool_packed() const {
+ assert(has_shard_pool());
+ return *p_shard_pool;
+ }
+ const crush_t& crush_packed() const {
+ assert(has_crush());
+ return *p_crush;
+ }
+ const ns_oid_view_t& ns_oid_view() const {
+ assert(has_ns_oid());
+ return *p_ns_oid;
+ }
+ const snap_gen_t& snap_gen_packed() const {
+ assert(has_snap_gen());
+ return *p_snap_gen;
+ }
+
+ // Sum of the fixed-width field sizes plus the logical size of the
+ // namespace/oid strings.
+ size_t size_logical() const {
+ return sizeof(shard_t) + sizeof(pool_t) + sizeof(crush_hash_t) +
+ sizeof(snap_t) + sizeof(gen_t) + ns_oid_view().size_logical();
+ }
+
+ // Build a ghobject_t from the viewed components; the namespace and oid
+ // strings are copied into std::string.
+ ghobject_t to_ghobj() const {
+ return ghobject_t(
+ shard_id_t(shard()), pool(), crush(),
+ std::string(nspace()), std::string(oid()), snap(), gen());
+ }
+
+ // replace() overwrites a component unconditionally; set() first asserts
+ // that the component has not been assigned yet.
+ void replace(const crush_t& key) { p_crush = &key; }
+ void set(const crush_t& key) {
+ assert(!has_crush());
+ replace(key);
+ }
+ // NOTE(review): this overload only updates p_shard_pool and leaves p_crush
+ // untouched, whereas set(const shard_pool_crush_t&) below also assigns
+ // key.crush -- confirm the asymmetry is intended.
+ void replace(const shard_pool_crush_t& key) { p_shard_pool = &key.shard_pool; }
+ void set(const shard_pool_crush_t& key) {
+ set(key.crush);
+ assert(!has_shard_pool());
+ replace(key);
+ }
+ void replace(const ns_oid_view_t& key) { p_ns_oid = key; }
+ void set(const ns_oid_view_t& key) {
+ assert(!has_ns_oid());
+ replace(key);
+ }
+ void replace(const snap_gen_t& key) { p_snap_gen = &key; }
+ void set(const snap_gen_t& key) {
+ assert(!has_snap_gen());
+ replace(key);
+ }
+
+ // Print as "key_view(shard,pool,crush; nspace,oid; snap,gen)", writing "X"
+ // in place of every component that has not been assigned.
+ std::ostream& dump(std::ostream& os) const {
+ os << "key_view(";
+ if (has_shard_pool()) {
+ os << (unsigned)shard() << "," << pool() << ",";
+ } else {
+ os << "X,X,";
+ }
+ if (has_crush()) {
+ os << crush() << "; ";
+ } else {
+ os << "X; ";
+ }
+ if (has_ns_oid()) {
+ os << ns_oid_view() << "; ";
+ } else {
+ os << "X,X; ";
+ }
+ if (has_snap_gen()) {
+ os << snap() << "," << gen() << ")";
+ } else {
+ os << "X,X)";
+ }
+ return os;
+ }
+
+ private:
+ // The pointer members store the address of the object passed to
+ // replace()/set(); nullptr means "not assigned". The ns_oid view is stored
+ // by value in an optional.
+ const shard_pool_t* p_shard_pool = nullptr;
+ const crush_t* p_crush = nullptr;
+ std::optional<ns_oid_view_t> p_ns_oid;
+ const snap_gen_t* p_snap_gen = nullptr;
+};
+
+// Serialize a full key into bl, in the order shard, pool, crush, nspace, oid,
+// snap, gen. key_hobj_t::decode() reads the fields back in the same order.
+template <KeyT KT>
+void encode_key(const full_key_t<KT>& key, ceph::bufferlist& bl) {
+ ceph::encode(key.shard(), bl);
+ ceph::encode(key.pool(), bl);
+ ceph::encode(key.crush(), bl);
+ key.nspace_masked().encode(bl);
+ key.oid_masked().encode(bl);
+ ceph::encode(key.snap(), bl);
+ ceph::encode(key.gen(), bl);
+}
+
+// Three-way compare of two plain strings; forwards to toMatchKindCMP().
+inline MatchKindCMP compare_to(std::string_view l, std::string_view r) {
+ return toMatchKindCMP(l, r);
+}
+// Equality test (not a three-way compare) between two full keys of possibly
+// different representations: returns true only when every component matches.
+template <KeyT TypeL, KeyT TypeR>
+bool compare_full_key(const full_key_t<TypeL>& l, const full_key_t<TypeR>& r) {
+  // components are checked in key order; evaluation stops at the first
+  // mismatch
+  const bool differs =
+      l.shard() != r.shard() ||
+      l.pool() != r.pool() ||
+      l.crush() != r.crush() ||
+      compare_to(l.nspace(), r.nspace()) != MatchKindCMP::EQ ||
+      compare_to(l.oid(), r.oid()) != MatchKindCMP::EQ ||
+      l.snap() != r.snap() ||
+      l.gen() != r.gen();
+  return !differs;
+}
+
+// Equality operators between the two full-key representations; every
+// combination forwards to compare_full_key() with the matching KeyT pair.
+inline bool key_hobj_t::operator==(const full_key_t<KeyT::VIEW>& o) const {
+ return compare_full_key<KeyT::HOBJ, KeyT::VIEW>(*this, o);
+}
+inline bool key_hobj_t::operator==(const full_key_t<KeyT::HOBJ>& o) const {
+ return compare_full_key<KeyT::HOBJ, KeyT::HOBJ>(*this, o);
+}
+inline bool key_view_t::operator==(const full_key_t<KeyT::VIEW>& o) const {
+ return compare_full_key<KeyT::VIEW, KeyT::VIEW>(*this, o);
+}
+inline bool key_view_t::operator==(const full_key_t<KeyT::HOBJ>& o) const {
+ return compare_full_key<KeyT::VIEW, KeyT::HOBJ>(*this, o);
+}
+
+// Stream a key_view_t by delegating to key_view_t::dump().
+inline std::ostream& operator<<(std::ostream& os, const key_view_t& key) {
+ return key.dump(os);
+}
+
+// Compare a full key against a packed shard/pool pair: shard first, pool only
+// when the shards are equal.
+template <KeyT Type>
+MatchKindCMP compare_to(const full_key_t<Type>& key, const shard_pool_t& target) {
+  const auto shard_cmp = toMatchKindCMP(key.shard(), target.shard);
+  if (shard_cmp == MatchKindCMP::EQ) {
+    return toMatchKindCMP(key.pool(), target.pool);
+  }
+  return shard_cmp;
+}
+
+// Compare the crush hash of a full key against a packed crush value.
+template <KeyT Type>
+MatchKindCMP compare_to(const full_key_t<Type>& key, const crush_t& target) {
+ return toMatchKindCMP(key.crush(), target.crush);
+}
+
+// Compare a full key against a packed shard/pool/crush triple: shard and pool
+// first, crush only when those are equal.
+template <KeyT Type>
+MatchKindCMP compare_to(const full_key_t<Type>& key, const shard_pool_crush_t& target) {
+  const auto shard_pool_cmp = compare_to<Type>(key, target.shard_pool);
+  if (shard_pool_cmp == MatchKindCMP::EQ) {
+    return compare_to<Type>(key, target.crush);
+  }
+  return shard_pool_cmp;
+}
+
+// Compare the namespace/oid strings of a full key against an in-node view:
+// namespace first, oid only when the namespaces are equal.
+template <KeyT Type>
+MatchKindCMP compare_to(const full_key_t<Type>& key, const ns_oid_view_t& target) {
+  const auto ns_cmp = compare_to(key.nspace(), string_view_masked_t{target.nspace});
+  if (ns_cmp == MatchKindCMP::EQ) {
+    return compare_to(key.oid(), string_view_masked_t{target.oid});
+  }
+  return ns_cmp;
+}
+
+// Compare a full key against a packed snap/gen pair: snap first, gen only
+// when the snaps are equal.
+template <KeyT Type>
+MatchKindCMP compare_to(const full_key_t<Type>& key, const snap_gen_t& target) {
+  const auto snap_cmp = toMatchKindCMP(key.snap(), target.snap);
+  if (snap_cmp == MatchKindCMP::EQ) {
+    return toMatchKindCMP(key.gen(), target.gen);
+  }
+  return snap_cmp;
+}
+
+// Extract the shard/pool pair from a full key. A VIEW key already holds the
+// packed form; any other key type builds it from shard() and pool().
+template <KeyT KT>
+shard_pool_t shard_pool_t::from_key(const full_key_t<KT>& key) {
+  if constexpr (KT != KeyT::VIEW) {
+    return {key.shard(), key.pool()};
+  } else {
+    return key.shard_pool_packed();
+  }
+}
+
+// Extract the crush value from a full key. A VIEW key already holds the
+// packed form; any other key type builds it from crush().
+template <KeyT KT>
+crush_t crush_t::from_key(const full_key_t<KT>& key) {
+  if constexpr (KT != KeyT::VIEW) {
+    return {key.crush()};
+  } else {
+    return key.crush_packed();
+  }
+}
+
+// Build the shard/pool/crush triple by combining shard_pool_t::from_key() and
+// crush_t::from_key() on the same key.
+template <KeyT KT>
+shard_pool_crush_t shard_pool_crush_t::from_key(const full_key_t<KT>& key) {
+ return {shard_pool_t::from_key<KT>(key), crush_t::from_key<KT>(key)};
+}
+
+// Extract the snap/gen pair from a full key. A VIEW key already holds the
+// packed form; any other key type builds it from snap() and gen().
+template <KeyT KT>
+snap_gen_t snap_gen_t::from_key(const full_key_t<KT>& key) {
+  if constexpr (KT != KeyT::VIEW) {
+    return {key.snap(), key.gen()};
+  } else {
+    return key.snap_gen_packed();
+  }
+}
+
+// Estimate the size needed to store the namespace/oid part of key.
+// A VIEW key reports the size of its existing view. Otherwise a string key
+// needs two length fields plus both strings, and a deduplicated key needs a
+// single length field.
+template <KeyT KT>
+node_offset_t ns_oid_view_t::estimate_size(const full_key_t<KT>& key) {
+  if constexpr (KT == KeyT::VIEW) {
+    return key.ns_oid_view().size();
+  } else {
+    if (key.dedup_type() == Type::STR) {
+      return 2 * sizeof(string_size_t) + key.nspace().size() + key.oid().size();
+    }
+    // size after deduplication
+    return sizeof(string_size_t);
+  }
+}
+
+// Append the namespace/oid part of key at p_append: the dedup marker for a
+// non-string key, otherwise the namespace string followed by the oid string.
+template <KeyT KT>
+void ns_oid_view_t::append(
+    NodeExtentMutable& mut, const full_key_t<KT>& key, char*& p_append) {
+  if (key.dedup_type() != Type::STR) {
+    string_key_view_t::append_dedup(mut, key.dedup_type(), p_append);
+    return;
+  }
+  string_key_view_t::append_str(mut, key.nspace(), p_append);
+  string_key_view_t::append_str(mut, key.oid(), p_append);
+}
+
+// Test counterpart of append() that takes no NodeExtentMutable: the dedup
+// marker for a non-string key, otherwise the namespace string followed by the
+// oid string.
+template <KeyT KT>
+void ns_oid_view_t::test_append(const full_key_t<KT>& key, char*& p_append) {
+  if (key.dedup_type() != Type::STR) {
+    string_key_view_t::test_append_dedup(key.dedup_type(), p_append);
+    return;
+  }
+  string_key_view_t::test_append_str(key.nspace(), p_append);
+  string_key_view_t::test_append_str(key.oid(), p_append);
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc
new file mode 100644
index 000000000..4a5988185
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc
@@ -0,0 +1,318 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "node_stage.h"
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h"
+#include "node_stage_layout.h"
+
+namespace crimson::os::seastore::onode {
+
+#define NODE_T node_extent_t<FieldType, NODE_TYPE>
+#define NODE_INST(FT, NT) node_extent_t<FT, NT>
+
+// Pointer to the item end offset at index keys(), moved back by one laddr_t
+// for an internal level-tail node. Returns nullptr for N3 internal nodes.
+template <typename FieldType, node_type_t NODE_TYPE>
+const char* NODE_T::p_left_bound() const {
+  if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) {
+    // N3 internal node doesn't have the right part
+    return nullptr;
+  } else {
+    const char* bound = p_start() + fields().get_item_end_offset(keys());
+    if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+      // the subtraction is only applied to an internal level-tail node
+      bound -= is_level_tail() ? sizeof(laddr_t) : 0;
+    }
+    return bound;
+  }
+}
+
+// Size attributed to this stage for the entry at index: the per-entry insert
+// estimate, plus the size of the ns/oid view for N2. Aborts for N3.
+template <typename FieldType, node_type_t NODE_TYPE>
+node_offset_t NODE_T::size_to_nxt_at(index_t index) const {
+  assert(index < keys());
+  if constexpr (FIELD_TYPE == field_type_t::N2) {
+    const char* p_item_end = p_start() + p_fields->get_item_end_offset(index);
+    const ns_oid_view_t ns_oid(p_item_end);
+    return FieldType::estimate_insert_one() + ns_oid.size();
+  } else if constexpr (FIELD_TYPE == field_type_t::N0 ||
+                       FIELD_TYPE == field_type_t::N1) {
+    return FieldType::estimate_insert_one();
+  } else {
+    ceph_abort("N3 node is not nested");
+  }
+}
+
+// Memory range of the nested container belonging to the entry at index.
+// For N2 the range ends where the entry's ns/oid view starts.
+// Aborts for N3 internal nodes.
+template <typename FieldType, node_type_t NODE_TYPE>
+memory_range_t NODE_T::get_nxt_container(index_t index) const {
+  if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) {
+    ceph_abort("N3 internal node doesn't have the right part");
+  } else {
+    const node_offset_t start_off = p_fields->get_item_start_offset(index);
+    const node_offset_t end_off = p_fields->get_item_end_offset(index);
+    assert(start_off < end_off);
+    const char* p_item_start = p_start() + start_off;
+    const char* p_item_end = p_start() + end_off;
+    if constexpr (FIELD_TYPE == field_type_t::N2) {
+      // range for sub_items_t<NODE_TYPE>: stop at the ns/oid view
+      p_item_end = ns_oid_view_t(p_item_end).p_start();
+      assert(p_item_start < p_item_end);
+    }
+    // otherwise: range for item_iterator_t<NODE_TYPE>
+    return {p_item_start, p_item_end};
+  }
+}
+
+// Initialize a fresh extent: bootstrap the node header with the given
+// attributes, then write a zero num_keys_t at relative offset
+// sizeof(node_header_t).
+template <typename FieldType, node_type_t NODE_TYPE>
+void NODE_T::bootstrap_extent(
+ NodeExtentMutable& mut,
+ field_type_t field_type, node_type_t node_type,
+ bool is_level_tail, level_t level) {
+ node_header_t::bootstrap_extent(
+ mut, field_type, node_type, is_level_tail, level);
+ mut.copy_in_relative(
+ sizeof(node_header_t), typename FieldType::num_keys_t(0u));
+}
+
+// Update the is_level_tail flag of extent's header through mut; forwards to
+// node_header_t::update_is_level_tail().
+template <typename FieldType, node_type_t NODE_TYPE>
+void NODE_T::update_is_level_tail(
+ NodeExtentMutable& mut, const node_extent_t& extent, bool value) {
+ node_header_t::update_is_level_tail(mut, extent.p_fields->header, value);
+}
+
+// Insert the key for a new entry at index and make room for size_right
+// bytes (size minus the per-entry insert estimate) of item content.
+// Only implemented for N0/N1: N2 aborts with "not implemented" and any other
+// layout aborts with "impossible".
+// Returns the range [p_insert - size_right, p_insert).
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+memory_range_t NODE_T::insert_prefix_at(
+ NodeExtentMutable& mut, const node_extent_t& node, const full_key_t<KT>& key,
+ index_t index, node_offset_t size, const char* p_left_bound) {
+ if constexpr (FIELD_TYPE == field_type_t::N0 ||
+ FIELD_TYPE == field_type_t::N1) {
+ assert(index <= node.keys());
+ assert(p_left_bound == node.p_left_bound());
+ assert(size > FieldType::estimate_insert_one());
+ auto size_right = size - FieldType::estimate_insert_one();
+ // p_insert is computed from the node fields before insert_at() is called
+ const char* p_insert = node.p_start() + node.fields().get_item_end_offset(index);
+ const char* p_insert_front = p_insert - size_right;
+ FieldType::template insert_at<KT>(mut, key, node.fields(), index, size_right);
+ // presumably moves the bytes in [p_left_bound, p_insert) by -size_right to
+ // open the gap -- TODO confirm against NodeExtentMutable::shift_absolute
+ mut.shift_absolute(p_left_bound,
+ p_insert - p_left_bound,
+ -(int)size_right);
+ return {p_insert_front, p_insert};
+ } else if constexpr (FIELD_TYPE == field_type_t::N2) {
+ ceph_abort("not implemented");
+ } else {
+ ceph_abort("impossible");
+ }
+}
+// Explicit instantiations of insert_prefix_at<KT>() for the N0/N1/N2 field
+// layouts, for both node types and both key types.
+#define IPA_TEMPLATE(FT, NT, KT) \
+ template memory_range_t NODE_INST(FT, NT)::insert_prefix_at<KT>( \
+ NodeExtentMutable&, const node_extent_t&, const full_key_t<KT>&, \
+ index_t, node_offset_t, const char*)
+IPA_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::VIEW);
+IPA_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::VIEW);
+IPA_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::VIEW);
+IPA_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::VIEW);
+IPA_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::VIEW);
+IPA_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::VIEW);
+IPA_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::HOBJ);
+IPA_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::HOBJ);
+IPA_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::HOBJ);
+IPA_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::HOBJ);
+IPA_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::HOBJ);
+IPA_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::HOBJ);
+
+// Apply a size change to the existing entry at index; forwards to
+// FieldType::update_size_at().
+template <typename FieldType, node_type_t NODE_TYPE>
+void NODE_T::update_size_at(
+ NodeExtentMutable& mut, const node_extent_t& node, index_t index, int change) {
+ assert(index < node.keys());
+ FieldType::update_size_at(mut, node.fields(), index, change);
+}
+
+// Drop the entries from index onwards by writing index as the new num_keys.
+// The node must not be a level tail. Does nothing when index == keys().
+// Aborts for N3 internal nodes. Always returns 0.
+template <typename FieldType, node_type_t NODE_TYPE>
+node_offset_t NODE_T::trim_until(
+    NodeExtentMutable& mut, const node_extent_t& node, index_t index) {
+  assert(!node.is_level_tail());
+  const auto total_keys = node.keys();
+  assert(index <= total_keys);
+  if (index != total_keys) {
+    if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) {
+      ceph_abort("not implemented");
+    } else {
+      mut.copy_in_absolute(
+          (void*)&node.p_fields->num_keys, num_keys_t(index));
+    }
+  }
+  // no need to calculate trim size for node
+  return 0;
+}
+
+// Trim inside the entry at index: advance its item start offset by trimmed
+// bytes and make it the last entry by setting num_keys to index + 1.
+// The node must not be a level tail. Aborts for N3 internal nodes.
+// Always returns 0.
+template <typename FieldType, node_type_t NODE_TYPE>
+node_offset_t NODE_T::trim_at(
+    NodeExtentMutable& mut, const node_extent_t& node,
+    index_t index, node_offset_t trimmed) {
+  assert(!node.is_level_tail());
+  assert(index < node.keys());
+  if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) {
+    ceph_abort("not implemented");
+  } else {
+    const node_offset_t old_start = node.p_fields->get_item_start_offset(index);
+    const size_t new_start = old_start + trimmed;
+    // the trimmed item must keep a non-empty range
+    assert(new_start < node.p_fields->get_item_end_offset(index));
+    mut.copy_in_absolute(const_cast<void*>(node.p_fields->p_offset(index)),
+                         node_offset_t(new_start));
+    mut.copy_in_absolute(
+        (void*)&node.p_fields->num_keys, num_keys_t(index + 1));
+  }
+  // no need to calculate trim size for node
+  return 0;
+}
+
+// Explicit instantiations of node_extent_t for the field layout / node type
+// combinations listed below.
+#define NODE_TEMPLATE(FT, NT) template class NODE_INST(FT, NT)
+NODE_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL);
+NODE_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL);
+NODE_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL);
+NODE_TEMPLATE(internal_fields_3_t, node_type_t::INTERNAL);
+NODE_TEMPLATE(node_fields_0_t, node_type_t::LEAF);
+NODE_TEMPLATE(node_fields_1_t, node_type_t::LEAF);
+NODE_TEMPLATE(node_fields_2_t, node_type_t::LEAF);
+NODE_TEMPLATE(leaf_fields_3_t, node_type_t::LEAF);
+
+#define APPEND_T node_extent_t<FieldType, NODE_TYPE>::Appender<KT>
+
+// Append `items` entries of src, starting at index `from`, to the node being
+// built. All appends to one Appender must come from the same src node.
+// The left part (keys and item offsets) is appended forwards at p_append_left
+// with the item offsets rebased; the right part (item contents) is appended
+// backwards at p_append_right. Aborts for N3 internal nodes.
+//
+// The former `if (num_keys == 0)` fast path has been removed: it was tested
+// after `num_keys += items` with items != 0, so it could never be taken. The
+// general path below is what has always executed.
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+void APPEND_T::append(const node_extent_t& src, index_t from, index_t items) {
+  assert(from <= src.keys());
+  if (p_src == nullptr) {
+    p_src = &src;
+  } else {
+    assert(p_src == &src);
+  }
+  if (items == 0) {
+    return;
+  }
+  assert(from < src.keys());
+  assert(from + items <= src.keys());
+  num_keys += items;
+  if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) {
+    ceph_abort("impossible path");
+  } else {
+    // append left part forwards
+    node_offset_t offset_left_start = src.fields().get_key_start_offset(from);
+    node_offset_t offset_left_end = src.fields().get_key_start_offset(from + items);
+    node_offset_t left_size = offset_left_end - offset_left_start;
+
+    node_offset_t step_size = FieldType::estimate_insert_one();
+    node_offset_t offset_base = src.fields().get_item_end_offset(from);
+    // difference between where the items will end in the destination and
+    // where they ended in src; added to every copied item offset
+    int offset_change = p_append_right - p_start - offset_base;
+    auto p_offset_dst = p_append_left;
+    if constexpr (FIELD_TYPE != field_type_t::N2) {
+      // copy keys
+      p_mut->copy_in_absolute(p_append_left,
+          src.p_start() + offset_left_start, left_size);
+      // point to offset for update
+      p_offset_dst += sizeof(typename FieldType::key_t);
+    }
+    for (auto i = from; i < from + items; ++i) {
+      p_mut->copy_in_absolute(p_offset_dst,
+          node_offset_t(src.fields().get_item_start_offset(i) + offset_change));
+      p_offset_dst += step_size;
+    }
+    assert(p_append_left + left_size + sizeof(typename FieldType::key_t) ==
+           p_offset_dst);
+    p_append_left += left_size;
+
+    // append right part backwards
+    node_offset_t offset_right_start = src.fields().get_item_end_offset(from + items);
+    node_offset_t offset_right_end = src.fields().get_item_end_offset(from);
+    node_offset_t right_size = offset_right_end - offset_right_start;
+    p_append_right -= right_size;
+    p_mut->copy_in_absolute(p_append_right,
+        src.p_start() + offset_right_start, right_size);
+  }
+}
+
+// Append a single key/value pair. Neither path is functional here: the N3
+// layout aborts with "not implemented" and every other layout aborts with
+// "should not happen".
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+void APPEND_T::append(
+ const full_key_t<KT>& key, const value_t& value, const value_t*& p_value) {
+ if constexpr (FIELD_TYPE == field_type_t::N3) {
+ ceph_abort("not implemented");
+ } else {
+ ceph_abort("should not happen");
+ }
+}
+
+// Append the partial key of a new entry and hand back the mutable extent and
+// the current right-side append position. N0/N1 append the key at
+// p_append_left, N2 at p_append_right; any other layout aborts.
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+std::tuple<NodeExtentMutable*, char*>
+APPEND_T::open_nxt(const key_get_type& partial_key) {
+  if constexpr (FIELD_TYPE == field_type_t::N2) {
+    FieldType::append_key(*p_mut, partial_key, p_append_right);
+  } else if constexpr (FIELD_TYPE == field_type_t::N0 ||
+                       FIELD_TYPE == field_type_t::N1) {
+    FieldType::append_key(*p_mut, partial_key, p_append_left);
+  } else {
+    ceph_abort("impossible path");
+  }
+  return {p_mut, p_append_right};
+}
+
+// Full-key variant of open_nxt(): append this stage's key taken from a full
+// key and hand back the mutable extent and the current right-side append
+// position. N0/N1 append the key at p_append_left, N2 at p_append_right; any
+// other layout aborts.
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+std::tuple<NodeExtentMutable*, char*>
+APPEND_T::open_nxt(const full_key_t<KT>& key) {
+  if constexpr (FIELD_TYPE == field_type_t::N2) {
+    FieldType::template append_key<KT>(*p_mut, key, p_append_right);
+  } else if constexpr (FIELD_TYPE == field_type_t::N0 ||
+                       FIELD_TYPE == field_type_t::N1) {
+    FieldType::template append_key<KT>(*p_mut, key, p_append_left);
+  } else {
+    ceph_abort("impossible path");
+  }
+  return {p_mut, p_append_right};
+}
+
+// Finish appending: for an internal level-tail source node, copy its tail
+// laddr just below the right-side append position; then write the accumulated
+// num_keys into the node fields. Returns the left-side append position.
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+char* APPEND_T::wrap() {
+  assert(p_append_left <= p_append_right);
+  assert(p_src);
+  if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+    if (p_src->is_level_tail()) {
+      laddr_t tail_laddr = p_src->get_end_p_laddr()->value;
+      p_append_right -= sizeof(laddr_t);
+      assert(p_append_left <= p_append_right);
+      p_mut->copy_in_absolute(p_append_right, tail_laddr);
+    }
+  }
+  auto p_num_keys = p_start + offsetof(FieldType, num_keys);
+  p_mut->copy_in_absolute(p_num_keys, num_keys);
+  return p_append_left;
+}
+
+// Explicit instantiations of node_extent_t<FT, NT>::Appender<KT> for the
+// field layout / node type / key type combinations listed below.
+#define APPEND_TEMPLATE(FT, NT, KT) template class node_extent_t<FT, NT>::Appender<KT>
+APPEND_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::VIEW);
+APPEND_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::VIEW);
+APPEND_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::VIEW);
+APPEND_TEMPLATE(internal_fields_3_t, node_type_t::INTERNAL, KeyT::VIEW);
+APPEND_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::VIEW);
+APPEND_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::VIEW);
+APPEND_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::VIEW);
+APPEND_TEMPLATE(leaf_fields_3_t, node_type_t::LEAF, KeyT::VIEW);
+APPEND_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::HOBJ);
+APPEND_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::HOBJ);
+APPEND_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::HOBJ);
+APPEND_TEMPLATE(internal_fields_3_t, node_type_t::INTERNAL, KeyT::HOBJ);
+APPEND_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::HOBJ);
+APPEND_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::HOBJ);
+APPEND_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::HOBJ);
+APPEND_TEMPLATE(leaf_fields_3_t, node_type_t::LEAF, KeyT::HOBJ);
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h
new file mode 100644
index 000000000..cf0ca463c
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h
@@ -0,0 +1,226 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h"
+#include "key_layout.h"
+#include "stage_types.h"
+
+namespace crimson::os::seastore::onode {
+
+class NodeExtentMutable;
+
+/**
+ * node_extent_t
+ *
+ * The top indexing stage implementation for node N0/N1/N2/N3, implements
+ * staged contract as an indexable container, and provides access to node
+ * header.
+ *
+ * The specific field layout is defined by FieldType, which is one of
+ * node_fields_0_t, node_fields_1_t, node_fields_2_t, internal_fields_3_t and
+ * leaf_fields_3_t. See node_stage_layout.h for the layout diagrams.
+ */
+template <typename FieldType, node_type_t _NODE_TYPE>
+class node_extent_t {
+ public:
+ using value_t = value_type_t<_NODE_TYPE>;
+ using num_keys_t = typename FieldType::num_keys_t;
+ static constexpr node_type_t NODE_TYPE = _NODE_TYPE;
+ static constexpr field_type_t FIELD_TYPE = FieldType::FIELD_TYPE;
+ // FieldType::SIZE rounded up to a whole multiple of DISK_BLOCK_SIZE.
+ static constexpr node_offset_t EXTENT_SIZE =
+ (FieldType::SIZE + DISK_BLOCK_SIZE - 1u) / DISK_BLOCK_SIZE * DISK_BLOCK_SIZE;
+
+ // TODO: remove
+ // NOTE(review): the defaulted constructor leaves p_fields uninitialized.
+ node_extent_t() = default;
+
+ // Wraps the given fields; validate() only checks anything in debug builds.
+ node_extent_t(const FieldType* p_fields) : p_fields{p_fields} {
+ validate(*p_fields);
+ }
+
+ // Address of the first byte of the wrapped fields.
+ const char* p_start() const { return fields_start(*p_fields); }
+
+ // Converts a byte offset from p_start() into a pointer.
+ const char* off_to_ptr(node_offset_t off) const {
+ assert(off <= FieldType::SIZE);
+ return p_start() + off;
+ }
+
+ // Converts a pointer at or after p_start() into its byte offset.
+ node_offset_t ptr_to_off(const void* ptr) const {
+ auto _ptr = static_cast<const char*>(ptr);
+ assert(_ptr >= p_start());
+ auto off = _ptr - p_start();
+ assert(off <= FieldType::SIZE);
+ return off;
+ }
+
+ bool is_level_tail() const { return p_fields->is_level_tail(); }
+ level_t level() const { return p_fields->header.level; }
+ // Free size measured before index keys(), i.e. after all existing keys.
+ node_offset_t free_size() const {
+ return p_fields->template free_size_before<NODE_TYPE>(keys());
+ }
+ node_offset_t total_size() const { return p_fields->total_size(); }
+ const char* p_left_bound() const;
+ // Internal nodes only: pointer to the laddr stored past the last key.
+ // Asserts that this node is the level tail.
+ template <node_type_t T = NODE_TYPE>
+ std::enable_if_t<T == node_type_t::INTERNAL, const laddr_packed_t*>
+ get_end_p_laddr() const {
+ assert(is_level_tail());
+ if constexpr (FIELD_TYPE == field_type_t::N3) {
+ return &p_fields->child_addrs[keys()];
+ } else {
+ // the laddr sits immediately below the item-end offset at index keys()
+ auto offset_start = p_fields->get_item_end_offset(keys());
+ assert(offset_start <= FieldType::SIZE);
+ offset_start -= sizeof(laddr_packed_t);
+ auto p_addr = p_start() + offset_start;
+ return reinterpret_cast<const laddr_packed_t*>(p_addr);
+ }
+ }
+
+ // container type system
+ using key_get_type = typename FieldType::key_get_type;
+ static constexpr auto CONTAINER_TYPE = ContainerType::INDEXABLE;
+ index_t keys() const { return p_fields->num_keys; }
+ key_get_type operator[] (index_t index) const { return p_fields->get_key(index); }
+ // total_size() minus the free size measured before `index`.
+ node_offset_t size_before(index_t index) const {
+ auto free_size = p_fields->template free_size_before<NODE_TYPE>(index);
+ assert(total_size() >= free_size);
+ return total_size() - free_size;
+ }
+ node_offset_t size_to_nxt_at(index_t index) const;
+ // Constant per-item overhead of the layout; `index` is not consulted.
+ node_offset_t size_overhead_at(index_t index) const {
+ return FieldType::ITEM_OVERHEAD; }
+ memory_range_t get_nxt_container(index_t index) const;
+
+ // N3 layouts only: pointer to the value stored at `index`.
+ template <typename T = FieldType>
+ std::enable_if_t<T::FIELD_TYPE == field_type_t::N3, const value_t*>
+ get_p_value(index_t index) const {
+ assert(index < keys());
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ return &p_fields->child_addrs[index];
+ } else {
+ // leaf: the item range is interpreted as an onode_t whose size field
+ // must span exactly the whole range
+ auto range = get_nxt_container(index);
+ auto ret = reinterpret_cast<const onode_t*>(range.p_start);
+ assert(range.p_start + ret->size == range.p_end);
+ return ret;
+ }
+ }
+
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const {
+ assert(p_node_start == p_start());
+ // nothing to encode as the container range is the entire extent
+ }
+
+ static node_extent_t decode(const char* p_node_start,
+ ceph::bufferlist::const_iterator& delta) {
+ // nothing to decode
+ return node_extent_t(reinterpret_cast<const FieldType*>(p_node_start));
+ }
+
+ // Debug-only sanity checks of the header against the template parameters;
+ // compiles to nothing when NDEBUG is defined.
+ static void validate(const FieldType& fields) {
+#ifndef NDEBUG
+ assert(fields.header.get_node_type() == NODE_TYPE);
+ assert(fields.header.get_field_type() == FieldType::FIELD_TYPE);
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ assert(fields.header.level > 0u);
+ } else {
+ assert(fields.header.level == 0u);
+ }
+#endif
+ }
+
+ static void bootstrap_extent(
+ NodeExtentMutable&, field_type_t, node_type_t, bool, level_t);
+
+ static void update_is_level_tail(NodeExtentMutable&, const node_extent_t&, bool);
+
+ static node_offset_t header_size() { return FieldType::HEADER_SIZE; }
+
+ // Size estimate for inserting one item at this stage: the layout's
+ // per-item size, plus the ns-oid size for N2, plus value.size for leaf N3.
+ template <KeyT KT>
+ static node_offset_t estimate_insert(
+ const full_key_t<KT>& key, const value_t& value) {
+ auto size = FieldType::estimate_insert_one();
+ if constexpr (FIELD_TYPE == field_type_t::N2) {
+ size += ns_oid_view_t::estimate_size<KT>(key);
+ } else if constexpr (FIELD_TYPE == field_type_t::N3 &&
+ NODE_TYPE == node_type_t::LEAF) {
+ size += value.size;
+ }
+ return size;
+ }
+
+ // Always aborts: "not implemented" for N3, "impossible" for other layouts.
+ template <KeyT KT>
+ static const value_t* insert_at(
+ NodeExtentMutable& mut, const node_extent_t&,
+ const full_key_t<KT>& key, const value_t& value,
+ index_t index, node_offset_t size, const char* p_left_bound) {
+ if constexpr (FIELD_TYPE == field_type_t::N3) {
+ ceph_abort("not implemented");
+ } else {
+ ceph_abort("impossible");
+ }
+ }
+
+ template <KeyT KT>
+ static memory_range_t insert_prefix_at(
+ NodeExtentMutable&, const node_extent_t&,
+ const full_key_t<KT>& key,
+ index_t index, node_offset_t size, const char* p_left_bound);
+
+ static void update_size_at(
+ NodeExtentMutable&, const node_extent_t&, index_t index, int change);
+
+ static node_offset_t trim_until(
+ NodeExtentMutable&, const node_extent_t&, index_t index);
+ static node_offset_t trim_at(NodeExtentMutable&, const node_extent_t&,
+ index_t index, node_offset_t trimmed);
+
+ template <KeyT KT>
+ class Appender;
+
+ private:
+ const FieldType& fields() const { return *p_fields; }
+ // Raw pointer to the wrapped fields; this class declares no destructor
+ // and never frees it.
+ const FieldType* p_fields;
+};
+
+// Appender: fills a freshly bootstrapped node from both ends. p_append_left
+// advances forward from the end of the header, p_append_right moves
+// backward from the end of the node.
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+class node_extent_t<FieldType, NODE_TYPE>::Appender {
+ public:
+ // p_append must point at the start of the target fields; debug builds
+ // check that its header matches this layout and that it holds no keys yet.
+ Appender(NodeExtentMutable* p_mut, char* p_append)
+ : p_mut{p_mut}, p_start{p_append} {
+#ifndef NDEBUG
+ auto p_fields = reinterpret_cast<const FieldType*>(p_append);
+ assert(*(p_fields->header.get_field_type()) == FIELD_TYPE);
+ assert(p_fields->header.get_node_type() == NODE_TYPE);
+ assert(p_fields->num_keys == 0);
+#endif
+ p_append_left = p_start + FieldType::HEADER_SIZE;
+ p_append_right = p_start + FieldType::SIZE;
+ }
+ void append(const node_extent_t& src, index_t from, index_t items);
+ void append(const full_key_t<KT>&, const value_t&, const value_t*&);
+ char* wrap();
+ std::tuple<NodeExtentMutable*, char*> open_nxt(const key_get_type&);
+ std::tuple<NodeExtentMutable*, char*> open_nxt(const full_key_t<KT>&);
+ // Closes a next-stage container ending at p_append: p_append becomes the
+ // new right boundary, its offset from p_start is appended at the left
+ // cursor, and the key count is incremented. Aborts for N3 layouts.
+ void wrap_nxt(char* p_append) {
+ if constexpr (FIELD_TYPE != field_type_t::N3) {
+ assert(p_append < p_append_right);
+ assert(p_append_left < p_append);
+ p_append_right = p_append;
+ FieldType::append_offset(*p_mut, p_append - p_start, p_append_left);
+ ++num_keys;
+ } else {
+ ceph_abort("not implemented");
+ }
+ }
+
+ private:
+ // Source node; starts out null and wrap() asserts it is set.
+ const node_extent_t* p_src = nullptr;
+ NodeExtentMutable* p_mut;
+ char* p_start;
+ char* p_append_left;
+ char* p_append_right;
+ // Number of keys appended so far; written into the node by wrap().
+ num_keys_t num_keys = 0;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc
new file mode 100644
index 000000000..81bfac72a
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc
@@ -0,0 +1,96 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "node_stage_layout.h"
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h"
+
+namespace crimson::os::seastore::onode {
+
+// Build a header from the given attributes and copy it to relative
+// offset 0 of the mutable extent.
+void node_header_t::bootstrap_extent(
+    NodeExtentMutable& mut,
+    field_type_t field_type, node_type_t node_type,
+    bool is_level_tail, level_t level) {
+  node_header_t fresh;
+  fresh.level = level;
+  fresh.set_is_level_tail(is_level_tail);
+  fresh.set_node_type(node_type);
+  fresh.set_field_type(field_type);
+  mut.copy_in_relative(0, fresh);
+}
+
+// Flip the is_level_tail bit of `header` in place and let `mut` validate
+// the in-place update.
+void node_header_t::update_is_level_tail(
+    NodeExtentMutable& mut, const node_header_t& header, bool value) {
+  // the header is only reachable as const here; drop constness to edit it
+  auto* p_header = const_cast<node_header_t*>(&header);
+  p_header->set_is_level_tail(value);
+  mut.validate_inplace_update(*p_header);
+}
+
+#define F013_T _node_fields_013_t<SlotType>
+#define F013_INST(ST) _node_fields_013_t<ST>
+
+// Subtract `change` from the right_offset of every slot in
+// [index, num_keys), writing each new offset through `mut`.
+template <typename SlotType>
+void F013_T::update_size_at(
+    NodeExtentMutable& mut, const me_t& node, index_t index, int change) {
+  assert(index <= node.num_keys);
+  for (index_t i = index; i < node.num_keys; ++i) {
+    const SlotType& slot = node.slots[i];
+    node_offset_t old_offset = slot.right_offset;
+    node_offset_t new_offset = node_offset_t(old_offset - change);
+    mut.copy_in_absolute((void*)&(slot.right_offset), new_offset);
+  }
+}
+
+// Copy `key` to the append cursor and advance the cursor past it.
+template <typename SlotType>
+void F013_T::append_key(
+    NodeExtentMutable& mut, const key_t& key, char*& p_append) {
+  char* const p_dst = p_append;
+  mut.copy_in_absolute(p_dst, key);
+  p_append = p_dst + sizeof(key_t);
+}
+
+// Copy `offset_to_right` to the append cursor and advance the cursor by
+// sizeof(node_offset_t).
+template <typename SlotType>
+void F013_T::append_offset(
+    NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append) {
+  char* const p_dst = p_append;
+  mut.copy_in_absolute(p_dst, offset_to_right);
+  p_append = p_dst + sizeof(node_offset_t);
+}
+
+// Insert a new slot for `key` at `index`, reserving size_right bytes on
+// the right side of the node. The statement order matters: offsets are
+// adjusted first, slots are shifted, num_keys is bumped, and only then is
+// the new slot written.
+template <typename SlotType>
+template <KeyT KT>
+void F013_T::insert_at(
+ NodeExtentMutable& mut, const full_key_t<KT>& key,
+ const me_t& node, index_t index, node_offset_t size_right) {
+ assert(index <= node.num_keys);
+ // lower right_offset of slots [index, num_keys) by size_right
+ update_size_at(mut, node, index, size_right);
+ auto p_insert = const_cast<char*>(fields_start(node)) +
+ node.get_key_start_offset(index);
+ auto p_shift_end = fields_start(node) + node.get_key_start_offset(node.num_keys);
+ // shift the slots from p_insert onward by one slot size to open a gap
+ mut.shift_absolute(p_insert, p_shift_end - p_insert, estimate_insert_one());
+ mut.copy_in_absolute((void*)&node.num_keys, num_keys_t(node.num_keys + 1));
+ // fill the gap: key first, then its right offset (both advance p_insert)
+ append_key(mut, key_t::template from_key<KT>(key), p_insert);
+ append_offset(mut, node.get_item_end_offset(index) - size_right, p_insert);
+}
+// Explicit instantiations of insert_at<KT> for each slot type and key type.
+#define IA_TEMPLATE(ST, KT) template void F013_INST(ST):: \
+ insert_at<KT>(NodeExtentMutable&, const full_key_t<KT>&, \
+ const F013_INST(ST)&, index_t, node_offset_t)
+IA_TEMPLATE(slot_0_t, KeyT::VIEW);
+IA_TEMPLATE(slot_1_t, KeyT::VIEW);
+IA_TEMPLATE(slot_3_t, KeyT::VIEW);
+IA_TEMPLATE(slot_0_t, KeyT::HOBJ);
+IA_TEMPLATE(slot_1_t, KeyT::HOBJ);
+IA_TEMPLATE(slot_3_t, KeyT::HOBJ);
+
+// Explicit instantiations of the whole struct for each slot type.
+#define F013_TEMPLATE(ST) template struct F013_INST(ST)
+F013_TEMPLATE(slot_0_t);
+F013_TEMPLATE(slot_1_t);
+F013_TEMPLATE(slot_3_t);
+
+// Copy `offset_to_right` to the append cursor and advance the cursor by
+// sizeof(node_offset_t).
+void node_fields_2_t::append_offset(
+    NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append) {
+  char* const p_dst = p_append;
+  mut.copy_in_absolute(p_dst, offset_to_right);
+  p_append = p_dst + sizeof(node_offset_t);
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h
new file mode 100644
index 000000000..14ba95bf4
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h
@@ -0,0 +1,366 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "key_layout.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h"
+
+namespace crimson::os::seastore::onode {
+
+class NodeExtentMutable;
+
+// Packed node header: one byte of bit-fields (field type, node type,
+// level-tail flag) followed by the level.
+struct node_header_t {
+ static constexpr unsigned FIELD_TYPE_BITS = 6u;
+ static_assert(static_cast<uint8_t>(field_type_t::_MAX) <= 1u << FIELD_TYPE_BITS);
+ static constexpr unsigned NODE_TYPE_BITS = 1u;
+ static constexpr unsigned B_LEVEL_TAIL_BITS = 1u;
+ using bits_t = uint8_t;
+
+ // NOTE(review): the empty constructor leaves all members uninitialized;
+ // bootstrap_extent() sets every one of them before copying the header out.
+ node_header_t() {}
+ // Returns the stored field type, or std::nullopt when the raw value lies
+ // outside [FIELD_TYPE_MAGIC, field_type_t::_MAX).
+ std::optional<field_type_t> get_field_type() const {
+ if (field_type >= FIELD_TYPE_MAGIC &&
+ field_type < static_cast<uint8_t>(field_type_t::_MAX)) {
+ return static_cast<field_type_t>(field_type);
+ } else {
+ return std::nullopt;
+ }
+ }
+ node_type_t get_node_type() const {
+ return static_cast<node_type_t>(node_type);
+ }
+ bool get_is_level_tail() const {
+ return is_level_tail;
+ }
+
+ static void bootstrap_extent(
+ NodeExtentMutable&, field_type_t, node_type_t, bool, level_t);
+
+ static void update_is_level_tail(NodeExtentMutable&, const node_header_t&, bool);
+
+ bits_t field_type : FIELD_TYPE_BITS;
+ bits_t node_type : NODE_TYPE_BITS;
+ bits_t is_level_tail : B_LEVEL_TAIL_BITS;
+ // the three bit-fields together must fill bits_t exactly
+ static_assert(sizeof(bits_t) * 8 ==
+ FIELD_TYPE_BITS + NODE_TYPE_BITS + B_LEVEL_TAIL_BITS);
+ level_t level;
+
+ private:
+ // Setters are private; within this struct they are used by the static
+ // helpers declared above.
+ void set_field_type(field_type_t type) {
+ field_type = static_cast<uint8_t>(type);
+ }
+ void set_node_type(node_type_t type) {
+ node_type = static_cast<uint8_t>(type);
+ }
+ void set_is_level_tail(bool value) {
+ is_level_tail = static_cast<uint8_t>(value);
+ }
+} __attribute__((packed));
+
+// Packed slot: a fixed-size key followed by the offset of its item on the
+// right side of the node.
+template <typename FixedKeyType, field_type_t _FIELD_TYPE>
+struct _slot_t {
+ using key_t = FixedKeyType;
+ static constexpr field_type_t FIELD_TYPE = _FIELD_TYPE;
+ // bytes of the slot that are not the key itself
+ static constexpr node_offset_t OVERHEAD = sizeof(_slot_t) - sizeof(key_t);
+
+ key_t key;
+ node_offset_t right_offset;
+} __attribute__((packed));
+using slot_0_t = _slot_t<shard_pool_crush_t, field_type_t::N0>;
+using slot_1_t = _slot_t<crush_t, field_type_t::N1>;
+using slot_3_t = _slot_t<snap_gen_t, field_type_t::N3>;
+
+// A pair of offsets within a node; fields_free_range_before() returns one
+// with start <= end.
+struct node_range_t {
+ node_offset_t start;
+ node_offset_t end;
+};
+
+// Address of the first byte of `node`, viewed as raw chars.
+template <typename FieldType>
+const char* fields_start(const FieldType& node) {
+  const FieldType* const p_node = &node;
+  return reinterpret_cast<const char*>(p_node);
+}
+
+// Compute the [start, end) offsets of the gap between the key area start
+// at `index` and the start of item (index - 1), or the end of the node
+// when index is 0. For an internal level-tail node with
+// index == num_keys, sizeof(laddr_t) is additionally excluded from the end.
+template <node_type_t NODE_TYPE, typename FieldType>
+node_range_t fields_free_range_before(
+    const FieldType& node, index_t index) {
+  assert(index <= node.num_keys);
+  node_offset_t offset_start = node.get_key_start_offset(index);
+  node_offset_t offset_end;
+  if (index == 0) {
+    offset_end = FieldType::SIZE;
+  } else {
+    offset_end = node.get_item_start_offset(index - 1);
+  }
+  if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+    const bool at_tail = node.is_level_tail() && index == node.num_keys;
+    if (at_tail) {
+      offset_end -= sizeof(laddr_t);
+    }
+  }
+  assert(offset_start <= offset_end);
+  assert(offset_end - offset_start < FieldType::SIZE);
+  return node_range_t{offset_start, offset_end};
+}
+
+/**
+ * _node_fields_013_t (node_fields_0_t, node_fields_1_t, leaf_fields_3_t)
+ *
+ * The STAGE_LEFT layout implementation for node N0/N1, or the STAGE_RIGHT
+ * layout implementation for leaf node N3.
+ *
+ * The node layout storing n slots:
+ *
+ * # <----------------------------- node range --------------------------------------> #
+ * # #<~># free space #
+ * # <----- left part -----------------------------> # <~# <----- right slots -------> #
+ * # # <---- left slots -------------> #~> # #
+ * # # slots [2, n) |<~># #<~>| right slots [2, n) #
+ * # # <- slot 0 -> | <- slot 1 -> | # # | <-- s1 --> | <-- s0 --> #
+ * # # | | # # | | #
+ * # | num_ # | right | | right | # # | next-stage | next-stage #
+ * # header | keys # key | offset | key | offset | # # | container | container #
+ * # | # 0 | 0 | 1 | 1 |...#...#...| or onode 1 | or onode 0 #
+ * | | ^ ^
+ * | | | |
+ * | +----------------+ |
+ * +--------------------------------------------+
+ */
+template <typename SlotType>
+struct _node_fields_013_t {
+ // TODO: decide by NODE_BLOCK_SIZE, sizeof(SlotType), sizeof(laddr_t)
+ // and the minimal size of variable_key.
+ using num_keys_t = uint8_t;
+ using key_t = typename SlotType::key_t;
+ using key_get_type = const key_t&;
+ using me_t = _node_fields_013_t<SlotType>;
+ static constexpr field_type_t FIELD_TYPE = SlotType::FIELD_TYPE;
+ static constexpr node_offset_t SIZE = NODE_BLOCK_SIZE;
+ static constexpr node_offset_t HEADER_SIZE =
+ sizeof(node_header_t) + sizeof(num_keys_t);
+ static constexpr node_offset_t ITEM_OVERHEAD = SlotType::OVERHEAD;
+
+ bool is_level_tail() const { return header.get_is_level_tail(); }
+ node_offset_t total_size() const { return SIZE; }
+ key_get_type get_key(index_t index) const {
+ assert(index < num_keys);
+ return slots[index].key;
+ }
+ // Offset of slot `index` from the start of the node; index == num_keys
+ // gives the end of the slot array.
+ node_offset_t get_key_start_offset(index_t index) const {
+ assert(index <= num_keys);
+ auto offset = HEADER_SIZE + sizeof(SlotType) * index;
+ assert(offset < SIZE);
+ return offset;
+ }
+ // Offset at which item `index` starts, as recorded in its slot.
+ node_offset_t get_item_start_offset(index_t index) const {
+ assert(index < num_keys);
+ auto offset = slots[index].right_offset;
+ assert(offset <= SIZE);
+ return offset;
+ }
+ const void* p_offset(index_t index) const {
+ assert(index < num_keys);
+ return &slots[index].right_offset;
+ }
+ // Item `index` ends where item (index - 1) starts; item 0 ends at SIZE.
+ node_offset_t get_item_end_offset(index_t index) const {
+ return index == 0 ? SIZE : get_item_start_offset(index - 1);
+ }
+ template <node_type_t NODE_TYPE>
+ node_offset_t free_size_before(index_t index) const {
+ auto range = fields_free_range_before<NODE_TYPE>(*this, index);
+ return range.end - range.start;
+ }
+
+ // Left-side cost of one insertion: a whole slot.
+ static node_offset_t estimate_insert_one() { return sizeof(SlotType); }
+ template <KeyT KT>
+ static void insert_at(
+ NodeExtentMutable&, const full_key_t<KT>& key,
+ const me_t& node, index_t index, node_offset_t size_right);
+ static void update_size_at(
+ NodeExtentMutable&, const me_t& node, index_t index, int change);
+ static void append_key(
+ NodeExtentMutable&, const key_t& key, char*& p_append);
+ // Convenience overload: converts the full key with key_t::from_key
+ // before appending.
+ template <KeyT KT>
+ static void append_key(
+ NodeExtentMutable& mut, const full_key_t<KT>& key, char*& p_append) {
+ append_key(mut, key_t::template from_key<KT>(key), p_append);
+ }
+ static void append_offset(
+ NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append);
+
+ node_header_t header;
+ num_keys_t num_keys = 0u;
+ // flexible array member: slots follow the header and num_keys directly
+ SlotType slots[];
+} __attribute__((packed));
+using node_fields_0_t = _node_fields_013_t<slot_0_t>;
+using node_fields_1_t = _node_fields_013_t<slot_1_t>;
+
+/**
+ * node_fields_2_t
+ *
+ * The STAGE_STRING layout implementation for node N2.
+ *
+ * The node layout storing n slots:
+ *
+ * # <--------------------------------- node range ----------------------------------------> #
+ * # #<~># free space #
+ * # <------- left part ---------------> # <~# <--------- right slots ---------------------> #
+ * # # <---- offsets ----> #~> #<~>| slots [2, n) #
+ * # # offsets [2, n) |<~># # | <----- slot 1 ----> | <----- slot 0 ----> #
+ * # # | # # | | #
+ * # | num_ # offset | offset | # # | next-stage | ns-oid | next-stage | ns-oid #
+ * # header | keys # 0 | 1 |...#...#...| container1 | 1 | container0 | 0 #
+ * | | ^ ^
+ * | | | |
+ * | +----------------+ |
+ * +-----------------------------------------------+
+ */
+struct node_fields_2_t {
+ // TODO: decide by NODE_BLOCK_SIZE, sizeof(node_offset_t), sizeof(laddr_t)
+ // and the minimal size of variable_key.
+ using num_keys_t = uint8_t;
+ using key_t = ns_oid_view_t;
+ using key_get_type = key_t;
+ static constexpr field_type_t FIELD_TYPE = field_type_t::N2;
+ static constexpr node_offset_t SIZE = NODE_BLOCK_SIZE;
+ static constexpr node_offset_t HEADER_SIZE =
+ sizeof(node_header_t) + sizeof(num_keys_t);
+ static constexpr node_offset_t ITEM_OVERHEAD = sizeof(node_offset_t);
+
+ bool is_level_tail() const { return header.get_is_level_tail(); }
+ node_offset_t total_size() const { return SIZE; }
+ // Builds a key view from the pointer at the END offset of item `index`
+ // (SIZE for item 0, otherwise the start offset of item index - 1).
+ key_get_type get_key(index_t index) const {
+ assert(index < num_keys);
+ node_offset_t item_end_offset =
+ (index == 0 ? SIZE : offsets[index - 1]);
+ assert(item_end_offset <= SIZE);
+ const char* p_start = fields_start(*this);
+ return key_t(p_start + item_end_offset);
+ }
+ // Note: the result does not depend on `index`; it is always the end of
+ // the offsets array (HEADER_SIZE + num_keys offsets).
+ node_offset_t get_key_start_offset(index_t index) const {
+ assert(index <= num_keys);
+ auto offset = HEADER_SIZE + sizeof(node_offset_t) * num_keys;
+ assert(offset <= SIZE);
+ return offset;
+ }
+ node_offset_t get_item_start_offset(index_t index) const {
+ assert(index < num_keys);
+ auto offset = offsets[index];
+ assert(offset <= SIZE);
+ return offset;
+ }
+ const void* p_offset(index_t index) const {
+ assert(index < num_keys);
+ return &offsets[index];
+ }
+ // Item `index` ends where item (index - 1) starts; item 0 ends at SIZE.
+ node_offset_t get_item_end_offset(index_t index) const {
+ return index == 0 ? SIZE : get_item_start_offset(index - 1);
+ }
+ template <node_type_t NODE_TYPE>
+ node_offset_t free_size_before(index_t index) const {
+ auto range = fields_free_range_before<NODE_TYPE>(*this, index);
+ return range.end - range.start;
+ }
+
+ // Left-side cost of one insertion: a single offset entry.
+ static node_offset_t estimate_insert_one() { return sizeof(node_offset_t); }
+ // Not implemented for this layout: always aborts.
+ template <KeyT KT>
+ static void insert_at(
+ NodeExtentMutable& mut, const full_key_t<KT>& key,
+ const node_fields_2_t& node, index_t index, node_offset_t size_right) {
+ ceph_abort("not implemented");
+ }
+ // Not implemented for this layout: always aborts.
+ static void update_size_at(
+ NodeExtentMutable& mut, const node_fields_2_t& node, index_t index, int change) {
+ ceph_abort("not implemented");
+ }
+ static void append_key(
+ NodeExtentMutable& mut, const key_t& key, char*& p_append) {
+ ns_oid_view_t::append(mut, key, p_append);
+ }
+ template <KeyT KT>
+ static void append_key(
+ NodeExtentMutable& mut, const full_key_t<KT>& key, char*& p_append) {
+ ns_oid_view_t::append<KT>(mut, key, p_append);
+ }
+ static void append_offset(
+ NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append);
+
+ node_header_t header;
+ num_keys_t num_keys = 0u;
+ // flexible array member: offsets follow the header and num_keys directly
+ node_offset_t offsets[];
+} __attribute__((packed));
+
+/**
+ * internal_fields_3_t
+ *
+ * The STAGE_RIGHT layout implementation for internal node N3.
+ *
+ * The node layout storing 3 children:
+ *
+ * # <---------------- node range ---------------------------> #
+ * # # <-- keys ---> # <---- laddrs -----------> #
+ * # free space: # |<~># |<~>#
+ * # # | # | #
+ * # | num_ # key | key | # laddr | laddr | laddr | #
+ * # header | keys # 0 | 1 |...# 0 | 1 | 2 |...#
+ */
+// TODO: decide by NODE_BLOCK_SIZE, sizeof(snap_gen_t), sizeof(laddr_t)
+static constexpr unsigned MAX_NUM_KEYS_I3 = 170u;
+// Fixed-capacity layout: MAX_NUM_KEYS keys followed by MAX_NUM_KEYS child
+// addresses, so SIZE is simply sizeof(me_t).
+template <unsigned MAX_NUM_KEYS>
+struct _internal_fields_3_t {
+ using key_get_type = const snap_gen_t&;
+ using me_t = _internal_fields_3_t<MAX_NUM_KEYS>;
+ // TODO: decide by NODE_BLOCK_SIZE, sizeof(snap_gen_t), sizeof(laddr_t)
+ using num_keys_t = uint8_t;
+ static constexpr field_type_t FIELD_TYPE = field_type_t::N3;
+ static constexpr node_offset_t SIZE = sizeof(me_t);
+ static constexpr node_offset_t HEADER_SIZE =
+ sizeof(node_header_t) + sizeof(num_keys_t);
+ static constexpr node_offset_t ITEM_OVERHEAD = 0u;
+
+ bool is_level_tail() const { return header.get_is_level_tail(); }
+ // A level-tail node reports one snap_gen_t less than SIZE.
+ // NOTE(review): presumably because the tail child address has no key of
+ // its own - confirm.
+ node_offset_t total_size() const {
+ if (is_level_tail()) {
+ return SIZE - sizeof(snap_gen_t);
+ } else {
+ return SIZE;
+ }
+ }
+ key_get_type get_key(index_t index) const {
+ assert(index < num_keys);
+ return keys[index];
+ }
+ // Free bytes counted from `index`: one (key + laddr) pair per unused
+ // position, minus one pair when measuring after the last key of a
+ // level-tail node.
+ template <node_type_t NODE_TYPE>
+ std::enable_if_t<NODE_TYPE == node_type_t::INTERNAL, node_offset_t>
+ free_size_before(index_t index) const {
+ assert(index <= num_keys);
+ assert(num_keys <= (is_level_tail() ? MAX_NUM_KEYS - 1 : MAX_NUM_KEYS));
+ auto free = (MAX_NUM_KEYS - index) * (sizeof(snap_gen_t) + sizeof(laddr_t));
+ if (is_level_tail() && index == num_keys) {
+ free -= (sizeof(snap_gen_t) + sizeof(laddr_t));
+ }
+ assert(free < SIZE);
+ return free;
+ }
+
+ static node_offset_t estimate_insert_one() {
+ return sizeof(snap_gen_t) + sizeof(laddr_t);
+ }
+ // Not implemented for this layout: always aborts.
+ template <KeyT KT>
+ static void insert_at(
+ NodeExtentMutable& mut, const full_key_t<KT>& key,
+ const me_t& node, index_t index, node_offset_t size_right) {
+ ceph_abort("not implemented");
+ }
+ // Not implemented for this layout: always aborts.
+ static void update_size_at(
+ NodeExtentMutable& mut, const me_t& node, index_t index, int change) {
+ ceph_abort("not implemented");
+ }
+
+ node_header_t header;
+ num_keys_t num_keys = 0u;
+ snap_gen_t keys[MAX_NUM_KEYS];
+ laddr_packed_t child_addrs[MAX_NUM_KEYS];
+} __attribute__((packed));
+static_assert(_internal_fields_3_t<MAX_NUM_KEYS_I3>::SIZE <= NODE_BLOCK_SIZE &&
+ _internal_fields_3_t<MAX_NUM_KEYS_I3 + 1>::SIZE > NODE_BLOCK_SIZE);
+using internal_fields_3_t = _internal_fields_3_t<MAX_NUM_KEYS_I3>;
+
+using leaf_fields_3_t = _node_fields_013_t<slot_3_t>;
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h
new file mode 100644
index 000000000..cac167a98
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h
@@ -0,0 +1,2186 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cassert>
+#include <optional>
+#include <ostream>
+#include <sstream>
+#include <type_traits>
+
+#include "common/likely.h"
+
+#include "sub_items_stage.h"
+#include "item_iterator_stage.h"
+
+namespace crimson::os::seastore::onode {
+
+// Result of a binary search: the index found (or the fallback position
+// when nothing matched) and whether the match was exact (EQ) or not (NE).
+struct search_result_bs_t {
+ index_t index;
+ MatchKindBS match;
+};
+// Binary search for `key` over indexes [begin, end), reading each
+// candidate through f_get_key. Returns {index, EQ} on an exact match,
+// otherwise {begin, NE} with `begin` at the position where the search
+// range became empty.
+template <typename FGetKey>
+search_result_bs_t binary_search(
+    const full_key_t<KeyT::HOBJ>& key,
+    index_t begin, index_t end, FGetKey&& f_get_key) {
+  assert(begin <= end);
+  while (begin < end) {
+    auto mid = (begin + end) >> 1;
+    // do not copy if return value is reference
+    decltype(f_get_key(mid)) candidate = f_get_key(mid);
+    switch (compare_to<KeyT::HOBJ>(key, candidate)) {
+      case MatchKindCMP::LT:
+        end = mid;
+        break;
+      case MatchKindCMP::GT:
+        begin = mid + 1;
+        break;
+      default:
+        return {mid, MatchKindBS::EQ};
+    }
+  }
+  return {begin , MatchKindBS::NE};
+}
+
+// Binary search over indexes in [rend, rbegin], with the midpoint rounded
+// up. Elements are compared with `key` by subtraction (target - key,
+// stored in an int). Returns {index, EQ} on an exact match, otherwise
+// {rbegin, NE}.
+template <typename PivotType, typename FGet>
+search_result_bs_t binary_search_r(
+    index_t rend, index_t rbegin, FGet&& f_get, const PivotType& key) {
+  assert(rend <= rbegin);
+  while (rend < rbegin) {
+    auto mid = (rend + rbegin + 1) >> 1;
+    // do not copy if return value is reference
+    decltype(f_get(mid)) target = f_get(mid);
+    int diff = target - key;
+    if (diff == 0) {
+      return {mid, MatchKindBS::EQ};
+    }
+    if (diff < 0) {
+      rend = mid;
+    } else {
+      rbegin = mid - 1;
+    }
+  }
+  return {rbegin, MatchKindBS::NE};
+}
+
+inline bool matchable(field_type_t type, match_stat_t mstat) {
+  assert(mstat >= MSTAT_MIN && mstat <= MSTAT_MAX);
+  /*
+   * compressed prefix by field type:
+   * N0: NONE
+   * N1: pool/shard
+   * N2: pool/shard crush
+   * N3: pool/shard crush ns/oid
+   *
+   * if key matches the node's compressed prefix, return true
+   * else, return false
+   */
+  // MSTAT_END is only expected together with field type N0
+  assert(mstat != MSTAT_END || type == field_type_t::N0);
+  const auto combined = mstat + to_unsigned(type);
+  return combined < 4;
+}
+
+// Debug helper: asserts that `mstat` is consistent with how `key`
+// compares against `index`.
+//
+// The first switch checks the "less than" relation at the stage named by
+// mstat (nothing to check for MSTAT_EQ). The second switch checks equality
+// of the remaining key portions; it deliberately cascades from the entry
+// case down through the later ones, stopping early when `index` lacks a
+// portion.
+inline void assert_mstat(
+    const full_key_t<KeyT::HOBJ>& key,
+    const full_key_t<KeyT::VIEW>& index,
+    match_stat_t mstat) {
+  assert(mstat >= MSTAT_MIN && mstat <= MSTAT_LT2);
+  // key < index ...
+  switch (mstat) {
+   case MSTAT_EQ:
+    break;
+   case MSTAT_LT0:
+    assert(compare_to<KeyT::HOBJ>(key, index.snap_gen_packed()) == MatchKindCMP::LT);
+    break;
+   case MSTAT_LT1:
+    assert(compare_to<KeyT::HOBJ>(key, index.ns_oid_view()) == MatchKindCMP::LT);
+    break;
+   case MSTAT_LT2:
+    if (index.has_shard_pool()) {
+      assert(compare_to<KeyT::HOBJ>(key, shard_pool_crush_t{
+               index.shard_pool_packed(), index.crush_packed()}) == MatchKindCMP::LT);
+    } else {
+      assert(compare_to<KeyT::HOBJ>(key, index.crush_packed()) == MatchKindCMP::LT);
+    }
+    break;
+   default:
+    ceph_abort("impossible path");
+  }
+  // key == index ...
+  switch (mstat) {
+   case MSTAT_EQ:
+    assert(compare_to<KeyT::HOBJ>(key, index.snap_gen_packed()) == MatchKindCMP::EQ);
+    [[fallthrough]];  // intentional: also verify the ns-oid portion
+   case MSTAT_LT0:
+    if (!index.has_ns_oid())
+      break;
+    assert(index.ns_oid_view().type() == ns_oid_view_t::Type::MAX ||
+           compare_to<KeyT::HOBJ>(key, index.ns_oid_view()) == MatchKindCMP::EQ);
+    [[fallthrough]];  // intentional: also verify crush / shard-pool
+   case MSTAT_LT1:
+    if (!index.has_crush())
+      break;
+    assert(compare_to<KeyT::HOBJ>(key, index.crush_packed()) == MatchKindCMP::EQ);
+    if (!index.has_shard_pool())
+      break;
+    assert(compare_to<KeyT::HOBJ>(key, index.shard_pool_packed()) == MatchKindCMP::EQ);
+    [[fallthrough]];  // nothing further to check
+   default:
+    break;
+  }
+}
+
+#define NXT_STAGE_T staged<next_param_t>
+
+enum class TrimType { BEFORE, AFTER, AT };
+
+/**
+ * staged
+ *
+ * Implements recursive logic that modifies or reads the node layout
+ * (N0/N1/N2/N3 * LEAF/INTERNAL) with the multi-stage design. The specific
+ * stage implementation is flexible. So the implementations for different
+ * stages can be assembled independently, as long as they follow the
+ * definitions of container interfaces.
+ *
+ * Multi-stage is designed to index different portions of onode keys
+ * stage-by-stage. There are at most 3 stages for a node:
+ * - STAGE_LEFT: index shard-pool-crush for N0, or index crush for N1 node;
+ * - STAGE_STRING: index ns-oid for N0/N1/N2 nodes;
+ * - STAGE_RIGHT: index snap-gen for N0/N1/N2/N3 nodes;
+ *
+ * The intention is to consolidate the high-level indexing implementations at
+ * the level of stage, so we don't need to write them repeatedly for every
+ * stage and for every node type.
+ */
+template <typename Params>
+struct staged {
+ static_assert(Params::STAGE >= STAGE_BOTTOM);
+ static_assert(Params::STAGE <= STAGE_TOP);
+ using container_t = typename Params::container_t;
+ using key_get_type = typename container_t::key_get_type;
+ using next_param_t = typename Params::next_param_t;
+ using position_t = staged_position_t<Params::STAGE>;
+ using result_t = staged_result_t<Params::NODE_TYPE, Params::STAGE>;
+ using value_t = value_type_t<Params::NODE_TYPE>;
+ static constexpr auto CONTAINER_TYPE = container_t::CONTAINER_TYPE;
+ static constexpr bool IS_BOTTOM = (Params::STAGE == STAGE_BOTTOM);
+ static constexpr auto NODE_TYPE = Params::NODE_TYPE;
+ static constexpr auto STAGE = Params::STAGE;
+
+  // Decide on which side of the split the insertion falls and record it in
+  // is_insert_left (which must be unset on entry).
+  //  - exclusive: the insert goes right when split_index <= insert_index;
+  //    otherwise it goes left and split_index is decremented.
+  //  - inclusive: left/right is decided only when the indexes differ; when
+  //    they are equal is_insert_left is left unset.
+  template <bool is_exclusive>
+  static void _left_or_right(index_t& split_index, index_t insert_index,
+                             std::optional<bool>& is_insert_left) {
+    assert(!is_insert_left.has_value());
+    assert(is_valid_index(split_index));
+    if constexpr (is_exclusive) {
+      // ...[s_index-1] |!| (i_index) [s_index]...   => right
+      // ...(i_index)...[s_index-1] |?[s_index]| ... => left
+      const bool goes_left = (split_index > insert_index);
+      is_insert_left = goes_left;
+      if (goes_left) {
+        --split_index;
+      }
+    } else {
+      // ...[s_index-1] |?[(i_index)s_index]| ... => stays std::nullopt
+      if (split_index != insert_index) {
+        is_insert_left = (split_index > insert_index);
+      }
+    }
+  }
+
+ template <ContainerType CTYPE, typename Enable = void> class _iterator_t; // primary template; only the specializations below are defined here
+ template <ContainerType CTYPE>
+ class _iterator_t<CTYPE, std::enable_if_t<CTYPE == ContainerType::INDEXABLE>> { // iterator over an indexable container: the position is a plain index
+ /*
+ * indexable container type system:
+ * CONTAINER_TYPE = ContainerType::INDEXABLE
+ * keys() const -> index_t
+ * operator[](index_t) const -> key_get_type
+ * size_before(index_t) const -> node_offset_t
+ * size_overhead_at(index_t) const -> node_offset_t
+ * (IS_BOTTOM) get_p_value(index_t) const -> const value_t*
+ * (!IS_BOTTOM) size_to_nxt_at(index_t) const -> node_offset_t
+ * (!IS_BOTTOM) get_nxt_container(index_t) const
+ * encode(p_node_start, encoded)
+ * decode(p_node_start, delta) -> container_t
+ * static:
+ * header_size() -> node_offset_t
+ * estimate_insert(key, value) -> node_offset_t
+ * (IS_BOTTOM) insert_at(mut, src, key, value,
+ * index, size, p_left_bound) -> const value_t*
+ * (!IS_BOTTOM) insert_prefix_at(mut, src, key,
+ * index, size, p_left_bound) -> memory_range_t
+ * (!IS_BOTTOM) update_size_at(mut, src, index, size)
+ * trim_until(mut, container, index) -> trim_size
+ * (!IS_BOTTOM) trim_at(mut, container, index, trimmed) -> trim_size
+ *
+ * Appender::append(const container_t& src, from, items)
+ */
+ public:
+ using me_t = _iterator_t<CTYPE>;
+
+ _iterator_t(const container_t& container) : container{container} { // keeps its own copy of the container; starts at index 0
+ assert(container.keys()); // an empty container is not allowed
+ }
+
+ index_t index() const { // current position; equals keys() for the end iterator
+ return _index;
+ }
+ key_get_type get_key() const { // key stored at the current position
+ assert(!is_end());
+ return container[_index];
+ }
+ node_offset_t size_to_nxt() const { // forwards to container.size_to_nxt_at() for the current position
+ assert(!is_end());
+ return container.size_to_nxt_at(_index);
+ }
+ template <typename T = typename NXT_STAGE_T::container_t>
+ std::enable_if_t<!IS_BOTTOM, T> get_nxt_container() const { // only available when !IS_BOTTOM
+ assert(!is_end());
+ return container.get_nxt_container(_index);
+ }
+ template <typename T = value_t>
+ std::enable_if_t<IS_BOTTOM, const T*> get_p_value() const { // only available when IS_BOTTOM
+ assert(!is_end());
+ return container.get_p_value(_index);
+ }
+ bool is_last() const { // true when positioned at the final key
+ return _index + 1 == container.keys();
+ }
+ bool is_end() const { return _index == container.keys(); } // end is one past the final key
+ node_offset_t size() const { // size of the current entry
+ assert(!is_end());
+ assert(header_size() == container.size_before(0)); // size_before() is cumulative and starts at the header size
+ assert(container.size_before(_index + 1) > container.size_before(_index));
+ return container.size_before(_index + 1) - // difference of two cumulative sizes
+ container.size_before(_index);
+ }
+ node_offset_t size_overhead() const { // forwards to container.size_overhead_at() for the current position
+ assert(!is_end());
+ return container.size_overhead_at(_index);
+ }
+
+ me_t& operator++() { // step to the next key; never steps onto the end position (see set_end())
+ assert(!is_end());
+ assert(!is_last());
+ ++_index;
+ return *this;
+ }
+ void seek_at(index_t index) { // jump to a valid key index; the end position is rejected
+ assert(index < container.keys());
+ seek_till_end(index);
+ }
+ void seek_till_end(index_t index) { // jump to index, where index == keys() yields the end iterator
+ assert(!is_end());
+ assert(this->index() == 0); // must be called on a freshly constructed iterator
+ assert(index <= container.keys());
+ _index = index;
+ }
+ void seek_last() { // jump from the initial position to the final key
+ assert(!is_end());
+ assert(index() == 0);
+ _index = container.keys() - 1;
+ }
+ void set_end() { // move from the final key to the end position
+ assert(!is_end());
+ assert(is_last());
+ ++_index;
+ }
+ // Note: possible to return an end iterator
+ MatchKindBS seek(const full_key_t<KeyT::HOBJ>& key, bool exclude_last) { // position by binary_search() for key and return its match kind
+ assert(!is_end());
+ assert(index() == 0);
+ index_t end_index = container.keys();
+ if (exclude_last) {
+ assert(end_index);
+ --end_index; // leave the final key out of the search range
+ assert(compare_to<KeyT::HOBJ>(key, container[end_index]) == MatchKindCMP::LT); // caller guarantees key sorts before the final key
+ }
+ auto ret = binary_search(key, _index, end_index, // _index is 0 here, as asserted above
+ [this] (index_t index) { return container[index]; });
+ _index = ret.index;
+ return ret.match;
+ }
+
+ template <KeyT KT, typename T = value_t>
+ std::enable_if_t<IS_BOTTOM, const T*> insert( // IS_BOTTOM only: forwards to container_t::insert_at() at the current index
+ NodeExtentMutable& mut, const full_key_t<KT>& key,
+ const value_t& value, node_offset_t insert_size, const char* p_left_bound) {
+ return container_t::template insert_at<KT>(
+ mut, container, key, value, _index, insert_size, p_left_bound);
+ }
+
+ template <KeyT KT, typename T = memory_range_t>
+ std::enable_if_t<!IS_BOTTOM, T> insert_prefix( // !IS_BOTTOM only: forwards to container_t::insert_prefix_at() at the current index
+ NodeExtentMutable& mut, const full_key_t<KT>& key,
+ node_offset_t size, const char* p_left_bound) {
+ return container_t::template insert_prefix_at<KT>(
+ mut, container, key, _index, size, p_left_bound);
+ }
+
+ template <typename T = void>
+ std::enable_if_t<!IS_BOTTOM, T>
+ update_size(NodeExtentMutable& mut, node_offset_t insert_size) { // !IS_BOTTOM only: forwards to container_t::update_size_at()
+ assert(!is_end());
+ container_t::update_size_at(mut, container, _index, insert_size);
+ }
+
+ // Note: possible to return an end iterator when is_exclusive is true
+ template <bool is_exclusive>
+ size_t seek_split_inserted( // seek to a split index while accounting for a pending insertion; returns the used size at that index
+ size_t start_size, size_t extra_size, size_t target_size,
+ index_t& insert_index, size_t insert_size,
+ std::optional<bool>& is_insert_left) {
+ assert(!is_end());
+ assert(index() == 0);
+ // replace insert_index placeholder
+ if constexpr (!is_exclusive) {
+ if (insert_index == INDEX_LAST) {
+ insert_index = container.keys() - 1;
+ }
+ } else {
+ if (insert_index == INDEX_END) {
+ insert_index = container.keys();
+ }
+ }
+ assert(insert_index <= container.keys());
+
+ auto start_size_1 = start_size + extra_size; // extra_size is charged as soon as index > 0
+ auto f_get_used_size = [this, start_size, start_size_1, // used size when the split happens at `index`
+ insert_index, insert_size] (index_t index) {
+ size_t current_size;
+ if (unlikely(index == 0)) {
+ current_size = start_size;
+ } else {
+ current_size = start_size_1;
+ if (index > insert_index) { // the split lies past the insertion point, so the inserted item is counted
+ current_size += insert_size;
+ if constexpr (is_exclusive) {
+ --index; // NOTE(review): presumably the inserted item takes a slot of its own in exclusive mode, hence the shift back to a container index — confirm
+ }
+ }
+ // already includes header size
+ current_size += container.size_before(index);
+ }
+ return current_size;
+ };
+ index_t s_end; // upper bound handed to binary_search_r()
+ if constexpr (is_exclusive) {
+ s_end = container.keys();
+ } else {
+ s_end = container.keys() - 1;
+ }
+ _index = binary_search_r(0, s_end, f_get_used_size, target_size).index;
+ size_t current_size = f_get_used_size(_index);
+ assert(current_size <= target_size); // the chosen split must fit into target_size
+
+ _left_or_right<is_exclusive>(_index, insert_index, is_insert_left); // decide on which side of the split the insertion lands
+ return current_size;
+ }
+
+ size_t seek_split(size_t start_size, size_t extra_size, size_t target_size) { // same as seek_split_inserted() but without a pending insertion
+ assert(!is_end());
+ assert(index() == 0);
+ auto start_size_1 = start_size + extra_size;
+ auto f_get_used_size = [this, start_size, start_size_1] (index_t index) { // used size when the split happens at `index`
+ size_t current_size;
+ if (unlikely(index == 0)) {
+ current_size = start_size;
+ } else {
+ // already includes header size
+ current_size = start_size_1 + container.size_before(index);
+ }
+ return current_size;
+ };
+ _index = binary_search_r(
+ 0, container.keys() - 1, f_get_used_size, target_size).index;
+ size_t current_size = f_get_used_size(_index);
+ assert(current_size <= target_size);
+ return current_size;
+ }
+
+ // Note: possible to return an end iterator if to_index == INDEX_END
+ template <KeyT KT>
+ void copy_out_until( // append [_index, to_index) to the appender and advance; to_index placeholders are resolved in place
+ typename container_t::template Appender<KT>& appender, index_t& to_index) {
+ auto num_keys = container.keys();
+ index_t items;
+ if (to_index == INDEX_END) { // copy everything that remains and become the end iterator
+ items = num_keys - _index;
+ appender.append(container, _index, items);
+ _index = num_keys;
+ to_index = _index;
+ } else if (to_index == INDEX_LAST) { // copy everything before the final key and stop on it
+ assert(!is_end());
+ items = num_keys - 1 - _index;
+ appender.append(container, _index, items);
+ _index = num_keys - 1;
+ to_index = _index;
+ } else { // concrete target index
+ assert(_index <= to_index);
+ assert(to_index <= num_keys);
+ items = to_index - _index;
+ appender.append(container, _index, items);
+ _index = to_index;
+ }
+ }
+
+ node_offset_t trim_until(NodeExtentMutable& mut) { // forwards to container_t::trim_until() with the current index
+ return container_t::trim_until(mut, container, _index);
+ }
+
+ template <typename T = node_offset_t>
+ std::enable_if_t<!IS_BOTTOM, T>
+ trim_at(NodeExtentMutable& mut, node_offset_t trimmed) { // !IS_BOTTOM only: forwards to container_t::trim_at()
+ return container_t::trim_at(mut, container, _index, trimmed);
+ }
+
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const { // written as the container followed by the index
+ container.encode(p_node_start, encoded);
+ ceph::encode(_index, encoded);
+ }
+
+ static me_t decode(const char* p_node_start, // reads back what encode() wrote, in the same order
+ ceph::bufferlist::const_iterator& delta) {
+ auto container = container_t::decode(p_node_start, delta);
+ auto ret = me_t(container);
+ index_t index;
+ ceph::decode(index, delta);
+ ret.seek_till_end(index); // seek_till_end() accepts the end position as well
+ return ret;
+ }
+
+ static node_offset_t header_size() { // forwards to container_t::header_size()
+ return container_t::header_size();
+ }
+
+ template <KeyT KT>
+ static node_offset_t estimate_insert( // forwards to container_t::estimate_insert<KT>()
+ const full_key_t<KT>& key, const value_t& value) {
+ return container_t::template estimate_insert<KT>(key, value);
+ }
+
+ private:
+ container_t container; // held by value
+ index_t _index = 0; // current position; keys() means end
+ };
+
+ template <ContainerType CTYPE>
+ class _iterator_t<CTYPE, std::enable_if_t<CTYPE == ContainerType::ITERATIVE>> { // iterator over an iterative container: the container itself is the cursor
+ /*
+ * iterative container type system (!IS_BOTTOM):
+ * CONTAINER_TYPE = ContainerType::ITERATIVE
+ * index() const -> index_t
+ * get_key() const -> key_get_type
+ * size() const -> node_offset_t
+ * size_to_nxt() const -> node_offset_t
+ * size_overhead() const -> node_offset_t
+ * get_nxt_container() const
+ * has_next() const -> bool
+ * encode(p_node_start, encoded)
+ * decode(p_node_start, delta) -> container_t
+ * operator++()
+ * static:
+ * header_size() -> node_offset_t
+ * estimate_insert(key, value) -> node_offset_t
+ * insert_prefix(mut, src, key, is_end, size, p_left_bound) -> memory_range_t
+ * update_size(mut, src, size)
+ * trim_until(mut, container) -> trim_size
+ * trim_at(mut, container, trimmed) -> trim_size
+ */
+ // currently the iterative iterator is only implemented with STAGE_STRING
+ // for in-node space efficiency
+ static_assert(STAGE == STAGE_STRING);
+ public:
+ using me_t = _iterator_t<CTYPE>;
+
+ _iterator_t(const container_t& container) : container{container} {} // keeps its own copy of the container as the cursor
+
+ index_t index() const { // the end position is reported as one past the container's own index
+ if (is_end()) {
+ return container.index() + 1;
+ } else {
+ return container.index();
+ }
+ }
+ key_get_type get_key() const { // key of the current entry
+ assert(!is_end());
+ return container.get_key();
+ }
+ node_offset_t size_to_nxt() const { // forwards to container.size_to_nxt()
+ assert(!is_end());
+ return container.size_to_nxt();
+ }
+ const typename NXT_STAGE_T::container_t get_nxt_container() const { // next-stage container of the current entry
+ assert(!is_end());
+ return container.get_nxt_container();
+ }
+ bool is_last() const { // true when the container has no further entry
+ assert(!is_end());
+ return !container.has_next();
+ }
+ bool is_end() const { // end is tracked by a flag because the container cannot step past its last entry
+#ifndef NDEBUG
+ if (_is_end) {
+ assert(!container.has_next()); // end may only be set while on the last entry
+ }
+#endif
+ return _is_end;
+ }
+ node_offset_t size() const { // forwards to container.size()
+ assert(!is_end());
+ return container.size();
+ }
+ node_offset_t size_overhead() const { // forwards to container.size_overhead()
+ assert(!is_end());
+ return container.size_overhead();
+ }
+
+ me_t& operator++() { // step to the next entry; never steps onto the end position (see set_end())
+ assert(!is_end());
+ assert(!is_last());
+ ++container;
+ return *this;
+ }
+ void seek_at(index_t index) { // advance `index` entries from the initial position, one entry at a time
+ assert(!is_end());
+ assert(this->index() == 0);
+ while (index > 0) {
+ assert(container.has_next()); // the target must be an existing entry
+ ++container;
+ --index;
+ }
+ }
+ void seek_till_end(index_t index) { // like seek_at(), but one past the last entry yields the end iterator
+ assert(!is_end());
+ assert(this->index() == 0);
+ while (index > 0) {
+ if (!container.has_next()) {
+ assert(index == 1); // only exactly one step past the last entry is allowed
+ set_end();
+ break;
+ }
+ ++container;
+ --index;
+ }
+ }
+ void seek_last() { // advance from the initial position until no entry follows
+ assert(!is_end());
+ assert(index() == 0);
+ while (container.has_next()) {
+ ++container;
+ }
+ }
+ void set_end() { // mark the end position; only valid while on the last entry
+ assert(!is_end());
+ assert(is_last());
+ _is_end = true;
+ }
+ // Note: possible to return an end iterator
+ MatchKindBS seek(const full_key_t<KeyT::HOBJ>& key, bool exclude_last) { // linear scan for the first entry whose key is not less than `key`
+ assert(!is_end());
+ assert(index() == 0);
+ do {
+ if (exclude_last && is_last()) { // the last entry is not compared outside the assert; the caller guarantees key sorts before it
+ assert(compare_to<KeyT::HOBJ>(key, get_key()) == MatchKindCMP::LT);
+ return MatchKindBS::NE;
+ }
+ auto match = compare_to<KeyT::HOBJ>(key, get_key());
+ if (match == MatchKindCMP::LT) { // key sorts before the current entry: stop here without a match
+ return MatchKindBS::NE;
+ } else if (match == MatchKindCMP::EQ) {
+ return MatchKindBS::EQ;
+ } else { // key sorts after the current entry: keep scanning
+ if (container.has_next()) {
+ ++container;
+ } else {
+ // end
+ break;
+ }
+ }
+ } while (true);
+ assert(!exclude_last); // reaching the end is impossible when the last entry was excluded
+ set_end();
+ return MatchKindBS::NE;
+ }
+
+ template <KeyT KT>
+ memory_range_t insert_prefix( // forwards to container_t::insert_prefix(), passing whether this is the end iterator
+ NodeExtentMutable& mut, const full_key_t<KT>& key,
+ node_offset_t size, const char* p_left_bound) {
+ return container_t::template insert_prefix<KT>(
+ mut, container, key, is_end(), size, p_left_bound);
+ }
+
+ void update_size(NodeExtentMutable& mut, node_offset_t insert_size) { // forwards to container_t::update_size()
+ assert(!is_end());
+ container_t::update_size(mut, container, insert_size);
+ }
+
+ // Note: possible to return an end iterator when is_exclusive is true
+ // insert_index can still be INDEX_LAST or INDEX_END
+ template <bool is_exclusive>
+ size_t seek_split_inserted( // walk forward while the accumulated size (including a pending insertion) fits target_size; returns that size
+ size_t start_size, size_t extra_size, size_t target_size,
+ index_t& insert_index, size_t insert_size,
+ std::optional<bool>& is_insert_left) {
+ assert(!is_end());
+ assert(index() == 0);
+ size_t current_size = start_size; // size accepted so far
+ index_t split_index = 0; // split position; in exclusive mode it also counts the inserted item
+ extra_size += header_size(); // NOTE(review): seek_split() below does not add header_size() to extra_size — confirm the difference is intended
+ do {
+ if constexpr (!is_exclusive) {
+ if (is_last()) { // non-exclusive mode never moves past the last entry
+ assert(split_index == index());
+ if (insert_index == INDEX_LAST) {
+ insert_index = index(); // resolve the placeholder now that the last index is known
+ }
+ assert(insert_index <= index());
+ break;
+ }
+ }
+
+ size_t nxt_size = current_size; // tentative size if the current entry is taken as well
+ if (split_index == 0) {
+ nxt_size += extra_size; // charged once, together with the first entry
+ }
+ if (split_index == insert_index) { // the pending insertion lands at this position
+ nxt_size += insert_size;
+ if constexpr (is_exclusive) {
+ if (nxt_size > target_size) {
+ break;
+ }
+ current_size = nxt_size;
+ ++split_index; // the inserted item takes a split position of its own
+ }
+ }
+ nxt_size += size();
+ if (nxt_size > target_size) { // taking the current entry would overflow the target
+ break;
+ }
+ current_size = nxt_size;
+
+ if constexpr (is_exclusive) {
+ if (is_last()) { // everything fits: become the end iterator
+ assert(split_index == index());
+ set_end();
+ split_index = index();
+ if (insert_index == INDEX_END) {
+ insert_index = index(); // resolve the placeholder to the end index
+ }
+ assert(insert_index == index());
+ break;
+ } else {
+ ++(*this);
+ ++split_index;
+ }
+ } else {
+ ++(*this);
+ ++split_index;
+ }
+ } while (true);
+ assert(current_size <= target_size);
+
+ _left_or_right<is_exclusive>(split_index, insert_index, is_insert_left); // may adjust split_index, which is a local here
+ assert(split_index == index());
+ return current_size;
+ }
+
+ size_t seek_split(size_t start_size, size_t extra_size, size_t target_size) { // walk forward while the accumulated size fits target_size; never passes the last entry
+ assert(!is_end());
+ assert(index() == 0);
+ size_t current_size = start_size;
+ do {
+ if (is_last()) {
+ break;
+ }
+
+ size_t nxt_size = current_size; // tentative size if the current entry is taken as well
+ if (index() == 0) {
+ nxt_size += extra_size; // charged once, together with the first entry
+ }
+ nxt_size += size();
+ if (nxt_size > target_size) {
+ break;
+ }
+ current_size = nxt_size;
+ ++(*this);
+ } while (true);
+ assert(current_size <= target_size);
+ return current_size;
+ }
+
+ // Note: possible to return an end iterator if to_index == INDEX_END
+ template <KeyT KT>
+ void copy_out_until( // hand entries to the appender up to to_index; to_index is updated to the resulting index
+ typename container_t::template Appender<KT>& appender, index_t& to_index) {
+ if (is_end()) { // already at the end: nothing to copy
+ assert(!container.has_next());
+ if (to_index == INDEX_END) {
+ to_index = index();
+ }
+ assert(to_index == index());
+ return;
+ }
+ index_t items;
+ if (to_index == INDEX_END || to_index == INDEX_LAST) {
+ items = to_index; // the placeholder is passed through to the appender unchanged
+ } else {
+ assert(is_valid_index(to_index));
+ assert(index() <= to_index);
+ items = to_index - index();
+ }
+ if (appender.append(container, items)) { // a true result from append() turns this into the end iterator
+ set_end();
+ }
+ to_index = index();
+ }
+
+ node_offset_t trim_until(NodeExtentMutable& mut) { // trims nothing and returns 0 for the end iterator
+ if (is_end()) {
+ return 0;
+ }
+ return container_t::trim_until(mut, container);
+ }
+
+ node_offset_t trim_at(NodeExtentMutable& mut, node_offset_t trimmed) { // forwards to container_t::trim_at()
+ assert(!is_end());
+ return container_t::trim_at(mut, container, trimmed);
+ }
+
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const { // written as the container followed by a one-byte end flag
+ container.encode(p_node_start, encoded);
+ uint8_t is_end = _is_end;
+ ceph::encode(is_end, encoded);
+ }
+
+ static me_t decode(const char* p_node_start, // reads back what encode() wrote, in the same order
+ ceph::bufferlist::const_iterator& delta) {
+ auto container = container_t::decode(p_node_start, delta);
+ auto ret = me_t(container);
+ uint8_t is_end;
+ ceph::decode(is_end, delta);
+ if (is_end) {
+ ret.set_end(); // set_end() asserts that the decoded container is on its last entry
+ }
+ return ret;
+ }
+
+ static node_offset_t header_size() { // forwards to container_t::header_size()
+ return container_t::header_size();
+ }
+
+ template <KeyT KT>
+ static node_offset_t estimate_insert(const full_key_t<KT>& key, const value_t& value) { // forwards to container_t::estimate_insert<KT>()
+ return container_t::template estimate_insert<KT>(key, value);
+ }
+
+ private:
+ container_t container; // held by value; doubles as the cursor
+ bool _is_end = false; // set once the iterator has moved past the last entry
+ };
+
+ /*
+ * iterator_t encapsulates both indexable and iterative implementations
+ * from a *non-empty* container.
+ * cstr(const container_t&)
+ * access:
+ * index() -> index_t
+ * get_key() -> key_get_type (const reference or value type)
+ * is_last() -> bool
+ * is_end() -> bool
+ * size() -> node_offset_t
+ * size_overhead() -> node_offset_t
+ * (IS_BOTTOM) get_p_value() -> const value_t*
+ * (!IS_BOTTOM) get_nxt_container() -> nxt_stage::container_t
+ * (!IS_BOTTOM) size_to_nxt() -> node_offset_t
+ * seek:
+ * operator++() -> iterator_t&
+ * seek_at(index)
+ * seek_till_end(index)
+ * seek_last()
+ * set_end()
+ * seek(key, exclude_last) -> MatchKindBS
+ * insert:
+ * (IS_BOTTOM) insert(mut, key, value, size, p_left_bound) -> p_value
+ * (!IS_BOTTOM) insert_prefix(mut, key, size, p_left_bound) -> memory_range_t
+ * (!IS_BOTTOM) update_size(mut, size)
+ * split:
+ * seek_split_inserted<bool is_exclusive>(
+ * start_size, extra_size, target_size, insert_index, insert_size,
+ * std::optional<bool>& is_insert_left)
+ * -> insert to left/right/unknown (!exclusive)
+ * -> insert to left/right (exclusive, can be end)
+ * -> split_size
+ * seek_split(start_size, extra_size, target_size) -> split_size
+ * copy_out_until(appender, to_index) (can be end)
+ * trim_until(mut) -> trim_size
+ * (!IS_BOTTOM) trim_at(mut, trimmed) -> trim_size
+ * denc:
+ * encode(p_node_start, encoded)
+ * decode(p_node_start, delta) -> iterator_t
+ * static:
+ * header_size() -> node_offset_t
+ * estimate_insert(key, value) -> node_offset_t
+ */
+ using iterator_t = _iterator_t<CONTAINER_TYPE>; // picks the INDEXABLE or ITERATIVE specialization according to CONTAINER_TYPE
+ /* TODO: detailed comments
+ * - trim_until(mut) -> trim_size
+ * * keep 0 to i - 1, and remove the rest, return the size trimmed.
+ * * if this is the end iterator, do nothing and return 0.
+ * * if this is the start iterator, normally needs to go to the higher
+ * stage to trim the entire container.
+ * - trim_at(mut, trimmed) -> trim_size
+ * * trim happens inside the current iterator, causing the size reduced by
+ * <trimmed>, return the total size trimmed.
+ */
+
+ /*
+ * Lookup internals (hide?)
+ */
+
+ template <bool GET_KEY>
+ static result_t smallest_result( // build the result for the first slot beneath iter's current entry
+ const iterator_t& iter, full_key_t<KeyT::VIEW>* index_key) {
+ static_assert(!IS_BOTTOM); // needs a next stage to descend into
+ assert(!iter.is_end());
+ auto pos_smallest = NXT_STAGE_T::position_t::begin(); // begin position of the next stage
+ auto nxt_container = iter.get_nxt_container();
+ auto value_ptr = NXT_STAGE_T::template get_p_value<GET_KEY>( // when GET_KEY, this call also sets the lower stages' keys in index_key
+ nxt_container, pos_smallest, index_key);
+ if constexpr (GET_KEY) {
+ index_key->set(iter.get_key()); // then set this stage's key
+ }
+ return result_t{{iter.index(), pos_smallest}, value_ptr, STAGE}; // third field (a match stat elsewhere in this file) carries STAGE here
+ }
+
+ template <bool GET_KEY>
+ static result_t nxt_lower_bound( // continue a lower-bound lookup inside the next stage of iter's current entry
+ const full_key_t<KeyT::HOBJ>& key, iterator_t& iter,
+ MatchHistory& history, full_key_t<KeyT::VIEW>* index_key) {
+ static_assert(!IS_BOTTOM);
+ assert(!iter.is_end());
+ auto nxt_container = iter.get_nxt_container();
+ auto nxt_result = NXT_STAGE_T::template lower_bound<GET_KEY>(
+ nxt_container, key, history, index_key);
+ if (nxt_result.is_end()) { // nothing under the current entry qualifies
+ if (iter.is_last()) {
+ return result_t::end(); // and there is no following entry at this stage either
+ } else {
+ return smallest_result<GET_KEY>(++iter, index_key); // advances iter and takes the first slot under the following entry
+ }
+ } else {
+ if constexpr (GET_KEY) {
+ index_key->set(iter.get_key());
+ }
+ return result_t::from_nxt(iter.index(), nxt_result); // combine this stage's index with the next-stage result
+ }
+ }
+
+ template <bool GET_POS, bool GET_KEY, bool GET_VAL>
+ static void lookup_largest_slot( // follow the last entry of every stage; each output is written only when its GET_* flag is set
+ const container_t& container, position_t* p_position,
+ full_key_t<KeyT::VIEW>* p_index_key, const value_t** pp_value) {
+ auto iter = iterator_t(container);
+ iter.seek_last();
+ if constexpr (GET_KEY) {
+ assert(p_index_key);
+ p_index_key->set(iter.get_key());
+ }
+ if constexpr (GET_POS) {
+ assert(p_position);
+ p_position->index = iter.index();
+ }
+ if constexpr (IS_BOTTOM) {
+ if constexpr (GET_VAL) {
+ assert(pp_value);
+ *pp_value = iter.get_p_value();
+ }
+ } else {
+ auto nxt_container = iter.get_nxt_container();
+ if constexpr (GET_POS) { // the two branches differ only in whether a next-stage position pointer is passed
+ NXT_STAGE_T::template lookup_largest_slot<true, GET_KEY, GET_VAL>(
+ nxt_container, &p_position->nxt, p_index_key, pp_value);
+ } else {
+ NXT_STAGE_T::template lookup_largest_slot<false, GET_KEY, GET_VAL>(
+ nxt_container, nullptr, p_index_key, pp_value);
+ }
+ }
+ }
+
+ // Resolve `position` to a value pointer: seek to position.index at this
+ // stage, then either read the value (bottom stage) or recurse into the next
+ // stage with position.nxt. When GET_KEY is set, every visited stage stores
+ // its key into *index_key, which must then be non-null.
+ template <bool GET_KEY = false>
+ static const value_t* get_p_value(
+ const container_t& container, const position_t& position,
+ full_key_t<KeyT::VIEW>* index_key = nullptr) {
+ iterator_t cursor(container);
+ cursor.seek_at(position.index);
+ if constexpr (GET_KEY) {
+ index_key->set(cursor.get_key());
+ }
+ if constexpr (IS_BOTTOM) {
+ return cursor.get_p_value();
+ } else {
+ auto child = cursor.get_nxt_container();
+ return NXT_STAGE_T::template get_p_value<GET_KEY>(child, position.nxt, index_key);
+ }
+ }
+
+ // Fill index_key with the key found at `position`: this stage stores its
+ // own key first, then the next stage (if any) does the same for position.nxt.
+ static void get_key_view(
+ const container_t& container,
+ const position_t& position,
+ full_key_t<KeyT::VIEW>& index_key) {
+ iterator_t cursor(container);
+ cursor.seek_at(position.index);
+ index_key.set(cursor.get_key());
+ if constexpr (!IS_BOTTOM) {
+ auto child = cursor.get_nxt_container();
+ NXT_STAGE_T::get_key_view(child, position.nxt, index_key);
+ }
+ }
+
+ template <bool GET_KEY = false>
+ static result_t lower_bound( // lower-bound lookup of key in this stage and below; records each stage's comparison in history
+ const container_t& container,
+ const full_key_t<KeyT::HOBJ>& key,
+ MatchHistory& history,
+ full_key_t<KeyT::VIEW>* index_key = nullptr) {
+ bool exclude_last = false;
+ if (history.get<STAGE>().has_value()) { // this stage already has a recorded comparison
+ if (*history.get<STAGE>() == MatchKindCMP::EQ) {
+ // lookup is short-circuited
+ if constexpr (!IS_BOTTOM) {
+ assert(history.get<STAGE - 1>().has_value());
+ if (history.is_GT<STAGE - 1>()) {
+ auto iter = iterator_t(container); // positioned at the first entry
+ bool test_key_equal;
+ if constexpr (STAGE == STAGE_STRING) {
+ // TODO(cross-node string dedup)
+ // test_key_equal = (iter.get_key().type() == ns_oid_view_t::Type::MIN);
+ auto cmp = compare_to<KeyT::HOBJ>(key, iter.get_key());
+ assert(cmp != MatchKindCMP::GT);
+ test_key_equal = (cmp == MatchKindCMP::EQ);
+ } else {
+ auto cmp = compare_to<KeyT::HOBJ>(key, iter.get_key());
+ // From history, key[stage] == parent[stage][index - 1]
+ // which should be the smallest possible value for all
+ // index[stage][*]
+ assert(cmp != MatchKindCMP::GT);
+ test_key_equal = (cmp == MatchKindCMP::EQ);
+ }
+ if (test_key_equal) {
+ return nxt_lower_bound<GET_KEY>(key, iter, history, index_key);
+ } else {
+ // key[stage] < index[stage][left-most]
+ return smallest_result<GET_KEY>(iter, index_key);
+ }
+ }
+ }
+ // IS_BOTTOM || !history.is_GT<STAGE - 1>()
+ auto iter = iterator_t(container);
+ iter.seek_last(); // the match is taken from the last entry, as the asserts below check
+ if constexpr (STAGE == STAGE_STRING) {
+ // TODO(cross-node string dedup)
+ // assert(iter.get_key().type() == ns_oid_view_t::Type::MAX);
+ assert(compare_to<KeyT::HOBJ>(key, iter.get_key()) == MatchKindCMP::EQ);
+ } else {
+ assert(compare_to<KeyT::HOBJ>(key, iter.get_key()) == MatchKindCMP::EQ);
+ }
+ if constexpr (GET_KEY) {
+ index_key->set(iter.get_key());
+ }
+ if constexpr (IS_BOTTOM) {
+ auto value_ptr = iter.get_p_value();
+ return result_t{{iter.index()}, value_ptr, MSTAT_EQ};
+ } else {
+ auto nxt_container = iter.get_nxt_container();
+ auto nxt_result = NXT_STAGE_T::template lower_bound<GET_KEY>(
+ nxt_container, key, history, index_key);
+ // !history.is_GT<STAGE - 1>() means
+ // key[stage+1 ...] <= index[stage+1 ...][*]
+ assert(!nxt_result.is_end());
+ return result_t::from_nxt(iter.index(), nxt_result);
+ }
+ } else if (*history.get<STAGE>() == MatchKindCMP::LT) {
+ exclude_last = true; // seek() will leave the last entry out; it asserts that key sorts before that entry
+ }
+ }
+ auto iter = iterator_t(container); // regular path: search this stage
+ auto bs_match = iter.seek(key, exclude_last);
+ if (iter.is_end()) { // key sorts after every entry at this stage
+ assert(!exclude_last);
+ assert(bs_match == MatchKindBS::NE);
+ history.set<STAGE>(MatchKindCMP::GT);
+ return result_t::end();
+ }
+ history.set<STAGE>(bs_match == MatchKindBS::EQ ?
+ MatchKindCMP::EQ : MatchKindCMP::LT);
+ if constexpr (IS_BOTTOM) {
+ if constexpr (GET_KEY) {
+ index_key->set(iter.get_key());
+ }
+ auto value_ptr = iter.get_p_value();
+ return result_t{{iter.index()}, value_ptr,
+ (bs_match == MatchKindBS::EQ ? MSTAT_EQ : MSTAT_LT0)};
+ } else {
+ if (bs_match == MatchKindBS::EQ) {
+ return nxt_lower_bound<GET_KEY>(key, iter, history, index_key); // exact match here: keep searching in the next stage
+ } else {
+ return smallest_result<GET_KEY>(iter, index_key); // key sorts before this entry: take the first slot beneath it
+ }
+ }
+ }
+
+ // Size needed to insert a brand-new entry at this stage: this stage's own
+ // estimate plus, when there is a next stage, that stage's container header
+ // and its insert size (recursively).
+ template <KeyT KT>
+ static node_offset_t insert_size(const full_key_t<KT>& key, const value_t& value) {
+ const node_offset_t own_size = iterator_t::template estimate_insert<KT>(key, value);
+ if constexpr (!IS_BOTTOM) {
+ return own_size +
+ NXT_STAGE_T::iterator_t::header_size() +
+ NXT_STAGE_T::template insert_size<KT>(key, value);
+ } else {
+ return own_size;
+ }
+ }
+
+ // Insert size for an insertion that happens at `stage`, which must be this
+ // stage or a lower one; lower stages are handled by delegating downwards.
+ // NOTE(review): key is typed as full_key_t<KeyT::HOBJ> although the function
+ // is templated on KT — confirm that only KT == KeyT::HOBJ is intended.
+ template <KeyT KT>
+ static node_offset_t insert_size_at(
+ match_stage_t stage, const full_key_t<KeyT::HOBJ>& key, const value_t& value) {
+ if (stage != STAGE) {
+ assert(stage < STAGE);
+ return NXT_STAGE_T::template insert_size_at<KT>(stage, key, value);
+ }
+ return insert_size<KT>(key, value);
+ }
+
+ template <typename T = std::tuple<match_stage_t, node_offset_t>>
+ static std::enable_if_t<NODE_TYPE == node_type_t::INTERNAL, T> evaluate_insert( // INTERNAL nodes only: returns {insert stage, insert size} and adjusts position in place
+ const container_t& container, const full_key_t<KeyT::VIEW>& key,
+ const value_t& value, position_t& position, bool evaluate_last) {
+ auto iter = iterator_t(container);
+ auto& index = position.index;
+ if (evaluate_last || index == INDEX_END) {
+ iter.seek_last();
+ index = iter.index(); // resolve to the concrete last index
+ // evaluate the previous index
+ } else {
+ assert(is_valid_index(index));
+ // evaluate the current index
+ iter.seek_at(index);
+ auto match = compare_to<KeyT::VIEW>(key, iter.get_key());
+ if (match == MatchKindCMP::EQ) {
+ if constexpr (IS_BOTTOM) {
+ ceph_abort("insert conflict at current index!");
+ } else {
+ // insert into the current index
+ auto nxt_container = iter.get_nxt_container();
+ return NXT_STAGE_T::evaluate_insert(
+ nxt_container, key, value, position.nxt, false);
+ }
+ } else {
+ assert(match == MatchKindCMP::LT); // key is expected to sort before the entry at the current index
+ if (index == 0) {
+ // already the first index, so insert at the current index
+ return {STAGE, insert_size<KeyT::VIEW>(key, value)};
+ }
+ --index;
+ iter = iterator_t(container); // restart from the beginning, since seek_at() requires an iterator at index 0
+ iter.seek_at(index);
+ // proceed to evaluate the previous index
+ }
+ }
+
+ // XXX(multi-type): when key is from a different type of node
+ auto match = compare_to<KeyT::VIEW>(key, iter.get_key()); // compare against the previous (or last) entry
+ if (match == MatchKindCMP::GT) {
+ // key doesn't match both indexes, so insert at the current index
+ ++index; // undo the step back: the new entry goes right after the evaluated one
+ return {STAGE, insert_size<KeyT::VIEW>(key, value)};
+ } else {
+ assert(match == MatchKindCMP::EQ);
+ if constexpr (IS_BOTTOM) {
+ // ceph_abort?
+ ceph_abort("insert conflict at the previous index!");
+ } else {
+ // insert into the previous index
+ auto nxt_container = iter.get_nxt_container();
+ return NXT_STAGE_T::evaluate_insert(
+ nxt_container, key, value, position.nxt, true); // evaluate_last = true: the next stage evaluates its last entry
+ }
+ }
+ }
+
+ // LEAF nodes only. Rewrites `position` for an insertion at `stage`: the
+ // insert stage must currently be at index 0 and is set to INDEX_END. Each
+ // higher stage either steps back by one index (and stops the propagation by
+ // returning false) or, when already at index 0, is set to INDEX_LAST and
+ // keeps propagating upwards by returning true.
+ template <typename T = bool>
+ static std::enable_if_t<NODE_TYPE == node_type_t::LEAF, T>
+ compensate_insert_position_at(match_stage_t stage, position_t& position) {
+ auto& cur_index = position.index;
+ if (stage == STAGE) {
+ assert(cur_index == 0);
+ // insert at the end of the current stage
+ cur_index = INDEX_END;
+ return true;
+ }
+ if constexpr (IS_BOTTOM) {
+ ceph_abort("impossible path");
+ } else {
+ assert(stage < STAGE);
+ if (!NXT_STAGE_T::compensate_insert_position_at(stage, position.nxt)) {
+ return false;
+ }
+ assert(is_valid_index(cur_index));
+ if (cur_index != 0) {
+ --cur_index;
+ return false;
+ }
+ // insert into the *last* index of the current stage
+ cur_index = INDEX_LAST;
+ return true;
+ }
+ }
+
+ // Rewrite insert_pos with placeholders for an insertion at the end:
+ // INDEX_END at the insert stage itself, INDEX_LAST at every stage above it.
+ static void patch_insert_end(position_t& insert_pos, match_stage_t insert_stage) {
+ assert(insert_stage <= STAGE);
+ if (insert_stage == STAGE) {
+ insert_pos.index = INDEX_END;
+ return;
+ }
+ if constexpr (!IS_BOTTOM) {
+ insert_pos.index = INDEX_LAST;
+ NXT_STAGE_T::patch_insert_end(insert_pos.nxt, insert_stage);
+ }
+ }
+
+ template <typename T = std::tuple<match_stage_t, node_offset_t>>
+ static std::enable_if_t<NODE_TYPE == node_type_t::LEAF, T> evaluate_insert( // LEAF nodes only: derive {insert stage, insert size} from a finished lookup and fix up position
+ const full_key_t<KeyT::HOBJ>& key, const onode_t& value,
+ const MatchHistory& history, match_stat_t mstat, position_t& position) {
+ match_stage_t insert_stage = STAGE_TOP;
+ while (*history.get_by_stage(insert_stage) == MatchKindCMP::EQ) { // walk down from the top stage past every stage that matched exactly
+ assert(insert_stage != STAGE_BOTTOM && "insert conflict!"); // EQ at every stage would mean the key already exists
+ --insert_stage;
+ }
+
+ if (history.is_GT()) {
+ if (position.is_end()) {
+ // no need to compensate insert position
+ assert(insert_stage <= STAGE && "impossible insert stage");
+ } else if (position == position_t::begin()) {
+ // I must be short-circuited by staged::smallest_result()
+ // in staged::lower_bound(), so we need to rely on mstat instead
+ assert(mstat >= MSTAT_LT0 && mstat <= MSTAT_LT3);
+ if (mstat == MSTAT_LT0) {
+ insert_stage = STAGE_RIGHT;
+ } else if (mstat == MSTAT_LT1) {
+ insert_stage = STAGE_STRING;
+ } else {
+ insert_stage = STAGE_LEFT; // remaining values allowed by the assert above
+ }
+ // XXX(multi-type): need to upgrade node type before inserting an
+ // incompatible index at front.
+ assert(insert_stage <= STAGE && "incompatible insert");
+ } else {
+ assert(insert_stage <= STAGE && "impossible insert stage");
+ [[maybe_unused]] bool ret = compensate_insert_position_at(insert_stage, position);
+ assert(!ret); // the compensation is expected to stop before propagating past the top stage
+ }
+ }
+
+ if (position.is_end()) {
+ patch_insert_end(position, insert_stage); // replace the end position with INDEX_LAST/INDEX_END placeholders
+ }
+
+ node_offset_t insert_size = insert_size_at<KeyT::HOBJ>(insert_stage, key, value);
+
+ return {insert_stage, insert_size};
+ }
+
+ template <KeyT KT>
+ static const value_t* insert_new( // write a new key/value through StagedAppender into `range`; returns the pointer the appender stored for the value
+ NodeExtentMutable& mut, const memory_range_t& range,
+ const full_key_t<KT>& key, const value_t& value) {
+ char* p_insert = const_cast<char*>(range.p_end); // the appender is initialized at the end of the range
+ const value_t* p_value = nullptr;
+ StagedAppender<KT> appender;
+ appender.init(&mut, p_insert);
+ appender.append(key, value, p_value); // p_value is an output of append()
+ [[maybe_unused]] const char* p_insert_front = appender.wrap();
+ assert(p_insert_front == range.p_start); // wrap() must return exactly the start of the range
+ return p_value;
+ }
+
+ template <KeyT KT, bool SPLIT>
+ static const value_t* proceed_insert_recursively( // descend along position to the insert stage and insert there; position, stage and _insert_size may be updated
+ NodeExtentMutable& mut, const container_t& container,
+ const full_key_t<KT>& key, const value_t& value,
+ position_t& position, match_stage_t& stage,
+ node_offset_t& _insert_size, const char* p_left_bound) {
+ // proceed insert from right to left
+ assert(stage <= STAGE);
+ auto iter = iterator_t(container);
+ auto& index = position.index;
+
+ bool do_insert = false; // true when the insertion happens at this stage
+ if (stage == STAGE) {
+ if (index == INDEX_END) {
+ iter.seek_last();
+ iter.set_end();
+ index = iter.index(); // resolve the placeholder to the concrete end index
+ } else {
+ assert(is_valid_index(index));
+ iter.seek_till_end(index);
+ }
+ do_insert = true;
+ } else { // stage < STAGE
+ if (index == INDEX_LAST) {
+ iter.seek_last();
+ index = iter.index(); // resolve the placeholder to the concrete last index
+ } else {
+ assert(is_valid_index(index));
+ iter.seek_till_end(index);
+ }
+ if constexpr (SPLIT) {
+ if (iter.is_end()) {
+ // insert at the higher stage due to split
+ do_insert = true;
+ _insert_size = insert_size<KT>(key, value); // recomputed for this stage
+ stage = STAGE;
+ }
+ } else {
+ assert(!iter.is_end());
+ }
+ }
+
+ if (do_insert) {
+ if constexpr (!IS_BOTTOM) {
+ position.nxt = position_t::nxt_t::begin(); // the new entry starts at the begin position of the stages below
+ }
+ assert(_insert_size == insert_size<KT>(key, value));
+ if constexpr (IS_BOTTOM) {
+ return iter.template insert<KT>(
+ mut, key, value, _insert_size, p_left_bound);
+ } else {
+ auto range = iter.template insert_prefix<KT>( // insert this stage's part, then fill the returned range with the rest
+ mut, key, _insert_size, p_left_bound);
+ return NXT_STAGE_T::template insert_new<KT>(mut, range, key, value);
+ }
+ } else {
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_container = iter.get_nxt_container();
+ auto p_value = NXT_STAGE_T::template proceed_insert_recursively<KT, SPLIT>(
+ mut, nxt_container, key, value,
+ position.nxt, stage, _insert_size, p_left_bound);
+ iter.update_size(mut, _insert_size); // after the lower-stage insert, update this entry's size by _insert_size
+ return p_value;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ }
+
+ template <KeyT KT, bool SPLIT>
+ static const value_t* proceed_insert( // insert entry point: handles an empty container directly, otherwise recurses through the stages
+ NodeExtentMutable& mut, const container_t& container,
+ const full_key_t<KT>& key, const value_t& value,
+ position_t& position, match_stage_t& stage, node_offset_t& _insert_size) {
+ auto p_left_bound = container.p_left_bound();
+ if (unlikely(!container.keys())) { // empty container: iterator_t cannot be built on it, so insert directly at index 0
+ if (position.is_end()) {
+ position = position_t::begin();
+ assert(stage == STAGE);
+ assert(_insert_size == insert_size<KT>(key, value));
+ } else if (position == position_t::begin()) {
+ // when insert into a trimmed and empty left node
+ stage = STAGE;
+ _insert_size = insert_size<KT>(key, value);
+ } else {
+ ceph_abort("impossible path");
+ }
+ if constexpr (IS_BOTTOM) {
+ return container_t::template insert_at<KT>(
+ mut, container, key, value, 0, _insert_size, p_left_bound);
+ } else {
+ auto range = container_t::template insert_prefix_at<KT>(
+ mut, container, key, 0, _insert_size, p_left_bound);
+ return NXT_STAGE_T::template insert_new<KT>(mut, range, key, value);
+ }
+ } else {
+ return proceed_insert_recursively<KT, SPLIT>(
+ mut, container, key, value,
+ position, stage, _insert_size, p_left_bound);
+ }
+ }
+
+ static std::ostream& dump(const container_t& container, // print every entry of this stage and below to os; `size` is a running total updated along the way
+ std::ostream& os,
+ const std::string& prefix,
+ size_t& size,
+ const char* p_start) {
+ auto iter = iterator_t(container);
+ assert(!iter.is_end());
+ std::string prefix_blank(prefix.size(), ' '); // blanks of the same width as prefix
+ const std::string* p_prefix = &prefix;
+ size += iterator_t::header_size();
+ do {
+ std::ostringstream sos;
+ sos << *p_prefix << iter.get_key() << ": ";
+ std::string i_prefix = sos.str(); // prefix passed down to the next stage or printed before the value
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_container = iter.get_nxt_container();
+ size += iter.size_to_nxt();
+ NXT_STAGE_T::dump(nxt_container, os, i_prefix, size, p_start);
+ } else {
+ auto value_ptr = iter.get_p_value();
+ int offset = reinterpret_cast<const char*>(value_ptr) - p_start; // byte offset of the value relative to p_start
+ size += iter.size();
+ os << "\n" << i_prefix;
+ if constexpr (NODE_TYPE == node_type_t::LEAF) {
+ os << *value_ptr;
+ } else {
+ os << "0x" << std::hex << value_ptr->value << std::dec;
+ }
+ os << " " << size << "B"
+ << " @" << offset << "B";
+ }
+ if (iter.is_last()) {
+ break;
+ } else {
+ ++iter;
+ p_prefix = &prefix_blank; // only the first entry shows the real prefix; later ones use blanks
+ }
+ } while (true);
+ return os;
+ }
+
+ // Consistency check (effective only when asserts are enabled): recursively
+ // validates every lower-stage container and asserts that the keys at this
+ // stage are strictly ascending.
+ static void validate(const container_t& container) {
+ iterator_t cursor(container);
+ assert(!cursor.is_end());
+ auto prev_key = cursor.get_key();
+ for (;;) {
+ if constexpr (!IS_BOTTOM) {
+ auto lower = cursor.get_nxt_container();
+ NXT_STAGE_T::validate(lower);
+ }
+ if (cursor.is_last()) {
+ return;
+ }
+ ++cursor;
+ assert(compare_to(prev_key, cursor.get_key()) == MatchKindCMP::LT);
+ prev_key = cursor.get_key();
+ }
+ }
+
+ static void get_stats(const container_t& container, node_stats_t& stats, // Accumulate size/count statistics of `container` and all lower stages into `stats`.
+ full_key_t<KeyT::VIEW>& index_key) { // index_key: scratch key, updated in place with each visited item's key
+ auto iter = iterator_t(container);
+ assert(!iter.is_end());
+ stats.size_overhead += iterator_t::header_size();
+ do {
+ index_key.replace(iter.get_key());
+ stats.size_overhead += iter.size_overhead();
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_container = iter.get_nxt_container();
+ NXT_STAGE_T::get_stats(nxt_container, stats, index_key);
+ } else {
+ ++stats.num_kvs; // key/value pairs are only counted at the bottom stage
+ size_t kv_logical_size = index_key.size_logical();
+ size_t value_size;
+ if constexpr (NODE_TYPE == node_type_t::LEAF) {
+ value_size = iter.get_p_value()->size;
+ } else {
+ value_size = sizeof(value_t);
+ }
+ stats.size_value += value_size;
+ kv_logical_size += value_size;
+ stats.size_logical += kv_logical_size;
+ }
+ if (iter.is_last()) {
+ break;
+ } else {
+ ++iter;
+ }
+ } while (true);
+ }
+
+ // Advance `pos` to the next position inside `container`.
+ // Returns true when `pos` already addressed the last position of this
+ // container (pos.index is then left as-is and the caller has to advance);
+ // returns false when `pos` was moved forward at this stage or a lower one.
+ static bool next_position(const container_t& container, position_t& pos) {
+ iterator_t cursor(container);
+ assert(!cursor.is_end());
+ cursor.seek_at(pos.index);
+ if constexpr (!IS_BOTTOM) {
+ auto lower = cursor.get_nxt_container();
+ if (!NXT_STAGE_T::next_position(lower, pos.nxt)) {
+ // a lower stage has already advanced pos.nxt
+ return false;
+ }
+ }
+ if (cursor.is_last()) {
+ return true;
+ }
+ pos.index = cursor.index() + 1;
+ if constexpr (!IS_BOTTOM) {
+ pos.nxt = NXT_STAGE_T::position_t::begin();
+ }
+ return false;
+ }
+
+ struct _BaseEmpty {}; // empty base, selected through std::conditional_t when IS_BOTTOM
+ class _BaseWithNxtIterator { // base selected for StagedIterator when !IS_BOTTOM: stores the next stage's iterator
+ protected:
+ typename NXT_STAGE_T::StagedIterator _nxt;
+ };
+ class StagedIterator // Multi-stage cursor: this stage's optional iterator_t plus, when !IS_BOTTOM, the next stage's StagedIterator (_nxt).
+ : std::conditional_t<IS_BOTTOM, _BaseEmpty, _BaseWithNxtIterator> {
+ public:
+ StagedIterator() = default;
+ bool valid() const { return iter.has_value(); } // true once set() or decode() has populated iter
+ index_t index() const {
+ return iter->index();
+ }
+ bool is_end() const { return iter->is_end(); }
+ bool in_progress() const { // true when a valid lower-stage iterator, reached through index-0 stages, sits at a non-zero index
+ assert(valid());
+ if constexpr (!IS_BOTTOM) {
+ if (this->_nxt.valid()) {
+ if (this->_nxt.index() == 0) {
+ return this->_nxt.in_progress();
+ } else {
+ return true;
+ }
+ } else {
+ return false;
+ }
+ } else {
+ return false;
+ }
+ }
+ key_get_type get_key() const { return iter->get_key(); }
+
+ iterator_t& get() { return *iter; }
+ void set(const container_t& container) { // may only be called while this iterator is not valid
+ assert(!valid());
+ iter = iterator_t(container);
+ }
+ void set_end() { iter->set_end(); }
+ typename NXT_STAGE_T::StagedIterator& nxt() { // like get_nxt(), but first initializes _nxt from the current item's container if it is not valid yet
+ if constexpr (!IS_BOTTOM) {
+ if (!this->_nxt.valid()) {
+ auto nxt_container = iter->get_nxt_container();
+ this->_nxt.set(nxt_container);
+ }
+ return this->_nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ typename NXT_STAGE_T::StagedIterator& get_nxt() { // returns _nxt as-is, without initializing it
+ if constexpr (!IS_BOTTOM) {
+ return this->_nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ StagedIterator& operator++() { // advancing past the last item turns this iterator into END; the lower-stage state is always dropped
+ if (iter->is_last()) {
+ iter->set_end();
+ } else {
+ ++(*iter);
+ }
+ if constexpr (!IS_BOTTOM) {
+ this->_nxt.reset();
+ }
+ return *this;
+ }
+ void reset() { // back to the invalid state, recursively for lower stages
+ if (valid()) {
+ iter.reset();
+ if constexpr (!IS_BOTTOM) {
+ this->_nxt.reset();
+ }
+ }
+ }
+ std::ostream& print(std::ostream& os, bool is_top) const { // prints "END", or the index of each stage separated by ", "
+ if (valid()) {
+ if (iter->is_end()) {
+ return os << "END";
+ } else {
+ os << index();
+ }
+ } else {
+ if (is_top) {
+ return os << "invalid StagedIterator!";
+ } else {
+ os << "0!"; // an invalid lower stage is printed as "0!"
+ }
+ }
+ if constexpr (!IS_BOTTOM) {
+ os << ", ";
+ return this->_nxt.print(os, false);
+ } else {
+ return os;
+ }
+ }
+ position_t get_pos() const { // an invalid iterator reports position_t::begin()
+ if (valid()) {
+ if constexpr (IS_BOTTOM) {
+ return position_t{index()};
+ } else {
+ return position_t{index(), this->_nxt.get_pos()};
+ }
+ } else {
+ return position_t::begin();
+ }
+ }
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const { // layout: presence byte, then iter and (when !IS_BOTTOM) _nxt if present
+ uint8_t present = static_cast<bool>(iter);
+ ceph::encode(present, encoded);
+ if (iter.has_value()) {
+ iter->encode(p_node_start, encoded);
+ if constexpr (!IS_BOTTOM) {
+ this->_nxt.encode(p_node_start, encoded);
+ }
+ }
+ }
+ static StagedIterator decode(const char* p_node_start, // reads back what encode() wrote, in the same order
+ ceph::bufferlist::const_iterator& delta) {
+ StagedIterator ret;
+ uint8_t present;
+ ceph::decode(present, delta);
+ if (present) {
+ ret.iter = iterator_t::decode(p_node_start, delta);
+ if constexpr (!IS_BOTTOM) {
+ ret._nxt = NXT_STAGE_T::StagedIterator::decode(p_node_start, delta);
+ }
+ }
+ return ret;
+ }
+ friend std::ostream& operator<<(std::ostream& os, const StagedIterator& iter) {
+ return iter.print(os, true);
+ }
+ private:
+ std::optional<iterator_t> iter; // empty means "invalid"
+ };
+
+ static bool recursively_locate_split( // Move split_at to the split point for target_size, updating current_size; returns true when the split falls after the last item of this container, in which case the caller advances.
+ size_t& current_size, size_t extra_size,
+ size_t target_size, StagedIterator& split_at) {
+ assert(current_size <= target_size);
+ iterator_t& split_iter = split_at.get();
+ current_size = split_iter.seek_split(current_size, extra_size, target_size);
+ assert(current_size <= target_size);
+ assert(!split_iter.is_end());
+ if (split_iter.index() == 0) {
+ extra_size += iterator_t::header_size(); // splitting at index 0: this stage's header is added to the extra size
+ } else {
+ extra_size = 0;
+ }
+ bool locate_nxt;
+ if constexpr (!IS_BOTTOM) {
+ locate_nxt = NXT_STAGE_T::recursively_locate_split(
+ current_size, extra_size + split_iter.size_to_nxt(),
+ target_size, split_at.nxt());
+ } else { // IS_BOTTOM
+ // located upper_bound, fair split strategy
+ size_t nxt_size = split_iter.size() + extra_size;
+ assert(current_size + nxt_size > target_size);
+ if (current_size + nxt_size/2 < target_size) { // the item is included when less than half of it (integer division) would exceed target_size
+ // include next
+ current_size += nxt_size;
+ locate_nxt = true;
+ } else {
+ // exclude next
+ locate_nxt = false;
+ }
+ }
+ if (locate_nxt) {
+ if (split_iter.is_last()) {
+ return true;
+ } else {
+ ++split_at;
+ return false;
+ }
+ } else {
+ return false;
+ }
+ }
+
+ // Locate the split point for target_size while accounting for an item of
+ // insert_size that is about to be inserted at insert_pos / insert_stage.
+ // - current_size: in/out, size accumulated on the left side of the split
+ // - is_insert_left: out, must be empty on entry; set to whether the
+ // to-be-inserted item ends up on the left side of the split
+ // - split_at: in/out, iterator moved to the located split point
+ // Returns true when the split falls after the last item of this container
+ // (the caller then advances its own iterator), false otherwise.
+ static bool recursively_locate_split_inserted(
+ size_t& current_size, size_t extra_size, size_t target_size,
+ position_t& insert_pos, match_stage_t insert_stage, size_t insert_size,
+ std::optional<bool>& is_insert_left, StagedIterator& split_at) {
+ assert(current_size <= target_size);
+ assert(!is_insert_left.has_value());
+ iterator_t& split_iter = split_at.get();
+ auto& insert_index = insert_pos.index;
+ if (insert_stage == STAGE) {
+ // the insert happens at this stage: seek_split_inserted<true> decides
+ // is_insert_left
+ current_size = split_iter.template seek_split_inserted<true>(
+ current_size, extra_size, target_size,
+ insert_index, insert_size, is_insert_left);
+ assert(is_insert_left.has_value());
+ assert(current_size <= target_size);
+ if (split_iter.index() == 0) {
+ if (insert_index == 0) {
+ if (*is_insert_left == false) {
+ extra_size += iterator_t::header_size();
+ } else {
+ extra_size = 0;
+ }
+ } else {
+ extra_size += iterator_t::header_size();
+ }
+ } else {
+ extra_size = 0;
+ }
+ if (*is_insert_left == false && split_iter.index() == insert_index) {
+ // split_iter can be end
+ // found the lower-bound of target_size
+ // ...[s_index-1] |!| (i_index) [s_index]...
+
+ // located upper-bound, fair split strategy
+ // look at the next slot (the insert item)
+ size_t nxt_size = insert_size + extra_size;
+ assert(current_size + nxt_size > target_size);
+ if (current_size + nxt_size/2 < target_size) {
+ // include next
+ *is_insert_left = true;
+ current_size += nxt_size;
+ if (split_iter.is_end()) {
+ // ...[s_index-1] (i_index) |!|
+ return true;
+ } else {
+ return false;
+ }
+ } else {
+ // exclude next
+ return false;
+ }
+ } else {
+ // Already considered insert effect in the current stage.
+ // Look into the next stage to identify the target_size lower-bound w/o
+ // insert effect.
+ assert(!split_iter.is_end());
+ bool locate_nxt;
+ if constexpr (!IS_BOTTOM) {
+ locate_nxt = NXT_STAGE_T::recursively_locate_split(
+ current_size, extra_size + split_iter.size_to_nxt(),
+ target_size, split_at.nxt());
+ } else { // IS_BOTTOM
+ // located upper-bound, fair split strategy
+ // look at the next slot
+ size_t nxt_size = split_iter.size() + extra_size;
+ assert(current_size + nxt_size > target_size);
+ if (current_size + nxt_size/2 < target_size) {
+ // include next
+ current_size += nxt_size;
+ locate_nxt = true;
+ } else {
+ // exclude next
+ locate_nxt = false;
+ }
+ }
+ if (locate_nxt) {
+ if (split_iter.is_last()) {
+ auto end_index = split_iter.index() + 1;
+ if (insert_index == INDEX_END) {
+ // resolve INDEX_END to the concrete end index
+ insert_index = end_index;
+ }
+ assert(insert_index <= end_index);
+ if (insert_index == end_index) {
+ assert(*is_insert_left == false);
+ split_iter.set_end();
+ // ...[s_index-1] |!| (i_index)
+ return false;
+ } else {
+ assert(*is_insert_left == true);
+ return true;
+ }
+ } else {
+ ++split_at;
+ return false;
+ }
+ } else {
+ return false;
+ }
+ }
+ } else {
+ if constexpr (!IS_BOTTOM) {
+ assert(insert_stage < STAGE);
+ current_size = split_iter.template seek_split_inserted<false>(
+ current_size, extra_size, target_size,
+ insert_index, insert_size, is_insert_left);
+ assert(!split_iter.is_end());
+ assert(current_size <= target_size);
+ if (split_iter.index() == 0) {
+ extra_size += iterator_t::header_size();
+ } else {
+ extra_size = 0;
+ }
+ bool locate_nxt;
+ if (!is_insert_left.has_value()) {
+ // Considered insert effect in the current stage, and insert happens
+ // in the lower stage.
+ // Look into the next stage to identify the target_size lower-bound w/
+ // insert effect.
+ assert(split_iter.index() == insert_index);
+ locate_nxt = NXT_STAGE_T::recursively_locate_split_inserted(
+ current_size, extra_size + split_iter.size_to_nxt(), target_size,
+ insert_pos.nxt, insert_stage, insert_size,
+ is_insert_left, split_at.nxt());
+ assert(is_insert_left.has_value());
+#ifndef NDEBUG
+ if (locate_nxt) {
+ assert(*is_insert_left == true);
+ }
+#endif
+ } else {
+ // is_insert_left.has_value() == true
+ // Insert will *not* happen in the lower stage.
+ // Need to look into the next stage to identify the target_size
+ // lower-bound w/ insert effect
+ assert(split_iter.index() != insert_index);
+ locate_nxt = NXT_STAGE_T::recursively_locate_split(
+ current_size, extra_size + split_iter.size_to_nxt(),
+ target_size, split_at.nxt());
+#ifndef NDEBUG
+ if (split_iter.index() < insert_index) {
+ assert(*is_insert_left == false);
+ } else {
+ assert(*is_insert_left == true);
+ }
+#endif
+ }
+ if (locate_nxt) {
+ if (split_iter.is_last()) {
+ return true;
+ } else {
+ ++split_at;
+ return false;
+ }
+ } else {
+ return false;
+ }
+ } else {
+ ceph_abort("impossible path");
+ return false;
+ }
+ }
+ }
+
+ /*
+ * container appender type system
+ * container_t::Appender(NodeExtentMutable& mut, char* p_append)
+ * append(const container_t& src, index_t from, index_t items)
+ * wrap() -> char*
+ * IF !IS_BOTTOM:
+ * open_nxt(const key_get_type&)
+ * open_nxt(const full_key_t&)
+ * -> std::tuple<NodeExtentMutable&, char*>
+ * wrap_nxt(char* p_append)
+ * ELSE
+ * append(const full_key_t& key, const value_t& value)
+ */
+ template <KeyT KT> // base selected for StagedAppender when !IS_BOTTOM: stores the next stage's appender
+ struct _BaseWithNxtAppender {
+ typename NXT_STAGE_T::template StagedAppender<KT> _nxt;
+ };
+ // Multi-stage appender: wraps this stage's container_t::Appender and, when
+ // !IS_BOTTOM, the next stage's StagedAppender (_nxt).
+ // Life cycle: init() -> append*/open_nxt()+wrap_nxt() -> wrap().
+ // The destructor asserts that the appender was wrapped (or never used).
+ template <KeyT KT>
+ class StagedAppender
+ : std::conditional_t<IS_BOTTOM, _BaseEmpty, _BaseWithNxtAppender<KT>> {
+ public:
+ StagedAppender() = default;
+ ~StagedAppender() {
+ assert(!require_wrap_nxt);
+ assert(!valid());
+ }
+ bool valid() const { return appender.has_value(); }
+ // number of items appended at this stage since init()
+ index_t index() const {
+ assert(valid());
+ return _index;
+ }
+ // true while a next-stage appender is open and not yet wrapped
+ bool in_progress() const { return require_wrap_nxt; }
+ // TODO: pass by reference
+ void init(NodeExtentMutable* p_mut, char* p_start) {
+ assert(!valid());
+ appender = typename container_t::template Appender<KT>(p_mut, p_start);
+ _index = 0;
+ }
+ // possible to make src_iter end if to_index == INDEX_END
+ void append_until(StagedIterator& src_iter, index_t& to_index) {
+ assert(!require_wrap_nxt);
+ auto s_index = src_iter.index();
+ src_iter.get().template copy_out_until<KT>(*appender, to_index);
+ assert(src_iter.index() == to_index);
+ assert(to_index >= s_index);
+ auto increment = (to_index - s_index);
+ if (increment) {
+ _index += increment;
+ if constexpr (!IS_BOTTOM) {
+ // src_iter moved to another item, so its lower-stage state is dropped
+ src_iter.get_nxt().reset();
+ }
+ }
+ }
+ // append a single new key/value; p_value receives the value location
+ void append(const full_key_t<KT>& key,
+ const value_t& value, const value_t*& p_value) {
+ assert(!require_wrap_nxt);
+ if constexpr (!IS_BOTTOM) {
+ auto& nxt = open_nxt(key);
+ nxt.append(key, value, p_value);
+ wrap_nxt();
+ } else {
+ appender->append(key, value, p_value);
+ ++_index;
+ }
+ }
+ // finish appending: wraps any open next stage, then this stage, and
+ // leaves the appender invalid; returns what the container appender's
+ // wrap() returns
+ char* wrap() {
+ assert(valid());
+ assert(_index > 0);
+ if constexpr (!IS_BOTTOM) {
+ if (require_wrap_nxt) {
+ wrap_nxt();
+ }
+ }
+ auto ret = appender->wrap();
+ appender.reset();
+ return ret;
+ }
+ // open the next-stage appender for a new item identified by partial_key
+ typename NXT_STAGE_T::template StagedAppender<KT>&
+ open_nxt(key_get_type partial_key) {
+ assert(!require_wrap_nxt);
+ if constexpr (!IS_BOTTOM) {
+ require_wrap_nxt = true;
+ auto [p_mut, p_append] = appender->open_nxt(partial_key);
+ this->_nxt.init(p_mut, p_append);
+ return this->_nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ // open the next-stage appender for a new item identified by a full key
+ typename NXT_STAGE_T::template StagedAppender<KT>&
+ open_nxt(const full_key_t<KT>& key) {
+ assert(!require_wrap_nxt);
+ if constexpr (!IS_BOTTOM) {
+ require_wrap_nxt = true;
+ auto [p_mut, p_append] = appender->open_nxt(key);
+ this->_nxt.init(p_mut, p_append);
+ return this->_nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ // access the currently open next-stage appender
+ typename NXT_STAGE_T::template StagedAppender<KT>& get_nxt() {
+ if constexpr (!IS_BOTTOM) {
+ assert(require_wrap_nxt);
+ return this->_nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ // close the open next-stage appender; this completes one item here
+ void wrap_nxt() {
+ if constexpr (!IS_BOTTOM) {
+ assert(require_wrap_nxt);
+ require_wrap_nxt = false;
+ auto p_append = this->_nxt.wrap();
+ appender->wrap_nxt(p_append);
+ ++_index;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ private:
+ std::optional<typename container_t::template Appender<KT>> appender;
+ // initialized so the member never holds an indeterminate value before init()
+ index_t _index = 0;
+ bool require_wrap_nxt = false;
+ };
+
+ template <KeyT KT> // Append from src_iter until it reaches to_index, first completing a partially-copied current item through the next stage.
+ static void _append_range(
+ StagedIterator& src_iter, StagedAppender<KT>& appender, index_t& to_index) {
+ if (src_iter.is_end()) {
+ // append done
+ assert(to_index == INDEX_END);
+ to_index = src_iter.index(); // resolve INDEX_END to the concrete end index
+ } else if constexpr (!IS_BOTTOM) {
+ if (appender.in_progress()) {
+ // appender has appended something at the current item,
+ // cannot append the current item as-a-whole
+ index_t to_index_nxt = INDEX_END;
+ NXT_STAGE_T::template _append_range<KT>(
+ src_iter.nxt(), appender.get_nxt(), to_index_nxt);
+ ++src_iter;
+ appender.wrap_nxt();
+ } else if (src_iter.in_progress()) {
+ // src_iter is not at the beginning of the current item,
+ // cannot append the current item as-a-whole
+ index_t to_index_nxt = INDEX_END;
+ NXT_STAGE_T::template _append_range<KT>(
+ src_iter.nxt(), appender.open_nxt(src_iter.get_key()), to_index_nxt);
+ ++src_iter;
+ appender.wrap_nxt();
+ } else {
+ // we can safely append the current item as-a-whole
+ }
+ }
+ appender.append_until(src_iter, to_index); // the remaining whole items are copied at this stage
+ }
+
+ template <KeyT KT> // With src_iter already at position.index, either stop here (stage == STAGE) or continue the append inside the current item at the next stage.
+ static void _append_into(StagedIterator& src_iter, StagedAppender<KT>& appender,
+ position_t& position, match_stage_t stage) {
+ assert(position.index == src_iter.index());
+ // reaches the last item
+ if (stage == STAGE) {
+ // done, end recursion
+ if constexpr (!IS_BOTTOM) {
+ position.nxt = position_t::nxt_t::begin(); // lower-stage part of the position is reset to begin
+ }
+ } else {
+ assert(stage < STAGE);
+ // proceed append in the next stage
+ NXT_STAGE_T::template append_until<KT>(
+ src_iter.nxt(), appender.open_nxt(src_iter.get_key()),
+ position.nxt, stage);
+ }
+ }
+
+ template <KeyT KT> // Append from src_iter up to `position` at `stage`; on return position.index is relative to src_iter's starting index.
+ static void append_until(StagedIterator& src_iter, StagedAppender<KT>& appender,
+ position_t& position, match_stage_t stage) {
+ index_t from_index = src_iter.index();
+ index_t& to_index = position.index;
+ assert(from_index <= to_index);
+ if constexpr (IS_BOTTOM) {
+ assert(stage == STAGE);
+ appender.append_until(src_iter, to_index);
+ } else {
+ assert(stage <= STAGE);
+ if (src_iter.index() == to_index) {
+ _append_into<KT>(src_iter, appender, position, stage); // already at the target item: only descend into it
+ } else {
+ if (to_index == INDEX_END) {
+ assert(stage == STAGE);
+ } else if (to_index == INDEX_LAST) {
+ assert(stage < STAGE);
+ }
+ _append_range<KT>(src_iter, appender, to_index);
+ _append_into<KT>(src_iter, appender, position, stage);
+ }
+ }
+ to_index -= from_index; // to_index aliases position.index, so this rewrites the caller's position
+ }
+
+ template <KeyT KT> // Append the new key/value at `stage` through `appender`; returns true when src_iter is at its end afterwards.
+ static bool append_insert(
+ const full_key_t<KT>& key, const value_t& value,
+ StagedIterator& src_iter, StagedAppender<KT>& appender,
+ bool is_front_insert, match_stage_t& stage, const value_t*& p_value) { // stage is in/out; p_value receives the appended value's location
+ assert(src_iter.valid());
+ if (stage == STAGE) {
+ appender.append(key, value, p_value);
+ if (src_iter.is_end()) {
+ return true;
+ } else {
+ return false;
+ }
+ } else {
+ assert(stage < STAGE);
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_is_end = NXT_STAGE_T::template append_insert<KT>(
+ key, value, src_iter.get_nxt(), appender.get_nxt(),
+ is_front_insert, stage, p_value);
+ if (nxt_is_end) { // the lower stage is exhausted: close it and move to the next item at this stage
+ appender.wrap_nxt();
+ ++src_iter;
+ if (is_front_insert) {
+ stage = STAGE; // for a front insert the reported stage is raised to this stage
+ }
+ if (src_iter.is_end()) {
+ return true;
+ }
+ }
+ return false;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ }
+
+ /* TrimType:
+ * BEFORE: remove the entire container, which normally means the corresponding
+ * higher stage iterator needs to be trimmed as-a-whole.
+ * AFTER: retain the entire container, which normally means the trim should
+ * start from the next iterator at the higher stage.
+ * AT: trim happens in the current container, and the corresponding higher
+ * stage iterator needs to be adjusted by the trimmed size.
+ */
+ static std::tuple<TrimType, node_offset_t> // returns the trim type for the caller's stage and the size reported by trim_until()/trim_at() (0 unless the type is AT)
+ recursively_trim(NodeExtentMutable& mut, StagedIterator& trim_at) {
+ if (!trim_at.valid()) {
+ return {TrimType::BEFORE, 0u}; // an invalid iterator means the whole container goes
+ }
+ if (trim_at.is_end()) {
+ return {TrimType::AFTER, 0u}; // an END iterator means the whole container stays
+ }
+
+ auto& iter = trim_at.get();
+ if constexpr (!IS_BOTTOM) {
+ auto [type, trimmed] = NXT_STAGE_T::recursively_trim(
+ mut, trim_at.get_nxt());
+ node_offset_t trim_size;
+ if (type == TrimType::AFTER) {
+ if (iter.is_last()) {
+ return {TrimType::AFTER, 0u};
+ }
+ ++trim_at; // the current item is kept whole; trimming starts at the following item
+ trim_size = iter.trim_until(mut);
+ } else if (type == TrimType::BEFORE) {
+ if (iter.index() == 0) {
+ return {TrimType::BEFORE, 0u};
+ }
+ trim_size = iter.trim_until(mut);
+ } else {
+ trim_size = iter.trim_at(mut, trimmed); // the lower stage trimmed inside the current item by `trimmed`
+ }
+ return {TrimType::AT, trim_size};
+ } else {
+ if (iter.index() == 0) {
+ return {TrimType::BEFORE, 0u};
+ } else {
+ auto trimmed = iter.trim_until(mut);
+ return {TrimType::AT, trimmed};
+ }
+ }
+ }
+
+ // Top-level trim entry point: runs recursively_trim() and, if it reports
+ // that everything is to be removed (BEFORE), trims at the top-stage iterator.
+ static void trim(NodeExtentMutable& mut, StagedIterator& trim_at) {
+ const auto outcome = recursively_trim(mut, trim_at);
+ if (std::get<0>(outcome) != TrimType::BEFORE) {
+ return;
+ }
+ assert(trim_at.valid());
+ trim_at.get().trim_until(mut);
+ }
+};
+
+/**
+ * Configurations for struct staged
+ *
+ * staged_params_* assembles different container_t implementations (defined by
+ * staged::_iterator_t) by STAGE, and constructs the final multi-stage
+ * implementations for different node layouts defined by
+ * node_extent_t<FieldType, NODE_TYPE>.
+ *
+ * The specialized implementations for different layouts are accessible through
+ * the helper type node_to_stage_t<node_extent_t<FieldType, NODE_TYPE>>.
+ *
+ * Specifically, the settings of 8 layouts are:
+ *
+ * The layout (N0, LEAF/INTERNAL) has 3 stages:
+ * - STAGE_LEFT: node_extent_t<node_fields_0_t, LEAF/INTERNAL>
+ * - STAGE_STRING: item_iterator_t<LEAF/INTERNAL>
+ * - STAGE_RIGHT: sub_items_t<LEAF/INTERNAL>
+ *
+ * The layout (N1, LEAF/INTERNAL) has 3 stages:
+ * - STAGE_LEFT: node_extent_t<node_fields_1_t, LEAF/INTERNAL>
+ * - STAGE_STRING: item_iterator_t<LEAF/INTERNAL>
+ * - STAGE_RIGHT: sub_items_t<LEAF/INTERNAL>
+ *
+ * The layout (N2, LEAF/INTERNAL) has 2 stages:
+ * - STAGE_STRING: node_extent_t<node_fields_2_t, LEAF/INTERNAL>
+ * - STAGE_RIGHT: sub_items_t<LEAF/INTERNAL>
+ *
+ * The layout (N3, LEAF) has 1 stage:
+ * - STAGE_RIGHT: node_extent_t<leaf_fields_3_t, LEAF>
+ *
+ * The layout (N3, INTERNAL) has 1 stage:
+ * - STAGE_RIGHT: node_extent_t<internal_fields_3_t, INTERNAL>
+ */
+
+template <node_type_t _NODE_TYPE> // staged<> parameters for the STAGE_RIGHT stage backed by sub_items_t
+struct staged_params_subitems {
+ using container_t = sub_items_t<_NODE_TYPE>;
+ static constexpr auto NODE_TYPE = _NODE_TYPE;
+ static constexpr auto STAGE = STAGE_RIGHT;
+
+ // dummy type in order to make our type system work
+ // any better solution to get rid of this?
+ using next_param_t = staged_params_subitems<NODE_TYPE>; // self-reference
+};
+
+template <node_type_t _NODE_TYPE> // staged<> parameters for the STAGE_STRING stage backed by item_iterator_t
+struct staged_params_item_iterator {
+ using container_t = item_iterator_t<_NODE_TYPE>;
+ static constexpr auto NODE_TYPE = _NODE_TYPE;
+ static constexpr auto STAGE = STAGE_STRING;
+
+ using next_param_t = staged_params_subitems<NODE_TYPE>; // next stage: sub-items (STAGE_RIGHT)
+};
+
+template <typename NodeType> // staged<> parameters for the N0/N1 layouts: the node itself is the STAGE_LEFT container
+struct staged_params_node_01 {
+ using container_t = NodeType;
+ static constexpr auto NODE_TYPE = NodeType::NODE_TYPE;
+ static constexpr auto STAGE = STAGE_LEFT;
+
+ using next_param_t = staged_params_item_iterator<NODE_TYPE>; // next stage: item iterator (STAGE_STRING)
+};
+
+template <typename NodeType> // staged<> parameters for the N2 layout: the node itself is the STAGE_STRING container
+struct staged_params_node_2 {
+ using container_t = NodeType;
+ static constexpr auto NODE_TYPE = NodeType::NODE_TYPE;
+ static constexpr auto STAGE = STAGE_STRING;
+
+ using next_param_t = staged_params_subitems<NODE_TYPE>; // next stage: sub-items (STAGE_RIGHT)
+};
+
+template <typename NodeType> // staged<> parameters for the N3 layout: the node itself is the single STAGE_RIGHT container
+struct staged_params_node_3 {
+ using container_t = NodeType;
+ static constexpr auto NODE_TYPE = NodeType::NODE_TYPE;
+ static constexpr auto STAGE = STAGE_RIGHT;
+
+ // dummy type in order to make our type system work
+ // any better solution to get rid of this?
+ using next_param_t = staged_params_node_3<NodeType>; // self-reference
+};
+
+template <typename NodeType, typename Enable = void> struct _node_to_stage_t; // maps a node type to its staged<> implementation, selected by NodeType::FIELD_TYPE
+template <typename NodeType>
+struct _node_to_stage_t<NodeType,
+ std::enable_if_t<NodeType::FIELD_TYPE == field_type_t::N0 ||
+ NodeType::FIELD_TYPE == field_type_t::N1>> {
+ using type = staged<staged_params_node_01<NodeType>>; // N0 and N1 share the same parameters
+};
+template <typename NodeType>
+struct _node_to_stage_t<NodeType,
+ std::enable_if_t<NodeType::FIELD_TYPE == field_type_t::N2>> {
+ using type = staged<staged_params_node_2<NodeType>>;
+};
+template <typename NodeType>
+struct _node_to_stage_t<NodeType,
+ std::enable_if_t<NodeType::FIELD_TYPE == field_type_t::N3>> {
+ using type = staged<staged_params_node_3<NodeType>>;
+};
+template <typename NodeType>
+using node_to_stage_t = typename _node_to_stage_t<NodeType>::type; // convenience alias for the mapping above
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h
new file mode 100644
index 000000000..a9d5cef3b
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h
@@ -0,0 +1,411 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cassert>
+#include <optional>
+#include <ostream>
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/fwd.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/tree_types.h"
+
+namespace crimson::os::seastore::onode {
+
+using match_stage_t = int8_t; // stage identifier; STAGE_TOP is the largest value, STAGE_BOTTOM the smallest
+constexpr match_stage_t STAGE_LEFT = 2; // shard/pool/crush
+constexpr match_stage_t STAGE_STRING = 1; // nspace/oid
+constexpr match_stage_t STAGE_RIGHT = 0; // snap/gen
+constexpr auto STAGE_TOP = STAGE_LEFT; // highest stage
+constexpr auto STAGE_BOTTOM = STAGE_RIGHT; // lowest stage
+// True iff `stage` lies within [STAGE_BOTTOM, STAGE_TOP].
+constexpr bool is_valid_stage(match_stage_t stage) {
+ return STAGE_BOTTOM <= stage && stage <= STAGE_TOP;
+}
+// TODO: replace by
+// using match_history_t = int8_t;
+// left_m, str_m, right_m
+// 3: GT,
+// 2: EQ, GT,
+// 1: EQ, EQ, GT
+// 0: EQ, EQ, EQ
+// -1: EQ, EQ, LT
+// -2: EQ, LT,
+// -3: LT,
+
+// Per-stage comparison outcome (left / string / right) recorded while
+// matching a key. A stage that has not been compared holds no value.
+struct MatchHistory {
+ // Compile-time accessor for the match recorded at STAGE.
+ template <match_stage_t STAGE>
+ const std::optional<MatchKindCMP>& get() const {
+ static_assert(is_valid_stage(STAGE));
+ if constexpr (STAGE == STAGE_RIGHT) {
+ return right_match;
+ } else if constexpr (STAGE == STAGE_STRING) {
+ return string_match;
+ } else {
+ return left_match;
+ }
+ }
+
+ // Run-time counterpart of get<STAGE>().
+ const std::optional<MatchKindCMP>&
+ get_by_stage(match_stage_t stage) const {
+ assert(is_valid_stage(stage));
+ if (stage == STAGE_RIGHT) {
+ return right_match;
+ } else if (stage == STAGE_STRING) {
+ return string_match;
+ } else {
+ return left_match;
+ }
+ }
+
+ template <match_stage_t STAGE = STAGE_TOP>
+ const bool is_GT() const;
+
+ // Record the match of STAGE. The next higher stage must already be EQ, and
+ // a stage that was recorded as EQ must not be overwritten.
+ template <match_stage_t STAGE>
+ void set(MatchKindCMP match) {
+ static_assert(is_valid_stage(STAGE));
+ if constexpr (STAGE < STAGE_TOP) {
+ // compare the optional itself: an unset higher stage fails the assert
+ // instead of dereferencing an empty optional
+ assert(get<STAGE + 1>() == MatchKindCMP::EQ);
+ }
+ assert(!get<STAGE>().has_value() || *get<STAGE>() != MatchKindCMP::EQ);
+ const_cast<std::optional<MatchKindCMP>&>(get<STAGE>()) = match;
+ }
+
+ // Print as "history(<left>, <string>, <right>)".
+ std::ostream& dump(std::ostream& os) const {
+ os << "history(";
+ dump_each(os, left_match) << ", ";
+ dump_each(os, string_match) << ", ";
+ dump_each(os, right_match) << ")";
+ return os;
+ }
+
+ // Print one stage: "--" when unset, otherwise "LT" / "EQ" / "GT".
+ std::ostream& dump_each(
+ std::ostream& os, const std::optional<MatchKindCMP>& match) const {
+ if (!match.has_value()) {
+ return os << "--";
+ } else if (*match == MatchKindCMP::LT) {
+ return os << "LT";
+ } else if (*match == MatchKindCMP::EQ) {
+ return os << "EQ";
+ } else if (*match == MatchKindCMP::GT) {
+ return os << "GT";
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+
+ std::optional<MatchKindCMP> left_match;
+ std::optional<MatchKindCMP> string_match;
+ std::optional<MatchKindCMP> right_match;
+};
+// Stream a MatchHistory through MatchHistory::dump().
+inline std::ostream& operator<<(std::ostream& os, const MatchHistory& history) {
+ return history.dump(os);
+}
+
+template <match_stage_t STAGE> // helper for MatchHistory::is_GT: GT at STAGE, or EQ at STAGE and GT further down
+struct _check_GT_t {
+ static bool eval(const MatchHistory* history) {
+ return history->get<STAGE>() && // an unset stage is never GT
+ (*history->get<STAGE>() == MatchKindCMP::GT ||
+ (*history->get<STAGE>() == MatchKindCMP::EQ &&
+ _check_GT_t<STAGE - 1>::eval(history)));
+ }
+};
+template <> // recursion terminator: the lowest stage has nothing below it to consult
+struct _check_GT_t<STAGE_RIGHT> {
+ static bool eval(const MatchHistory* history) {
+ return history->get<STAGE_RIGHT>() &&
+ *history->get<STAGE_RIGHT>() == MatchKindCMP::GT;
+ }
+};
+template <match_stage_t STAGE> // true when the recorded history is GT, evaluated from STAGE downwards via _check_GT_t
+const bool MatchHistory::is_GT() const {
+ static_assert(is_valid_stage(STAGE));
+ if constexpr (STAGE < STAGE_TOP) {
+ assert(get<STAGE + 1>() == MatchKindCMP::EQ); // the next higher stage must have matched EQ
+ }
+ return _check_GT_t<STAGE>::eval(this);
+}
+
+template <match_stage_t STAGE> // position within a multi-stage container: the index at STAGE plus the position in the lower stages
+struct staged_position_t {
+ static_assert(is_valid_stage(STAGE));
+ using me_t = staged_position_t<STAGE>;
+ using nxt_t = staged_position_t<STAGE - 1>;
+ bool is_end() const { // only this stage's index decides END
+ if (index == INDEX_END) {
+ return true;
+ } else {
+ assert(is_valid_index(index));
+ return false;
+ }
+ }
+ index_t& index_by_stage(match_stage_t stage) { // mutable access to the index stored for `stage` (stage <= STAGE)
+ assert(stage <= STAGE);
+ if (STAGE == stage) {
+ return index;
+ } else {
+ return nxt.index_by_stage(stage);
+ }
+ }
+
+ int cmp(const me_t& o) const { // lexicographic three-way compare: this stage's index first, then lower stages
+ if (index > o.index) {
+ return 1;
+ } else if (index < o.index) {
+ return -1;
+ } else {
+ return nxt.cmp(o.nxt);
+ }
+ }
+ bool operator>(const me_t& o) const { return cmp(o) > 0; }
+ bool operator>=(const me_t& o) const { return cmp(o) >= 0; }
+ bool operator<(const me_t& o) const { return cmp(o) < 0; }
+ bool operator<=(const me_t& o) const { return cmp(o) <= 0; }
+ bool operator==(const me_t& o) const { return cmp(o) == 0; }
+ bool operator!=(const me_t& o) const { return cmp(o) != 0; }
+
+ me_t& operator-=(const me_t& o) { // make this position relative to `o`; an END index is left untouched
+ assert(is_valid_index(o.index));
+ assert(index >= o.index);
+ if (index != INDEX_END) {
+ assert(is_valid_index(index));
+ index -= o.index;
+ if (index == 0) { // lower stages are only adjusted when both positions share the same item at this stage
+ nxt -= o.nxt;
+ }
+ }
+ return *this;
+ }
+
+ void encode(ceph::bufferlist& encoded) const { // layout: index, followed by the lower stages
+ ceph::encode(index, encoded);
+ nxt.encode(encoded);
+ }
+
+ static me_t decode(ceph::bufferlist::const_iterator& delta) { // reads back what encode() wrote, in the same order
+ me_t ret;
+ ceph::decode(ret.index, delta);
+ ret.nxt = nxt_t::decode(delta);
+ return ret;
+ }
+
+ static me_t begin() { return {0u, nxt_t::begin()}; }
+ static me_t end() {
+ return {INDEX_END, nxt_t::end()};
+ }
+
+ index_t index;
+ nxt_t nxt;
+};
+// Print a position as its per-stage indexes separated by ", "; the special
+// indexes are printed as "END" and "LAST".
+template <match_stage_t STAGE>
+std::ostream& operator<<(std::ostream& os, const staged_position_t<STAGE>& pos) {
+ const auto& idx = pos.index;
+ if (idx != INDEX_END && idx != INDEX_LAST) {
+ os << idx;
+ assert(is_valid_index(idx));
+ } else {
+ os << (idx == INDEX_END ? "END" : "LAST");
+ }
+ os << ", ";
+ return os << pos.nxt;
+}
+
+template <> // bottom-stage specialization: a bare index with no lower-stage position
+struct staged_position_t<STAGE_BOTTOM> {
+ using me_t = staged_position_t<STAGE_BOTTOM>;
+ bool is_end() const {
+ if (index == INDEX_END) {
+ return true;
+ } else {
+ assert(is_valid_index(index));
+ return false;
+ }
+ }
+ index_t& index_by_stage(match_stage_t stage) { // only STAGE_BOTTOM can be requested here
+ assert(stage == STAGE_BOTTOM);
+ return index;
+ }
+
+ int cmp(const staged_position_t<STAGE_BOTTOM>& o) const { // three-way compare of the indexes: 1, -1 or 0
+ if (index > o.index) {
+ return 1;
+ } else if (index < o.index) {
+ return -1;
+ } else {
+ return 0;
+ }
+ }
+ bool operator>(const me_t& o) const { return cmp(o) > 0; }
+ bool operator>=(const me_t& o) const { return cmp(o) >= 0; }
+ bool operator<(const me_t& o) const { return cmp(o) < 0; }
+ bool operator<=(const me_t& o) const { return cmp(o) <= 0; }
+ bool operator==(const me_t& o) const { return cmp(o) == 0; }
+ bool operator!=(const me_t& o) const { return cmp(o) != 0; }
+
+ me_t& operator-=(const me_t& o) { // make this position relative to `o`; an END index is left untouched
+ assert(is_valid_index(o.index));
+ assert(index >= o.index);
+ if (index != INDEX_END) {
+ assert(is_valid_index(index));
+ index -= o.index;
+ }
+ return *this;
+ }
+
+ void encode(ceph::bufferlist& encoded) const {
+ ceph::encode(index, encoded);
+ }
+
+ static me_t decode(ceph::bufferlist::const_iterator& delta) {
+ me_t ret;
+ ceph::decode(ret.index, delta);
+ return ret;
+ }
+
+ static me_t begin() { return {0u}; }
+ static me_t end() { return {INDEX_END}; }
+
+ index_t index;
+};
+// Bottom-stage printer: "END", "LAST" or the plain index, with no trailing
+// separator because there is no lower stage.
+template <>
+inline std::ostream& operator<<(std::ostream& os, const staged_position_t<STAGE_BOTTOM>& pos) {
+ const auto& idx = pos.index;
+ if (idx != INDEX_END && idx != INDEX_LAST) {
+ os << idx;
+ assert(is_valid_index(idx));
+ } else {
+ os << (idx == INDEX_END ? "END" : "LAST");
+ }
+ return os;
+}
+
+using search_position_t = staged_position_t<STAGE_TOP>; // full position covering every stage from STAGE_TOP down
+
+template <match_stage_t STAGE> // view the sub-position of `pos` that starts at STAGE; asserts (debug builds) that the skipped higher indexes are 0, or that pos is END
+const staged_position_t<STAGE>& cast_down(const search_position_t& pos) {
+ if constexpr (STAGE == STAGE_LEFT) {
+ return pos;
+ } else if constexpr (STAGE == STAGE_STRING) {
+#ifndef NDEBUG
+ if (pos.is_end()) {
+ assert(pos.nxt.is_end());
+ } else {
+ assert(pos.index == 0u);
+ }
+#endif
+ return pos.nxt;
+ } else if constexpr (STAGE == STAGE_RIGHT) {
+#ifndef NDEBUG
+ if (pos.is_end()) {
+ assert(pos.nxt.nxt.is_end());
+ } else {
+ assert(pos.index == 0u);
+ assert(pos.nxt.index == 0u);
+ }
+#endif
+ return pos.nxt.nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+}
+
+template <match_stage_t STAGE> // non-const overload: forwards to the const cast_down and strips constness from the result
+staged_position_t<STAGE>& cast_down(search_position_t& pos) {
+ const search_position_t& _pos = pos;
+ return const_cast<staged_position_t<STAGE>&>(cast_down<STAGE>(_pos)); // safe: pos itself was passed in as non-const
+}
+
+// Like cast_down(), but instead of asserting that the skipped higher-stage
+// indexes are 0, this overwrites them with 0 before returning the
+// sub-position that starts at STAGE.
+template <match_stage_t STAGE>
+staged_position_t<STAGE>& cast_down_fill_0(search_position_t& pos) {
+ if constexpr (STAGE == STAGE_LEFT) {
+ return pos;
+ } else if constexpr (STAGE == STAGE_STRING) {
+ pos.index = 0;
+ return pos.nxt;
+ } else if constexpr (STAGE == STAGE_RIGHT) {
+ pos.index = 0;
+ pos.nxt.index = 0;
+ return pos.nxt.nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+}
+
+inline search_position_t&& normalize(search_position_t&& pos) { return std::move(pos); } // already a top-stage position: passed through unchanged
+
+// Lift a lower-stage position to a full search_position_t: END maps to
+// search_position_t::end(), otherwise the missing higher-stage indexes are
+// filled with 0.
+template <match_stage_t STAGE, typename = std::enable_if_t<STAGE != STAGE_TOP>>
+search_position_t normalize(staged_position_t<STAGE>&& pos) {
+ if (pos.is_end()) {
+ return search_position_t::end();
+ }
+ if constexpr (STAGE == STAGE_STRING) {
+ return {0u, std::move(pos)};
+ } else if constexpr (STAGE == STAGE_RIGHT) {
+ return {0u, {0u, std::move(pos)}};
+ } else {
+ ceph_abort("impossible path");
+ }
+}
+
+struct memory_range_t { // a pair of raw pointers delimiting a memory range
+ const char* p_start;
+ const char* p_end; // NOTE(review): presumably one-past-the-end (exclusive); confirm against users
+};
+
+enum class ContainerType { ITERATIVE, INDEXABLE }; // kind of stage container; NOTE(review): presumably sequential-only vs. random-access by index; confirm against the container implementations
+
+template <node_type_t> struct value_type; // maps a node type to the value type stored in it
+template<> struct value_type<node_type_t::INTERNAL> { using type = laddr_packed_t; };
+template<> struct value_type<node_type_t::LEAF> { using type = onode_t; };
+template <node_type_t NODE_TYPE>
+using value_type_t = typename value_type<NODE_TYPE>::type; // convenience alias for the mapping above
+
+template <node_type_t NODE_TYPE, match_stage_t STAGE> // result of a staged operation: position, pointer to the value and match status
+struct staged_result_t {
+ using me_t = staged_result_t<NODE_TYPE, STAGE>;
+ bool is_end() const { return position.is_end(); }
+
+ static me_t end() { // END result: end position, no value, MSTAT_END
+ return {staged_position_t<STAGE>::end(), nullptr, MSTAT_END};
+ }
+ template <typename T = me_t> // only available above the bottom stage
+ static std::enable_if_t<STAGE != STAGE_BOTTOM, T> from_nxt(
+ index_t index, const staged_result_t<NODE_TYPE, STAGE - 1>& nxt_stage_result) { // wrap a lower-stage result, prepending this stage's index
+ return {{index, nxt_stage_result.position},
+ nxt_stage_result.p_value,
+ nxt_stage_result.mstat};
+ }
+
+ staged_position_t<STAGE> position;
+ const value_type_t<NODE_TYPE>* p_value; // nullptr in the end() result
+ match_stat_t mstat;
+};
+
+template <node_type_t NODE_TYPE> // staged result expressed at the top stage
+using lookup_result_t = staged_result_t<NODE_TYPE, STAGE_TOP>;
+
+template <node_type_t NODE_TYPE> // already a top-stage result: passed through unchanged
+lookup_result_t<NODE_TYPE>&& normalize(
+ lookup_result_t<NODE_TYPE>&& result) { return std::move(result); }
+
+template <node_type_t NODE_TYPE, match_stage_t STAGE, // lift a lower-stage result to a lookup_result_t by normalizing its position
+ typename = std::enable_if_t<STAGE != STAGE_TOP>>
+lookup_result_t<NODE_TYPE> normalize(
+ staged_result_t<NODE_TYPE, STAGE>&& result) {
+ // FIXME: assert result.mstat correct
+ return {normalize(std::move(result.position)), result.p_value, result.mstat}; // p_value and mstat are carried over unchanged
+}
+
+struct node_stats_t { // size and count statistics of a node; all counters start at 0
+ size_t size_persistent = 0; // not written by staged::get_stats(); filled elsewhere
+ size_t size_filled = 0; // not written by staged::get_stats(); filled elsewhere
+ // filled by staged::get_stats()
+ size_t size_logical = 0; // sum over key/value pairs of logical key size plus value size
+ size_t size_overhead = 0; // container headers plus per-item size_overhead()
+ size_t size_value = 0; // sum of value sizes
+ unsigned num_kvs = 0; // number of key/value pairs counted at the bottom stage
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc
new file mode 100644
index 000000000..aaca6c3c6
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc
@@ -0,0 +1,208 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "sub_items_stage.h"
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h"
+
+namespace crimson::os::seastore::onode {
+
+// Inserts a new {snap-gen, laddr} sub-item so that it becomes item `index`.
+//
+// Sub-items are stored right-to-left (item i is at p_first_item - i). The
+// bytes from p_left_bound up to the end of the slot at `index` are shifted
+// left by `size`, and the new item is written into the gap that opens up.
+// `size` must equal estimate_insert<KT>(key, value).
+// Returns a pointer to the laddr inside the newly written item.
+template <KeyT KT>
+const laddr_packed_t* internal_sub_items_t::insert_at(
+ NodeExtentMutable& mut, const internal_sub_items_t& sub_items,
+ const full_key_t<KT>& key, const laddr_packed_t& value,
+ index_t index, node_offset_t size, const char* p_left_bound) {
+ assert(index <= sub_items.keys());
+ assert(size == estimate_insert<KT>(key, value));
+ const char* p_shift_start = p_left_bound;
+ // one past the slot at `index`; equals the container start when index == keys()
+ const char* p_shift_end = reinterpret_cast<const char*>(
+ sub_items.p_first_item + 1 - index);
+ mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, -(int)size);
+
+ // the gap left by the shift ends where the shifted range used to end
+ auto p_insert = const_cast<char*>(p_shift_end) - size;
+ auto item = internal_sub_item_t{snap_gen_t::from_key<KT>(key), value};
+ mut.copy_in_absolute(p_insert, item);
+ return &reinterpret_cast<internal_sub_item_t*>(p_insert)->value;
+}
+// Explicit instantiations of internal_sub_items_t::insert_at for both key
+// types; the macro only spares repeating the signature.
+#define IA_TEMPLATE(KT) \
+ template const laddr_packed_t* internal_sub_items_t::insert_at<KT>( \
+ NodeExtentMutable&, const internal_sub_items_t&, const full_key_t<KT>&, \
+ const laddr_packed_t&, index_t, node_offset_t, const char*)
+IA_TEMPLATE(KeyT::VIEW);
+IA_TEMPLATE(KeyT::HOBJ);
+
+// Reports how many bytes are released when only the first `index` sub-items
+// are kept. Nothing is written here: the mutable extent argument is unused.
+// `index` must be in (0, keys()].
+node_offset_t internal_sub_items_t::trim_until(
+    NodeExtentMutable&, internal_sub_items_t& items, index_t index) {
+  assert(index != 0);
+  const auto total = items.keys();
+  assert(index <= total);
+  const size_t trimmed_bytes = sizeof(internal_sub_item_t) * (total - index);
+  assert(trimmed_bytes < NODE_BLOCK_SIZE);
+  return trimmed_bytes;
+}
+
+// Appends `items` consecutive sub-items of `src`, starting at index `from`.
+// The append position moves left by the copied size and the items are copied
+// there as one contiguous byte range. A zero-length append is a no-op.
+template <KeyT KT>
+void internal_sub_items_t::Appender<KT>::append(
+ const internal_sub_items_t& src, index_t from, index_t items) {
+ assert(from <= src.keys());
+ if (items == 0) {
+ return;
+ }
+ assert(from < src.keys());
+ assert(from + items <= src.keys());
+ node_offset_t size = sizeof(internal_sub_item_t) * items;
+ p_append -= size;
+ // items are stored right-to-left, so the range starts at item (from + items - 1)
+ p_mut->copy_in_absolute(p_append, src.p_first_item + 1 - from - items, size);
+}
+
+// Appends a single new {snap-gen, laddr} sub-item built from `key` and
+// `value`, and stores the address of the written laddr in `p_value`.
+template <KeyT KT>
+void internal_sub_items_t::Appender<KT>::append(
+ const full_key_t<KT>& key, const laddr_packed_t& value,
+ const laddr_packed_t*& p_value) {
+ p_append -= sizeof(internal_sub_item_t);
+ auto item = internal_sub_item_t{snap_gen_t::from_key<KT>(key), value};
+ p_mut->copy_in_absolute(p_append, item);
+ p_value = &reinterpret_cast<internal_sub_item_t*>(p_append)->value;
+}
+
+// Inserts a new sub-item (the onode bytes followed by the snap-gen) so that it
+// becomes item `index`, adds its offset entry and increments the key count.
+//
+// `size` must equal estimate_insert<KT>(key, value), i.e. the item plus one
+// offset entry. Returns a pointer to the onode bytes that were written.
+// The steps below depend on their order: offsets are read through `sub_items`
+// while the node is being rewritten.
+template <KeyT KT>
+const onode_t* leaf_sub_items_t::insert_at(
+ NodeExtentMutable& mut, const leaf_sub_items_t& sub_items,
+ const full_key_t<KT>& key, const onode_t& value,
+ index_t index, node_offset_t size, const char* p_left_bound) {
+ assert(index <= sub_items.keys());
+ assert(size == estimate_insert<KT>(key, value));
+ // a. [... item(index)] << size
+ const char* p_shift_start = p_left_bound;
+ const char* p_shift_end = sub_items.get_item_end(index);
+ mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, -(int)size);
+
+ // b. insert item
+ auto p_insert = const_cast<char*>(p_shift_end - size);
+ auto p_value = reinterpret_cast<const onode_t*>(p_insert);
+ mut.copy_in_absolute(p_insert, &value, value.size);
+ p_insert += value.size;
+ mut.copy_in_absolute(p_insert, snap_gen_t::template from_key<KT>(key));
+ // the remaining sizeof(node_offset_t) bytes of the gap are consumed by step d
+ assert(p_insert + sizeof(snap_gen_t) + sizeof(node_offset_t) == p_shift_end);
+
+ // c. compensate affected offsets
+ auto item_size = value.size + sizeof(snap_gen_t);
+ for (auto i = index; i < sub_items.keys(); ++i) {
+ const node_offset_packed_t& offset_i = sub_items.get_offset(i);
+ mut.copy_in_absolute((void*)&offset_i, node_offset_t(offset_i.value + item_size));
+ }
+
+ // d. [item(index-1) ... item(0) ... offset(index)] <<< sizeof(node_offset_t)
+ const char* p_offset = (index == 0 ?
+ (const char*)&sub_items.get_offset(0) + sizeof(node_offset_t) :
+ (const char*)&sub_items.get_offset(index - 1));
+ p_shift_start = p_shift_end;
+ p_shift_end = p_offset;
+ mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, -(int)sizeof(node_offset_t));
+
+ // e. insert offset
+ node_offset_t offset_to_item_start = item_size + sub_items.get_offset_to_end(index);
+ mut.copy_in_absolute(
+ const_cast<char*>(p_shift_end) - sizeof(node_offset_t), offset_to_item_start);
+
+ // f. update num_sub_keys
+ mut.copy_in_absolute((void*)sub_items.p_num_keys, num_keys_t(sub_items.keys() + 1));
+
+ return p_value;
+}
+// Only the HOBJ key type is instantiated for leaf inserts in this file.
+template const onode_t* leaf_sub_items_t::insert_at<KeyT::HOBJ>(
+ NodeExtentMutable&, const leaf_sub_items_t&, const full_key_t<KeyT::HOBJ>&,
+ const onode_t&, index_t, node_offset_t, const char*);
+
+// Keeps the first `index` sub-items and drops the rest.
+//
+// The kept items are shifted right over the space of the dropped offset
+// entries and the key count is rewritten to `index`. Returns the number of
+// bytes released: the dropped offsets plus the dropped items. Returns 0
+// without touching the node when index == keys(). `index` must not be 0.
+node_offset_t leaf_sub_items_t::trim_until(
+ NodeExtentMutable& mut, leaf_sub_items_t& items, index_t index) {
+ assert(index != 0);
+ auto keys = items.keys();
+ assert(index <= keys);
+ if (index == keys) {
+ return 0;
+ }
+ index_t trim_items = keys - index;
+ // captured before the key count is rewritten, since p_start() depends on keys()
+ const char* p_items_start = items.p_start();
+ const char* p_shift_start = items.get_item_end(index);
+ const char* p_shift_end = items.get_item_end(0);
+ size_t size_trim_offsets = sizeof(node_offset_t) * trim_items;
+ mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start,
+ size_trim_offsets);
+ mut.copy_in_absolute((void*)items.p_num_keys, num_keys_t(index));
+ size_t ret = size_trim_offsets + (p_shift_start - p_items_start);
+ assert(ret < NODE_BLOCK_SIZE);
+ return ret;
+}
+
+// Explicit instantiations of the internal appender for both key types.
+template class internal_sub_items_t::Appender<KeyT::VIEW>;
+template class internal_sub_items_t::Appender<KeyT::HOBJ>;
+
+// helper type for the visitor: inherits the call operators of all the given
+// callables so that a single object can be handed to std::visit
+template<class... Ts> struct overloaded : Ts... { using Ts::operator()...; };
+// explicit deduction guide
+template<class... Ts> overloaded(Ts...) -> overloaded<Ts...>;
+
+// Writes every queued append into the node and returns the new left boundary.
+//
+// The container is produced right-to-left, starting at p_append, in three
+// passes over the queued appends:
+//   1. the total key count,
+//   2. one offset entry per appended item,
+//   3. the item payloads (onode followed by snap-gen).
+// At least one item must have been queued. When a key/value item was queued,
+// the address of its written onode is stored through pp_value.
+template <KeyT KT>
+char* leaf_sub_items_t::Appender<KT>::wrap() {
+  auto p_cur = p_append;
+
+  // pass 1: count the keys and write num_keys
+  num_keys_t num_keys = 0;
+  for (auto i = 0u; i < cnt; ++i) {
+    auto& a = appends[i];
+    std::visit(overloaded {
+      [&] (const range_items_t& arg) { num_keys += arg.items; },
+      [&] (const kv_item_t&) { ++num_keys; }
+    }, a);
+  }
+  assert(num_keys);
+  p_cur -= sizeof(num_keys_t);
+  p_mut->copy_in_absolute(p_cur, num_keys);
+
+  // pass 2: write the offsets; last_offset is the running end offset
+  node_offset_t last_offset = 0;
+  for (auto i = 0u; i < cnt; ++i) {
+    auto& a = appends[i];
+    std::visit(overloaded {
+      [&] (const range_items_t& arg) {
+        // rebase the source offsets so that they continue from last_offset
+        int compensate = (last_offset - op_src->get_offset_to_end(arg.from));
+        // initialised so that last_offset stays well-defined even if the
+        // loop below does not run
+        node_offset_t offset = last_offset;
+        // distinct index name: the enclosing loop already uses `i`
+        for (auto j = arg.from; j < arg.from + arg.items; ++j) {
+          offset = op_src->get_offset(j).value + compensate;
+          p_cur -= sizeof(node_offset_t);
+          p_mut->copy_in_absolute(p_cur, offset);
+        }
+        last_offset = offset;
+      },
+      [&] (const kv_item_t& arg) {
+        last_offset += sizeof(snap_gen_t) + arg.p_value->size;
+        p_cur -= sizeof(node_offset_t);
+        p_mut->copy_in_absolute(p_cur, last_offset);
+      }
+    }, a);
+  }
+
+  // pass 3: write the items themselves
+  for (auto i = 0u; i < cnt; ++i) {
+    auto& a = appends[i];
+    std::visit(overloaded {
+      [&] (const range_items_t& arg) {
+        // a range of source items is contiguous, copy it in one go
+        auto _p_start = op_src->get_item_end(arg.from + arg.items);
+        size_t _len = op_src->get_item_end(arg.from) - _p_start;
+        p_cur -= _len;
+        p_mut->copy_in_absolute(p_cur, _p_start, _len);
+      },
+      [&] (const kv_item_t& arg) {
+        assert(pp_value);
+        p_cur -= sizeof(snap_gen_t);
+        p_mut->copy_in_absolute(p_cur, snap_gen_t::template from_key<KT>(*arg.p_key));
+        p_cur -= arg.p_value->size;
+        p_mut->copy_in_absolute(p_cur, arg.p_value, arg.p_value->size);
+        *pp_value = reinterpret_cast<const onode_t*>(p_cur);
+      }
+    }, a);
+  }
+  return p_cur;
+}
+
+// Explicit instantiations of the leaf appender for both key types.
+template class leaf_sub_items_t::Appender<KeyT::VIEW>;
+template class leaf_sub_items_t::Appender<KeyT::HOBJ>;
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h
new file mode 100644
index 000000000..8ef5f7472
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h
@@ -0,0 +1,341 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <variant>
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h"
+#include "key_layout.h"
+#include "stage_types.h"
+
+namespace crimson::os::seastore::onode {
+
+class NodeExtentMutable;
+
+// One sub-item of an internal node: a snap-gen key followed by the packed
+// laddr value. The struct is packed, so it has no padding.
+struct internal_sub_item_t {
+ const snap_gen_t& get_key() const { return key; }
+ const laddr_packed_t* get_p_value() const { return &value; }
+
+ snap_gen_t key;
+ laddr_packed_t value;
+} __attribute__((packed));
+
+/**
+ * internal_sub_items_t
+ *
+ * The STAGE_RIGHT implementation for internal node N0/N1/N2, implements staged
+ * contract as an indexable container to index snap-gen to child node
+ * addresses.
+ *
+ * The layout of the container storing n sub-items:
+ *
+ * # <--------- container range -----------> #
+ * #<~># sub-items [2, n)                    #
+ * #   # <- sub-item 1 -> # <- sub-item 0 -> #
+ * #...# snap-gen | laddr # snap-gen | laddr #
+ *                        ^
+ *                        |
+ *           p_first_item +
+ */
+class internal_sub_items_t {
+ public:
+ using num_keys_t = index_t;
+
+ // Builds a view over `range`, which must be non-empty and hold a whole
+ // number of internal_sub_item_t entries. p_first_item is set to the entry
+ // that ends at range.p_end.
+ internal_sub_items_t(const memory_range_t& range) {
+ assert(range.p_start < range.p_end);
+ assert((range.p_end - range.p_start) % sizeof(internal_sub_item_t) == 0);
+ num_items = (range.p_end - range.p_start) / sizeof(internal_sub_item_t);
+ assert(num_items > 0);
+ auto _p_first_item = range.p_end - sizeof(internal_sub_item_t);
+ p_first_item = reinterpret_cast<const internal_sub_item_t*>(_p_first_item);
+ }
+
+ // container type system
+ using key_get_type = const snap_gen_t&;
+ static constexpr auto CONTAINER_TYPE = ContainerType::INDEXABLE;
+ // Number of sub-items in the container.
+ num_keys_t keys() const { return num_items; }
+ // Key of item `index`; item i is located at p_first_item - i.
+ key_get_type operator[](index_t index) const {
+ assert(index < num_items);
+ return (p_first_item - index)->get_key();
+ }
+ // Bytes occupied by the items [0, index).
+ node_offset_t size_before(index_t index) const {
+ size_t ret = index * sizeof(internal_sub_item_t);
+ assert(ret < NODE_BLOCK_SIZE);
+ return ret;
+ }
+ // Pointer to the laddr of item `index`.
+ const laddr_packed_t* get_p_value(index_t index) const {
+ assert(index < num_items);
+ return (p_first_item - index)->get_p_value();
+ }
+ // Always 0 for this container.
+ node_offset_t size_overhead_at(index_t index) const { return 0u; }
+ // Encodes the container as its [start, end) byte offsets relative to
+ // p_node_start.
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const {
+ auto p_end = reinterpret_cast<const char*>(p_first_item) +
+ sizeof(internal_sub_item_t);
+ auto p_start = p_end - num_items * sizeof(internal_sub_item_t);
+ int start_offset = p_start - p_node_start;
+ int end_offset = p_end - p_node_start;
+ // NOTE(review): end_offset must be < NODE_BLOCK_SIZE here while decode()
+ // accepts <= NODE_BLOCK_SIZE; confirm which bound is intended.
+ assert(start_offset > 0 &&
+ start_offset < end_offset &&
+ end_offset < NODE_BLOCK_SIZE);
+ ceph::encode(static_cast<node_offset_t>(start_offset), encoded);
+ ceph::encode(static_cast<node_offset_t>(end_offset), encoded);
+ }
+
+ // Inverse of encode(): reads the two offsets from `delta` and rebuilds the
+ // view on top of p_node_start.
+ static internal_sub_items_t decode(
+ const char* p_node_start, ceph::bufferlist::const_iterator& delta) {
+ node_offset_t start_offset;
+ ceph::decode(start_offset, delta);
+ node_offset_t end_offset;
+ ceph::decode(end_offset, delta);
+ assert(start_offset < end_offset);
+ assert(end_offset <= NODE_BLOCK_SIZE);
+ return internal_sub_items_t({p_node_start + start_offset,
+ p_node_start + end_offset});
+ }
+
+ // This container has no header.
+ static node_offset_t header_size() { return 0u; }
+
+ // Inserting always costs exactly one internal_sub_item_t, whatever the key
+ // and value are.
+ template <KeyT KT>
+ static node_offset_t estimate_insert(
+ const full_key_t<KT>&, const laddr_packed_t&) {
+ return sizeof(internal_sub_item_t);
+ }
+
+ // Defined in sub_items_stage.cc.
+ template <KeyT KT>
+ static const laddr_packed_t* insert_at(
+ NodeExtentMutable&, const internal_sub_items_t&,
+ const full_key_t<KT>&, const laddr_packed_t&,
+ index_t index, node_offset_t size, const char* p_left_bound);
+
+ // Defined in sub_items_stage.cc.
+ static node_offset_t trim_until(NodeExtentMutable&, internal_sub_items_t&, index_t);
+
+ template <KeyT KT>
+ class Appender;
+
+ private:
+ index_t num_items;
+ // item 0, i.e. the right-most entry of the container
+ const internal_sub_item_t* p_first_item;
+};
+
+// Appends internal sub-items right-to-left, starting at p_append. Every
+// append() writes immediately, so wrap() only returns the current position.
+template <KeyT KT>
+class internal_sub_items_t::Appender {
+ public:
+ Appender(NodeExtentMutable* p_mut, char* p_append)
+ : p_mut{p_mut}, p_append{p_append} {}
+ // copies `items` sub-items of `src`, starting at index `from`
+ void append(const internal_sub_items_t& src, index_t from, index_t items);
+ // writes one new sub-item and reports the address of its value
+ void append(const full_key_t<KT>&, const laddr_packed_t&, const laddr_packed_t*&);
+ // returns the left boundary of everything appended so far
+ char* wrap() { return p_append; }
+ private:
+ NodeExtentMutable* p_mut;
+ char* p_append;
+};
+
+/**
+ * leaf_sub_items_t
+ *
+ * The STAGE_RIGHT implementation for leaf node N0/N1/N2, implements staged
+ * contract as an indexable container to index snap-gen to onode_t.
+ *
+ * The layout of the container storing n sub-items:
+ *
+ * # <------------------------ container range -------------------------------> #
+ * # <---------- sub-items ----------------> # <--- offsets ---------#          #
+ * #<~># sub-items [2, n)                    #<~>| offsets [2, n)    #          #
+ * #   # <- sub-item 1 -> # <- sub-item 0 -> #   |                   #          #
+ * #...# snap-gen | onode # snap-gen | onode #...| offset1 | offset0 # num_keys #
+ *                                           ^             ^         ^
+ *                                           |             |         |
+ *                               p_items_end +   p_offsets +         |
+ *                                                        p_num_keys +
+ */
+class leaf_sub_items_t {
+ public:
+  // TODO: decide by NODE_BLOCK_SIZE, sizeof(snap_gen_t),
+  //       and the minimal size of onode_t
+  using num_keys_t = uint8_t;
+
+  // Builds a view over `range`. The key count is the last num_keys_t of the
+  // range, offset 0 sits right before it, and the items end where the
+  // left-most offset entry begins. The range must hold at least one key and
+  // must start exactly at p_start().
+  leaf_sub_items_t(const memory_range_t& range) {
+    assert(range.p_start < range.p_end);
+    auto _p_num_keys = range.p_end - sizeof(num_keys_t);
+    assert(range.p_start < _p_num_keys);
+    p_num_keys = reinterpret_cast<const num_keys_t*>(_p_num_keys);
+    assert(keys());
+    auto _p_offsets = _p_num_keys - sizeof(node_offset_t);
+    assert(range.p_start < _p_offsets);
+    p_offsets = reinterpret_cast<const node_offset_packed_t*>(_p_offsets);
+    p_items_end = reinterpret_cast<const char*>(&get_offset(keys() - 1));
+    assert(range.p_start < p_items_end);
+    assert(range.p_start == p_start());
+  }
+
+  // Two views are equal when they reference the same memory. Const-qualified
+  // so that it also works on const views and stays unambiguous with the
+  // rewritten comparison candidates of C++20.
+  bool operator==(const leaf_sub_items_t& x) const {
+    return (p_num_keys == x.p_num_keys &&
+            p_offsets == x.p_offsets &&
+            p_items_end == x.p_items_end);
+  }
+
+  // Start of the container, i.e. the start of the left-most item.
+  const char* p_start() const { return get_item_end(keys()); }
+
+  // Offset entry of item `index`; entry i is located at p_offsets - i.
+  const node_offset_packed_t& get_offset(index_t index) const {
+    assert(index < keys());
+    return *(p_offsets - index);
+  }
+
+  // Distance from p_items_end to the end of item `index`: 0 for item 0,
+  // otherwise the offset of item (index - 1). index == keys() is allowed.
+  node_offset_t get_offset_to_end(index_t index) const {
+    assert(index <= keys());
+    return index == 0 ? 0 : get_offset(index - 1).value;
+  }
+
+  const char* get_item_start(index_t index) const {
+    return p_items_end - get_offset(index).value;
+  }
+
+  const char* get_item_end(index_t index) const {
+    return p_items_end - get_offset_to_end(index);
+  }
+
+  // container type system
+  using key_get_type = const snap_gen_t&;
+  static constexpr auto CONTAINER_TYPE = ContainerType::INDEXABLE;
+  // Number of sub-items, read from the node memory.
+  num_keys_t keys() const { return *p_num_keys; }
+  // Key of item `index`: the snap-gen is stored at the end of the item.
+  key_get_type operator[](index_t index) const {
+    assert(index < keys());
+    auto pointer = get_item_end(index);
+    assert(get_item_start(index) < pointer);
+    pointer -= sizeof(snap_gen_t);
+    assert(get_item_start(index) < pointer);
+    return *reinterpret_cast<const snap_gen_t*>(pointer);
+  }
+  // Bytes used by the header plus the items [0, index) and their offsets.
+  node_offset_t size_before(index_t index) const {
+    assert(index <= keys());
+    size_t ret;
+    if (index == 0) {
+      ret = sizeof(num_keys_t);
+    } else {
+      --index;
+      ret = sizeof(num_keys_t) +
+            (index + 1) * sizeof(node_offset_t) +
+            get_offset(index).value;
+    }
+    assert(ret < NODE_BLOCK_SIZE);
+    return ret;
+  }
+  // Every item carries one offset entry as overhead.
+  node_offset_t size_overhead_at(index_t index) const { return sizeof(node_offset_t); }
+  // Pointer to the onode of item `index`, stored at the start of the item.
+  const onode_t* get_p_value(index_t index) const {
+    assert(index < keys());
+    auto pointer = get_item_start(index);
+    auto value = reinterpret_cast<const onode_t*>(pointer);
+    assert(pointer + value->size + sizeof(snap_gen_t) == get_item_end(index));
+    return value;
+  }
+  // Encodes the container as its [start, end) byte offsets relative to
+  // p_node_start.
+  void encode(const char* p_node_start, ceph::bufferlist& encoded) const {
+    auto p_end = reinterpret_cast<const char*>(p_num_keys) +
+                  sizeof(num_keys_t);
+    int start_offset = p_start() - p_node_start;
+    int end_offset = p_end - p_node_start;
+    // NOTE(review): end_offset must be < NODE_BLOCK_SIZE here while decode()
+    // accepts <= NODE_BLOCK_SIZE; confirm which bound is intended.
+    assert(start_offset > 0 &&
+           start_offset < end_offset &&
+           end_offset < NODE_BLOCK_SIZE);
+    ceph::encode(static_cast<node_offset_t>(start_offset), encoded);
+    ceph::encode(static_cast<node_offset_t>(end_offset), encoded);
+  }
+
+  // Inverse of encode(): reads the two offsets from `delta` and rebuilds the
+  // view on top of p_node_start.
+  static leaf_sub_items_t decode(
+      const char* p_node_start, ceph::bufferlist::const_iterator& delta) {
+    node_offset_t start_offset;
+    ceph::decode(start_offset, delta);
+    node_offset_t end_offset;
+    ceph::decode(end_offset, delta);
+    assert(start_offset < end_offset);
+    assert(end_offset <= NODE_BLOCK_SIZE);
+    return leaf_sub_items_t({p_node_start + start_offset,
+                             p_node_start + end_offset});
+  }
+
+  // The header is the key count.
+  static node_offset_t header_size() { return sizeof(num_keys_t); }
+
+  // An insert costs the onode, its snap-gen and one offset entry.
+  template <KeyT KT>
+  static node_offset_t estimate_insert(const full_key_t<KT>&, const onode_t& value) {
+    return value.size + sizeof(snap_gen_t) + sizeof(node_offset_t);
+  }
+
+  // Defined in sub_items_stage.cc.
+  template <KeyT KT>
+  static const onode_t* insert_at(
+      NodeExtentMutable&, const leaf_sub_items_t&,
+      const full_key_t<KT>&, const onode_t&,
+      index_t index, node_offset_t size, const char* p_left_bound);
+
+  // Defined in sub_items_stage.cc.
+  static node_offset_t trim_until(NodeExtentMutable&, leaf_sub_items_t&, index_t index);
+
+  template <KeyT KT>
+  class Appender;
+
+ private:
+  // TODO: support unaligned access
+  const num_keys_t* p_num_keys;
+  const node_offset_packed_t* p_offsets;
+  const char* p_items_end;
+};
+
+// Maximum number of append requests a leaf appender can queue before wrap().
+constexpr index_t APPENDER_LIMIT = 3u;
+
+// Appender for leaf sub-items. Unlike the internal appender, append() only
+// queues the request; the node is written in one go by wrap().
+// At most APPENDER_LIMIT requests can be queued, of which at most one may be
+// a new key/value item, and all ranges must come from the same source.
+template <KeyT KT>
+class leaf_sub_items_t::Appender {
+  // a run of `items` sub-items of the source, starting at index `from`
+  struct range_items_t {
+    index_t from;
+    index_t items;
+  };
+  // a new item; key and value are referenced, not copied, so they must stay
+  // valid until wrap() has run
+  struct kv_item_t {
+    const full_key_t<KT>* p_key;
+    const onode_t* p_value;
+  };
+  using var_t = std::variant<range_items_t, kv_item_t>;
+
+ public:
+  Appender(NodeExtentMutable* p_mut, char* p_append)
+    : p_mut{p_mut}, p_append{p_append} {
+  }
+
+  // Queues `items` sub-items of `src`, starting at index `from`. A request
+  // with items == 0 is ignored and does not use up a slot.
+  void append(const leaf_sub_items_t& src, index_t from, index_t items) {
+    assert(cnt <= APPENDER_LIMIT);
+    assert(from <= src.keys());
+    if (items == 0) {
+      return;
+    }
+    if (op_src) {
+      assert(*op_src == src);
+    } else {
+      op_src = src;
+    }
+    assert(from < src.keys());
+    assert(from + items <= src.keys());
+    // appends[] has APPENDER_LIMIT slots, so cnt must be a valid index here
+    assert(cnt < APPENDER_LIMIT);
+    appends[cnt] = range_items_t{from, items};
+    ++cnt;
+  }
+  // Queues one new key/value item. wrap() stores the address of the written
+  // onode into `p_value`, so that reference must outlive wrap() as well.
+  void append(const full_key_t<KT>& key,
+              const onode_t& value, const onode_t*& p_value) {
+    assert(pp_value == nullptr);
+    // appends[] has APPENDER_LIMIT slots, so cnt must be a valid index here
+    assert(cnt < APPENDER_LIMIT);
+    appends[cnt] = kv_item_t{&key, &value};
+    ++cnt;
+    pp_value = &p_value;
+  }
+  // Defined in sub_items_stage.cc.
+  char* wrap();
+
+ private:
+  std::optional<leaf_sub_items_t> op_src;
+  const onode_t** pp_value = nullptr;
+  NodeExtentMutable* p_mut;
+  char* p_append;
+  var_t appends[APPENDER_LIMIT];
+  index_t cnt = 0;
+};
+
+// Selects the sub-items container by node type: internal_sub_items_t for
+// INTERNAL nodes and leaf_sub_items_t for LEAF nodes.
+template <node_type_t> struct _sub_items_t;
+template<> struct _sub_items_t<node_type_t::INTERNAL> { using type = internal_sub_items_t; };
+template<> struct _sub_items_t<node_type_t::LEAF> { using type = leaf_sub_items_t; };
+template <node_type_t NODE_TYPE>
+using sub_items_t = typename _sub_items_t<NODE_TYPE>::type;
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/super.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/super.cc
new file mode 100644
index 000000000..5a28f5097
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/super.cc
@@ -0,0 +1,26 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "super.h"
+#include "node.h"
+
+namespace crimson::os::seastore::onode {
+
+// Returns the root node tracked for transaction `t`, or nullptr when no Super
+// is registered for that transaction.
+Ref<Node> RootNodeTrackerIsolated::get_root(Transaction& t) const {
+  if (auto found = tracked_supers.find(&t); found != tracked_supers.end()) {
+    return found->second->get_p_root();
+  }
+  return nullptr;
+}
+
+// Returns the root node of the single tracked Super, or nullptr when nothing
+// is tracked. The transaction is not looked at.
+Ref<Node> RootNodeTrackerShared::get_root(Transaction&) const {
+  if (!is_clean()) {
+    return tracked_super->get_p_root();
+  }
+  return nullptr;
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/super.h b/src/crimson/os/seastore/onode_manager/staged-fltree/super.h
new file mode 100644
index 000000000..5eefee9ff
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/super.h
@@ -0,0 +1,143 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <memory>
+
+#include "crimson/common/type_helpers.h"
+
+#include "fwd.h"
+
+namespace crimson::os::seastore::onode {
+
+class Node;
+class Super;
+
+/**
+ * RootNodeTracker
+ *
+ * An abstracted tracker to get the root node by Transaction.
+ */
+class RootNodeTracker {
+ public:
+ virtual ~RootNodeTracker() = default;
+ // true when no Super is currently tracked
+ virtual bool is_clean() const = 0;
+ // root node tracked for the transaction; the implementations in this file
+ // return nullptr when there is none
+ virtual Ref<Node> get_root(Transaction&) const = 0;
+ // creates the isolated tracker when read_isolated is true, the shared one
+ // otherwise
+ static RootNodeTrackerURef create(bool read_isolated);
+ protected:
+ // neither copyable nor movable
+ RootNodeTracker() = default;
+ RootNodeTracker(const RootNodeTracker&) = delete;
+ RootNodeTracker(RootNodeTracker&&) = delete;
+ RootNodeTracker& operator=(const RootNodeTracker&) = delete;
+ RootNodeTracker& operator=(RootNodeTracker&&) = delete;
+ // called by Super from its constructor and destructor respectively
+ virtual void do_track_super(Transaction&, Super&) = 0;
+ virtual void do_untrack_super(Transaction&, Super&) = 0;
+ friend class Super;
+};
+
+/**
+ * Super
+ *
+ * The parent of root node. It contains the relationship between a Transaction
+ * and a root node address.
+ */
+class Super {
+ public:
+ using URef = std::unique_ptr<Super>;
+ // neither copyable nor movable: the tracker stores the address of the object
+ Super(const Super&) = delete;
+ Super(Super&&) = delete;
+ Super& operator=(const Super&) = delete;
+ Super& operator=(Super&&) = delete;
+ // The root node must have been untracked before; the Super then removes
+ // itself from the tracker.
+ virtual ~Super() {
+ assert(tracked_root_node == nullptr);
+ tracker.do_untrack_super(t, *this);
+ }
+
+ virtual laddr_t get_root_laddr() const = 0;
+ virtual void write_root_laddr(context_t, laddr_t) = 0;
+
+ // Remembers `root` as the tracked root node; none may be tracked yet.
+ void do_track_root(Node& root) {
+ assert(tracked_root_node == nullptr);
+ tracked_root_node = &root;
+ }
+ // Forgets the tracked root node, which must be `root`.
+ void do_untrack_root(Node& root) {
+ assert(tracked_root_node == &root);
+ tracked_root_node = nullptr;
+ }
+ // Returns the tracked root node; one must be tracked.
+ Node* get_p_root() const {
+ assert(tracked_root_node != nullptr);
+ return tracked_root_node;
+ }
+
+ protected:
+ // Registers the new Super with `tracker` for transaction `t`. Both are kept
+ // by reference and are used again in the destructor.
+ Super(Transaction& t, RootNodeTracker& tracker)
+ : t{t}, tracker{tracker} {
+ tracker.do_track_super(t, *this);
+ }
+
+ private:
+ Transaction& t;
+ RootNodeTracker& tracker;
+ // non-owning; set and cleared through do_track_root()/do_untrack_root()
+ Node* tracked_root_node = nullptr;
+};
+
+/**
+ * RootNodeTrackerIsolated
+ *
+ * A concrete RootNodeTracker implementation which provides root node isolation
+ * between Transactions for Seastore backend.
+ */
+class RootNodeTrackerIsolated final : public RootNodeTracker {
+ public:
+  // every Super must have been untracked before the tracker is destroyed
+  ~RootNodeTrackerIsolated() override { assert(is_clean()); }
+ protected:
+  // clean means that no transaction has a Super registered
+  bool is_clean() const override { return tracked_supers.empty(); }
+  // registers `super` for transaction `t`, which must not be tracked yet
+  void do_track_super(Transaction& t, Super& super) override {
+    assert(tracked_supers.count(&t) == 0);
+    tracked_supers[&t] = &super;
+  }
+  // drops the registration of transaction `t`, which must exist
+  void do_untrack_super(Transaction& t, Super& super) override {
+    [[maybe_unused]] const auto erased = tracked_supers.erase(&t);
+    assert(erased == 1);
+  }
+  ::Ref<Node> get_root(Transaction& t) const override;
+  // one Super per transaction; the pointers are not owned
+  std::map<Transaction*, Super*> tracked_supers;
+};
+
+/**
+ * RootNodeTrackerShared
+ *
+ * A concrete RootNodeTracker implementation which has no isolation between
+ * Transactions for Dummy backend.
+ */
+class RootNodeTrackerShared final : public RootNodeTracker {
+ public:
+  // the Super must have been untracked before the tracker is destroyed
+  ~RootNodeTrackerShared() override { assert(is_clean()); }
+ protected:
+  // clean means that no Super is tracked
+  bool is_clean() const override { return nullptr == tracked_super; }
+  // tracks `super`; only one Super can be tracked at a time and the
+  // transaction is ignored
+  void do_track_super(Transaction&, Super& super) override {
+    assert(is_clean());
+    tracked_super = &super;
+  }
+  // stops tracking `super`, which must be the tracked one
+  void do_untrack_super(Transaction&, Super& super) override {
+    assert(&super == tracked_super);
+    tracked_super = nullptr;
+  }
+  ::Ref<Node> get_root(Transaction&) const override;
+  // the single tracked Super, not owned
+  Super* tracked_super = nullptr;
+};
+
+// Factory: the isolated tracker when read_isolated is set, otherwise the
+// shared tracker.
+inline RootNodeTrackerURef RootNodeTracker::create(bool read_isolated) {
+  if (!read_isolated) {
+    return RootNodeTrackerURef(new RootNodeTrackerShared());
+  }
+  return RootNodeTrackerURef(new RootNodeTrackerIsolated());
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/tree.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/tree.cc
new file mode 100644
index 000000000..2c8c21652
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/tree.cc
@@ -0,0 +1,235 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tree.h"
+
+#include "node.h"
+#include "node_extent_manager.h"
+#include "stages/key_layout.h"
+#include "super.h"
+
+namespace crimson::os::seastore::onode {
+
+// File-local shorthands for the nested Btree types used below.
+using btree_ertr = Btree::btree_ertr;
+template <class ValueT=void>
+using btree_future = Btree::btree_future<ValueT>;
+using Cursor = Btree::Cursor;
+
+// Wraps a tree cursor. An end tree cursor is not retained, so an end Cursor
+// does not hold the leaf node.
+Cursor::Cursor(Btree* p_tree, Ref<tree_cursor_t> _p_cursor)
+  : p_tree(p_tree) {
+  if (!_p_cursor->is_end()) {
+    p_cursor = _p_cursor;
+  }
+}
+// Creates an end cursor: no tree cursor is attached.
+Cursor::Cursor(Btree* p_tree) : p_tree{p_tree} {}
+// Copy, move and destruction are all defaulted. They are defined here rather
+// than in the header, presumably because the pointee of p_cursor is
+// incomplete there -- TODO confirm.
+Cursor::Cursor(const Cursor&) = default;
+Cursor::Cursor(Cursor&&) noexcept = default;
+Cursor& Cursor::operator=(const Cursor&) = default;
+Cursor& Cursor::operator=(Cursor&&) = default;
+Cursor::~Cursor() = default;
+
+// A cursor is an end cursor exactly when it holds no tree cursor.
+bool Cursor::is_end() const {
+  if (!p_cursor) {
+    return true;
+  }
+  // the constructor never keeps an end tree cursor
+  assert(!p_cursor->is_end());
+  return false;
+}
+
+// Returns the ghobject_t key the cursor points at.
+// Must not be called on an end cursor: it holds no tree cursor, so there is
+// nothing to dereference.
+ghobject_t Cursor::get_ghobj() const {
+  assert(!is_end());
+  return p_cursor->get_key_view().to_ghobj();
+}
+
+// Returns a pointer to the onode the cursor points at.
+// Must not be called on an end cursor: it holds no tree cursor, so there is
+// nothing to dereference.
+const onode_t* Cursor::value() const {
+  assert(!is_end());
+  return p_cursor->get_p_value();
+}
+
+// Two cursors are equal when they hold the same tree cursor. p_tree is not
+// compared, so all end cursors compare equal.
+bool Cursor::operator==(const Cursor& x) const {
+ return p_cursor == x.p_cursor;
+}
+
+// Prefix increment. Not implemented yet: the cursor is returned unchanged.
+Cursor& Cursor::operator++() {
+ // TODO
+ return *this;
+}
+
+// Postfix increment: advances the cursor through the prefix operator and
+// returns a copy taken before the advance.
+Cursor Cursor::operator++(int) {
+  Cursor before(*this);
+  operator++();
+  return before;
+}
+
+// Builds an end cursor for `p_tree`.
+Cursor Cursor::make_end(Btree* p_tree) {
+  return Cursor{p_tree};
+}
+
+// Takes ownership of the node extent manager and creates the matching root
+// tracker (isolated or shared, depending on nm->is_read_isolated()).
+// NOTE(review): the root_tracker initialiser reads `nm`, so it relies on `nm`
+// being declared before `root_tracker` in the class -- confirm in tree.h.
+Btree::Btree(NodeExtentManagerURef&& _nm)
+ : nm{std::move(_nm)},
+ root_tracker{RootNodeTracker::create(nm->is_read_isolated())} {}
+
+// Nothing may still be tracked when the tree is destroyed.
+Btree::~Btree() { assert(root_tracker->is_clean()); }
+
+// Delegates to Node::mkfs() with the context of `t` and this tree's root
+// tracker.
+btree_future<> Btree::mkfs(Transaction& t) {
+ return Node::mkfs(get_context(t), *root_tracker);
+}
+
+// Returns a cursor to the smallest entry, as found by
+// Node::lookup_smallest() on the root.
+// `t` is captured by reference in the continuation, so it must stay alive
+// until the returned future resolves.
+btree_future<Cursor> Btree::begin(Transaction& t) {
+ return get_root(t).safe_then([this, &t](auto root) {
+ return root->lookup_smallest(get_context(t));
+ }).safe_then([this](auto cursor) {
+ return Cursor{this, cursor};
+ });
+}
+
+// Returns a cursor to the largest entry, as found by Node::lookup_largest()
+// on the root.
+// `t` is captured by reference in the continuation, so it must stay alive
+// until the returned future resolves.
+btree_future<Cursor> Btree::last(Transaction& t) {
+ return get_root(t).safe_then([this, &t](auto root) {
+ return root->lookup_largest(get_context(t));
+ }).safe_then([this](auto cursor) {
+ return Cursor(this, cursor);
+ });
+}
+
+// Returns the end cursor of this tree; no lookup is involved.
+Cursor Btree::end() {
+ return Cursor::make_end(this);
+}
+
+// Resolves to true when `obj` is stored in the tree, i.e. when lower_bound()
+// on the root reports an exact match.
+// The key built from `obj` is kept alive by do_with(); `t` is captured by
+// reference and must outlive the returned future.
+btree_future<bool>
+Btree::contains(Transaction& t, const ghobject_t& obj) {
+ return seastar::do_with(
+ full_key_t<KeyT::HOBJ>(obj),
+ [this, &t](auto& key) -> btree_future<bool> {
+ return get_root(t).safe_then([this, &t, &key](auto root) {
+ // TODO: improve lower_bound()
+ return root->lower_bound(get_context(t), key);
+ }).safe_then([](auto result) {
+ return MatchKindBS::EQ == result.match();
+ });
+ }
+ );
+}
+
+// Resolves to a cursor at `obj` when lower_bound() on the root reports an
+// exact match, and to the end cursor otherwise.
+// The key built from `obj` is kept alive by do_with(); `t` is captured by
+// reference and must outlive the returned future.
+btree_future<Cursor>
+Btree::find(Transaction& t, const ghobject_t& obj) {
+ return seastar::do_with(
+ full_key_t<KeyT::HOBJ>(obj),
+ [this, &t](auto& key) -> btree_future<Cursor> {
+ return get_root(t).safe_then([this, &t, &key](auto root) {
+ // TODO: improve lower_bound()
+ return root->lower_bound(get_context(t), key);
+ }).safe_then([this](auto result) {
+ if (result.match() == MatchKindBS::EQ) {
+ return Cursor(this, result.p_cursor);
+ } else {
+ return Cursor::make_end(this);
+ }
+ });
+ }
+ );
+}
+
+// Resolves to a cursor built from the result of Node::lower_bound() on the
+// root, whether or not the match is exact.
+// The key built from `obj` is kept alive by do_with(); `t` is captured by
+// reference and must outlive the returned future.
+btree_future<Cursor>
+Btree::lower_bound(Transaction& t, const ghobject_t& obj) {
+ return seastar::do_with(
+ full_key_t<KeyT::HOBJ>(obj),
+ [this, &t](auto& key) -> btree_future<Cursor> {
+ return get_root(t).safe_then([this, &t, &key](auto root) {
+ return root->lower_bound(get_context(t), key);
+ }).safe_then([this](auto result) {
+ return Cursor(this, result.p_cursor);
+ });
+ }
+ );
+}
+
+// Inserts `value` under `obj` through Node::insert() on the root. Resolves to
+// the cursor returned by the node and the success flag it reported.
+// The key built from `obj` is kept alive by do_with(); `t` and `value` are
+// captured by reference and must outlive the returned future.
+btree_future<std::pair<Cursor, bool>>
+Btree::insert(Transaction& t, const ghobject_t& obj, const onode_t& value) {
+ return seastar::do_with(
+ full_key_t<KeyT::HOBJ>(obj),
+ [this, &t, &value](auto& key) -> btree_future<std::pair<Cursor, bool>> {
+ return get_root(t).safe_then([this, &t, &key, &value](auto root) {
+ return root->insert(get_context(t), key, value);
+ }).safe_then([this](auto ret) {
+ auto& [cursor, success] = ret;
+ return std::make_pair(Cursor(this, cursor), success);
+ });
+ }
+ );
+}
+
+// Erase by key. Not implemented yet: nothing is removed and the future
+// resolves to 0.
+btree_future<size_t> Btree::erase(Transaction& t, const ghobject_t& obj) {
+ // TODO
+ return btree_ertr::make_ready_future<size_t>(0u);
+}
+
+// Erase at a cursor. Not implemented yet: nothing is removed and the future
+// resolves to the end cursor.
+btree_future<Cursor> Btree::erase(Cursor& pos) {
+ // TODO
+ return btree_ertr::make_ready_future<Cursor>(
+ Cursor::make_end(this));
+}
+
+// Erase a cursor range. Not implemented yet: nothing is removed and the
+// future resolves to the end cursor.
+btree_future<Cursor>
+Btree::erase(Cursor& first, Cursor& last) {
+ // TODO
+ return btree_ertr::make_ready_future<Cursor>(
+ Cursor::make_end(this));
+}
+
+// Resolves to the height of the tree, which is the level of the root plus one.
+btree_future<size_t> Btree::height(Transaction& t) {
+  return get_root(t).safe_then([](auto root) {
+    const auto root_level = root->level();
+    return size_t(root_level + 1);
+  });
+}
+
+// Collects the tree statistics through Node::get_tree_stats() on the root and
+// fills in the height (root level plus one).
+// `t` is captured by reference and must outlive the returned future.
+btree_future<tree_stats_t> Btree::get_stats_slow(Transaction& t) {
+ return get_root(t).safe_then([this, &t](auto root) {
+ unsigned height = root->level() + 1;
+ return root->get_tree_stats(get_context(t)
+ ).safe_then([height](auto stats) {
+ stats.height = height;
+ return btree_ertr::make_ready_future<tree_stats_t>(stats);
+ });
+ });
+}
+
+// Dumps the tree tracked for `t` into `os`. Only an already tracked root is
+// used; when there is none, "empty tree!" is written instead.
+std::ostream& Btree::dump(Transaction& t, std::ostream& os) {
+  auto tracked_root = root_tracker->get_root(t);
+  if (!tracked_root) {
+    os << "empty tree!";
+    return os;
+  }
+  tracked_root->dump(os);
+  return os;
+}
+
+// Writes "BTree-" followed by the node extent manager to `os`.
+std::ostream& Btree::print(std::ostream& os) const {
+ return os << "BTree-" << *nm;
+}
+
+// Resolves to the root node for `t`: the one already known to the tracker if
+// there is one, otherwise the result of Node::load_root().
+btree_future<Ref<Node>> Btree::get_root(Transaction& t) {
+  if (auto tracked_root = root_tracker->get_root(t); tracked_root) {
+    return btree_ertr::make_ready_future<Ref<Node>>(tracked_root);
+  }
+  return Node::load_root(get_context(t), *root_tracker);
+}
+
+// Test helper: true when the root tracker tracks nothing.
+bool Btree::test_is_clean() const {
+ return root_tracker->is_clean();
+}
+
+// Test helper: clones the root of `from` (as seen by `t_from`) into this tree
+// under transaction `t`, through Node::test_clone_root().
+// `t` is captured by reference and must outlive the returned future.
+btree_future<> Btree::test_clone_from(
+ Transaction& t, Transaction& t_from, Btree& from) {
+ // Note: assume the tree to clone is tracked correctly in memory.
+ // In some unit tests, parts of the tree are stubbed out that they
+ // should not be loaded from NodeExtentManager.
+ return from.get_root(t_from
+ ).safe_then([this, &t](auto root_from) {
+ return root_from->test_clone_root(get_context(t), *root_tracker);
+ });
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/tree.h b/src/crimson/os/seastore/onode_manager/staged-fltree/tree.h
new file mode 100644
index 000000000..7ee618cb3
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/tree.h
@@ -0,0 +1,119 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <ostream>
+
+#include "common/hobject.h"
+#include "crimson/common/type_helpers.h"
+
+#include "fwd.h"
+#include "tree_types.h"
+
+/**
+ * tree.h
+ *
+ * An example implementation to expose tree interfaces to users. The current
+ * interface design is based on:
+ * - ceph::os::Transaction::create/touch/remove()
+ * - ceph::ObjectStore::collection_list()
+ * - ceph::BlueStore::get_onode()
+ * - db->get_iterator(PREFIX_OBJ) by ceph::BlueStore::fsck()
+ *
+ * TODO: Redesign the interfaces based on real onode manager requirements.
+ */
+
+namespace crimson::os::seastore::onode {
+
+class Node;
+class Btree { // tree facade over a NodeExtentManager (nm) and a root tracker; neither copyable nor movable
+ public:
+ using btree_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg,
+ crimson::ct_error::enoent,
+ crimson::ct_error::erange>;
+ template <class ValueT=void>
+ using btree_future = btree_ertr::future<ValueT>;
+
+ Btree(NodeExtentManagerURef&& nm);
+ Btree(const Btree&) = delete;
+ Btree(Btree&&) = delete;
+ Btree& operator=(const Btree&) = delete;
+ Btree& operator=(Btree&&) = delete;
+ ~Btree();
+
+ btree_future<> mkfs(Transaction&);
+
+ class Cursor;
+ // lookup: positions are reported as Cursor objects
+ btree_future<Cursor> begin(Transaction&);
+ btree_future<Cursor> last(Transaction&);
+ Cursor end();
+ btree_future<bool> contains(Transaction&, const ghobject_t&);
+ btree_future<Cursor> find(Transaction&, const ghobject_t&);
+ btree_future<Cursor> lower_bound(Transaction&, const ghobject_t&);
+
+ // modifiers
+ // TODO: replace onode_t
+ btree_future<std::pair<Cursor, bool>> // (cursor, success flag)
+ insert(Transaction&, const ghobject_t&, const onode_t&);
+ btree_future<size_t> erase(Transaction&, const ghobject_t& key);
+ btree_future<Cursor> erase(Cursor& pos);
+ btree_future<Cursor> erase(Cursor& first, Cursor& last); // currently a TODO stub returning the end cursor
+
+ // stats
+ btree_future<size_t> height(Transaction&); // root level + 1
+ btree_future<tree_stats_t> get_stats_slow(Transaction&); // gathered through the root's get_tree_stats()
+ std::ostream& dump(Transaction&, std::ostream&); // dumps the tracked root, or prints "empty tree!"
+ std::ostream& print(std::ostream& os) const;
+
+ // test_only
+ bool test_is_clean() const;
+ btree_future<> test_clone_from(Transaction& t, Transaction& t_from, Btree& from);
+
+ private:
+ context_t get_context(Transaction& t) { return {*nm, t}; } // bundles the extent manager with the transaction
+ btree_future<Ref<Node>> get_root(Transaction& t); // tracked root if present, otherwise Node::load_root()
+
+ NodeExtentManagerURef nm;
+ RootNodeTrackerURef root_tracker;
+
+ friend class DummyChildPool;
+};
+inline std::ostream& operator<<(std::ostream& os, const Btree& tree) {
+ return tree.print(os); // stream insertion simply forwards to Btree::print()
+}
+
+class tree_cursor_t;
+class Btree::Cursor { // position handle: a Btree pointer plus a Ref<tree_cursor_t>
+ public:
+ Cursor(const Cursor&);
+ Cursor(Cursor&&) noexcept;
+ Cursor& operator=(const Cursor&);
+ Cursor& operator=(Cursor&&); // NOTE(review): not noexcept, unlike the move constructor -- confirm this is intended
+ ~Cursor();
+
+ bool is_end() const;
+ // XXX: return key_view_t to avoid unnecessary ghobject_t constructions
+ ghobject_t get_ghobj() const;
+ const onode_t* value() const;
+ bool operator==(const Cursor& x) const;
+ bool operator!=(const Cursor& x) const { return !(*this == x); }
+ Cursor& operator++();
+ Cursor operator++(int);
+
+ private:
+ Cursor(Btree*, Ref<tree_cursor_t>); // private: only Btree (a friend) and make_end() create cursors
+ Cursor(Btree*);
+
+ static Cursor make_end(Btree*);
+
+ Btree* p_tree;
+ Ref<tree_cursor_t> p_cursor;
+
+ friend class Btree;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/tree_types.h b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_types.h
new file mode 100644
index 000000000..0bb345e0a
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_types.h
@@ -0,0 +1,125 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <ostream>
+
+namespace crimson::os::seastore::onode {
+
+// TODO: Redesign according to real requirement from onode manager
+struct onode_t {
+  // an onode is expected to fit inside a single node
+  uint16_t size; // total byte size; 16 bits can address a node of up to 64 KiB
+  uint16_t id;
+  // placeholder for: omap, extent_map, inline data
+
+  bool operator==(const onode_t& o) const { return id == o.id && size == o.size; }
+  bool operator!=(const onode_t& o) const { return !operator==(o); }
+
+  void encode(ceph::bufferlist& encoded) const {
+    ceph::encode(size, encoded);
+    ceph::encode(id, encoded);
+  }
+  static onode_t decode(ceph::bufferlist::const_iterator& delta) {
+    // Fields are read back in the order encode() wrote them: size, then id.
+    uint16_t decoded_size;
+    ceph::decode(decoded_size, delta);
+    uint16_t decoded_id;
+    ceph::decode(decoded_id, delta);
+    return onode_t{decoded_size, decoded_id};
+  }
+  static void validate_tail_magic(const onode_t& onode) {
+    const char* p_tail = reinterpret_cast<const char*>(&onode) + onode.size - sizeof(uint32_t);
+    uint32_t stored_magic;
+    std::memcpy(&stored_magic, p_tail, sizeof(uint32_t));
+    ceph_assert(stored_magic == onode.size * 137);
+  }
+  static std::unique_ptr<char[]> allocate(const onode_t& config) {
+    ceph_assert(config.size >= sizeof(onode_t) + sizeof(uint32_t));
+
+    auto storage = std::make_unique<char[]>(config.size);
+    char* const p_head = storage.get();
+    auto p_onode = reinterpret_cast<onode_t*>(p_head);
+    *p_onode = config;  // header goes at the front of the buffer
+
+    const uint32_t tail_magic = config.size * 137;
+    char* const p_tail = p_head + (config.size - sizeof(uint32_t));
+    std::memcpy(p_tail, &tail_magic, sizeof(uint32_t));  // magic fills the last 4 bytes
+    validate_tail_magic(*p_onode);
+
+    return storage;
+  }
+} __attribute__((packed));
+inline std::ostream& operator<<(std::ostream& os, const onode_t& node) {
+ return os << "onode(" << node.id << ", " << node.size << "B)"; // prints the id first, then the size in bytes
+}
+
+struct tree_stats_t {
+  size_t size_persistent_leaf = 0;
+  size_t size_persistent_internal = 0;
+  size_t size_filled_leaf = 0;
+  size_t size_filled_internal = 0;
+  size_t size_logical_leaf = 0;
+  size_t size_logical_internal = 0;
+  size_t size_overhead_leaf = 0;
+  size_t size_overhead_internal = 0;
+  size_t size_value_leaf = 0;
+  size_t size_value_internal = 0;
+  unsigned num_kvs_leaf = 0;
+  unsigned num_kvs_internal = 0;
+  unsigned num_nodes_leaf = 0;
+  unsigned num_nodes_internal = 0;
+  unsigned height = 0;
+
+  size_t size_persistent() const  // each total below is leaf + internal
+  { return size_persistent_internal + size_persistent_leaf; }
+  size_t size_filled() const
+  { return size_filled_internal + size_filled_leaf; }
+  size_t size_logical() const
+  { return size_logical_internal + size_logical_leaf; }
+  size_t size_overhead() const
+  { return size_overhead_internal + size_overhead_leaf; }
+  size_t size_value() const
+  { return size_value_internal + size_value_leaf; }
+  unsigned num_kvs() const
+  { return num_kvs_internal + num_kvs_leaf; }
+  unsigned num_nodes() const
+  { return num_nodes_internal + num_nodes_leaf; }
+
+  double ratio_fullness() const  // ratios are not guarded against a zero denominator
+  { return static_cast<double>(size_filled()) / size_persistent(); }
+  double ratio_key_compression() const
+  { return static_cast<double>(size_filled() - size_value()) / (size_logical() - size_value()); }
+  double ratio_overhead() const
+  { return static_cast<double>(size_overhead()) / size_filled(); }
+  double ratio_keys_leaf() const
+  { return static_cast<double>(num_kvs_leaf) / num_kvs(); }
+  double ratio_nodes_leaf() const
+  { return static_cast<double>(num_nodes_leaf) / num_nodes(); }
+  double ratio_filled_leaf() const
+  { return static_cast<double>(size_filled_leaf) / size_filled(); }
+};
+inline std::ostream& operator<<(std::ostream& os, const tree_stats_t& stats) {
+  os << "Tree stats:";  // one statement per field group; the output text is unchanged
+  os << "\n height = " << stats.height;
+  os << "\n num values = " << stats.num_kvs_leaf;
+  os << "\n num nodes = " << stats.num_nodes();
+  os << " (leaf=" << stats.num_nodes_leaf;
+  os << ", internal=" << stats.num_nodes_internal << ")";
+  os << "\n size persistent = " << stats.size_persistent() << "B";
+  os << "\n size filled = " << stats.size_filled() << "B";
+  os << " (value=" << stats.size_value_leaf << "B";
+  os << ", rest=" << stats.size_filled() - stats.size_value_leaf << "B)";
+  os << "\n size logical = " << stats.size_logical() << "B";
+  os << "\n size overhead = " << stats.size_overhead() << "B";
+  os << "\n ratio fullness = " << stats.ratio_fullness();
+  os << "\n ratio keys leaf = " << stats.ratio_keys_leaf();
+  os << "\n ratio nodes leaf = " << stats.ratio_nodes_leaf();
+  os << "\n ratio filled leaf = " << stats.ratio_filled_leaf();
+  os << "\n ratio key compression = " << stats.ratio_key_compression();
+  assert(stats.num_kvs_internal + 1 == stats.num_nodes());  // checked only after everything was written
+  return os;
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h
new file mode 100644
index 000000000..536052003
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h
@@ -0,0 +1,333 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cassert>
+#include <cstring>
+#include <random>
+#include <string>
+#include <sstream>
+#include <utility>
+#include <vector>
+
+#include "crimson/common/log.h"
+#include "stages/key_layout.h"
+#include "tree.h"
+
+/**
+ * tree_utils.h
+ *
+ * Contains shared logic for unit tests and perf tool.
+ */
+
+namespace crimson::os::seastore::onode {
+
+class Onodes {  // owns a set of test onode buffers and hands out pointers into them
+ public:
+  Onodes(size_t n) {  // n onodes sized 8, 16, ..., 8*n bytes
+    for (size_t ordinal = 1; ordinal <= n; ++ordinal) {
+      const onode_t& created = create(ordinal * 8);
+      onodes.push_back(&created);
+    }
+  }
+
+  Onodes(std::vector<size_t> sizes) {  // one onode per requested size
+    for (auto& wanted : sizes) {
+      const onode_t& created = create(wanted);
+      onodes.push_back(&created);
+    }
+  }
+
+  ~Onodes() = default;
+
+  const onode_t& create(size_t size) {
+    ceph_assert(size <= std::numeric_limits<uint16_t>::max());
+    onode_t config{static_cast<uint16_t>(size), id++};
+    auto buffer = onode_t::allocate(config);
+    auto raw = buffer.get();  // grabbed before ownership moves into tracked_onodes
+    tracked_onodes.push_back(std::move(buffer));
+    return *reinterpret_cast<onode_t*>(raw);
+  }
+
+  const onode_t& pick() const {  // random choice among the onodes registered by the constructors
+    const auto slot = rd() % onodes.size();
+    return *onodes[slot];
+  }
+
+  const onode_t& pick_largest() const {  // returns the last registered onode
+    return *onodes.back();
+  }
+
+  static void validate_cursor(
+      const Btree::Cursor& cursor, const ghobject_t& key, const onode_t& onode) {
+    ceph_assert(!cursor.is_end());
+    ceph_assert(cursor.get_ghobj() == key);
+    ceph_assert(cursor.value());
+    ceph_assert(cursor.value() != &onode);  // the cursor must not point at the caller's own object
+    ceph_assert(*cursor.value() == onode);
+    onode_t::validate_tail_magic(*cursor.value());
+  }
+
+ private:
+  uint16_t id = 0;
+  mutable std::random_device rd;
+  std::vector<const onode_t*> onodes;
+  std::vector<std::unique_ptr<char[]>> tracked_onodes;
+};
+
+class KVPool {  // pool of generated (ghobject_t, onode) pairs, iterable in generation order or shuffled
+  struct kv_conf_t {  // recipe for one pair; the ghobject_t is built on demand by get_ghobj()
+    unsigned index2;
+    unsigned index1;
+    unsigned index0;
+    size_t ns_size;
+    size_t oid_size;
+    const onode_t* p_value;
+
+    ghobject_t get_ghobj() const {
+      assert(index1 < 10);
+      std::ostringstream os_ns;
+      os_ns << "ns" << index1;
+      unsigned current_size = (unsigned)os_ns.tellp();
+      assert(ns_size >= current_size);
+      os_ns << std::string(ns_size - current_size, '_');  // pad with '_' up to ns_size
+
+      std::ostringstream os_oid;
+      os_oid << "oid" << index1;
+      current_size = (unsigned)os_oid.tellp();
+      assert(oid_size >= current_size);
+      os_oid << std::string(oid_size - current_size, '_');  // pad with '_' up to oid_size
+
+      return ghobject_t(shard_id_t(index2), index2, index2,
+                        os_ns.str(), os_oid.str(), index0, index0);
+    }
+  };
+  using kv_vector_t = std::vector<kv_conf_t>;
+
+ public:
+  using kv_t = std::pair<ghobject_t, const onode_t*>;
+
+  KVPool(const std::vector<size_t>& str_sizes,
+         const std::vector<size_t>& onode_sizes,
+         const std::pair<unsigned, unsigned>& range2,
+         const std::pair<unsigned, unsigned>& range1,
+         const std::pair<unsigned, unsigned>& range0)
+      : str_sizes{str_sizes}, onodes{onode_sizes} {
+    ceph_assert(range2.first < range2.second);  // every range is half-open: [first, second)
+    ceph_assert(range2.second - 1 <= (unsigned)std::numeric_limits<shard_t>::max());
+    ceph_assert(range2.second - 1 <= std::numeric_limits<crush_hash_t>::max());
+    ceph_assert(range1.first < range1.second);
+    ceph_assert(range1.second - 1 <= 9);  // get_ghobj() asserts index1 < 10
+    ceph_assert(range0.first < range0.second);
+    std::random_device rd;
+    for (unsigned i = range2.first; i < range2.second; ++i) {
+      for (unsigned j = range1.first; j < range1.second; ++j) {
+        auto ns_size = (unsigned)str_sizes[rd() % str_sizes.size()];  // sizes are drawn once per (i, j)
+        auto oid_size = (unsigned)str_sizes[rd() % str_sizes.size()];
+        for (unsigned k = range0.first; k < range0.second; ++k) {
+          kvs.emplace_back(kv_conf_t{i, j, k, ns_size, oid_size, &onodes.pick()});
+        }
+      }
+    }
+    random_kvs = kvs;
+    std::shuffle(random_kvs.begin(), random_kvs.end(), std::mt19937{rd()});  // std::random_shuffle was removed in C++17
+  }
+
+  class iterator_t {  // forward-only position in one of the pool's vectors
+   public:
+    iterator_t() = default;
+    iterator_t(const iterator_t&) = default;
+    iterator_t(iterator_t&&) = default;
+    iterator_t& operator=(const iterator_t&) = default;
+    iterator_t& operator=(iterator_t&&) = default;
+
+    kv_t get_kv() const {
+      assert(!is_end());
+      auto& conf = (*p_kvs)[i];
+      return std::make_pair(conf.get_ghobj(), conf.p_value);
+    }
+    bool is_end() const { return !p_kvs || i >= p_kvs->size(); }  // a default-constructed iterator is at end
+    size_t index() const { return i; }
+
+    iterator_t& operator++() {
+      assert(!is_end());
+      ++i;
+      return *this;
+    }
+
+    iterator_t operator++(int) {
+      iterator_t tmp = *this;
+      ++*this;
+      return tmp;
+    }
+
+   private:
+    iterator_t(const kv_vector_t& kvs) : p_kvs{&kvs} {}
+
+    const kv_vector_t* p_kvs = nullptr;
+    size_t i = 0;
+    friend class KVPool;
+  };
+
+  iterator_t begin() const {  // iterates in generation order
+    return iterator_t(kvs);
+  }
+
+  iterator_t random_begin() const {  // iterates the copy shuffled by the constructor
+    return iterator_t(random_kvs);
+  }
+
+  size_t size() const {
+    return kvs.size();
+  }
+
+ private:
+  std::vector<size_t> str_sizes;
+  Onodes onodes;
+  kv_vector_t kvs;
+  kv_vector_t random_kvs;
+};
+
+template <bool TRACK>
+class TreeBuilder { // test/perf helper: fills a Btree from a KVPool; with TRACK the cursors returned by insert() are kept and re-validated
+ public:
+ using ertr = Btree::btree_ertr;
+ template <class ValueT=void>
+ using future = ertr::future<ValueT>;
+
+ TreeBuilder(KVPool& kvs, NodeExtentManagerURef&& nm)
+ : kvs{kvs} {
+ tree.emplace(std::move(nm));
+ }
+
+ future<> bootstrap(Transaction& t) { // logs the build flags and the tree, then runs Btree::mkfs()
+ std::ostringstream oss;
+#ifndef NDEBUG
+ oss << "debug=on, ";
+#else
+ oss << "debug=off, ";
+#endif
+#ifdef UNIT_TESTS_BUILT
+ oss << "UNIT_TEST_BUILT=on, ";
+#else
+ oss << "UNIT_TEST_BUILT=off, ";
+#endif
+ if constexpr (TRACK) {
+ oss << "track=on, ";
+ } else {
+ oss << "track=off, ";
+ }
+ oss << *tree;
+ logger().warn("TreeBuilder: {}, bootstrapping ...", oss.str());
+ return tree->mkfs(t);
+ }
+
+ future<> insert(Transaction& t) { // inserts the whole pool in its shuffled order
+ kv_iter = kvs.random_begin();
+ auto cursors = seastar::make_lw_shared<std::vector<Btree::Cursor>>();
+ logger().warn("start inserting {} kvs ...", kvs.size());
+ auto start_time = mono_clock::now();
+ return crimson::do_until([&t, this, cursors]() -> future<bool> {
+ if (kv_iter.is_end()) {
+ return ertr::make_ready_future<bool>(true);
+ }
+ auto [key, p_value] = kv_iter.get_kv();
+ logger().debug("[{}] {} -> {}", kv_iter.index(), key_hobj_t{key}, *p_value);
+ return tree->insert(t, key, *p_value
+ ).safe_then([&t, this, cursors](auto ret) {
+ auto& [cursor, success] = ret;
+ assert(success == true);
+ if constexpr (TRACK) {
+ cursors->emplace_back(cursor);
+ }
+#ifndef NDEBUG
+ auto [key, p_value] = kv_iter.get_kv();
+ Onodes::validate_cursor(cursor, key, *p_value);
+ return tree->lower_bound(t, key).safe_then([this, cursor](auto cursor_) { // debug builds: look the key up again and compare
+ auto [key, p_value] = kv_iter.get_kv();
+ ceph_assert(cursor_.get_ghobj() == key);
+ ceph_assert(cursor_.value() == cursor.value());
+ ++kv_iter;
+ return ertr::make_ready_future<bool>(false);
+ });
+#else
+ ++kv_iter;
+ return ertr::make_ready_future<bool>(false);
+#endif
+ });
+ }).safe_then([&t, this, start_time, cursors] {
+ std::chrono::duration<double> duration = mono_clock::now() - start_time;
+ logger().warn("Insert done! {}s", duration.count());
+ if (!cursors->empty()) { // cursors are only collected when TRACK is set
+ logger().info("Verifing tracked cursors ...");
+ kv_iter = kvs.random_begin();
+ return seastar::do_with(
+ cursors->begin(), [&t, this, cursors](auto& c_iter) {
+ return crimson::do_until([&t, this, &c_iter, cursors]() -> future<bool> {
+ if (kv_iter.is_end()) {
+ logger().info("Verify done!");
+ return ertr::make_ready_future<bool>(true);
+ }
+ assert(c_iter != cursors->end());
+ auto [k, v] = kv_iter.get_kv();
+ // validate values in tree keep intact
+ return tree->lower_bound(t, k).safe_then([this, &c_iter](auto cursor) {
+ auto [k, v] = kv_iter.get_kv();
+ Onodes::validate_cursor(cursor, k, *v);
+ // validate values in cursors keep intact
+ Onodes::validate_cursor(*c_iter, k, *v);
+ ++kv_iter;
+ ++c_iter;
+ return ertr::make_ready_future<bool>(false);
+ });
+ });
+ });
+ } else {
+ return ertr::now();
+ }
+ });
+ }
+
+ future<> get_stats(Transaction& t) { // logs the result of Btree::get_stats_slow()
+ return tree->get_stats_slow(t
+ ).safe_then([this](auto stats) {
+ logger().warn("{}", stats);
+ });
+ }
+
+ void reload(NodeExtentManagerURef&& nm) { // replaces the tree with a new Btree built on nm
+ tree.emplace(std::move(nm));
+ }
+
+ future<> validate(Transaction& t) { // looks up every kv in generation order and validates the cursor found
+ logger().info("Verifing insertion ...");
+ return seastar::do_with(
+ kvs.begin(), [&t, this] (auto& kvs_iter) {
+ return crimson::do_until([&t, this, &kvs_iter]() -> future<bool> {
+ if (kvs_iter.is_end()) {
+ logger().info("Verify done!");
+ return ertr::make_ready_future<bool>(true);
+ }
+ auto [k, v] = kvs_iter.get_kv();
+ return tree->lower_bound(t, k
+ ).safe_then([&kvs_iter, k=k, v=v] (auto cursor) {
+ Onodes::validate_cursor(cursor, k, *v);
+ ++kvs_iter;
+ return ertr::make_ready_future<bool>(false);
+ });
+ });
+ });
+ }
+
+ private:
+ static seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore); // NOTE(review): logs under the filestore subsystem -- confirm this is intended
+ }
+
+ KVPool& kvs;
+ std::optional<Btree> tree;
+ KVPool::iterator_t kv_iter; // iteration state shared by the continuation steps of insert()
+};
+
+}
diff --git a/src/crimson/os/seastore/root_block.h b/src/crimson/os/seastore/root_block.h
new file mode 100644
index 000000000..4a5024caa
--- /dev/null
+++ b/src/crimson/os/seastore/root_block.h
@@ -0,0 +1,109 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/os/seastore/cached_extent.h"
+
+namespace crimson::os::seastore {
+
+/**
+ * root_t
+ *
+ * Contains information required to find metadata roots.
+ * TODO: generalize this to permit more than one lba_manager implementation
+ */
+struct __attribute__((aligned(8), packed)) root_t {
+  depth_t lba_depth = 0;
+  depth_t segment_depth = 0;
+  paddr_t lba_root_addr;
+  paddr_t segment_root;
+  laddr_t onode_root = L_ADDR_NULL;
+
+  void adjust_addrs_from_base(paddr_t base) {  // only lba_root_addr is rebased here
+    if (!lba_root_addr.is_relative())
+      return;
+    lba_root_addr = base.add_record_relative(lba_root_addr);
+  }
+};
+
+/**
+ * RootBlock
+ *
+ * Holds the physical addresses of all metadata roots.
+ * In-memory values may be
+ * - absolute: reference to block which predates the current transaction
+ * - record_relative: reference to block updated in this transaction
+ * if !pending()
+ *
+ * Journal replay only considers deltas and must always discover the most
+ * recent value for the RootBlock. Because the contents of root_t above are
+ * very small, it's simplest to stash the entire root_t value into the delta
+ * and never actually write the RootBlock to a physical location (safe since
+ * nothing references the location of the RootBlock).
+ *
+ * As a result, Cache treats the root differently in a few ways including:
+ * - state will only ever be DIRTY or MUTATION_PENDING
+ * - RootBlock's never show up in the transaction fresh or dirty lists --
+ * there's a special Transaction::root member for when the root needs to
+ * be mutated.
+ *
+ * TODO: Journal trimming will need to be aware of the most recent RootBlock
+ * delta location, or, even easier, just always write one out with the
+ * mutation which changes the journal trim bound.
+ */
+struct RootBlock : CachedExtent {
+  static constexpr segment_off_t SIZE = 4 << 10;
+  using Ref = TCachedExtentRef<RootBlock>;
+
+  root_t root;
+
+  RootBlock() : CachedExtent(0) {}
+
+  RootBlock(const RootBlock &rhs) = default;
+
+  CachedExtentRef duplicate_for_write() final {
+    return CachedExtentRef(new RootBlock(*this));
+  }
+
+  static constexpr extent_types_t TYPE = extent_types_t::ROOT;
+  extent_types_t get_type() const final {
+    return TYPE;
+  }
+
+  /// Serializes the whole root_t: the delta is a raw copy of `root`
+  ceph::bufferlist get_delta() final {
+    ceph::bufferlist delta;
+    ceph::buffer::ptr raw(sizeof(root_t));
+    *reinterpret_cast<root_t*>(raw.c_str()) = root;
+    delta.append(raw);
+    return delta;
+  }
+
+  /// Replaces `root` with the delta contents, then rebases its addresses
+  void apply_delta_and_adjust_crc(paddr_t base, const ceph::bufferlist &_bl) final {
+    assert(_bl.length() == sizeof(root_t));
+    ceph::bufferlist contiguous = _bl;
+    contiguous.rebuild();  // rebuilt before reading the front buffer as a root_t
+    root = *reinterpret_cast<const root_t*>(contiguous.front().c_str());
+    root.adjust_addrs_from_base(base);
+  }
+
+  /// Rebases relative in-memory addresses on the record commit address
+  void on_delta_write(paddr_t record_block_offset) final {
+    root.adjust_addrs_from_base(record_block_offset);
+  }
+
+  complete_load_ertr::future<> complete_load() final {
+    ceph_abort_msg("Root is only written via deltas");
+  }
+
+  void on_initial_write() final {
+    ceph_abort_msg("Root is only written via deltas");
+  }
+
+  root_t &get_root() { return root; }
+};
+using RootBlockRef = RootBlock::Ref;
+
+}
diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc
new file mode 100644
index 000000000..50c148cea
--- /dev/null
+++ b/src/crimson/os/seastore/seastore.cc
@@ -0,0 +1,532 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "seastore.h"
+
+#include <boost/algorithm/string/trim.hpp>
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+
+#include "common/safe_io.h"
+#include "os/Transaction.h"
+
+#include "crimson/common/buffer_io.h"
+#include "crimson/common/config_proxy.h"
+
+#include "crimson/os/futurized_collection.h"
+
+#include "crimson/os/seastore/segment_manager/ephemeral.h"
+#include "crimson/os/seastore/transaction_manager.h"
+#include "crimson/os/seastore/onode_manager.h"
+#include "crimson/os/seastore/cache.h"
+
+namespace {
+ seastar::logger& logger() { // file-local accessor for the logger used throughout this file
+ return crimson::get_logger(ceph_subsys_filestore); // NOTE(review): seastore logs under the filestore subsystem -- confirm
+ }
+}
+
+using crimson::common::local_conf;
+
+namespace crimson::os::seastore {
+
+struct SeastoreCollection final : public FuturizedCollection { // adds no state of its own
+ template <typename... T>
+ SeastoreCollection(T&&... args) :
+ FuturizedCollection(std::forward<T>(args)...) {} // perfect-forwards every argument to the base class
+};
+
+SeaStore::SeaStore(const std::string& path) // NOTE: path is not used; the store sits on the test ephemeral segment manager
+ : segment_manager(segment_manager::create_test_ephemeral() /* TODO */),
+ segment_cleaner(
+ std::make_unique<SegmentCleaner>(
+ SegmentCleaner::config_t::default_from_segment_manager(
+ *segment_manager))),
+ cache(std::make_unique<Cache>(*segment_manager)),
+ journal(new Journal(*segment_manager)),
+ lba_manager(
+ lba_manager::create_lba_manager(*segment_manager, *cache)),
+ transaction_manager(
+ new TransactionManager(
+ *segment_manager,
+ *segment_cleaner,
+ *journal,
+ *cache,
+ *lba_manager)),
+ onode_manager(onode_manager::create_ephemeral())
+{
+ journal->set_segment_provider(&*segment_cleaner); // cross-wiring happens once all components exist
+ segment_cleaner->set_extent_callback(&*transaction_manager);
+}
+
+SeaStore::~SeaStore() = default; // defaulted here in the .cc; presumably so the component types are complete at this point -- confirm
+
+seastar::future<> SeaStore::stop()
+{
+  return seastar::make_ready_future<>();  // stub: completes immediately
+}
+
+seastar::future<> SeaStore::mount()
+{
+  return seastar::make_ready_future<>();  // stub: completes immediately
+}
+
+seastar::future<> SeaStore::umount()
+{
+  return seastar::make_ready_future<>();  // stub: completes immediately
+}
+
+seastar::future<> SeaStore::mkfs(uuid_d new_osd_fsid)
+{
+  return seastar::make_ready_future<>();  // stub: new_osd_fsid is ignored
+}
+
+seastar::future<store_statfs_t> SeaStore::stat() const
+{
+  logger().debug("{}", __func__);
+  store_statfs_t default_stats;  // stub: default-constructed, no real usage numbers
+  return seastar::make_ready_future<store_statfs_t>(default_stats);
+}
+
+seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>>
+SeaStore::list_objects(CollectionRef ch,
+                       const ghobject_t& start,
+                       const ghobject_t& end,
+                       uint64_t limit) const
+{
+  using listing_t = std::tuple<std::vector<ghobject_t>, ghobject_t>;  // stub: nothing listed, `end` echoed back
+  return seastar::make_ready_future<listing_t>(listing_t(std::vector<ghobject_t>(), end));
+}
+
+seastar::future<CollectionRef> SeaStore::create_new_collection(const coll_t& cid)
+{
+  // Nothing is recorded here: a collection object from _get_collection() is handed back.
+  return seastar::make_ready_future<CollectionRef>(_get_collection(cid));
+}
+
+seastar::future<CollectionRef> SeaStore::open_collection(const coll_t& cid)
+{
+ return seastar::make_ready_future<CollectionRef>(_get_collection(cid)); // _get_collection() allocates a new SeastoreCollection on every call
+}
+
+seastar::future<std::vector<coll_t>> SeaStore::list_collections()
+{
+ return seastar::make_ready_future<std::vector<coll_t>>(); // stub: always an empty list
+}
+
+SeaStore::read_errorator::future<ceph::bufferlist> SeaStore::read(
+ CollectionRef ch,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ uint32_t op_flags)
+{
+ return read_errorator::make_ready_future<ceph::bufferlist>(); // stub: all arguments ignored, empty bufferlist returned
+}
+
+SeaStore::read_errorator::future<ceph::bufferlist> SeaStore::readv(
+ CollectionRef ch,
+ const ghobject_t& oid,
+ interval_set<uint64_t>& m,
+ uint32_t op_flags)
+{
+ return read_errorator::make_ready_future<ceph::bufferlist>(); // stub: all arguments ignored, empty bufferlist returned
+}
+
+SeaStore::get_attr_errorator::future<ceph::bufferptr> SeaStore::get_attr(
+  CollectionRef ch,
+  const ghobject_t& oid,
+  std::string_view name) const
+{
+  // Stub: logs the request and always reports enoent.
+  auto* coll = static_cast<SeastoreCollection*>(ch.get());
+  logger().debug("{} {} {}", __func__, coll->get_cid(), oid);
+  return crimson::ct_error::enoent::make();
+}
+
+SeaStore::get_attrs_ertr::future<SeaStore::attrs_t> SeaStore::get_attrs(
+  CollectionRef ch,
+  const ghobject_t& oid)
+{
+  // Stub: logs the request and always reports enoent.
+  auto* coll = static_cast<SeastoreCollection*>(ch.get());
+  logger().debug("{} {} {}", __func__, coll->get_cid(), oid);
+  return crimson::ct_error::enoent::make();
+}
+
+seastar::future<struct stat> stat( // NOTE(review): free function; SeaStore::stat with the same parameters is defined right below -- looks like a leftover, confirm
+ CollectionRef c,
+ const ghobject_t& oid)
+{
+ return seastar::make_ready_future<struct stat>();
+}
+
+
+seastar::future<struct stat> SeaStore::stat(
+  CollectionRef c,
+  const ghobject_t& oid)
+{
+  struct stat st{};  // stub: zero-filled result; previously left uninitialized, so callers read indeterminate bytes
+  return seastar::make_ready_future<struct stat>(st);
+}
+
+auto
+SeaStore::omap_get_header(
+  CollectionRef c,
+  const ghobject_t& oid)
+  -> read_errorator::future<bufferlist>
+{
+  return read_errorator::make_ready_future<bufferlist>();  // stub: empty header
+}
+
+auto
+SeaStore::omap_get_values(
+  CollectionRef ch,
+  const ghobject_t& oid,
+  const omap_keys_t& keys)
+  -> read_errorator::future<omap_values_t>
+{
+  // Stub: logs the request and returns an empty value map; `keys` is ignored.
+  auto* coll = static_cast<SeastoreCollection*>(ch.get());
+  logger().debug("{} {} {}", __func__, coll->get_cid(), oid);
+  return read_errorator::make_ready_future<omap_values_t>();
+}
+
+auto
+SeaStore::omap_get_values(
+  CollectionRef ch,
+  const ghobject_t &oid,
+  const std::optional<string> &start)
+  -> read_errorator::future<std::tuple<bool, SeaStore::omap_values_t>>
+{
+  // Stub: logs the request and returns (false, empty map); `start` is ignored.
+  auto* coll = static_cast<SeastoreCollection*>(ch.get());
+  logger().debug("{} {} {}", __func__, coll->get_cid(), oid);
+  using result_t = std::tuple<bool, omap_values_t>;
+  return seastar::make_ready_future<result_t>(
+    result_t(false, omap_values_t()));
+}
+
+seastar::future<FuturizedStore::OmapIteratorRef> get_omap_iterator( // NOTE(review): defined as a free function, not SeaStore::get_omap_iterator -- confirm
+ CollectionRef ch,
+ const ghobject_t& oid)
+{
+ return seastar::make_ready_future<FuturizedStore::OmapIteratorRef>(); // stub: value-initialized iterator ref
+}
+
+seastar::future<std::map<uint64_t, uint64_t>> fiemap( // NOTE(review): defined as a free function, not SeaStore::fiemap -- confirm
+ CollectionRef ch,
+ const ghobject_t& oid,
+ uint64_t off,
+ uint64_t len)
+{
+ return seastar::make_ready_future<std::map<uint64_t, uint64_t>>(); // stub: empty extent map
+}
+
+seastar::future<> SeaStore::do_transaction(
+ CollectionRef _ch,
+ ceph::os::Transaction&& _t)
+{
+ return seastar::do_with( // keeps the iterator, transaction, onodes, ops and collection alive across the continuations
+ _t.begin(),
+ transaction_manager->create_transaction(),
+ std::vector<OnodeRef>(),
+ std::move(_t),
+ std::move(_ch),
+ [this](auto &iter, auto &trans, auto &onodes, auto &t, auto &ch) {
+ return onode_manager->get_or_create_onodes(
+ *trans, iter.get_objects()).safe_then(
+ [this, &iter, &trans, &onodes, &t, &ch](auto &&read_onodes) {
+ onodes = std::move(read_onodes);
+ return seastar::do_until(
+ [&iter]() { return iter.have_op(); }, // NOTE(review): seastar::do_until() stops once this returns true, so the loop is skipped while ops remain -- looks inverted, confirm
+ [this, &iter, &trans, &onodes, &t, &ch]() {
+ return _do_transaction_step(trans, ch, onodes, iter).safe_then(
+ [this, &trans] {
+ return transaction_manager->submit_transaction(std::move(trans)); // NOTE(review): trans is moved from on every step yet dereferenced by write_dirty() below -- confirm
+ }).handle_error(
+ // TODO: add errorator::do_until
+ crimson::ct_error::eagain::handle([]() {
+ // TODO retry
+ }),
+ write_ertr::all_same_way([&t](auto e) { // any other error: dump the transaction as JSON and abort
+ logger().error(" transaction dump:\n");
+ JSONFormatter f(true);
+ f.open_object_section("transaction");
+ t.dump(&f);
+ f.close_section();
+ std::stringstream str;
+ f.flush(str);
+ logger().error("{}", str.str());
+ abort();
+ }));
+ });
+ }).safe_then([this, &trans, &onodes]() {
+ return onode_manager->write_dirty(*trans, onodes);
+ }).safe_then([]() {
+ // TODO: complete transaction!
+ return;
+ }).handle_error(
+ write_ertr::all_same_way([&t](auto e) { // same dump-and-abort handling as inside the loop
+ logger().error(" transaction dump:\n");
+ JSONFormatter f(true);
+ f.open_object_section("transaction");
+ t.dump(&f);
+ f.close_section();
+ std::stringstream str;
+ f.flush(str);
+ logger().error("{}", str.str());
+ abort();
+ })).then([&t]() {
+ for (auto i : { // run each registered completion context with result 0
+ t.get_on_applied(),
+ t.get_on_commit(),
+ t.get_on_applied_sync()}) {
+ if (i) {
+ i->complete(0);
+ }
+ }
+ });
+ });
+}
+
+SeaStore::write_ertr::future<> SeaStore::_do_transaction_step( // decodes one op from the iterator and dispatches it to the matching _xxx() helper
+ TransactionRef &trans,
+ CollectionRef &col,
+ std::vector<OnodeRef> &onodes,
+ ceph::os::Transaction::iterator &i)
+{
+ auto get_onode = [&onodes](size_t i) -> OnodeRef& { // bounds-checked lookup; this i shadows the iterator parameter
+ ceph_assert(i < onodes.size());
+ return onodes[i];
+ };
+
+ using ceph::os::Transaction;
+ try {
+ switch (auto op = i.decode_op(); op->op) { // every case returns, so the break statements below are never reached
+ case Transaction::OP_NOP:
+ return write_ertr::now();
+ case Transaction::OP_REMOVE:
+ {
+ return _remove(trans, get_onode(op->oid));
+ }
+ break;
+ case Transaction::OP_TOUCH:
+ {
+ return _touch(trans, get_onode(op->oid));
+ }
+ break;
+ case Transaction::OP_WRITE:
+ {
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ uint32_t fadvise_flags = i.get_fadvise_flags();
+ ceph::bufferlist bl;
+ i.decode_bl(bl);
+ return _write(trans, get_onode(op->oid), off, len, bl, fadvise_flags);
+ }
+ break;
+ case Transaction::OP_TRUNCATE:
+ {
+ uint64_t off = op->off;
+ return _truncate(trans, get_onode(op->oid), off);
+ }
+ break;
+ case Transaction::OP_SETATTR:
+ {
+ std::string name = i.decode_string();
+ ceph::bufferlist bl;
+ i.decode_bl(bl);
+ std::map<std::string, bufferptr> to_set;
+ to_set[name] = bufferptr(bl.c_str(), bl.length());
+ return _setattrs(trans, get_onode(op->oid), to_set);
+ }
+ break;
+ case Transaction::OP_MKCOLL:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ return _create_collection(trans, cid, op->split_bits);
+ }
+ break;
+ case Transaction::OP_OMAP_SETKEYS:
+ {
+ std::map<std::string, ceph::bufferlist> aset;
+ i.decode_attrset(aset);
+ return _omap_set_values(trans, get_onode(op->oid), std::move(aset));
+ }
+ break;
+ case Transaction::OP_OMAP_SETHEADER:
+ {
+ ceph::bufferlist bl;
+ i.decode_bl(bl);
+ return _omap_set_header(trans, get_onode(op->oid), bl);
+ }
+ break;
+ case Transaction::OP_OMAP_RMKEYS:
+ {
+ omap_keys_t keys;
+ i.decode_keyset(keys);
+ return _omap_rmkeys(trans, get_onode(op->oid), keys);
+ }
+ break;
+ case Transaction::OP_OMAP_RMKEYRANGE:
+ {
+ string first, last;
+ first = i.decode_string();
+ last = i.decode_string();
+ return _omap_rmkeyrange(trans, get_onode(op->oid), first, last);
+ }
+ break;
+ case Transaction::OP_COLL_HINT:
+ {
+ ceph::bufferlist hint;
+ i.decode_bl(hint); // the hint is decoded and otherwise unused
+ return write_ertr::now();
+ }
+ default:
+ logger().error("bad op {}", static_cast<unsigned>(op->op));
+ return crimson::ct_error::input_output_error::make();
+ }
+ } catch (std::exception &e) { // exceptions thrown above are reported as input_output_error
+ logger().error("{} got exception {}", __func__, e);
+ return crimson::ct_error::input_output_error::make();
+ }
+}
+
+SeaStore::write_ertr::future<> SeaStore::_remove(
+ TransactionRef &trans,
+ OnodeRef &onode)
+{
+ logger().debug("{} onode={}",
+ __func__, *onode);
+ return write_ertr::now(); // stub: only logs, nothing is removed
+}
+
+SeaStore::write_ertr::future<> SeaStore::_touch(
+ TransactionRef &trans,
+ OnodeRef &onode)
+{
+ logger().debug("{} onode={}",
+ __func__, *onode);
+ return write_ertr::now(); // stub: only logs
+}
+
+SeaStore::write_ertr::future<> SeaStore::_write(
+ TransactionRef &trans,
+ OnodeRef &onode,
+ uint64_t offset, size_t len, const ceph::bufferlist& bl,
+ uint32_t fadvise_flags)
+{
+ logger().debug("{}: {} {} ~ {}",
+ __func__, *onode, offset, len);
+ assert(len == bl.length()); // checked in debug builds only
+
+/*
+ return onode_manager->get_or_create_onode(cid, oid).safe_then([=, &bl](auto ref) {
+ return;
+ }).handle_error(
+ crimson::ct_error::enoent::handle([]() {
+ return;
+ }),
+ OnodeManager::open_ertr::pass_further{}
+ );
+ */
+ return write_ertr::now(); // stub: only logs, the data in bl is not written anywhere
+}
+
+SeaStore::write_ertr::future<> SeaStore::_omap_set_values(
+ TransactionRef &trans,
+ OnodeRef &onode,
+ std::map<std::string, ceph::bufferlist> &&aset)
+{
+ logger().debug(
+ "{}: {} {} keys",
+ __func__, *onode, aset.size());
+
+ return write_ertr::now(); // stub: only logs the key count
+}
+
+SeaStore::write_ertr::future<> SeaStore::_omap_set_header(
+ TransactionRef &trans,
+ OnodeRef &onode,
+ const ceph::bufferlist &header)
+{
+ logger().debug(
+ "{}: {} {} bytes",
+ __func__, *onode, header.length());
+ return write_ertr::now(); // stub: only logs the header length
+}
+
+SeaStore::write_ertr::future<> SeaStore::_omap_rmkeys(
+ TransactionRef &trans,
+ OnodeRef &onode,
+ const omap_keys_t& aset)
+{
+ logger().debug(
+ "{} {} {} keys",
+ __func__, *onode, aset.size());
+ return write_ertr::now(); // stub: only logs the key count
+}
+
+SeaStore::write_ertr::future<> SeaStore::_omap_rmkeyrange(
+ TransactionRef &trans,
+ OnodeRef &onode,
+ const std::string &first,
+ const std::string &last)
+{
+ logger().debug(
+ "{} {} first={} last={}",
+ __func__, *onode, first, last);
+ return write_ertr::now(); // stub: only logs the requested range
+}
+
+SeaStore::write_ertr::future<> SeaStore::_truncate(
+ TransactionRef &trans,
+ OnodeRef &onode,
+ uint64_t size)
+{
+ logger().debug("{} onode={} size={}",
+ __func__, *onode, size);
+ return write_ertr::now(); // stub: only logs
+}
+
+SeaStore::write_ertr::future<> SeaStore::_setattrs(
+ TransactionRef &trans,
+ OnodeRef &onode,
+ std::map<std::string,bufferptr>& aset)
+{
+ logger().debug("{} onode={}",
+ __func__, *onode);
+ return write_ertr::now(); // stub: only logs, aset is not stored
+}
+
+SeaStore::write_ertr::future<> SeaStore::_create_collection(
+ TransactionRef &trans,
+ const coll_t& cid, int bits)
+{
+ return write_ertr::now(); // stub: nothing is created
+}
+
+boost::intrusive_ptr<SeastoreCollection> SeaStore::_get_collection(const coll_t& cid)
+{
+ return new SeastoreCollection{cid}; // a new object on every call; no lookup of existing collections
+}
+
+seastar::future<> SeaStore::write_meta(const std::string& key,
+ const std::string& value)
+{
+ return seastar::make_ready_future<>(); // stub: key and value are discarded
+}
+
+seastar::future<std::tuple<int, std::string>> SeaStore::read_meta(const std::string& key)
+{
+ return seastar::make_ready_future<std::tuple<int, std::string>>(
+ std::make_tuple(0, ""s)); // stub: always (0, empty string), whatever the key
+}
+
+uuid_d SeaStore::get_fsid() const
+{
+ return osd_fsid; // NOTE(review): nothing in seastore.cc assigns osd_fsid (mkfs() ignores its argument) -- confirm
+}
+
+}
diff --git a/src/crimson/os/seastore/seastore.h b/src/crimson/os/seastore/seastore.h
new file mode 100644
index 000000000..798442c34
--- /dev/null
+++ b/src/crimson/os/seastore/seastore.h
@@ -0,0 +1,181 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include <map>
+#include <typeinfo>
+#include <vector>
+
+#include <optional>
+#include <seastar/core/future.hh>
+
+#include "osd/osd_types.h"
+#include "include/uuid.h"
+
+#include "os/Transaction.h"
+#include "crimson/os/seastore/segment_cleaner.h"
+#include "crimson/os/futurized_store.h"
+#include "transaction.h"
+
+namespace crimson::os::seastore {
+
+class SeastoreCollection;
+class SegmentManager;
+class OnodeManager;
+class Onode;
+using OnodeRef = boost::intrusive_ptr<Onode>;
+class Journal;
+class LBAManager;
+class TransactionManager;
+class Cache;
+
+/**
+ * SeaStore
+ *
+ * FuturizedStore implementation declared final; every public operation
+ * below overrides the corresponding FuturizedStore virtual. The private
+ * section holds the owned sub-components and the per-operation helpers
+ * (_remove, _touch, _write, ...), each of which takes the transaction and
+ * target onode and returns a write_ertr future.
+ */
+class SeaStore final : public FuturizedStore {
+ // fsid reported by get_fsid()
+ uuid_d osd_fsid;
+
+public:
+
+ SeaStore(const std::string& path);
+ ~SeaStore() final;
+
+ // lifecycle
+ seastar::future<> stop() final;
+ seastar::future<> mount() final;
+ seastar::future<> umount() final;
+
+ seastar::future<> mkfs(uuid_d new_osd_fsid) final;
+ // store-wide statistics (distinct from the per-object stat() overload below)
+ seastar::future<store_statfs_t> stat() const final;
+
+ // object data and attribute reads
+ read_errorator::future<ceph::bufferlist> read(
+ CollectionRef c,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ uint32_t op_flags = 0) final;
+ read_errorator::future<ceph::bufferlist> readv(
+ CollectionRef c,
+ const ghobject_t& oid,
+ interval_set<uint64_t>& m,
+ uint32_t op_flags = 0) final;
+ get_attr_errorator::future<ceph::bufferptr> get_attr(
+ CollectionRef c,
+ const ghobject_t& oid,
+ std::string_view name) const final;
+ get_attrs_ertr::future<attrs_t> get_attrs(
+ CollectionRef c,
+ const ghobject_t& oid) final;
+
+ // per-object stat
+ seastar::future<struct stat> stat(
+ CollectionRef c,
+ const ghobject_t& oid) final;
+
+ // omap reads: by explicit key set
+ read_errorator::future<omap_values_t> omap_get_values(
+ CollectionRef c,
+ const ghobject_t& oid,
+ const omap_keys_t& keys) final;
+
+ /// Retrieves paged set of values > start (if present)
+ read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values(
+ CollectionRef c, ///< [in] collection
+ const ghobject_t &oid, ///< [in] oid
+ const std::optional<std::string> &start ///< [in] start, empty for begin
+ ) final; ///< @return <done, values> values.empty() iff done
+
+ read_errorator::future<bufferlist> omap_get_header(
+ CollectionRef c,
+ const ghobject_t& oid) final;
+
+ // resolves to <objects, ghobject_t>; NOTE(review): the second element is
+ // presumably the continuation cursor -- confirm against FuturizedStore
+ seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects(
+ CollectionRef c,
+ const ghobject_t& start,
+ const ghobject_t& end,
+ uint64_t limit) const final;
+
+ // collection management
+ seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final;
+ seastar::future<CollectionRef> open_collection(const coll_t& cid) final;
+ seastar::future<std::vector<coll_t>> list_collections() final;
+
+ // takes ownership of txn (rvalue reference)
+ seastar::future<> do_transaction(
+ CollectionRef ch,
+ ceph::os::Transaction&& txn) final;
+
+ seastar::future<OmapIteratorRef> get_omap_iterator(
+ CollectionRef ch,
+ const ghobject_t& oid) final;
+ seastar::future<std::map<uint64_t, uint64_t>> fiemap(
+ CollectionRef ch,
+ const ghobject_t& oid,
+ uint64_t off,
+ uint64_t len) final;
+
+ // store-level key/value metadata
+ seastar::future<> write_meta(const std::string& key,
+ const std::string& value) final;
+ seastar::future<std::tuple<int, std::string>> read_meta(const std::string& key) final;
+ uuid_d get_fsid() const final;
+
+ // fixed upper bound on attribute name length
+ unsigned get_max_attr_name_length() const final {
+ return 256;
+ }
+
+private:
+ // owned sub-components; most of these types are only forward declared in
+ // this header
+ std::unique_ptr<SegmentManager> segment_manager;
+ std::unique_ptr<SegmentCleaner> segment_cleaner;
+ std::unique_ptr<Cache> cache;
+ std::unique_ptr<Journal> journal;
+ std::unique_ptr<LBAManager> lba_manager;
+ std::unique_ptr<TransactionManager> transaction_manager;
+ std::unique_ptr<OnodeManager> onode_manager;
+
+
+ // error set shared by all mutation helpers: input_output_error only
+ using write_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ // handles the op currently referenced by iterator i
+ write_ertr::future<> _do_transaction_step(
+ TransactionRef &trans,
+ CollectionRef &col,
+ std::vector<OnodeRef> &onodes,
+ ceph::os::Transaction::iterator &i);
+
+ write_ertr::future<> _remove(
+ TransactionRef &trans,
+ OnodeRef &onode);
+ write_ertr::future<> _touch(
+ TransactionRef &trans,
+ OnodeRef &onode);
+ write_ertr::future<> _write(
+ TransactionRef &trans,
+ OnodeRef &onode,
+ uint64_t offset, size_t len, const ceph::bufferlist& bl,
+ uint32_t fadvise_flags);
+ write_ertr::future<> _omap_set_values(
+ TransactionRef &trans,
+ OnodeRef &onode,
+ std::map<std::string, ceph::bufferlist> &&aset);
+ write_ertr::future<> _omap_set_header(
+ TransactionRef &trans,
+ OnodeRef &onode,
+ const ceph::bufferlist &header);
+ write_ertr::future<> _omap_rmkeys(
+ TransactionRef &trans,
+ OnodeRef &onode,
+ const omap_keys_t& aset);
+ write_ertr::future<> _omap_rmkeyrange(
+ TransactionRef &trans,
+ OnodeRef &onode,
+ const std::string &first,
+ const std::string &last);
+ write_ertr::future<> _truncate(
+ TransactionRef &trans,
+ OnodeRef &onode, uint64_t size);
+ write_ertr::future<> _setattrs(
+ TransactionRef &trans,
+ OnodeRef &onode,
+ std::map<std::string,bufferptr>& aset);
+ write_ertr::future<> _create_collection(
+ TransactionRef &trans,
+ const coll_t& cid, int bits);
+
+ boost::intrusive_ptr<SeastoreCollection> _get_collection(const coll_t& cid);
+};
+
+}
diff --git a/src/crimson/os/seastore/seastore_types.cc b/src/crimson/os/seastore/seastore_types.cc
new file mode 100644
index 000000000..ff43b1e51
--- /dev/null
+++ b/src/crimson/os/seastore/seastore_types.cc
@@ -0,0 +1,105 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/os/seastore/seastore_types.h"
+
+namespace crimson::os::seastore {
+
+/// Writes a segment id to out, substituting a symbolic name for each of
+/// the reserved sentinel ids and the raw number for everything else.
+std::ostream &segment_to_stream(std::ostream &out, const segment_id_t &t)
+{
+  switch (t) {
+  case NULL_SEG_ID:
+    out << "NULL_SEG";
+    break;
+  case BLOCK_REL_SEG_ID:
+    out << "BLOCK_REL_SEG";
+    break;
+  case RECORD_REL_SEG_ID:
+    out << "RECORD_REL_SEG";
+    break;
+  case FAKE_SEG_ID:
+    out << "FAKE_SEG";
+    break;
+  default:
+    out << t;
+    break;
+  }
+  return out;
+}
+
+/// Writes a segment offset to out; the NULL_SEG_OFF sentinel is rendered
+/// as "NULL_OFF", any other value as its number.
+std::ostream &offset_to_stream(std::ostream &out, const segment_off_t &t)
+{
+  if (t != NULL_SEG_OFF) {
+    out << t;
+  } else {
+    out << "NULL_OFF";
+  }
+  return out;
+}
+
+/// Prints a paddr_t as "paddr_t<SEGMENT, OFFSET>", using the symbolic
+/// segment/offset printers for each half.
+std::ostream &operator<<(std::ostream &out, const paddr_t &rhs)
+{
+  segment_to_stream(out << "paddr_t<", rhs.segment) << ", ";
+  return offset_to_stream(out, rhs.offset) << ">";
+}
+
+/// Prints a journal_seq_t as "journal_seq_t(segment_seq=N, offset=PADDR)".
+std::ostream &operator<<(std::ostream &out, const journal_seq_t &seq)
+{
+  out << "journal_seq_t(segment_seq=" << seq.segment_seq;
+  out << ", offset=" << seq.offset;
+  out << ")";
+  return out;
+}
+
+/// Prints the symbolic name of an extent type.
+///
+/// Every enumerator of extent_types_t has its own case; values outside
+/// the enum fall through to "UNKNOWN".
+std::ostream &operator<<(std::ostream &out, extent_types_t t)
+{
+  switch (t) {
+  case extent_types_t::ROOT:
+    return out << "ROOT";
+  case extent_types_t::LADDR_INTERNAL:
+    return out << "LADDR_INTERNAL";
+  case extent_types_t::LADDR_LEAF:
+    return out << "LADDR_LEAF";
+  // ONODE_BLOCK is a declared enumerator and previously had no case here,
+  // so it was reported as UNKNOWN.
+  case extent_types_t::ONODE_BLOCK:
+    return out << "ONODE_BLOCK";
+  case extent_types_t::EXTMAP_INNER:
+    return out << "EXTMAP_INNER";
+  case extent_types_t::EXTMAP_LEAF:
+    return out << "EXTMAP_LEAF";
+  case extent_types_t::ONODE_BLOCK_STAGED:
+    return out << "ONODE_BLOCK_STAGED";
+  case extent_types_t::TEST_BLOCK:
+    return out << "TEST_BLOCK";
+  case extent_types_t::TEST_BLOCK_PHYSICAL:
+    return out << "TEST_BLOCK_PHYSICAL";
+  case extent_types_t::NONE:
+    return out << "NONE";
+  default:
+    return out << "UNKNOWN";
+  }
+}
+
+/// Prints a laddr_list_t as "[(addr,len),(addr,len),...]"; an empty list
+/// prints as "[]".
+std::ostream &operator<<(std::ostream &out, const laddr_list_t &rhs)
+{
+  // The opening bracket is emitted unconditionally so the output stays
+  // balanced for an empty list; the comma goes between entries only.
+  out << '[';
+  bool first = true;
+  for (const auto &i: rhs) {
+    if (!first) {
+      out << ',';
+    }
+    out << '(' << i.first << ',' << i.second << ')';
+    first = false;
+  }
+  return out << ']';
+}
+/// Prints a paddr_list_t as "[(paddr,len),(paddr,len),...]"; an empty list
+/// prints as "[]".
+std::ostream &operator<<(std::ostream &out, const paddr_list_t &rhs)
+{
+  // The opening bracket is emitted unconditionally so the output stays
+  // balanced for an empty list; the comma goes between entries only.
+  out << '[';
+  bool first = true;
+  for (const auto &i: rhs) {
+    if (!first) {
+      out << ',';
+    }
+    out << '(' << i.first << ',' << i.second << ')';
+    first = false;
+  }
+  return out << ']';
+}
+
+/// Prints every field of a delta_info_t except the payload bufferlist.
+std::ostream &operator<<(std::ostream &lhs, const delta_info_t &rhs)
+{
+  lhs << "delta_info_t(";
+  lhs << "type: " << rhs.type;
+  lhs << ", paddr: " << rhs.paddr;
+  lhs << ", laddr: " << rhs.laddr;
+  lhs << ", prev_crc: " << rhs.prev_crc;
+  lhs << ", final_crc: " << rhs.final_crc;
+  lhs << ", length: " << rhs.length;
+  lhs << ", pversion: " << rhs.pversion;
+  lhs << ")";
+  return lhs;
+}
+
+}
diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h
new file mode 100644
index 000000000..cb8480268
--- /dev/null
+++ b/src/crimson/os/seastore/seastore_types.h
@@ -0,0 +1,369 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <limits>
+#include <iostream>
+
+#include "include/byteorder.h"
+#include "include/denc.h"
+#include "include/buffer.h"
+#include "include/cmp.h"
+#include "include/uuid.h"
+
+namespace crimson::os::seastore {
+
+using depth_t = int32_t;
+using depth_le_t = ceph_les32;
+
+using checksum_t = uint32_t;
+
+// Immutable metadata for seastore to set at mkfs time.
+// Currently carries a single uuid; encoded via DENC with struct version 1
+// and compat version 1.
+struct seastore_meta_t {
+ uuid_d seastore_id;
+
+ DENC(seastore_meta_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.seastore_id, p);
+ DENC_FINISH(p);
+ }
+};
+
+// Identifies segment location on disk, see SegmentManager.
+// The top of the uint32_t range is reserved for the sentinel ids below
+// (max-1 .. max-4); note that max itself is not assigned here.
+using segment_id_t = uint32_t;
+constexpr segment_id_t NULL_SEG_ID =
+ std::numeric_limits<segment_id_t>::max() - 1;
+/* Used to denote relative paddr_t */
+constexpr segment_id_t RECORD_REL_SEG_ID =
+ std::numeric_limits<segment_id_t>::max() - 2;
+constexpr segment_id_t BLOCK_REL_SEG_ID =
+ std::numeric_limits<segment_id_t>::max() - 3;
+
+// for tests which generate fake paddrs
+constexpr segment_id_t FAKE_SEG_ID =
+ std::numeric_limits<segment_id_t>::max() - 4;
+
+// Prints t, using a symbolic name for the sentinel ids above.
+std::ostream &segment_to_stream(std::ostream &, const segment_id_t &t);
+
+// Offset within a segment on disk, see SegmentManager
+// may be negative for relative offsets
+using segment_off_t = int32_t;
+// Sentinel for "no offset". It is taken from segment_off_t's own range:
+// deriving it from the unsigned segment_id_t limit does not fit in int32_t
+// and converts to -1, which is also a legal relative offset.
+constexpr segment_off_t NULL_SEG_OFF =
+  std::numeric_limits<segment_off_t>::max();
+
+std::ostream &offset_to_stream(std::ostream &, const segment_off_t &t);
+
+/* Monotonically increasing segment seq, uniquely identifies
+ * the incarnation of a segment. The maximum uint32_t value is
+ * reserved as the null sentinel. */
+using segment_seq_t = uint32_t;
+static constexpr segment_seq_t NULL_SEG_SEQ =
+ std::numeric_limits<segment_seq_t>::max();
+
+// Offset of delta within a record; the maximum value is the null sentinel
+using record_delta_idx_t = uint32_t;
+constexpr record_delta_idx_t NULL_DELTA_IDX =
+ std::numeric_limits<record_delta_idx_t>::max();
+
+/**
+ * paddr_t
+ *
+ * <segment, offset> offset on disk, see SegmentManager
+ *
+ * May be absolute, record_relative, or block_relative.
+ *
+ * Blocks get read independently of the surrounding record,
+ * so paddrs embedded directly within a block need to refer
+ * to other blocks within the same record by a block_relative
+ * addr relative to the block's own offset. By contrast,
+ * deltas to existing blocks need to use record_relative
+ * addrs relative to the first block of the record.
+ *
+ * Fresh extents during a transaction are referred to by
+ * record_relative paddrs.
+ */
+struct paddr_t {
+ // A default-constructed paddr_t is <NULL_SEG_ID, NULL_SEG_OFF>
+ segment_id_t segment = NULL_SEG_ID;
+ segment_off_t offset = NULL_SEG_OFF;
+
+ /// true iff segment is one of the two relative sentinel ids
+ bool is_relative() const {
+ return segment == RECORD_REL_SEG_ID ||
+ segment == BLOCK_REL_SEG_ID;
+ }
+
+ bool is_record_relative() const {
+ return segment == RECORD_REL_SEG_ID;
+ }
+
+ bool is_block_relative() const {
+ return segment == BLOCK_REL_SEG_ID;
+ }
+
+ /// Returns a copy with o added to offset; segment is unchanged.
+ paddr_t add_offset(segment_off_t o) const {
+ return paddr_t{segment, offset + o};
+ }
+
+ /// Resolves relative address o against *this: keeps this->segment and
+ /// adds o.offset. o must be relative (asserted).
+ paddr_t add_relative(paddr_t o) const {
+ assert(o.is_relative());
+ return paddr_t{segment, offset + o.offset};
+ }
+
+ paddr_t add_block_relative(paddr_t o) const {
+ // special version mainly for documentation purposes
+ assert(o.is_block_relative());
+ return add_relative(o);
+ }
+
+ paddr_t add_record_relative(paddr_t o) const {
+ // special version mainly for documentation purposes
+ assert(o.is_record_relative());
+ return add_relative(o);
+ }
+
+ /**
+ * paddr_t::operator-
+ *
+ * Only defined when both operands are relative and share the
+ * same segment id (asserted). Yields a block_relative address
+ * whose offset is the difference of the two offsets.
+ */
+ paddr_t operator-(paddr_t rhs) const {
+ assert(rhs.is_relative() && is_relative());
+ assert(rhs.segment == segment);
+ return paddr_t{
+ BLOCK_REL_SEG_ID,
+ offset - rhs.offset
+ };
+ }
+
+ /**
+ * maybe_relative_to
+ *
+ * Helper for the case where an in-memory paddr_t may be
+ * either block_relative or absolute (not record_relative).
+ *
+ * base must be either absolute or record_relative.
+ */
+ paddr_t maybe_relative_to(paddr_t base) const {
+ assert(!base.is_block_relative());
+ if (is_block_relative())
+ return base.add_block_relative(*this);
+ else
+ return *this;
+ }
+
+ // Encoded as segment followed by offset, struct version 1 / compat 1
+ DENC(paddr_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.segment, p);
+ denc(v.offset, p);
+ DENC_FINISH(p);
+ }
+};
+// Comparison and equality operators for paddr_t over (segment, offset)
+WRITE_CMP_OPERATORS_2(paddr_t, segment, offset)
+WRITE_EQ_OPERATORS_2(paddr_t, segment, offset)
+// Default-constructed paddr_t, i.e. <NULL_SEG_ID, NULL_SEG_OFF>
+constexpr paddr_t P_ADDR_NULL = paddr_t{};
+constexpr paddr_t P_ADDR_MIN = paddr_t{0, 0};
+// Factory helpers: build a paddr_t at offset off in the record-relative,
+// block-relative, or fake (test-only) segment id respectively.
+constexpr paddr_t make_record_relative_paddr(segment_off_t off) {
+ return paddr_t{RECORD_REL_SEG_ID, off};
+}
+constexpr paddr_t make_block_relative_paddr(segment_off_t off) {
+ return paddr_t{BLOCK_REL_SEG_ID, off};
+}
+constexpr paddr_t make_fake_paddr(segment_off_t off) {
+ return paddr_t{FAKE_SEG_ID, off};
+}
+
+// Counterpart of paddr_t that stores segment/offset in the ceph_le32 /
+// ceph_les32 wrapper types. Defaults to the same null values as paddr_t
+// and converts implicitly to and from paddr_t.
+struct paddr_le_t {
+ ceph_le32 segment = init_le32(NULL_SEG_ID);
+ ceph_les32 offset = init_les32(NULL_SEG_OFF);
+
+ paddr_le_t() = default;
+ paddr_le_t(ceph_le32 segment, ceph_les32 offset)
+ : segment(segment), offset(offset) {}
+ paddr_le_t(segment_id_t segment, segment_off_t offset)
+ : segment(init_le32(segment)), offset(init_les32(offset)) {}
+ // implicit conversion from the in-memory representation
+ paddr_le_t(const paddr_t &addr) : paddr_le_t(addr.segment, addr.offset) {}
+
+ // implicit conversion back to the in-memory representation
+ operator paddr_t() const {
+ return paddr_t{segment, offset};
+ }
+};
+
+std::ostream &operator<<(std::ostream &out, const paddr_t &rhs);
+
+using objaddr_t = uint32_t;
+constexpr objaddr_t OBJ_ADDR_MIN = std::numeric_limits<objaddr_t>::min();
+
+/* Monotonically increasing identifier for the location of a
+ * journal_record.
+ *
+ * Ordered by segment_seq first and then by offset (see the
+ * WRITE_CMP_OPERATORS_2 invocation below). A default-constructed value
+ * has segment_seq 0 and a null paddr offset.
+ */
+struct journal_seq_t {
+ segment_seq_t segment_seq = 0;
+ paddr_t offset;
+
+ DENC(journal_seq_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.segment_seq, p);
+ denc(v.offset, p);
+ DENC_FINISH(p);
+ }
+};
+WRITE_CMP_OPERATORS_2(journal_seq_t, segment_seq, offset)
+WRITE_EQ_OPERATORS_2(journal_seq_t, segment_seq, offset)
+
+std::ostream &operator<<(std::ostream &out, const journal_seq_t &seq);
+
+// Sentinel journal_seq_t built from the null segment seq and null paddr.
+static constexpr journal_seq_t NO_DELTAS = journal_seq_t{
+ NULL_SEG_SEQ,
+ P_ADDR_NULL
+};
+
+// logical addr, see LBAManager, TransactionManager
+// Note: L_ADDR_MAX and L_ADDR_NULL are the same value (uint64_t max);
+// L_ADDR_ROOT and L_ADDR_LBAT take the two values just below it.
+using laddr_t = uint64_t;
+constexpr laddr_t L_ADDR_MIN = std::numeric_limits<laddr_t>::min();
+constexpr laddr_t L_ADDR_MAX = std::numeric_limits<laddr_t>::max();
+constexpr laddr_t L_ADDR_NULL = std::numeric_limits<laddr_t>::max();
+constexpr laddr_t L_ADDR_ROOT = std::numeric_limits<laddr_t>::max() - 1;
+constexpr laddr_t L_ADDR_LBAT = std::numeric_limits<laddr_t>::max() - 2;
+
+// Counterpart of laddr_t stored in a ceph_le64; defaults to L_ADDR_NULL.
+// Construction from laddr_t is explicit, while conversion back to laddr_t
+// and assignment from laddr_t are implicit.
+struct laddr_le_t {
+  ceph_le64 laddr = init_le64(L_ADDR_NULL);
+
+  laddr_le_t() = default;
+  laddr_le_t(const laddr_le_t &) = default;
+  explicit laddr_le_t(const laddr_t &addr)
+    : laddr(init_le64(addr)) {}
+
+  operator laddr_t() const {
+    return laddr_t(laddr);
+  }
+
+  laddr_le_t& operator=(laddr_t addr) {
+    ceph_le64 converted;
+    converted = addr;
+    this->laddr = converted;
+    return *this;
+  }
+};
+
+// logical offset, see LBAManager, TransactionManager
+using extent_len_t = uint32_t;
+constexpr extent_len_t EXTENT_LEN_MAX =
+ std::numeric_limits<extent_len_t>::max();
+
+// ceph_le32 form of extent_len_t, plus a helper to build one from a length
+using extent_len_le_t = ceph_le32;
+inline extent_len_le_t init_extent_len_le_t(extent_len_t len) {
+ return init_le32(len);
+}
+
+// List of <laddr, length> pairs. A distinct type (rather than an alias)
+// so it can have its own operator<<; the variadic constructor forwards
+// all arguments to the underlying std::list constructor.
+struct laddr_list_t : std::list<std::pair<laddr_t, extent_len_t>> {
+ template <typename... T>
+ laddr_list_t(T&&... args)
+ : std::list<std::pair<laddr_t, extent_len_t>>(std::forward<T>(args)...) {}
+};
+// List of <paddr, length> pairs. A distinct type (rather than an alias)
+// so it can have its own operator<<; the variadic constructor forwards
+// all arguments to the underlying std::list constructor.
+struct paddr_list_t : std::list<std::pair<paddr_t, extent_len_t>> {
+ template <typename... T>
+ paddr_list_t(T&&... args)
+ : std::list<std::pair<paddr_t, extent_len_t>>(std::forward<T>(args)...) {}
+};
+
+std::ostream &operator<<(std::ostream &out, const laddr_list_t &rhs);
+std::ostream &operator<<(std::ostream &out, const paddr_list_t &rhs);
+
+/* identifies type of extent, used for interpreting deltas, managing
+ * writeback.
+ *
+ * Note that any new extent type needs to be added to
+ * Cache::get_extent_by_type in cache.cc
+ *
+ * Stored in a single byte (uint8_t underlying type); the test block types
+ * and NONE sit at the top of that range.
+ */
+enum class extent_types_t : uint8_t {
+ ROOT = 0,
+ LADDR_INTERNAL = 1,
+ LADDR_LEAF = 2,
+ ONODE_BLOCK = 3,
+ EXTMAP_INNER = 4,
+ EXTMAP_LEAF = 5,
+ ONODE_BLOCK_STAGED = 6,
+
+ // Test Block Types
+ TEST_BLOCK = 0xF0,
+ TEST_BLOCK_PHYSICAL = 0xF1,
+
+ // None
+ NONE = 0xFF
+};
+
+/// Returns false for ROOT, LADDR_INTERNAL and LADDR_LEAF, and true for
+/// every other value (including NONE and the test block types).
+inline bool is_logical_type(extent_types_t type) {
+  const bool is_physical =
+    type == extent_types_t::ROOT ||
+    type == extent_types_t::LADDR_INTERNAL ||
+    type == extent_types_t::LADDR_LEAF;
+  return !is_physical;
+}
+
+std::ostream &operator<<(std::ostream &out, extent_types_t t);
+
+/* description of a new physical extent; plain aggregate with no default
+ * member initializers, so callers must set every field */
+struct extent_t {
+ extent_types_t type; ///< type of extent
+ laddr_t addr; ///< laddr of extent (L_ADDR_NULL for non-logical)
+ ceph::bufferlist bl; ///< payload, bl.length() == length, aligned
+};
+
+using extent_version_t = uint32_t;
+constexpr extent_version_t EXTENT_VERSION_NULL = 0;
+
+/* description of a mutation to a physical extent
+ *
+ * Every field has a default so that a default-constructed delta_info_t
+ * can safely be compared, encoded and printed.
+ */
+struct delta_info_t {
+  extent_types_t type = extent_types_t::NONE;  ///< delta type
+  paddr_t paddr;                               ///< physical address
+  laddr_t laddr = L_ADDR_NULL;                 ///< logical address
+  uint32_t prev_crc = 0;
+  uint32_t final_crc = 0;
+  segment_off_t length = NULL_SEG_OFF;         ///< extent length
+  // Previously left uninitialized, which made operator==, the encoder and
+  // operator<< read an indeterminate value on a default-constructed object.
+  extent_version_t pversion = EXTENT_VERSION_NULL; ///< prior version
+  ceph::bufferlist bl;                         ///< payload
+
+  // Fields are encoded in declaration order, struct version 1 / compat 1
+  DENC(delta_info_t, v, p) {
+    DENC_START(1, 1, p);
+    denc(v.type, p);
+    denc(v.paddr, p);
+    denc(v.laddr, p);
+    denc(v.prev_crc, p);
+    denc(v.final_crc, p);
+    denc(v.length, p);
+    denc(v.pversion, p);
+    denc(v.bl, p);
+    DENC_FINISH(p);
+  }
+
+  /// Field-by-field equality, payload included.
+  bool operator==(const delta_info_t &rhs) const {
+    return (
+      type == rhs.type &&
+      paddr == rhs.paddr &&
+      laddr == rhs.laddr &&
+      prev_crc == rhs.prev_crc &&
+      final_crc == rhs.final_crc &&
+      length == rhs.length &&
+      pversion == rhs.pversion &&
+      bl == rhs.bl
+    );
+  }
+
+  friend std::ostream &operator<<(std::ostream &lhs, const delta_info_t &rhs);
+};
+
+std::ostream &operator<<(std::ostream &lhs, const delta_info_t &rhs);
+
+// A record groups a set of new extents with a set of deltas to existing
+// extents.
+struct record_t {
+ std::vector<extent_t> extents;
+ std::vector<delta_info_t> deltas;
+};
+
+}
+
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::seastore_meta_t)
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::paddr_t)
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::journal_seq_t)
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::delta_info_t)
diff --git a/src/crimson/os/seastore/segment_cleaner.cc b/src/crimson/os/seastore/segment_cleaner.cc
new file mode 100644
index 000000000..3597c21df
--- /dev/null
+++ b/src/crimson/os/seastore/segment_cleaner.cc
@@ -0,0 +1,340 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/common/log.h"
+
+#include "crimson/os/seastore/transaction.h"
+#include "crimson/os/seastore/segment_cleaner.h"
+
+namespace {
+ // File-local accessor for the logger used throughout this translation
+ // unit; messages go to the ceph_subsys_filestore subsystem.
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+}
+
+namespace crimson::os::seastore {
+
+/// Compares per-segment live byte counts against another tracker.
+///
+/// _other must actually be a SpaceTrackerSimple (the downcast is
+/// unchecked). Differing segment counts are treated as a caller bug:
+/// asserts in debug builds and returns false otherwise. Every mismatching
+/// segment is logged, not just the first.
+bool SpaceTrackerSimple::equals(const SpaceTrackerI &_other) const
+{
+  const auto &other = static_cast<const SpaceTrackerSimple&>(_other);
+
+  if (other.live_bytes_by_segment.size() != live_bytes_by_segment.size()) {
+    // The format string has one placeholder, so it needs its argument.
+    logger().error("{}: different segment counts, bug in test", __func__);
+    assert(0 == "segment counts should match");
+    return false;
+  }
+
+  bool all_match = true;
+  for (segment_id_t i = 0; i < live_bytes_by_segment.size(); ++i) {
+    if (other.live_bytes_by_segment[i] != live_bytes_by_segment[i]) {
+      all_match = false;
+      logger().debug(
+        "{}: segment_id {} live bytes mismatch *this: {}, other: {}",
+        __func__,
+        i,
+        live_bytes_by_segment[i],
+        other.live_bytes_by_segment[i]);
+    }
+  }
+  return all_match;
+}
+
+/// Marks the blocks covering [offset, offset + len) as allocated.
+///
+/// offset and len must be multiples of block_size (asserted). Blocks that
+/// are already allocated are logged (one error for the extent, one debug
+/// line per block) but still marked. Returns the result of update_usage.
+///
+/// NOTE(review): usage is now adjusted by len, the number of bytes whose
+/// blocks are flipped, rather than by a single block_size. This assumes
+/// update_usage(delta) adds a signed byte delta to the segment's running
+/// total -- confirm against its definition in segment_cleaner.h.
+int64_t SpaceTrackerDetailed::SegmentMap::allocate(
+  segment_id_t segment,
+  segment_off_t offset,
+  extent_len_t len,
+  const extent_len_t block_size)
+{
+  assert(offset % block_size == 0);
+  assert(len % block_size == 0);
+
+  const auto b = (offset / block_size);
+  const auto e = (offset + len) / block_size;
+
+  bool error = false;
+  for (auto i = b; i < e; ++i) {
+    if (bitmap[i]) {
+      if (!error) {
+        logger().error(
+          "SegmentMap::allocate found allocated in {}, {} ~ {}",
+          segment,
+          offset,
+          len);
+        error = true;
+      }
+      logger().debug(
+        "SegmentMap::allocate block {} allocated",
+        i * block_size);
+    }
+    bitmap[i] = true;
+  }
+  return update_usage(static_cast<int64_t>(len));
+}
+
+/// Marks the blocks covering [offset, offset + len) as free.
+///
+/// Mirror image of allocate(): same alignment requirements, blocks that
+/// are already free are logged but still cleared, and usage is reduced by
+/// len so that an allocate/release pair of the same extent cancels out.
+int64_t SpaceTrackerDetailed::SegmentMap::release(
+  segment_id_t segment,
+  segment_off_t offset,
+  extent_len_t len,
+  const extent_len_t block_size)
+{
+  assert(offset % block_size == 0);
+  assert(len % block_size == 0);
+
+  const auto b = (offset / block_size);
+  const auto e = (offset + len) / block_size;
+
+  bool error = false;
+  for (auto i = b; i < e; ++i) {
+    if (!bitmap[i]) {
+      if (!error) {
+        logger().error(
+          "SegmentMap::release found unallocated in {}, {} ~ {}",
+          segment,
+          offset,
+          len);
+        error = true;
+      }
+      logger().debug(
+        "SegmentMap::release block {} unallocated",
+        i * block_size);
+    }
+    bitmap[i] = false;
+  }
+  return update_usage(-static_cast<int64_t>(len));
+}
+
+/// Compares per-segment usage against another tracker.
+///
+/// _other must actually be a SpaceTrackerDetailed (the downcast is
+/// unchecked). Differing segment counts are treated as a caller bug:
+/// asserts in debug builds and returns false otherwise. Every mismatching
+/// segment is logged, not just the first.
+bool SpaceTrackerDetailed::equals(const SpaceTrackerI &_other) const
+{
+  const auto &other = static_cast<const SpaceTrackerDetailed&>(_other);
+
+  if (other.segment_usage.size() != segment_usage.size()) {
+    // The format string has one placeholder, so it needs its argument.
+    logger().error("{}: different segment counts, bug in test", __func__);
+    assert(0 == "segment counts should match");
+    return false;
+  }
+
+  bool all_match = true;
+  for (segment_id_t i = 0; i < segment_usage.size(); ++i) {
+    if (other.segment_usage[i].get_usage() != segment_usage[i].get_usage()) {
+      all_match = false;
+      logger().error(
+        "{}: segment_id {} live bytes mismatch *this: {}, other: {}",
+        __func__,
+        i,
+        segment_usage[i].get_usage(),
+        other.segment_usage[i].get_usage());
+    }
+  }
+  return all_match;
+}
+
+/// Logs the byte offset (block index * block_size) of every block whose
+/// bitmap entry is still set.
+void SpaceTrackerDetailed::SegmentMap::dump_usage(extent_len_t block_size) const
+{
+  for (unsigned blk = 0; blk < bitmap.size(); ++blk) {
+    if (!bitmap[blk]) {
+      continue;
+    }
+    logger().debug(" {} still live", blk * block_size);
+  }
+}
+
+/// Logs a header line for segment id, then delegates to that segment's
+/// SegmentMap to list its live blocks. id is used as an index unchecked.
+void SpaceTrackerDetailed::dump_usage(segment_id_t id) const
+{
+  logger().debug("SpaceTrackerDetailed::dump_usage {}", id);
+  const auto &segment_map = segment_usage[id];
+  segment_map.dump_usage(block_size);
+}
+
+/// Picks the lowest-indexed empty segment, marks it open and returns its
+/// id in a ready future. Running out of empty segments is not handled
+/// yet: it asserts, and with asserts compiled out segment 0 is returned.
+SegmentCleaner::get_segment_ret SegmentCleaner::get_segment()
+{
+  for (size_t i = 0; i < segments.size(); ++i) {
+    if (!segments[i].is_empty()) {
+      continue;
+    }
+    mark_open(i);
+    logger().debug("{}: returning segment {}", __func__, i);
+    return get_segment_ret(get_segment_ertr::ready_future_marker{}, i);
+  }
+
+  assert(0 == "out of space handling todo");
+  return get_segment_ret(get_segment_ertr::ready_future_marker{}, 0);
+}
+
+/// Moves journal_tail_target forward to target. The tail may only advance:
+/// a target behind the current (set) value trips the assert, and an equal
+/// target is a no-op. A default-constructed journal_tail_target counts as
+/// unset and accepts any target.
+void SegmentCleaner::update_journal_tail_target(journal_seq_t target)
+{
+  logger().debug("{}: {}", __func__, target);
+
+  const bool unset = (journal_tail_target == journal_seq_t());
+  assert(unset || target >= journal_tail_target);
+  if (unset || target > journal_tail_target) {
+    journal_tail_target = target;
+  }
+}
+
+/// Records a newly committed journal tail. journal_tail_committed is
+/// advanced when unset or behind committed; journal_tail_target is then
+/// pulled forward as well if it is unset or behind committed. Each
+/// advance is logged.
+void SegmentCleaner::update_journal_tail_committed(journal_seq_t committed)
+{
+  const bool advance_committed =
+    journal_tail_committed == journal_seq_t() ||
+    committed > journal_tail_committed;
+  if (advance_committed) {
+    logger().debug(
+      "{}: update journal_tail_committed {}", __func__, committed);
+    journal_tail_committed = committed;
+  }
+
+  const bool advance_target =
+    journal_tail_target == journal_seq_t() ||
+    committed > journal_tail_target;
+  if (advance_target) {
+    logger().debug(
+      "{}: update journal_tail_target {}", __func__, committed);
+    journal_tail_target = committed;
+  }
+}
+
+/// Thin wrapper: forwards to mark_closed() for the given segment.
+void SegmentCleaner::close_segment(segment_id_t segment)
+{
+  this->mark_closed(segment);
+}
+
+// Performs the cleaning work that must happen as part of transaction t:
+// first rewrites dirty extents if the journal tail target is behind the
+// dirty tail limit, then runs do_gc for get_immediate_bytes_to_gc() bytes.
+// Errors in do_immediate_work_ertr are passed to the caller; anything
+// else asserts.
+SegmentCleaner::do_immediate_work_ret SegmentCleaner::do_immediate_work(
+ Transaction &t)
+{
+ auto next_target = get_dirty_tail_limit();
+ logger().debug(
+ "{}: journal_tail_target={} get_dirty_tail_limit()={}",
+ __func__,
+ journal_tail_target,
+ next_target);
+
+ logger().debug(
+ "SegmentCleaner::do_immediate_work gc total {}, available {}, unavailable {}, used {} available_ratio {}, reclaim_ratio {}, bytes_to_gc_for_available {}, bytes_to_gc_for_reclaim {}",
+ get_total_bytes(),
+ get_available_bytes(),
+ get_unavailable_bytes(),
+ get_used_bytes(),
+ get_available_ratio(),
+ get_reclaim_ratio(),
+ get_immediate_bytes_to_gc_for_available(),
+ get_immediate_bytes_to_gc_for_reclaim());
+
+ // Starts out already resolved; replaced only when dirty extents need
+ // rewriting.
+ auto dirty_fut = do_immediate_work_ertr::now();
+ if (journal_tail_target < next_target) {
+ dirty_fut = rewrite_dirty(t, next_target);
+ }
+ // [=] captures this by pointer; t is captured by reference, so both
+ // must outlive the returned future.
+ return dirty_fut.safe_then([=, &t] {
+ return do_gc(t, get_immediate_bytes_to_gc());
+ }).handle_error(
+ do_immediate_work_ertr::pass_further{},
+ crimson::ct_error::assert_all{}
+ );
+}
+
+/// Placeholder: does no work and resolves immediately with a
+/// value-initialized ceph::timespan. The transaction is not used.
+SegmentCleaner::do_deferred_work_ret SegmentCleaner::do_deferred_work(
+  Transaction &t)
+{
+  using marker_t = do_deferred_work_ertr::ready_future_marker;
+  return do_deferred_work_ret(marker_t{}, ceph::timespan());
+}
+
+// Asks the extent callback for the next dirty extents up to limit and
+// rewrites each of them within transaction t. When the list is non-empty,
+// the journal tail target is first advanced to the dirty_from of the
+// front extent.
+SegmentCleaner::rewrite_dirty_ret SegmentCleaner::rewrite_dirty(
+ Transaction &t,
+ journal_seq_t limit)
+{
+ return ecb->get_next_dirty_extents(
+ limit
+ ).then([=, &t](auto dirty_list) {
+ if (dirty_list.empty()) {
+ return do_immediate_work_ertr::now();
+ } else {
+ update_journal_tail_target(dirty_list.front()->get_dirty_from());
+ }
+ // do_with keeps dirty_list alive until every rewrite has finished
+ return seastar::do_with(
+ std::move(dirty_list),
+ [this, &t](auto &dirty_list) {
+ return crimson::do_for_each(
+ dirty_list,
+ [this, &t](auto &e) {
+ // note: the message names do_immediate_work although it is
+ // emitted from rewrite_dirty
+ logger().debug(
+ "SegmentCleaner::do_immediate_work cleaning {}",
+ *e);
+ return ecb->rewrite_extent(t, e);
+ });
+ });
+ });
+}
+
+// Garbage-collects up to `bytes` bytes of the current gc target segment
+// as part of transaction t.
+//
+// A scan cursor persists across calls: if none exists, one is created at
+// offset 0 of the segment returned by get_next_gc_target(). The extents
+// found by scan_extents are checked for liveness; live ones are rewritten,
+// dead ones are skipped. Once the cursor reports completion, the segment
+// is marked for release in t and the cursor is dropped.
+SegmentCleaner::do_gc_ret SegmentCleaner::do_gc(
+ Transaction &t,
+ size_t bytes)
+{
+ if (bytes == 0) {
+ return do_gc_ertr::now();
+ }
+
+ if (!scan_cursor) {
+ paddr_t next = P_ADDR_NULL;
+ next.segment = get_next_gc_target();
+ // next.offset is still the null offset here, so this comparison is
+ // true exactly when get_next_gc_target() returned NULL_SEG_ID
+ if (next == P_ADDR_NULL) {
+ logger().debug(
+ "SegmentCleaner::do_gc: no segments to gc");
+ return do_gc_ertr::now();
+ }
+ next.offset = 0;
+ scan_cursor =
+ std::make_unique<ExtentCallbackInterface::scan_extents_cursor>(
+ next);
+ logger().debug(
+ "SegmentCleaner::do_gc: starting gc on segment {}",
+ scan_cursor->get_offset().segment);
+ }
+
+ return ecb->scan_extents(
+ *scan_cursor,
+ bytes
+ ).safe_then([=, &t](auto addrs) {
+ // do_with keeps the scanned address list alive for the whole loop
+ return seastar::do_with(
+ std::move(addrs),
+ [=, &t](auto &addr_list) {
+ return crimson::do_for_each(
+ addr_list,
+ [=, &t](auto &addr_pair) {
+ auto &[addr, info] = addr_pair;
+ logger().debug(
+ "SegmentCleaner::do_gc: checking addr {}",
+ addr);
+ return ecb->get_extent_if_live(
+ t,
+ info.type,
+ addr,
+ info.addr,
+ info.len
+ ).safe_then([addr=addr, &t, this](CachedExtentRef ext) {
+ // a null ref means the extent at addr is no longer live
+ if (!ext) {
+ logger().debug(
+ "SegmentCleaner::do_gc: addr {} dead, skipping",
+ addr);
+ return ExtentCallbackInterface::rewrite_extent_ertr::now();
+ } else {
+ logger().debug(
+ "SegmentCleaner::do_gc: addr {} alive, gc'ing {}",
+ addr,
+ *ext);
+ }
+ return ecb->rewrite_extent(
+ t,
+ ext);
+ });
+ }).safe_then([&t, this] {
+ // segment fully scanned: queue it for release and start over
+ // with a new target on the next call
+ if (scan_cursor->is_complete()) {
+ t.mark_segment_to_release(scan_cursor->get_offset().segment);
+ scan_cursor.reset();
+ }
+ return ExtentCallbackInterface::release_segment_ertr::now();
+ });
+ });
+ });
+}
+
+}
diff --git a/src/crimson/os/seastore/segment_cleaner.h b/src/crimson/os/seastore/segment_cleaner.h
new file mode 100644
index 000000000..38ebd05bc
--- /dev/null
+++ b/src/crimson/os/seastore/segment_cleaner.h
@@ -0,0 +1,691 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/intrusive/set.hpp>
+
+#include "common/ceph_time.h"
+
+#include "crimson/os/seastore/cached_extent.h"
+#include "crimson/os/seastore/journal.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/segment_manager.h"
+
+namespace crimson::os::seastore {
+class Transaction;
+
+/// Per-segment bookkeeping: lifecycle state plus journal sequence.
+struct segment_info_t {
+  Segment::segment_state_t state = Segment::segment_state_t::EMPTY;
+
+  // Stays NULL_SEG_SEQ until a journal sequence is recorded for the segment
+  segment_seq_t journal_segment_seq = NULL_SEG_SEQ;
+
+  /// True if a sequence is recorded and it is not older than tail_committed
+  bool is_in_journal(journal_seq_t tail_committed) const {
+    if (journal_segment_seq == NULL_SEG_SEQ) {
+      return false;
+    }
+    return tail_committed.segment_seq <= journal_segment_seq;
+  }
+
+  /// True while the segment is in the EMPTY state
+  bool is_empty() const {
+    return Segment::segment_state_t::EMPTY == state;
+  }
+
+  /// True while the segment is in the CLOSED state
+  bool is_closed() const {
+    return Segment::segment_state_t::CLOSED == state;
+  }
+
+  /// True while the segment is in the OPEN state
+  bool is_open() const {
+    return Segment::segment_state_t::OPEN == state;
+  }
+};
+
+/**
+ * SpaceTrackerI
+ *
+ * Interface for per-segment live space accounting.  SegmentCleaner
+ * calls allocate()/release() from mark_space_used()/mark_space_free()
+ * and reads get_usage() when choosing a gc target.
+ */
+class SpaceTrackerI {
+public:
+ /// Records [offset, offset+len) of segment as live.  SpaceTrackerSimple
+ /// returns the segment's resulting live byte count.
+ virtual int64_t allocate(
+ segment_id_t segment,
+ segment_off_t offset,
+ extent_len_t len) = 0;
+
+ /// Records [offset, offset+len) of segment as no longer live.
+ /// SpaceTrackerSimple returns the segment's resulting live byte count.
+ virtual int64_t release(
+ segment_id_t segment,
+ segment_off_t offset,
+ extent_len_t len) = 0;
+
+ /// Returns the usage currently recorded for segment
+ virtual int64_t get_usage(
+ segment_id_t segment) const = 0;
+
+ /// Compares the recorded state against another tracker
+ virtual bool equals(const SpaceTrackerI &other) const = 0;
+
+ /// Creates a new tracker of the same kind and dimensions
+ virtual std::unique_ptr<SpaceTrackerI> make_empty() const = 0;
+
+ /// Debugging aid; SpaceTrackerSimple implements it as a no-op
+ virtual void dump_usage(segment_id_t) const = 0;
+
+ /// Clears all recorded usage
+ virtual void reset() = 0;
+
+ virtual ~SpaceTrackerI() = default;
+};
+using SpaceTrackerIRef = std::unique_ptr<SpaceTrackerI>;
+
+/**
+ * SpaceTrackerSimple
+ *
+ * SpaceTrackerI implementation that only keeps one live byte counter
+ * per segment; the offset arguments are ignored.
+ */
+class SpaceTrackerSimple : public SpaceTrackerI {
+  // Tracks live space for each segment
+  std::vector<int64_t> live_bytes_by_segment;
+
+  /// Applies delta to segment's counter and returns the updated value.
+  /// The counter must never go negative.
+  int64_t update_usage(segment_id_t segment, int64_t delta) {
+    assert(segment < live_bytes_by_segment.size());
+    live_bytes_by_segment[segment] += delta;
+    assert(live_bytes_by_segment[segment] >= 0);
+    return live_bytes_by_segment[segment];
+  }
+public:
+  /// Creates a tracker for num_segments segments, all with zero usage
+  SpaceTrackerSimple(size_t num_segments)
+    : live_bytes_by_segment(num_segments, 0) {}
+
+  /// Adds len live bytes to segment; returns the resulting live byte count
+  int64_t allocate(
+    segment_id_t segment,
+    segment_off_t offset,
+    extent_len_t len) final {
+    return update_usage(segment, static_cast<int64_t>(len));
+  }
+
+  /// Removes len live bytes from segment; returns the resulting live
+  /// byte count
+  int64_t release(
+    segment_id_t segment,
+    segment_off_t offset,
+    extent_len_t len) final {
+    // Widen before negating: negating len in its own (possibly unsigned)
+    // type would wrap and yield a large positive delta
+    return update_usage(segment, -static_cast<int64_t>(len));
+  }
+
+  /// Returns the live byte count recorded for segment
+  int64_t get_usage(segment_id_t segment) const final {
+    assert(segment < live_bytes_by_segment.size());
+    return live_bytes_by_segment[segment];
+  }
+
+  /// Nothing to dump beyond the counter itself
+  void dump_usage(segment_id_t) const final {}
+
+  /// Zeroes every segment's counter
+  void reset() final {
+    for (auto &i: live_bytes_by_segment)
+      i = 0;
+  }
+
+  /// Returns a fresh tracker covering the same number of segments
+  SpaceTrackerIRef make_empty() const final {
+    return SpaceTrackerIRef(
+      new SpaceTrackerSimple(live_bytes_by_segment.size()));
+  }
+
+  bool equals(const SpaceTrackerI &other) const;
+};
+
+/**
+ * SpaceTrackerDetailed
+ *
+ * SpaceTrackerI implementation keeping, for every segment, a byte
+ * counter together with a per-block flag vector.
+ */
+class SpaceTrackerDetailed : public SpaceTrackerI {
+  /// Accounting for one segment: byte counter plus one flag per block
+  class SegmentMap {
+    int64_t used = 0;
+    std::vector<bool> bitmap;
+
+  public:
+    SegmentMap(size_t blocks) : bitmap(blocks, false) {}
+
+    /// Adds delta to the byte counter and returns the new value
+    int64_t update_usage(int64_t delta) {
+      return used += delta;
+    }
+
+    int64_t allocate(
+      segment_id_t segment,
+      segment_off_t offset,
+      extent_len_t len,
+      const extent_len_t block_size);
+
+    int64_t release(
+      segment_id_t segment,
+      segment_off_t offset,
+      extent_len_t len,
+      const extent_len_t block_size);
+
+    /// Current value of the byte counter
+    int64_t get_usage() const {
+      return used;
+    }
+
+    void dump_usage(extent_len_t block_size) const;
+
+    /// Clears the counter and every block flag
+    void reset() {
+      used = 0;
+      bitmap.assign(bitmap.size(), false);
+    }
+  };
+  const size_t block_size;
+  const size_t segment_size;
+
+  // One SegmentMap per segment
+  std::vector<SegmentMap> segment_usage;
+
+public:
+  /// Creates num_segments maps of segment_size / block_size blocks each
+  SpaceTrackerDetailed(size_t num_segments, size_t segment_size, size_t block_size)
+    : block_size(block_size),
+      segment_size(segment_size),
+      segment_usage(num_segments, segment_size / block_size) {}
+
+  /// Forwards the allocation to the map of the addressed segment
+  int64_t allocate(
+    segment_id_t segment,
+    segment_off_t offset,
+    extent_len_t len) final {
+    assert(segment < segment_usage.size());
+    auto &target = segment_usage[segment];
+    return target.allocate(segment, offset, len, block_size);
+  }
+
+  /// Forwards the release to the map of the addressed segment
+  int64_t release(
+    segment_id_t segment,
+    segment_off_t offset,
+    extent_len_t len) final {
+    assert(segment < segment_usage.size());
+    auto &target = segment_usage[segment];
+    return target.release(segment, offset, len, block_size);
+  }
+
+  /// Byte counter of the addressed segment
+  int64_t get_usage(segment_id_t segment) const final {
+    assert(segment < segment_usage.size());
+    const auto &target = segment_usage[segment];
+    return target.get_usage();
+  }
+
+  void dump_usage(segment_id_t seg) const final;
+
+  /// Resets every segment's map
+  void reset() final {
+    for (auto &seg_map : segment_usage) {
+      seg_map.reset();
+    }
+  }
+
+  /// Returns a fresh tracker with identical dimensions
+  SpaceTrackerIRef make_empty() const final {
+    return std::make_unique<SpaceTrackerDetailed>(
+      segment_usage.size(),
+      segment_size,
+      block_size);
+  }
+
+  bool equals(const SpaceTrackerI &other) const;
+};
+
+
+class SegmentCleaner : public JournalSegmentProvider {
+public:
+ /// Config
+ struct config_t {
+   size_t num_segments = 0;
+   size_t segment_size = 0;
+   size_t block_size = 0;
+   size_t target_journal_segments = 0;
+   size_t max_journal_segments = 0;
+
+   double reclaim_ratio_hard_limit = 0;
+   // don't apply reclaim ratio with available space below this
+   double reclaim_ratio_usage_min = 0;
+
+   double available_ratio_hard_limit = 0;
+
+   /// Builds a config from the manager's geometry plus fixed tunables
+   static config_t default_from_segment_manager(
+     SegmentManager &manager) {
+     config_t ret;
+     // geometry reported by the segment manager
+     ret.num_segments = manager.get_num_segments();
+     ret.segment_size = static_cast<size_t>(manager.get_segment_size());
+     ret.block_size = static_cast<size_t>(manager.get_block_size());
+     // journal length targets, in segments
+     ret.target_journal_segments = 2;
+     ret.max_journal_segments = 4;
+     // gc thresholds
+     ret.reclaim_ratio_hard_limit = .5;
+     ret.reclaim_ratio_usage_min = .95;
+     ret.available_ratio_hard_limit = .2;
+     return ret;
+   }
+ };
+
+ /// Callback interface for querying and operating on segments
+ // SegmentCleaner holds a pointer to an implementation (ecb, installed
+ // via set_extent_callback()) and uses it from do_gc() and the
+ // dirty-extent rewrite path.
+ class ExtentCallbackInterface {
+ public:
+ virtual ~ExtentCallbackInterface() = default;
+ /**
+ * get_next_dirty_extents
+ *
+ * returns all extents with dirty_from < bound
+ */
+ using get_next_dirty_extents_ertr = crimson::errorator<>;
+ using get_next_dirty_extents_ret = get_next_dirty_extents_ertr::future<
+ std::vector<CachedExtentRef>>;
+ virtual get_next_dirty_extents_ret get_next_dirty_extents(
+ journal_seq_t bound ///< [in] return extents with dirty_from < bound
+ ) = 0;
+
+ /**
+ * rewrite_extent
+ *
+ * Updates t with operations moving the passed extents to a new
+ * segment. extent may be invalid, implementation must correctly
+ * handle finding the current instance if it is still alive and
+ * otherwise ignore it.
+ */
+ using rewrite_extent_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using rewrite_extent_ret = rewrite_extent_ertr::future<>;
+ virtual rewrite_extent_ret rewrite_extent(
+ Transaction &t,
+ CachedExtentRef extent) = 0;
+
+ /**
+ * get_extent_if_live
+ *
+ * Returns extent at specified location if still referenced by
+ * lba_manager and not removed by t.
+ *
+ * See TransactionManager::get_extent_if_live and
+ * LBAManager::get_physical_extent_if_live.
+ *
+ * do_gc() treats a null CachedExtentRef result as "not live".
+ */
+ using get_extent_if_live_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using get_extent_if_live_ret = get_extent_if_live_ertr::future<
+ CachedExtentRef>;
+ virtual get_extent_if_live_ret get_extent_if_live(
+ Transaction &t,
+ extent_types_t type,
+ paddr_t addr,
+ laddr_t laddr,
+ segment_off_t len) = 0;
+
+ /**
+ * scan_extents
+ *
+ * Interface shim for Journal::scan_extents
+ */
+ using scan_extents_cursor = Journal::scan_valid_records_cursor;
+ using scan_extents_ertr = Journal::scan_extents_ertr;
+ using scan_extents_ret = Journal::scan_extents_ret;
+ virtual scan_extents_ret scan_extents(
+ scan_extents_cursor &cursor,
+ extent_len_t bytes_to_read) = 0;
+
+ /**
+ * release_segment
+ *
+ * Release segment.
+ */
+ using release_segment_ertr = SegmentManager::release_ertr;
+ using release_segment_ret = release_segment_ertr::future<>;
+ virtual release_segment_ret release_segment(
+ segment_id_t id) = 0;
+ };
+
+private:
+ const config_t config;
+
+ SpaceTrackerIRef space_tracker;
+ std::vector<segment_info_t> segments;
+ size_t empty_segments;
+ int64_t used_bytes = 0;
+ bool init_complete = false;
+
+ journal_seq_t journal_tail_target;
+ journal_seq_t journal_tail_committed;
+ journal_seq_t journal_head;
+
+ ExtentCallbackInterface *ecb = nullptr;
+
+public:
+ /// Builds a cleaner for config.num_segments segments, all initially
+ /// counted as empty.  detailed selects SpaceTrackerDetailed instead of
+ /// SpaceTrackerSimple for space accounting.
+ SegmentCleaner(config_t config, bool detailed = false)
+   : config(config),
+     space_tracker(
+       detailed
+         ? static_cast<SpaceTrackerI*>(
+             new SpaceTrackerDetailed(
+               config.num_segments,
+               config.segment_size,
+               config.block_size))
+         : static_cast<SpaceTrackerI*>(
+             new SpaceTrackerSimple(
+               config.num_segments))),
+     segments(config.num_segments),
+     empty_segments(config.num_segments) {}
+
+ get_segment_ret get_segment() final;
+
+ void close_segment(segment_id_t segment) final;
+
+ /// Records seq as the journal sequence of segment, which is expected
+ /// to be OPEN at this point.
+ void set_journal_segment(
+ segment_id_t segment, segment_seq_t seq) final {
+ assert(segment < segments.size());
+ segments[segment].journal_segment_seq = seq;
+ assert(segments[segment].is_open());
+ }
+
+ /// Returns the stored journal_tail_target
+ journal_seq_t get_journal_tail_target() const final {
+ return journal_tail_target;
+ }
+
+ void update_journal_tail_committed(journal_seq_t committed) final;
+
+ void update_journal_tail_target(journal_seq_t target);
+
+ /// Sets both the target and the committed journal tail to tail
+ void init_journal_tail(journal_seq_t tail) {
+ journal_tail_target = journal_tail_committed = tail;
+ }
+
+ /// Updates journal_head.  Once a head has been set (i.e. it differs
+ /// from a default-constructed journal_seq_t) it may not move backwards.
+ void set_journal_head(journal_seq_t head) {
+ assert(journal_head == journal_seq_t() || head >= journal_head);
+ journal_head = head;
+ }
+
+ /// Marks segment CLOSED via mark_closed() and records seq as its
+ /// journal sequence.
+ void init_mark_segment_closed(segment_id_t segment, segment_seq_t seq) final {
+ crimson::get_logger(ceph_subsys_filestore).debug(
+ "SegmentCleaner::init_mark_segment_closed: segment {}, seq {}",
+ segment,
+ seq);
+ // mark_closed() asserts that segment is in range before it is indexed
+ mark_closed(segment);
+ segments[segment].journal_segment_seq = seq;
+ }
+
+ /// Returns the journal sequence recorded for id.
+ /// NOTE(review): id is not range-checked here, unlike in the
+ /// neighbouring accessors.
+ segment_seq_t get_seq(segment_id_t id) final {
+ return segments[id].journal_segment_seq;
+ }
+
+ /// Public entry point for mark_empty(): returns segment to the EMPTY state
+ void mark_segment_released(segment_id_t segment) {
+ return mark_empty(segment);
+ }
+
+ /// Accounts len bytes at addr as live.  Calls made before
+ /// complete_init() are ignored unless init_scan is set; calls made
+ /// without init_scan expect the addressed segment to be OPEN.
+ void mark_space_used(
+   paddr_t addr,
+   extent_len_t len,
+   bool init_scan = false) {
+   assert(addr.segment < segments.size());
+
+   if (!init_scan) {
+     // regular updates are dropped until initialization has finished
+     if (!init_complete) {
+       return;
+     }
+     assert(segments[addr.segment].state == Segment::segment_state_t::OPEN);
+   }
+
+   used_bytes += len;
+   [[maybe_unused]] auto ret = space_tracker->allocate(
+     addr.segment, addr.offset, len);
+   assert(ret > 0);
+ }
+
+ /// Accounts len bytes at addr as no longer live.  Does nothing until
+ /// complete_init() has been called.
+ void mark_space_free(
+   paddr_t addr,
+   extent_len_t len) {
+   if (init_complete) {
+     used_bytes -= len;
+     assert(addr.segment < segments.size());
+
+     [[maybe_unused]] auto ret = space_tracker->release(
+       addr.segment, addr.offset, len);
+     assert(ret >= 0);
+   }
+ }
+
+ /// Picks the CLOSED segment outside the journal with the least
+ /// recorded usage (first one wins on ties); NULL_SEG_ID if none
+ /// qualifies.
+ segment_id_t get_next_gc_target() const {
+   segment_id_t best = NULL_SEG_ID;
+   int64_t best_usage = std::numeric_limits<int64_t>::max();
+   for (segment_id_t i = 0; i < segments.size(); ++i) {
+     const auto &candidate = segments[i];
+     if (!candidate.is_closed()) {
+       continue;
+     }
+     if (candidate.is_in_journal(journal_tail_committed)) {
+       continue;
+     }
+     const int64_t usage = space_tracker->get_usage(i);
+     if (usage < best_usage) {
+       best = i;
+       best_usage = usage;
+     }
+   }
+   if (best != NULL_SEG_ID) {
+     crimson::get_logger(ceph_subsys_filestore).debug(
+       "SegmentCleaner::get_next_gc_target: segment {} seq {}",
+       best,
+       segments[best].journal_segment_seq);
+   }
+   return best;
+ }
+
+ /// Returns a new, empty tracker created by the active space_tracker's
+ /// make_empty()
+ SpaceTrackerIRef get_empty_space_tracker() const {
+ return space_tracker->make_empty();
+ }
+
+ /// Sets init_complete; mark_space_used()/mark_space_free() and
+ /// mark_closed() behave differently before and after this call.
+ void complete_init() { init_complete = true; }
+
+ /// Installs the callback object used by the gc and rewrite paths.  Only
+ /// the raw pointer is stored, so cb is not owned by the cleaner.
+ void set_extent_callback(ExtentCallbackInterface *cb) {
+ ecb = cb;
+ }
+
+ /// Returns whether the active space_tracker equals() tracker
+ bool debug_check_space(const SpaceTrackerI &tracker) {
+ return space_tracker->equals(tracker);
+ }
+
+ /**
+ * do_immediate_work
+ *
+ * Should be invoked prior to submission of any transaction,
+ * will piggy-back work required to maintain deferred work
+ * constraints.
+ */
+ using do_immediate_work_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using do_immediate_work_ret = do_immediate_work_ertr::future<>;
+ do_immediate_work_ret do_immediate_work(
+ Transaction &t);
+
+
+ /**
+ * do_deferred_work
+ *
+ * Should be called at idle times -- will perform background
+ * operations based on deferred work constraints.
+ *
+ * If returned timespan is non-zero, caller should pause calling
+ * back into do_deferred_work before returned timespan has elapsed,
+ * or a foreground operation occurs.
+ */
+ using do_deferred_work_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using do_deferred_work_ret = do_deferred_work_ertr::future<
+ ceph::timespan
+ >;
+ do_deferred_work_ret do_deferred_work(
+ Transaction &t);
+
+private:
+
+ // journal status helpers
+
+ /**
+ * rewrite_dirty
+ *
+ * Writes out dirty blocks dirtied earlier than limit.
+ */
+ using rewrite_dirty_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using rewrite_dirty_ret = rewrite_dirty_ertr::future<>;
+ rewrite_dirty_ret rewrite_dirty(
+ Transaction &t,
+ journal_seq_t limit);
+
+ /// journal_head moved back by target_journal_segments segments,
+ /// without letting segment_seq drop below zero
+ journal_seq_t get_dirty_tail() const {
+   journal_seq_t tail = journal_head;
+   const size_t head_seq = static_cast<size_t>(tail.segment_seq);
+   const size_t step = std::min(head_seq, config.target_journal_segments);
+   tail.segment_seq -= step;
+   return tail;
+ }
+
+ /// journal_head moved back by max_journal_segments segments,
+ /// without letting segment_seq drop below zero
+ journal_seq_t get_dirty_tail_limit() const {
+   journal_seq_t limit = journal_head;
+   const size_t head_seq = static_cast<size_t>(limit.segment_seq);
+   const size_t step = std::min(head_seq, config.max_journal_segments);
+   limit.segment_seq -= step;
+   return limit;
+ }
+
+ // GC status helpers
+ std::unique_ptr<ExtentCallbackInterface::scan_extents_cursor> scan_cursor;
+
+ /**
+ * do_gc
+ *
+ * Performs bytes worth of gc work on t.
+ */
+ using do_gc_ertr = SegmentManager::read_ertr;
+ using do_gc_ret = do_gc_ertr::future<>;
+ do_gc_ret do_gc(
+ Transaction &t,
+ size_t bytes);
+
+ /// Offset of journal_head within its segment.  Requires that
+ /// set_journal_head() has been called with a non-default value.
+ size_t get_bytes_used_current_segment() const {
+ assert(journal_head != journal_seq_t());
+ return journal_head.offset.offset;
+ }
+
+ /// Segment size minus the journal head's offset within its segment
+ size_t get_bytes_available_current_segment() const {
+ return config.segment_size - get_bytes_used_current_segment();
+ }
+
+ /**
+ * get_bytes_scanned_current_segment
+ *
+ * Returns the number of bytes from the current gc segment that
+ * have been scanned.
+ */
+ size_t get_bytes_scanned_current_segment() const {
+   if (scan_cursor) {
+     // offset the gc scan has reached within its segment
+     return scan_cursor->get_offset().offset;
+   }
+   // no gc scan in progress
+   return 0;
+ }
+
+ /// Sum of: all empty segments, the space left in the segment holding
+ /// the journal head, and the already scanned part of the current gc
+ /// segment.
+ size_t get_available_bytes() const {
+ return (empty_segments * config.segment_size) +
+ get_bytes_available_current_segment() +
+ get_bytes_scanned_current_segment();
+ }
+
+ /// segment_size * num_segments from the config
+ size_t get_total_bytes() const {
+ return config.segment_size * config.num_segments;
+ }
+
+ /// Total bytes minus available bytes
+ size_t get_unavailable_bytes() const {
+ return get_total_bytes() - get_available_bytes();
+ }
+
+ /// Returns bytes currently occupied by live extents (not journal).
+ /// The counter is maintained by mark_space_used()/mark_space_free().
+ size_t get_used_bytes() const {
+ return used_bytes;
+ }
+
+ /// Returns the number of bytes in unavailable segments that are not live.
+ /// NOTE(review): unsigned subtraction -- this wraps if used bytes ever
+ /// exceed unavailable bytes; confirm that cannot happen.
+ size_t get_reclaimable_bytes() const {
+ return get_unavailable_bytes() - get_used_bytes();
+ }
+
+ /**
+ * get_reclaim_ratio
+ *
+ * Returns the ratio of unavailable space that is not currently used.
+ */
+ double get_reclaim_ratio() const {
+   const size_t unavailable = get_unavailable_bytes();
+   // avoid dividing by zero when nothing is unavailable
+   if (unavailable == 0) {
+     return 0;
+   }
+   const size_t reclaimable = get_reclaimable_bytes();
+   return static_cast<double>(reclaimable) / static_cast<double>(unavailable);
+ }
+
+ /**
+ * get_available_ratio
+ *
+ * Returns ratio of available space to write to total space
+ */
+ // NOTE(review): divides by get_total_bytes(), which is 0 for a
+ // default-constructed config_t -- confirm callers always configure it.
+ double get_available_ratio() const {
+ return (double)get_available_bytes() / (double)get_total_bytes();
+ }
+
+ /**
+ * get_immediate_bytes_to_gc_for_reclaim
+ *
+ * Returns the number of bytes to gc in order to bring the
+ * reclaim ratio back below reclaim_ratio_hard_limit.  Returns 0 while
+ * the ratio is under that limit, or while unavailable space is still
+ * below the computed target; reclaim_ratio_usage_min puts a floor of
+ * (1 - reclaim_ratio_usage_min) * total bytes under that target.
+ */
+ size_t get_immediate_bytes_to_gc_for_reclaim() const {
+ if (get_reclaim_ratio() < config.reclaim_ratio_hard_limit)
+ return 0;
+
+ // Amount of unavailable space at which the reclaim ratio would equal
+ // the hard limit for the current used bytes, but never less than the
+ // usage_min floor.
+ // NOTE(review): a hard limit of 1.0 makes the first operand divide by
+ // zero -- confirm the config never uses it.
+ const size_t unavailable_target = std::max(
+ get_used_bytes() / (1.0 - config.reclaim_ratio_hard_limit),
+ (1 - config.reclaim_ratio_usage_min) * get_total_bytes());
+
+ if (unavailable_target > get_unavailable_bytes())
+ return 0;
+
+ // Scale the excess by the reclaim ratio to get bytes to scan
+ return (get_unavailable_bytes() - unavailable_target) / get_reclaim_ratio();
+ }
+
+ /**
+ * get_immediate_bytes_to_gc_for_available
+ *
+ * Returns the number of bytes to gc in order to bring
+ * the ratio of available disk space to total disk space above
+ * available_ratio_hard_limit.
+ *
+ * Returns 0 if the available ratio is already above the limit, and
+ * also if the reclaim ratio is 0: in that case no amount of gc can
+ * free space, and dividing by it would produce a non-finite value
+ * whose conversion to size_t is undefined.
+ */
+ size_t get_immediate_bytes_to_gc_for_available() const {
+   const double available_ratio = get_available_ratio();
+   if (available_ratio > config.available_ratio_hard_limit) {
+     return 0;
+   }
+
+   // Nothing reclaimable: gc cannot help, and the division below would
+   // be by zero
+   const double reclaim_ratio = get_reclaim_ratio();
+   if (reclaim_ratio <= 0) {
+     return 0;
+   }
+
+   const double ratio_to_make_available =
+     config.available_ratio_hard_limit - available_ratio;
+   // bytes that must become available, scaled up by the fraction of
+   // scanned space expected to be reclaimable
+   return ratio_to_make_available * (double)get_total_bytes()
+     / reclaim_ratio;
+ }
+
+ /**
+ * get_immediate_bytes_to_gc
+ *
+ * Returns number of bytes to gc in order to restore any strict
+ * limits.
+ */
+ size_t get_immediate_bytes_to_gc() const {
+   // the larger of what the reclaim ratio and the available ratio
+   // each demand
+   return std::max(
+     get_immediate_bytes_to_gc_for_reclaim(),
+     get_immediate_bytes_to_gc_for_available());
+ }
+
+ /// Moves segment to CLOSED.  After init it must currently be OPEN
+ /// (mark_open() already removed it from the empty count); before init
+ /// completes it must still be EMPTY and is removed from the empty
+ /// count here.
+ void mark_closed(segment_id_t segment) {
+ assert(segments.size() > segment);
+ if (init_complete) {
+ assert(segments[segment].is_open());
+ } else {
+ assert(segments[segment].is_empty());
+ assert(empty_segments > 0);
+ --empty_segments;
+ }
+ crimson::get_logger(ceph_subsys_filestore).debug(
+ "mark_closed: empty_segments: {}",
+ empty_segments);
+ segments[segment].state = Segment::segment_state_t::CLOSED;
+ }
+
+ /// Moves a CLOSED segment back to EMPTY and adds it to the empty
+ /// count.  The segment is expected to have no recorded usage left.
+ void mark_empty(segment_id_t segment) {
+ assert(segments.size() > segment);
+ assert(segments[segment].is_closed());
+ assert(segments.size() > empty_segments);
+ ++empty_segments;
+ // Dump the tracker's view before asserting so the failure is debuggable
+ if (space_tracker->get_usage(segment) != 0) {
+ space_tracker->dump_usage(segment);
+ assert(space_tracker->get_usage(segment) == 0);
+ }
+ segments[segment].state = Segment::segment_state_t::EMPTY;
+ }
+
+ /// Moves an EMPTY segment to OPEN and removes it from the empty count
+ void mark_open(segment_id_t segment) {
+ assert(segments.size() > segment);
+ assert(segments[segment].is_empty());
+ assert(empty_segments > 0);
+ --empty_segments;
+ segments[segment].state = Segment::segment_state_t::OPEN;
+ }
+};
+
+}
diff --git a/src/crimson/os/seastore/segment_manager.h b/src/crimson/os/seastore/segment_manager.h
new file mode 100644
index 000000000..61c6509d1
--- /dev/null
+++ b/src/crimson/os/seastore/segment_manager.h
@@ -0,0 +1,128 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iosfwd>
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <seastar/core/future.hh>
+
+#include "include/ceph_assert.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "include/buffer_fwd.h"
+#include "crimson/osd/exceptions.h"
+
+namespace crimson::os::seastore {
+
+/**
+ * Segment
+ *
+ * Abstract, intrusively ref-counted handle to a single segment that
+ * can be written to and closed.  Instances are handed out as
+ * SegmentRef by SegmentManager::open().
+ */
+class Segment : public boost::intrusive_ref_counter<
+ Segment,
+ boost::thread_unsafe_counter>{
+public:
+
+ /// Lifecycle state of a segment
+ enum class segment_state_t : uint8_t {
+ EMPTY = 0,
+ OPEN = 1,
+ CLOSED = 2
+ };
+
+ /**
+ * get_segment_id
+ */
+ virtual segment_id_t get_segment_id() const = 0;
+
+ /**
+ * min next write location
+ */
+ virtual segment_off_t get_write_ptr() const = 0;
+
+ /**
+ * max capacity
+ */
+ virtual segment_off_t get_write_capacity() const = 0;
+
+ /**
+ * close
+ *
+ * Closes segment for writes. Won't complete until
+ * outstanding writes to this segment are complete.
+ */
+ using close_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg,
+ crimson::ct_error::enoent>;
+ virtual close_ertr::future<> close() = 0;
+
+
+ /**
+ * write
+ *
+ * @param offset offset of write, must be aligned (BlockSegment::write
+ * rejects offsets that are not a multiple of the block size) and
+ * >= write pointer, advances write pointer
+ * @param bl buffer to write, will be padded if not aligned
+ */
+ using write_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error, // media error or corruption
+ crimson::ct_error::invarg, // if offset is < write pointer or misaligned
+ crimson::ct_error::ebadf, // segment closed
+ crimson::ct_error::enospc // write exceeds segment size
+ >;
+ virtual write_ertr::future<> write(
+ segment_off_t offset, ceph::bufferlist bl) = 0;
+
+ virtual ~Segment() {}
+};
+using SegmentRef = boost::intrusive_ptr<Segment>;
+
+constexpr size_t PADDR_SIZE = sizeof(paddr_t);
+
+/**
+ * SegmentManager
+ *
+ * Abstract interface to a device divided into equally sized segments:
+ * opening and releasing segments, reading by paddr_t, and querying the
+ * device geometry.
+ */
+class SegmentManager {
+public:
+ /// Opens segment id for writing and returns a handle to it
+ using open_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg,
+ crimson::ct_error::enoent>;
+ virtual open_ertr::future<SegmentRef> open(segment_id_t id) = 0;
+
+ /// Releases segment id
+ using release_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg,
+ crimson::ct_error::enoent>;
+ virtual release_ertr::future<> release(segment_id_t id) = 0;
+
+ /// Reads len bytes starting at addr into the caller supplied out
+ using read_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg,
+ crimson::ct_error::enoent,
+ crimson::ct_error::erange>;
+ virtual read_ertr::future<> read(
+ paddr_t addr,
+ size_t len,
+ ceph::bufferptr &out) = 0;
+ /// Convenience overload: allocates a page aligned buffer of len bytes,
+ /// reads into it and returns it.
+ read_ertr::future<ceph::bufferptr> read(
+ paddr_t addr,
+ size_t len) {
+ // The buffer lives on the heap so the reference handed to read()
+ // stays valid after ownership moves into the continuation
+ auto ptrref = std::make_unique<ceph::bufferptr>(
+ buffer::create_page_aligned(len));
+ return read(addr, len, *ptrref).safe_then(
+ [ptrref=std::move(ptrref)]() mutable {
+ return read_ertr::make_ready_future<bufferptr>(std::move(*ptrref));
+ });
+ }
+
+ /* Methods for discovering device geometry, segmentid set, etc */
+ virtual size_t get_size() const = 0;
+ virtual segment_off_t get_block_size() const = 0;
+ virtual segment_off_t get_segment_size() const = 0;
+ /// Default: get_size() / get_segment_size(), which must divide evenly
+ virtual segment_id_t get_num_segments() const {
+ ceph_assert(get_size() % get_segment_size() == 0);
+ return ((segment_id_t)(get_size() / get_segment_size()));
+ }
+ virtual const seastore_meta_t &get_meta() const = 0;
+
+ virtual ~SegmentManager() {}
+};
+using SegmentManagerRef = std::unique_ptr<SegmentManager>;
+
+}
diff --git a/src/crimson/os/seastore/segment_manager/block.cc b/src/crimson/os/seastore/segment_manager/block.cc
new file mode 100644
index 000000000..6a4991d42
--- /dev/null
+++ b/src/crimson/os/seastore/segment_manager/block.cc
@@ -0,0 +1,402 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <sys/mman.h>
+#include <string.h>
+
+#include "crimson/common/log.h"
+
+#include "include/buffer.h"
+#include "crimson/os/seastore/segment_manager/block.h"
+
+namespace {
+ // File-local shorthand for the logger used throughout this file
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+}
+
+
+namespace crimson::os::seastore::segment_manager::block {
+
+/**
+ * do_write
+ *
+ * Issues one dma_write of bptr's contents to device at offset.  An
+ * exception from dma_write, as well as a result that differs from
+ * bptr.length(), is reported as input_output_error.  bptr is passed by
+ * reference and must stay valid until the returned future resolves.
+ */
+static write_ertr::future<> do_write(
+ seastar::file &device,
+ uint64_t offset,
+ bufferptr &bptr)
+{
+ logger().debug(
+ "block: do_write offset {} len {}",
+ offset,
+ bptr.length());
+ return device.dma_write(
+ offset,
+ bptr.c_str(),
+ bptr.length()
+ ).handle_exception([](auto e) -> write_ertr::future<size_t> {
+ logger().error(
+ "do_write: dma_write got error {}",
+ e);
+ return crimson::ct_error::input_output_error::make();
+ }).then([length=bptr.length()](auto result)
+ -> write_ertr::future<> {
+ // a short write is treated as an I/O error
+ if (result != length) {
+ return crimson::ct_error::input_output_error::make();
+ }
+ return write_ertr::now();
+ });
+}
+
+/**
+ * do_read
+ *
+ * Issues one dma_read of bptr.length() bytes from device at offset
+ * into bptr.  An exception from dma_read, as well as a result that
+ * differs from bptr.length(), is reported as input_output_error.  bptr
+ * is passed by reference and must stay valid until the returned
+ * future resolves.
+ */
+static read_ertr::future<> do_read(
+ seastar::file &device,
+ uint64_t offset,
+ bufferptr &bptr)
+{
+ logger().debug(
+ "block: do_read offset {} len {}",
+ offset,
+ bptr.length());
+ return device.dma_read(
+ offset,
+ bptr.c_str(),
+ bptr.length()
+ ).handle_exception([](auto e) -> read_ertr::future<size_t> {
+ logger().error(
+ "do_read: dma_read got error {}",
+ e);
+ return crimson::ct_error::input_output_error::make();
+ }).then([length=bptr.length()](auto result) -> read_ertr::future<> {
+ // a short read is treated as an I/O error
+ if (result != length) {
+ return crimson::ct_error::input_output_error::make();
+ }
+ return read_ertr::now();
+ });
+}
+
+/// Writes the tracker's whole buffer (bptr) to device at offset
+write_ertr::future<>
+SegmentStateTracker::write_out(
+ seastar::file &device,
+ uint64_t offset)
+{
+ return do_write(device, offset, bptr);
+}
+
+/// Fills the tracker's buffer (bptr) from device at offset.
+/// NOTE(review): declared with write_ertr although it reads; the
+/// write_ertr and read_ertr aliases in block.h name the same errorator,
+/// so the conversion from do_read's result is a no-op.
+write_ertr::future<>
+SegmentStateTracker::read_in(
+ seastar::file &device,
+ uint64_t offset)
+{
+ return do_read(
+ device,
+ offset,
+ bptr);
+}
+
+/**
+ * make_superblock
+ *
+ * Derives the on-disk layout from the mkfs config and the device's
+ * stat data: block 0 is reserved, the segment state tracker starts at
+ * the second block, and the first segment follows the tracker.
+ */
+static
+block_sm_superblock_t make_superblock(
+  const BlockSegmentManager::mkfs_config_t &config,
+  const seastar::stat_data &data)
+{
+  logger().debug(
+    "{}: size {}, block_size {}, allocated_size {}, configured_size {}",
+    __func__,
+    data.size,
+    data.block_size,
+    data.allocated_size,
+    config.total_size);
+
+  // fall back to the configured size when stat reports none
+  size_t size = data.size;
+  if (data.size == 0) {
+    size = config.total_size;
+  }
+
+  // the tracker is sized for the segment count before any space is
+  // reserved for the tracker itself
+  const size_t raw_segments = size / config.segment_size;
+  const size_t tracker_size = SegmentStateTracker::get_raw_size(
+    raw_segments,
+    data.block_size);
+
+  // segments that fit once the tracker and one leading block are set aside
+  const size_t segments =
+    (size - tracker_size - data.block_size) / config.segment_size;
+
+  const auto tracker_offset = data.block_size;
+  const auto first_segment_offset = tracker_size + data.block_size;
+
+  return block_sm_superblock_t{
+    size,
+    config.segment_size,
+    data.block_size,
+    segments,
+    tracker_offset,
+    first_segment_offset,
+    config.meta
+  };
+}
+
+using open_device_ret =
+ BlockSegmentManager::access_ertr::future<
+ std::pair<seastar::file, seastar::stat_data>
+ >;
+/**
+ * open_device
+ *
+ * Stats in_path (following symlinks), opens it for dma access with
+ * mode, and returns the opened file together with the stat data.  Any
+ * exception from either step is logged and reported as
+ * input_output_error.
+ */
+static
+open_device_ret open_device(const std::string &in_path, seastar::open_flags mode)
+{
+ // do_with holds a copy of the path for the duration of the chain
+ return seastar::do_with(
+ in_path,
+ [mode](auto &path) {
+ return seastar::file_stat(path, seastar::follow_symlink::yes
+ ).then([mode, &path](auto stat) mutable {
+ return seastar::open_file_dma(path, mode).then([=](auto file) {
+ logger().debug("open_device: open successful");
+ return std::make_pair(file, stat);
+ });
+ }).handle_exception([](auto e) -> open_device_ret {
+ logger().error(
+ "open_device: got error {}",
+ e);
+ return crimson::ct_error::input_output_error::make();
+ });
+ });
+}
+
+
+/**
+ * write_superblock
+ *
+ * Encodes sb, copies the encoding to the start of a page aligned
+ * buffer of sb.block_size bytes and writes that buffer to offset 0 of
+ * device.  The encoded superblock has to be smaller than one block.
+ */
+static
+BlockSegmentManager::access_ertr::future<>
+write_superblock(seastar::file &device, block_sm_superblock_t sb)
+{
+ assert(ceph::encoded_sizeof_bounded<block_sm_superblock_t>() <
+ sb.block_size);
+ return seastar::do_with(
+ bufferptr(ceph::buffer::create_page_aligned(sb.block_size)),
+ [=, &device](auto &bp) {
+ bufferlist bl;
+ encode(sb, bl);
+ auto iter = bl.begin();
+ assert(bl.length() < sb.block_size);
+ // NOTE(review): only bl.length() bytes of bp are filled here; the
+ // rest of the block is written as allocated -- confirm whether the
+ // buffer should be zeroed first
+ iter.copy(bl.length(), bp.c_str());
+ logger().debug("write_superblock: doing writeout");
+ return do_write(device, 0, bp);
+ });
+}
+
+/**
+ * read_superblock
+ *
+ * Reads one block of sd.block_size bytes from offset 0 of device and
+ * decodes a block_sm_superblock_t from it.
+ */
+static
+BlockSegmentManager::access_ertr::future<block_sm_superblock_t>
+read_superblock(seastar::file &device, seastar::stat_data sd)
+{
+ assert(ceph::encoded_sizeof_bounded<block_sm_superblock_t>() <
+ sd.block_size);
+ // do_with keeps the read buffer alive until decoding has finished
+ return seastar::do_with(
+ bufferptr(ceph::buffer::create_page_aligned(sd.block_size)),
+ [=, &device](auto &bp) {
+ return do_read(
+ device,
+ 0,
+ bp
+ ).safe_then([=, &bp] {
+ bufferlist bl;
+ bl.push_back(bp);
+ block_sm_superblock_t ret;
+ auto bliter = bl.cbegin();
+ decode(ret, bliter);
+ return BlockSegmentManager::access_ertr::future<block_sm_superblock_t>(
+ BlockSegmentManager::access_ertr::ready_future_marker{},
+ ret);
+ });
+ });
+}
+
+/// Binds the segment handle to its manager and segment id
+BlockSegment::BlockSegment(
+ BlockSegmentManager &manager, segment_id_t id)
+ : manager(manager), id(id) {}
+
+/// Capacity equals the manager's segment size
+segment_off_t BlockSegment::get_write_capacity() const
+{
+ return manager.get_segment_size();
+}
+
+/**
+ * BlockSegment::close
+ *
+ * Marks this segment CLOSED through the manager.  The returned future
+ * resolves only once the manager has written out the updated segment
+ * state, and carries any error from that write.
+ */
+Segment::close_ertr::future<> BlockSegment::close()
+{
+  // Hand back the manager's future instead of dropping it: otherwise the
+  // caller would be told the close finished before the state is
+  // persisted, and a failed write would go unnoticed
+  return manager.segment_close(id);
+}
+
+/**
+ * BlockSegment::write
+ *
+ * Validates the write against the write pointer, block alignment and
+ * segment size, advances the write pointer past it, and forwards it to
+ * the manager.  Fails with invarg or enospc without touching the
+ * write pointer if validation fails.
+ */
+Segment::write_ertr::future<> BlockSegment::write(
+  segment_off_t offset, ceph::bufferlist bl)
+{
+  // writes may not start behind the write pointer ...
+  if (offset < write_pointer) {
+    return crimson::ct_error::invarg::make();
+  }
+  // ... and have to start on a block boundary
+  if (offset % manager.superblock.block_size != 0) {
+    return crimson::ct_error::invarg::make();
+  }
+
+  const auto write_end = offset + bl.length();
+  if (write_end > manager.superblock.segment_size) {
+    return crimson::ct_error::enospc::make();
+  }
+
+  write_pointer = write_end;
+  return manager.segment_write({id, offset}, bl);
+}
+
+/// Records id as CLOSED in the tracker and writes the tracker back to
+/// the device at superblock.tracker_offset.
+Segment::close_ertr::future<> BlockSegmentManager::segment_close(segment_id_t id)
+{
+ assert(tracker);
+ tracker->set(id, segment_state_t::CLOSED);
+ return tracker->write_out(device, superblock.tracker_offset);
+}
+
+/**
+ * BlockSegmentManager::segment_write
+ *
+ * Copies bl into a freshly allocated page aligned buffer and writes it
+ * to the device location get_offset(addr).  bl's length has to be a
+ * multiple of the block size.
+ *
+ * NOTE(review): ignore_check is not referenced anywhere in this body.
+ */
+Segment::write_ertr::future<> BlockSegmentManager::segment_write(
+ paddr_t addr,
+ ceph::bufferlist bl,
+ bool ignore_check)
+{
+ assert((bl.length() % superblock.block_size) == 0);
+ logger().debug(
+ "segment_write to segment {} at offset {}, physical offset {}, len {}",
+ addr.segment,
+ addr.offset,
+ get_offset(addr),
+ bl.length());
+
+
+ // TODO send an iovec and avoid the copy -- bl should have aligned
+ // constituent buffers and they will remain unmodified until the write
+ // completes
+ // The lambda captures bl and addr by reference but only touches them
+ // before it returns; afterwards only bp (kept alive by do_with) and
+ // device are in use.
+ return seastar::do_with(
+ bufferptr(ceph::buffer::create_page_aligned(bl.length())),
+ [&](auto &bp) {
+ auto iter = bl.cbegin();
+ iter.copy(bl.length(), bp.c_str());
+ return do_write(device, get_offset(addr), bp);
+ });
+}
+
+// Empty body: no explicit cleanup is done here (the device is closed
+// through close())
+BlockSegmentManager::~BlockSegmentManager()
+{
+}
+
+/**
+ * BlockSegmentManager::mount
+ *
+ * Opens config.path read/write with dsync, reads the superblock,
+ * creates the segment state tracker and loads it from the device.
+ * Every segment found in the OPEN state is switched to CLOSED and the
+ * tracker is written back.
+ */
+BlockSegmentManager::mount_ret BlockSegmentManager::mount(mount_config_t config)
+{
+ return open_device(
+ config.path, seastar::open_flags::rw | seastar::open_flags::dsync
+ ).safe_then([=](auto p) {
+ device = std::move(p.first);
+ auto sd = p.second;
+ return read_superblock(device, sd);
+ }).safe_then([=](auto sb) {
+ superblock = sb;
+ tracker = std::make_unique<SegmentStateTracker>(
+ superblock.segments,
+ superblock.block_size);
+ return tracker->read_in(
+ device,
+ superblock.tracker_offset
+ ).safe_then([this] {
+ // Segments recorded as OPEN are closed on mount
+ for (segment_id_t i = 0; i < tracker->get_capacity(); ++i) {
+ if (tracker->get(i) == segment_state_t::OPEN) {
+ tracker->set(i, segment_state_t::CLOSED);
+ }
+ }
+ return tracker->write_out(device, superblock.tracker_offset);
+ });
+ });
+}
+
+/**
+ * BlockSegmentManager::mkfs
+ *
+ * Opens config.path read/write, writes a superblock derived from
+ * config and the device's stat data, writes out a freshly constructed
+ * segment state tracker, and closes the device again.  The device
+ * handle, stat data, superblock and tracker used here are locals held
+ * by do_with, not the manager's members.
+ */
+BlockSegmentManager::mkfs_ret BlockSegmentManager::mkfs(mkfs_config_t config)
+{
+ return seastar::do_with(
+ seastar::file{},
+ seastar::stat_data{},
+ block_sm_superblock_t{},
+ std::unique_ptr<SegmentStateTracker>(),
+ [=](auto &device, auto &stat, auto &sb, auto &tracker) {
+ return open_device(
+ config.path, seastar::open_flags::rw
+ ).safe_then([&, config](auto p) {
+ device = p.first;
+ stat = p.second;
+ sb = make_superblock(config, stat);
+ return write_superblock(device, sb);
+ }).safe_then([&] {
+ logger().debug("BlockSegmentManager::mkfs: superblock written");
+ tracker.reset(new SegmentStateTracker(sb.segments, sb.block_size));
+ return tracker->write_out(device, sb.tracker_offset);
+ }).finally([&] {
+ // NOTE(review): this also runs when open_device failed and device is
+ // still the default-constructed seastar::file -- confirm close() is
+ // safe on such a handle
+ return device.close();
+ }).safe_then([] {
+ logger().debug("BlockSegmentManager::mkfs: complete");
+ return mkfs_ertr::now();
+ });
+ });
+}
+
+/// Closes the underlying device file
+BlockSegmentManager::close_ertr::future<> BlockSegmentManager::close()
+{
+ return device.close();
+}
+
+/**
+ * BlockSegmentManager::open
+ *
+ * Opens segment id for writing.  Fails with invarg if id is out of
+ * range or the segment is not EMPTY.  Otherwise the segment is
+ * recorded as OPEN, the tracker is written out, and a new BlockSegment
+ * handle is returned.
+ */
+SegmentManager::open_ertr::future<SegmentRef> BlockSegmentManager::open(
+ segment_id_t id)
+{
+ if (id >= get_num_segments()) {
+ logger().error("BlockSegmentManager::open: invalid segment {}", id);
+ return crimson::ct_error::invarg::make();
+ }
+
+ if (tracker->get(id) != segment_state_t::EMPTY) {
+ logger().error(
+ "BlockSegmentManager::open: invalid segment {} state {}",
+ id,
+ tracker->get(id));
+ return crimson::ct_error::invarg::make();
+ }
+
+ // The in-memory state changes before the write-out is issued
+ tracker->set(id, segment_state_t::OPEN);
+ return tracker->write_out(device, superblock.tracker_offset
+ ).safe_then([this, id] {
+ return open_ertr::future<SegmentRef>(
+ open_ertr::ready_future_marker{},
+ SegmentRef(new BlockSegment(*this, id)));
+ });
+}
+
+/**
+ * BlockSegmentManager::release
+ *
+ * Returns segment id to the EMPTY state.  Fails with invarg if id is
+ * out of range or the segment is not CLOSED.  Otherwise the tracker is
+ * updated and written out.
+ */
+SegmentManager::release_ertr::future<> BlockSegmentManager::release(
+ segment_id_t id)
+{
+ logger().debug("BlockSegmentManager::release: {}", id);
+
+ if (id >= get_num_segments()) {
+ logger().error(
+ "BlockSegmentManager::release: invalid segment {}",
+ id);
+ return crimson::ct_error::invarg::make();
+ }
+
+ if (tracker->get(id) != segment_state_t::CLOSED) {
+ logger().error(
+ "BlockSegmentManager::release: invalid segment {} state {}",
+ id,
+ tracker->get(id));
+ return crimson::ct_error::invarg::make();
+ }
+
+ tracker->set(id, segment_state_t::EMPTY);
+ return tracker->write_out(device, superblock.tracker_offset);
+}
+
+/**
+ * BlockSegmentManager::read
+ *
+ * Reads from the device location get_offset(addr) into out.  Fails
+ * with invarg if addr.segment is out of range or addr.offset + len
+ * exceeds the segment size, and with enoent if the segment is EMPTY.
+ *
+ * NOTE(review): the amount read is out.length() (see do_read), len is
+ * only used for the bounds check -- confirm callers keep them equal.
+ */
+SegmentManager::read_ertr::future<> BlockSegmentManager::read(
+ paddr_t addr,
+ size_t len,
+ ceph::bufferptr &out)
+{
+ if (addr.segment >= get_num_segments()) {
+ logger().error(
+ "BlockSegmentManager::read: invalid segment {}",
+ addr);
+ return crimson::ct_error::invarg::make();
+ }
+
+ if (addr.offset + len > superblock.segment_size) {
+ logger().error(
+ "BlockSegmentManager::read: invalid offset {}~{}!",
+ addr,
+ len);
+ return crimson::ct_error::invarg::make();
+ }
+
+ // An EMPTY segment holds nothing to read
+ if (tracker->get(addr.segment) == segment_state_t::EMPTY) {
+ logger().error(
+ "BlockSegmentManager::read: read on invalid segment {} state {}",
+ addr.segment,
+ tracker->get(addr.segment));
+ return crimson::ct_error::enoent::make();
+ }
+
+ return do_read(
+ device,
+ get_offset(addr),
+ out);
+}
+
+}
diff --git a/src/crimson/os/seastore/segment_manager/block.h b/src/crimson/os/seastore/segment_manager/block.h
new file mode 100644
index 000000000..927b13e4e
--- /dev/null
+++ b/src/crimson/os/seastore/segment_manager/block.h
@@ -0,0 +1,222 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include <seastar/core/file.hh>
+#include <seastar/core/future.hh>
+#include <seastar/core/reactor.hh>
+
+#include "crimson/common/layout.h"
+
+#include "crimson/os/seastore/segment_manager.h"
+
+namespace crimson::os::seastore::segment_manager::block {
+
+// Superblock describing the layout used by BlockSegmentManager.
+// The DENC block below encodes the fields in declaration order; keep the
+// two lists in sync when adding a field.
+struct block_sm_superblock_t {
+ // value reported by BlockSegmentManager::get_size()
+ size_t size = 0;
+ // size of one segment; also the per-segment stride in get_offset()
+ size_t segment_size = 0;
+ // value reported by BlockSegmentManager::get_block_size()
+ size_t block_size = 0;
+
+ // presumably the number of segments on the device -- TODO confirm in block.cc
+ size_t segments = 0;
+ // device offset passed to SegmentStateTracker::write_out()
+ uint64_t tracker_offset = 0;
+ // device offset of segment 0, see BlockSegmentManager::get_offset()
+ uint64_t first_segment_offset = 0;
+
+ // value returned by BlockSegmentManager::get_meta()
+ seastore_meta_t meta;
+
+ DENC(block_sm_superblock_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.size, p);
+ denc(v.segment_size, p);
+ denc(v.block_size, p);
+ denc(v.segments, p);
+ denc(v.tracker_offset, p);
+ denc(v.first_segment_offset, p);
+ denc(v.meta, p);
+ DENC_FINISH(p);
+ }
+};
+
+// Error sets used by SegmentStateTracker::write_out() and read_in()
+// below: both can only fail with input_output_error.
+using write_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+using read_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+
+/**
+ * SegmentStateTracker
+ *
+ * Tracks lifecycle state of each segment using space at the beginning
+ * of the drive.
+ *
+ * One uint8_t of state is kept per slot in a buffer obtained from
+ * ceph::buffer::create_page_aligned(); the buffer length is the segment
+ * count rounded up with p2roundup() to block_size.
+ */
+class SegmentStateTracker {
+  using segment_state_t = Segment::segment_state_t;
+
+  bufferptr bptr;
+
+  using L = absl::container_internal::Layout<uint8_t>;
+  const L layout;
+
+public:
+  /// Buffer length needed for `segments` slots, rounded up to block_size
+  static size_t get_raw_size(size_t segments, size_t block_size) {
+    const size_t rounded = p2roundup(segments, block_size);
+    return rounded;
+  }
+
+  /// Allocates the state buffer and fills every byte with EMPTY
+  SegmentStateTracker(size_t segments, size_t block_size)
+    : bptr(ceph::buffer::create_page_aligned(
+        get_raw_size(segments, block_size))),
+      layout(bptr.length())
+  {
+    const auto empty_fill = static_cast<char>(segment_state_t::EMPTY);
+    ::memset(bptr.c_str(), empty_fill, bptr.length());
+  }
+
+  /// Length in bytes of the state buffer
+  size_t get_size() const {
+    return bptr.length();
+  }
+
+  /// Number of slots available; one byte per slot, so equal to get_size()
+  size_t get_capacity() const {
+    return bptr.length();
+  }
+
+  /// State stored in slot `offset`; asserts offset < get_capacity()
+  segment_state_t get(segment_id_t offset) const {
+    assert(offset < get_capacity());
+    const auto *slots = layout.template Pointer<0>(bptr.c_str());
+    return static_cast<segment_state_t>(slots[offset]);
+  }
+
+  /// Stores `state` in slot `offset`; asserts offset < get_capacity()
+  void set(segment_id_t offset, segment_state_t state) {
+    assert(offset < get_capacity());
+    auto *slots = layout.template Pointer<0>(bptr.c_str());
+    slots[offset] = static_cast<uint8_t>(state);
+  }
+
+  /// Declared here, defined elsewhere: takes the device and a device offset
+  write_ertr::future<> write_out(
+    seastar::file &device,
+    uint64_t offset);
+
+  /// Declared here, defined elsewhere: takes the device and a device offset
+  read_ertr::future<> read_in(
+    seastar::file &device,
+    uint64_t offset);
+};
+
+class BlockSegmentManager;
+
+/**
+ * BlockSegment
+ *
+ * Segment handle bound to a BlockSegmentManager and a segment id.
+ * BlockSegmentManager is a friend and therefore has access to the
+ * private members below.
+ */
+class BlockSegment final : public Segment {
+  friend class BlockSegmentManager;
+
+  BlockSegmentManager &manager;      ///< manager this segment belongs to
+  const segment_id_t id;             ///< id reported by get_segment_id()
+  segment_off_t write_pointer = 0;   ///< value reported by get_write_ptr()
+
+public:
+  BlockSegment(BlockSegmentManager &manager, segment_id_t id);
+  ~BlockSegment() = default;
+
+  segment_id_t get_segment_id() const final {
+    return id;
+  }
+  segment_off_t get_write_capacity() const final;
+  segment_off_t get_write_ptr() const final {
+    return write_pointer;
+  }
+
+  close_ertr::future<> close() final;
+  write_ertr::future<> write(segment_off_t offset, ceph::bufferlist bl) final;
+};
+
+/**
+ * BlockSegmentManager
+ *
+ * Implements SegmentManager on a conventional block device.
+ * SegmentStateTracker uses space at the start of the device to store
+ * state analogous to that of the segments of a zns device.
+ */
+class BlockSegmentManager final : public SegmentManager {
+public:
+  /// Errors that mount() and mkfs() may return
+  using access_ertr = crimson::errorator<
+    crimson::ct_error::input_output_error,
+    crimson::ct_error::permission_denied,
+    crimson::ct_error::enoent>;
+
+
+  struct mount_config_t {
+    std::string path;
+  };
+  using mount_ertr = access_ertr;
+  using mount_ret = access_ertr::future<>;
+  mount_ret mount(mount_config_t);
+
+  struct mkfs_config_t {
+    std::string path;
+    size_t segment_size = 0;
+    size_t total_size = 0;
+    seastore_meta_t meta;
+  };
+  using mkfs_ertr = access_ertr;
+  using mkfs_ret = mkfs_ertr::future<>;
+  static mkfs_ret mkfs(mkfs_config_t);
+
+  using close_ertr = crimson::errorator<
+    crimson::ct_error::input_output_error
+    >;
+  close_ertr::future<> close();
+
+  BlockSegmentManager() = default;
+  ~BlockSegmentManager();
+
+  open_ertr::future<SegmentRef> open(segment_id_t id) final;
+
+  release_ertr::future<> release(segment_id_t id) final;
+
+  read_ertr::future<> read(
+    paddr_t addr,
+    size_t len,
+    ceph::bufferptr &out) final;
+
+  size_t get_size() const final {
+    return superblock.size;
+  }
+  // The three accessors below are marked final like their counterparts in
+  // EphemeralSegmentManager, so that the compiler verifies they really
+  // override the SegmentManager virtuals.
+  segment_off_t get_block_size() const final {
+    return superblock.block_size;
+  }
+  segment_off_t get_segment_size() const final {
+    return superblock.segment_size;
+  }
+
+  // public so tests can bypass segment interface when simpler
+  Segment::write_ertr::future<> segment_write(
+    paddr_t addr,
+    ceph::bufferlist bl,
+    bool ignore_check=false);
+
+private:
+  friend class BlockSegment;
+  using segment_state_t = Segment::segment_state_t;
+
+
+  std::unique_ptr<SegmentStateTracker> tracker;
+  block_sm_superblock_t superblock;
+  seastar::file device;
+
+  /// Device offset of addr: start of segment 0, plus whole segments,
+  /// plus the offset within the segment
+  size_t get_offset(paddr_t addr) const {
+    return superblock.first_segment_offset +
+      (addr.segment * superblock.segment_size) +
+      addr.offset;
+  }
+
+  const seastore_meta_t &get_meta() const final {
+    return superblock.meta;
+  }
+
+  // NOTE(review): segment_state and buffer mirror members of
+  // EphemeralSegmentManager; nothing in this header uses them -- confirm
+  // against block.cc whether they can be dropped.
+  std::vector<segment_state_t> segment_state;
+
+  char *buffer = nullptr;
+
+  Segment::close_ertr::future<> segment_close(segment_id_t id);
+};
+
+}
+
+// Macro invoked at global scope for the superblock type whose DENC body is
+// defined above; presumably this wires the type into the denc framework
+// (macro is defined elsewhere -- TODO confirm).
+WRITE_CLASS_DENC_BOUNDED(
+ crimson::os::seastore::segment_manager::block::block_sm_superblock_t
+)
+
diff --git a/src/crimson/os/seastore/segment_manager/ephemeral.cc b/src/crimson/os/seastore/segment_manager/ephemeral.cc
new file mode 100644
index 000000000..3250303ad
--- /dev/null
+++ b/src/crimson/os/seastore/segment_manager/ephemeral.cc
@@ -0,0 +1,226 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <sys/mman.h>
+#include <string.h>
+
+#include "seastar/core/sleep.hh"
+
+#include "crimson/common/log.h"
+
+#include "include/buffer.h"
+#include "crimson/os/seastore/segment_manager/ephemeral.h"
+
+namespace {
+  /// Logger for this file, obtained for ceph_subsys_filestore
+  seastar::logger& logger() {
+    auto &file_logger = crimson::get_logger(ceph_subsys_filestore);
+    return file_logger;
+  }
+}
+
+namespace crimson::os::seastore::segment_manager {
+
+/// Streams c as "ephemeral_config_t(size=..., block_size=..., segment_size=...)"
+std::ostream &operator<<(std::ostream &lhs, const ephemeral_config_t &c) {
+  lhs << "ephemeral_config_t(size=" << c.size;
+  lhs << ", block_size=" << c.block_size;
+  lhs << ", segment_size=" << c.segment_size;
+  lhs << ")";
+  return lhs;
+}
+
+/// Builds an EphemeralSegmentManager configured with DEFAULT_TEST_EPHEMERAL.
+/// init() is not called here.
+EphemeralSegmentManagerRef create_test_ephemeral() {
+  return std::make_unique<EphemeralSegmentManager>(DEFAULT_TEST_EPHEMERAL);
+}
+
+/// Binds the segment handle to its manager and segment id
+EphemeralSegment::EphemeralSegment(
+  EphemeralSegmentManager &manager,
+  segment_id_t id)
+  : manager(manager),
+    id(id)
+{}
+
+/// Write capacity of a segment is the manager's segment size
+segment_off_t EphemeralSegment::get_write_capacity() const
+{
+  const segment_off_t capacity = manager.get_segment_size();
+  return capacity;
+}
+
+/**
+ * Close this segment.
+ *
+ * Delegates to EphemeralSegmentManager::segment_close() and returns its
+ * future, so that an invarg failure (segment not OPEN) reaches the
+ * caller.  Previously the returned future was dropped and close() always
+ * reported success.  segment_close() already includes the 1ms simulated
+ * delay, so no additional sleep is chained here.
+ */
+Segment::close_ertr::future<> EphemeralSegment::close()
+{
+  return manager.segment_close(id);
+}
+
+/**
+ * Write bl at offset within this segment.
+ *
+ * Returns invarg if offset is below write_pointer or is not a multiple
+ * of the manager's block size, and enospc if the write would end beyond
+ * the segment size.  Otherwise forwards to the manager's
+ * segment_write().  write_pointer is not modified by this function.
+ */
+Segment::write_ertr::future<> EphemeralSegment::write(
+  segment_off_t offset, ceph::bufferlist bl)
+{
+  if (offset < write_pointer) {
+    return crimson::ct_error::invarg::make();
+  }
+  if (offset % manager.config.block_size != 0) {
+    return crimson::ct_error::invarg::make();
+  }
+
+  const auto segment_capacity =
+    static_cast<size_t>(manager.get_segment_size());
+  if (offset + bl.length() > segment_capacity) {
+    return crimson::ct_error::enospc::make();
+  }
+
+  return manager.segment_write({id, offset}, bl);
+}
+
+/**
+ * Transition segment id from OPEN to CLOSED.
+ *
+ * Returns invarg if the segment is not OPEN.  On success the returned
+ * future resolves after a 1ms sleep.  id is used to index segment_state
+ * without a range check.
+ */
+Segment::close_ertr::future<> EphemeralSegmentManager::segment_close(segment_id_t id)
+{
+  auto &state = segment_state[id];
+  if (state != segment_state_t::OPEN) {
+    return crimson::ct_error::invarg::make();
+  }
+  state = segment_state_t::CLOSED;
+
+  const auto simulated_latency = std::chrono::milliseconds(1);
+  return Segment::close_ertr::now().safe_then([simulated_latency] {
+    return seastar::sleep(simulated_latency);
+  });
+}
+
+/**
+ * Copy bl into the in-memory buffer at the location described by addr.
+ *
+ * This entry point is public so that tests can bypass the Segment
+ * interface, which means addr is not guaranteed to have been validated
+ * by EphemeralSegment::write().  The address is therefore range checked
+ * here before segment_state or the buffer are touched.
+ *
+ * @param addr         segment and offset to write to
+ * @param bl           data to copy
+ * @param ignore_check when true, skip the check that the segment is OPEN
+ * @return invarg if addr.segment is out of range, if addr.offset is
+ *         negative, if the write would end beyond config.segment_size,
+ *         or (unless ignore_check) if the segment is not OPEN; otherwise
+ *         a future resolving after a 1ms sleep
+ */
+Segment::write_ertr::future<> EphemeralSegmentManager::segment_write(
+  paddr_t addr,
+  ceph::bufferlist bl,
+  bool ignore_check)
+{
+  if (addr.segment >= get_num_segments()) {
+    logger().error(
+      "EphemeralSegmentManager::segment_write: invalid segment {}",
+      addr);
+    return crimson::ct_error::invarg::make();
+  }
+
+  // Widen before adding so that the sum cannot wrap in 32 bits
+  if (addr.offset < 0 ||
+      static_cast<size_t>(addr.offset) + bl.length() > config.segment_size) {
+    logger().error(
+      "EphemeralSegmentManager::segment_write: invalid offset {}~{}!",
+      addr,
+      bl.length());
+    return crimson::ct_error::invarg::make();
+  }
+
+  logger().debug(
+    "segment_write to segment {} at offset {}, physical offset {}, len {}, crc {}",
+    addr.segment,
+    addr.offset,
+    get_offset(addr),
+    bl.length(),
+    bl.crc32c(1));
+  if (!ignore_check && segment_state[addr.segment] != segment_state_t::OPEN)
+    return crimson::ct_error::invarg::make();
+
+  bl.begin().copy(bl.length(), buffer + get_offset(addr));
+  return Segment::write_ertr::now().safe_then([] {
+    return seastar::sleep(std::chrono::milliseconds(1));
+  });
+}
+
+/**
+ * Validate the configuration and allocate the backing memory.
+ *
+ * Requirements on config (invarg otherwise):
+ *  - block_size and segment_size are non-zero
+ *  - block_size is a multiple of 4KiB
+ *  - segment_size is a multiple of block_size
+ *  - size is a multiple of segment_size
+ *
+ * On success an anonymous mapping of config.size bytes is created and
+ * zeroed, segment_state holds one EMPTY entry per segment, and the
+ * returned future resolves after a 1ms sleep.  Returns enospc if mmap
+ * fails.
+ */
+EphemeralSegmentManager::init_ertr::future<> EphemeralSegmentManager::init()
+{
+  logger().debug(
+    "Initing ephemeral segment manager with config {}",
+    config);
+
+  meta = seastore_meta_t{};
+
+  // ephemeral_config_t defaults every field to 0; reject zero sizes here
+  // because the modulo checks below would otherwise divide by zero.
+  if (config.block_size == 0 || config.segment_size == 0) {
+    return crimson::ct_error::invarg::make();
+  }
+
+  if (config.block_size % (4<<10) != 0) {
+    return crimson::ct_error::invarg::make();
+  }
+  if (config.segment_size % config.block_size != 0) {
+    return crimson::ct_error::invarg::make();
+  }
+  if (config.size % config.segment_size != 0) {
+    return crimson::ct_error::invarg::make();
+  }
+
+  auto addr = ::mmap(
+    nullptr,
+    config.size,
+    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS,
+    -1,
+    0);
+
+  segment_state.resize(config.size / config.segment_size, segment_state_t::EMPTY);
+
+  if (addr == MAP_FAILED)
+    return crimson::ct_error::enospc::make();
+
+  buffer = (char*)addr;
+
+  ::memset(buffer, 0, config.size);
+  return init_ertr::now().safe_then([] {
+    return seastar::sleep(std::chrono::milliseconds(1));
+  });
+}
+
+/// Unmaps the backing memory if init() set buffer
+EphemeralSegmentManager::~EphemeralSegmentManager()
+{
+  if (buffer == nullptr) {
+    return;
+  }
+  ::munmap(buffer, config.size);
+}
+
+/// Marks every segment that is currently OPEN as CLOSED; other states
+/// are left untouched
+void EphemeralSegmentManager::remount()
+{
+  for (size_t idx = 0; idx < segment_state.size(); ++idx) {
+    if (segment_state[idx] != Segment::segment_state_t::OPEN) {
+      continue;
+    }
+    segment_state[idx] = Segment::segment_state_t::CLOSED;
+  }
+}
+
+/**
+ * Open segment id for writing.
+ *
+ * Returns invarg if id is not below get_num_segments() or if the segment
+ * is not EMPTY.  Otherwise marks it OPEN and returns a new
+ * EphemeralSegment bound to this manager.
+ */
+SegmentManager::open_ertr::future<SegmentRef> EphemeralSegmentManager::open(
+  segment_id_t id)
+{
+  const bool in_range = id < get_num_segments();
+  if (!in_range) {
+    logger().error("EphemeralSegmentManager::open: invalid segment {}", id);
+    return crimson::ct_error::invarg::make();
+  }
+
+  auto &state = segment_state[id];
+  if (state != segment_state_t::EMPTY) {
+    logger().error("EphemeralSegmentManager::open: segment {} not empty", id);
+    return crimson::ct_error::invarg::make();
+  }
+  state = segment_state_t::OPEN;
+
+  return open_ertr::future<SegmentRef>(
+    open_ertr::ready_future_marker{},
+    SegmentRef(new EphemeralSegment(*this, id)));
+}
+
+/**
+ * Release segment id so that it can be opened again.
+ *
+ * Returns invarg if id is not below get_num_segments() or if the segment
+ * is not CLOSED.  Otherwise zeroes the segment's bytes in the buffer,
+ * marks it EMPTY, and resolves after a 1ms sleep.
+ */
+SegmentManager::release_ertr::future<> EphemeralSegmentManager::release(
+  segment_id_t id)
+{
+  logger().debug("EphemeralSegmentManager::release: {}", id);
+
+  const bool in_range = id < get_num_segments();
+  if (!in_range) {
+    logger().error(
+      "EphemeralSegmentManager::release: invalid segment {}",
+      id);
+    return crimson::ct_error::invarg::make();
+  }
+
+  auto &state = segment_state[id];
+  if (state != segment_state_t::CLOSED) {
+    logger().error(
+      "EphemeralSegmentManager::release: segment id {} not closed",
+      id);
+    return crimson::ct_error::invarg::make();
+  }
+
+  char *segment_start = buffer + get_offset({id, 0});
+  ::memset(segment_start, 0, config.segment_size);
+  state = segment_state_t::EMPTY;
+
+  const auto simulated_latency = std::chrono::milliseconds(1);
+  return release_ertr::now().safe_then([simulated_latency] {
+    return seastar::sleep(simulated_latency);
+  });
+}
+
+/**
+ * Copy len bytes from the in-memory buffer at addr into out.
+ *
+ * Returns invarg if addr.segment is not below get_num_segments() or if
+ * addr.offset + len exceeds config.segment_size.  The segment state is
+ * not consulted.  After copying, the crc of the data is logged at debug
+ * level and the returned future resolves after a 1ms sleep.
+ */
+SegmentManager::read_ertr::future<> EphemeralSegmentManager::read(
+  paddr_t addr,
+  size_t len,
+  ceph::bufferptr &out)
+{
+  const bool segment_in_range = addr.segment < get_num_segments();
+  if (!segment_in_range) {
+    logger().error(
+      "EphemeralSegmentManager::read: invalid segment {}",
+      addr);
+    return crimson::ct_error::invarg::make();
+  }
+
+  const auto read_end = addr.offset + len;
+  if (read_end > config.segment_size) {
+    logger().error(
+      "EphemeralSegmentManager::read: invalid offset {}~{}!",
+      addr,
+      len);
+    return crimson::ct_error::invarg::make();
+  }
+
+  const char *source = buffer + get_offset(addr);
+  out.copy_in(0, len, source);
+
+  // Wrap out in a bufferlist only to compute the crc for the log line
+  bufferlist crc_view;
+  crc_view.push_back(out);
+  logger().debug(
+    "segment_read to segment {} at offset {}, physical offset {}, length {}, crc {}",
+    addr.segment,
+    addr.offset,
+    get_offset(addr),
+    len,
+    crc_view.begin().crc32c(len, 1));
+
+  const auto simulated_latency = std::chrono::milliseconds(1);
+  return read_ertr::now().safe_then([simulated_latency] {
+    return seastar::sleep(simulated_latency);
+  });
+}
+
+}
diff --git a/src/crimson/os/seastore/segment_manager/ephemeral.h b/src/crimson/os/seastore/segment_manager/ephemeral.h
new file mode 100644
index 000000000..9f19cb4d0
--- /dev/null
+++ b/src/crimson/os/seastore/segment_manager/ephemeral.h
@@ -0,0 +1,111 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <seastar/core/future.hh>
+
+#include "crimson/os/seastore/segment_manager.h"
+
+#include "crimson/os/seastore/segment_manager/ephemeral.h"
+
+namespace crimson::os::seastore::segment_manager {
+
+class EphemeralSegmentManager;
+using EphemeralSegmentManagerRef = std::unique_ptr<EphemeralSegmentManager>;
+
+/// Sizes, in bytes, describing an EphemeralSegmentManager.
+/// Every field defaults to 0.
+struct ephemeral_config_t {
+  size_t size{0};          ///< total size; reported by get_size()
+  size_t block_size{0};    ///< reported by get_block_size()
+  size_t segment_size{0};  ///< reported by get_segment_size()
+};
+
+// Configuration used by create_test_ephemeral().
+// Values are given in field order: size, block_size, segment_size.
+constexpr ephemeral_config_t DEFAULT_TEST_EPHEMERAL = {
+ 1 << 30, // size: 1 GiB
+ 4 << 10, // block_size: 4 KiB
+ 8 << 20 // segment_size: 8 MiB
+};
+
+std::ostream &operator<<(std::ostream &, const ephemeral_config_t &);
+EphemeralSegmentManagerRef create_test_ephemeral();
+
+/**
+ * EphemeralSegment
+ *
+ * Segment handle bound to an EphemeralSegmentManager and a segment id.
+ * EphemeralSegmentManager is a friend and therefore has access to the
+ * private members below.
+ */
+class EphemeralSegment final : public Segment {
+  friend class EphemeralSegmentManager;
+
+  EphemeralSegmentManager &manager;  ///< manager this segment belongs to
+  const segment_id_t id;             ///< id reported by get_segment_id()
+  segment_off_t write_pointer = 0;   ///< value reported by get_write_ptr()
+
+public:
+  EphemeralSegment(EphemeralSegmentManager &manager, segment_id_t id);
+  ~EphemeralSegment() = default;
+
+  segment_id_t get_segment_id() const final {
+    return id;
+  }
+  segment_off_t get_write_capacity() const final;
+  segment_off_t get_write_ptr() const final {
+    return write_pointer;
+  }
+
+  close_ertr::future<> close() final;
+  write_ertr::future<> write(segment_off_t offset, ceph::bufferlist bl) final;
+};
+
+/**
+ * EphemeralSegmentManager
+ *
+ * SegmentManager backed by an anonymous memory mapping rather than a
+ * device.  init() creates the mapping and the destructor unmaps it.
+ * Segment state is kept in the segment_state vector.
+ */
+class EphemeralSegmentManager final : public SegmentManager {
+  friend class EphemeralSegment;
+  using segment_state_t = Segment::segment_state_t;
+
+  const ephemeral_config_t config;
+  std::optional<seastore_meta_t> meta;  ///< set by init()
+
+  /// Offset of addr within buffer: whole segments plus offset in segment
+  size_t get_offset(paddr_t addr) const {
+    return (addr.segment * config.segment_size) + addr.offset;
+  }
+
+  std::vector<segment_state_t> segment_state;
+
+  char *buffer = nullptr;  ///< start of the mapping, nullptr before init()
+
+  Segment::close_ertr::future<> segment_close(segment_id_t id);
+
+public:
+  EphemeralSegmentManager(ephemeral_config_t config) : config(config) {}
+  ~EphemeralSegmentManager();
+
+  // buffer is unmapped in the destructor, so a copy would unmap the same
+  // mapping twice; forbid copying explicitly.
+  EphemeralSegmentManager(const EphemeralSegmentManager &) = delete;
+  EphemeralSegmentManager &operator=(const EphemeralSegmentManager &) = delete;
+
+  using init_ertr = crimson::errorator<
+    crimson::ct_error::enospc,
+    crimson::ct_error::invarg,
+    crimson::ct_error::erange>;
+  init_ertr::future<> init();
+
+  open_ertr::future<SegmentRef> open(segment_id_t id) final;
+
+  release_ertr::future<> release(segment_id_t id) final;
+
+  read_ertr::future<> read(
+    paddr_t addr,
+    size_t len,
+    ceph::bufferptr &out) final;
+
+  size_t get_size() const final {
+    return config.size;
+  }
+  segment_off_t get_block_size() const final {
+    return config.block_size;
+  }
+  segment_off_t get_segment_size() const final {
+    return config.segment_size;
+  }
+
+  /// Asserts that meta has been set (init() assigns it)
+  const seastore_meta_t &get_meta() const final {
+    assert(meta);
+    return *meta;
+  }
+
+  /// Marks every OPEN segment as CLOSED
+  void remount();
+
+  // public so tests can bypass segment interface when simpler
+  Segment::write_ertr::future<> segment_write(
+    paddr_t addr,
+    ceph::bufferlist bl,
+    bool ignore_check=false);
+};
+
+}
diff --git a/src/crimson/os/seastore/transaction.h b/src/crimson/os/seastore/transaction.h
new file mode 100644
index 000000000..e189d1d32
--- /dev/null
+++ b/src/crimson/os/seastore/transaction.h
@@ -0,0 +1,145 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/cached_extent.h"
+#include "crimson/os/seastore/root_block.h"
+
+namespace crimson::os::seastore {
+
+/**
+ * Transaction
+ *
+ * Representation of in-progress mutation. Used exclusively through Cache methods.
+ */
+class Transaction {
+public:
+  using Ref = std::unique_ptr<Transaction>;
+  enum class get_extent_ret {
+    PRESENT,
+    ABSENT,
+    RETIRED
+  };
+
+  /**
+   * Look up addr among the extents known to this transaction.
+   *
+   * The sets are consulted in this order: retired_set (-> RETIRED),
+   * write_set (-> PRESENT), read_set (-> PRESENT); otherwise ABSENT.
+   * *out is only assigned for PRESENT, and only if out is non-null.
+   */
+  get_extent_ret get_extent(paddr_t addr, CachedExtentRef *out) {
+    if (retired_set.count(addr)) {
+      return get_extent_ret::RETIRED;
+    } else if (auto iter = write_set.find_offset(addr);
+	       iter != write_set.end()) {
+      if (out)
+	*out = CachedExtentRef(&*iter);
+      return get_extent_ret::PRESENT;
+    } else if (
+      auto iter = read_set.find(addr);
+      iter != read_set.end()) {
+      if (out)
+	*out = CachedExtentRef(*iter);
+      return get_extent_ret::PRESENT;
+    } else {
+      return get_extent_ret::ABSENT;
+    }
+  }
+
+  /**
+   * Record that ref is retired by this transaction.
+   *
+   * An extent that is not initial-pending is added to retired_set; an
+   * initial-pending extent is marked INVALID instead.  In both cases an
+   * extent that was pending on entry is removed from write_set.
+   */
+  void add_to_retired_set(CachedExtentRef ref) {
+    ceph_assert(!is_weak());
+    // Sample the pending status before the state is changed below.
+    // is_pending() is expected to depend on ref->state (CachedExtent is
+    // defined elsewhere); testing it after the assignment to INVALID
+    // would leave an invalidated initial-pending extent in write_set.
+    const bool was_pending = ref->is_pending();
+    if (!ref->is_initial_pending()) {
+      // && retired_set.count(ref->get_paddr()) == 0
+      // If it's already in the set, insert here will be a noop,
+      // which is what we want.
+      retired_set.insert(ref);
+    } else {
+      ref->state = CachedExtent::extent_state_t::INVALID;
+    }
+    if (was_pending) {
+      write_set.erase(*ref);
+    }
+  }
+
+  /// Adds ref to read_set; a no-op for weak transactions.
+  /// Asserts that ref is not already in read_set.
+  void add_to_read_set(CachedExtentRef ref) {
+    if (is_weak()) return;
+
+    ceph_assert(read_set.count(ref) == 0);
+    read_set.insert(ref);
+  }
+
+  /// Registers a newly allocated extent: assigns it the next
+  /// record-relative paddr, advances offset by its length, and adds it
+  /// to fresh_block_list and write_set.
+  void add_fresh_extent(CachedExtentRef ref) {
+    ceph_assert(!is_weak());
+    fresh_block_list.push_back(ref);
+    ref->set_paddr(make_record_relative_paddr(offset));
+    offset += ref->get_length();
+    write_set.insert(*ref);
+  }
+
+  /// Registers a mutated extent in mutated_block_list and write_set
+  void add_mutated_extent(CachedExtentRef ref) {
+    ceph_assert(!is_weak());
+    mutated_block_list.push_back(ref);
+    write_set.insert(*ref);
+  }
+
+  /// Records the segment to release; asserts none was recorded before
+  void mark_segment_to_release(segment_id_t segment) {
+    assert(to_release == NULL_SEG_ID);
+    to_release = segment;
+  }
+
+  /// NULL_SEG_ID unless mark_segment_to_release() was called
+  segment_id_t get_segment_to_release() const {
+    return to_release;
+  }
+
+  const auto &get_fresh_block_list() {
+    return fresh_block_list;
+  }
+
+  const auto &get_mutated_block_list() {
+    return mutated_block_list;
+  }
+
+  const auto &get_retired_set() {
+    return retired_set;
+  }
+
+  bool is_weak() const {
+    return weak;
+  }
+
+private:
+  friend class Cache;
+  friend Ref make_transaction();
+  friend Ref make_weak_transaction();
+
+  /**
+   * If set, *this may not be used to perform writes and will not provide
+   * consistency, which allows operations using it to avoid maintaining
+   * a read_set.
+   */
+  const bool weak;
+
+  RootBlockRef root;        ///< ref to root if read or written by transaction
+
+  segment_off_t offset = 0; ///< relative offset of next block
+
+  pextent_set_t read_set;   ///< set of extents read by paddr
+  ExtentIndex write_set;    ///< set of extents written by paddr
+
+  std::list<CachedExtentRef> fresh_block_list;   ///< list of fresh blocks
+  std::list<CachedExtentRef> mutated_block_list; ///< list of mutated blocks
+
+  pextent_set_t retired_set; ///< set of extents retired by this transaction
+
+  ///< if != NULL_SEG_ID, release this segment after completion
+  segment_id_t to_release = NULL_SEG_ID;
+
+  /// Private: instances are created through make_transaction() and
+  /// make_weak_transaction(), which are friends
+  Transaction(bool weak) : weak(weak) {}
+};
+using TransactionRef = Transaction::Ref;
+
+/// Creates a non-weak Transaction.  Uses new directly because the
+/// Transaction constructor is private and this function is a friend.
+inline TransactionRef make_transaction() {
+  constexpr bool weak = false;
+  return TransactionRef(new Transaction(weak));
+}
+
+/// Creates a weak Transaction.  Uses new directly because the
+/// Transaction constructor is private and this function is a friend.
+inline TransactionRef make_weak_transaction() {
+  constexpr bool weak = true;
+  return TransactionRef(new Transaction(weak));
+}
+
+}
diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc
new file mode 100644
index 000000000..7b86631e2
--- /dev/null
+++ b/src/crimson/os/seastore/transaction_manager.cc
@@ -0,0 +1,306 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/denc.h"
+#include "include/intarith.h"
+
+#include "crimson/common/log.h"
+
+#include "crimson/os/seastore/transaction_manager.h"
+#include "crimson/os/seastore/segment_manager.h"
+#include "crimson/os/seastore/journal.h"
+
+namespace {
+  /// Logger for this file, obtained for ceph_subsys_filestore
+  seastar::logger& logger() {
+    auto &file_logger = crimson::get_logger(ceph_subsys_filestore);
+    return file_logger;
+  }
+}
+
+namespace crimson::os::seastore {
+
+// Stores references to the collaborating components; nothing is
+// initialized or started here (see mkfs() and mount()).
+// NOTE(review): the initializer order differs from the parameter order
+// (journal is last); presumably it follows the member declaration order
+// in the header -- keep them in sync.
+TransactionManager::TransactionManager(
+ SegmentManager &segment_manager,
+ SegmentCleaner &segment_cleaner,
+ Journal &journal,
+ Cache &cache,
+ LBAManager &lba_manager)
+ : segment_manager(segment_manager),
+ segment_cleaner(segment_cleaner),
+ cache(cache),
+ lba_manager(lba_manager),
+ journal(journal)
+{}
+
+// Writes the initial metadata.
+// Sequence: open the journal for write and record the returned address
+// as the cleaner's journal head; then, within a single transaction, run
+// cache.init(), cache.mkfs() and lba_manager.mkfs() and submit it; finally
+// close the journal.
+TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs()
+{
+ return journal.open_for_write().safe_then([this](auto addr) {
+ logger().debug("TransactionManager::mkfs: about to do_with");
+ segment_cleaner.set_journal_head(addr);
+ // do_with holds the transaction for the duration of the inner chain
+ return seastar::do_with(
+ create_transaction(),
+ [this](auto &transaction) {
+ logger().debug("TransactionManager::mkfs: about to cache.mkfs");
+ cache.init();
+ return cache.mkfs(*transaction
+ ).safe_then([this, &transaction] {
+ return lba_manager.mkfs(*transaction);
+ }).safe_then([this, &transaction] {
+ logger().debug("TransactionManager::mkfs: about to submit_transaction");
+ // transaction is moved out of the do_with slot here; it must not be
+ // used through the reference afterwards
+ return submit_transaction(std::move(transaction)).handle_error(
+ // eagain from submit_transaction is treated as a programming error
+ crimson::ct_error::eagain::handle([] {
+ ceph_assert(0 == "eagain impossible");
+ return mkfs_ertr::now();
+ }),
+ mkfs_ertr::pass_further{}
+ );
+ });
+ });
+ }).safe_then([this] {
+ return journal.close();
+ });
+}
+
+// Reads the initial metadata.
+// Sequence: cache.init(); replay the journal, feeding each delta to
+// cache.replay_delta(); open the journal for write and record the returned
+// address as the cleaner's journal head; then, using a weak transaction,
+// initialize the cached extents through the lba manager and report every
+// mapped range to the segment cleaner as used; finally call
+// segment_cleaner.complete_init().
+TransactionManager::mount_ertr::future<> TransactionManager::mount()
+{
+ cache.init();
+ return journal.replay([this](auto seq, auto paddr, const auto &e) {
+ return cache.replay_delta(seq, paddr, e);
+ }).safe_then([this] {
+ return journal.open_for_write();
+ }).safe_then([this](auto addr) {
+ segment_cleaner.set_journal_head(addr);
+ // do_with holds the weak transaction for the duration of the inner chain
+ return seastar::do_with(
+ make_weak_transaction(),
+ [this](auto &t) {
+ return cache.init_cached_extents(*t, [this](auto &t, auto &e) {
+ return lba_manager.init_cached_extent(t, e);
+ }).safe_then([this, &t] {
+ // the call inside assert() is compiled out when NDEBUG is defined
+ assert(segment_cleaner.debug_check_space(
+ *segment_cleaner.get_empty_space_tracker()));
+ return lba_manager.scan_mapped_space(
+ *t,
+ [this](paddr_t addr, extent_len_t len) {
+ logger().debug("TransactionManager::mount: marking {}~{} used",
+ addr,
+ len);
+ segment_cleaner.mark_space_used(
+ addr,
+ len ,
+ /* init_scan = */ true);
+ });
+ });
+ });
+ }).safe_then([this] {
+ segment_cleaner.complete_init();
+ }).handle_error(
+ // errors that are part of mount_ertr propagate; any other error asserts
+ mount_ertr::pass_further{},
+ crimson::ct_error::all_same_way([] {
+ ceph_assert(0 == "unhandled error");
+ return mount_ertr::now();
+ }));
+}
+
+/// Closes the cache first and, once that has succeeded, the journal
+TransactionManager::close_ertr::future<> TransactionManager::close() {
+  auto close_journal = [this] {
+    return journal.close();
+  };
+  return cache.close().safe_then(std::move(close_journal));
+}
+
+// Increments the reference count of the mapping at ref's logical address
+// and returns the resulting refcount.
+// Errors that are part of ref_ertr propagate; any other error hits the
+// assert below (marked TODO in the message).
+TransactionManager::ref_ret TransactionManager::inc_ref(
+ Transaction &t,
+ LogicalCachedExtentRef &ref)
+{
+ return lba_manager.incref_extent(t, ref->get_laddr()).safe_then([](auto r) {
+ return r.refcount;
+ }).handle_error(
+ ref_ertr::pass_further{},
+ ct_error::all_same_way([](auto e) {
+ ceph_assert(0 == "unhandled error, TODO");
+ }));
+}
+
+/// Increments the reference count of the mapping at logical address
+/// offset and returns the resulting refcount.  Unlike the overload
+/// taking an extent ref, no error handler is attached here.
+TransactionManager::ref_ret TransactionManager::inc_ref(
+  Transaction &t,
+  laddr_t offset)
+{
+  auto extract_refcount = [](auto result) {
+    return result.refcount;
+  };
+  return lba_manager.incref_extent(t, offset
+  ).safe_then(std::move(extract_refcount));
+}
+
+/// Decrements the reference count of the mapping at ref's logical
+/// address and returns the resulting refcount.  When it reaches 0 the
+/// extent is retired in t through the cache.  The continuation holds its
+/// own copy of ref.
+TransactionManager::ref_ret TransactionManager::dec_ref(
+  Transaction &t,
+  LogicalCachedExtentRef &ref)
+{
+  auto retire_if_unreferenced = [this, &t, ref](auto ret) {
+    if (ret.refcount == 0) {
+      logger().debug(
+        "TransactionManager::dec_ref: extent {} refcount 0",
+        *ref);
+      cache.retire_extent(t, ref);
+    }
+    return ret.refcount;
+  };
+  return lba_manager.decref_extent(t, ref->get_laddr()
+  ).safe_then(std::move(retire_if_unreferenced));
+}
+
+/// Decrements the reference count of the mapping at logical address
+/// offset and returns the resulting refcount.  When it reaches 0 the
+/// extent at the mapping's address is retired in t if it is cached.
+TransactionManager::ref_ret TransactionManager::dec_ref(
+  Transaction &t,
+  laddr_t offset)
+{
+  return lba_manager.decref_extent(t, offset
+  ).safe_then([this, offset, &t](auto result) -> ref_ret {
+    // Still referenced: just report the remaining count
+    if (result.refcount != 0) {
+      return ref_ret(
+        ref_ertr::ready_future_marker{},
+        result.refcount);
+    }
+
+    logger().debug(
+      "TransactionManager::dec_ref: offset {} refcount 0",
+      offset);
+    return cache.retire_extent_if_cached(t, result.addr).safe_then([] {
+      return ref_ret(
+        ref_ertr::ready_future_marker{},
+        0);
+    });
+  });
+}
+
+// Submits t to the journal.
+// Sequence: let the segment cleaner do its immediate work on t; ask the
+// cache to construct a record (eagain if it returns none); submit the
+// record to the journal; then update the cleaner's journal head, complete
+// the commit in the cache and the lba manager, and release the segment
+// recorded in t, if any.
+TransactionManager::submit_transaction_ertr::future<>
+TransactionManager::submit_transaction(
+ TransactionRef t)
+{
+ logger().debug("TransactionManager::submit_transaction");
+ // t is moved into the continuation, which keeps the Transaction alive
+ return segment_cleaner.do_immediate_work(*t
+ ).safe_then([this, t=std::move(t)]() mutable -> submit_transaction_ertr::future<> {
+ auto record = cache.try_construct_record(*t);
+ if (!record) {
+ return crimson::ct_error::eagain::make();
+ }
+
+ return journal.submit_record(std::move(*record)
+ ).safe_then([this, t=std::move(t)](auto p) mutable {
+ auto [addr, journal_seq] = p;
+ segment_cleaner.set_journal_head(journal_seq);
+ cache.complete_commit(*t, addr, journal_seq, &segment_cleaner);
+ lba_manager.complete_transaction(*t);
+ auto to_release = t->get_segment_to_release();
+ if (to_release != NULL_SEG_ID) {
+ segment_cleaner.mark_segment_released(to_release);
+ return segment_manager.release(to_release);
+ } else {
+ return SegmentManager::release_ertr::now();
+ }
+ }).handle_error(
+ // errors that are part of submit_transaction_ertr propagate; any other
+ // error asserts
+ submit_transaction_ertr::pass_further{},
+ crimson::ct_error::all_same_way([](auto e) {
+ ceph_assert(0 == "Hit error submitting to journal");
+ }));
+ });
+}
+
+// Forwards to Cache::get_next_dirty_extents() with the same seq
+TransactionManager::get_next_dirty_extents_ret
+TransactionManager::get_next_dirty_extents(journal_seq_t seq)
+{
+ return cache.get_next_dirty_extents(seq);
+}
+
+/**
+ * Rewrite extent within transaction t.
+ *
+ * The extent is first replaced by what
+ * cache.update_extent_from_transaction() returns; a null result is
+ * treated as "already retired" and nothing is done.  A ROOT extent is
+ * handled by cache.duplicate_for_write(); every other type is delegated
+ * to lba_manager.rewrite_extent().
+ */
+TransactionManager::rewrite_extent_ret TransactionManager::rewrite_extent(
+  Transaction &t,
+  CachedExtentRef extent)
+{
+  if (auto updated = cache.update_extent_from_transaction(t, extent);
+      !updated) {
+    logger().debug(
+      "{}: {} is already retired, skipping",
+      __func__,
+      *extent);
+    return rewrite_extent_ertr::now();
+  } else {
+    extent = updated;
+  }
+
+  const bool is_root = extent->get_type() == extent_types_t::ROOT;
+  if (!is_root) {
+    return lba_manager.rewrite_extent(t, extent);
+  }
+
+  logger().debug(
+    "{}: marking root {} for rewrite",
+    __func__,
+    *extent);
+  cache.duplicate_for_write(t, extent);
+  return rewrite_extent_ertr::now();
+}
+
+// Returns the extent described by (type, addr, laddr, len) if it is still
+// live, or a null CachedExtentRef otherwise.
+//  - If the cache lookup reports anything other than ABSENT, whatever it
+//    stored in ret is returned (ret stays null when nothing was stored).
+//  - Logical types: the lba mapping for laddr~len is consulted; the extent
+//    is returned only when a mapping exists and still points at addr.
+//  - Other types: delegated to lba_manager.get_physical_extent_if_live().
+TransactionManager::get_extent_if_live_ret TransactionManager::get_extent_if_live(
+ Transaction &t,
+ extent_types_t type,
+ paddr_t addr,
+ laddr_t laddr,
+ segment_off_t len)
+{
+ CachedExtentRef ret;
+ auto status = cache.get_extent_if_cached(t, addr, &ret);
+ if (status != Transaction::get_extent_ret::ABSENT) {
+ return get_extent_if_live_ret(
+ get_extent_if_live_ertr::ready_future_marker{},
+ ret);
+ }
+
+ if (is_logical_type(type)) {
+ return lba_manager.get_mapping(
+ t,
+ laddr,
+ len).safe_then([=, &t](lba_pin_list_t pins) {
+ // at most one mapping is expected for the range
+ ceph_assert(pins.size() <= 1);
+ if (pins.empty()) {
+ return get_extent_if_live_ret(
+ get_extent_if_live_ertr::ready_future_marker{},
+ CachedExtentRef());
+ }
+
+ auto pin = std::move(pins.front());
+ pins.pop_front();
+ ceph_assert(pin->get_laddr() == laddr);
+ ceph_assert(pin->get_length() == (extent_len_t)len);
+ // the mapping must still point at addr for the extent to be live
+ if (pin->get_paddr() == addr) {
+ return cache.get_extent_by_type(
+ t,
+ type,
+ addr,
+ laddr,
+ len).safe_then(
+ [this, pin=std::move(pin)](CachedExtentRef ret) mutable {
+ auto lref = ret->cast<LogicalCachedExtent>();
+ // attach the pin only if the extent does not have one yet
+ if (!lref->has_pin()) {
+ lref->set_pin(std::move(pin));
+ lba_manager.add_pin(lref->get_pin());
+ }
+ return get_extent_if_live_ret(
+ get_extent_if_live_ertr::ready_future_marker{},
+ ret);
+ });
+ } else {
+ return get_extent_if_live_ret(
+ get_extent_if_live_ertr::ready_future_marker{},
+ CachedExtentRef());
+ }
+ });
+ } else {
+ logger().debug(
+ "TransactionManager::get_extent_if_live: non-logical extent {}",
+ addr);
+ return lba_manager.get_physical_extent_if_live(
+ t,
+ type,
+ addr,
+ laddr,
+ len);
+ }
+}
+
+/// Nothing to release: the manager only holds references to its collaborators
+TransactionManager::~TransactionManager() = default;
+
+}
diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h
new file mode 100644
index 000000000..d28fd0b87
--- /dev/null
+++ b/src/crimson/os/seastore/transaction_manager.h
@@ -0,0 +1,296 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+#include <optional>
+#include <vector>
+#include <utility>
+#include <functional>
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include <seastar/core/future.hh>
+
+#include "include/ceph_assert.h"
+#include "include/buffer.h"
+
+#include "crimson/osd/exceptions.h"
+
+#include "crimson/os/seastore/segment_cleaner.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/cache.h"
+#include "crimson/os/seastore/segment_manager.h"
+#include "crimson/os/seastore/lba_manager.h"
+#include "crimson/os/seastore/journal.h"
+
+namespace crimson::os::seastore {
+class Journal;
+
+/**
+ * TransactionManager
+ *
+ * Abstraction hiding reading and writing to persistence.
+ * Exposes transaction based interface with read isolation.
+ */
+class TransactionManager : public SegmentCleaner::ExtentCallbackInterface {
+public:
+ TransactionManager(
+ SegmentManager &segment_manager,
+ SegmentCleaner &segment_cleaner,
+ Journal &journal,
+ Cache &cache,
+ LBAManager &lba_manager);
+
+ /// Writes initial metadata to disk. The returned future carries no value; input_output_error is the only error in mkfs_ertr.
+ using mkfs_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error
+ >;
+ mkfs_ertr::future<> mkfs();
+
+ /// Reads initial metadata from disk. The returned future carries no value; input_output_error is the only error in mount_ertr.
+ using mount_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error
+ >;
+ mount_ertr::future<> mount();
+
+ /// Closes transaction_manager. The returned future carries no value; input_output_error is the only error in close_ertr.
+ using close_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error
+ >;
+ close_ertr::future<> close();
+
+ /// Creates an empty transaction; thin wrapper that returns the result of make_transaction().
+ TransactionRef create_transaction() {
+ return make_transaction();
+ }
+
+ /// Creates a weak transaction; thin wrapper that returns the result of make_weak_transaction() (what "weak" implies is defined by that helper, not here).
+ TransactionRef create_weak_transaction() {
+ return make_weak_transaction();
+ }
+
+ /**
+  * read_extents
+  *
+  * Read extents corresponding to specified lba range.
+  *
+  * Looks up the mappings for offset~length through lba_manager, then, one
+  * pin at a time, fetches the extent at the pin's paddr/length from the
+  * cache as type T. An extent that has no pin yet takes ownership of the
+  * mapping's pin and is registered with lba_manager.add_pin(). The result
+  * is a list of (laddr, extent) pairs in the order the pins were visited.
+  */
+ using read_extent_ertr = SegmentManager::read_ertr;
+ template <typename T>
+ using read_extent_ret = read_extent_ertr::future<lextent_list_t<T>>;
+ template <typename T>
+ read_extent_ret<T> read_extents(
+   Transaction &t,
+   laddr_t offset,
+   extent_len_t length)
+ {
+   // Both lists live on the heap so the references handed to the
+   // continuations stay valid; ownership moves into the last continuation.
+   // The references must be taken before that move.
+   auto extents = std::make_unique<lextent_list_t<T>>();
+   auto &extents_ref = *extents;
+   auto held_pins = std::make_unique<lba_pin_list_t>();
+   auto &held_pins_ref = *held_pins;
+   return lba_manager.get_mapping(
+     t, offset, length
+   ).safe_then([this, &t, &held_pins_ref, &extents_ref](auto mappings) {
+     crimson::get_logger(ceph_subsys_filestore).debug(
+       "read_extents: mappings {}",
+       mappings);
+     // Park the pins in the heap-held list so they outlive this lambda.
+     mappings.swap(held_pins_ref);
+     return crimson::do_for_each(
+       held_pins_ref.begin(),
+       held_pins_ref.end(),
+       [this, &t, &extents_ref](auto &pin) {
+         crimson::get_logger(ceph_subsys_filestore).debug(
+           "read_extents: get_extent {}~{}",
+           pin->get_paddr(),
+           pin->get_length());
+         return cache.get_extent<T>(
+           t,
+           pin->get_paddr(),
+           pin->get_length()
+         ).safe_then([this, &pin, &extents_ref](auto extent) {
+           if (!extent->has_pin()) {
+             extent->set_pin(std::move(pin));
+             lba_manager.add_pin(extent->get_pin());
+           }
+           extents_ref.push_back(std::make_pair(extent->get_laddr(), extent));
+           crimson::get_logger(ceph_subsys_filestore).debug(
+             "read_extents: got extent {}",
+             *extent);
+           return read_extent_ertr::now();
+         });
+       });
+   }).safe_then([extents=std::move(extents), held_pins=std::move(held_pins)]() mutable {
+     // held_pins is captured only to keep the pin list alive until here.
+     return read_extent_ret<T>(
+       read_extent_ertr::ready_future_marker{},
+       std::move(*extents));
+   });
+ }
+
+ /// Obtain mutable copy of extent.
+ ///
+ /// Asks the cache to duplicate ref for writing within t. If the result
+ /// already has a pin, it is asserted to be ref itself (already pending);
+ /// otherwise the result receives a duplicate of ref's pin.
+ LogicalCachedExtentRef get_mutable_extent(Transaction &t, LogicalCachedExtentRef ref) {
+   auto &log = crimson::get_logger(ceph_subsys_filestore);
+   auto writable = cache.duplicate_for_write(t, ref)->cast<LogicalCachedExtent>();
+   if (writable->has_pin()) {
+     // A pin is already present: no fresh copy was made.
+     log.debug(
+       "{}: {} already pending",
+       __func__,
+       *ref);
+     assert(ref->is_pending());
+     assert(&*ref == &*writable);
+     return writable;
+   }
+
+   log.debug(
+     "{}: duplicating {} for write: {}",
+     __func__,
+     *ref,
+     *writable);
+   writable->set_pin(ref->get_pin().duplicate());
+   return writable;
+ }
+
+
+ using ref_ertr = LBAManager::ref_ertr; // error set reused from LBAManager
+ using ref_ret = ref_ertr::future<unsigned>; // unsigned payload: presumably the refcount after the update -- TODO confirm against LBAManager
+
+ /// Add refcount for the logical extent held by ref (ref is taken by non-const reference)
+ ref_ret inc_ref(
+ Transaction &t,
+ LogicalCachedExtentRef &ref);
+
+ /// Add refcount for the mapping identified by logical address offset
+ ref_ret inc_ref(
+ Transaction &t,
+ laddr_t offset);
+
+ /// Remove refcount for the logical extent held by ref (ref is taken by non-const reference)
+ ref_ret dec_ref(
+ Transaction &t,
+ LogicalCachedExtentRef &ref);
+
+ /// Remove refcount for the mapping identified by logical address offset
+ ref_ret dec_ref(
+ Transaction &t,
+ laddr_t offset);
+
+ /**
+  * alloc_extent
+  *
+  * Allocates a new block of type T with the minimum lba range of size len
+  * greater than hint.
+  *
+  * The extent is first created in the cache for transaction t; lba_manager
+  * then allocates the logical range for the extent's paddr, and the pin it
+  * returns is attached to the extent before the extent is handed back.
+  */
+ using alloc_extent_ertr = SegmentManager::read_ertr;
+ template <typename T>
+ using alloc_extent_ret = alloc_extent_ertr::future<TCachedExtentRef<T>>;
+ template <typename T>
+ alloc_extent_ret<T> alloc_extent(
+   Transaction &t,
+   laddr_t hint,
+   extent_len_t len)
+ {
+   auto extent = cache.alloc_new_extent<T>(t, len);
+   // extent->get_paddr() is read before the init-capture below moves extent:
+   // since C++17 the object expression of a member call is sequenced before
+   // that call's arguments.
+   return lba_manager.alloc_extent(
+     t, hint, len, extent->get_paddr()
+   ).safe_then([extent=std::move(extent)](auto &&pin) mutable {
+     extent->set_pin(std::move(pin));
+     return alloc_extent_ertr::make_ready_future<TCachedExtentRef<T>>(
+       std::move(extent));
+   });
+ }
+
+ /**
+ * submit_transaction
+ *
+ * Atomically submits transaction to persistence.
+ *
+ * Takes the TransactionRef by value. The returned future carries no value;
+ * the errors it can report are the two listed in submit_transaction_ertr.
+ */
+ using submit_transaction_ertr = crimson::errorator<
+ crimson::ct_error::eagain, // Caller should retry transaction from beginning
+ crimson::ct_error::input_output_error // Media error
+ >;
+ submit_transaction_ertr::future<> submit_transaction(TransactionRef);
+
+ /// SegmentCleaner::ExtentCallbackInterface overrides. Each return type is pulled in from the interface with a using-declaration, and each override is marked final.
+
+ using SegmentCleaner::ExtentCallbackInterface::get_next_dirty_extents_ret;
+ get_next_dirty_extents_ret get_next_dirty_extents(
+ journal_seq_t seq) final;
+
+ using SegmentCleaner::ExtentCallbackInterface::rewrite_extent_ret;
+ rewrite_extent_ret rewrite_extent(
+ Transaction &t,
+ CachedExtentRef extent) final;
+
+ using SegmentCleaner::ExtentCallbackInterface::get_extent_if_live_ret;
+ get_extent_if_live_ret get_extent_if_live( // defined in transaction_manager.cc
+ Transaction &t,
+ extent_types_t type,
+ paddr_t addr,
+ laddr_t laddr,
+ segment_off_t len) final;
+
+ // Local names for the interface's scan_extents types.
+ using scan_extents_cursor = SegmentCleaner::ExtentCallbackInterface::scan_extents_cursor;
+ using scan_extents_ertr = SegmentCleaner::ExtentCallbackInterface::scan_extents_ertr;
+ using scan_extents_ret = SegmentCleaner::ExtentCallbackInterface::scan_extents_ret;
+
+ /// ExtentCallbackInterface override: hands cursor and bytes_to_read
+ /// straight to journal.scan_extents() and returns its result.
+ scan_extents_ret scan_extents(
+   scan_extents_cursor &cursor,
+   extent_len_t bytes_to_read) final
+ {
+   return journal.scan_extents(cursor, bytes_to_read);
+ }
+
+ // Local name for the interface's release_segment return type.
+ using release_segment_ret = SegmentCleaner::ExtentCallbackInterface::release_segment_ret;
+
+ /// ExtentCallbackInterface override: hands id straight to
+ /// segment_manager.release() and returns its result.
+ release_segment_ret release_segment(segment_id_t id) final
+ {
+   return segment_manager.release(id);
+ }
+
+ /**
+  * read_onode_root
+  *
+  * Get onode-tree root logical address: obtains the root block through
+  * cache.get_root(t) and yields the onode_root field of its root record.
+  */
+ using read_onode_root_ertr = crimson::errorator<crimson::ct_error::input_output_error>;
+ using read_onode_root_ret = read_onode_root_ertr::future<laddr_t>;
+ read_onode_root_ret read_onode_root(Transaction &t) {
+   return cache.get_root(t).safe_then([](auto root_block) {
+     const auto &root = root_block->get_root();
+     return root.onode_root;
+   });
+ }
+
+ /**
+  * write_onode_root
+  *
+  * Write onode-tree root logical address, must be called after read.
+  *
+  * Takes the root block via cache.get_root_fast(t), obtains its writable
+  * version from cache.duplicate_for_write() and stores addr in onode_root.
+  */
+ void write_onode_root(Transaction &t, laddr_t addr) {
+   auto writable_root =
+     cache.duplicate_for_write(t, cache.get_root_fast(t))->cast<RootBlock>();
+   writable_root->get_root().onode_root = addr;
+ }
+
+ ~TransactionManager();
+
+private:
+ friend class Transaction;
+
+ SegmentManager &segment_manager;
+ SegmentCleaner &segment_cleaner;
+ Cache &cache;
+ LBAManager &lba_manager;
+ Journal &journal;
+};
+using TransactionManagerRef = std::unique_ptr<TransactionManager>;
+
+}