author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 18:45:59 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 18:45:59 +0000
commit     19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree       42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/crimson/os/seastore/onode_manager/staged-fltree
parent     Initial commit. (diff)
Adding upstream version 16.2.11+ds.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/crimson/os/seastore/onode_manager/staged-fltree')
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/fwd.h  93
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node.cc  809
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node.h  476
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h  42
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h  413
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.cc  35
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h  86
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h  156
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.cc  88
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h  126
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h  67
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.cc  39
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h  80
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.cc  76
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h  197
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h  613
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node_layout_replayable.h  75
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h  64
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc  165
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h  180
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc  32
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h  846
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc  318
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h  226
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc  96
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h  366
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h  2186
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h  411
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc  208
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h  341
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/super.cc  26
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/super.h  143
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/tree.cc  235
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/tree.h  119
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/tree_types.h  125
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h  333
36 files changed, 9891 insertions, 0 deletions
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/fwd.h b/src/crimson/os/seastore/onode_manager/staged-fltree/fwd.h
new file mode 100644
index 000000000..4908c691f
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/fwd.h
@@ -0,0 +1,93 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <algorithm>
+#include <cstring>
+#include <limits>
+#include <memory>
+#include <string>
+
+#include "crimson/common/errorator.h"
+#include "crimson/os/seastore/cached_extent.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/transaction.h"
+
+namespace crimson::os::seastore::onode {
+
+using crimson::os::seastore::Transaction;
+using crimson::os::seastore::TransactionRef;
+using crimson::os::seastore::make_transaction;
+using crimson::os::seastore::laddr_t;
+using crimson::os::seastore::L_ADDR_MIN;
+using crimson::os::seastore::L_ADDR_NULL;
+using crimson::os::seastore::extent_len_t;
+
+class DeltaRecorder;
+class NodeExtent;
+class NodeExtentManager;
+class RootNodeTracker;
+using DeltaRecorderURef = std::unique_ptr<DeltaRecorder>;
+using NodeExtentRef = crimson::os::seastore::TCachedExtentRef<NodeExtent>;
+using NodeExtentManagerURef = std::unique_ptr<NodeExtentManager>;
+using RootNodeTrackerURef = std::unique_ptr<RootNodeTracker>;
+struct context_t {
+ NodeExtentManager& nm;
+ Transaction& t;
+};
+
+class LeafNodeImpl;
+class InternalNodeImpl;
+class NodeImpl;
+using LeafNodeImplURef = std::unique_ptr<LeafNodeImpl>;
+using InternalNodeImplURef = std::unique_ptr<InternalNodeImpl>;
+using NodeImplURef = std::unique_ptr<NodeImpl>;
+
+using level_t = uint8_t;
+// a type only to index within a node, 32 bits should be enough
+using index_t = uint32_t;
+constexpr auto INDEX_END = std::numeric_limits<index_t>::max();
+constexpr auto INDEX_LAST = INDEX_END - 0x4;
+constexpr auto INDEX_UPPER_BOUND = INDEX_END - 0x8;
+inline bool is_valid_index(index_t index) { return index < INDEX_UPPER_BOUND; }
+
+// TODO: decide by NODE_BLOCK_SIZE
+using node_offset_t = uint16_t;
+constexpr node_offset_t DISK_BLOCK_SIZE = 1u << 12;
+constexpr node_offset_t NODE_BLOCK_SIZE = DISK_BLOCK_SIZE * 1u;
+
+enum class MatchKindBS : int8_t { NE = -1, EQ = 0 };
+
+enum class MatchKindCMP : int8_t { LT = -1, EQ = 0, GT };
+inline MatchKindCMP toMatchKindCMP(int value) {
+ if (value > 0) {
+ return MatchKindCMP::GT;
+ } else if (value < 0) {
+ return MatchKindCMP::LT;
+ } else {
+ return MatchKindCMP::EQ;
+ }
+}
+template <typename Type>
+MatchKindCMP toMatchKindCMP(const Type& l, const Type& r) {
+ int match = l - r;
+ return toMatchKindCMP(match);
+}
+
+inline MatchKindCMP toMatchKindCMP(
+ std::string_view l, std::string_view r) {
+ return toMatchKindCMP(l.compare(r));
+}
+
+inline MatchKindCMP reverse(MatchKindCMP cmp) {
+ if (cmp == MatchKindCMP::LT) {
+ return MatchKindCMP::GT;
+ } else if (cmp == MatchKindCMP::GT) {
+ return MatchKindCMP::LT;
+ } else {
+ return cmp;
+ }
+}
+
+}
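
The fwd.h helpers above reduce three-way comparisons to the small MatchKindCMP enum and derive the string case from std::string_view::compare(). A minimal standalone sketch of the same pattern, assuming nothing from the crimson headers (the cmp_t/to_cmp names are illustrative, not part of the tree code):

// Illustrative sketch only: mirrors the MatchKindCMP helpers from fwd.h
// without any crimson/seastore dependency.
#include <cassert>
#include <cstdint>
#include <string_view>

enum class cmp_t : int8_t { LT = -1, EQ = 0, GT = 1 };

inline cmp_t to_cmp(int value) {
  if (value > 0) return cmp_t::GT;
  if (value < 0) return cmp_t::LT;
  return cmp_t::EQ;
}

inline cmp_t to_cmp(std::string_view l, std::string_view r) {
  // string_view::compare() returns <0, 0 or >0, which is all to_cmp needs
  return to_cmp(l.compare(r));
}

inline cmp_t reverse(cmp_t cmp) {
  // swapping the operands flips LT/GT and keeps EQ
  if (cmp == cmp_t::LT) return cmp_t::GT;
  if (cmp == cmp_t::GT) return cmp_t::LT;
  return cmp;
}

int main() {
  assert(to_cmp("abc", "abd") == cmp_t::LT);
  assert(reverse(to_cmp("abc", "abd")) == cmp_t::GT);
  assert(to_cmp("abc", "abc") == cmp_t::EQ);
  return 0;
}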
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc
new file mode 100644
index 000000000..3df458f08
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc
@@ -0,0 +1,809 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "node.h"
+
+#include <cassert>
+#include <exception>
+#include <sstream>
+
+#include "common/likely.h"
+#include "crimson/common/log.h"
+#include "node_extent_manager.h"
+#include "node_impl.h"
+#include "stages/node_stage_layout.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+}
+
+namespace crimson::os::seastore::onode {
+
+using node_ertr = Node::node_ertr;
+template <class ValueT=void>
+using node_future = Node::node_future<ValueT>;
+
+/*
+ * tree_cursor_t
+ */
+
+tree_cursor_t::tree_cursor_t(Ref<LeafNode> node, const search_position_t& pos)
+ : leaf_node{node}, position{pos} {
+ assert(!is_end());
+ leaf_node->do_track_cursor<true>(*this);
+}
+
+tree_cursor_t::tree_cursor_t(
+ Ref<LeafNode> node, const search_position_t& pos,
+ const key_view_t& key, const onode_t* _p_value, layout_version_t v)
+ : leaf_node{node}, position{pos} {
+ assert(!is_end());
+ update_kv(key, _p_value, v);
+ leaf_node->do_track_cursor<true>(*this);
+}
+
+tree_cursor_t::tree_cursor_t(Ref<LeafNode> node)
+ : leaf_node{node}, position{search_position_t::end()} {
+ assert(is_end());
+ assert(leaf_node->is_level_tail());
+}
+
+tree_cursor_t::~tree_cursor_t() {
+ if (!is_end()) {
+ leaf_node->do_untrack_cursor(*this);
+ }
+}
+
+const key_view_t& tree_cursor_t::get_key_view() const {
+ ensure_kv();
+ return *key_view;
+}
+
+const onode_t* tree_cursor_t::get_p_value() const {
+ ensure_kv();
+ return p_value;
+}
+
+template <bool VALIDATE>
+void tree_cursor_t::update_track(
+ Ref<LeafNode> node, const search_position_t& pos) {
+ // the cursor must be already untracked
+ // track the new node and new pos
+ assert(!pos.is_end());
+ assert(!is_end());
+ leaf_node = node;
+ position = pos;
+ key_view.reset();
+ p_value = nullptr;
+ leaf_node->do_track_cursor<VALIDATE>(*this);
+}
+template void tree_cursor_t::update_track<true>(Ref<LeafNode>, const search_position_t&);
+template void tree_cursor_t::update_track<false>(Ref<LeafNode>, const search_position_t&);
+
+void tree_cursor_t::update_kv(
+ const key_view_t& key, const onode_t* _p_value, layout_version_t v) const {
+ assert(!is_end());
+ assert(_p_value);
+ assert(std::make_tuple(key, _p_value, v) == leaf_node->get_kv(position));
+ key_view = key;
+ p_value = _p_value;
+ node_version = v;
+}
+
+void tree_cursor_t::ensure_kv() const {
+ assert(!is_end());
+ if (!p_value || node_version != leaf_node->get_layout_version()) {
+ // NOTE: the leaf node is always present when we hold its reference.
+ std::tie(key_view, p_value, node_version) = leaf_node->get_kv(position);
+ }
+ assert(p_value);
+}
+
+/*
+ * Node
+ */
+
+Node::Node(NodeImplURef&& impl) : impl{std::move(impl)} {}
+
+Node::~Node() {
+ // XXX: tolerate failure between allocate() and as_child()
+ if (is_root()) {
+ super->do_untrack_root(*this);
+ } else {
+ _parent_info->ptr->do_untrack_child(*this);
+ }
+}
+
+level_t Node::level() const {
+ return impl->level();
+}
+
+node_future<Node::search_result_t> Node::lower_bound(
+ context_t c, const key_hobj_t& key) {
+ return seastar::do_with(
+ MatchHistory(), [this, c, &key](auto& history) {
+ return lower_bound_tracked(c, key, history);
+ }
+ );
+}
+
+node_future<std::pair<Ref<tree_cursor_t>, bool>> Node::insert(
+ context_t c, const key_hobj_t& key, const onode_t& value) {
+ return seastar::do_with(
+ MatchHistory(), [this, c, &key, &value](auto& history) {
+ return lower_bound_tracked(c, key, history
+ ).safe_then([c, &key, &value, &history](auto result) {
+ if (result.match() == MatchKindBS::EQ) {
+ return node_ertr::make_ready_future<std::pair<Ref<tree_cursor_t>, bool>>(
+ std::make_pair(result.p_cursor, false));
+ } else {
+ auto leaf_node = result.p_cursor->get_leaf_node();
+ return leaf_node->insert_value(
+ c, key, value, result.p_cursor->get_position(), history, result.mstat
+ ).safe_then([](auto p_cursor) {
+ return node_ertr::make_ready_future<std::pair<Ref<tree_cursor_t>, bool>>(
+ std::make_pair(p_cursor, true));
+ });
+ }
+ });
+ }
+ );
+}
+
+node_future<tree_stats_t> Node::get_tree_stats(context_t c) {
+ return seastar::do_with(
+ tree_stats_t(), [this, c](auto& stats) {
+ return do_get_tree_stats(c, stats).safe_then([&stats] {
+ return stats;
+ });
+ }
+ );
+}
+
+std::ostream& Node::dump(std::ostream& os) const {
+ return impl->dump(os);
+}
+
+std::ostream& Node::dump_brief(std::ostream& os) const {
+ return impl->dump_brief(os);
+}
+
+void Node::test_make_destructable(
+ context_t c, NodeExtentMutable& mut, Super::URef&& _super) {
+ impl->test_set_tail(mut);
+ make_root(c, std::move(_super));
+}
+
+node_future<> Node::mkfs(context_t c, RootNodeTracker& root_tracker) {
+ return LeafNode::allocate_root(c, root_tracker
+ ).safe_then([](auto ret) { /* FIXME: discard_result(); */ });
+}
+
+node_future<Ref<Node>> Node::load_root(context_t c, RootNodeTracker& root_tracker) {
+ return c.nm.get_super(c.t, root_tracker
+ ).safe_then([c, &root_tracker](auto&& _super) {
+ auto root_addr = _super->get_root_laddr();
+ assert(root_addr != L_ADDR_NULL);
+ return Node::load(c, root_addr, true
+ ).safe_then([c, _super = std::move(_super),
+ &root_tracker](auto root) mutable {
+ assert(root->impl->field_type() == field_type_t::N0);
+ root->as_root(std::move(_super));
+ std::ignore = c; // as only used in an assert
+ std::ignore = root_tracker;
+ assert(root == root_tracker.get_root(c.t));
+ return node_ertr::make_ready_future<Ref<Node>>(root);
+ });
+ });
+}
+
+void Node::make_root(context_t c, Super::URef&& _super) {
+ _super->write_root_laddr(c, impl->laddr());
+ as_root(std::move(_super));
+}
+
+void Node::as_root(Super::URef&& _super) {
+ assert(!super && !_parent_info);
+ assert(_super->get_root_laddr() == impl->laddr());
+ assert(impl->is_level_tail());
+ super = std::move(_super);
+ super->do_track_root(*this);
+}
+
+node_future<> Node::upgrade_root(context_t c) {
+ assert(is_root());
+ assert(impl->is_level_tail());
+ assert(impl->field_type() == field_type_t::N0);
+ super->do_untrack_root(*this);
+ return InternalNode::allocate_root(c, impl->level(), impl->laddr(), std::move(super)
+ ).safe_then([this](auto new_root) {
+ as_child(search_position_t::end(), new_root);
+ });
+}
+
+template <bool VALIDATE>
+void Node::as_child(const search_position_t& pos, Ref<InternalNode> parent_node) {
+ assert(!super);
+ _parent_info = parent_info_t{pos, parent_node};
+ parent_info().ptr->do_track_child<VALIDATE>(*this);
+}
+template void Node::as_child<true>(const search_position_t&, Ref<InternalNode>);
+template void Node::as_child<false>(const search_position_t&, Ref<InternalNode>);
+
+node_future<> Node::insert_parent(context_t c, Ref<Node> right_node) {
+ assert(!is_root());
+ // TODO(cross-node string dedup)
+ return parent_info().ptr->apply_child_split(
+ c, parent_info().position, this, right_node);
+}
+
+node_future<Ref<Node>> Node::load(
+ context_t c, laddr_t addr, bool expect_is_level_tail) {
+ // NOTE:
+ // *option1: all types of node have the same length;
+ // option2: length is defined by node/field types;
+ // option3: length is totally flexible;
+ return c.nm.read_extent(c.t, addr, NODE_BLOCK_SIZE
+ ).safe_then([expect_is_level_tail](auto extent) {
+ auto [node_type, field_type] = extent->get_types();
+ if (node_type == node_type_t::LEAF) {
+ auto impl = LeafNodeImpl::load(extent, field_type, expect_is_level_tail);
+ return Ref<Node>(new LeafNode(impl.get(), std::move(impl)));
+ } else if (node_type == node_type_t::INTERNAL) {
+ auto impl = InternalNodeImpl::load(extent, field_type, expect_is_level_tail);
+ return Ref<Node>(new InternalNode(impl.get(), std::move(impl)));
+ } else {
+ ceph_abort("impossible path");
+ }
+ });
+}
+
+/*
+ * InternalNode
+ */
+
+InternalNode::InternalNode(InternalNodeImpl* impl, NodeImplURef&& impl_ref)
+ : Node(std::move(impl_ref)), impl{impl} {}
+
+node_future<> InternalNode::apply_child_split(
+ context_t c, const search_position_t& pos,
+ Ref<Node> left_child, Ref<Node> right_child) {
+#ifndef NDEBUG
+ if (pos.is_end()) {
+ assert(impl->is_level_tail());
+ }
+#endif
+ impl->prepare_mutate(c);
+
+ auto left_key = left_child->impl->get_largest_key_view();
+ auto left_child_addr = left_child->impl->laddr();
+ auto left_child_addr_packed = laddr_packed_t{left_child_addr};
+ auto right_key = right_child->impl->get_largest_key_view();
+ auto right_child_addr = right_child->impl->laddr();
+ logger().debug("OTree::Internal::Insert: "
+ "pos({}), left_child({}, {:#x}), right_child({}, {:#x}) ...",
+ pos, left_key, left_child_addr, right_key, right_child_addr);
+ // update pos => left_child to pos => right_child
+ impl->replace_child_addr(pos, right_child_addr, left_child_addr);
+ replace_track(pos, right_child, left_child);
+
+ search_position_t insert_pos = pos;
+ auto [insert_stage, insert_size] = impl->evaluate_insert(
+ left_key, left_child_addr, insert_pos);
+ auto free_size = impl->free_size();
+ if (free_size >= insert_size) {
+ // insert
+ [[maybe_unused]] auto p_value = impl->insert(
+ left_key, left_child_addr_packed, insert_pos, insert_stage, insert_size);
+ assert(impl->free_size() == free_size - insert_size);
+ assert(insert_pos <= pos);
+ assert(p_value->value == left_child_addr);
+ track_insert(insert_pos, insert_stage, left_child, right_child);
+ validate_tracked_children();
+ return node_ertr::now();
+ }
+ // split and insert
+ Ref<InternalNode> this_ref = this;
+ return (is_root() ? upgrade_root(c) : node_ertr::now()
+ ).safe_then([this, c] {
+ return InternalNode::allocate(
+ c, impl->field_type(), impl->is_level_tail(), impl->level());
+ }).safe_then([this_ref, this, c, left_key, left_child, right_child,
+ insert_pos, insert_stage=insert_stage, insert_size=insert_size](auto fresh_right) mutable {
+ auto right_node = fresh_right.node;
+ auto left_child_addr = left_child->impl->laddr();
+ auto left_child_addr_packed = laddr_packed_t{left_child_addr};
+ auto [split_pos, is_insert_left, p_value] = impl->split_insert(
+ fresh_right.mut, *right_node->impl, left_key, left_child_addr_packed,
+ insert_pos, insert_stage, insert_size);
+ assert(p_value->value == left_child_addr);
+ track_split(split_pos, right_node);
+ if (is_insert_left) {
+ track_insert(insert_pos, insert_stage, left_child);
+ } else {
+ right_node->track_insert(insert_pos, insert_stage, left_child);
+ }
+ validate_tracked_children();
+ right_node->validate_tracked_children();
+
+ // propagate index to parent
+ return insert_parent(c, right_node);
+ // TODO (optimize)
+ // try to acquire space from siblings before split... see btrfs
+ });
+}
+
+node_future<Ref<InternalNode>> InternalNode::allocate_root(
+ context_t c, level_t old_root_level,
+ laddr_t old_root_addr, Super::URef&& super) {
+ return InternalNode::allocate(c, field_type_t::N0, true, old_root_level + 1
+ ).safe_then([c, old_root_addr,
+ super = std::move(super)](auto fresh_node) mutable {
+ auto root = fresh_node.node;
+ auto p_value = root->impl->get_p_value(search_position_t::end());
+ fresh_node.mut.copy_in_absolute(
+ const_cast<laddr_packed_t*>(p_value), old_root_addr);
+ root->make_root_from(c, std::move(super), old_root_addr);
+ return root;
+ });
+}
+
+node_future<Ref<tree_cursor_t>>
+InternalNode::lookup_smallest(context_t c) {
+ auto position = search_position_t::begin();
+ laddr_t child_addr = impl->get_p_value(position)->value;
+ return get_or_track_child(c, position, child_addr
+ ).safe_then([c](auto child) {
+ return child->lookup_smallest(c);
+ });
+}
+
+node_future<Ref<tree_cursor_t>>
+InternalNode::lookup_largest(context_t c) {
+ // NOTE: unlike LeafNode::lookup_largest(), this only works for the tail
+ // internal node to return the tail child address.
+ auto position = search_position_t::end();
+ laddr_t child_addr = impl->get_p_value(position)->value;
+ return get_or_track_child(c, position, child_addr).safe_then([c](auto child) {
+ return child->lookup_largest(c);
+ });
+}
+
+node_future<Node::search_result_t>
+InternalNode::lower_bound_tracked(
+ context_t c, const key_hobj_t& key, MatchHistory& history) {
+ auto result = impl->lower_bound(key, history);
+ return get_or_track_child(c, result.position, result.p_value->value
+ ).safe_then([c, &key, &history](auto child) {
+ // XXX(multi-type): pass result.mstat to child
+ return child->lower_bound_tracked(c, key, history);
+ });
+}
+
+node_future<> InternalNode::do_get_tree_stats(
+ context_t c, tree_stats_t& stats) {
+ auto nstats = impl->get_stats();
+ stats.size_persistent_internal += nstats.size_persistent;
+ stats.size_filled_internal += nstats.size_filled;
+ stats.size_logical_internal += nstats.size_logical;
+ stats.size_overhead_internal += nstats.size_overhead;
+ stats.size_value_internal += nstats.size_value;
+ stats.num_kvs_internal += nstats.num_kvs;
+ stats.num_nodes_internal += 1;
+
+ Ref<const InternalNode> this_ref = this;
+ return seastar::do_with(
+ search_position_t(), [this, this_ref, c, &stats](auto& pos) {
+ pos = search_position_t::begin();
+ return crimson::do_until(
+ [this, this_ref, c, &stats, &pos]() -> node_future<bool> {
+ auto child_addr = impl->get_p_value(pos)->value;
+ return get_or_track_child(c, pos, child_addr
+ ).safe_then([c, &stats](auto child) {
+ return child->do_get_tree_stats(c, stats);
+ }).safe_then([this, this_ref, &pos] {
+ if (pos.is_end()) {
+ return node_ertr::make_ready_future<bool>(true);
+ } else {
+ impl->next_position(pos);
+ if (pos.is_end()) {
+ if (impl->is_level_tail()) {
+ return node_ertr::make_ready_future<bool>(false);
+ } else {
+ return node_ertr::make_ready_future<bool>(true);
+ }
+ } else {
+ return node_ertr::make_ready_future<bool>(false);
+ }
+ }
+ });
+ });
+ }
+ );
+}
+
+node_future<> InternalNode::test_clone_root(
+ context_t c_other, RootNodeTracker& tracker_other) const {
+ assert(is_root());
+ assert(impl->is_level_tail());
+ assert(impl->field_type() == field_type_t::N0);
+ Ref<const InternalNode> this_ref = this;
+ return InternalNode::allocate(c_other, field_type_t::N0, true, impl->level()
+ ).safe_then([this, c_other, &tracker_other](auto fresh_other) {
+ impl->test_copy_to(fresh_other.mut);
+ auto cloned_root = fresh_other.node;
+ return c_other.nm.get_super(c_other.t, tracker_other
+ ).safe_then([c_other, cloned_root](auto&& super_other) {
+ cloned_root->make_root_new(c_other, std::move(super_other));
+ return cloned_root;
+ });
+ }).safe_then([this_ref, this, c_other](auto cloned_root) {
+ // clone tracked children
+    // In some unit tests, the children are stubbed out so that they
+    // don't exist in NodeExtentManager and are only tracked in memory.
+ return crimson::do_for_each(
+ tracked_child_nodes.begin(),
+ tracked_child_nodes.end(),
+ [this_ref, c_other, cloned_root](auto& kv) {
+ assert(kv.first == kv.second->parent_info().position);
+ return kv.second->test_clone_non_root(c_other, cloned_root);
+ }
+ );
+ });
+}
+
+node_future<Ref<Node>> InternalNode::get_or_track_child(
+ context_t c, const search_position_t& position, laddr_t child_addr) {
+ bool level_tail = position.is_end();
+ Ref<Node> child;
+ auto found = tracked_child_nodes.find(position);
+ Ref<InternalNode> this_ref = this;
+ return (found == tracked_child_nodes.end()
+ ? (logger().trace("OTree::Internal: load child untracked at {:#x}, pos({}), level={}",
+ child_addr, position, level() - 1),
+ Node::load(c, child_addr, level_tail
+ ).safe_then([this, position] (auto child) {
+ child->as_child(position, this);
+ return child;
+ }))
+ : (logger().trace("OTree::Internal: load child tracked at {:#x}, pos({}), level={}",
+ child_addr, position, level() - 1),
+ node_ertr::make_ready_future<Ref<Node>>(found->second))
+ ).safe_then([this_ref, this, position, child_addr] (auto child) {
+ assert(child_addr == child->impl->laddr());
+ assert(position == child->parent_info().position);
+ std::ignore = position;
+ std::ignore = child_addr;
+ validate_child(*child);
+ return child;
+ });
+}
+
+void InternalNode::track_insert(
+ const search_position_t& insert_pos, match_stage_t insert_stage,
+ Ref<Node> insert_child, Ref<Node> nxt_child) {
+ // update tracks
+ auto pos_upper_bound = insert_pos;
+ pos_upper_bound.index_by_stage(insert_stage) = INDEX_UPPER_BOUND;
+ auto first = tracked_child_nodes.lower_bound(insert_pos);
+ auto last = tracked_child_nodes.lower_bound(pos_upper_bound);
+ std::vector<Node*> nodes;
+ std::for_each(first, last, [&nodes](auto& kv) {
+ nodes.push_back(kv.second);
+ });
+ tracked_child_nodes.erase(first, last);
+ for (auto& node : nodes) {
+ auto _pos = node->parent_info().position;
+ assert(!_pos.is_end());
+ ++_pos.index_by_stage(insert_stage);
+ node->as_child(_pos, this);
+ }
+ // track insert
+ insert_child->as_child(insert_pos, this);
+
+#ifndef NDEBUG
+ // validate left_child is before right_child
+ if (nxt_child) {
+ auto iter = tracked_child_nodes.find(insert_pos);
+ ++iter;
+ assert(iter->second == nxt_child);
+ }
+#endif
+}
+
+void InternalNode::replace_track(
+ const search_position_t& position, Ref<Node> new_child, Ref<Node> old_child) {
+ assert(tracked_child_nodes[position] == old_child);
+ tracked_child_nodes.erase(position);
+ new_child->as_child(position, this);
+ assert(tracked_child_nodes[position] == new_child);
+}
+
+void InternalNode::track_split(
+ const search_position_t& split_pos, Ref<InternalNode> right_node) {
+ auto first = tracked_child_nodes.lower_bound(split_pos);
+ auto iter = first;
+ while (iter != tracked_child_nodes.end()) {
+ search_position_t new_pos = iter->first;
+ new_pos -= split_pos;
+ iter->second->as_child<false>(new_pos, right_node);
+ ++iter;
+ }
+ tracked_child_nodes.erase(first, tracked_child_nodes.end());
+}
+
+void InternalNode::validate_child(const Node& child) const {
+#ifndef NDEBUG
+ assert(impl->level() - 1 == child.impl->level());
+ assert(this == child.parent_info().ptr);
+ auto& child_pos = child.parent_info().position;
+ assert(impl->get_p_value(child_pos)->value == child.impl->laddr());
+ if (child_pos.is_end()) {
+ assert(impl->is_level_tail());
+ assert(child.impl->is_level_tail());
+ } else {
+ assert(!child.impl->is_level_tail());
+ assert(impl->get_key_view(child_pos) == child.impl->get_largest_key_view());
+ }
+ // XXX(multi-type)
+ assert(impl->field_type() <= child.impl->field_type());
+#endif
+}
+
+node_future<InternalNode::fresh_node_t> InternalNode::allocate(
+ context_t c, field_type_t field_type, bool is_level_tail, level_t level) {
+ return InternalNodeImpl::allocate(c, field_type, is_level_tail, level
+ ).safe_then([](auto&& fresh_impl) {
+ auto node = Ref<InternalNode>(new InternalNode(
+ fresh_impl.impl.get(), std::move(fresh_impl.impl)));
+ return fresh_node_t{node, fresh_impl.mut};
+ });
+}
+
+/*
+ * LeafNode
+ */
+
+LeafNode::LeafNode(LeafNodeImpl* impl, NodeImplURef&& impl_ref)
+ : Node(std::move(impl_ref)), impl{impl} {}
+
+bool LeafNode::is_level_tail() const {
+ return impl->is_level_tail();
+}
+
+std::tuple<key_view_t, const onode_t*, layout_version_t> LeafNode::get_kv(
+ const search_position_t& pos) const {
+ key_view_t key_view;
+ auto p_value = impl->get_p_value(pos, &key_view);
+ return {key_view, p_value, layout_version};
+}
+
+node_future<Ref<tree_cursor_t>>
+LeafNode::lookup_smallest(context_t) {
+ if (unlikely(impl->is_empty())) {
+ assert(is_root());
+ return node_ertr::make_ready_future<Ref<tree_cursor_t>>(
+ new tree_cursor_t(this));
+ }
+ auto pos = search_position_t::begin();
+ key_view_t index_key;
+ auto p_value = impl->get_p_value(pos, &index_key);
+ return node_ertr::make_ready_future<Ref<tree_cursor_t>>(
+ get_or_track_cursor(pos, index_key, p_value));
+}
+
+node_future<Ref<tree_cursor_t>>
+LeafNode::lookup_largest(context_t) {
+ if (unlikely(impl->is_empty())) {
+ assert(is_root());
+ return node_ertr::make_ready_future<Ref<tree_cursor_t>>(
+ new tree_cursor_t(this));
+ }
+ search_position_t pos;
+ const onode_t* p_value = nullptr;
+ key_view_t index_key;
+ impl->get_largest_slot(pos, index_key, &p_value);
+ return node_ertr::make_ready_future<Ref<tree_cursor_t>>(
+ get_or_track_cursor(pos, index_key, p_value));
+}
+
+node_future<Node::search_result_t>
+LeafNode::lower_bound_tracked(
+ context_t c, const key_hobj_t& key, MatchHistory& history) {
+ key_view_t index_key;
+ auto result = impl->lower_bound(key, history, &index_key);
+ Ref<tree_cursor_t> cursor;
+ if (result.position.is_end()) {
+ assert(!result.p_value);
+ cursor = new tree_cursor_t(this);
+ } else {
+ cursor = get_or_track_cursor(result.position, index_key, result.p_value);
+ }
+ return node_ertr::make_ready_future<search_result_t>(
+ search_result_t{cursor, result.mstat});
+}
+
+node_future<> LeafNode::do_get_tree_stats(context_t, tree_stats_t& stats) {
+ auto nstats = impl->get_stats();
+ stats.size_persistent_leaf += nstats.size_persistent;
+ stats.size_filled_leaf += nstats.size_filled;
+ stats.size_logical_leaf += nstats.size_logical;
+ stats.size_overhead_leaf += nstats.size_overhead;
+ stats.size_value_leaf += nstats.size_value;
+ stats.num_kvs_leaf += nstats.num_kvs;
+ stats.num_nodes_leaf += 1;
+ return node_ertr::now();
+}
+
+node_future<> LeafNode::test_clone_root(
+ context_t c_other, RootNodeTracker& tracker_other) const {
+ assert(is_root());
+ assert(impl->is_level_tail());
+ assert(impl->field_type() == field_type_t::N0);
+ Ref<const LeafNode> this_ref = this;
+ return LeafNode::allocate(c_other, field_type_t::N0, true
+ ).safe_then([this, c_other, &tracker_other](auto fresh_other) {
+ impl->test_copy_to(fresh_other.mut);
+ auto cloned_root = fresh_other.node;
+ return c_other.nm.get_super(c_other.t, tracker_other
+ ).safe_then([c_other, cloned_root](auto&& super_other) {
+ cloned_root->make_root_new(c_other, std::move(super_other));
+ });
+ }).safe_then([this_ref]{});
+}
+
+node_future<Ref<tree_cursor_t>> LeafNode::insert_value(
+ context_t c, const key_hobj_t& key, const onode_t& value,
+ const search_position_t& pos, const MatchHistory& history,
+ match_stat_t mstat) {
+#ifndef NDEBUG
+ if (pos.is_end()) {
+ assert(impl->is_level_tail());
+ }
+#endif
+ logger().debug("OTree::Leaf::Insert: "
+ "pos({}), {}, {}, {}, mstat({}) ...",
+ pos, key, value, history, mstat);
+ search_position_t insert_pos = pos;
+ auto [insert_stage, insert_size] = impl->evaluate_insert(
+ key, value, history, mstat, insert_pos);
+ auto free_size = impl->free_size();
+ if (free_size >= insert_size) {
+ // insert
+ on_layout_change();
+ impl->prepare_mutate(c);
+ auto p_value = impl->insert(key, value, insert_pos, insert_stage, insert_size);
+ assert(impl->free_size() == free_size - insert_size);
+ assert(insert_pos <= pos);
+ assert(p_value->size == value.size);
+ auto ret = track_insert(insert_pos, insert_stage, p_value);
+ validate_tracked_cursors();
+ return node_ertr::make_ready_future<Ref<tree_cursor_t>>(ret);
+ }
+ // split and insert
+ Ref<LeafNode> this_ref = this;
+ return (is_root() ? upgrade_root(c) : node_ertr::now()
+ ).safe_then([this, c] {
+ return LeafNode::allocate(c, impl->field_type(), impl->is_level_tail());
+ }).safe_then([this_ref, this, c, &key, &value,
+ insert_pos, insert_stage=insert_stage, insert_size=insert_size](auto fresh_right) mutable {
+ auto right_node = fresh_right.node;
+ // no need to bump version for right node, as it is fresh
+ on_layout_change();
+ impl->prepare_mutate(c);
+ auto [split_pos, is_insert_left, p_value] = impl->split_insert(
+ fresh_right.mut, *right_node->impl, key, value,
+ insert_pos, insert_stage, insert_size);
+ assert(p_value->size == value.size);
+ track_split(split_pos, right_node);
+ Ref<tree_cursor_t> ret;
+ if (is_insert_left) {
+ ret = track_insert(insert_pos, insert_stage, p_value);
+ } else {
+ ret = right_node->track_insert(insert_pos, insert_stage, p_value);
+ }
+ validate_tracked_cursors();
+ right_node->validate_tracked_cursors();
+
+ // propagate insert to parent
+ return insert_parent(c, right_node).safe_then([ret] {
+ return ret;
+ });
+ // TODO (optimize)
+ // try to acquire space from siblings before split... see btrfs
+ });
+}
+
+node_future<Ref<LeafNode>> LeafNode::allocate_root(
+ context_t c, RootNodeTracker& root_tracker) {
+ return LeafNode::allocate(c, field_type_t::N0, true
+ ).safe_then([c, &root_tracker](auto fresh_node) {
+ auto root = fresh_node.node;
+ return c.nm.get_super(c.t, root_tracker
+ ).safe_then([c, root](auto&& super) {
+ root->make_root_new(c, std::move(super));
+ return root;
+ });
+ });
+}
+
+Ref<tree_cursor_t> LeafNode::get_or_track_cursor(
+ const search_position_t& position,
+ const key_view_t& key, const onode_t* p_value) {
+ assert(!position.is_end());
+ assert(p_value);
+ Ref<tree_cursor_t> p_cursor;
+ auto found = tracked_cursors.find(position);
+ if (found == tracked_cursors.end()) {
+ p_cursor = new tree_cursor_t(this, position, key, p_value, layout_version);
+ } else {
+ p_cursor = found->second;
+ assert(p_cursor->get_leaf_node() == this);
+ assert(p_cursor->get_position() == position);
+ p_cursor->update_kv(key, p_value, layout_version);
+ }
+ return p_cursor;
+}
+
+void LeafNode::validate_cursor(tree_cursor_t& cursor) const {
+#ifndef NDEBUG
+ assert(this == cursor.get_leaf_node().get());
+ assert(!cursor.is_end());
+ auto [key, val, ver] = get_kv(cursor.get_position());
+ assert(key == cursor.get_key_view());
+ assert(val == cursor.get_p_value());
+#endif
+}
+
+Ref<tree_cursor_t> LeafNode::track_insert(
+ const search_position_t& insert_pos, match_stage_t insert_stage,
+ const onode_t* p_onode) {
+ // update cursor position
+ auto pos_upper_bound = insert_pos;
+ pos_upper_bound.index_by_stage(insert_stage) = INDEX_UPPER_BOUND;
+ auto first = tracked_cursors.lower_bound(insert_pos);
+ auto last = tracked_cursors.lower_bound(pos_upper_bound);
+ std::vector<tree_cursor_t*> p_cursors;
+ std::for_each(first, last, [&p_cursors](auto& kv) {
+ p_cursors.push_back(kv.second);
+ });
+ tracked_cursors.erase(first, last);
+ for (auto& p_cursor : p_cursors) {
+ search_position_t new_pos = p_cursor->get_position();
+ ++new_pos.index_by_stage(insert_stage);
+ p_cursor->update_track<true>(this, new_pos);
+ }
+
+ // track insert
+ // TODO: getting key_view_t from stage::proceed_insert() and
+  // stage::append_insert() is not supported yet
+ return new tree_cursor_t(this, insert_pos);
+}
+
+void LeafNode::track_split(
+ const search_position_t& split_pos, Ref<LeafNode> right_node) {
+ // update cursor ownership and position
+ auto first = tracked_cursors.lower_bound(split_pos);
+ auto iter = first;
+ while (iter != tracked_cursors.end()) {
+ search_position_t new_pos = iter->first;
+ new_pos -= split_pos;
+ iter->second->update_track<false>(right_node, new_pos);
+ ++iter;
+ }
+ tracked_cursors.erase(first, tracked_cursors.end());
+}
+
+node_future<LeafNode::fresh_node_t> LeafNode::allocate(
+ context_t c, field_type_t field_type, bool is_level_tail) {
+ return LeafNodeImpl::allocate(c, field_type, is_level_tail
+ ).safe_then([](auto&& fresh_impl) {
+ auto node = Ref<LeafNode>(new LeafNode(
+ fresh_impl.impl.get(), std::move(fresh_impl.impl)));
+ return fresh_node_t{node, fresh_impl.mut};
+ });
+}
+
+}
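
LeafNode::track_insert() and InternalNode::track_insert() above re-key every tracked entry whose position falls at or after the insert position (bounded by the insert stage) before registering the newly inserted entry. A minimal standalone sketch of that re-keying pattern, assuming a plain std::map keyed by an int position instead of search_position_t, and ignoring the per-stage upper bound (names are illustrative only):

// Illustrative sketch: shift tracked entries on insert, as track_insert() does
// for cursors/children. std::map and int stand in for the real position types.
#include <cassert>
#include <map>
#include <vector>

struct tracked_t { int pos; };

void track_insert(std::map<int, tracked_t*>& tracked, int insert_pos) {
  // collect the entries located at or after the insert position
  auto first = tracked.lower_bound(insert_pos);
  std::vector<tracked_t*> shifted;
  for (auto it = first; it != tracked.end(); ++it) {
    shifted.push_back(it->second);
  }
  tracked.erase(first, tracked.end());
  // re-register each of them one slot later
  for (auto* t : shifted) {
    ++t->pos;
    tracked[t->pos] = t;
  }
}

int main() {
  tracked_t a{0}, b{2}, c{3};
  std::map<int, tracked_t*> tracked{{0, &a}, {2, &b}, {3, &c}};
  track_insert(tracked, 2);   // something new is inserted at position 2
  assert(a.pos == 0 && b.pos == 3 && c.pos == 4);
  assert(tracked.at(3) == &b && tracked.at(4) == &c);
  return 0;
}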
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node.h
new file mode 100644
index 000000000..d6af489e7
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node.h
@@ -0,0 +1,476 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <ostream>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include "crimson/common/type_helpers.h"
+
+#include "node_extent_mutable.h"
+#include "stages/key_layout.h"
+#include "stages/stage_types.h"
+#include "super.h"
+#include "tree_types.h"
+
+/**
+ * Tree example (2 levels):
+ *
+ * Root node keys: [ 3 7 ]
+ * values: [p1 p2 p3]
+ * / | \
+ * ------- | -------
+ * | | |
+ * V V V
+ * Leaf node keys: [ 1 2 3] [ 4 5 7] [ 9 11 12]
+ * values: [v1 v2 v3] [v4 v5 v6] [v7 v8 v9]
+ *
+ * Tree structure properties:
+ * - As illustrated above, the parent key is strictly equal to its left child's
+ * largest key;
+ * - If a tree is indexing multiple seastore transactions, each transaction
+ * will be mapped to a Super which points to a distinct root node. So the
+ * transactions are isolated at tree level. However, tree nodes from
+ * different transactions can reference the same seastore CachedExtent before
+ * modification;
+ * - The resources of the transactional tree are tracked by tree_cursor_ts held
+ *   by users. As long as any cursor is alive, the corresponding tree hierarchy
+ *   stays alive and tracked. See the reversed resource management sections
+ * below;
+ */
+
+namespace crimson::os::seastore::onode {
+
+class LeafNode;
+class InternalNode;
+
+/**
+ * tree_cursor_t
+ *
+ * A cursor points to a position (LeafNode and search_position_t) of the tree
+ * where it can find the according key and value pair. The position is updated
+ * by LeafNode insert/split/delete/merge internally and is kept valid. It also
+ * caches the key-value information for a specific node layout version.
+ *
+ * Exposes public interfaces for Btree::Cursor.
+ */
+using layout_version_t = uint32_t;
+class tree_cursor_t final
+ : public boost::intrusive_ref_counter<
+ tree_cursor_t, boost::thread_unsafe_counter> {
+ public:
+ // public to Btree
+ ~tree_cursor_t();
+ tree_cursor_t(const tree_cursor_t&) = delete;
+ tree_cursor_t(tree_cursor_t&&) = delete;
+ tree_cursor_t& operator=(const tree_cursor_t&) = delete;
+ tree_cursor_t& operator=(tree_cursor_t&&) = delete;
+
+ /**
+ * is_end
+ *
+ * Represents one-past-the-last of all the sorted key-value
+ * pairs in the tree. An end cursor won't contain valid key-value
+ * information.
+ */
+ bool is_end() const { return position.is_end(); }
+
+ /// Returns the key view in tree if it is not an end cursor.
+ const key_view_t& get_key_view() const;
+
+ /// Returns the value pointer in tree if it is not an end cursor.
+ const onode_t* get_p_value() const;
+
+ private:
+ tree_cursor_t(Ref<LeafNode>, const search_position_t&);
+ tree_cursor_t(Ref<LeafNode>, const search_position_t&,
+ const key_view_t& key, const onode_t*, layout_version_t);
+  // lookup reached the end; contains the leaf node for further inserts
+ tree_cursor_t(Ref<LeafNode>);
+ const search_position_t& get_position() const { return position; }
+ Ref<LeafNode> get_leaf_node() { return leaf_node; }
+ template <bool VALIDATE>
+ void update_track(Ref<LeafNode>, const search_position_t&);
+ void update_kv(const key_view_t&, const onode_t*, layout_version_t) const;
+ void ensure_kv() const;
+
+ private:
+ /**
+ * Reversed resource management (tree_cursor_t)
+ *
+ * tree_cursor_t holds a reference to the LeafNode, so the LeafNode will be
+   * alive as long as any of its cursors is still referenced by the user.
+ */
+ Ref<LeafNode> leaf_node;
+ search_position_t position;
+
+ // cached information
+ mutable std::optional<key_view_t> key_view;
+ mutable const onode_t* p_value;
+ mutable layout_version_t node_version;
+
+ friend class LeafNode;
+ friend class Node; // get_position(), get_leaf_node()
+};
+
+/**
+ * Node
+ *
+ * An abstracted class for both InternalNode and LeafNode.
+ *
+ * Exposes public interfaces for Btree.
+ */
+class Node
+ : public boost::intrusive_ref_counter<
+ Node, boost::thread_unsafe_counter> {
+ public:
+ // public to Btree
+ using node_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg,
+ crimson::ct_error::enoent,
+ crimson::ct_error::erange>;
+ template <class ValueT=void>
+ using node_future = node_ertr::future<ValueT>;
+
+ struct search_result_t {
+ bool is_end() const { return p_cursor->is_end(); }
+ Ref<tree_cursor_t> p_cursor;
+ match_stat_t mstat;
+
+ MatchKindBS match() const {
+ assert(mstat >= MSTAT_MIN && mstat <= MSTAT_MAX);
+ return (mstat == MSTAT_EQ ? MatchKindBS::EQ : MatchKindBS::NE);
+ }
+ };
+
+ virtual ~Node();
+ Node(const Node&) = delete;
+ Node(Node&&) = delete;
+ Node& operator=(const Node&) = delete;
+ Node& operator=(Node&&) = delete;
+
+ /**
+ * level
+ *
+   * The level (or height) of this node in the tree.
+   * 0 means LeafNode, a positive value means InternalNode.
+ */
+ level_t level() const;
+
+ /**
+ * lookup_smallest
+ *
+ * Returns a cursor pointing to the smallest key in the sub-tree formed by
+ * this node.
+ *
+ * Returns an end cursor if it is an empty root node.
+ */
+ virtual node_future<Ref<tree_cursor_t>> lookup_smallest(context_t) = 0;
+
+ /**
+ * lookup_largest
+ *
+ * Returns a cursor pointing to the largest key in the sub-tree formed by
+ * this node.
+ *
+ * Returns an end cursor if it is an empty root node.
+ */
+ virtual node_future<Ref<tree_cursor_t>> lookup_largest(context_t) = 0;
+
+ /**
+ * lower_bound
+ *
+ * Returns a cursor pointing to the first element in the range [first, last)
+ * of the sub-tree which does not compare less than the input key. The
+ * result also denotes whether the pointed key is equal to the input key.
+ *
+ * Returns an end cursor with MatchKindBS::NE if:
+ * - It is an empty root node;
+ * - Or the input key is larger than all the keys in the sub-tree;
+ */
+ node_future<search_result_t> lower_bound(context_t c, const key_hobj_t& key);
+
+ /**
+ * insert
+ *
+ * Try to insert a key-value pair into the sub-tree formed by this node.
+ *
+ * Returns a boolean denoting whether the insertion is successful:
+ * - If true, the returned cursor points to the inserted element in tree;
+ * - If false, the returned cursor points to the conflicting element in tree;
+ */
+ node_future<std::pair<Ref<tree_cursor_t>, bool>> insert(
+ context_t, const key_hobj_t&, const onode_t&);
+
+ /// Recursively collects the statistics of the sub-tree formed by this node
+ node_future<tree_stats_t> get_tree_stats(context_t);
+
+ /// Returns an ostream containing a dump of all the elements in the node.
+ std::ostream& dump(std::ostream&) const;
+
+  /// Returns an ostream containing a one-line summary of this node.
+ std::ostream& dump_brief(std::ostream&) const;
+
+ /// Initializes the tree by allocating an empty root node.
+ static node_future<> mkfs(context_t, RootNodeTracker&);
+
+ /// Loads the tree root. The tree must be initialized.
+ static node_future<Ref<Node>> load_root(context_t, RootNodeTracker&);
+
+ // Only for unit test purposes.
+ void test_make_destructable(context_t, NodeExtentMutable&, Super::URef&&);
+ virtual node_future<> test_clone_root(context_t, RootNodeTracker&) const = 0;
+
+ protected:
+ virtual node_future<> test_clone_non_root(context_t, Ref<InternalNode>) const {
+ ceph_abort("impossible path");
+ }
+ virtual node_future<search_result_t> lower_bound_tracked(
+ context_t, const key_hobj_t&, MatchHistory&) = 0;
+ virtual node_future<> do_get_tree_stats(context_t, tree_stats_t&) = 0;
+
+ protected:
+ Node(NodeImplURef&&);
+ bool is_root() const {
+ assert((super && !_parent_info.has_value()) ||
+ (!super && _parent_info.has_value()));
+ return !_parent_info.has_value();
+ }
+
+ // as root
+ void make_root(context_t c, Super::URef&& _super);
+ void make_root_new(context_t c, Super::URef&& _super) {
+ assert(_super->get_root_laddr() == L_ADDR_NULL);
+ make_root(c, std::move(_super));
+ }
+ void make_root_from(context_t c, Super::URef&& _super, laddr_t from_addr) {
+ assert(_super->get_root_laddr() == from_addr);
+ make_root(c, std::move(_super));
+ }
+ void as_root(Super::URef&& _super);
+ node_future<> upgrade_root(context_t);
+
+ // as child/non-root
+ template <bool VALIDATE = true>
+ void as_child(const search_position_t&, Ref<InternalNode>);
+ struct parent_info_t {
+ search_position_t position;
+ Ref<InternalNode> ptr;
+ };
+ const parent_info_t& parent_info() const { return *_parent_info; }
+ node_future<> insert_parent(context_t, Ref<Node> right_node);
+
+ private:
+ /**
+ * Reversed resource management (Node)
+ *
+ * Root Node holds a reference to its parent Super class, so its parent
+ * will be alive as long as this root node is alive.
+ *
+   * A non-root Node holds a reference to its parent Node, so its parent will
+   * be alive as long as any of its children is alive.
+ */
+ // as root
+ Super::URef super;
+ // as child/non-root
+ std::optional<parent_info_t> _parent_info;
+
+ private:
+ static node_future<Ref<Node>> load(context_t, laddr_t, bool expect_is_level_tail);
+
+ NodeImplURef impl;
+ friend class InternalNode;
+};
+inline std::ostream& operator<<(std::ostream& os, const Node& node) {
+ return node.dump_brief(os);
+}
+
+/**
+ * InternalNode
+ *
+ * A concrete implementation of Node class that represents an internal tree
+ * node. Its level is always positive and its values are logical block
+ * addresses to its child nodes. An internal node cannot be empty.
+ */
+class InternalNode final : public Node {
+ public:
+ // public to Node
+ InternalNode(InternalNodeImpl*, NodeImplURef&&);
+ ~InternalNode() override { assert(tracked_child_nodes.empty()); }
+ InternalNode(const InternalNode&) = delete;
+ InternalNode(InternalNode&&) = delete;
+ InternalNode& operator=(const InternalNode&) = delete;
+ InternalNode& operator=(InternalNode&&) = delete;
+
+ node_future<> apply_child_split(
+ context_t, const search_position_t&, Ref<Node> left, Ref<Node> right);
+ template <bool VALIDATE>
+ void do_track_child(Node& child) {
+ if constexpr (VALIDATE) {
+ validate_child(child);
+ }
+ auto& child_pos = child.parent_info().position;
+ assert(tracked_child_nodes.find(child_pos) == tracked_child_nodes.end());
+ tracked_child_nodes[child_pos] = &child;
+ }
+ void do_untrack_child(const Node& child) {
+ auto& child_pos = child.parent_info().position;
+ assert(tracked_child_nodes.find(child_pos)->second == &child);
+ [[maybe_unused]] auto removed = tracked_child_nodes.erase(child_pos);
+ assert(removed);
+ }
+
+ static node_future<Ref<InternalNode>> allocate_root(
+ context_t, level_t, laddr_t, Super::URef&&);
+
+ protected:
+ node_future<Ref<tree_cursor_t>> lookup_smallest(context_t) override;
+ node_future<Ref<tree_cursor_t>> lookup_largest(context_t) override;
+ node_future<search_result_t> lower_bound_tracked(
+ context_t, const key_hobj_t&, MatchHistory&) override;
+ node_future<> do_get_tree_stats(context_t, tree_stats_t&) override;
+
+ node_future<> test_clone_root(context_t, RootNodeTracker&) const override;
+
+ private:
+ // XXX: extract a common tracker for InternalNode to track Node,
+ // and LeafNode to track tree_cursor_t.
+ node_future<Ref<Node>> get_or_track_child(context_t, const search_position_t&, laddr_t);
+ void track_insert(
+ const search_position_t&, match_stage_t, Ref<Node>, Ref<Node> nxt_child = nullptr);
+ void replace_track(const search_position_t&, Ref<Node> new_child, Ref<Node> old_child);
+ void track_split(const search_position_t&, Ref<InternalNode>);
+ void validate_tracked_children() const {
+#ifndef NDEBUG
+ for (auto& kv : tracked_child_nodes) {
+ assert(kv.first == kv.second->parent_info().position);
+ validate_child(*kv.second);
+ }
+#endif
+ }
+ void validate_child(const Node& child) const;
+
+ struct fresh_node_t {
+ Ref<InternalNode> node;
+ NodeExtentMutable mut;
+ std::pair<Ref<Node>, NodeExtentMutable> make_pair() {
+ return std::make_pair(Ref<Node>(node), mut);
+ }
+ };
+ static node_future<fresh_node_t> allocate(context_t, field_type_t, bool, level_t);
+
+ private:
+ /**
+ * Reversed resource management (InternalNode)
+ *
+   * InternalNode keeps track of its child nodes which are still alive in
+ * memory, and their positions will be updated throughout
+ * insert/split/delete/merge operations of this node.
+ */
+ // XXX: leverage intrusive data structure to control memory overhead
+ std::map<search_position_t, Node*> tracked_child_nodes;
+ InternalNodeImpl* impl;
+};
+
+/**
+ * LeafNode
+ *
+ * A concrete implementation of Node class that represents a leaf tree node.
+ * Its level is always 0. A leaf node can only be empty if it is root.
+ */
+class LeafNode final : public Node {
+ public:
+ // public to tree_cursor_t
+ ~LeafNode() override { assert(tracked_cursors.empty()); }
+ LeafNode(const LeafNode&) = delete;
+ LeafNode(LeafNode&&) = delete;
+ LeafNode& operator=(const LeafNode&) = delete;
+ LeafNode& operator=(LeafNode&&) = delete;
+
+ bool is_level_tail() const;
+ layout_version_t get_layout_version() const { return layout_version; }
+ std::tuple<key_view_t, const onode_t*, layout_version_t> get_kv(
+ const search_position_t&) const;
+ template <bool VALIDATE>
+ void do_track_cursor(tree_cursor_t& cursor) {
+ if constexpr (VALIDATE) {
+ validate_cursor(cursor);
+ }
+ auto& cursor_pos = cursor.get_position();
+ assert(tracked_cursors.find(cursor_pos) == tracked_cursors.end());
+ tracked_cursors[cursor_pos] = &cursor;
+ }
+ void do_untrack_cursor(tree_cursor_t& cursor) {
+ validate_cursor(cursor);
+ auto& cursor_pos = cursor.get_position();
+ assert(tracked_cursors.find(cursor_pos)->second == &cursor);
+ [[maybe_unused]] auto removed = tracked_cursors.erase(cursor_pos);
+ assert(removed);
+ }
+
+ protected:
+ node_future<Ref<tree_cursor_t>> lookup_smallest(context_t) override;
+ node_future<Ref<tree_cursor_t>> lookup_largest(context_t) override;
+ node_future<search_result_t> lower_bound_tracked(
+ context_t, const key_hobj_t&, MatchHistory&) override;
+ node_future<> do_get_tree_stats(context_t, tree_stats_t&) override;
+
+ node_future<> test_clone_root(context_t, RootNodeTracker&) const override;
+
+ private:
+ LeafNode(LeafNodeImpl*, NodeImplURef&&);
+ node_future<Ref<tree_cursor_t>> insert_value(
+ context_t, const key_hobj_t&, const onode_t&,
+ const search_position_t&, const MatchHistory&,
+ match_stat_t mstat);
+ static node_future<Ref<LeafNode>> allocate_root(context_t, RootNodeTracker&);
+ friend class Node;
+
+ private:
+ // XXX: extract a common tracker for InternalNode to track Node,
+ // and LeafNode to track tree_cursor_t.
+ Ref<tree_cursor_t> get_or_track_cursor(
+ const search_position_t&, const key_view_t&, const onode_t*);
+ Ref<tree_cursor_t> track_insert(
+ const search_position_t&, match_stage_t, const onode_t*);
+ void track_split(const search_position_t&, Ref<LeafNode>);
+ void validate_tracked_cursors() const {
+#ifndef NDEBUG
+ for (auto& kv : tracked_cursors) {
+ assert(kv.first == kv.second->get_position());
+ validate_cursor(*kv.second);
+ }
+#endif
+ }
+ void validate_cursor(tree_cursor_t& cursor) const;
+ // invalidate p_value pointers in tree_cursor_t
+ void on_layout_change() { ++layout_version; }
+
+ struct fresh_node_t {
+ Ref<LeafNode> node;
+ NodeExtentMutable mut;
+ std::pair<Ref<Node>, NodeExtentMutable> make_pair() {
+ return std::make_pair(Ref<Node>(node), mut);
+ }
+ };
+ static node_future<fresh_node_t> allocate(context_t, field_type_t, bool);
+
+ private:
+ /**
+ * Reversed resource management (LeafNode)
+ *
+ * LeafNode keeps track of the referencing cursors which are still alive in
+ * memory, and their positions will be updated throughout
+ * insert/split/delete/merge operations of this node.
+ */
+ // XXX: leverage intrusive data structure to control memory overhead
+ std::map<search_position_t, tree_cursor_t*> tracked_cursors;
+ LeafNodeImpl* impl;
+ layout_version_t layout_version = 0;
+};
+
+}
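
The "Reversed resource management" comments above describe ownership pointing from the bottom up: a cursor owns its leaf, a non-root node owns its parent, and a parent only tracks its children through raw pointers. A minimal standalone sketch of that shape, assuming std::shared_ptr in place of Ref<> and plain structs in place of the actual classes:

// Illustrative sketch: bottom-up ownership as described in node.h.
// std::shared_ptr stands in for Ref<>; the raw-pointer map stands in for
// tracked_child_nodes / tracked_cursors.
#include <cassert>
#include <map>
#include <memory>

struct node_t {
  // the parent is owned (kept alive) by the child ...
  std::shared_ptr<node_t> parent;
  // ... while children are only tracked, never owned, by the parent
  std::map<int, node_t*> tracked_children;
};

struct cursor_t {
  // a cursor keeps its leaf, and therefore the whole path to the root, alive
  std::shared_ptr<node_t> leaf;
};

int main() {
  auto root = std::make_shared<node_t>();
  auto leaf = std::make_shared<node_t>();
  leaf->parent = root;
  root->tracked_children[0] = leaf.get();

  cursor_t cursor{leaf};
  std::weak_ptr<node_t> root_watch = root;

  // drop the external references; the cursor alone keeps the path alive
  leaf.reset();
  root.reset();
  assert(!root_watch.expired());

  // once the last cursor goes away, the whole path can be released
  cursor.leaf.reset();
  assert(root_watch.expired());
  return 0;
}

In the real tree the destructors additionally untrack themselves from their parent or leaf (see Node::~Node and tree_cursor_t::~tree_cursor_t above), which the sketch omits.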
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h
new file mode 100644
index 000000000..d08a99015
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "include/buffer.h"
+#include "node_types.h"
+
+namespace crimson::os::seastore::onode {
+
+/**
+ * DeltaRecorder
+ *
+ * An abstract class that encapsulates the different implementations used to
+ * apply deltas to a specific node layout.
+ */
+class DeltaRecorder {
+ public:
+ virtual ~DeltaRecorder() {
+ assert(is_empty());
+ }
+
+ bool is_empty() const {
+ return encoded.length() == 0;
+ }
+
+ ceph::bufferlist get_delta() {
+ assert(!is_empty());
+ return std::move(encoded);
+ }
+
+ virtual node_type_t node_type() const = 0;
+ virtual field_type_t field_type() const = 0;
+ virtual void apply_delta(ceph::bufferlist::const_iterator&,
+ NodeExtentMutable&) = 0;
+
+ protected:
+ DeltaRecorder() = default;
+ ceph::bufferlist encoded;
+};
+
+}
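
DeltaRecorder above only accumulates encoded mutations so they can be replayed against the node extent later; the per-layout encoding and replay logic lives in DeltaRecorderT (node_extent_accessor.h, next). A minimal standalone sketch of the record-then-replay split, assuming a std::vector<uint8_t> buffer in place of ceph::bufferlist and a byte array in place of the extent (the op set and names are illustrative only):

// Illustrative sketch: record encoded ops, then replay them against a node
// image, mirroring the DeltaRecorder record/apply_delta split. No ceph types.
#include <cassert>
#include <cstdint>
#include <vector>

enum class op_t : uint8_t { UPDATE_BYTE };

struct recorder_t {
  std::vector<uint8_t> encoded;

  // record "write value v at offset off" instead of applying it immediately
  void encode_update(uint16_t off, uint8_t v) {
    encoded.push_back(static_cast<uint8_t>(op_t::UPDATE_BYTE));
    encoded.push_back(static_cast<uint8_t>(off & 0xff));
    encoded.push_back(static_cast<uint8_t>(off >> 8));
    encoded.push_back(v);
  }

  // replay all recorded ops against a node image
  void apply_delta(std::vector<uint8_t>& node) {
    size_t i = 0;
    while (i < encoded.size()) {
      auto op = static_cast<op_t>(encoded[i++]);
      assert(op == op_t::UPDATE_BYTE);
      uint16_t off = uint16_t(encoded[i] | (encoded[i + 1] << 8));
      i += 2;
      node[off] = encoded[i++];
    }
  }
};

int main() {
  std::vector<uint8_t> node(16, 0);
  recorder_t rec;
  rec.encode_update(3, 0xab);
  rec.encode_update(7, 0xcd);
  assert(node[3] == 0);               // only recorded so far, nothing applied
  rec.apply_delta(node);
  assert(node[3] == 0xab && node[7] == 0xcd);
  return 0;
}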
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h
new file mode 100644
index 000000000..94782f50d
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h
@@ -0,0 +1,413 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/common/log.h"
+#include "node_extent_manager.h"
+#include "node_delta_recorder.h"
+#include "node_layout_replayable.h"
+
+#ifndef NDEBUG
+#include "node_extent_manager/test_replay.h"
+#endif
+
+namespace crimson::os::seastore::onode {
+
+/**
+ * DeltaRecorderT
+ *
+ * Responsible for encoding and decoding deltas, and for applying deltas for a
+ * specific node layout.
+ */
+template <typename FieldType, node_type_t NODE_TYPE>
+class DeltaRecorderT final: public DeltaRecorder {
+ enum class op_t : uint8_t {
+ INSERT,
+ SPLIT,
+ SPLIT_INSERT,
+ UPDATE_CHILD_ADDR,
+ };
+
+ public:
+ using layout_t = NodeLayoutReplayableT<FieldType, NODE_TYPE>;
+ using node_stage_t = typename layout_t::node_stage_t;
+ using position_t = typename layout_t::position_t;
+ using StagedIterator = typename layout_t::StagedIterator;
+ using value_t = typename layout_t::value_t;
+ static constexpr auto FIELD_TYPE = layout_t::FIELD_TYPE;
+
+ ~DeltaRecorderT() override = default;
+
+ template <KeyT KT>
+ void encode_insert(
+ const full_key_t<KT>& key,
+ const value_t& value,
+ const position_t& insert_pos,
+ const match_stage_t& insert_stage,
+ const node_offset_t& insert_size) {
+ ceph::encode(op_t::INSERT, encoded);
+ encode_key<KT>(key, encoded);
+ encode_value(value, encoded);
+ insert_pos.encode(encoded);
+ ceph::encode(insert_stage, encoded);
+ ceph::encode(insert_size, encoded);
+ }
+
+ void encode_split(
+ const StagedIterator& split_at,
+ const char* p_node_start) {
+ ceph::encode(op_t::SPLIT, encoded);
+ split_at.encode(p_node_start, encoded);
+ }
+
+ template <KeyT KT>
+ void encode_split_insert(
+ const StagedIterator& split_at,
+ const full_key_t<KT>& key,
+ const value_t& value,
+ const position_t& insert_pos,
+ const match_stage_t& insert_stage,
+ const node_offset_t& insert_size,
+ const char* p_node_start) {
+ ceph::encode(op_t::SPLIT_INSERT, encoded);
+ split_at.encode(p_node_start, encoded);
+ encode_key<KT>(key, encoded);
+ encode_value(value, encoded);
+ insert_pos.encode(encoded);
+ ceph::encode(insert_stage, encoded);
+ ceph::encode(insert_size, encoded);
+ }
+
+ void encode_update_child_addr(
+ const laddr_t new_addr,
+ const laddr_packed_t* p_addr,
+ const char* p_node_start) {
+ ceph::encode(op_t::UPDATE_CHILD_ADDR, encoded);
+ ceph::encode(new_addr, encoded);
+ int node_offset = reinterpret_cast<const char*>(p_addr) - p_node_start;
+ assert(node_offset > 0 && node_offset <= NODE_BLOCK_SIZE);
+ ceph::encode(static_cast<node_offset_t>(node_offset), encoded);
+ }
+
+ static DeltaRecorderURef create() {
+ return std::unique_ptr<DeltaRecorder>(new DeltaRecorderT());
+ }
+
+ protected:
+ DeltaRecorderT() = default;
+ node_type_t node_type() const override { return NODE_TYPE; }
+ field_type_t field_type() const override { return FIELD_TYPE; }
+ void apply_delta(ceph::bufferlist::const_iterator& delta,
+ NodeExtentMutable& node) override {
+ assert(is_empty());
+ node_stage_t stage(reinterpret_cast<const FieldType*>(node.get_read()));
+ op_t op;
+ try {
+ ceph::decode(op, delta);
+ switch (op) {
+ case op_t::INSERT: {
+ logger().debug("OTree::Extent::Replay: decoding INSERT ...");
+ auto key = key_hobj_t::decode(delta);
+
+ std::unique_ptr<char[]> value_storage_heap;
+ value_t value_storage_stack;
+ auto p_value = decode_value(delta, value_storage_heap, value_storage_stack);
+
+ auto insert_pos = position_t::decode(delta);
+ match_stage_t insert_stage;
+ ceph::decode(insert_stage, delta);
+ node_offset_t insert_size;
+ ceph::decode(insert_size, delta);
+ logger().debug("OTree::Extent::Replay: apply {}, {}, "
+ "insert_pos({}), insert_stage={}, insert_size={}B ...",
+ key, *p_value, insert_pos, insert_stage, insert_size);
+ layout_t::template insert<KeyT::HOBJ>(
+ node, stage, key, *p_value, insert_pos, insert_stage, insert_size);
+ break;
+ }
+ case op_t::SPLIT: {
+ logger().debug("OTree::Extent::Replay: decoding SPLIT ...");
+ auto split_at = StagedIterator::decode(stage.p_start(), delta);
+ logger().debug("OTree::Extent::Replay: apply split_at={} ...", split_at);
+ layout_t::split(node, stage, split_at);
+ break;
+ }
+ case op_t::SPLIT_INSERT: {
+ logger().debug("OTree::Extent::Replay: decoding SPLIT_INSERT ...");
+ auto split_at = StagedIterator::decode(stage.p_start(), delta);
+ auto key = key_hobj_t::decode(delta);
+
+ std::unique_ptr<char[]> value_storage_heap;
+ value_t value_storage_stack;
+ auto p_value = decode_value(delta, value_storage_heap, value_storage_stack);
+
+ auto insert_pos = position_t::decode(delta);
+ match_stage_t insert_stage;
+ ceph::decode(insert_stage, delta);
+ node_offset_t insert_size;
+ ceph::decode(insert_size, delta);
+ logger().debug("OTree::Extent::Replay: apply split_at={}, {}, {}, "
+ "insert_pos({}), insert_stage={}, insert_size={}B ...",
+ split_at, key, *p_value, insert_pos, insert_stage, insert_size);
+ layout_t::template split_insert<KeyT::HOBJ>(
+ node, stage, split_at, key, *p_value, insert_pos, insert_stage, insert_size);
+ break;
+ }
+ case op_t::UPDATE_CHILD_ADDR: {
+ logger().debug("OTree::Extent::Replay: decoding UPDATE_CHILD_ADDR ...");
+ laddr_t new_addr;
+ ceph::decode(new_addr, delta);
+ node_offset_t update_offset;
+ ceph::decode(update_offset, delta);
+ auto p_addr = reinterpret_cast<laddr_packed_t*>(
+ node.get_write() + update_offset);
+ logger().debug("OTree::Extent::Replay: apply {:#x} to offset {:#x} ...",
+ new_addr, update_offset);
+ layout_t::update_child_addr(node, new_addr, p_addr);
+ break;
+ }
+ default:
+ logger().error("OTree::Extent::Replay: got unknown op {} when replay {:#x}",
+ op, node.get_laddr());
+ ceph_abort();
+ }
+ } catch (buffer::error& e) {
+ logger().error("OTree::Extent::Replay: got decode error {} when replay {:#x}",
+ e, node.get_laddr());
+ ceph_abort();
+ }
+ }
+
+ private:
+ static void encode_value(const value_t& value, ceph::bufferlist& encoded) {
+ if constexpr (std::is_same_v<value_t, laddr_packed_t>) {
+ // NODE_TYPE == node_type_t::INTERNAL
+ ceph::encode(value.value, encoded);
+ } else if constexpr (std::is_same_v<value_t, onode_t>) {
+ // NODE_TYPE == node_type_t::LEAF
+ value.encode(encoded);
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+
+ static value_t* decode_value(ceph::bufferlist::const_iterator& delta,
+ std::unique_ptr<char[]>& value_storage_heap,
+ value_t& value_storage_stack) {
+ if constexpr (std::is_same_v<value_t, laddr_packed_t>) {
+ // NODE_TYPE == node_type_t::INTERNAL
+ laddr_t value;
+ ceph::decode(value, delta);
+ value_storage_stack.value = value;
+ return &value_storage_stack;
+ } else if constexpr (std::is_same_v<value_t, onode_t>) {
+ // NODE_TYPE == node_type_t::LEAF
+ auto value_config = onode_t::decode(delta);
+ value_storage_heap = onode_t::allocate(value_config);
+ return reinterpret_cast<onode_t*>(value_storage_heap.get());
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+
+ static seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+};
+
+/**
+ * NodeExtentAccessorT
+ *
+ * This component is responsible for referencing and mutating the underlying
+ * NodeExtent, recording mutation parameters when needed, and applying the
+ * recorded modifications for a specific node layout.
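+ *
+ * A minimal usage sketch under stated assumptions (the caller code below is
+ * hypothetical; the member names are the ones declared in this class):
+ *
+ *   NodeExtentAccessorT<FieldType, NODE_TYPE> accessor(extent);
+ *   accessor.prepare_mutate(c);  // duplicate the extent if it is not pending
+ *   accessor.template insert_replayable<KT>(
+ *       key, value, insert_pos, insert_stage, insert_size);
+ *   // When the extent is MUTATION_PENDING, the same parameters are also
+ *   // encoded into the attached DeltaRecorderT, so the journaled delta can
+ *   // replay the mutation during recovery.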
+ */
+template <typename FieldType, node_type_t NODE_TYPE>
+class NodeExtentAccessorT {
+ public:
+ using layout_t = NodeLayoutReplayableT<FieldType, NODE_TYPE>;
+ using node_stage_t = typename layout_t::node_stage_t;
+ using position_t = typename layout_t::position_t;
+ using recorder_t = DeltaRecorderT<FieldType, NODE_TYPE>;
+ using StagedIterator = typename layout_t::StagedIterator;
+ using value_t = typename layout_t::value_t;
+ static constexpr auto FIELD_TYPE = layout_t::FIELD_TYPE;
+
+ NodeExtentAccessorT(NodeExtentRef extent)
+ : extent{extent},
+ node_stage{reinterpret_cast<const FieldType*>(extent->get_read())} {
+ if (no_recording()) {
+ mut.emplace(extent->get_mutable());
+ assert(extent->get_recorder() == nullptr);
+ recorder = nullptr;
+ } else if (needs_recording()) {
+ mut.emplace(extent->get_mutable());
+ auto p_recorder = extent->get_recorder();
+ assert(p_recorder != nullptr);
+ assert(p_recorder->node_type() == NODE_TYPE);
+ assert(p_recorder->field_type() == FIELD_TYPE);
+ recorder = static_cast<recorder_t*>(p_recorder);
+ } else if (needs_mutate()) {
+ // mut is empty
+ assert(extent->get_recorder() == nullptr ||
+ extent->get_recorder()->is_empty());
+ recorder = nullptr;
+ } else {
+ ceph_abort("impossible path");
+ }
+#ifndef NDEBUG
+ auto ref_recorder = recorder_t::create();
+ test_recorder = static_cast<recorder_t*>(ref_recorder.get());
+ test_extent = TestReplayExtent::create(
+ extent->get_length(), std::move(ref_recorder));
+#endif
+ }
+ ~NodeExtentAccessorT() = default;
+ NodeExtentAccessorT(const NodeExtentAccessorT&) = delete;
+ NodeExtentAccessorT(NodeExtentAccessorT&&) = delete;
+ NodeExtentAccessorT& operator=(const NodeExtentAccessorT&) = delete;
+ NodeExtentAccessorT& operator=(NodeExtentAccessorT&&) = delete;
+
+ const node_stage_t& read() const { return node_stage; }
+ laddr_t get_laddr() const { return extent->get_laddr(); }
+
+  // Must be called before any mutate attempts.
+  // For the safety of mixed reads and mutations, call it before read().
+ void prepare_mutate(context_t c) {
+ if (needs_mutate()) {
+ auto ref_recorder = recorder_t::create();
+ recorder = static_cast<recorder_t*>(ref_recorder.get());
+ extent = extent->mutate(c, std::move(ref_recorder));
+ assert(needs_recording());
+ node_stage = node_stage_t(
+ reinterpret_cast<const FieldType*>(extent->get_read()));
+ assert(recorder == static_cast<recorder_t*>(extent->get_recorder()));
+ mut.emplace(extent->get_mutable());
+ }
+ }
+
+ template <KeyT KT>
+ const value_t* insert_replayable(
+ const full_key_t<KT>& key,
+ const value_t& value,
+ position_t& insert_pos,
+ match_stage_t& insert_stage,
+ node_offset_t& insert_size) {
+ assert(!needs_mutate());
+ if (needs_recording()) {
+ recorder->template encode_insert<KT>(
+ key, value, insert_pos, insert_stage, insert_size);
+ }
+#ifndef NDEBUG
+ test_extent->prepare_replay(extent);
+ test_recorder->template encode_insert<KT>(
+ key, value, insert_pos, insert_stage, insert_size);
+#endif
+ auto ret = layout_t::template insert<KT>(
+ *mut, read(), key, value,
+ insert_pos, insert_stage, insert_size);
+#ifndef NDEBUG
+ test_extent->replay_and_verify(extent);
+#endif
+ return ret;
+ }
+
+ void split_replayable(StagedIterator& split_at) {
+ assert(!needs_mutate());
+ if (needs_recording()) {
+ recorder->encode_split(split_at, read().p_start());
+ }
+#ifndef NDEBUG
+ test_extent->prepare_replay(extent);
+ test_recorder->template encode_split(split_at, read().p_start());
+#endif
+ layout_t::split(*mut, read(), split_at);
+#ifndef NDEBUG
+ test_extent->replay_and_verify(extent);
+#endif
+ }
+
+ template <KeyT KT>
+ const value_t* split_insert_replayable(
+ StagedIterator& split_at,
+ const full_key_t<KT>& key,
+ const value_t& value,
+ position_t& insert_pos,
+ match_stage_t& insert_stage,
+ node_offset_t& insert_size) {
+ assert(!needs_mutate());
+ if (needs_recording()) {
+ recorder->template encode_split_insert<KT>(
+ split_at, key, value, insert_pos, insert_stage, insert_size,
+ read().p_start());
+ }
+#ifndef NDEBUG
+ test_extent->prepare_replay(extent);
+ test_recorder->template encode_split_insert<KT>(
+ split_at, key, value, insert_pos, insert_stage, insert_size,
+ read().p_start());
+#endif
+ auto ret = layout_t::template split_insert<KT>(
+ *mut, read(), split_at, key, value,
+ insert_pos, insert_stage, insert_size);
+#ifndef NDEBUG
+ test_extent->replay_and_verify(extent);
+#endif
+ return ret;
+ }
+
+ void update_child_addr_replayable(
+ const laddr_t new_addr, laddr_packed_t* p_addr) {
+ assert(!needs_mutate());
+ if (needs_recording()) {
+ recorder->encode_update_child_addr(new_addr, p_addr, read().p_start());
+ }
+#ifndef NDEBUG
+ test_extent->prepare_replay(extent);
+ test_recorder->encode_update_child_addr(new_addr, p_addr, read().p_start());
+#endif
+ layout_t::update_child_addr(*mut, new_addr, p_addr);
+#ifndef NDEBUG
+ test_extent->replay_and_verify(extent);
+#endif
+ }
+
+ void test_copy_to(NodeExtentMutable& to) const {
+ assert(extent->get_length() == to.get_length());
+ std::memcpy(to.get_write(), extent->get_read(), extent->get_length());
+ }
+
+ private:
+ /**
+ * Possible states with CachedExtent::extent_state_t:
+ * INITIAL_WRITE_PENDING -- can mutate, no recording
+ * MUTATION_PENDING -- can mutate, needs recording
+ * CLEAN/DIRTY -- pending mutate
+ * INVALID -- impossible
+ */
+ bool no_recording() const {
+ return extent->is_initial_pending();
+ }
+ bool needs_recording() const {
+ return extent->is_mutation_pending();
+ }
+ bool needs_mutate() const {
+ assert(extent->is_valid());
+ return !extent->is_pending();
+ }
+
+ NodeExtentRef extent;
+ node_stage_t node_stage;
+ std::optional<NodeExtentMutable> mut;
+ // owned by extent
+ recorder_t* recorder;
+
+#ifndef NDEBUG
+ // verify record replay using a different memory block
+ TestReplayExtent::Ref test_extent;
+ recorder_t* test_recorder;
+#endif
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.cc
new file mode 100644
index 000000000..bd22d4b67
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.cc
@@ -0,0 +1,35 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "node_extent_manager.h"
+
+#include "node_extent_manager/dummy.h"
+#include "node_extent_manager/seastore.h"
+#include "stages/node_stage_layout.h"
+
+namespace crimson::os::seastore::onode {
+
+std::pair<node_type_t, field_type_t> NodeExtent::get_types() const {
+ const auto header = reinterpret_cast<const node_header_t*>(get_read());
+ auto node_type = header->get_node_type();
+ auto field_type = header->get_field_type();
+ if (!field_type.has_value()) {
+ throw std::runtime_error("load failed: bad field type");
+ }
+ return {node_type, *field_type};
+}
+
+NodeExtentManagerURef NodeExtentManager::create_dummy(bool is_sync) {
+ if (is_sync) {
+ return NodeExtentManagerURef(new DummyNodeExtentManager<true>());
+ } else {
+ return NodeExtentManagerURef(new DummyNodeExtentManager<false>());
+ }
+}
+
+NodeExtentManagerURef NodeExtentManager::create_seastore(
+ TransactionManager& tm, laddr_t min_laddr) {
+ return NodeExtentManagerURef(new SeastoreNodeExtentManager(tm, min_laddr));
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h
new file mode 100644
index 000000000..77b230e03
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h
@@ -0,0 +1,86 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/common/type_helpers.h"
+#include "crimson/os/seastore/cached_extent.h"
+#include "crimson/os/seastore/transaction_manager.h"
+
+#include "fwd.h"
+#include "super.h"
+#include "node_extent_mutable.h"
+#include "node_types.h"
+
+/**
+ * node_extent_manager.h
+ *
+ * Contains general interfaces for different backends (Dummy and Seastore).
+ */
+
+namespace crimson::os::seastore::onode {
+
+using crimson::os::seastore::LogicalCachedExtent;
+class NodeExtent : public LogicalCachedExtent {
+ public:
+ virtual ~NodeExtent() = default;
+ std::pair<node_type_t, field_type_t> get_types() const;
+ const char* get_read() const {
+ return get_bptr().c_str();
+ }
+ NodeExtentMutable get_mutable() {
+ assert(is_pending());
+ return do_get_mutable();
+ }
+
+ virtual DeltaRecorder* get_recorder() const = 0;
+ virtual NodeExtentRef mutate(context_t, DeltaRecorderURef&&) = 0;
+
+ protected:
+ template <typename... T>
+ NodeExtent(T&&... t) : LogicalCachedExtent(std::forward<T>(t)...) {}
+
+ NodeExtentMutable do_get_mutable() {
+ return NodeExtentMutable(*this);
+ }
+
+ /**
+ * Abstracted interfaces to implement:
+   * - CachedExtent::duplicate_for_write() -> CachedExtentRef
+   * - CachedExtent::get_type() -> extent_types_t
+   * - CachedExtent::get_delta() -> ceph::bufferlist
+ * - LogicalCachedExtent::apply_delta(const ceph::bufferlist) -> void
+ */
+
+ private:
+ friend class NodeExtentMutable;
+};
+
+using crimson::os::seastore::TransactionManager;
+class NodeExtentManager {
+ public:
+ virtual ~NodeExtentManager() = default;
+ using tm_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg,
+ crimson::ct_error::enoent,
+ crimson::ct_error::erange>;
+ template <class ValueT=void>
+ using tm_future = tm_ertr::future<ValueT>;
+
+ virtual bool is_read_isolated() const = 0;
+ virtual tm_future<NodeExtentRef> read_extent(
+ Transaction&, laddr_t, extent_len_t) = 0;
+ virtual tm_future<NodeExtentRef> alloc_extent(Transaction&, extent_len_t) = 0;
+ virtual tm_future<Super::URef> get_super(Transaction&, RootNodeTracker&) = 0;
+ virtual std::ostream& print(std::ostream& os) const = 0;
+
+ static NodeExtentManagerURef create_dummy(bool is_sync);
+ static NodeExtentManagerURef create_seastore(
+ TransactionManager& tm, laddr_t min_laddr = L_ADDR_MIN);
+};
+inline std::ostream& operator<<(std::ostream& os, const NodeExtentManager& nm) {
+ return nm.print(os);
+}
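+
+// A hypothetical usage sketch (illustration only, not part of this interface;
+// error handling through tm_ertr is omitted):
+//
+//   auto nm = NodeExtentManager::create_dummy(/*is_sync=*/true);
+//   nm->alloc_extent(t, len).safe_then([](NodeExtentRef extent) {
+//     auto mut = extent->get_mutable();  // only valid while pending
+//     // ... bootstrap and fill the node in place via mut ...
+//   });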
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h
new file mode 100644
index 000000000..830ea4a7d
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h
@@ -0,0 +1,156 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <chrono>
+#include <seastar/core/sleep.hh>
+
+#include "include/buffer_raw.h"
+
+#include "crimson/common/log.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h"
+
+/**
+ * dummy.h
+ *
+ * Dummy backend implementations for test purposes.
+ */
+
+namespace crimson::os::seastore::onode {
+
+class DummySuper final: public Super {
+ public:
+ DummySuper(Transaction& t, RootNodeTracker& tracker, laddr_t* p_root_laddr)
+ : Super(t, tracker), p_root_laddr{p_root_laddr} {}
+ ~DummySuper() override = default;
+ protected:
+ laddr_t get_root_laddr() const override { return *p_root_laddr; }
+ void write_root_laddr(context_t, laddr_t addr) override {
+ logger().info("OTree::Dummy: update root {:#x} ...", addr);
+ *p_root_laddr = addr;
+ }
+ private:
+ static seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+ laddr_t* p_root_laddr;
+};
+
+class DummyNodeExtent final: public NodeExtent {
+ public:
+ DummyNodeExtent(ceph::bufferptr &&ptr) : NodeExtent(std::move(ptr)) {
+ state = extent_state_t::INITIAL_WRITE_PENDING;
+ }
+ ~DummyNodeExtent() override = default;
+ protected:
+ NodeExtentRef mutate(context_t, DeltaRecorderURef&&) override {
+ ceph_abort("impossible path"); }
+ DeltaRecorder* get_recorder() const override {
+ return nullptr; }
+ CachedExtentRef duplicate_for_write() override {
+ ceph_abort("impossible path"); }
+ extent_types_t get_type() const override {
+ return extent_types_t::TEST_BLOCK; }
+ ceph::bufferlist get_delta() override {
+ ceph_abort("impossible path"); }
+ void apply_delta(const ceph::bufferlist&) override {
+ ceph_abort("impossible path"); }
+};
+
+template <bool SYNC>
+class DummyNodeExtentManager final: public NodeExtentManager {
+ static constexpr size_t ALIGNMENT = 4096;
+ public:
+ ~DummyNodeExtentManager() override = default;
+ protected:
+ bool is_read_isolated() const override { return false; }
+
+ tm_future<NodeExtentRef> read_extent(
+ Transaction& t, laddr_t addr, extent_len_t len) override {
+ logger().trace("OTree::Dummy: reading {}B at {:#x} ...", len, addr);
+ if constexpr (SYNC) {
+ return read_extent_sync(t, addr, len);
+ } else {
+ using namespace std::chrono_literals;
+ return seastar::sleep(1us).then([this, &t, addr, len] {
+ return read_extent_sync(t, addr, len);
+ });
+ }
+ }
+
+ tm_future<NodeExtentRef> alloc_extent(
+ Transaction& t, extent_len_t len) override {
+ logger().trace("OTree::Dummy: allocating {}B ...", len);
+ if constexpr (SYNC) {
+ return alloc_extent_sync(t, len);
+ } else {
+ using namespace std::chrono_literals;
+ return seastar::sleep(1us).then([this, &t, len] {
+ return alloc_extent_sync(t, len);
+ });
+ }
+ }
+
+ tm_future<Super::URef> get_super(
+ Transaction& t, RootNodeTracker& tracker) override {
+ logger().trace("OTree::Dummy: get root ...");
+ if constexpr (SYNC) {
+ return get_super_sync(t, tracker);
+ } else {
+ using namespace std::chrono_literals;
+ return seastar::sleep(1us).then([this, &t, &tracker] {
+ return get_super_sync(t, tracker);
+ });
+ }
+ }
+
+ std::ostream& print(std::ostream& os) const override {
+ return os << "DummyNodeExtentManager(sync=" << SYNC << ")";
+ }
+
+ private:
+ tm_future<NodeExtentRef> read_extent_sync(
+ Transaction& t, laddr_t addr, extent_len_t len) {
+ auto iter = allocate_map.find(addr);
+ assert(iter != allocate_map.end());
+ auto extent = iter->second;
+ logger().trace("OTree::Dummy: read {}B at {:#x}",
+ extent->get_length(), extent->get_laddr());
+ assert(extent->get_laddr() == addr);
+ assert(extent->get_length() == len);
+ return tm_ertr::make_ready_future<NodeExtentRef>(extent);
+ }
+
+ tm_future<NodeExtentRef> alloc_extent_sync(
+ Transaction& t, extent_len_t len) {
+ assert(len % ALIGNMENT == 0);
+ auto r = ceph::buffer::create_aligned(len, ALIGNMENT);
+ auto addr = reinterpret_cast<laddr_t>(r->get_data());
+ auto bp = ceph::bufferptr(std::move(r));
+ auto extent = Ref<DummyNodeExtent>(new DummyNodeExtent(std::move(bp)));
+ extent->set_laddr(addr);
+ assert(allocate_map.find(extent->get_laddr()) == allocate_map.end());
+ allocate_map.insert({extent->get_laddr(), extent});
+ logger().debug("OTree::Dummy: allocated {}B at {:#x}",
+ extent->get_length(), extent->get_laddr());
+ assert(extent->get_length() == len);
+ return tm_ertr::make_ready_future<NodeExtentRef>(extent);
+ }
+
+ tm_future<Super::URef> get_super_sync(
+ Transaction& t, RootNodeTracker& tracker) {
+ logger().debug("OTree::Dummy: got root {:#x}", root_laddr);
+ return tm_ertr::make_ready_future<Super::URef>(
+ Super::URef(new DummySuper(t, tracker, &root_laddr)));
+ }
+
+ static seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+
+ std::map<laddr_t, Ref<DummyNodeExtent>> allocate_map;
+ laddr_t root_laddr = L_ADDR_NULL;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.cc
new file mode 100644
index 000000000..8d88485bf
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.cc
@@ -0,0 +1,88 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "seastore.h"
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h"
+
+namespace {
+
+seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+}
+
+}
+
+namespace crimson::os::seastore::onode {
+
+static DeltaRecorderURef create_recorder(
+ node_type_t node_type, field_type_t field_type) {
+ if (node_type == node_type_t::LEAF) {
+ if (field_type == field_type_t::N0) {
+ return DeltaRecorderT<node_fields_0_t, node_type_t::LEAF>::create();
+ } else if (field_type == field_type_t::N1) {
+ return DeltaRecorderT<node_fields_1_t, node_type_t::LEAF>::create();
+ } else if (field_type == field_type_t::N2) {
+ return DeltaRecorderT<node_fields_2_t, node_type_t::LEAF>::create();
+ } else if (field_type == field_type_t::N3) {
+ return DeltaRecorderT<leaf_fields_3_t, node_type_t::LEAF>::create();
+ } else {
+ ceph_abort("impossible path");
+ }
+ } else if (node_type == node_type_t::INTERNAL) {
+ if (field_type == field_type_t::N0) {
+ return DeltaRecorderT<node_fields_0_t, node_type_t::INTERNAL>::create();
+ } else if (field_type == field_type_t::N1) {
+ return DeltaRecorderT<node_fields_1_t, node_type_t::INTERNAL>::create();
+ } else if (field_type == field_type_t::N2) {
+ return DeltaRecorderT<node_fields_2_t, node_type_t::INTERNAL>::create();
+ } else if (field_type == field_type_t::N3) {
+ return DeltaRecorderT<internal_fields_3_t, node_type_t::INTERNAL>::create();
+ } else {
+ ceph_abort("impossible path");
+ }
+ } else {
+ ceph_abort("impossible path");
+ }
+}
+
+void SeastoreSuper::write_root_laddr(context_t c, laddr_t addr) {
+ logger().info("OTree::Seastore: update root {:#x} ...", addr);
+ root_addr = addr;
+ auto nm = static_cast<SeastoreNodeExtentManager*>(&c.nm);
+ nm->get_tm().write_onode_root(c.t, addr);
+}
+
+NodeExtentRef SeastoreNodeExtent::mutate(
+ context_t c, DeltaRecorderURef&& _recorder) {
+ logger().debug("OTree::Seastore: mutate {:#x} ...", get_laddr());
+ auto nm = static_cast<SeastoreNodeExtentManager*>(&c.nm);
+ auto extent = nm->get_tm().get_mutable_extent(c.t, this);
+ auto ret = extent->cast<SeastoreNodeExtent>();
+ assert(!ret->recorder || ret->recorder->is_empty());
+ ret->recorder = std::move(_recorder);
+ return ret;
+}
+
+void SeastoreNodeExtent::apply_delta(const ceph::bufferlist& bl) {
+ logger().debug("OTree::Seastore: replay {:#x} ...", get_laddr());
+ if (!recorder) {
+ auto [node_type, field_type] = get_types();
+ recorder = create_recorder(node_type, field_type);
+ } else {
+#ifndef NDEBUG
+ auto [node_type, field_type] = get_types();
+ assert(recorder->node_type() == node_type);
+ assert(recorder->field_type() == field_type);
+#endif
+ }
+ assert(is_clean());
+ auto node = do_get_mutable();
+ auto p = bl.cbegin();
+ while (p != bl.end()) {
+ recorder->apply_delta(p, node);
+ }
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h
new file mode 100644
index 000000000..f80b99fab
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h
@@ -0,0 +1,126 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/common/log.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h"
+
+/**
+ * seastore.h
+ *
+ * Seastore backend implementations.
+ */
+
+namespace crimson::os::seastore::onode {
+
+class SeastoreSuper final: public Super {
+ public:
+ SeastoreSuper(Transaction& t, RootNodeTracker& tracker,
+ laddr_t root_addr, TransactionManager& tm)
+ : Super(t, tracker), root_addr{root_addr}, tm{tm} {}
+ ~SeastoreSuper() override = default;
+ protected:
+ laddr_t get_root_laddr() const override {
+ return root_addr;
+ }
+ void write_root_laddr(context_t c, laddr_t addr) override;
+ private:
+ static seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+ laddr_t root_addr;
+ TransactionManager& tm;
+};
+
+class SeastoreNodeExtent final: public NodeExtent {
+ public:
+ SeastoreNodeExtent(ceph::bufferptr &&ptr)
+ : NodeExtent(std::move(ptr)) {}
+ SeastoreNodeExtent(const SeastoreNodeExtent& other)
+ : NodeExtent(other) {}
+ ~SeastoreNodeExtent() override = default;
+ protected:
+ NodeExtentRef mutate(context_t, DeltaRecorderURef&&) override;
+
+ DeltaRecorder* get_recorder() const override {
+ return recorder.get();
+ }
+
+ CachedExtentRef duplicate_for_write() override {
+ return CachedExtentRef(new SeastoreNodeExtent(*this));
+ }
+ extent_types_t get_type() const override {
+ return extent_types_t::ONODE_BLOCK_STAGED;
+ }
+ ceph::bufferlist get_delta() override {
+ assert(recorder);
+ return recorder->get_delta();
+ }
+ void apply_delta(const ceph::bufferlist&) override;
+ private:
+ DeltaRecorderURef recorder;
+};
+
+class SeastoreNodeExtentManager final: public NodeExtentManager {
+ public:
+ SeastoreNodeExtentManager(TransactionManager& tm, laddr_t min)
+ : tm{tm}, addr_min{min} {};
+ ~SeastoreNodeExtentManager() override = default;
+ TransactionManager& get_tm() { return tm; }
+ protected:
+ bool is_read_isolated() const override { return true; }
+
+ tm_future<NodeExtentRef> read_extent(
+ Transaction& t, laddr_t addr, extent_len_t len) override {
+ logger().debug("OTree::Seastore: reading {}B at {:#x} ...", len, addr);
+ return tm.read_extents<SeastoreNodeExtent>(t, addr, len
+ ).safe_then([addr, len](auto&& extents) {
+ assert(extents.size() == 1);
+ [[maybe_unused]] auto [laddr, e] = extents.front();
+ logger().trace("OTree::Seastore: read {}B at {:#x}",
+ e->get_length(), e->get_laddr());
+ assert(e->get_laddr() == addr);
+ assert(e->get_length() == len);
+ std::ignore = addr;
+ std::ignore = len;
+ return NodeExtentRef(e);
+ });
+ }
+
+ tm_future<NodeExtentRef> alloc_extent(
+ Transaction& t, extent_len_t len) override {
+ logger().debug("OTree::Seastore: allocating {}B ...", len);
+ return tm.alloc_extent<SeastoreNodeExtent>(t, addr_min, len
+ ).safe_then([len](auto extent) {
+ logger().debug("OTree::Seastore: allocated {}B at {:#x}",
+ extent->get_length(), extent->get_laddr());
+ assert(extent->get_length() == len);
+ std::ignore = len;
+ return NodeExtentRef(extent);
+ });
+ }
+
+ tm_future<Super::URef> get_super(
+ Transaction& t, RootNodeTracker& tracker) override {
+ logger().trace("OTree::Seastore: get root ...");
+ return tm.read_onode_root(t).safe_then([this, &t, &tracker](auto root_addr) {
+ logger().debug("OTree::Seastore: got root {:#x}", root_addr);
+ return Super::URef(new SeastoreSuper(t, tracker, root_addr, tm));
+ });
+ }
+
+ std::ostream& print(std::ostream& os) const override {
+ return os << "SeastoreNodeExtentManager";
+ }
+
+ private:
+ static seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+ TransactionManager& tm;
+ const laddr_t addr_min;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h
new file mode 100644
index 000000000..240c88932
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h
@@ -0,0 +1,67 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h"
+
+/** test_replay.h
+ *
+ * A special version of NodeExtent to help verify delta encode, decode, and
+ * replay in recorder_t under debug builds.
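+ *
+ * The expected flow, as exercised by NodeExtentAccessorT in debug builds
+ * (shown here only as an illustrative sketch):
+ *
+ *   test_extent->prepare_replay(extent);      // copy the pre-image
+ *   test_recorder->encode_...(...);           // record the same mutation
+ *   // ... mutate the real extent in place ...
+ *   test_extent->replay_and_verify(extent);   // replay the delta and memcmp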
+ */
+
+namespace crimson::os::seastore::onode {
+
+class TestReplayExtent final: public NodeExtent {
+ public:
+ using Ref = crimson::os::seastore::TCachedExtentRef<TestReplayExtent>;
+
+ void prepare_replay(NodeExtentRef from_extent) {
+ assert(get_length() == from_extent->get_length());
+ auto mut = do_get_mutable();
+ std::memcpy(mut.get_write(), from_extent->get_read(), get_length());
+ }
+
+ void replay_and_verify(NodeExtentRef replayed_extent) {
+ assert(get_length() == replayed_extent->get_length());
+ auto mut = do_get_mutable();
+ auto bl = recorder->get_delta();
+ assert(bl.length());
+ auto p = bl.cbegin();
+ recorder->apply_delta(p, mut);
+ assert(p == bl.end());
+ auto cmp = std::memcmp(get_read(), replayed_extent->get_read(), get_length());
+ ceph_assert(cmp == 0 && "replay mismatch!");
+ }
+
+ static Ref create(extent_len_t length, DeltaRecorderURef&& recorder) {
+ auto r = ceph::buffer::create_aligned(length, 4096);
+ auto bp = ceph::bufferptr(std::move(r));
+ return new TestReplayExtent(std::move(bp), std::move(recorder));
+ }
+
+ protected:
+ NodeExtentRef mutate(context_t, DeltaRecorderURef&&) override {
+ ceph_abort("impossible path"); }
+ DeltaRecorder* get_recorder() const override {
+ ceph_abort("impossible path"); }
+ CachedExtentRef duplicate_for_write() override {
+ ceph_abort("impossible path"); }
+ extent_types_t get_type() const override {
+ return extent_types_t::TEST_BLOCK; }
+ ceph::bufferlist get_delta() override {
+ ceph_abort("impossible path"); }
+ void apply_delta(const ceph::bufferlist&) override {
+ ceph_abort("impossible path"); }
+
+ private:
+ TestReplayExtent(ceph::bufferptr&& ptr, DeltaRecorderURef&& recorder)
+ : NodeExtent(std::move(ptr)), recorder(std::move(recorder)) {
+ state = extent_state_t::MUTATION_PENDING;
+ }
+ DeltaRecorderURef recorder;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.cc
new file mode 100644
index 000000000..048c4000d
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.cc
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "node_extent_mutable.h"
+#include "node_extent_manager.h"
+
+namespace crimson::os::seastore::onode {
+
+NodeExtentMutable::NodeExtentMutable(NodeExtent& extent)
+ : extent{extent} {
+ assert(extent.is_pending() || // during mutation
+ extent.is_clean()); // during replay
+}
+
+const char* NodeExtentMutable::get_read() const {
+ assert(extent.is_pending() || // during mutation
+ extent.is_clean()); // during replay
+ return extent.get_bptr().c_str();
+}
+
+char* NodeExtentMutable::get_write() {
+ assert(extent.is_pending() || // during mutation
+ extent.is_clean()); // during replay
+ return extent.get_bptr().c_str();
+}
+
+extent_len_t NodeExtentMutable::get_length() const {
+ return extent.get_length();
+}
+
+laddr_t NodeExtentMutable::get_laddr() const {
+ return extent.get_laddr();
+}
+
+const char* NodeExtentMutable::buf_upper_bound() const {
+ return get_read() + get_length();
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h
new file mode 100644
index 000000000..52f10a013
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h
@@ -0,0 +1,80 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <cstring>
+
+#include "fwd.h"
+
+#pragma once
+
+namespace crimson::os::seastore::onode {
+
+class NodeExtent;
+
+/**
+ * NodeExtentMutable
+ *
+ * A thin wrapper around NodeExtent that ensures only a newly allocated or
+ * duplicated NodeExtent is mutable, and that memory modifications stay
+ * within the extent range.
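+ *
+ * An illustrative sketch of the intended usage (the offsets and the header
+ * struct are made up for the example):
+ *
+ *   NodeExtentMutable mut = extent->get_mutable();
+ *   some_header_t header{...};
+ *   mut.copy_in_relative(0, header);   // bounds-checked memcpy into extent
+ *   mut.shift_relative(16, 32, 8);     // bounds-checked memmove within extent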
+ */
+class NodeExtentMutable {
+ public:
+ void copy_in_absolute(void* dst, const void* src, extent_len_t len) {
+ assert((char*)dst >= get_write());
+ assert((char*)dst + len <= buf_upper_bound());
+ std::memcpy(dst, src, len);
+ }
+ template <typename T>
+ void copy_in_absolute(void* dst, const T& src) {
+ copy_in_absolute(dst, &src, sizeof(T));
+ }
+
+ const void* copy_in_relative(
+ extent_len_t dst_offset, const void* src, extent_len_t len) {
+ auto dst = get_write() + dst_offset;
+ copy_in_absolute(dst, src, len);
+ return dst;
+ }
+ template <typename T>
+ const T* copy_in_relative(
+ extent_len_t dst_offset, const T& src) {
+ auto dst = copy_in_relative(dst_offset, &src, sizeof(T));
+ return static_cast<const T*>(dst);
+ }
+
+ void shift_absolute(const void* src, extent_len_t len, int offset) {
+ assert((const char*)src >= get_write());
+ assert((const char*)src + len <= buf_upper_bound());
+ char* to = (char*)src + offset;
+ assert(to >= get_write());
+ assert(to + len <= buf_upper_bound());
+ if (len != 0) {
+ std::memmove(to, src, len);
+ }
+ }
+ void shift_relative(extent_len_t src_offset, extent_len_t len, int offset) {
+ shift_absolute(get_write() + src_offset, len, offset);
+ }
+
+ template <typename T>
+ void validate_inplace_update(const T& updated) {
+ assert((const char*)&updated >= get_write());
+ assert((const char*)&updated + sizeof(T) <= buf_upper_bound());
+ }
+
+ const char* get_read() const;
+ char* get_write();
+ extent_len_t get_length() const;
+ laddr_t get_laddr() const;
+
+ private:
+ explicit NodeExtentMutable(NodeExtent&);
+ const char* buf_upper_bound() const;
+
+ NodeExtent& extent;
+
+ friend class NodeExtent;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.cc
new file mode 100644
index 000000000..59d792b1a
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.cc
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "node_impl.h"
+#include "node_layout.h"
+
+namespace crimson::os::seastore::onode {
+
+#ifdef UNIT_TESTS_BUILT
+last_split_info_t last_split = {};
+#endif
+
+// XXX: branchless allocation
+InternalNodeImpl::alloc_ertr::future<InternalNodeImpl::fresh_impl_t>
+InternalNodeImpl::allocate(
+ context_t c, field_type_t type, bool is_level_tail, level_t level) {
+ if (type == field_type_t::N0) {
+ return InternalNode0::allocate(c, is_level_tail, level);
+ } else if (type == field_type_t::N1) {
+ return InternalNode1::allocate(c, is_level_tail, level);
+ } else if (type == field_type_t::N2) {
+ return InternalNode2::allocate(c, is_level_tail, level);
+ } else if (type == field_type_t::N3) {
+ return InternalNode3::allocate(c, is_level_tail, level);
+ } else {
+ ceph_abort("impossible path");
+ }
+}
+
+LeafNodeImpl::alloc_ertr::future<LeafNodeImpl::fresh_impl_t>
+LeafNodeImpl::allocate(
+ context_t c, field_type_t type, bool is_level_tail) {
+ if (type == field_type_t::N0) {
+ return LeafNode0::allocate(c, is_level_tail, 0);
+ } else if (type == field_type_t::N1) {
+ return LeafNode1::allocate(c, is_level_tail, 0);
+ } else if (type == field_type_t::N2) {
+ return LeafNode2::allocate(c, is_level_tail, 0);
+ } else if (type == field_type_t::N3) {
+ return LeafNode3::allocate(c, is_level_tail, 0);
+ } else {
+ ceph_abort("impossible path");
+ }
+}
+
+InternalNodeImplURef InternalNodeImpl::load(
+ NodeExtentRef extent, field_type_t type, bool expect_is_level_tail) {
+ if (type == field_type_t::N0) {
+ return InternalNode0::load(extent, expect_is_level_tail);
+ } else if (type == field_type_t::N1) {
+ return InternalNode1::load(extent, expect_is_level_tail);
+ } else if (type == field_type_t::N2) {
+ return InternalNode2::load(extent, expect_is_level_tail);
+ } else if (type == field_type_t::N3) {
+ return InternalNode3::load(extent, expect_is_level_tail);
+ } else {
+ ceph_abort("impossible path");
+ }
+}
+
+LeafNodeImplURef LeafNodeImpl::load(
+ NodeExtentRef extent, field_type_t type, bool expect_is_level_tail) {
+ if (type == field_type_t::N0) {
+ return LeafNode0::load(extent, expect_is_level_tail);
+ } else if (type == field_type_t::N1) {
+ return LeafNode1::load(extent, expect_is_level_tail);
+ } else if (type == field_type_t::N2) {
+ return LeafNode2::load(extent, expect_is_level_tail);
+ } else if (type == field_type_t::N3) {
+ return LeafNode3::load(extent, expect_is_level_tail);
+ } else {
+ ceph_abort("impossible path");
+ }
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h
new file mode 100644
index 000000000..3267cda2b
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h
@@ -0,0 +1,197 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <ostream>
+
+#include "node_extent_mutable.h"
+#include "node_types.h"
+#include "stages/stage_types.h"
+
+namespace crimson::os::seastore::onode {
+
+#ifdef UNIT_TESTS_BUILT
+enum class InsertType { BEGIN, LAST, MID };
+struct split_expectation_t {
+ match_stage_t split_stage;
+ match_stage_t insert_stage;
+ bool is_insert_left;
+ InsertType insert_type;
+};
+struct last_split_info_t {
+ search_position_t split_pos;
+ match_stage_t insert_stage;
+ bool is_insert_left;
+ InsertType insert_type;
+ bool match(const split_expectation_t& e) const {
+ match_stage_t split_stage;
+ if (split_pos.nxt.nxt.index == 0) {
+ if (split_pos.nxt.index == 0) {
+ split_stage = 2;
+ } else {
+ split_stage = 1;
+ }
+ } else {
+ split_stage = 0;
+ }
+ return split_stage == e.split_stage &&
+ insert_stage == e.insert_stage &&
+ is_insert_left == e.is_insert_left &&
+ insert_type == e.insert_type;
+ }
+ bool match_split_pos(const search_position_t& pos) const {
+ return split_pos == pos;
+ }
+};
+extern last_split_info_t last_split;
+#endif
+
+struct key_hobj_t;
+struct key_view_t;
+class NodeExtentMutable;
+
+/**
+ * NodeImpl
+ *
+ * Hides type specific node layout implementations for Node.
+ */
+class NodeImpl {
+ public:
+ using alloc_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg,
+ crimson::ct_error::enoent,
+ crimson::ct_error::erange>;
+ virtual ~NodeImpl() = default;
+
+ virtual field_type_t field_type() const = 0;
+ virtual laddr_t laddr() const = 0;
+ virtual void prepare_mutate(context_t) = 0;
+ virtual bool is_level_tail() const = 0;
+ virtual bool is_empty() const = 0;
+ virtual level_t level() const = 0;
+ virtual node_offset_t free_size() const = 0;
+ virtual key_view_t get_key_view(const search_position_t&) const = 0;
+ virtual key_view_t get_largest_key_view() const = 0;
+ virtual void next_position(search_position_t&) const = 0;
+
+ virtual node_stats_t get_stats() const = 0;
+ virtual std::ostream& dump(std::ostream&) const = 0;
+ virtual std::ostream& dump_brief(std::ostream&) const = 0;
+ virtual void validate_layout() const = 0;
+
+ virtual void test_copy_to(NodeExtentMutable&) const = 0;
+ virtual void test_set_tail(NodeExtentMutable&) = 0;
+
+ protected:
+ NodeImpl() = default;
+};
+
+/**
+ * InternalNodeImpl
+ *
+ * Hides type specific node layout implementations for InternalNode.
+ */
+class InternalNodeImpl : public NodeImpl {
+ public:
+ struct internal_marker_t {};
+ virtual ~InternalNodeImpl() = default;
+
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual const laddr_packed_t* get_p_value(
+ const search_position_t&,
+ key_view_t* = nullptr, internal_marker_t = {}) const {
+ ceph_abort("impossible path");
+ }
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual lookup_result_t<node_type_t::INTERNAL> lower_bound(
+ const key_hobj_t&, MatchHistory&,
+ key_view_t* = nullptr, internal_marker_t = {}) const {
+ ceph_abort("impossible path");
+ }
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual const laddr_packed_t* insert(
+ const key_view_t&, const laddr_packed_t&, search_position_t&, match_stage_t&, node_offset_t&) {
+ ceph_abort("impossible path");
+ }
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual std::tuple<search_position_t, bool, const laddr_packed_t*> split_insert(
+ NodeExtentMutable&, NodeImpl&, const key_view_t&, const laddr_packed_t&,
+ search_position_t&, match_stage_t&, node_offset_t&) {
+ ceph_abort("impossible path");
+ }
+
+ virtual void replace_child_addr(const search_position_t&, laddr_t dst, laddr_t src) = 0;
+ virtual std::tuple<match_stage_t, node_offset_t> evaluate_insert(
+ const key_view_t&, const laddr_t&, search_position_t&) const = 0;
+
+ struct fresh_impl_t {
+ InternalNodeImplURef impl;
+ NodeExtentMutable mut;
+ std::pair<NodeImplURef, NodeExtentMutable> make_pair() {
+ return {std::move(impl), mut};
+ }
+ };
+ static alloc_ertr::future<fresh_impl_t> allocate(context_t, field_type_t, bool, level_t);
+ static InternalNodeImplURef load(NodeExtentRef, field_type_t, bool);
+
+ protected:
+ InternalNodeImpl() = default;
+};
+
+/**
+ * LeafNodeImpl
+ *
+ * Hides type specific node layout implementations for LeafNode.
+ */
+class LeafNodeImpl : public NodeImpl {
+ public:
+ struct leaf_marker_t {};
+ virtual ~LeafNodeImpl() = default;
+
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual const onode_t* get_p_value(
+ const search_position_t&,
+ key_view_t* = nullptr, leaf_marker_t={}) const {
+ ceph_abort("impossible path");
+ }
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual lookup_result_t<node_type_t::LEAF> lower_bound(
+ const key_hobj_t&, MatchHistory&,
+ key_view_t* = nullptr, leaf_marker_t = {}) const {
+ ceph_abort("impossible path");
+ }
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual const onode_t* insert(
+ const key_hobj_t&, const onode_t&, search_position_t&, match_stage_t&, node_offset_t&) {
+ ceph_abort("impossible path");
+ }
+ #pragma GCC diagnostic ignored "-Woverloaded-virtual"
+ virtual std::tuple<search_position_t, bool, const onode_t*> split_insert(
+ NodeExtentMutable&, NodeImpl&, const key_hobj_t&, const onode_t&,
+ search_position_t&, match_stage_t&, node_offset_t&) {
+ ceph_abort("impossible path");
+ }
+
+ virtual void get_largest_slot(
+ search_position_t&, key_view_t&, const onode_t**) const = 0;
+ virtual std::tuple<match_stage_t, node_offset_t> evaluate_insert(
+ const key_hobj_t&, const onode_t&,
+ const MatchHistory&, match_stat_t, search_position_t&) const = 0;
+
+ struct fresh_impl_t {
+ LeafNodeImplURef impl;
+ NodeExtentMutable mut;
+ std::pair<NodeImplURef, NodeExtentMutable> make_pair() {
+ return {std::move(impl), mut};
+ }
+ };
+ static alloc_ertr::future<fresh_impl_t> allocate(context_t, field_type_t, bool);
+ static LeafNodeImplURef load(NodeExtentRef, field_type_t, bool);
+
+ protected:
+ LeafNodeImpl() = default;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h
new file mode 100644
index 000000000..916d17424
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h
@@ -0,0 +1,613 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <ostream>
+#include <sstream>
+
+#include "common/likely.h"
+#include "crimson/common/log.h"
+#include "node_extent_accessor.h"
+#include "node_impl.h"
+#include "stages/node_stage_layout.h"
+
+namespace crimson::os::seastore::onode {
+
+template <node_type_t NODE_TYPE> struct insert_key_type;
+template <> struct insert_key_type<node_type_t::INTERNAL> {
+ static constexpr auto type = KeyT::VIEW; };
+template <> struct insert_key_type<node_type_t::LEAF> {
+ static constexpr auto type = KeyT::HOBJ; };
+
+template <node_type_t NODE_TYPE> struct node_impl_type;
+template <> struct node_impl_type<node_type_t::INTERNAL> {
+ using type = InternalNodeImpl; };
+template <> struct node_impl_type<node_type_t::LEAF> {
+ using type = LeafNodeImpl; };
+
+template <node_type_t NODE_TYPE> struct node_marker_type;
+template <> struct node_marker_type<node_type_t::INTERNAL> {
+ using type = InternalNodeImpl::internal_marker_t; };
+template <> struct node_marker_type<node_type_t::LEAF> {
+ using type = LeafNodeImpl::leaf_marker_t; };
+
+/**
+ * NodeLayoutT
+ *
+ * Contains templated and concrete implementations for both InternalNodeImpl
+ * and LeafNodeImpl under a specific node layout.
+ */
+template <typename FieldType, node_type_t NODE_TYPE>
+class NodeLayoutT final : public InternalNodeImpl, public LeafNodeImpl {
+ public:
+ using URef = std::unique_ptr<NodeLayoutT>;
+ using extent_t = NodeExtentAccessorT<FieldType, NODE_TYPE>;
+ using parent_t = typename node_impl_type<NODE_TYPE>::type;
+ using marker_t = typename node_marker_type<NODE_TYPE>::type;
+ using node_stage_t = typename extent_t::node_stage_t;
+ using position_t = typename extent_t::position_t;
+ using value_t = typename extent_t::value_t;
+ static constexpr auto FIELD_TYPE = extent_t::FIELD_TYPE;
+ static constexpr auto KEY_TYPE = insert_key_type<NODE_TYPE>::type;
+ static constexpr auto STAGE = STAGE_T::STAGE;
+
+ NodeLayoutT(const NodeLayoutT&) = delete;
+ NodeLayoutT(NodeLayoutT&&) = delete;
+ NodeLayoutT& operator=(const NodeLayoutT&) = delete;
+ NodeLayoutT& operator=(NodeLayoutT&&) = delete;
+ ~NodeLayoutT() override = default;
+
+ static URef load(NodeExtentRef extent, bool expect_is_level_tail) {
+ std::unique_ptr<NodeLayoutT> ret(new NodeLayoutT(extent));
+ assert(ret->is_level_tail() == expect_is_level_tail);
+ return ret;
+ }
+
+ using alloc_ertr = NodeExtentManager::tm_ertr;
+ static alloc_ertr::future<typename parent_t::fresh_impl_t> allocate(
+ context_t c, bool is_level_tail, level_t level) {
+ // NOTE: Currently, all the node types have the same size for simplicity.
+ // But depending on the requirement, we may need to make node size
+ // configurable by field_type_t and node_type_t, or totally flexible.
+ return c.nm.alloc_extent(c.t, node_stage_t::EXTENT_SIZE
+ ).safe_then([is_level_tail, level](auto extent) {
+ assert(extent->is_initial_pending());
+ auto mut = extent->get_mutable();
+ node_stage_t::bootstrap_extent(
+ mut, FIELD_TYPE, NODE_TYPE, is_level_tail, level);
+ return typename parent_t::fresh_impl_t{
+ std::unique_ptr<parent_t>(new NodeLayoutT(extent)), mut};
+ });
+ }
+
+ protected:
+ /*
+ * NodeImpl
+ */
+ field_type_t field_type() const override { return FIELD_TYPE; }
+ laddr_t laddr() const override { return extent.get_laddr(); }
+ void prepare_mutate(context_t c) override { return extent.prepare_mutate(c); }
+ bool is_level_tail() const override { return extent.read().is_level_tail(); }
+ bool is_empty() const override { return extent.read().keys() == 0; }
+ level_t level() const override { return extent.read().level(); }
+ node_offset_t free_size() const override { return extent.read().free_size(); }
+
+ key_view_t get_key_view(const search_position_t& position) const override {
+ key_view_t ret;
+ STAGE_T::get_key_view(extent.read(), cast_down<STAGE>(position), ret);
+ return ret;
+ }
+
+ key_view_t get_largest_key_view() const override {
+ key_view_t index_key;
+ STAGE_T::template lookup_largest_slot<false, true, false>(
+ extent.read(), nullptr, &index_key, nullptr);
+ return index_key;
+ }
+
+ void next_position(search_position_t& pos) const override {
+ assert(!pos.is_end());
+ bool find_next = STAGE_T::next_position(extent.read(), cast_down<STAGE>(pos));
+ if (find_next) {
+ pos = search_position_t::end();
+ }
+ }
+
+ node_stats_t get_stats() const override {
+ node_stats_t stats;
+ auto& node_stage = extent.read();
+ key_view_t index_key;
+ if (node_stage.keys()) {
+ STAGE_T::get_stats(node_stage, stats, index_key);
+ }
+ stats.size_persistent = node_stage_t::EXTENT_SIZE;
+ stats.size_filled = filled_size();
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ if (is_level_tail()) {
+ stats.size_logical += sizeof(value_t);
+ stats.size_value += sizeof(value_t);
+ stats.num_kvs += 1;
+ }
+ }
+ return stats;
+ }
+
+ std::ostream& dump(std::ostream& os) const override {
+ auto& node_stage = extent.read();
+ auto p_start = node_stage.p_start();
+ dump_brief(os);
+ auto stats = get_stats();
+ os << " num_kvs=" << stats.num_kvs
+ << ", logical=" << stats.size_logical
+ << "B, overhead=" << stats.size_overhead
+ << "B, value=" << stats.size_value << "B";
+ os << ":\n header: " << node_stage_t::header_size() << "B";
+ size_t size = 0u;
+ if (node_stage.keys()) {
+ STAGE_T::dump(node_stage, os, " ", size, p_start);
+ } else {
+ size += node_stage_t::header_size();
+ if (NODE_TYPE == node_type_t::LEAF || !node_stage.is_level_tail()) {
+ os << " empty!";
+ }
+ }
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ if (node_stage.is_level_tail()) {
+ size += sizeof(laddr_t);
+ auto value_ptr = node_stage.get_end_p_laddr();
+ int offset = reinterpret_cast<const char*>(value_ptr) - p_start;
+ os << "\n tail value: 0x"
+ << std::hex << value_ptr->value << std::dec
+ << " " << size << "B"
+ << " @" << offset << "B";
+ }
+ }
+ assert(size == filled_size());
+ return os;
+ }
+
+ std::ostream& dump_brief(std::ostream& os) const override {
+ auto& node_stage = extent.read();
+ os << "Node" << NODE_TYPE << FIELD_TYPE
+ << "@0x" << std::hex << extent.get_laddr()
+ << "+" << node_stage_t::EXTENT_SIZE << std::dec
+ << (node_stage.is_level_tail() ? "$" : "")
+ << "(level=" << (unsigned)node_stage.level()
+ << ", filled=" << filled_size() << "B"
+ << ", free=" << node_stage.free_size() << "B"
+ << ")";
+ return os;
+ }
+
+ void validate_layout() const override {
+#ifndef NDEBUG
+ STAGE_T::validate(extent.read());
+#endif
+ }
+
+ void test_copy_to(NodeExtentMutable& to) const override {
+ extent.test_copy_to(to);
+ }
+
+ void test_set_tail(NodeExtentMutable& mut) override {
+ node_stage_t::update_is_level_tail(mut, extent.read(), true);
+ }
+
+ /*
+ * Common
+ */
+ const value_t* get_p_value(const search_position_t& position,
+ key_view_t* index_key=nullptr, marker_t={}) const override {
+ auto& node_stage = extent.read();
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ assert(!index_key);
+ if (position.is_end()) {
+ assert(is_level_tail());
+ return node_stage.get_end_p_laddr();
+ }
+ } else {
+ assert(!position.is_end());
+ }
+ if (index_key) {
+ return STAGE_T::template get_p_value<true>(
+ node_stage, cast_down<STAGE>(position), index_key);
+ } else {
+ return STAGE_T::get_p_value(node_stage, cast_down<STAGE>(position));
+ }
+ }
+
+ lookup_result_t<NODE_TYPE> lower_bound(
+ const key_hobj_t& key, MatchHistory& history,
+ key_view_t* index_key=nullptr, marker_t={}) const override {
+ auto& node_stage = extent.read();
+ if constexpr (NODE_TYPE == node_type_t::LEAF) {
+ if (unlikely(node_stage.keys() == 0)) {
+ history.set<STAGE_LEFT>(MatchKindCMP::LT);
+ return lookup_result_t<NODE_TYPE>::end();
+ }
+ }
+
+ typename STAGE_T::result_t result_raw;
+ if (index_key) {
+ result_raw = STAGE_T::template lower_bound<true>(
+ node_stage, key, history, index_key);
+#ifndef NDEBUG
+ if (!result_raw.is_end()) {
+ full_key_t<KeyT::VIEW> index;
+ STAGE_T::get_key_view(node_stage, result_raw.position, index);
+ assert(index == *index_key);
+ }
+#endif
+ } else {
+ result_raw = STAGE_T::lower_bound(node_stage, key, history);
+ }
+#ifndef NDEBUG
+ if (result_raw.is_end()) {
+ assert(result_raw.mstat == MSTAT_END);
+ } else {
+ full_key_t<KeyT::VIEW> index;
+ STAGE_T::get_key_view(node_stage, result_raw.position, index);
+ assert_mstat(key, index, result_raw.mstat);
+ }
+#endif
+
+ // calculate MSTAT_LT3
+ if constexpr (FIELD_TYPE == field_type_t::N0) {
+ // currently only internal node checks mstat
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ if (result_raw.mstat == MSTAT_LT2) {
+ auto cmp = compare_to<KeyT::HOBJ>(
+ key, node_stage[result_raw.position.index].shard_pool);
+ assert(cmp != MatchKindCMP::GT);
+ if (cmp != MatchKindCMP::EQ) {
+ result_raw.mstat = MSTAT_LT3;
+ }
+ }
+ }
+ }
+
+ auto result = normalize(std::move(result_raw));
+ if (result.is_end()) {
+ assert(node_stage.is_level_tail());
+ assert(result.p_value == nullptr);
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ result.p_value = node_stage.get_end_p_laddr();
+ }
+ } else {
+ assert(result.p_value != nullptr);
+ }
+ return result;
+ }
+
+ const value_t* insert(
+ const full_key_t<KEY_TYPE>& key, const value_t& value,
+ search_position_t& insert_pos, match_stage_t& insert_stage,
+ node_offset_t& insert_size) override {
+ logger().debug("OTree::Layout::Insert: begin at "
+ "insert_pos({}), insert_stage={}, insert_size={}B ...",
+ insert_pos, insert_stage, insert_size);
+ if (unlikely(logger().is_enabled(seastar::log_level::trace))) {
+ std::ostringstream sos;
+ dump(sos);
+ logger().trace("OTree::Layout::Insert: -- dump\n{}", sos.str());
+ }
+ auto ret = extent.template insert_replayable<KEY_TYPE>(
+ key, value, cast_down<STAGE>(insert_pos), insert_stage, insert_size);
+ logger().debug("OTree::Layout::Insert: done at "
+ "insert_pos({}), insert_stage={}, insert_size={}B",
+ insert_pos, insert_stage, insert_size);
+ if (unlikely(logger().is_enabled(seastar::log_level::trace))) {
+ std::ostringstream sos;
+ dump(sos);
+ logger().trace("OTree::Layout::Insert: -- dump\n{}", sos.str());
+ }
+ validate_layout();
+ assert(get_key_view(insert_pos) == key);
+ return ret;
+ }
+
+ std::tuple<search_position_t, bool, const value_t*> split_insert(
+ NodeExtentMutable& right_mut, NodeImpl& right_impl,
+ const full_key_t<KEY_TYPE>& key, const value_t& value,
+ search_position_t& _insert_pos, match_stage_t& insert_stage,
+ node_offset_t& insert_size) override {
+ logger().info("OTree::Layout::Split: begin at "
+ "insert_pos({}), insert_stage={}, insert_size={}B, "
+ "{:#x}=>{:#x} ...",
+ _insert_pos, insert_stage, insert_size,
+ laddr(), right_impl.laddr());
+ if (unlikely(logger().is_enabled(seastar::log_level::debug))) {
+ std::ostringstream sos;
+ dump(sos);
+ logger().debug("OTree::Layout::Split: -- dump\n{}", sos.str());
+ }
+#ifdef UNIT_TESTS_BUILT
+ auto insert_stage_pre = insert_stage;
+#endif
+
+ auto& insert_pos = cast_down<STAGE>(_insert_pos);
+ auto& node_stage = extent.read();
+ typename STAGE_T::StagedIterator split_at;
+ bool is_insert_left;
+ size_t split_size;
+ size_t target_split_size;
+ {
+ size_t empty_size = node_stage.size_before(0);
+ size_t filled_kv_size = filled_size() - empty_size;
+ /** NODE_BLOCK_SIZE considerations
+ *
+ * Generally,
+ * target_split_size = (filled_size + insert_size) / 2
+ * We can have two locate_split() strategies:
+ * A. the simpler one is to locate the largest split position where
+ * the estimated left_node_size <= target_split_size;
+ * B. the fair one takes a further step to calculate the next slot of
+ * P KiB, and if left_node_size + P/2 < target_split_size, compensate
+ * the split position to include the next slot; (TODO)
+ *
+       * Say that the node_block_size = N KiB and the largest allowed
+       * insert_size = 1/I * N KiB (I > 1). We want to identify the minimal 'I'
+       * that won't lead to the "double split" effect, meaning that after a split,
+       * the right node size is still larger than N KiB and needs to split
+ * again. I think "double split" makes split much more complicated and
+ * we can no longer identify whether the node is safe under concurrent
+ * operations.
+ *
+ * We need to evaluate the worst case in order to identify 'I'. This means:
+ * - filled_size ~= N KiB
+ * - insert_size == N/I KiB
+ * - target_split_size ~= (I+1)/2I * N KiB
+ * To simplify the below calculations, node_block_size is normalized to 1.
+ *
+ * With strategy A, the worst case is when left_node_size cannot include
+ * the next slot that will just overflow the target_split_size:
+ * - left_node_size + 1/I ~= (I+1)/2I
+ * - left_node_size ~= (I-1)/2I
+ * - right_node_size ~= 1 + 1/I - left_node_size ~= (I+3)/2I
+       * The right_node_size cannot be larger than the node_block_size in the
+ * worst case, which means (I+3)/2I < 1, so I > 3, meaning the largest
+ * possible insert_size must be smaller than 1/3 of the node_block_size.
+ *
+ * With strategy B, the worst case is when left_node_size cannot include
+ * the next slot that will just overflow the threshold
+ * target_split_size - 1/2I, thus:
+ * - left_node_size ~= (I+1)/2I - 1/2I ~= 1/2
+ * - right_node_size ~= 1 + 1/I - 1/2 ~= (I+2)/2I < node_block_size(1)
+ * - I > 2
+ * This means the largest possible insert_size must be smaller than 1/2 of
+ * the node_block_size, which is better than strategy A.
+       *
+ * In order to avoid "double split", there is another side-effect we need
+ * to take into consideration: if split happens with snap-gen indexes, the
+       * corresponding ns-oid string needs to be copied to the right node. That is
+ * to say: right_node_size + string_size < node_block_size.
+ *
+ * Say that the largest allowed string size is 1/S of the largest allowed
+       * insert_size N/I KiB. If we go with strategy B, the equation should be
+ * changed to:
+ * - right_node_size ~= (I+2)/2I + 1/(I*S) < 1
+ * - I > 2 + 2/S (S > 1)
+ *
+ * Now back to NODE_BLOCK_SIZE calculation, if we have limits of at most
+ * X KiB ns-oid string and Y KiB of onode_t to store in this BTree, then:
+ * - largest_insert_size ~= X+Y KiB
+ * - 1/S == X/(X+Y)
+ * - I > (4X+2Y)/(X+Y)
+ * - node_block_size(N) == I * insert_size > 4X+2Y KiB
+ *
+ * In conclusion,
+ * (TODO) the current node block size (4 KiB) is too small to
+       * store an entire 2 KiB ns-oid string. We need to consider a larger
+ * node_block_size.
+ *
+ * We are setting X = Y = 640 B in order not to break the current
+       * implementations with the 4 KiB node size.
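+       *
+       * A quick numeric check of the formulas above with X = Y = 640 B
+       * (illustrative only):
+       * - largest_insert_size ~= X+Y = 1280 B
+       * - I > (4X+2Y)/(X+Y) = 3840/1280 = 3
+       * - node_block_size == I * insert_size > 4X+2Y = 3840 B,
+       *   which the current 4 KiB node block just satisfies.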
+ *
+       * (TODO) Implement smarter logic to check when "double split" happens.
+ */
+ target_split_size = empty_size + (filled_kv_size + insert_size) / 2;
+ assert(insert_size < (node_stage.total_size() - empty_size) / 2);
+
+ std::optional<bool> _is_insert_left;
+ split_at.set(node_stage);
+ split_size = 0;
+ bool locate_nxt = STAGE_T::recursively_locate_split_inserted(
+ split_size, 0, target_split_size, insert_pos,
+ insert_stage, insert_size, _is_insert_left, split_at);
+ is_insert_left = *_is_insert_left;
+ logger().debug("OTree::Layout::Split: -- located "
+ "split_at({}), insert_pos({}), is_insert_left={}, "
+ "split_size={}B(target={}B, current={}B)",
+ split_at, insert_pos, is_insert_left,
+ split_size, target_split_size, filled_size());
+ // split_size can be larger than target_split_size in strategy B
+ // assert(split_size <= target_split_size);
+ if (locate_nxt) {
+ assert(insert_stage == STAGE);
+ assert(split_at.get().is_last());
+ split_at.set_end();
+ assert(insert_pos.index == split_at.index());
+ }
+ }
+
+ auto append_at = split_at;
+ // TODO(cross-node string dedup)
+ typename STAGE_T::template StagedAppender<KEY_TYPE> right_appender;
+ right_appender.init(&right_mut, right_mut.get_write());
+ const value_t* p_value = nullptr;
+ if (!is_insert_left) {
+ // right node: append [start(append_at), insert_pos)
+ STAGE_T::template append_until<KEY_TYPE>(
+ append_at, right_appender, insert_pos, insert_stage);
+ logger().debug("OTree::Layout::Split: -- right appended until "
+ "insert_pos({}), insert_stage={}, insert/append the rest ...",
+ insert_pos, insert_stage);
+ // right node: append [insert_pos(key, value)]
+ bool is_front_insert = (insert_pos == position_t::begin());
+ [[maybe_unused]] bool is_end = STAGE_T::template append_insert<KEY_TYPE>(
+ key, value, append_at, right_appender,
+ is_front_insert, insert_stage, p_value);
+ assert(append_at.is_end() == is_end);
+ } else {
+ logger().debug("OTree::Layout::Split: -- right appending ...");
+ }
+
+ // right node: append (insert_pos, end)
+ auto pos_end = position_t::end();
+ STAGE_T::template append_until<KEY_TYPE>(
+ append_at, right_appender, pos_end, STAGE);
+ assert(append_at.is_end());
+ right_appender.wrap();
+ if (unlikely(logger().is_enabled(seastar::log_level::debug))) {
+ std::ostringstream sos;
+ right_impl.dump(sos);
+ logger().debug("OTree::Layout::Split: -- right node dump\n{}", sos.str());
+ }
+ right_impl.validate_layout();
+
+ // mutate left node
+ if (is_insert_left) {
+ logger().debug("OTree::Layout::Split: -- left trim/insert at "
+ "insert_pos({}), insert_stage={} ...",
+ insert_pos, insert_stage);
+ p_value = extent.template split_insert_replayable<KEY_TYPE>(
+ split_at, key, value, insert_pos, insert_stage, insert_size);
+ assert(get_key_view(_insert_pos) == key);
+ } else {
+ logger().debug("OTree::Layout::Split: -- left trim ...");
+ assert(right_impl.get_key_view(_insert_pos) == key);
+ extent.split_replayable(split_at);
+ }
+ if (unlikely(logger().is_enabled(seastar::log_level::debug))) {
+ std::ostringstream sos;
+ dump(sos);
+ logger().debug("OTree::Layout::Split: -- left node dump\n{}", sos.str());
+ }
+ validate_layout();
+ assert(p_value);
+
+ auto split_pos = normalize(split_at.get_pos());
+ logger().info("OTree::Layout::Split: done at "
+ "insert_pos({}), insert_stage={}, insert_size={}B, split_at({}), "
+ "is_insert_left={}, split_size={}B(target={}B)",
+ _insert_pos, insert_stage, insert_size, split_pos,
+ is_insert_left, split_size, target_split_size);
+ assert(split_size == filled_size());
+
+#ifdef UNIT_TESTS_BUILT
+ InsertType insert_type;
+ search_position_t last_pos;
+ if (is_insert_left) {
+ STAGE_T::template lookup_largest_slot<true, false, false>(
+ extent.read(), &cast_down_fill_0<STAGE>(last_pos), nullptr, nullptr);
+ } else {
+ node_stage_t right_stage{reinterpret_cast<FieldType*>(right_mut.get_write())};
+ STAGE_T::template lookup_largest_slot<true, false, false>(
+ right_stage, &cast_down_fill_0<STAGE>(last_pos), nullptr, nullptr);
+ }
+ if (_insert_pos == search_position_t::begin()) {
+ insert_type = InsertType::BEGIN;
+ } else if (_insert_pos == last_pos) {
+ insert_type = InsertType::LAST;
+ } else {
+ insert_type = InsertType::MID;
+ }
+ last_split = {split_pos, insert_stage_pre, is_insert_left, insert_type};
+#endif
+ return {split_pos, is_insert_left, p_value};
+ }
+
+ /*
+ * InternalNodeImpl
+ */
+ void replace_child_addr(
+ const search_position_t& pos, laddr_t dst, laddr_t src) override {
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ const laddr_packed_t* p_value = get_p_value(pos);
+ assert(p_value->value == src);
+ extent.update_child_addr_replayable(dst, const_cast<laddr_packed_t*>(p_value));
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+
+ std::tuple<match_stage_t, node_offset_t> evaluate_insert(
+ const key_view_t& key, const laddr_t& value,
+ search_position_t& insert_pos) const override {
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ auto packed_value = laddr_packed_t{value};
+ auto& node_stage = extent.read();
+ match_stage_t insert_stage;
+ node_offset_t insert_size;
+ if (unlikely(!node_stage.keys())) {
+ assert(insert_pos.is_end());
+ insert_stage = STAGE;
+ insert_size = STAGE_T::template insert_size<KeyT::VIEW>(key, packed_value);
+ } else {
+ std::tie(insert_stage, insert_size) = STAGE_T::evaluate_insert(
+ node_stage, key, packed_value, cast_down<STAGE>(insert_pos), false);
+ }
+ return {insert_stage, insert_size};
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+
+ /*
+ * LeafNodeImpl
+ */
+ void get_largest_slot(search_position_t& pos,
+ key_view_t& index_key, const onode_t** pp_value) const override {
+ if constexpr (NODE_TYPE == node_type_t::LEAF) {
+ STAGE_T::template lookup_largest_slot<true, true, true>(
+ extent.read(), &cast_down_fill_0<STAGE>(pos), &index_key, pp_value);
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+
+ std::tuple<match_stage_t, node_offset_t> evaluate_insert(
+ const key_hobj_t& key, const onode_t& value,
+ const MatchHistory& history, match_stat_t mstat,
+ search_position_t& insert_pos) const override {
+ if constexpr (NODE_TYPE == node_type_t::LEAF) {
+ if (unlikely(is_empty())) {
+ assert(insert_pos.is_end());
+ return {STAGE, STAGE_T::template insert_size<KeyT::HOBJ>(key, value)};
+ } else {
+ return STAGE_T::evaluate_insert(
+ key, value, history, mstat, cast_down<STAGE>(insert_pos));
+ }
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+
+ private:
+ NodeLayoutT(NodeExtentRef extent) : extent{extent} {}
+
+ node_offset_t filled_size() const {
+ auto& node_stage = extent.read();
+ auto ret = node_stage.size_before(node_stage.keys());
+ assert(ret == node_stage.total_size() - node_stage.free_size());
+ return ret;
+ }
+
+ static seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+
+ extent_t extent;
+};
+
+using InternalNode0 = NodeLayoutT<node_fields_0_t, node_type_t::INTERNAL>;
+using InternalNode1 = NodeLayoutT<node_fields_1_t, node_type_t::INTERNAL>;
+using InternalNode2 = NodeLayoutT<node_fields_2_t, node_type_t::INTERNAL>;
+using InternalNode3 = NodeLayoutT<internal_fields_3_t, node_type_t::INTERNAL>;
+using LeafNode0 = NodeLayoutT<node_fields_0_t, node_type_t::LEAF>;
+using LeafNode1 = NodeLayoutT<node_fields_1_t, node_type_t::LEAF>;
+using LeafNode2 = NodeLayoutT<node_fields_2_t, node_type_t::LEAF>;
+using LeafNode3 = NodeLayoutT<leaf_fields_3_t, node_type_t::LEAF>;
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout_replayable.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout_replayable.h
new file mode 100644
index 000000000..c1499d609
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout_replayable.h
@@ -0,0 +1,75 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "node_extent_mutable.h"
+#include "stages/node_stage.h"
+#include "stages/stage.h"
+
+#define STAGE_T node_to_stage_t<node_stage_t>
+
+namespace crimson::os::seastore::onode {
+
+/**
+ * NodeLayoutReplayableT
+ *
+ * Contains the templated logic to modify the layout of a NodeExtent in ways
+ * that are also replayable. Used by NodeExtentAccessorT at runtime and by
+ * DeltaRecorderT during replay.
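+ *
+ * For illustration (hypothetical call site), the runtime path and the replay
+ * path both reduce to the same static helper, e.g.:
+ *   NodeLayoutReplayableT<FieldType, NODE_TYPE>::split(mut, node_stage, split_at);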
+ */
+template <typename FieldType, node_type_t NODE_TYPE>
+struct NodeLayoutReplayableT {
+ using node_stage_t = node_extent_t<FieldType, NODE_TYPE>;
+ using position_t = typename STAGE_T::position_t;
+ using StagedIterator = typename STAGE_T::StagedIterator;
+ using value_t = value_type_t<NODE_TYPE>;
+ static constexpr auto FIELD_TYPE = FieldType::FIELD_TYPE;
+
+ template <KeyT KT>
+ static const value_t* insert(
+ NodeExtentMutable& mut,
+ const node_stage_t& node_stage,
+ const full_key_t<KT>& key,
+ const value_t& value,
+ position_t& insert_pos,
+ match_stage_t& insert_stage,
+ node_offset_t& insert_size) {
+ auto p_value = STAGE_T::template proceed_insert<KT, false>(
+ mut, node_stage, key, value, insert_pos, insert_stage, insert_size);
+ return p_value;
+ }
+
+ static void split(
+ NodeExtentMutable& mut,
+ const node_stage_t& node_stage,
+ StagedIterator& split_at) {
+ node_stage_t::update_is_level_tail(mut, node_stage, false);
+ STAGE_T::trim(mut, split_at);
+ }
+
+ template <KeyT KT>
+ static const value_t* split_insert(
+ NodeExtentMutable& mut,
+ const node_stage_t& node_stage,
+ StagedIterator& split_at,
+ const full_key_t<KT>& key,
+ const value_t& value,
+ position_t& insert_pos,
+ match_stage_t& insert_stage,
+ node_offset_t& insert_size) {
+ node_stage_t::update_is_level_tail(mut, node_stage, false);
+ STAGE_T::trim(mut, split_at);
+ auto p_value = STAGE_T::template proceed_insert<KT, true>(
+ mut, node_stage, key, value, insert_pos, insert_stage, insert_size);
+ return p_value;
+ }
+
+ static void update_child_addr(
+ NodeExtentMutable& mut, const laddr_t new_addr, laddr_packed_t* p_addr) {
+ assert(NODE_TYPE == node_type_t::INTERNAL);
+ mut.copy_in_absolute(p_addr, new_addr);
+ }
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h
new file mode 100644
index 000000000..6774544c7
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cassert>
+#include <ostream>
+
+#include "fwd.h"
+
+namespace crimson::os::seastore::onode {
+
+constexpr uint8_t FIELD_TYPE_MAGIC = 0x25;
+enum class field_type_t : uint8_t {
+ N0 = FIELD_TYPE_MAGIC,
+ N1,
+ N2,
+ N3,
+ _MAX
+};
+inline uint8_t to_unsigned(field_type_t type) {
+ auto value = static_cast<uint8_t>(type);
+ assert(value >= FIELD_TYPE_MAGIC);
+ assert(value < static_cast<uint8_t>(field_type_t::_MAX));
+ return value - FIELD_TYPE_MAGIC;
+}
+inline std::ostream& operator<<(std::ostream &os, field_type_t type) {
+ const char* const names[] = {"0", "1", "2", "3"};
+ auto index = to_unsigned(type);
+ os << names[index];
+ return os;
+}
+
+enum class node_type_t : uint8_t {
+ LEAF = 0,
+ INTERNAL
+};
+inline std::ostream& operator<<(std::ostream &os, const node_type_t& type) {
+ const char* const names[] = {"L", "I"};
+ auto index = static_cast<uint8_t>(type);
+ assert(index <= 1u);
+ os << names[index];
+ return os;
+}
+
+struct laddr_packed_t {
+ laddr_t value;
+} __attribute__((packed));
+inline std::ostream& operator<<(std::ostream& os, const laddr_packed_t& laddr) {
+ return os << "laddr_packed(0x" << std::hex << laddr.value << std::dec << ")";
+}
+
+using match_stat_t = int8_t;
+constexpr match_stat_t MSTAT_END = -2; // index is search_position_t::end()
+constexpr match_stat_t MSTAT_EQ = -1; // key == index
+constexpr match_stat_t MSTAT_LT0 = 0; // key == index [pool/shard crush ns/oid]; key < index [snap/gen]
+constexpr match_stat_t MSTAT_LT1 = 1; // key == index [pool/shard crush]; key < index [ns/oid]
+constexpr match_stat_t MSTAT_LT2 = 2; // key < index [pool/shard crush ns/oid] ||
+ // key == index [pool/shard]; key < index [crush]
+constexpr match_stat_t MSTAT_LT3 = 3; // key < index [pool/shard]
+constexpr match_stat_t MSTAT_MIN = MSTAT_END;
+constexpr match_stat_t MSTAT_MAX = MSTAT_LT3;
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc
new file mode 100644
index 000000000..443c6cabd
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc
@@ -0,0 +1,165 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "item_iterator_stage.h"
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h"
+
+namespace crimson::os::seastore::onode {
+
+#define ITER_T item_iterator_t<NODE_TYPE>
+#define ITER_INST(NT) item_iterator_t<NT>
+
+template <node_type_t NODE_TYPE>
+template <KeyT KT>
+memory_range_t ITER_T::insert_prefix(
+ NodeExtentMutable& mut, const ITER_T& iter, const full_key_t<KT>& key,
+ bool is_end, node_offset_t size, const char* p_left_bound) {
+ // 1. insert range
+ char* p_insert;
+ if (is_end) {
+ assert(!iter.has_next());
+ p_insert = const_cast<char*>(iter.p_start());
+ } else {
+ p_insert = const_cast<char*>(iter.p_end());
+ }
+ char* p_insert_front = p_insert - size;
+
+ // 2. shift memory
+ const char* p_shift_start = p_left_bound;
+ const char* p_shift_end = p_insert;
+ mut.shift_absolute(p_shift_start,
+ p_shift_end - p_shift_start,
+ -(int)size);
+
+ // 3. append header
+ p_insert -= sizeof(node_offset_t);
+ node_offset_t back_offset = (p_insert - p_insert_front);
+ mut.copy_in_absolute(p_insert, back_offset);
+ ns_oid_view_t::append<KT>(mut, key, p_insert);
+
+ return {p_insert_front, p_insert};
+}
+#define IP_TEMPLATE(NT, KT) \
+ template memory_range_t ITER_INST(NT)::insert_prefix<KT>( \
+ NodeExtentMutable&, const ITER_INST(NT)&, const full_key_t<KT>&, \
+ bool, node_offset_t, const char*)
+IP_TEMPLATE(node_type_t::LEAF, KeyT::VIEW);
+IP_TEMPLATE(node_type_t::INTERNAL, KeyT::VIEW);
+IP_TEMPLATE(node_type_t::LEAF, KeyT::HOBJ);
+IP_TEMPLATE(node_type_t::INTERNAL, KeyT::HOBJ);
+
+template <node_type_t NODE_TYPE>
+void ITER_T::update_size(
+ NodeExtentMutable& mut, const ITER_T& iter, int change) {
+ node_offset_t offset = iter.get_back_offset();
+ int new_size = change + offset;
+ assert(new_size > 0 && new_size < NODE_BLOCK_SIZE);
+ mut.copy_in_absolute(
+ (void*)iter.get_item_range().p_end, node_offset_t(new_size));
+}
+
+template <node_type_t NODE_TYPE>
+node_offset_t ITER_T::trim_until(NodeExtentMutable&, const ITER_T& iter) {
+ assert(iter.index() != 0);
+ size_t ret = iter.p_end() - iter.p_items_start;
+ assert(ret < NODE_BLOCK_SIZE);
+ return ret;
+}
+
+template <node_type_t NODE_TYPE>
+node_offset_t ITER_T::trim_at(
+ NodeExtentMutable& mut, const ITER_T& iter, node_offset_t trimmed) {
+ size_t trim_size = iter.p_start() - iter.p_items_start + trimmed;
+ assert(trim_size < NODE_BLOCK_SIZE);
+ assert(iter.get_back_offset() > trimmed);
+ node_offset_t new_offset = iter.get_back_offset() - trimmed;
+ mut.copy_in_absolute((void*)iter.item_range.p_end, new_offset);
+ return trim_size;
+}
+
+#define ITER_TEMPLATE(NT) template class ITER_INST(NT)
+ITER_TEMPLATE(node_type_t::LEAF);
+ITER_TEMPLATE(node_type_t::INTERNAL);
+
+#define APPEND_T ITER_T::Appender<KT>
+
+template <node_type_t NODE_TYPE>
+template <KeyT KT>
+bool APPEND_T::append(const ITER_T& src, index_t& items) {
+ auto p_end = src.p_end();
+ bool append_till_end = false;
+ if (is_valid_index(items)) {
+ for (auto i = 1u; i <= items; ++i) {
+ if (!src.has_next()) {
+ assert(i == items);
+ append_till_end = true;
+ break;
+ }
+ ++src;
+ }
+ } else {
+ if (items == INDEX_END) {
+ append_till_end = true;
+ } else {
+ assert(items == INDEX_LAST);
+ }
+ items = 0;
+ while (src.has_next()) {
+ ++src;
+ ++items;
+ }
+ if (append_till_end) {
+ ++items;
+ }
+ }
+
+ const char* p_start;
+ if (append_till_end) {
+ p_start = src.p_start();
+ } else {
+ p_start = src.p_end();
+ }
+ assert(p_end >= p_start);
+ size_t append_size = p_end - p_start;
+ p_append -= append_size;
+ p_mut->copy_in_absolute(p_append, p_start, append_size);
+ return append_till_end;
+}
+
+template <node_type_t NODE_TYPE>
+template <KeyT KT>
+std::tuple<NodeExtentMutable*, char*>
+APPEND_T::open_nxt(const key_get_type& partial_key) {
+ p_append -= sizeof(node_offset_t);
+ p_offset_while_open = p_append;
+ ns_oid_view_t::append(*p_mut, partial_key, p_append);
+ return {p_mut, p_append};
+}
+
+template <node_type_t NODE_TYPE>
+template <KeyT KT>
+std::tuple<NodeExtentMutable*, char*>
+APPEND_T::open_nxt(const full_key_t<KT>& key) {
+ p_append -= sizeof(node_offset_t);
+ p_offset_while_open = p_append;
+ ns_oid_view_t::append<KT>(*p_mut, key, p_append);
+ return {p_mut, p_append};
+}
+
+template <node_type_t NODE_TYPE>
+template <KeyT KT>
+void APPEND_T::wrap_nxt(char* _p_append) {
+ assert(_p_append < p_append);
+ p_mut->copy_in_absolute(
+ p_offset_while_open, node_offset_t(p_offset_while_open - _p_append));
+ p_append = _p_append;
+}
+
+#define APPEND_TEMPLATE(NT, KT) template class ITER_INST(NT)::Appender<KT>
+APPEND_TEMPLATE(node_type_t::LEAF, KeyT::VIEW);
+APPEND_TEMPLATE(node_type_t::INTERNAL, KeyT::VIEW);
+APPEND_TEMPLATE(node_type_t::LEAF, KeyT::HOBJ);
+APPEND_TEMPLATE(node_type_t::INTERNAL, KeyT::HOBJ);
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h
new file mode 100644
index 000000000..bb68eec8f
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h
@@ -0,0 +1,180 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h"
+#include "key_layout.h"
+#include "stage_types.h"
+
+namespace crimson::os::seastore::onode {
+
+class NodeExtentMutable;
+
+/**
+ * item_iterator_t
+ *
+ * The STAGE_STRING implementation for nodes N0/N1. It implements the staged
+ * contract as an iterative container to resolve crush hash conflicts.
+ *
+ * The layout of the container indexing the ns, oid strings of n items:
+ *
+ * # <--------- container range ---------> #
+ * #<~># items [i+1, n) #
+ * # # items [0, i) #<~>#
+ * # # <------ item i -------------> # #
+ * # # <--- item_range ---> | # #
+ * # # | # #
+ * # # next-stage | ns-oid | back_ # #
+ * # # contaner | strings | offset # #
+ * #...# range | | #...#
+ * ^ ^ | ^
+ * | | | |
+ * | +---------------------------+ |
+ * + p_items_start p_items_end +
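+ *
+ * A rough sketch of walking such a container (simplified; it assumes the
+ * range holds at least one item and only uses the interfaces below):
+ *   item_iterator_t<NODE_TYPE> iter(range);
+ *   while (true) {
+ *     const ns_oid_view_t& key = iter.get_key();  // ns-oid strings of item i
+ *     auto nxt = iter.get_nxt_container();        // range of the next stage
+ *     if (!iter.has_next()) break;
+ *     ++iter;                                     // advance towards p_items_start
+ *   }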
+ */
+template <node_type_t NODE_TYPE>
+class item_iterator_t {
+ using value_t = value_type_t<NODE_TYPE>;
+ public:
+ item_iterator_t(const memory_range_t& range)
+ : p_items_start(range.p_start), p_items_end(range.p_end) {
+ assert(p_items_start < p_items_end);
+ next_item_range(p_items_end);
+ }
+
+ const char* p_start() const { return item_range.p_start; }
+ const char* p_end() const { return item_range.p_end + sizeof(node_offset_t); }
+ const memory_range_t& get_item_range() const { return item_range; }
+ node_offset_t get_back_offset() const { return back_offset; }
+
+ // container type system
+ using key_get_type = const ns_oid_view_t&;
+ static constexpr auto CONTAINER_TYPE = ContainerType::ITERATIVE;
+ index_t index() const { return _index; }
+ key_get_type get_key() const {
+ if (!key.has_value()) {
+ key = ns_oid_view_t(item_range.p_end);
+ assert(item_range.p_start < (*key).p_start());
+ }
+ return *key;
+ }
+ node_offset_t size() const {
+ size_t ret = item_range.p_end - item_range.p_start + sizeof(node_offset_t);
+ assert(ret < NODE_BLOCK_SIZE);
+ return ret;
+ };
+ node_offset_t size_to_nxt() const {
+ size_t ret = get_key().size() + sizeof(node_offset_t);
+ assert(ret < NODE_BLOCK_SIZE);
+ return ret;
+ }
+ node_offset_t size_overhead() const {
+ return sizeof(node_offset_t) + get_key().size_overhead();
+ }
+ memory_range_t get_nxt_container() const {
+ return {item_range.p_start, get_key().p_start()};
+ }
+ bool has_next() const {
+ assert(p_items_start <= item_range.p_start);
+ return p_items_start < item_range.p_start;
+ }
+ const item_iterator_t<NODE_TYPE>& operator++() const {
+ assert(has_next());
+ next_item_range(item_range.p_start);
+ key.reset();
+ ++_index;
+ return *this;
+ }
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const {
+ int start_offset = p_items_start - p_node_start;
+ int end_offset = p_items_end - p_node_start;
+ assert(start_offset > 0 && start_offset < NODE_BLOCK_SIZE);
+ assert(end_offset > 0 && end_offset <= NODE_BLOCK_SIZE);
+ ceph::encode(static_cast<node_offset_t>(start_offset), encoded);
+ ceph::encode(static_cast<node_offset_t>(end_offset), encoded);
+ ceph::encode(_index, encoded);
+ }
+
+ static item_iterator_t decode(const char* p_node_start,
+ ceph::bufferlist::const_iterator& delta) {
+ node_offset_t start_offset;
+ ceph::decode(start_offset, delta);
+ node_offset_t end_offset;
+ ceph::decode(end_offset, delta);
+ assert(start_offset < end_offset);
+ assert(end_offset <= NODE_BLOCK_SIZE);
+ index_t index;
+ ceph::decode(index, delta);
+
+ item_iterator_t ret({p_node_start + start_offset,
+ p_node_start + end_offset});
+ while (index > 0) {
+ ++ret;
+ --index;
+ }
+ return ret;
+ }
+
+ static node_offset_t header_size() { return 0u; }
+
+ template <KeyT KT>
+ static node_offset_t estimate_insert(
+ const full_key_t<KT>& key, const value_t&) {
+ return ns_oid_view_t::estimate_size<KT>(key) + sizeof(node_offset_t);
+ }
+
+ template <KeyT KT>
+ static memory_range_t insert_prefix(
+ NodeExtentMutable& mut, const item_iterator_t<NODE_TYPE>& iter,
+ const full_key_t<KT>& key, bool is_end,
+ node_offset_t size, const char* p_left_bound);
+
+ static void update_size(
+ NodeExtentMutable& mut, const item_iterator_t<NODE_TYPE>& iter, int change);
+
+ static node_offset_t trim_until(NodeExtentMutable&, const item_iterator_t<NODE_TYPE>&);
+ static node_offset_t trim_at(
+ NodeExtentMutable&, const item_iterator_t<NODE_TYPE>&, node_offset_t trimmed);
+
+ template <KeyT KT>
+ class Appender;
+
+ private:
+ void next_item_range(const char* p_end) const {
+ auto p_item_end = p_end - sizeof(node_offset_t);
+ assert(p_items_start < p_item_end);
+ back_offset = reinterpret_cast<const node_offset_packed_t*>(p_item_end)->value;
+ assert(back_offset);
+ const char* p_item_start = p_item_end - back_offset;
+ assert(p_items_start <= p_item_start);
+ item_range = {p_item_start, p_item_end};
+ }
+
+ const char* p_items_start;
+ const char* p_items_end;
+ mutable memory_range_t item_range;
+ mutable node_offset_t back_offset;
+ mutable std::optional<ns_oid_view_t> key;
+ mutable index_t _index = 0u;
+};
+
+template <node_type_t NODE_TYPE>
+template <KeyT KT>
+class item_iterator_t<NODE_TYPE>::Appender {
+ public:
+ Appender(NodeExtentMutable* p_mut, char* p_append)
+ : p_mut{p_mut}, p_append{p_append} {}
+ bool append(const item_iterator_t<NODE_TYPE>& src, index_t& items);
+ char* wrap() { return p_append; }
+ std::tuple<NodeExtentMutable*, char*> open_nxt(const key_get_type&);
+ std::tuple<NodeExtentMutable*, char*> open_nxt(const full_key_t<KT>&);
+ void wrap_nxt(char* _p_append);
+
+ private:
+ NodeExtentMutable* p_mut;
+ char* p_append;
+ char* p_offset_while_open;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc
new file mode 100644
index 000000000..d60bb8d09
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc
@@ -0,0 +1,32 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "key_layout.h"
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h"
+
+namespace crimson::os::seastore::onode {
+
+void string_key_view_t::append_str(
+ NodeExtentMutable& mut, std::string_view str, char*& p_append) {
+ assert(is_valid_size(str.length()));
+ p_append -= sizeof(string_size_t);
+ string_size_t len = str.length();
+ mut.copy_in_absolute(p_append, len);
+ p_append -= len;
+ mut.copy_in_absolute(p_append, str.data(), len);
+}
+
+void string_key_view_t::append_dedup(
+ NodeExtentMutable& mut, const Type& dedup_type, char*& p_append) {
+ p_append -= sizeof(string_size_t);
+ if (dedup_type == Type::MIN) {
+ mut.copy_in_absolute(p_append, MIN);
+ } else if (dedup_type == Type::MAX) {
+ mut.copy_in_absolute(p_append, MAX);
+ } else {
+ ceph_abort("impossible path");
+ }
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h
new file mode 100644
index 000000000..cc1f546c1
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h
@@ -0,0 +1,846 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cassert>
+#include <limits>
+#include <optional>
+#include <ostream>
+
+#include "common/hobject.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/fwd.h"
+
+namespace crimson::os::seastore::onode {
+
+using shard_t = int8_t;
+using pool_t = int64_t;
+using crush_hash_t = uint32_t;
+using snap_t = uint64_t;
+using gen_t = uint64_t;
+static_assert(sizeof(shard_t) == sizeof(ghobject_t().shard_id.id));
+static_assert(sizeof(pool_t) == sizeof(ghobject_t().hobj.pool));
+static_assert(sizeof(crush_hash_t) == sizeof(ghobject_t().hobj.get_hash()));
+static_assert(sizeof(snap_t) == sizeof(ghobject_t().hobj.snap.val));
+static_assert(sizeof(gen_t) == sizeof(ghobject_t().generation));
+
+class NodeExtentMutable;
+class key_view_t;
+class key_hobj_t;
+enum class KeyT { VIEW, HOBJ };
+template <KeyT> struct _full_key_type;
+template<> struct _full_key_type<KeyT::VIEW> { using type = key_view_t; };
+template<> struct _full_key_type<KeyT::HOBJ> { using type = key_hobj_t; };
+template <KeyT type>
+using full_key_t = typename _full_key_type<type>::type;
+
+struct node_offset_packed_t {
+ node_offset_t value;
+} __attribute__((packed));
+
+// TODO: consider alignments
+struct shard_pool_t {
+ bool operator==(const shard_pool_t& x) const {
+ return (shard == x.shard && pool == x.pool);
+ }
+ bool operator!=(const shard_pool_t& x) const { return !(*this == x); }
+
+ template <KeyT KT>
+ static shard_pool_t from_key(const full_key_t<KT>& key);
+
+ shard_t shard;
+ pool_t pool;
+} __attribute__((packed));
+inline std::ostream& operator<<(std::ostream& os, const shard_pool_t& sp) {
+ return os << (unsigned)sp.shard << "," << sp.pool;
+}
+inline MatchKindCMP compare_to(const shard_pool_t& l, const shard_pool_t& r) {
+ auto ret = toMatchKindCMP(l.shard, r.shard);
+ if (ret != MatchKindCMP::EQ)
+ return ret;
+ return toMatchKindCMP(l.pool, r.pool);
+}
+
+struct crush_t {
+ bool operator==(const crush_t& x) const { return crush == x.crush; }
+ bool operator!=(const crush_t& x) const { return !(*this == x); }
+
+ template <KeyT KT>
+ static crush_t from_key(const full_key_t<KT>& key);
+
+ crush_hash_t crush;
+} __attribute__((packed));
+inline std::ostream& operator<<(std::ostream& os, const crush_t& c) {
+ return os << c.crush;
+}
+inline MatchKindCMP compare_to(const crush_t& l, const crush_t& r) {
+ return toMatchKindCMP(l.crush, r.crush);
+}
+
+struct shard_pool_crush_t {
+ bool operator==(const shard_pool_crush_t& x) const {
+ return (shard_pool == x.shard_pool && crush == x.crush);
+ }
+ bool operator!=(const shard_pool_crush_t& x) const { return !(*this == x); }
+
+ template <KeyT KT>
+ static shard_pool_crush_t from_key(const full_key_t<KT>& key);
+
+ shard_pool_t shard_pool;
+ crush_t crush;
+} __attribute__((packed));
+inline std::ostream& operator<<(std::ostream& os, const shard_pool_crush_t& spc) {
+ return os << spc.shard_pool << "," << spc.crush;
+}
+inline MatchKindCMP compare_to(
+ const shard_pool_crush_t& l, const shard_pool_crush_t& r) {
+ auto ret = compare_to(l.shard_pool, r.shard_pool);
+ if (ret != MatchKindCMP::EQ)
+ return ret;
+ return compare_to(l.crush, r.crush);
+}
+
+struct snap_gen_t {
+ bool operator==(const snap_gen_t& x) const {
+ return (snap == x.snap && gen == x.gen);
+ }
+ bool operator!=(const snap_gen_t& x) const { return !(*this == x); }
+
+ template <KeyT KT>
+ static snap_gen_t from_key(const full_key_t<KT>& key);
+
+ snap_t snap;
+ gen_t gen;
+} __attribute__((packed));
+inline std::ostream& operator<<(std::ostream& os, const snap_gen_t& sg) {
+ return os << sg.snap << "," << sg.gen;
+}
+inline MatchKindCMP compare_to(const snap_gen_t& l, const snap_gen_t& r) {
+ auto ret = toMatchKindCMP(l.snap, r.snap);
+ if (ret != MatchKindCMP::EQ)
+ return ret;
+ return toMatchKindCMP(l.gen, r.gen);
+}
+
+/**
+ * string_key_view_t
+ *
+ * The layout to store a char array as an oid or ns string, which may be
+ * compressed.
+ *
+ * If compressed, the physical block only stores an unsigned int of
+ * string_size_t, with value 0 denoting Type::MIN, and value max() denoting
+ * Type::MAX.
+ *
+ * If not compressed (Type::STR), the physical block stores the char array and
+ * a valid string_size_t value.
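+ *
+ * An illustrative Type::STR layout (hypothetical 3-byte string "abc"), read
+ * backwards from the given p_end:
+ *   ... | 'a' 'b' 'c' | string_size_t{3} | <- p_end
+ * so p_length = p_end - sizeof(string_size_t) and p_key = p_length - 3, as in
+ * the constructor below.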
+ */
+struct string_key_view_t {
+ enum class Type {MIN, STR, MAX};
+ // presumably the maximum string length is 2KiB
+ using string_size_t = uint16_t;
+ static constexpr auto MAX = std::numeric_limits<string_size_t>::max();
+ static constexpr auto MIN = string_size_t(0u);
+ static auto is_valid_size(size_t size) {
+ return (size > MIN && size < MAX);
+ }
+
+ string_key_view_t(const char* p_end) {
+ p_length = p_end - sizeof(string_size_t);
+ std::memcpy(&length, p_length, sizeof(string_size_t));
+ if (is_valid_size(length)) {
+ auto _p_key = p_length - length;
+ p_key = static_cast<const char*>(_p_key);
+ } else {
+ assert(length == MAX || length == MIN);
+ p_key = nullptr;
+ }
+ }
+ Type type() const {
+ if (length == MIN) {
+ return Type::MIN;
+ } else if (length == MAX) {
+ return Type::MAX;
+ } else {
+ assert(is_valid_size(length));
+ return Type::STR;
+ }
+ }
+ const char* p_start() const {
+ if (p_key) {
+ return p_key;
+ } else {
+ return p_length;
+ }
+ }
+ const char* p_next_end() const {
+ if (p_key) {
+ return p_start();
+ } else {
+ return p_length + sizeof(string_size_t);
+ }
+ }
+ node_offset_t size() const {
+ size_t ret = length + sizeof(string_size_t);
+ assert(ret < NODE_BLOCK_SIZE);
+ return ret;
+ }
+ node_offset_t size_logical() const {
+ assert(type() == Type::STR);
+ assert(is_valid_size(length));
+ return length;
+ }
+ node_offset_t size_overhead() const {
+ assert(type() == Type::STR);
+ return sizeof(string_size_t);
+ }
+
+ std::string_view to_string_view() const {
+ assert(type() == Type::STR);
+ assert(is_valid_size(length));
+ return {p_key, length};
+ }
+ bool operator==(const string_key_view_t& x) const {
+ if (type() == x.type() && type() != Type::STR)
+ return true;
+ if (type() != x.type())
+ return false;
+ if (length != x.length)
+ return false;
+ return (memcmp(p_key, x.p_key, length) == 0);
+ }
+ bool operator!=(const string_key_view_t& x) const { return !(*this == x); }
+
+ static void append_str(
+ NodeExtentMutable&, std::string_view, char*& p_append);
+
+ static void test_append_str(std::string_view str, char*& p_append) {
+ assert(is_valid_size(str.length()));
+ p_append -= sizeof(string_size_t);
+ string_size_t len = str.length();
+ std::memcpy(p_append, &len, sizeof(string_size_t));
+ p_append -= len;
+ std::memcpy(p_append, str.data(), len);
+ }
+
+ static void append_dedup(
+ NodeExtentMutable&, const Type& dedup_type, char*& p_append);
+
+ static void test_append_dedup(const Type& dedup_type, char*& p_append) {
+ p_append -= sizeof(string_size_t);
+ string_size_t len;
+ if (dedup_type == Type::MIN) {
+ len = MIN;
+ } else if (dedup_type == Type::MAX) {
+ len = MAX;
+ } else {
+ ceph_abort("impossible path");
+ }
+ std::memcpy(p_append, &len, sizeof(string_size_t));
+ }
+
+ const char* p_key;
+ const char* p_length;
+ // TODO: remove if p_length is aligned
+ string_size_t length;
+};
+
+/**
+ * string_view_masked_t
+ *
+ * A common class to hide the underlying string implementation, regardless of
+ * whether it is a string_key_view_t (possibly compressed), a string or
+ * string_view, or a compressed string, so that compare, print, convert and
+ * append operations can be done consistently.
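+ *
+ * For example (illustrative): string_view_masked_t{"abc"} compares EQ to
+ * another "abc" view, GT to string_view_masked_t::min(), and LT to
+ * string_view_masked_t::max(), per compare_to() below.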
+ */
+class string_view_masked_t {
+ public:
+ using string_size_t = string_key_view_t::string_size_t;
+ using Type = string_key_view_t::Type;
+ explicit string_view_masked_t(const string_key_view_t& index)
+ : type{index.type()} {
+ if (type == Type::STR) {
+ view = index.to_string_view();
+ }
+ }
+ explicit string_view_masked_t(std::string_view str)
+ : type{Type::STR}, view{str} {
+ assert(string_key_view_t::is_valid_size(view.size()));
+ }
+
+ Type get_type() const { return type; }
+ std::string_view to_string_view() const {
+ assert(get_type() == Type::STR);
+ return view;
+ }
+ string_size_t size() const {
+ assert(get_type() == Type::STR);
+ assert(string_key_view_t::is_valid_size(view.size()));
+ return view.size();
+ }
+ bool operator==(const string_view_masked_t& x) const {
+ if (get_type() == x.get_type() && get_type() != Type::STR)
+ return true;
+ if (get_type() != x.get_type())
+ return false;
+ if (size() != x.size())
+ return false;
+ return (memcmp(view.data(), x.view.data(), size()) == 0);
+ }
+ bool operator!=(const string_view_masked_t& x) const { return !(*this == x); }
+ void encode(ceph::bufferlist& bl) const {
+ if (get_type() == Type::MIN) {
+ ceph::encode(string_key_view_t::MIN, bl);
+ } else if (get_type() == Type::MAX) {
+ ceph::encode(string_key_view_t::MAX, bl);
+ } else {
+ ceph::encode(size(), bl);
+ ceph::encode_nohead(view, bl);
+ }
+ }
+ static auto min() { return string_view_masked_t{Type::MIN}; }
+ static auto max() { return string_view_masked_t{Type::MAX}; }
+ static string_view_masked_t decode(
+ std::string& str_storage, ceph::bufferlist::const_iterator& delta) {
+ string_size_t size;
+ ceph::decode(size, delta);
+ if (size == string_key_view_t::MIN) {
+ return min();
+ } else if (size == string_key_view_t::MAX) {
+ return max();
+ } else {
+ ceph::decode_nohead(size, str_storage, delta);
+ return string_view_masked_t(str_storage);
+ }
+ }
+
+ private:
+ explicit string_view_masked_t(Type type)
+ : type{type} {}
+
+ Type type;
+ std::string_view view;
+};
+inline MatchKindCMP compare_to(const string_view_masked_t& l, const string_view_masked_t& r) {
+ using Type = string_view_masked_t::Type;
+ auto l_type = l.get_type();
+ auto r_type = r.get_type();
+ if (l_type == Type::STR && r_type == Type::STR) {
+ assert(l.size() && r.size());
+ return toMatchKindCMP(l.to_string_view(), r.to_string_view());
+ } else if (l_type == r_type) {
+ return MatchKindCMP::EQ;
+ } else if (l_type == Type::MIN || r_type == Type::MAX) {
+ return MatchKindCMP::LT;
+ } else { // l_type == Type::MAX || r_type == Type::MIN
+ return MatchKindCMP::GT;
+ }
+}
+inline MatchKindCMP compare_to(std::string_view l, const string_view_masked_t& r) {
+ using Type = string_view_masked_t::Type;
+ assert(l.length());
+ auto r_type = r.get_type();
+ if (r_type == Type::MIN) {
+ return MatchKindCMP::GT;
+ } else if (r_type == Type::MAX) {
+ return MatchKindCMP::LT;
+ } else { // r_type == Type::STR
+ assert(r.size());
+ return toMatchKindCMP(l, r.to_string_view());
+ }
+}
+inline MatchKindCMP compare_to(const string_view_masked_t& l, std::string_view r) {
+ return reverse(compare_to(r, l));
+}
+inline std::ostream& operator<<(std::ostream& os, const string_view_masked_t& masked) {
+ using Type = string_view_masked_t::Type;
+ auto type = masked.get_type();
+ if (type == Type::MIN) {
+ return os << "MIN";
+ } else if (type == Type::MAX) {
+ return os << "MAX";
+ } else { // type == Type::STR
+ auto view = masked.to_string_view();
+ if (view.length() <= 12) {
+ os << "\"" << view << "\"";
+ } else {
+ os << "\"" << std::string_view(view.data(), 4) << ".."
+ << std::string_view(view.data() + view.length() - 2, 2)
+ << "/" << view.length() << "B\"";
+ }
+ return os;
+ }
+}
+
+struct ns_oid_view_t {
+ using string_size_t = string_key_view_t::string_size_t;
+ using Type = string_key_view_t::Type;
+
+ ns_oid_view_t(const char* p_end) : nspace(p_end), oid(nspace.p_next_end()) {}
+ Type type() const { return oid.type(); }
+ const char* p_start() const { return oid.p_start(); }
+ node_offset_t size() const {
+ if (type() == Type::STR) {
+ size_t ret = nspace.size() + oid.size();
+ assert(ret < NODE_BLOCK_SIZE);
+ return ret;
+ } else {
+ return sizeof(string_size_t);
+ }
+ }
+ node_offset_t size_logical() const {
+ assert(type() == Type::STR);
+ return nspace.size_logical() + oid.size_logical();
+ }
+ node_offset_t size_overhead() const {
+ assert(type() == Type::STR);
+ return nspace.size_overhead() + oid.size_overhead();
+ }
+ bool operator==(const ns_oid_view_t& x) const {
+ return (string_view_masked_t{nspace} == string_view_masked_t{x.nspace} &&
+ string_view_masked_t{oid} == string_view_masked_t{x.oid});
+ }
+ bool operator!=(const ns_oid_view_t& x) const { return !(*this == x); }
+
+ template <KeyT KT>
+ static node_offset_t estimate_size(const full_key_t<KT>& key);
+
+ template <KeyT KT>
+ static void append(NodeExtentMutable&,
+ const full_key_t<KT>& key,
+ char*& p_append);
+
+ static void append(NodeExtentMutable& mut,
+ const ns_oid_view_t& view,
+ char*& p_append) {
+ if (view.type() == Type::STR) {
+ string_key_view_t::append_str(mut, view.nspace.to_string_view(), p_append);
+ string_key_view_t::append_str(mut, view.oid.to_string_view(), p_append);
+ } else {
+ string_key_view_t::append_dedup(mut, view.type(), p_append);
+ }
+ }
+
+ template <KeyT KT>
+ static void test_append(const full_key_t<KT>& key, char*& p_append);
+
+ string_key_view_t nspace;
+ string_key_view_t oid;
+};
+inline std::ostream& operator<<(std::ostream& os, const ns_oid_view_t& ns_oid) {
+ return os << string_view_masked_t{ns_oid.nspace} << ","
+ << string_view_masked_t{ns_oid.oid};
+}
+inline MatchKindCMP compare_to(const ns_oid_view_t& l, const ns_oid_view_t& r) {
+ auto ret = compare_to(string_view_masked_t{l.nspace},
+ string_view_masked_t{r.nspace});
+ if (ret != MatchKindCMP::EQ)
+ return ret;
+ return compare_to(string_view_masked_t{l.oid},
+ string_view_masked_t{r.oid});
+}
+
+/**
+ * key_hobj_t
+ *
+ * A specialized implementation of a full_key_t storing a ghobject_t passed
+ * from the user.
+ */
+class key_hobj_t {
+ public:
+ explicit key_hobj_t(const ghobject_t& ghobj) : ghobj{ghobj} {}
+ /*
+ * common interfaces as a full_key_t
+ */
+ shard_t shard() const {
+ return ghobj.shard_id;
+ }
+ pool_t pool() const {
+ return ghobj.hobj.pool;
+ }
+ crush_hash_t crush() const {
+ return ghobj.hobj.get_hash();
+ }
+ std::string_view nspace() const {
+ // TODO(cross-node string dedup)
+ return ghobj.hobj.nspace;
+ }
+ string_view_masked_t nspace_masked() const {
+ // TODO(cross-node string dedup)
+ return string_view_masked_t{nspace()};
+ }
+ std::string_view oid() const {
+ // TODO(cross-node string dedup)
+ return ghobj.hobj.oid.name;
+ }
+ string_view_masked_t oid_masked() const {
+ // TODO(cross-node string dedup)
+ return string_view_masked_t{oid()};
+ }
+ ns_oid_view_t::Type dedup_type() const {
+ return _dedup_type;
+ }
+ snap_t snap() const {
+ return ghobj.hobj.snap;
+ }
+ gen_t gen() const {
+ return ghobj.generation;
+ }
+
+ bool operator==(const full_key_t<KeyT::VIEW>& o) const;
+ bool operator==(const full_key_t<KeyT::HOBJ>& o) const;
+ bool operator!=(const full_key_t<KeyT::VIEW>& o) const {
+ return !operator==(o);
+ }
+ bool operator!=(const full_key_t<KeyT::HOBJ>& o) const {
+ return !operator==(o);
+ }
+
+ std::ostream& dump(std::ostream& os) const {
+ os << "key_hobj(" << (unsigned)shard() << ","
+ << pool() << "," << crush() << "; "
+ << string_view_masked_t{nspace()} << ","
+ << string_view_masked_t{oid()} << "; "
+ << snap() << "," << gen() << ")";
+ return os;
+ }
+
+ static key_hobj_t decode(ceph::bufferlist::const_iterator& delta) {
+ shard_t shard;
+ ceph::decode(shard, delta);
+ pool_t pool;
+ ceph::decode(pool, delta);
+ crush_hash_t crush;
+ ceph::decode(crush, delta);
+ std::string nspace;
+ auto nspace_masked = string_view_masked_t::decode(nspace, delta);
+ // TODO(cross-node string dedup)
+ assert(nspace_masked.get_type() == string_view_masked_t::Type::STR);
+ std::string oid;
+ auto oid_masked = string_view_masked_t::decode(oid, delta);
+ // TODO(cross-node string dedup)
+ assert(oid_masked.get_type() == string_view_masked_t::Type::STR);
+ snap_t snap;
+ ceph::decode(snap, delta);
+ gen_t gen;
+ ceph::decode(gen, delta);
+ return key_hobj_t(ghobject_t(
+ shard_id_t(shard), pool, crush, nspace, oid, snap, gen));
+ }
+
+ private:
+ ns_oid_view_t::Type _dedup_type = ns_oid_view_t::Type::STR;
+ ghobject_t ghobj;
+};
+inline std::ostream& operator<<(std::ostream& os, const key_hobj_t& key) {
+ return key.dump(os);
+}
+
+/**
+ * key_view_t
+ *
+ * A specialized implementation of a full_key_t pointing to the locations
+ * storing the full key in a tree node.
+ */
+class key_view_t {
+ public:
+ /**
+ * common interfaces as a full_key_t
+ */
+ shard_t shard() const {
+ return shard_pool_packed().shard;
+ }
+ pool_t pool() const {
+ return shard_pool_packed().pool;
+ }
+ crush_hash_t crush() const {
+ return crush_packed().crush;
+ }
+ std::string_view nspace() const {
+ // TODO(cross-node string dedup)
+ return ns_oid_view().nspace.to_string_view();
+ }
+ string_view_masked_t nspace_masked() const {
+ // TODO(cross-node string dedup)
+ return string_view_masked_t{ns_oid_view().nspace};
+ }
+ std::string_view oid() const {
+ // TODO(cross-node string dedup)
+ return ns_oid_view().oid.to_string_view();
+ }
+ string_view_masked_t oid_masked() const {
+ // TODO(cross-node string dedup)
+ return string_view_masked_t{ns_oid_view().oid};
+ }
+ ns_oid_view_t::Type dedup_type() const {
+ return ns_oid_view().type();
+ }
+ snap_t snap() const {
+ return snap_gen_packed().snap;
+ }
+ gen_t gen() const {
+ return snap_gen_packed().gen;
+ }
+
+ bool operator==(const full_key_t<KeyT::VIEW>& o) const;
+ bool operator==(const full_key_t<KeyT::HOBJ>& o) const;
+ bool operator!=(const full_key_t<KeyT::VIEW>& o) const {
+ return !operator==(o);
+ }
+ bool operator!=(const full_key_t<KeyT::HOBJ>& o) const {
+ return !operator==(o);
+ }
+
+ /**
+ * key_view_t specific interfaces
+ */
+ bool has_shard_pool() const {
+ return p_shard_pool != nullptr;
+ }
+ bool has_crush() const {
+ return p_crush != nullptr;
+ }
+ bool has_ns_oid() const {
+ return p_ns_oid.has_value();
+ }
+ bool has_snap_gen() const {
+ return p_snap_gen != nullptr;
+ }
+
+ const shard_pool_t& shard_pool_packed() const {
+ assert(has_shard_pool());
+ return *p_shard_pool;
+ }
+ const crush_t& crush_packed() const {
+ assert(has_crush());
+ return *p_crush;
+ }
+ const ns_oid_view_t& ns_oid_view() const {
+ assert(has_ns_oid());
+ return *p_ns_oid;
+ }
+ const snap_gen_t& snap_gen_packed() const {
+ assert(has_snap_gen());
+ return *p_snap_gen;
+ }
+
+ size_t size_logical() const {
+ return sizeof(shard_t) + sizeof(pool_t) + sizeof(crush_hash_t) +
+ sizeof(snap_t) + sizeof(gen_t) + ns_oid_view().size_logical();
+ }
+
+ ghobject_t to_ghobj() const {
+ return ghobject_t(
+ shard_id_t(shard()), pool(), crush(),
+ std::string(nspace()), std::string(oid()), snap(), gen());
+ }
+
+ void replace(const crush_t& key) { p_crush = &key; }
+ void set(const crush_t& key) {
+ assert(!has_crush());
+ replace(key);
+ }
+ void replace(const shard_pool_crush_t& key) { p_shard_pool = &key.shard_pool; }
+ void set(const shard_pool_crush_t& key) {
+ set(key.crush);
+ assert(!has_shard_pool());
+ replace(key);
+ }
+ void replace(const ns_oid_view_t& key) { p_ns_oid = key; }
+ void set(const ns_oid_view_t& key) {
+ assert(!has_ns_oid());
+ replace(key);
+ }
+ void replace(const snap_gen_t& key) { p_snap_gen = &key; }
+ void set(const snap_gen_t& key) {
+ assert(!has_snap_gen());
+ replace(key);
+ }
+
+ std::ostream& dump(std::ostream& os) const {
+ os << "key_view(";
+ if (has_shard_pool()) {
+ os << (unsigned)shard() << "," << pool() << ",";
+ } else {
+ os << "X,X,";
+ }
+ if (has_crush()) {
+ os << crush() << "; ";
+ } else {
+ os << "X; ";
+ }
+ if (has_ns_oid()) {
+ os << ns_oid_view() << "; ";
+ } else {
+ os << "X,X; ";
+ }
+ if (has_snap_gen()) {
+ os << snap() << "," << gen() << ")";
+ } else {
+ os << "X,X)";
+ }
+ return os;
+ }
+
+ private:
+ const shard_pool_t* p_shard_pool = nullptr;
+ const crush_t* p_crush = nullptr;
+ std::optional<ns_oid_view_t> p_ns_oid;
+ const snap_gen_t* p_snap_gen = nullptr;
+};
+
+template <KeyT KT>
+void encode_key(const full_key_t<KT>& key, ceph::bufferlist& bl) {
+ ceph::encode(key.shard(), bl);
+ ceph::encode(key.pool(), bl);
+ ceph::encode(key.crush(), bl);
+ key.nspace_masked().encode(bl);
+ key.oid_masked().encode(bl);
+ ceph::encode(key.snap(), bl);
+ ceph::encode(key.gen(), bl);
+}
+
+inline MatchKindCMP compare_to(std::string_view l, std::string_view r) {
+ return toMatchKindCMP(l, r);
+}
+template <KeyT TypeL, KeyT TypeR>
+bool compare_full_key(const full_key_t<TypeL>& l, const full_key_t<TypeR>& r) {
+ if (l.shard() != r.shard())
+ return false;
+ if (l.pool() != r.pool())
+ return false;
+ if (l.crush() != r.crush())
+ return false;
+ if (compare_to(l.nspace(), r.nspace()) != MatchKindCMP::EQ)
+ return false;
+ if (compare_to(l.oid(), r.oid()) != MatchKindCMP::EQ)
+ return false;
+ if (l.snap() != r.snap())
+ return false;
+ if (l.gen() != r.gen())
+ return false;
+ return true;
+}
+
+inline bool key_hobj_t::operator==(const full_key_t<KeyT::VIEW>& o) const {
+ return compare_full_key<KeyT::HOBJ, KeyT::VIEW>(*this, o);
+}
+inline bool key_hobj_t::operator==(const full_key_t<KeyT::HOBJ>& o) const {
+ return compare_full_key<KeyT::HOBJ, KeyT::HOBJ>(*this, o);
+}
+inline bool key_view_t::operator==(const full_key_t<KeyT::VIEW>& o) const {
+ return compare_full_key<KeyT::VIEW, KeyT::VIEW>(*this, o);
+}
+inline bool key_view_t::operator==(const full_key_t<KeyT::HOBJ>& o) const {
+ return compare_full_key<KeyT::VIEW, KeyT::HOBJ>(*this, o);
+}
+
+inline std::ostream& operator<<(std::ostream& os, const key_view_t& key) {
+ return key.dump(os);
+}
+
+template <KeyT Type>
+MatchKindCMP compare_to(const full_key_t<Type>& key, const shard_pool_t& target) {
+ auto ret = toMatchKindCMP(key.shard(), target.shard);
+ if (ret != MatchKindCMP::EQ)
+ return ret;
+ return toMatchKindCMP(key.pool(), target.pool);
+}
+
+template <KeyT Type>
+MatchKindCMP compare_to(const full_key_t<Type>& key, const crush_t& target) {
+ return toMatchKindCMP(key.crush(), target.crush);
+}
+
+template <KeyT Type>
+MatchKindCMP compare_to(const full_key_t<Type>& key, const shard_pool_crush_t& target) {
+ auto ret = compare_to<Type>(key, target.shard_pool);
+ if (ret != MatchKindCMP::EQ)
+ return ret;
+ return compare_to<Type>(key, target.crush);
+}
+
+template <KeyT Type>
+MatchKindCMP compare_to(const full_key_t<Type>& key, const ns_oid_view_t& target) {
+ auto ret = compare_to(key.nspace(), string_view_masked_t{target.nspace});
+ if (ret != MatchKindCMP::EQ)
+ return ret;
+ return compare_to(key.oid(), string_view_masked_t{target.oid});
+}
+
+template <KeyT Type>
+MatchKindCMP compare_to(const full_key_t<Type>& key, const snap_gen_t& target) {
+ auto ret = toMatchKindCMP(key.snap(), target.snap);
+ if (ret != MatchKindCMP::EQ)
+ return ret;
+ return toMatchKindCMP(key.gen(), target.gen);
+}
+
+template <KeyT KT>
+shard_pool_t shard_pool_t::from_key(const full_key_t<KT>& key) {
+ if constexpr (KT == KeyT::VIEW) {
+ return key.shard_pool_packed();
+ } else {
+ return {key.shard(), key.pool()};
+ }
+}
+
+template <KeyT KT>
+crush_t crush_t::from_key(const full_key_t<KT>& key) {
+ if constexpr (KT == KeyT::VIEW) {
+ return key.crush_packed();
+ } else {
+ return {key.crush()};
+ }
+}
+
+template <KeyT KT>
+shard_pool_crush_t shard_pool_crush_t::from_key(const full_key_t<KT>& key) {
+ return {shard_pool_t::from_key<KT>(key), crush_t::from_key<KT>(key)};
+}
+
+template <KeyT KT>
+snap_gen_t snap_gen_t::from_key(const full_key_t<KT>& key) {
+ if constexpr (KT == KeyT::VIEW) {
+ return key.snap_gen_packed();
+ } else {
+ return {key.snap(), key.gen()};
+ }
+}
+
+template <KeyT KT>
+node_offset_t ns_oid_view_t::estimate_size(const full_key_t<KT>& key) {
+ if constexpr (KT == KeyT::VIEW) {
+ return key.ns_oid_view().size();
+ } else {
+ if (key.dedup_type() != Type::STR) {
+ // size after deduplication
+ return sizeof(string_size_t);
+ } else {
+ return 2 * sizeof(string_size_t) + key.nspace().size() + key.oid().size();
+ }
+ }
+}
+
+template <KeyT KT>
+void ns_oid_view_t::append(
+ NodeExtentMutable& mut, const full_key_t<KT>& key, char*& p_append) {
+ if (key.dedup_type() == Type::STR) {
+ string_key_view_t::append_str(mut, key.nspace(), p_append);
+ string_key_view_t::append_str(mut, key.oid(), p_append);
+ } else {
+ string_key_view_t::append_dedup(mut, key.dedup_type(), p_append);
+ }
+}
+
+template <KeyT KT>
+void ns_oid_view_t::test_append(const full_key_t<KT>& key, char*& p_append) {
+ if (key.dedup_type() == Type::STR) {
+ string_key_view_t::test_append_str(key.nspace(), p_append);
+ string_key_view_t::test_append_str(key.oid(), p_append);
+ } else {
+ string_key_view_t::test_append_dedup(key.dedup_type(), p_append);
+ }
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc
new file mode 100644
index 000000000..4a5988185
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc
@@ -0,0 +1,318 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "node_stage.h"
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h"
+#include "node_stage_layout.h"
+
+namespace crimson::os::seastore::onode {
+
+#define NODE_T node_extent_t<FieldType, NODE_TYPE>
+#define NODE_INST(FT, NT) node_extent_t<FT, NT>
+
+template <typename FieldType, node_type_t NODE_TYPE>
+const char* NODE_T::p_left_bound() const {
+ if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) {
+ // N3 internal node doesn't have the right part
+ return nullptr;
+ } else {
+ auto ret = p_start() + fields().get_item_end_offset(keys());
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ if (is_level_tail()) {
+ ret -= sizeof(laddr_t);
+ }
+ }
+ return ret;
+ }
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+node_offset_t NODE_T::size_to_nxt_at(index_t index) const {
+ assert(index < keys());
+ if constexpr (FIELD_TYPE == field_type_t::N0 ||
+ FIELD_TYPE == field_type_t::N1) {
+ return FieldType::estimate_insert_one();
+ } else if constexpr (FIELD_TYPE == field_type_t::N2) {
+ auto p_end = p_start() + p_fields->get_item_end_offset(index);
+ return FieldType::estimate_insert_one() + ns_oid_view_t(p_end).size();
+ } else {
+ ceph_abort("N3 node is not nested");
+ }
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+memory_range_t NODE_T::get_nxt_container(index_t index) const {
+ if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) {
+ ceph_abort("N3 internal node doesn't have the right part");
+ } else {
+ node_offset_t item_start_offset = p_fields->get_item_start_offset(index);
+ node_offset_t item_end_offset = p_fields->get_item_end_offset(index);
+ assert(item_start_offset < item_end_offset);
+ auto item_p_start = p_start() + item_start_offset;
+ auto item_p_end = p_start() + item_end_offset;
+ if constexpr (FIELD_TYPE == field_type_t::N2) {
+ // range for sub_items_t<NODE_TYPE>
+ item_p_end = ns_oid_view_t(item_p_end).p_start();
+ assert(item_p_start < item_p_end);
+ } else {
+ // range for item_iterator_t<NODE_TYPE>
+ }
+ return {item_p_start, item_p_end};
+ }
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+void NODE_T::bootstrap_extent(
+ NodeExtentMutable& mut,
+ field_type_t field_type, node_type_t node_type,
+ bool is_level_tail, level_t level) {
+ node_header_t::bootstrap_extent(
+ mut, field_type, node_type, is_level_tail, level);
+ mut.copy_in_relative(
+ sizeof(node_header_t), typename FieldType::num_keys_t(0u));
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+void NODE_T::update_is_level_tail(
+ NodeExtentMutable& mut, const node_extent_t& extent, bool value) {
+ node_header_t::update_is_level_tail(mut, extent.p_fields->header, value);
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+memory_range_t NODE_T::insert_prefix_at(
+ NodeExtentMutable& mut, const node_extent_t& node, const full_key_t<KT>& key,
+ index_t index, node_offset_t size, const char* p_left_bound) {
+ if constexpr (FIELD_TYPE == field_type_t::N0 ||
+ FIELD_TYPE == field_type_t::N1) {
+ assert(index <= node.keys());
+ assert(p_left_bound == node.p_left_bound());
+ assert(size > FieldType::estimate_insert_one());
+ auto size_right = size - FieldType::estimate_insert_one();
+ const char* p_insert = node.p_start() + node.fields().get_item_end_offset(index);
+ const char* p_insert_front = p_insert - size_right;
+ FieldType::template insert_at<KT>(mut, key, node.fields(), index, size_right);
+ mut.shift_absolute(p_left_bound,
+ p_insert - p_left_bound,
+ -(int)size_right);
+ return {p_insert_front, p_insert};
+ } else if constexpr (FIELD_TYPE == field_type_t::N2) {
+ ceph_abort("not implemented");
+ } else {
+ ceph_abort("impossible");
+ }
+}
+#define IPA_TEMPLATE(FT, NT, KT) \
+ template memory_range_t NODE_INST(FT, NT)::insert_prefix_at<KT>( \
+ NodeExtentMutable&, const node_extent_t&, const full_key_t<KT>&, \
+ index_t, node_offset_t, const char*)
+IPA_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::VIEW);
+IPA_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::VIEW);
+IPA_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::VIEW);
+IPA_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::VIEW);
+IPA_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::VIEW);
+IPA_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::VIEW);
+IPA_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::HOBJ);
+IPA_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::HOBJ);
+IPA_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::HOBJ);
+IPA_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::HOBJ);
+IPA_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::HOBJ);
+IPA_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::HOBJ);
+
+template <typename FieldType, node_type_t NODE_TYPE>
+void NODE_T::update_size_at(
+ NodeExtentMutable& mut, const node_extent_t& node, index_t index, int change) {
+ assert(index < node.keys());
+ FieldType::update_size_at(mut, node.fields(), index, change);
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+node_offset_t NODE_T::trim_until(
+ NodeExtentMutable& mut, const node_extent_t& node, index_t index) {
+ assert(!node.is_level_tail());
+ auto keys = node.keys();
+ assert(index <= keys);
+ if (index == keys) {
+ return 0;
+ }
+ if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) {
+ ceph_abort("not implemented");
+ } else {
+ mut.copy_in_absolute(
+ (void*)&node.p_fields->num_keys, num_keys_t(index));
+ }
+ // no need to calculate trim size for node
+ return 0;
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+node_offset_t NODE_T::trim_at(
+ NodeExtentMutable& mut, const node_extent_t& node,
+ index_t index, node_offset_t trimmed) {
+ assert(!node.is_level_tail());
+ assert(index < node.keys());
+ if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) {
+ ceph_abort("not implemented");
+ } else {
+ node_offset_t offset = node.p_fields->get_item_start_offset(index);
+ size_t new_offset = offset + trimmed;
+ assert(new_offset < node.p_fields->get_item_end_offset(index));
+ mut.copy_in_absolute(const_cast<void*>(node.p_fields->p_offset(index)),
+ node_offset_t(new_offset));
+ mut.copy_in_absolute(
+ (void*)&node.p_fields->num_keys, num_keys_t(index + 1));
+ }
+ // no need to calculate trim size for node
+ return 0;
+}
+
+#define NODE_TEMPLATE(FT, NT) template class NODE_INST(FT, NT)
+NODE_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL);
+NODE_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL);
+NODE_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL);
+NODE_TEMPLATE(internal_fields_3_t, node_type_t::INTERNAL);
+NODE_TEMPLATE(node_fields_0_t, node_type_t::LEAF);
+NODE_TEMPLATE(node_fields_1_t, node_type_t::LEAF);
+NODE_TEMPLATE(node_fields_2_t, node_type_t::LEAF);
+NODE_TEMPLATE(leaf_fields_3_t, node_type_t::LEAF);
+
+#define APPEND_T node_extent_t<FieldType, NODE_TYPE>::Appender<KT>
+
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+void APPEND_T::append(const node_extent_t& src, index_t from, index_t items) {
+ assert(from <= src.keys());
+ if (p_src == nullptr) {
+ p_src = &src;
+ } else {
+ assert(p_src == &src);
+ }
+ if (items == 0) {
+ return;
+ }
+ assert(from < src.keys());
+ assert(from + items <= src.keys());
+ num_keys += items;
+ if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) {
+ ceph_abort("impossible path");
+ } else {
+ // append left part forwards
+ node_offset_t offset_left_start = src.fields().get_key_start_offset(from);
+ node_offset_t offset_left_end = src.fields().get_key_start_offset(from + items);
+ node_offset_t left_size = offset_left_end - offset_left_start;
+ if (num_keys == 0) {
+ // no need to adjust offset
+ assert(from == 0);
+ assert(p_start + offset_left_start == p_append_left);
+ p_mut->copy_in_absolute(p_append_left,
+ src.p_start() + offset_left_start, left_size);
+ } else {
+ node_offset_t step_size = FieldType::estimate_insert_one();
+ node_offset_t offset_base = src.fields().get_item_end_offset(from);
+ int offset_change = p_append_right - p_start - offset_base;
+ auto p_offset_dst = p_append_left;
+ if constexpr (FIELD_TYPE != field_type_t::N2) {
+ // copy keys
+ p_mut->copy_in_absolute(p_append_left,
+ src.p_start() + offset_left_start, left_size);
+ // point to offset for update
+ p_offset_dst += sizeof(typename FieldType::key_t);
+ }
+ for (auto i = from; i < from + items; ++i) {
+ p_mut->copy_in_absolute(p_offset_dst,
+ node_offset_t(src.fields().get_item_start_offset(i) + offset_change));
+ p_offset_dst += step_size;
+ }
+ assert(p_append_left + left_size + sizeof(typename FieldType::key_t) ==
+ p_offset_dst);
+ }
+ p_append_left += left_size;
+
+ // append right part backwards
+ node_offset_t offset_right_start = src.fields().get_item_end_offset(from + items);
+ node_offset_t offset_right_end = src.fields().get_item_end_offset(from);
+ node_offset_t right_size = offset_right_end - offset_right_start;
+ p_append_right -= right_size;
+ p_mut->copy_in_absolute(p_append_right,
+ src.p_start() + offset_right_start, right_size);
+ }
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+void APPEND_T::append(
+ const full_key_t<KT>& key, const value_t& value, const value_t*& p_value) {
+ if constexpr (FIELD_TYPE == field_type_t::N3) {
+ ceph_abort("not implemented");
+ } else {
+ ceph_abort("should not happen");
+ }
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+std::tuple<NodeExtentMutable*, char*>
+APPEND_T::open_nxt(const key_get_type& partial_key) {
+ if constexpr (FIELD_TYPE == field_type_t::N0 ||
+ FIELD_TYPE == field_type_t::N1) {
+ FieldType::append_key(*p_mut, partial_key, p_append_left);
+ } else if constexpr (FIELD_TYPE == field_type_t::N2) {
+ FieldType::append_key(*p_mut, partial_key, p_append_right);
+ } else {
+ ceph_abort("impossible path");
+ }
+ return {p_mut, p_append_right};
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+std::tuple<NodeExtentMutable*, char*>
+APPEND_T::open_nxt(const full_key_t<KT>& key) {
+ if constexpr (FIELD_TYPE == field_type_t::N0 ||
+ FIELD_TYPE == field_type_t::N1) {
+ FieldType::template append_key<KT>(*p_mut, key, p_append_left);
+ } else if constexpr (FIELD_TYPE == field_type_t::N2) {
+ FieldType::template append_key<KT>(*p_mut, key, p_append_right);
+ } else {
+ ceph_abort("impossible path");
+ }
+ return {p_mut, p_append_right};
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+char* APPEND_T::wrap() {
+ assert(p_append_left <= p_append_right);
+ assert(p_src);
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ if (p_src->is_level_tail()) {
+ laddr_t tail_value = p_src->get_end_p_laddr()->value;
+ p_append_right -= sizeof(laddr_t);
+ assert(p_append_left <= p_append_right);
+ p_mut->copy_in_absolute(p_append_right, tail_value);
+ }
+ }
+ p_mut->copy_in_absolute(p_start + offsetof(FieldType, num_keys), num_keys);
+ return p_append_left;
+}
+
+#define APPEND_TEMPLATE(FT, NT, KT) template class node_extent_t<FT, NT>::Appender<KT>
+APPEND_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::VIEW);
+APPEND_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::VIEW);
+APPEND_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::VIEW);
+APPEND_TEMPLATE(internal_fields_3_t, node_type_t::INTERNAL, KeyT::VIEW);
+APPEND_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::VIEW);
+APPEND_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::VIEW);
+APPEND_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::VIEW);
+APPEND_TEMPLATE(leaf_fields_3_t, node_type_t::LEAF, KeyT::VIEW);
+APPEND_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::HOBJ);
+APPEND_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::HOBJ);
+APPEND_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::HOBJ);
+APPEND_TEMPLATE(internal_fields_3_t, node_type_t::INTERNAL, KeyT::HOBJ);
+APPEND_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::HOBJ);
+APPEND_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::HOBJ);
+APPEND_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::HOBJ);
+APPEND_TEMPLATE(leaf_fields_3_t, node_type_t::LEAF, KeyT::HOBJ);
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h
new file mode 100644
index 000000000..cf0ca463c
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h
@@ -0,0 +1,226 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h"
+#include "key_layout.h"
+#include "stage_types.h"
+
+namespace crimson::os::seastore::onode {
+
+class NodeExtentMutable;
+
+/**
+ * node_extent_t
+ *
+ * The top indexing stage implementation for nodes N0/N1/N2/N3. It implements
+ * the staged contract as an indexable container and provides access to the
+ * node header.
+ *
+ * The specific field layout is defined by FieldType, which is one of
+ * node_fields_0_t, node_fields_1_t, node_fields_2_t, internal_fields_3_t and
+ * leaf_fields_3_t. See node_stage_layout.h for the layout diagrams.
+ */
+template <typename FieldType, node_type_t _NODE_TYPE>
+class node_extent_t {
+ public:
+ using value_t = value_type_t<_NODE_TYPE>;
+ using num_keys_t = typename FieldType::num_keys_t;
+ static constexpr node_type_t NODE_TYPE = _NODE_TYPE;
+ static constexpr field_type_t FIELD_TYPE = FieldType::FIELD_TYPE;
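+  // EXTENT_SIZE below rounds the layout size up to a whole number of disk
+  // blocks. Purely illustrative numbers (not the real constants): a 4100-byte
+  // FieldType::SIZE with a 512-byte DISK_BLOCK_SIZE would round up to 4608.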
+ static constexpr node_offset_t EXTENT_SIZE =
+ (FieldType::SIZE + DISK_BLOCK_SIZE - 1u) / DISK_BLOCK_SIZE * DISK_BLOCK_SIZE;
+
+ // TODO: remove
+ node_extent_t() = default;
+
+ node_extent_t(const FieldType* p_fields) : p_fields{p_fields} {
+ validate(*p_fields);
+ }
+
+ const char* p_start() const { return fields_start(*p_fields); }
+
+ const char* off_to_ptr(node_offset_t off) const {
+ assert(off <= FieldType::SIZE);
+ return p_start() + off;
+ }
+
+ node_offset_t ptr_to_off(const void* ptr) const {
+ auto _ptr = static_cast<const char*>(ptr);
+ assert(_ptr >= p_start());
+ auto off = _ptr - p_start();
+ assert(off <= FieldType::SIZE);
+ return off;
+ }
+
+ bool is_level_tail() const { return p_fields->is_level_tail(); }
+ level_t level() const { return p_fields->header.level; }
+ node_offset_t free_size() const {
+ return p_fields->template free_size_before<NODE_TYPE>(keys());
+ }
+ node_offset_t total_size() const { return p_fields->total_size(); }
+ const char* p_left_bound() const;
+ template <node_type_t T = NODE_TYPE>
+ std::enable_if_t<T == node_type_t::INTERNAL, const laddr_packed_t*>
+ get_end_p_laddr() const {
+ assert(is_level_tail());
+ if constexpr (FIELD_TYPE == field_type_t::N3) {
+ return &p_fields->child_addrs[keys()];
+ } else {
+ auto offset_start = p_fields->get_item_end_offset(keys());
+ assert(offset_start <= FieldType::SIZE);
+ offset_start -= sizeof(laddr_packed_t);
+ auto p_addr = p_start() + offset_start;
+ return reinterpret_cast<const laddr_packed_t*>(p_addr);
+ }
+ }
+
+ // container type system
+ using key_get_type = typename FieldType::key_get_type;
+ static constexpr auto CONTAINER_TYPE = ContainerType::INDEXABLE;
+ index_t keys() const { return p_fields->num_keys; }
+ key_get_type operator[] (index_t index) const { return p_fields->get_key(index); }
+ node_offset_t size_before(index_t index) const {
+ auto free_size = p_fields->template free_size_before<NODE_TYPE>(index);
+ assert(total_size() >= free_size);
+ return total_size() - free_size;
+ }
+ node_offset_t size_to_nxt_at(index_t index) const;
+  node_offset_t size_overhead_at(index_t index) const {
+    return FieldType::ITEM_OVERHEAD;
+  }
+ memory_range_t get_nxt_container(index_t index) const;
+
+ template <typename T = FieldType>
+ std::enable_if_t<T::FIELD_TYPE == field_type_t::N3, const value_t*>
+ get_p_value(index_t index) const {
+ assert(index < keys());
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ return &p_fields->child_addrs[index];
+ } else {
+ auto range = get_nxt_container(index);
+ auto ret = reinterpret_cast<const onode_t*>(range.p_start);
+ assert(range.p_start + ret->size == range.p_end);
+ return ret;
+ }
+ }
+
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const {
+ assert(p_node_start == p_start());
+ // nothing to encode as the container range is the entire extent
+ }
+
+ static node_extent_t decode(const char* p_node_start,
+ ceph::bufferlist::const_iterator& delta) {
+ // nothing to decode
+ return node_extent_t(reinterpret_cast<const FieldType*>(p_node_start));
+ }
+
+ static void validate(const FieldType& fields) {
+#ifndef NDEBUG
+ assert(fields.header.get_node_type() == NODE_TYPE);
+ assert(fields.header.get_field_type() == FieldType::FIELD_TYPE);
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ assert(fields.header.level > 0u);
+ } else {
+ assert(fields.header.level == 0u);
+ }
+#endif
+ }
+
+ static void bootstrap_extent(
+ NodeExtentMutable&, field_type_t, node_type_t, bool, level_t);
+
+ static void update_is_level_tail(NodeExtentMutable&, const node_extent_t&, bool);
+
+ static node_offset_t header_size() { return FieldType::HEADER_SIZE; }
+
+ template <KeyT KT>
+ static node_offset_t estimate_insert(
+ const full_key_t<KT>& key, const value_t& value) {
+ auto size = FieldType::estimate_insert_one();
+ if constexpr (FIELD_TYPE == field_type_t::N2) {
+ size += ns_oid_view_t::estimate_size<KT>(key);
+ } else if constexpr (FIELD_TYPE == field_type_t::N3 &&
+ NODE_TYPE == node_type_t::LEAF) {
+ size += value.size;
+ }
+ return size;
+ }
+
+ template <KeyT KT>
+ static const value_t* insert_at(
+ NodeExtentMutable& mut, const node_extent_t&,
+ const full_key_t<KT>& key, const value_t& value,
+ index_t index, node_offset_t size, const char* p_left_bound) {
+ if constexpr (FIELD_TYPE == field_type_t::N3) {
+ ceph_abort("not implemented");
+ } else {
+ ceph_abort("impossible");
+ }
+ }
+
+ template <KeyT KT>
+ static memory_range_t insert_prefix_at(
+ NodeExtentMutable&, const node_extent_t&,
+ const full_key_t<KT>& key,
+ index_t index, node_offset_t size, const char* p_left_bound);
+
+ static void update_size_at(
+ NodeExtentMutable&, const node_extent_t&, index_t index, int change);
+
+ static node_offset_t trim_until(
+ NodeExtentMutable&, const node_extent_t&, index_t index);
+ static node_offset_t trim_at(NodeExtentMutable&, const node_extent_t&,
+ index_t index, node_offset_t trimmed);
+
+ template <KeyT KT>
+ class Appender;
+
+ private:
+ const FieldType& fields() const { return *p_fields; }
+ const FieldType* p_fields;
+};
+
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+class node_extent_t<FieldType, NODE_TYPE>::Appender {
+ public:
+ Appender(NodeExtentMutable* p_mut, char* p_append)
+ : p_mut{p_mut}, p_start{p_append} {
+#ifndef NDEBUG
+ auto p_fields = reinterpret_cast<const FieldType*>(p_append);
+ assert(*(p_fields->header.get_field_type()) == FIELD_TYPE);
+ assert(p_fields->header.get_node_type() == NODE_TYPE);
+ assert(p_fields->num_keys == 0);
+#endif
+ p_append_left = p_start + FieldType::HEADER_SIZE;
+ p_append_right = p_start + FieldType::SIZE;
+ }
+ void append(const node_extent_t& src, index_t from, index_t items);
+ void append(const full_key_t<KT>&, const value_t&, const value_t*&);
+ char* wrap();
+ std::tuple<NodeExtentMutable*, char*> open_nxt(const key_get_type&);
+ std::tuple<NodeExtentMutable*, char*> open_nxt(const full_key_t<KT>&);
+ void wrap_nxt(char* p_append) {
+ if constexpr (FIELD_TYPE != field_type_t::N3) {
+ assert(p_append < p_append_right);
+ assert(p_append_left < p_append);
+ p_append_right = p_append;
+ FieldType::append_offset(*p_mut, p_append - p_start, p_append_left);
+ ++num_keys;
+ } else {
+ ceph_abort("not implemented");
+ }
+ }
+
+ private:
+ const node_extent_t* p_src = nullptr;
+ NodeExtentMutable* p_mut;
+ char* p_start;
+ char* p_append_left;
+ char* p_append_right;
+ num_keys_t num_keys = 0;
+};
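+
+// A sketch of the append protocol driven by the staged implementation (see
+// e.g. copy_out_until() in stage.h); the local variable names below are
+// illustrative only:
+//
+//   node_extent_t<FT, NT>::Appender<KT> appender(&mut, p_empty_node);
+//   appender.append(src, from, items);      // bulk-copy whole units from src
+//   auto [p_mut, p_append] = appender.open_nxt(key); // start one new unit;
+//                                  // the key goes left (or right for N2)
+//   // ... the next-stage appender writes the unit body at p_append ...
+//   appender.wrap_nxt(p_append);   // record the unit's offset, ++num_keys
+//   char* p_left_end = appender.wrap(); // persist num_keys (and the tail
+//                                  // laddr of a level-tail internal node)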
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc
new file mode 100644
index 000000000..81bfac72a
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc
@@ -0,0 +1,96 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "node_stage_layout.h"
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h"
+
+namespace crimson::os::seastore::onode {
+
+void node_header_t::bootstrap_extent(
+ NodeExtentMutable& mut,
+ field_type_t field_type, node_type_t node_type,
+ bool is_level_tail, level_t level) {
+ node_header_t header;
+ header.set_field_type(field_type);
+ header.set_node_type(node_type);
+ header.set_is_level_tail(is_level_tail);
+ header.level = level;
+ mut.copy_in_relative(0, header);
+}
+
+void node_header_t::update_is_level_tail(
+ NodeExtentMutable& mut, const node_header_t& header, bool value) {
+ auto& _header = const_cast<node_header_t&>(header);
+ _header.set_is_level_tail(value);
+ mut.validate_inplace_update(_header);
+}
+
+#define F013_T _node_fields_013_t<SlotType>
+#define F013_INST(ST) _node_fields_013_t<ST>
+
+template <typename SlotType>
+void F013_T::update_size_at(
+ NodeExtentMutable& mut, const me_t& node, index_t index, int change) {
+ assert(index <= node.num_keys);
+ for (const auto* p_slot = &node.slots[index];
+ p_slot < &node.slots[node.num_keys];
+ ++p_slot) {
+ node_offset_t offset = p_slot->right_offset;
+ mut.copy_in_absolute(
+ (void*)&(p_slot->right_offset),
+ node_offset_t(offset - change));
+ }
+}
+
+template <typename SlotType>
+void F013_T::append_key(
+ NodeExtentMutable& mut, const key_t& key, char*& p_append) {
+ mut.copy_in_absolute(p_append, key);
+ p_append += sizeof(key_t);
+}
+
+template <typename SlotType>
+void F013_T::append_offset(
+ NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append) {
+ mut.copy_in_absolute(p_append, offset_to_right);
+ p_append += sizeof(node_offset_t);
+}
+
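+// insert_at() only maintains the left-part metadata: it subtracts size_right
+// from the right_offset of every slot at [index, num_keys) (the item bytes
+// themselves are not touched here), shifts those slots one SlotType to the
+// right to free slot [index], bumps num_keys, and writes the new key plus the
+// start offset of the yet-to-be-written item into the freed slot.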
+template <typename SlotType>
+template <KeyT KT>
+void F013_T::insert_at(
+ NodeExtentMutable& mut, const full_key_t<KT>& key,
+ const me_t& node, index_t index, node_offset_t size_right) {
+ assert(index <= node.num_keys);
+ update_size_at(mut, node, index, size_right);
+ auto p_insert = const_cast<char*>(fields_start(node)) +
+ node.get_key_start_offset(index);
+ auto p_shift_end = fields_start(node) + node.get_key_start_offset(node.num_keys);
+ mut.shift_absolute(p_insert, p_shift_end - p_insert, estimate_insert_one());
+ mut.copy_in_absolute((void*)&node.num_keys, num_keys_t(node.num_keys + 1));
+ append_key(mut, key_t::template from_key<KT>(key), p_insert);
+ append_offset(mut, node.get_item_end_offset(index) - size_right, p_insert);
+}
+#define IA_TEMPLATE(ST, KT) template void F013_INST(ST):: \
+ insert_at<KT>(NodeExtentMutable&, const full_key_t<KT>&, \
+ const F013_INST(ST)&, index_t, node_offset_t)
+IA_TEMPLATE(slot_0_t, KeyT::VIEW);
+IA_TEMPLATE(slot_1_t, KeyT::VIEW);
+IA_TEMPLATE(slot_3_t, KeyT::VIEW);
+IA_TEMPLATE(slot_0_t, KeyT::HOBJ);
+IA_TEMPLATE(slot_1_t, KeyT::HOBJ);
+IA_TEMPLATE(slot_3_t, KeyT::HOBJ);
+
+#define F013_TEMPLATE(ST) template struct F013_INST(ST)
+F013_TEMPLATE(slot_0_t);
+F013_TEMPLATE(slot_1_t);
+F013_TEMPLATE(slot_3_t);
+
+void node_fields_2_t::append_offset(
+ NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append) {
+ mut.copy_in_absolute(p_append, offset_to_right);
+ p_append += sizeof(node_offset_t);
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h
new file mode 100644
index 000000000..14ba95bf4
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h
@@ -0,0 +1,366 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "key_layout.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h"
+
+namespace crimson::os::seastore::onode {
+
+class NodeExtentMutable;
+
+struct node_header_t {
+ static constexpr unsigned FIELD_TYPE_BITS = 6u;
+ static_assert(static_cast<uint8_t>(field_type_t::_MAX) <= 1u << FIELD_TYPE_BITS);
+ static constexpr unsigned NODE_TYPE_BITS = 1u;
+ static constexpr unsigned B_LEVEL_TAIL_BITS = 1u;
+ using bits_t = uint8_t;
+
+ node_header_t() {}
+ std::optional<field_type_t> get_field_type() const {
+ if (field_type >= FIELD_TYPE_MAGIC &&
+ field_type < static_cast<uint8_t>(field_type_t::_MAX)) {
+ return static_cast<field_type_t>(field_type);
+ } else {
+ return std::nullopt;
+ }
+ }
+ node_type_t get_node_type() const {
+ return static_cast<node_type_t>(node_type);
+ }
+ bool get_is_level_tail() const {
+ return is_level_tail;
+ }
+
+ static void bootstrap_extent(
+ NodeExtentMutable&, field_type_t, node_type_t, bool, level_t);
+
+ static void update_is_level_tail(NodeExtentMutable&, const node_header_t&, bool);
+
+ bits_t field_type : FIELD_TYPE_BITS;
+ bits_t node_type : NODE_TYPE_BITS;
+ bits_t is_level_tail : B_LEVEL_TAIL_BITS;
+ static_assert(sizeof(bits_t) * 8 ==
+ FIELD_TYPE_BITS + NODE_TYPE_BITS + B_LEVEL_TAIL_BITS);
+ level_t level;
+
+ private:
+ void set_field_type(field_type_t type) {
+ field_type = static_cast<uint8_t>(type);
+ }
+ void set_node_type(node_type_t type) {
+ node_type = static_cast<uint8_t>(type);
+ }
+ void set_is_level_tail(bool value) {
+ is_level_tail = static_cast<uint8_t>(value);
+ }
+} __attribute__((packed));
+
+template <typename FixedKeyType, field_type_t _FIELD_TYPE>
+struct _slot_t {
+ using key_t = FixedKeyType;
+ static constexpr field_type_t FIELD_TYPE = _FIELD_TYPE;
+ static constexpr node_offset_t OVERHEAD = sizeof(_slot_t) - sizeof(key_t);
+
+ key_t key;
+ node_offset_t right_offset;
+} __attribute__((packed));
+using slot_0_t = _slot_t<shard_pool_crush_t, field_type_t::N0>;
+using slot_1_t = _slot_t<crush_t, field_type_t::N1>;
+using slot_3_t = _slot_t<snap_gen_t, field_type_t::N3>;
+
+struct node_range_t {
+ node_offset_t start;
+ node_offset_t end;
+};
+
+template <typename FieldType>
+const char* fields_start(const FieldType& node) {
+ return reinterpret_cast<const char*>(&node);
+}
+
+template <node_type_t NODE_TYPE, typename FieldType>
+node_range_t fields_free_range_before(
+ const FieldType& node, index_t index) {
+ assert(index <= node.num_keys);
+ node_offset_t offset_start = node.get_key_start_offset(index);
+ node_offset_t offset_end =
+ (index == 0 ? FieldType::SIZE
+ : node.get_item_start_offset(index - 1));
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ if (node.is_level_tail() && index == node.num_keys) {
+ offset_end -= sizeof(laddr_t);
+ }
+ }
+ assert(offset_start <= offset_end);
+ assert(offset_end - offset_start < FieldType::SIZE);
+ return {offset_start, offset_end};
+}
+
+/**
+ * _node_fields_013_t (node_fields_0_t, node_fields_1_t, leaf_fields_3_t)
+ *
+ * The STAGE_LEFT layout implementation for node N0/N1, or the STAGE_RIGHT
+ * layout implementation for leaf node N3.
+ *
+ * The node layout storing n slots:
+ *
+ * # <----------------------------- node range --------------------------------------> #
+ * # #<~># free space #
+ * # <----- left part -----------------------------> # <~# <----- right slots -------> #
+ * # # <---- left slots -------------> #~> # #
+ * # # slots [2, n) |<~># #<~>| right slots [2, n) #
+ * # # <- slot 0 -> | <- slot 1 -> | # # | <-- s1 --> | <-- s0 --> #
+ * # # | | # # | | #
+ * # | num_ # | right | | right | # # | next-stage | next-stage #
+ * # header | keys # key | offset | key | offset | # # | container | container #
+ * # | # 0 | 0 | 1 | 1 |...#...#...| or onode 1 | or onode 0 #
+ * | | ^ ^
+ * | | | |
+ * | +----------------+ |
+ * +--------------------------------------------+
+ */
+template <typename SlotType>
+struct _node_fields_013_t {
+ // TODO: decide by NODE_BLOCK_SIZE, sizeof(SlotType), sizeof(laddr_t)
+ // and the minimal size of variable_key.
+ using num_keys_t = uint8_t;
+ using key_t = typename SlotType::key_t;
+ using key_get_type = const key_t&;
+ using me_t = _node_fields_013_t<SlotType>;
+ static constexpr field_type_t FIELD_TYPE = SlotType::FIELD_TYPE;
+ static constexpr node_offset_t SIZE = NODE_BLOCK_SIZE;
+ static constexpr node_offset_t HEADER_SIZE =
+ sizeof(node_header_t) + sizeof(num_keys_t);
+ static constexpr node_offset_t ITEM_OVERHEAD = SlotType::OVERHEAD;
+
+ bool is_level_tail() const { return header.get_is_level_tail(); }
+ node_offset_t total_size() const { return SIZE; }
+ key_get_type get_key(index_t index) const {
+ assert(index < num_keys);
+ return slots[index].key;
+ }
+ node_offset_t get_key_start_offset(index_t index) const {
+ assert(index <= num_keys);
+ auto offset = HEADER_SIZE + sizeof(SlotType) * index;
+ assert(offset < SIZE);
+ return offset;
+ }
+ node_offset_t get_item_start_offset(index_t index) const {
+ assert(index < num_keys);
+ auto offset = slots[index].right_offset;
+ assert(offset <= SIZE);
+ return offset;
+ }
+ const void* p_offset(index_t index) const {
+ assert(index < num_keys);
+ return &slots[index].right_offset;
+ }
+ node_offset_t get_item_end_offset(index_t index) const {
+ return index == 0 ? SIZE : get_item_start_offset(index - 1);
+ }
+ template <node_type_t NODE_TYPE>
+ node_offset_t free_size_before(index_t index) const {
+ auto range = fields_free_range_before<NODE_TYPE>(*this, index);
+ return range.end - range.start;
+ }
+
+ static node_offset_t estimate_insert_one() { return sizeof(SlotType); }
+ template <KeyT KT>
+ static void insert_at(
+ NodeExtentMutable&, const full_key_t<KT>& key,
+ const me_t& node, index_t index, node_offset_t size_right);
+ static void update_size_at(
+ NodeExtentMutable&, const me_t& node, index_t index, int change);
+ static void append_key(
+ NodeExtentMutable&, const key_t& key, char*& p_append);
+ template <KeyT KT>
+ static void append_key(
+ NodeExtentMutable& mut, const full_key_t<KT>& key, char*& p_append) {
+ append_key(mut, key_t::template from_key<KT>(key), p_append);
+ }
+ static void append_offset(
+ NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append);
+
+ node_header_t header;
+ num_keys_t num_keys = 0u;
+ SlotType slots[];
+} __attribute__((packed));
+using node_fields_0_t = _node_fields_013_t<slot_0_t>;
+using node_fields_1_t = _node_fields_013_t<slot_1_t>;
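+
+// A minimal lookup sketch over this layout (variable names are illustrative):
+//
+//   const node_fields_0_t& f = ...;           // fields mapped over an extent
+//   auto key   = f.get_key(i);                // slots[i].key, fixed stride
+//   auto begin = f.get_item_start_offset(i);  // slots[i].right_offset
+//   auto end   = f.get_item_end_offset(i);    // SIZE, or slots[i-1].right_offset
+//   // [begin, end) holds item i, packed backwards from the node end; it is
+//   // the next-stage container (or the onode itself for the leaf N3 layout).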
+
+/**
+ * node_fields_2_t
+ *
+ * The STAGE_STRING layout implementation for node N2.
+ *
+ * The node layout storing n slots:
+ *
+ * # <--------------------------------- node range ----------------------------------------> #
+ * # #<~># free space #
+ * # <------- left part ---------------> # <~# <--------- right slots ---------------------> #
+ * # # <---- offsets ----> #~> #<~>| slots [2, n) #
+ * # # offsets [2, n) |<~># # | <----- slot 1 ----> | <----- slot 0 ----> #
+ * # # | # # | | #
+ * # | num_ # offset | offset | # # | next-stage | ns-oid | next-stage | ns-oid #
+ * # header | keys # 0 | 1 |...#...#...| container1 | 1 | container0 | 0 #
+ * | | ^ ^
+ * | | | |
+ * | +----------------+ |
+ * +-----------------------------------------------+
+ */
+struct node_fields_2_t {
+  // TODO: decide by NODE_BLOCK_SIZE, sizeof(node_offset_t), sizeof(laddr_t)
+ // and the minimal size of variable_key.
+ using num_keys_t = uint8_t;
+ using key_t = ns_oid_view_t;
+ using key_get_type = key_t;
+ static constexpr field_type_t FIELD_TYPE = field_type_t::N2;
+ static constexpr node_offset_t SIZE = NODE_BLOCK_SIZE;
+ static constexpr node_offset_t HEADER_SIZE =
+ sizeof(node_header_t) + sizeof(num_keys_t);
+ static constexpr node_offset_t ITEM_OVERHEAD = sizeof(node_offset_t);
+
+ bool is_level_tail() const { return header.get_is_level_tail(); }
+ node_offset_t total_size() const { return SIZE; }
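+  // Unlike the 013 layouts there is no key stored in the left part: get_key()
+  // re-parses the ns-oid view at the end offset of item <index> (that is
+  // offsets[index - 1], or SIZE for the right-most item), so each key only
+  // costs one node_offset_t on the left.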
+ key_get_type get_key(index_t index) const {
+ assert(index < num_keys);
+ node_offset_t item_end_offset =
+ (index == 0 ? SIZE : offsets[index - 1]);
+ assert(item_end_offset <= SIZE);
+ const char* p_start = fields_start(*this);
+ return key_t(p_start + item_end_offset);
+ }
+ node_offset_t get_key_start_offset(index_t index) const {
+ assert(index <= num_keys);
+ auto offset = HEADER_SIZE + sizeof(node_offset_t) * num_keys;
+ assert(offset <= SIZE);
+ return offset;
+ }
+ node_offset_t get_item_start_offset(index_t index) const {
+ assert(index < num_keys);
+ auto offset = offsets[index];
+ assert(offset <= SIZE);
+ return offset;
+ }
+ const void* p_offset(index_t index) const {
+ assert(index < num_keys);
+ return &offsets[index];
+ }
+ node_offset_t get_item_end_offset(index_t index) const {
+ return index == 0 ? SIZE : get_item_start_offset(index - 1);
+ }
+ template <node_type_t NODE_TYPE>
+ node_offset_t free_size_before(index_t index) const {
+ auto range = fields_free_range_before<NODE_TYPE>(*this, index);
+ return range.end - range.start;
+ }
+
+ static node_offset_t estimate_insert_one() { return sizeof(node_offset_t); }
+ template <KeyT KT>
+ static void insert_at(
+ NodeExtentMutable& mut, const full_key_t<KT>& key,
+ const node_fields_2_t& node, index_t index, node_offset_t size_right) {
+ ceph_abort("not implemented");
+ }
+ static void update_size_at(
+ NodeExtentMutable& mut, const node_fields_2_t& node, index_t index, int change) {
+ ceph_abort("not implemented");
+ }
+ static void append_key(
+ NodeExtentMutable& mut, const key_t& key, char*& p_append) {
+ ns_oid_view_t::append(mut, key, p_append);
+ }
+ template <KeyT KT>
+ static void append_key(
+ NodeExtentMutable& mut, const full_key_t<KT>& key, char*& p_append) {
+ ns_oid_view_t::append<KT>(mut, key, p_append);
+ }
+ static void append_offset(
+ NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append);
+
+ node_header_t header;
+ num_keys_t num_keys = 0u;
+ node_offset_t offsets[];
+} __attribute__((packed));
+
+/**
+ * internal_fields_3_t
+ *
+ * The STAGE_RIGHT layout implementation for internal node N3.
+ *
+ * The node layout storing 3 children:
+ *
+ * # <---------------- node range ---------------------------> #
+ * # # <-- keys ---> # <---- laddrs -----------> #
+ * # free space: # |<~># |<~>#
+ * # # | # | #
+ * # | num_ # key | key | # laddr | laddr | laddr | #
+ * # header | keys # 0 | 1 |...# 0 | 1 | 2 |...#
+ */
+// TODO: decide by NODE_BLOCK_SIZE, sizeof(snap_gen_t), sizeof(laddr_t)
+static constexpr unsigned MAX_NUM_KEYS_I3 = 170u;
+template <unsigned MAX_NUM_KEYS>
+struct _internal_fields_3_t {
+ using key_get_type = const snap_gen_t&;
+ using me_t = _internal_fields_3_t<MAX_NUM_KEYS>;
+ // TODO: decide by NODE_BLOCK_SIZE, sizeof(snap_gen_t), sizeof(laddr_t)
+ using num_keys_t = uint8_t;
+ static constexpr field_type_t FIELD_TYPE = field_type_t::N3;
+ static constexpr node_offset_t SIZE = sizeof(me_t);
+ static constexpr node_offset_t HEADER_SIZE =
+ sizeof(node_header_t) + sizeof(num_keys_t);
+ static constexpr node_offset_t ITEM_OVERHEAD = 0u;
+
+ bool is_level_tail() const { return header.get_is_level_tail(); }
+ node_offset_t total_size() const {
+ if (is_level_tail()) {
+ return SIZE - sizeof(snap_gen_t);
+ } else {
+ return SIZE;
+ }
+ }
+ key_get_type get_key(index_t index) const {
+ assert(index < num_keys);
+ return keys[index];
+ }
+ template <node_type_t NODE_TYPE>
+ std::enable_if_t<NODE_TYPE == node_type_t::INTERNAL, node_offset_t>
+ free_size_before(index_t index) const {
+ assert(index <= num_keys);
+ assert(num_keys <= (is_level_tail() ? MAX_NUM_KEYS - 1 : MAX_NUM_KEYS));
+ auto free = (MAX_NUM_KEYS - index) * (sizeof(snap_gen_t) + sizeof(laddr_t));
+ if (is_level_tail() && index == num_keys) {
+ free -= (sizeof(snap_gen_t) + sizeof(laddr_t));
+ }
+ assert(free < SIZE);
+ return free;
+ }
+
+ static node_offset_t estimate_insert_one() {
+ return sizeof(snap_gen_t) + sizeof(laddr_t);
+ }
+ template <KeyT KT>
+ static void insert_at(
+ NodeExtentMutable& mut, const full_key_t<KT>& key,
+ const me_t& node, index_t index, node_offset_t size_right) {
+ ceph_abort("not implemented");
+ }
+ static void update_size_at(
+ NodeExtentMutable& mut, const me_t& node, index_t index, int change) {
+ ceph_abort("not implemented");
+ }
+
+ node_header_t header;
+ num_keys_t num_keys = 0u;
+ snap_gen_t keys[MAX_NUM_KEYS];
+ laddr_packed_t child_addrs[MAX_NUM_KEYS];
+} __attribute__((packed));
+static_assert(_internal_fields_3_t<MAX_NUM_KEYS_I3>::SIZE <= NODE_BLOCK_SIZE &&
+ _internal_fields_3_t<MAX_NUM_KEYS_I3 + 1>::SIZE > NODE_BLOCK_SIZE);
+using internal_fields_3_t = _internal_fields_3_t<MAX_NUM_KEYS_I3>;
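+// Where 170 comes from, under assumed sizes (the static_assert above is the
+// authoritative check): with a 4096-byte NODE_BLOCK_SIZE, a 3-byte header
+// (node_header_t plus num_keys_t), 16-byte snap_gen_t keys and 8-byte
+// laddr_packed_t children, 170 entries need 3 + 170 * 24 = 4083 bytes, while
+// 171 entries would need 4107 bytes and overflow the block.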
+
+using leaf_fields_3_t = _node_fields_013_t<slot_3_t>;
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h
new file mode 100644
index 000000000..cac167a98
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h
@@ -0,0 +1,2186 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cassert>
+#include <optional>
+#include <ostream>
+#include <sstream>
+#include <type_traits>
+
+#include "common/likely.h"
+
+#include "sub_items_stage.h"
+#include "item_iterator_stage.h"
+
+namespace crimson::os::seastore::onode {
+
+struct search_result_bs_t {
+ index_t index;
+ MatchKindBS match;
+};
+template <typename FGetKey>
+search_result_bs_t binary_search(
+ const full_key_t<KeyT::HOBJ>& key,
+ index_t begin, index_t end, FGetKey&& f_get_key) {
+ assert(begin <= end);
+ while (begin < end) {
+ auto total = begin + end;
+ auto mid = total >> 1;
+ // do not copy if return value is reference
+ decltype(f_get_key(mid)) target = f_get_key(mid);
+ auto match = compare_to<KeyT::HOBJ>(key, target);
+ if (match == MatchKindCMP::LT) {
+ end = mid;
+ } else if (match == MatchKindCMP::GT) {
+ begin = mid + 1;
+ } else {
+ return {mid, MatchKindBS::EQ};
+ }
+ }
+  return {begin, MatchKindBS::NE};
+}
+
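+// binary_search_r() searches a non-decreasing f_get over [rend, rbegin] from
+// the right: it returns the largest index whose value does not exceed key, or
+// reports EQ on an exact hit. The indexable seek_split()/seek_split_inserted()
+// below feed it f_get_used_size and target_size to locate the last index whose
+// accumulated size still fits the split target.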
+template <typename PivotType, typename FGet>
+search_result_bs_t binary_search_r(
+ index_t rend, index_t rbegin, FGet&& f_get, const PivotType& key) {
+ assert(rend <= rbegin);
+ while (rend < rbegin) {
+ auto total = rend + rbegin + 1;
+ auto mid = total >> 1;
+ // do not copy if return value is reference
+ decltype(f_get(mid)) target = f_get(mid);
+ int match = target - key;
+ if (match < 0) {
+ rend = mid;
+ } else if (match > 0) {
+ rbegin = mid - 1;
+ } else {
+ return {mid, MatchKindBS::EQ};
+ }
+ }
+ return {rbegin, MatchKindBS::NE};
+}
+
+inline bool matchable(field_type_t type, match_stat_t mstat) {
+ assert(mstat >= MSTAT_MIN && mstat <= MSTAT_MAX);
+ /*
+ * compressed prefix by field type:
+ * N0: NONE
+ * N1: pool/shard
+ * N2: pool/shard crush
+ * N3: pool/shard crush ns/oid
+ *
+ * if key matches the node's compressed prefix, return true
+ * else, return false
+ */
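+  // A hedged worked example (the numeric encodings are assumptions; the
+  // authoritative definitions live in stage_types.h): suppose MSTAT_LT1 == 1
+  // means the key already matches pool/shard and crush but differs at ns-oid,
+  // and to_unsigned(N2) == 2. Then 1 + 2 < 4 and an N2 node (whose compressed
+  // prefix is exactly pool/shard + crush) is still matchable, while
+  // MSTAT_LT2 == 2 (crush differs as well) gives 2 + 2 == 4 and fails.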
+#ifndef NDEBUG
+ if (mstat == MSTAT_END) {
+ assert(type == field_type_t::N0);
+ }
+#endif
+ return mstat + to_unsigned(type) < 4;
+}
+
+inline void assert_mstat(
+ const full_key_t<KeyT::HOBJ>& key,
+ const full_key_t<KeyT::VIEW>& index,
+ match_stat_t mstat) {
+ assert(mstat >= MSTAT_MIN && mstat <= MSTAT_LT2);
+ // key < index ...
+ switch (mstat) {
+ case MSTAT_EQ:
+ break;
+ case MSTAT_LT0:
+ assert(compare_to<KeyT::HOBJ>(key, index.snap_gen_packed()) == MatchKindCMP::LT);
+ break;
+ case MSTAT_LT1:
+ assert(compare_to<KeyT::HOBJ>(key, index.ns_oid_view()) == MatchKindCMP::LT);
+ break;
+ case MSTAT_LT2:
+ if (index.has_shard_pool()) {
+ assert(compare_to<KeyT::HOBJ>(key, shard_pool_crush_t{
+ index.shard_pool_packed(), index.crush_packed()}) == MatchKindCMP::LT);
+ } else {
+ assert(compare_to<KeyT::HOBJ>(key, index.crush_packed()) == MatchKindCMP::LT);
+ }
+ break;
+ default:
+ ceph_abort("impossible path");
+ }
+ // key == index ...
+ switch (mstat) {
+ case MSTAT_EQ:
+ assert(compare_to<KeyT::HOBJ>(key, index.snap_gen_packed()) == MatchKindCMP::EQ);
+ case MSTAT_LT0:
+ if (!index.has_ns_oid())
+ break;
+ assert(index.ns_oid_view().type() == ns_oid_view_t::Type::MAX ||
+ compare_to<KeyT::HOBJ>(key, index.ns_oid_view()) == MatchKindCMP::EQ);
+ case MSTAT_LT1:
+ if (!index.has_crush())
+ break;
+ assert(compare_to<KeyT::HOBJ>(key, index.crush_packed()) == MatchKindCMP::EQ);
+ if (!index.has_shard_pool())
+ break;
+ assert(compare_to<KeyT::HOBJ>(key, index.shard_pool_packed()) == MatchKindCMP::EQ);
+ default:
+ break;
+ }
+}
+
+#define NXT_STAGE_T staged<next_param_t>
+
+enum class TrimType { BEFORE, AFTER, AT };
+
+/**
+ * staged
+ *
+ * Implements the recursive logic that reads or modifies a node layout
+ * (N0/N1/N2/N3 * LEAF/INTERNAL) with the multi-stage design. The specific
+ * stage implementation is flexible, so implementations of different stages
+ * can be assembled independently, as long as they follow the container
+ * interface definitions.
+ *
+ * Multi-stage is designed to index different portions of onode keys
+ * stage-by-stage. There are at most 3 stages for a node:
+ * - STAGE_LEFT: indexes shard-pool-crush for N0 nodes, or crush for N1 nodes;
+ * - STAGE_STRING: indexes ns-oid for N0/N1/N2 nodes;
+ * - STAGE_RIGHT: indexes snap-gen for N0/N1/N2/N3 nodes;
+ *
+ * The intention is to consolidate the high-level indexing implementations at
+ * the level of stage, so we don't need to write them repeatedly for every
+ * stage and for every node type.
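+ *
+ * As a rough illustration (the concrete container types for each node format
+ * are wired up by the layout Params, not here): an N0 leaf node is indexed by
+ * three nested stages,
+ *   STAGE_LEFT   (shard-pool-crush) -> node_extent_t
+ *   STAGE_STRING (ns-oid)           -> item_iterator_t
+ *   STAGE_RIGHT  (snap-gen)         -> leaf_sub_items_t
+ * and staged<Params> recurses from one stage into the next through
+ * Params::next_param_t.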
+ */
+template <typename Params>
+struct staged {
+ static_assert(Params::STAGE >= STAGE_BOTTOM);
+ static_assert(Params::STAGE <= STAGE_TOP);
+ using container_t = typename Params::container_t;
+ using key_get_type = typename container_t::key_get_type;
+ using next_param_t = typename Params::next_param_t;
+ using position_t = staged_position_t<Params::STAGE>;
+ using result_t = staged_result_t<Params::NODE_TYPE, Params::STAGE>;
+ using value_t = value_type_t<Params::NODE_TYPE>;
+ static constexpr auto CONTAINER_TYPE = container_t::CONTAINER_TYPE;
+ static constexpr bool IS_BOTTOM = (Params::STAGE == STAGE_BOTTOM);
+ static constexpr auto NODE_TYPE = Params::NODE_TYPE;
+ static constexpr auto STAGE = Params::STAGE;
+
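+  // A worked example of the split-index adjustment below (indexes are
+  // illustrative): with 4 existing entries and an exclusive insert_index of 2,
+  // the virtual sequence is [0, 1, insert, 2, 3]. A split_index of 2 keeps
+  // entries 0-1 on the left and sends the insertion right (is_insert_left =
+  // false); a split_index of 3 sends the insertion left and is decremented to
+  // 2, the first original entry of the right half.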
+ template <bool is_exclusive>
+ static void _left_or_right(index_t& split_index, index_t insert_index,
+ std::optional<bool>& is_insert_left) {
+ assert(!is_insert_left.has_value());
+ assert(is_valid_index(split_index));
+ if constexpr (is_exclusive) {
+ if (split_index <= insert_index) {
+ // ...[s_index-1] |!| (i_index) [s_index]...
+ // offset i_position to right
+ is_insert_left = false;
+ } else {
+ // ...[s_index-1] (i_index)) |?[s_index]| ...
+ // ...(i_index)...[s_index-1] |?[s_index]| ...
+ is_insert_left = true;
+ --split_index;
+ }
+ } else {
+ if (split_index < insert_index) {
+ // ...[s_index-1] |?[s_index]| ...[(i_index)[s_index_k]...
+ is_insert_left = false;
+ } else if (split_index > insert_index) {
+ // ...[(i_index)s_index-1] |?[s_index]| ...
+ // ...[(i_index)s_index_k]...[s_index-1] |?[s_index]| ...
+ is_insert_left = true;
+ } else {
+ // ...[s_index-1] |?[(i_index)s_index]| ...
+ // i_to_left = std::nullopt;
+ }
+ }
+ }
+
+ template <ContainerType CTYPE, typename Enable = void> class _iterator_t;
+ template <ContainerType CTYPE>
+ class _iterator_t<CTYPE, std::enable_if_t<CTYPE == ContainerType::INDEXABLE>> {
+ /*
+ * indexable container type system:
+ * CONTAINER_TYPE = ContainerType::INDEXABLE
+ * keys() const -> index_t
+ * operator[](index_t) const -> key_get_type
+ * size_before(index_t) const -> node_offset_t
+ * size_overhead_at(index_t) const -> node_offset_t
+ * (IS_BOTTOM) get_p_value(index_t) const -> const value_t*
+ * (!IS_BOTTOM) size_to_nxt_at(index_t) const -> node_offset_t
+ * (!IS_BOTTOM) get_nxt_container(index_t) const
+ * encode(p_node_start, encoded)
+ * decode(p_node_start, delta) -> container_t
+ * static:
+ * header_size() -> node_offset_t
+ * estimate_insert(key, value) -> node_offset_t
+ * (IS_BOTTOM) insert_at(mut, src, key, value,
+ * index, size, p_left_bound) -> const value_t*
+ * (!IS_BOTTOM) insert_prefix_at(mut, src, key,
+ * index, size, p_left_bound) -> memory_range_t
+ * (!IS_BOTTOM) update_size_at(mut, src, index, size)
+ * trim_until(mut, container, index) -> trim_size
+ * (!IS_BOTTOM) trim_at(mut, container, index, trimmed) -> trim_size
+ *
+ * Appender::append(const container_t& src, from, items)
+ */
+ public:
+ using me_t = _iterator_t<CTYPE>;
+
+ _iterator_t(const container_t& container) : container{container} {
+ assert(container.keys());
+ }
+
+ index_t index() const {
+ return _index;
+ }
+ key_get_type get_key() const {
+ assert(!is_end());
+ return container[_index];
+ }
+ node_offset_t size_to_nxt() const {
+ assert(!is_end());
+ return container.size_to_nxt_at(_index);
+ }
+ template <typename T = typename NXT_STAGE_T::container_t>
+ std::enable_if_t<!IS_BOTTOM, T> get_nxt_container() const {
+ assert(!is_end());
+ return container.get_nxt_container(_index);
+ }
+ template <typename T = value_t>
+ std::enable_if_t<IS_BOTTOM, const T*> get_p_value() const {
+ assert(!is_end());
+ return container.get_p_value(_index);
+ }
+ bool is_last() const {
+ return _index + 1 == container.keys();
+ }
+ bool is_end() const { return _index == container.keys(); }
+ node_offset_t size() const {
+ assert(!is_end());
+ assert(header_size() == container.size_before(0));
+ assert(container.size_before(_index + 1) > container.size_before(_index));
+ return container.size_before(_index + 1) -
+ container.size_before(_index);
+ }
+ node_offset_t size_overhead() const {
+ assert(!is_end());
+ return container.size_overhead_at(_index);
+ }
+
+ me_t& operator++() {
+ assert(!is_end());
+ assert(!is_last());
+ ++_index;
+ return *this;
+ }
+ void seek_at(index_t index) {
+ assert(index < container.keys());
+ seek_till_end(index);
+ }
+ void seek_till_end(index_t index) {
+ assert(!is_end());
+ assert(this->index() == 0);
+ assert(index <= container.keys());
+ _index = index;
+ }
+ void seek_last() {
+ assert(!is_end());
+ assert(index() == 0);
+ _index = container.keys() - 1;
+ }
+ void set_end() {
+ assert(!is_end());
+ assert(is_last());
+ ++_index;
+ }
+ // Note: possible to return an end iterator
+ MatchKindBS seek(const full_key_t<KeyT::HOBJ>& key, bool exclude_last) {
+ assert(!is_end());
+ assert(index() == 0);
+ index_t end_index = container.keys();
+ if (exclude_last) {
+ assert(end_index);
+ --end_index;
+ assert(compare_to<KeyT::HOBJ>(key, container[end_index]) == MatchKindCMP::LT);
+ }
+ auto ret = binary_search(key, _index, end_index,
+ [this] (index_t index) { return container[index]; });
+ _index = ret.index;
+ return ret.match;
+ }
+
+ template <KeyT KT, typename T = value_t>
+ std::enable_if_t<IS_BOTTOM, const T*> insert(
+ NodeExtentMutable& mut, const full_key_t<KT>& key,
+ const value_t& value, node_offset_t insert_size, const char* p_left_bound) {
+ return container_t::template insert_at<KT>(
+ mut, container, key, value, _index, insert_size, p_left_bound);
+ }
+
+ template <KeyT KT, typename T = memory_range_t>
+ std::enable_if_t<!IS_BOTTOM, T> insert_prefix(
+ NodeExtentMutable& mut, const full_key_t<KT>& key,
+ node_offset_t size, const char* p_left_bound) {
+ return container_t::template insert_prefix_at<KT>(
+ mut, container, key, _index, size, p_left_bound);
+ }
+
+ template <typename T = void>
+ std::enable_if_t<!IS_BOTTOM, T>
+ update_size(NodeExtentMutable& mut, node_offset_t insert_size) {
+ assert(!is_end());
+ container_t::update_size_at(mut, container, _index, insert_size);
+ }
+
+ // Note: possible to return an end iterator when is_exclusive is true
+ template <bool is_exclusive>
+ size_t seek_split_inserted(
+ size_t start_size, size_t extra_size, size_t target_size,
+ index_t& insert_index, size_t insert_size,
+ std::optional<bool>& is_insert_left) {
+ assert(!is_end());
+ assert(index() == 0);
+ // replace insert_index placeholder
+ if constexpr (!is_exclusive) {
+ if (insert_index == INDEX_LAST) {
+ insert_index = container.keys() - 1;
+ }
+ } else {
+ if (insert_index == INDEX_END) {
+ insert_index = container.keys();
+ }
+ }
+ assert(insert_index <= container.keys());
+
+ auto start_size_1 = start_size + extra_size;
+ auto f_get_used_size = [this, start_size, start_size_1,
+ insert_index, insert_size] (index_t index) {
+ size_t current_size;
+ if (unlikely(index == 0)) {
+ current_size = start_size;
+ } else {
+ current_size = start_size_1;
+ if (index > insert_index) {
+ current_size += insert_size;
+ if constexpr (is_exclusive) {
+ --index;
+ }
+ }
+ // already includes header size
+ current_size += container.size_before(index);
+ }
+ return current_size;
+ };
+ index_t s_end;
+ if constexpr (is_exclusive) {
+ s_end = container.keys();
+ } else {
+ s_end = container.keys() - 1;
+ }
+ _index = binary_search_r(0, s_end, f_get_used_size, target_size).index;
+ size_t current_size = f_get_used_size(_index);
+ assert(current_size <= target_size);
+
+ _left_or_right<is_exclusive>(_index, insert_index, is_insert_left);
+ return current_size;
+ }
+
+ size_t seek_split(size_t start_size, size_t extra_size, size_t target_size) {
+ assert(!is_end());
+ assert(index() == 0);
+ auto start_size_1 = start_size + extra_size;
+ auto f_get_used_size = [this, start_size, start_size_1] (index_t index) {
+ size_t current_size;
+ if (unlikely(index == 0)) {
+ current_size = start_size;
+ } else {
+ // already includes header size
+ current_size = start_size_1 + container.size_before(index);
+ }
+ return current_size;
+ };
+ _index = binary_search_r(
+ 0, container.keys() - 1, f_get_used_size, target_size).index;
+ size_t current_size = f_get_used_size(_index);
+ assert(current_size <= target_size);
+ return current_size;
+ }
+
+    // Note: possible to return an end iterator if to_index == INDEX_END
+ template <KeyT KT>
+ void copy_out_until(
+ typename container_t::template Appender<KT>& appender, index_t& to_index) {
+ auto num_keys = container.keys();
+ index_t items;
+ if (to_index == INDEX_END) {
+ items = num_keys - _index;
+ appender.append(container, _index, items);
+ _index = num_keys;
+ to_index = _index;
+ } else if (to_index == INDEX_LAST) {
+ assert(!is_end());
+ items = num_keys - 1 - _index;
+ appender.append(container, _index, items);
+ _index = num_keys - 1;
+ to_index = _index;
+ } else {
+ assert(_index <= to_index);
+ assert(to_index <= num_keys);
+ items = to_index - _index;
+ appender.append(container, _index, items);
+ _index = to_index;
+ }
+ }
+
+ node_offset_t trim_until(NodeExtentMutable& mut) {
+ return container_t::trim_until(mut, container, _index);
+ }
+
+ template <typename T = node_offset_t>
+ std::enable_if_t<!IS_BOTTOM, T>
+ trim_at(NodeExtentMutable& mut, node_offset_t trimmed) {
+ return container_t::trim_at(mut, container, _index, trimmed);
+ }
+
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const {
+ container.encode(p_node_start, encoded);
+ ceph::encode(_index, encoded);
+ }
+
+ static me_t decode(const char* p_node_start,
+ ceph::bufferlist::const_iterator& delta) {
+ auto container = container_t::decode(p_node_start, delta);
+ auto ret = me_t(container);
+ index_t index;
+ ceph::decode(index, delta);
+ ret.seek_till_end(index);
+ return ret;
+ }
+
+ static node_offset_t header_size() {
+ return container_t::header_size();
+ }
+
+ template <KeyT KT>
+ static node_offset_t estimate_insert(
+ const full_key_t<KT>& key, const value_t& value) {
+ return container_t::template estimate_insert<KT>(key, value);
+ }
+
+ private:
+ container_t container;
+ index_t _index = 0;
+ };
+
+ template <ContainerType CTYPE>
+ class _iterator_t<CTYPE, std::enable_if_t<CTYPE == ContainerType::ITERATIVE>> {
+ /*
+ * iterative container type system (!IS_BOTTOM):
+ * CONTAINER_TYPE = ContainerType::ITERATIVE
+ * index() const -> index_t
+ * get_key() const -> key_get_type
+ * size() const -> node_offset_t
+ * size_to_nxt() const -> node_offset_t
+ * size_overhead() const -> node_offset_t
+ * get_nxt_container() const
+ * has_next() const -> bool
+ * encode(p_node_start, encoded)
+ * decode(p_node_start, delta) -> container_t
+ * operator++()
+ * static:
+ * header_size() -> node_offset_t
+ * estimate_insert(key, value) -> node_offset_t
+ * insert_prefix(mut, src, key, is_end, size, p_left_bound) -> memory_range_t
+ * update_size(mut, src, size)
+ * trim_until(mut, container) -> trim_size
+ * trim_at(mut, container, trimmed) -> trim_size
+ */
+ // currently the iterative iterator is only implemented with STAGE_STRING
+ // for in-node space efficiency
+ static_assert(STAGE == STAGE_STRING);
+ public:
+ using me_t = _iterator_t<CTYPE>;
+
+ _iterator_t(const container_t& container) : container{container} {}
+
+ index_t index() const {
+ if (is_end()) {
+ return container.index() + 1;
+ } else {
+ return container.index();
+ }
+ }
+ key_get_type get_key() const {
+ assert(!is_end());
+ return container.get_key();
+ }
+ node_offset_t size_to_nxt() const {
+ assert(!is_end());
+ return container.size_to_nxt();
+ }
+ const typename NXT_STAGE_T::container_t get_nxt_container() const {
+ assert(!is_end());
+ return container.get_nxt_container();
+ }
+ bool is_last() const {
+ assert(!is_end());
+ return !container.has_next();
+ }
+ bool is_end() const {
+#ifndef NDEBUG
+ if (_is_end) {
+ assert(!container.has_next());
+ }
+#endif
+ return _is_end;
+ }
+ node_offset_t size() const {
+ assert(!is_end());
+ return container.size();
+ }
+ node_offset_t size_overhead() const {
+ assert(!is_end());
+ return container.size_overhead();
+ }
+
+ me_t& operator++() {
+ assert(!is_end());
+ assert(!is_last());
+ ++container;
+ return *this;
+ }
+ void seek_at(index_t index) {
+ assert(!is_end());
+ assert(this->index() == 0);
+ while (index > 0) {
+ assert(container.has_next());
+ ++container;
+ --index;
+ }
+ }
+ void seek_till_end(index_t index) {
+ assert(!is_end());
+ assert(this->index() == 0);
+ while (index > 0) {
+ if (!container.has_next()) {
+ assert(index == 1);
+ set_end();
+ break;
+ }
+ ++container;
+ --index;
+ }
+ }
+ void seek_last() {
+ assert(!is_end());
+ assert(index() == 0);
+ while (container.has_next()) {
+ ++container;
+ }
+ }
+ void set_end() {
+ assert(!is_end());
+ assert(is_last());
+ _is_end = true;
+ }
+ // Note: possible to return an end iterator
+ MatchKindBS seek(const full_key_t<KeyT::HOBJ>& key, bool exclude_last) {
+ assert(!is_end());
+ assert(index() == 0);
+ do {
+ if (exclude_last && is_last()) {
+ assert(compare_to<KeyT::HOBJ>(key, get_key()) == MatchKindCMP::LT);
+ return MatchKindBS::NE;
+ }
+ auto match = compare_to<KeyT::HOBJ>(key, get_key());
+ if (match == MatchKindCMP::LT) {
+ return MatchKindBS::NE;
+ } else if (match == MatchKindCMP::EQ) {
+ return MatchKindBS::EQ;
+ } else {
+ if (container.has_next()) {
+ ++container;
+ } else {
+ // end
+ break;
+ }
+ }
+ } while (true);
+ assert(!exclude_last);
+ set_end();
+ return MatchKindBS::NE;
+ }
+
+ template <KeyT KT>
+ memory_range_t insert_prefix(
+ NodeExtentMutable& mut, const full_key_t<KT>& key,
+ node_offset_t size, const char* p_left_bound) {
+ return container_t::template insert_prefix<KT>(
+ mut, container, key, is_end(), size, p_left_bound);
+ }
+
+ void update_size(NodeExtentMutable& mut, node_offset_t insert_size) {
+ assert(!is_end());
+ container_t::update_size(mut, container, insert_size);
+ }
+
+ // Note: possible to return an end iterator when is_exclusive is true
+ // insert_index can still be INDEX_LAST or INDEX_END
+ template <bool is_exclusive>
+ size_t seek_split_inserted(
+ size_t start_size, size_t extra_size, size_t target_size,
+ index_t& insert_index, size_t insert_size,
+ std::optional<bool>& is_insert_left) {
+ assert(!is_end());
+ assert(index() == 0);
+ size_t current_size = start_size;
+ index_t split_index = 0;
+ extra_size += header_size();
+ do {
+ if constexpr (!is_exclusive) {
+ if (is_last()) {
+ assert(split_index == index());
+ if (insert_index == INDEX_LAST) {
+ insert_index = index();
+ }
+ assert(insert_index <= index());
+ break;
+ }
+ }
+
+ size_t nxt_size = current_size;
+ if (split_index == 0) {
+ nxt_size += extra_size;
+ }
+ if (split_index == insert_index) {
+ nxt_size += insert_size;
+ if constexpr (is_exclusive) {
+ if (nxt_size > target_size) {
+ break;
+ }
+ current_size = nxt_size;
+ ++split_index;
+ }
+ }
+ nxt_size += size();
+ if (nxt_size > target_size) {
+ break;
+ }
+ current_size = nxt_size;
+
+ if constexpr (is_exclusive) {
+ if (is_last()) {
+ assert(split_index == index());
+ set_end();
+ split_index = index();
+ if (insert_index == INDEX_END) {
+ insert_index = index();
+ }
+ assert(insert_index == index());
+ break;
+ } else {
+ ++(*this);
+ ++split_index;
+ }
+ } else {
+ ++(*this);
+ ++split_index;
+ }
+ } while (true);
+ assert(current_size <= target_size);
+
+ _left_or_right<is_exclusive>(split_index, insert_index, is_insert_left);
+ assert(split_index == index());
+ return current_size;
+ }
+
+ size_t seek_split(size_t start_size, size_t extra_size, size_t target_size) {
+ assert(!is_end());
+ assert(index() == 0);
+ size_t current_size = start_size;
+ do {
+ if (is_last()) {
+ break;
+ }
+
+ size_t nxt_size = current_size;
+ if (index() == 0) {
+ nxt_size += extra_size;
+ }
+ nxt_size += size();
+ if (nxt_size > target_size) {
+ break;
+ }
+ current_size = nxt_size;
+ ++(*this);
+ } while (true);
+ assert(current_size <= target_size);
+ return current_size;
+ }
+
+    // Note: possible to return an end iterator if to_index == INDEX_END
+ template <KeyT KT>
+ void copy_out_until(
+ typename container_t::template Appender<KT>& appender, index_t& to_index) {
+ if (is_end()) {
+ assert(!container.has_next());
+ if (to_index == INDEX_END) {
+ to_index = index();
+ }
+ assert(to_index == index());
+ return;
+ }
+ index_t items;
+ if (to_index == INDEX_END || to_index == INDEX_LAST) {
+ items = to_index;
+ } else {
+ assert(is_valid_index(to_index));
+ assert(index() <= to_index);
+ items = to_index - index();
+ }
+ if (appender.append(container, items)) {
+ set_end();
+ }
+ to_index = index();
+ }
+
+ node_offset_t trim_until(NodeExtentMutable& mut) {
+ if (is_end()) {
+ return 0;
+ }
+ return container_t::trim_until(mut, container);
+ }
+
+ node_offset_t trim_at(NodeExtentMutable& mut, node_offset_t trimmed) {
+ assert(!is_end());
+ return container_t::trim_at(mut, container, trimmed);
+ }
+
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const {
+ container.encode(p_node_start, encoded);
+ uint8_t is_end = _is_end;
+ ceph::encode(is_end, encoded);
+ }
+
+ static me_t decode(const char* p_node_start,
+ ceph::bufferlist::const_iterator& delta) {
+ auto container = container_t::decode(p_node_start, delta);
+ auto ret = me_t(container);
+ uint8_t is_end;
+ ceph::decode(is_end, delta);
+ if (is_end) {
+ ret.set_end();
+ }
+ return ret;
+ }
+
+ static node_offset_t header_size() {
+ return container_t::header_size();
+ }
+
+ template <KeyT KT>
+ static node_offset_t estimate_insert(const full_key_t<KT>& key, const value_t& value) {
+ return container_t::template estimate_insert<KT>(key, value);
+ }
+
+ private:
+ container_t container;
+ bool _is_end = false;
+ };
+
+ /*
+   * iterator_t encapsulates both the indexable and the iterative
+   * implementations over a *non-empty* container.
+ * cstr(const container_t&)
+ * access:
+ * index() -> index_t
+ * get_key() -> key_get_type (const reference or value type)
+ * is_last() -> bool
+ * is_end() -> bool
+ * size() -> node_offset_t
+ * size_overhead() -> node_offset_t
+ * (IS_BOTTOM) get_p_value() -> const value_t*
+ * (!IS_BOTTOM) get_nxt_container() -> nxt_stage::container_t
+ * (!IS_BOTTOM) size_to_nxt() -> node_offset_t
+ * seek:
+ * operator++() -> iterator_t&
+ * seek_at(index)
+ * seek_till_end(index)
+ * seek_last()
+ * set_end()
+ * seek(key, exclude_last) -> MatchKindBS
+ * insert:
+ * (IS_BOTTOM) insert(mut, key, value, size, p_left_bound) -> p_value
+ * (!IS_BOTTOM) insert_prefix(mut, key, size, p_left_bound) -> memory_range_t
+ * (!IS_BOTTOM) update_size(mut, size)
+ * split:
+ * seek_split_inserted<bool is_exclusive>(
+ * start_size, extra_size, target_size, insert_index, insert_size,
+ * std::optional<bool>& is_insert_left)
+ * -> insert to left/right/unknown (!exclusive)
+ * -> insert to left/right (exclusive, can be end)
+ * -> split_size
+ * seek_split(start_size, extra_size, target_size) -> split_size
+ * copy_out_until(appender, to_index) (can be end)
+ * trim_until(mut) -> trim_size
+ * (!IS_BOTTOM) trim_at(mut, trimmed) -> trim_size
+ * denc:
+ * encode(p_node_start, encoded)
+ * decode(p_node_start, delta) -> iterator_t
+ * static:
+ * header_size() -> node_offset_t
+ * estimate_insert(key, value) -> node_offset_t
+ */
+ using iterator_t = _iterator_t<CONTAINER_TYPE>;
+ /* TODO: detailed comments
+ * - trim_until(mut) -> trim_size
+ * * keep 0 to i - 1, and remove the rest, return the size trimmed.
+ * * if this is the end iterator, do nothing and return 0.
+ * * if this is the start iterator, normally needs to go to the higher
+ * stage to trim the entire container.
+ * - trim_at(mut, trimmed) -> trim_size
+ * * trim happens inside the current iterator, causing the size reduced by
+ * <trimmed>, return the total size trimmed.
+ */
+
+ /*
+ * Lookup internals (hide?)
+ */
+
+ template <bool GET_KEY>
+ static result_t smallest_result(
+ const iterator_t& iter, full_key_t<KeyT::VIEW>* index_key) {
+ static_assert(!IS_BOTTOM);
+ assert(!iter.is_end());
+ auto pos_smallest = NXT_STAGE_T::position_t::begin();
+ auto nxt_container = iter.get_nxt_container();
+ auto value_ptr = NXT_STAGE_T::template get_p_value<GET_KEY>(
+ nxt_container, pos_smallest, index_key);
+ if constexpr (GET_KEY) {
+ index_key->set(iter.get_key());
+ }
+ return result_t{{iter.index(), pos_smallest}, value_ptr, STAGE};
+ }
+
+ template <bool GET_KEY>
+ static result_t nxt_lower_bound(
+ const full_key_t<KeyT::HOBJ>& key, iterator_t& iter,
+ MatchHistory& history, full_key_t<KeyT::VIEW>* index_key) {
+ static_assert(!IS_BOTTOM);
+ assert(!iter.is_end());
+ auto nxt_container = iter.get_nxt_container();
+ auto nxt_result = NXT_STAGE_T::template lower_bound<GET_KEY>(
+ nxt_container, key, history, index_key);
+ if (nxt_result.is_end()) {
+ if (iter.is_last()) {
+ return result_t::end();
+ } else {
+ return smallest_result<GET_KEY>(++iter, index_key);
+ }
+ } else {
+ if constexpr (GET_KEY) {
+ index_key->set(iter.get_key());
+ }
+ return result_t::from_nxt(iter.index(), nxt_result);
+ }
+ }
+
+ template <bool GET_POS, bool GET_KEY, bool GET_VAL>
+ static void lookup_largest_slot(
+ const container_t& container, position_t* p_position,
+ full_key_t<KeyT::VIEW>* p_index_key, const value_t** pp_value) {
+ auto iter = iterator_t(container);
+ iter.seek_last();
+ if constexpr (GET_KEY) {
+ assert(p_index_key);
+ p_index_key->set(iter.get_key());
+ }
+ if constexpr (GET_POS) {
+ assert(p_position);
+ p_position->index = iter.index();
+ }
+ if constexpr (IS_BOTTOM) {
+ if constexpr (GET_VAL) {
+ assert(pp_value);
+ *pp_value = iter.get_p_value();
+ }
+ } else {
+ auto nxt_container = iter.get_nxt_container();
+ if constexpr (GET_POS) {
+ NXT_STAGE_T::template lookup_largest_slot<true, GET_KEY, GET_VAL>(
+ nxt_container, &p_position->nxt, p_index_key, pp_value);
+ } else {
+ NXT_STAGE_T::template lookup_largest_slot<false, GET_KEY, GET_VAL>(
+ nxt_container, nullptr, p_index_key, pp_value);
+ }
+ }
+ }
+
+ template <bool GET_KEY = false>
+ static const value_t* get_p_value(
+ const container_t& container, const position_t& position,
+ full_key_t<KeyT::VIEW>* index_key = nullptr) {
+ auto iter = iterator_t(container);
+ iter.seek_at(position.index);
+ if constexpr (GET_KEY) {
+ index_key->set(iter.get_key());
+ }
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_container = iter.get_nxt_container();
+ return NXT_STAGE_T::template get_p_value<GET_KEY>(
+ nxt_container, position.nxt, index_key);
+ } else {
+ return iter.get_p_value();
+ }
+ }
+
+ static void get_key_view(
+ const container_t& container,
+ const position_t& position,
+ full_key_t<KeyT::VIEW>& index_key) {
+ auto iter = iterator_t(container);
+ iter.seek_at(position.index);
+ index_key.set(iter.get_key());
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_container = iter.get_nxt_container();
+ return NXT_STAGE_T::get_key_view(nxt_container, position.nxt, index_key);
+ }
+ }
+
+ template <bool GET_KEY = false>
+ static result_t lower_bound(
+ const container_t& container,
+ const full_key_t<KeyT::HOBJ>& key,
+ MatchHistory& history,
+ full_key_t<KeyT::VIEW>* index_key = nullptr) {
+ bool exclude_last = false;
+ if (history.get<STAGE>().has_value()) {
+ if (*history.get<STAGE>() == MatchKindCMP::EQ) {
+ // lookup is short-circuited
+ if constexpr (!IS_BOTTOM) {
+ assert(history.get<STAGE - 1>().has_value());
+ if (history.is_GT<STAGE - 1>()) {
+ auto iter = iterator_t(container);
+ bool test_key_equal;
+ if constexpr (STAGE == STAGE_STRING) {
+ // TODO(cross-node string dedup)
+ // test_key_equal = (iter.get_key().type() == ns_oid_view_t::Type::MIN);
+ auto cmp = compare_to<KeyT::HOBJ>(key, iter.get_key());
+ assert(cmp != MatchKindCMP::GT);
+ test_key_equal = (cmp == MatchKindCMP::EQ);
+ } else {
+ auto cmp = compare_to<KeyT::HOBJ>(key, iter.get_key());
+ // From history, key[stage] == parent[stage][index - 1]
+ // which should be the smallest possible value for all
+ // index[stage][*]
+ assert(cmp != MatchKindCMP::GT);
+ test_key_equal = (cmp == MatchKindCMP::EQ);
+ }
+ if (test_key_equal) {
+ return nxt_lower_bound<GET_KEY>(key, iter, history, index_key);
+ } else {
+ // key[stage] < index[stage][left-most]
+ return smallest_result<GET_KEY>(iter, index_key);
+ }
+ }
+ }
+ // IS_BOTTOM || !history.is_GT<STAGE - 1>()
+ auto iter = iterator_t(container);
+ iter.seek_last();
+ if constexpr (STAGE == STAGE_STRING) {
+ // TODO(cross-node string dedup)
+ // assert(iter.get_key().type() == ns_oid_view_t::Type::MAX);
+ assert(compare_to<KeyT::HOBJ>(key, iter.get_key()) == MatchKindCMP::EQ);
+ } else {
+ assert(compare_to<KeyT::HOBJ>(key, iter.get_key()) == MatchKindCMP::EQ);
+ }
+ if constexpr (GET_KEY) {
+ index_key->set(iter.get_key());
+ }
+ if constexpr (IS_BOTTOM) {
+ auto value_ptr = iter.get_p_value();
+ return result_t{{iter.index()}, value_ptr, MSTAT_EQ};
+ } else {
+ auto nxt_container = iter.get_nxt_container();
+ auto nxt_result = NXT_STAGE_T::template lower_bound<GET_KEY>(
+ nxt_container, key, history, index_key);
+ // !history.is_GT<STAGE - 1>() means
+ // key[stage+1 ...] <= index[stage+1 ...][*]
+ assert(!nxt_result.is_end());
+ return result_t::from_nxt(iter.index(), nxt_result);
+ }
+ } else if (*history.get<STAGE>() == MatchKindCMP::LT) {
+ exclude_last = true;
+ }
+ }
+ auto iter = iterator_t(container);
+ auto bs_match = iter.seek(key, exclude_last);
+ if (iter.is_end()) {
+ assert(!exclude_last);
+ assert(bs_match == MatchKindBS::NE);
+ history.set<STAGE>(MatchKindCMP::GT);
+ return result_t::end();
+ }
+ history.set<STAGE>(bs_match == MatchKindBS::EQ ?
+ MatchKindCMP::EQ : MatchKindCMP::LT);
+ if constexpr (IS_BOTTOM) {
+ if constexpr (GET_KEY) {
+ index_key->set(iter.get_key());
+ }
+ auto value_ptr = iter.get_p_value();
+ return result_t{{iter.index()}, value_ptr,
+ (bs_match == MatchKindBS::EQ ? MSTAT_EQ : MSTAT_LT0)};
+ } else {
+ if (bs_match == MatchKindBS::EQ) {
+ return nxt_lower_bound<GET_KEY>(key, iter, history, index_key);
+ } else {
+ return smallest_result<GET_KEY>(iter, index_key);
+ }
+ }
+ }
+
+ template <KeyT KT>
+ static node_offset_t insert_size(const full_key_t<KT>& key, const value_t& value) {
+ if constexpr (IS_BOTTOM) {
+ return iterator_t::template estimate_insert<KT>(key, value);
+ } else {
+ return iterator_t::template estimate_insert<KT>(key, value) +
+ NXT_STAGE_T::iterator_t::header_size() +
+ NXT_STAGE_T::template insert_size<KT>(key, value);
+ }
+ }
+
+ template <KeyT KT>
+ static node_offset_t insert_size_at(
+ match_stage_t stage, const full_key_t<KeyT::HOBJ>& key, const value_t& value) {
+ if (stage == STAGE) {
+ return insert_size<KT>(key, value);
+ } else {
+ assert(stage < STAGE);
+ return NXT_STAGE_T::template insert_size_at<KT>(stage, key, value);
+ }
+ }
+
+ template <typename T = std::tuple<match_stage_t, node_offset_t>>
+ static std::enable_if_t<NODE_TYPE == node_type_t::INTERNAL, T> evaluate_insert(
+ const container_t& container, const full_key_t<KeyT::VIEW>& key,
+ const value_t& value, position_t& position, bool evaluate_last) {
+ auto iter = iterator_t(container);
+ auto& index = position.index;
+ if (evaluate_last || index == INDEX_END) {
+ iter.seek_last();
+ index = iter.index();
+ // evaluate the previous index
+ } else {
+ assert(is_valid_index(index));
+ // evaluate the current index
+ iter.seek_at(index);
+ auto match = compare_to<KeyT::VIEW>(key, iter.get_key());
+ if (match == MatchKindCMP::EQ) {
+ if constexpr (IS_BOTTOM) {
+ ceph_abort("insert conflict at current index!");
+ } else {
+ // insert into the current index
+ auto nxt_container = iter.get_nxt_container();
+ return NXT_STAGE_T::evaluate_insert(
+ nxt_container, key, value, position.nxt, false);
+ }
+ } else {
+ assert(match == MatchKindCMP::LT);
+ if (index == 0) {
+ // already the first index, so insert at the current index
+ return {STAGE, insert_size<KeyT::VIEW>(key, value)};
+ }
+ --index;
+ iter = iterator_t(container);
+ iter.seek_at(index);
+ // proceed to evaluate the previous index
+ }
+ }
+
+ // XXX(multi-type): when key is from a different type of node
+ auto match = compare_to<KeyT::VIEW>(key, iter.get_key());
+ if (match == MatchKindCMP::GT) {
+ // the key matches neither index, so insert at the current index
+ ++index;
+ return {STAGE, insert_size<KeyT::VIEW>(key, value)};
+ } else {
+ assert(match == MatchKindCMP::EQ);
+ if constexpr (IS_BOTTOM) {
+ ceph_abort("insert conflict at the previous index!");
+ } else {
+ // insert into the previous index
+ auto nxt_container = iter.get_nxt_container();
+ return NXT_STAGE_T::evaluate_insert(
+ nxt_container, key, value, position.nxt, true);
+ }
+ }
+ }
+
+ template <typename T = bool>
+ static std::enable_if_t<NODE_TYPE == node_type_t::LEAF, T>
+ compensate_insert_position_at(match_stage_t stage, position_t& position) {
+ auto& index = position.index;
+ if (stage == STAGE) {
+ assert(index == 0);
+ // insert at the end of the current stage
+ index = INDEX_END;
+ return true;
+ } else {
+ if constexpr (IS_BOTTOM) {
+ ceph_abort("impossible path");
+ } else {
+ assert(stage < STAGE);
+ bool compensate = NXT_STAGE_T::
+ compensate_insert_position_at(stage, position.nxt);
+ if (compensate) {
+ assert(is_valid_index(index));
+ if (index == 0) {
+ // insert into the *last* index of the current stage
+ index = INDEX_LAST;
+ return true;
+ } else {
+ --index;
+ return false;
+ }
+ } else {
+ return false;
+ }
+ }
+ }
+ }
+
+ static void patch_insert_end(position_t& insert_pos, match_stage_t insert_stage) {
+ assert(insert_stage <= STAGE);
+ if (insert_stage == STAGE) {
+ insert_pos.index = INDEX_END;
+ } else if constexpr (!IS_BOTTOM) {
+ insert_pos.index = INDEX_LAST;
+ NXT_STAGE_T::patch_insert_end(insert_pos.nxt, insert_stage);
+ }
+ }
+
+ template <typename T = std::tuple<match_stage_t, node_offset_t>>
+ static std::enable_if_t<NODE_TYPE == node_type_t::LEAF, T> evaluate_insert(
+ const full_key_t<KeyT::HOBJ>& key, const onode_t& value,
+ const MatchHistory& history, match_stat_t mstat, position_t& position) {
+ match_stage_t insert_stage = STAGE_TOP;
+ while (*history.get_by_stage(insert_stage) == MatchKindCMP::EQ) {
+ assert(insert_stage != STAGE_BOTTOM && "insert conflict!");
+ --insert_stage;
+ }
+
+ if (history.is_GT()) {
+ if (position.is_end()) {
+ // no need to compensate insert position
+ assert(insert_stage <= STAGE && "impossible insert stage");
+ } else if (position == position_t::begin()) {
+ // this position must have been short-circuited by staged::smallest_result()
+ // in staged::lower_bound(), so rely on mstat instead
+ assert(mstat >= MSTAT_LT0 && mstat <= MSTAT_LT3);
+ if (mstat == MSTAT_LT0) {
+ insert_stage = STAGE_RIGHT;
+ } else if (mstat == MSTAT_LT1) {
+ insert_stage = STAGE_STRING;
+ } else {
+ insert_stage = STAGE_LEFT;
+ }
+ // XXX(multi-type): need to upgrade node type before inserting an
+ // incompatible index at front.
+ assert(insert_stage <= STAGE && "incompatible insert");
+ } else {
+ assert(insert_stage <= STAGE && "impossible insert stage");
+ [[maybe_unused]] bool ret = compensate_insert_position_at(insert_stage, position);
+ assert(!ret);
+ }
+ }
+
+ if (position.is_end()) {
+ patch_insert_end(position, insert_stage);
+ }
+
+ node_offset_t insert_size = insert_size_at<KeyT::HOBJ>(insert_stage, key, value);
+
+ return {insert_stage, insert_size};
+ }
+
+ template <KeyT KT>
+ static const value_t* insert_new(
+ NodeExtentMutable& mut, const memory_range_t& range,
+ const full_key_t<KT>& key, const value_t& value) {
+ char* p_insert = const_cast<char*>(range.p_end);
+ const value_t* p_value = nullptr;
+ StagedAppender<KT> appender;
+ appender.init(&mut, p_insert);
+ appender.append(key, value, p_value);
+ [[maybe_unused]] const char* p_insert_front = appender.wrap();
+ assert(p_insert_front == range.p_start);
+ return p_value;
+ }
+
+ template <KeyT KT, bool SPLIT>
+ static const value_t* proceed_insert_recursively(
+ NodeExtentMutable& mut, const container_t& container,
+ const full_key_t<KT>& key, const value_t& value,
+ position_t& position, match_stage_t& stage,
+ node_offset_t& _insert_size, const char* p_left_bound) {
+ // proceed insert from right to left
+ assert(stage <= STAGE);
+ auto iter = iterator_t(container);
+ auto& index = position.index;
+
+ bool do_insert = false;
+ if (stage == STAGE) {
+ if (index == INDEX_END) {
+ iter.seek_last();
+ iter.set_end();
+ index = iter.index();
+ } else {
+ assert(is_valid_index(index));
+ iter.seek_till_end(index);
+ }
+ do_insert = true;
+ } else { // stage < STAGE
+ if (index == INDEX_LAST) {
+ iter.seek_last();
+ index = iter.index();
+ } else {
+ assert(is_valid_index(index));
+ iter.seek_till_end(index);
+ }
+ if constexpr (SPLIT) {
+ if (iter.is_end()) {
+ // insert at the higher stage due to split
+ do_insert = true;
+ _insert_size = insert_size<KT>(key, value);
+ stage = STAGE;
+ }
+ } else {
+ assert(!iter.is_end());
+ }
+ }
+
+ if (do_insert) {
+ if constexpr (!IS_BOTTOM) {
+ position.nxt = position_t::nxt_t::begin();
+ }
+ assert(_insert_size == insert_size<KT>(key, value));
+ if constexpr (IS_BOTTOM) {
+ return iter.template insert<KT>(
+ mut, key, value, _insert_size, p_left_bound);
+ } else {
+ auto range = iter.template insert_prefix<KT>(
+ mut, key, _insert_size, p_left_bound);
+ return NXT_STAGE_T::template insert_new<KT>(mut, range, key, value);
+ }
+ } else {
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_container = iter.get_nxt_container();
+ auto p_value = NXT_STAGE_T::template proceed_insert_recursively<KT, SPLIT>(
+ mut, nxt_container, key, value,
+ position.nxt, stage, _insert_size, p_left_bound);
+ iter.update_size(mut, _insert_size);
+ return p_value;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ }
+
+ template <KeyT KT, bool SPLIT>
+ static const value_t* proceed_insert(
+ NodeExtentMutable& mut, const container_t& container,
+ const full_key_t<KT>& key, const value_t& value,
+ position_t& position, match_stage_t& stage, node_offset_t& _insert_size) {
+ auto p_left_bound = container.p_left_bound();
+ if (unlikely(!container.keys())) {
+ if (position.is_end()) {
+ position = position_t::begin();
+ assert(stage == STAGE);
+ assert(_insert_size == insert_size<KT>(key, value));
+ } else if (position == position_t::begin()) {
+ // when inserting into a trimmed and empty left node
+ stage = STAGE;
+ _insert_size = insert_size<KT>(key, value);
+ } else {
+ ceph_abort("impossible path");
+ }
+ if constexpr (IS_BOTTOM) {
+ return container_t::template insert_at<KT>(
+ mut, container, key, value, 0, _insert_size, p_left_bound);
+ } else {
+ auto range = container_t::template insert_prefix_at<KT>(
+ mut, container, key, 0, _insert_size, p_left_bound);
+ return NXT_STAGE_T::template insert_new<KT>(mut, range, key, value);
+ }
+ } else {
+ return proceed_insert_recursively<KT, SPLIT>(
+ mut, container, key, value,
+ position, stage, _insert_size, p_left_bound);
+ }
+ }
+
+ static std::ostream& dump(const container_t& container,
+ std::ostream& os,
+ const std::string& prefix,
+ size_t& size,
+ const char* p_start) {
+ auto iter = iterator_t(container);
+ assert(!iter.is_end());
+ std::string prefix_blank(prefix.size(), ' ');
+ const std::string* p_prefix = &prefix;
+ size += iterator_t::header_size();
+ do {
+ std::ostringstream sos;
+ sos << *p_prefix << iter.get_key() << ": ";
+ std::string i_prefix = sos.str();
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_container = iter.get_nxt_container();
+ size += iter.size_to_nxt();
+ NXT_STAGE_T::dump(nxt_container, os, i_prefix, size, p_start);
+ } else {
+ auto value_ptr = iter.get_p_value();
+ int offset = reinterpret_cast<const char*>(value_ptr) - p_start;
+ size += iter.size();
+ os << "\n" << i_prefix;
+ if constexpr (NODE_TYPE == node_type_t::LEAF) {
+ os << *value_ptr;
+ } else {
+ os << "0x" << std::hex << value_ptr->value << std::dec;
+ }
+ os << " " << size << "B"
+ << " @" << offset << "B";
+ }
+ if (iter.is_last()) {
+ break;
+ } else {
+ ++iter;
+ p_prefix = &prefix_blank;
+ }
+ } while (true);
+ return os;
+ }
+
+ static void validate(const container_t& container) {
+ auto iter = iterator_t(container);
+ assert(!iter.is_end());
+ auto key = iter.get_key();
+ do {
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_container = iter.get_nxt_container();
+ NXT_STAGE_T::validate(nxt_container);
+ }
+ if (iter.is_last()) {
+ break;
+ } else {
+ ++iter;
+ assert(compare_to(key, iter.get_key()) == MatchKindCMP::LT);
+ key = iter.get_key();
+ }
+ } while (true);
+ }
+
+ static void get_stats(const container_t& container, node_stats_t& stats,
+ full_key_t<KeyT::VIEW>& index_key) {
+ auto iter = iterator_t(container);
+ assert(!iter.is_end());
+ stats.size_overhead += iterator_t::header_size();
+ do {
+ index_key.replace(iter.get_key());
+ stats.size_overhead += iter.size_overhead();
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_container = iter.get_nxt_container();
+ NXT_STAGE_T::get_stats(nxt_container, stats, index_key);
+ } else {
+ ++stats.num_kvs;
+ size_t kv_logical_size = index_key.size_logical();
+ size_t value_size;
+ if constexpr (NODE_TYPE == node_type_t::LEAF) {
+ value_size = iter.get_p_value()->size;
+ } else {
+ value_size = sizeof(value_t);
+ }
+ stats.size_value += value_size;
+ kv_logical_size += value_size;
+ stats.size_logical += kv_logical_size;
+ }
+ if (iter.is_last()) {
+ break;
+ } else {
+ ++iter;
+ }
+ } while (true);
+ }
+
+ static bool next_position(const container_t& container, position_t& pos) {
+ auto iter = iterator_t(container);
+ assert(!iter.is_end());
+ iter.seek_at(pos.index);
+ bool find_next;
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_container = iter.get_nxt_container();
+ find_next = NXT_STAGE_T::next_position(nxt_container, pos.nxt);
+ } else {
+ find_next = true;
+ }
+ if (find_next) {
+ if (iter.is_last()) {
+ return true;
+ } else {
+ pos.index = iter.index() + 1;
+ if constexpr (!IS_BOTTOM) {
+ pos.nxt = NXT_STAGE_T::position_t::begin();
+ }
+ return false;
+ }
+ } else {
+ return false;
+ }
+ }
+
+ struct _BaseEmpty {};
+ class _BaseWithNxtIterator {
+ protected:
+ typename NXT_STAGE_T::StagedIterator _nxt;
+ };
+ class StagedIterator
+ : std::conditional_t<IS_BOTTOM, _BaseEmpty, _BaseWithNxtIterator> {
+ public:
+ StagedIterator() = default;
+ bool valid() const { return iter.has_value(); }
+ index_t index() const {
+ return iter->index();
+ }
+ bool is_end() const { return iter->is_end(); }
+ bool in_progress() const {
+ assert(valid());
+ if constexpr (!IS_BOTTOM) {
+ if (this->_nxt.valid()) {
+ if (this->_nxt.index() == 0) {
+ return this->_nxt.in_progress();
+ } else {
+ return true;
+ }
+ } else {
+ return false;
+ }
+ } else {
+ return false;
+ }
+ }
+ key_get_type get_key() const { return iter->get_key(); }
+
+ iterator_t& get() { return *iter; }
+ void set(const container_t& container) {
+ assert(!valid());
+ iter = iterator_t(container);
+ }
+ void set_end() { iter->set_end(); }
+ typename NXT_STAGE_T::StagedIterator& nxt() {
+ if constexpr (!IS_BOTTOM) {
+ if (!this->_nxt.valid()) {
+ auto nxt_container = iter->get_nxt_container();
+ this->_nxt.set(nxt_container);
+ }
+ return this->_nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ typename NXT_STAGE_T::StagedIterator& get_nxt() {
+ if constexpr (!IS_BOTTOM) {
+ return this->_nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ StagedIterator& operator++() {
+ if (iter->is_last()) {
+ iter->set_end();
+ } else {
+ ++(*iter);
+ }
+ if constexpr (!IS_BOTTOM) {
+ this->_nxt.reset();
+ }
+ return *this;
+ }
+ void reset() {
+ if (valid()) {
+ iter.reset();
+ if constexpr (!IS_BOTTOM) {
+ this->_nxt.reset();
+ }
+ }
+ }
+ std::ostream& print(std::ostream& os, bool is_top) const {
+ if (valid()) {
+ if (iter->is_end()) {
+ return os << "END";
+ } else {
+ os << index();
+ }
+ } else {
+ if (is_top) {
+ return os << "invalid StagedIterator!";
+ } else {
+ os << "0!";
+ }
+ }
+ if constexpr (!IS_BOTTOM) {
+ os << ", ";
+ return this->_nxt.print(os, false);
+ } else {
+ return os;
+ }
+ }
+ position_t get_pos() const {
+ if (valid()) {
+ if constexpr (IS_BOTTOM) {
+ return position_t{index()};
+ } else {
+ return position_t{index(), this->_nxt.get_pos()};
+ }
+ } else {
+ return position_t::begin();
+ }
+ }
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const {
+ uint8_t present = static_cast<bool>(iter);
+ ceph::encode(present, encoded);
+ if (iter.has_value()) {
+ iter->encode(p_node_start, encoded);
+ if constexpr (!IS_BOTTOM) {
+ this->_nxt.encode(p_node_start, encoded);
+ }
+ }
+ }
+ static StagedIterator decode(const char* p_node_start,
+ ceph::bufferlist::const_iterator& delta) {
+ StagedIterator ret;
+ uint8_t present;
+ ceph::decode(present, delta);
+ if (present) {
+ ret.iter = iterator_t::decode(p_node_start, delta);
+ if constexpr (!IS_BOTTOM) {
+ ret._nxt = NXT_STAGE_T::StagedIterator::decode(p_node_start, delta);
+ }
+ }
+ return ret;
+ }
+ friend std::ostream& operator<<(std::ostream& os, const StagedIterator& iter) {
+ return iter.print(os, true);
+ }
+ private:
+ std::optional<iterator_t> iter;
+ };
+
+ static bool recursively_locate_split(
+ size_t& current_size, size_t extra_size,
+ size_t target_size, StagedIterator& split_at) {
+ assert(current_size <= target_size);
+ iterator_t& split_iter = split_at.get();
+ current_size = split_iter.seek_split(current_size, extra_size, target_size);
+ assert(current_size <= target_size);
+ assert(!split_iter.is_end());
+ if (split_iter.index() == 0) {
+ extra_size += iterator_t::header_size();
+ } else {
+ extra_size = 0;
+ }
+ bool locate_nxt;
+ if constexpr (!IS_BOTTOM) {
+ locate_nxt = NXT_STAGE_T::recursively_locate_split(
+ current_size, extra_size + split_iter.size_to_nxt(),
+ target_size, split_at.nxt());
+ } else { // IS_BOTTOM
+ // located upper_bound, fair split strategy
+ size_t nxt_size = split_iter.size() + extra_size;
+ assert(current_size + nxt_size > target_size);
+ if (current_size + nxt_size/2 < target_size) {
+ // include next
+ current_size += nxt_size;
+ locate_nxt = true;
+ } else {
+ // exclude next
+ locate_nxt = false;
+ }
+ }
+ if (locate_nxt) {
+ if (split_iter.is_last()) {
+ return true;
+ } else {
+ ++split_at;
+ return false;
+ }
+ } else {
+ return false;
+ }
+ }
+
+ static bool recursively_locate_split_inserted(
+ size_t& current_size, size_t extra_size, size_t target_size,
+ position_t& insert_pos, match_stage_t insert_stage, size_t insert_size,
+ std::optional<bool>& is_insert_left, StagedIterator& split_at) {
+ assert(current_size <= target_size);
+ assert(!is_insert_left.has_value());
+ iterator_t& split_iter = split_at.get();
+ auto& insert_index = insert_pos.index;
+ if (insert_stage == STAGE) {
+ current_size = split_iter.template seek_split_inserted<true>(
+ current_size, extra_size, target_size,
+ insert_index, insert_size, is_insert_left);
+ assert(is_insert_left.has_value());
+ assert(current_size <= target_size);
+ if (split_iter.index() == 0) {
+ if (insert_index == 0) {
+ if (*is_insert_left == false) {
+ extra_size += iterator_t::header_size();
+ } else {
+ extra_size = 0;
+ }
+ } else {
+ extra_size += iterator_t::header_size();
+ }
+ } else {
+ extra_size = 0;
+ }
+ if (*is_insert_left == false && split_iter.index() == insert_index) {
+ // split_iter can be end
+ // found the lower-bound of target_size
+ // ...[s_index-1] |!| (i_index) [s_index]...
+
+ // located upper-bound, fair split strategy
+ // look at the next slot (the insert item)
+ size_t nxt_size = insert_size + extra_size;
+ assert(current_size + nxt_size > target_size);
+ if (current_size + nxt_size/2 < target_size) {
+ // include next
+ *is_insert_left = true;
+ current_size += nxt_size;
+ if (split_iter.is_end()) {
+ // ...[s_index-1] (i_index) |!|
+ return true;
+ } else {
+ return false;
+ }
+ } else {
+ // exclude next
+ return false;
+ }
+ } else {
+ // Already considered insert effect in the current stage.
+ // Look into the next stage to identify the target_size lower-bound w/o
+ // insert effect.
+ assert(!split_iter.is_end());
+ bool locate_nxt;
+ if constexpr (!IS_BOTTOM) {
+ locate_nxt = NXT_STAGE_T::recursively_locate_split(
+ current_size, extra_size + split_iter.size_to_nxt(),
+ target_size, split_at.nxt());
+ } else { // IS_BOTTOM
+ // located upper-bound, fair split strategy
+ // look at the next slot
+ size_t nxt_size = split_iter.size() + extra_size;
+ assert(current_size + nxt_size > target_size);
+ if (current_size + nxt_size/2 < target_size) {
+ // include next
+ current_size += nxt_size;
+ locate_nxt = true;
+ } else {
+ // exclude next
+ locate_nxt = false;
+ }
+ }
+ if (locate_nxt) {
+ if (split_iter.is_last()) {
+ auto end_index = split_iter.index() + 1;
+ if (insert_index == INDEX_END) {
+ insert_index = end_index;
+ }
+ assert(insert_index <= end_index);
+ if (insert_index == end_index) {
+ assert(*is_insert_left == false);
+ split_iter.set_end();
+ // ...[s_index-1] |!| (i_index)
+ return false;
+ } else {
+ assert(*is_insert_left == true);
+ return true;
+ }
+ } else {
+ ++split_at;
+ return false;
+ }
+ } else {
+ return false;
+ }
+ }
+ } else {
+ if constexpr (!IS_BOTTOM) {
+ assert(insert_stage < STAGE);
+ current_size = split_iter.template seek_split_inserted<false>(
+ current_size, extra_size, target_size,
+ insert_index, insert_size, is_insert_left);
+ assert(!split_iter.is_end());
+ assert(current_size <= target_size);
+ if (split_iter.index() == 0) {
+ extra_size += iterator_t::header_size();
+ } else {
+ extra_size = 0;
+ }
+ bool locate_nxt;
+ if (!is_insert_left.has_value()) {
+ // Considered insert effect in the current stage, and insert happens
+ // in the lower stage.
+ // Look into the next stage to identify the target_size lower-bound w/
+ // insert effect.
+ assert(split_iter.index() == insert_index);
+ locate_nxt = NXT_STAGE_T::recursively_locate_split_inserted(
+ current_size, extra_size + split_iter.size_to_nxt(), target_size,
+ insert_pos.nxt, insert_stage, insert_size,
+ is_insert_left, split_at.nxt());
+ assert(is_insert_left.has_value());
+#ifndef NDEBUG
+ if (locate_nxt) {
+ assert(*is_insert_left == true);
+ }
+#endif
+ } else {
+ // is_insert_left.has_value() == true
+ // Insert will *not* happen in the lower stage.
+ // Need to look into the next stage to identify the target_size
+ // lower-bound w/ insert effect
+ assert(split_iter.index() != insert_index);
+ locate_nxt = NXT_STAGE_T::recursively_locate_split(
+ current_size, extra_size + split_iter.size_to_nxt(),
+ target_size, split_at.nxt());
+#ifndef NDEBUG
+ if (split_iter.index() < insert_index) {
+ assert(*is_insert_left == false);
+ } else {
+ assert(*is_insert_left == true);
+ }
+#endif
+ }
+ if (locate_nxt) {
+ if (split_iter.is_last()) {
+ return true;
+ } else {
+ ++split_at;
+ return false;
+ }
+ } else {
+ return false;
+ }
+ } else {
+ ceph_abort("impossible path");
+ return false;
+ }
+ }
+ }
+
+ /*
+ * container appender type system
+ * container_t::Appender(NodeExtentMutable& mut, char* p_append)
+ * append(const container_t& src, index_t from, index_t items)
+ * wrap() -> char*
+ * IF !IS_BOTTOM:
+ * open_nxt(const key_get_type&)
+ * open_nxt(const full_key_t&)
+ * -> std::tuple<NodeExtentMutable&, char*>
+ * wrap_nxt(char* p_append)
+ * ELSE
+ * append(const full_key_t& key, const value_t& value)
+ */
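+ /*
+ * Illustrative sketch (not part of the contract above): how the appender
+ * machinery is typically driven when rebuilding a node, e.g. during a split.
+ * `mut`, `p_append`, `src_iter`, `pos`, `stage` and the key/value stand for
+ * values supplied by the node-layout caller and are assumptions of this
+ * sketch, not definitions from this file.
+ *
+ *   StagedAppender<KeyT::VIEW> appender;
+ *   appender.init(&mut, p_append);                  // open the destination
+ *   append_until(src_iter, appender, pos, stage);   // bulk-copy up to pos
+ *   append_insert(key, value, src_iter, appender,
+ *                 is_front_insert, stage, p_value); // splice in the new kv
+ *   char* p_front = appender.wrap();                // seal, get front bound
+ */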
+ template <KeyT KT>
+ struct _BaseWithNxtAppender {
+ typename NXT_STAGE_T::template StagedAppender<KT> _nxt;
+ };
+ template <KeyT KT>
+ class StagedAppender
+ : std::conditional_t<IS_BOTTOM, _BaseEmpty, _BaseWithNxtAppender<KT>> {
+ public:
+ StagedAppender() = default;
+ ~StagedAppender() {
+ assert(!require_wrap_nxt);
+ assert(!valid());
+ }
+ bool valid() const { return appender.has_value(); }
+ index_t index() const {
+ assert(valid());
+ return _index;
+ }
+ bool in_progress() const { return require_wrap_nxt; }
+ // TODO: pass by reference
+ void init(NodeExtentMutable* p_mut, char* p_start) {
+ assert(!valid());
+ appender = typename container_t::template Appender<KT>(p_mut, p_start);
+ _index = 0;
+ }
+ // possible to make src_iter end if to_index == INDEX_END
+ void append_until(StagedIterator& src_iter, index_t& to_index) {
+ assert(!require_wrap_nxt);
+ auto s_index = src_iter.index();
+ src_iter.get().template copy_out_until<KT>(*appender, to_index);
+ assert(src_iter.index() == to_index);
+ assert(to_index >= s_index);
+ auto increment = (to_index - s_index);
+ if (increment) {
+ _index += increment;
+ if constexpr (!IS_BOTTOM) {
+ src_iter.get_nxt().reset();
+ }
+ }
+ }
+ void append(const full_key_t<KT>& key,
+ const value_t& value, const value_t*& p_value) {
+ assert(!require_wrap_nxt);
+ if constexpr (!IS_BOTTOM) {
+ auto& nxt = open_nxt(key);
+ nxt.append(key, value, p_value);
+ wrap_nxt();
+ } else {
+ appender->append(key, value, p_value);
+ ++_index;
+ }
+ }
+ char* wrap() {
+ assert(valid());
+ assert(_index > 0);
+ if constexpr (!IS_BOTTOM) {
+ if (require_wrap_nxt) {
+ wrap_nxt();
+ }
+ }
+ auto ret = appender->wrap();
+ appender.reset();
+ return ret;
+ }
+ typename NXT_STAGE_T::template StagedAppender<KT>&
+ open_nxt(key_get_type partial_key) {
+ assert(!require_wrap_nxt);
+ if constexpr (!IS_BOTTOM) {
+ require_wrap_nxt = true;
+ auto [p_mut, p_append] = appender->open_nxt(partial_key);
+ this->_nxt.init(p_mut, p_append);
+ return this->_nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ typename NXT_STAGE_T::template StagedAppender<KT>&
+ open_nxt(const full_key_t<KT>& key) {
+ assert(!require_wrap_nxt);
+ if constexpr (!IS_BOTTOM) {
+ require_wrap_nxt = true;
+ auto [p_mut, p_append] = appender->open_nxt(key);
+ this->_nxt.init(p_mut, p_append);
+ return this->_nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ typename NXT_STAGE_T::template StagedAppender<KT>& get_nxt() {
+ if constexpr (!IS_BOTTOM) {
+ assert(require_wrap_nxt);
+ return this->_nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ void wrap_nxt() {
+ if constexpr (!IS_BOTTOM) {
+ assert(require_wrap_nxt);
+ require_wrap_nxt = false;
+ auto p_append = this->_nxt.wrap();
+ appender->wrap_nxt(p_append);
+ ++_index;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ private:
+ std::optional<typename container_t::template Appender<KT>> appender;
+ index_t _index;
+ bool require_wrap_nxt = false;
+ };
+
+ template <KeyT KT>
+ static void _append_range(
+ StagedIterator& src_iter, StagedAppender<KT>& appender, index_t& to_index) {
+ if (src_iter.is_end()) {
+ // append done
+ assert(to_index == INDEX_END);
+ to_index = src_iter.index();
+ } else if constexpr (!IS_BOTTOM) {
+ if (appender.in_progress()) {
+ // appender has appended something at the current item,
+ // cannot append the current item as-a-whole
+ index_t to_index_nxt = INDEX_END;
+ NXT_STAGE_T::template _append_range<KT>(
+ src_iter.nxt(), appender.get_nxt(), to_index_nxt);
+ ++src_iter;
+ appender.wrap_nxt();
+ } else if (src_iter.in_progress()) {
+ // src_iter is not at the beginning of the current item,
+ // cannot append the current item as-a-whole
+ index_t to_index_nxt = INDEX_END;
+ NXT_STAGE_T::template _append_range<KT>(
+ src_iter.nxt(), appender.open_nxt(src_iter.get_key()), to_index_nxt);
+ ++src_iter;
+ appender.wrap_nxt();
+ } else {
+ // we can safely append the current item as-a-whole
+ }
+ }
+ appender.append_until(src_iter, to_index);
+ }
+
+ template <KeyT KT>
+ static void _append_into(StagedIterator& src_iter, StagedAppender<KT>& appender,
+ position_t& position, match_stage_t stage) {
+ assert(position.index == src_iter.index());
+ // reaches the last item
+ if (stage == STAGE) {
+ // done, end recursion
+ if constexpr (!IS_BOTTOM) {
+ position.nxt = position_t::nxt_t::begin();
+ }
+ } else {
+ assert(stage < STAGE);
+ // proceed append in the next stage
+ NXT_STAGE_T::template append_until<KT>(
+ src_iter.nxt(), appender.open_nxt(src_iter.get_key()),
+ position.nxt, stage);
+ }
+ }
+
+ template <KeyT KT>
+ static void append_until(StagedIterator& src_iter, StagedAppender<KT>& appender,
+ position_t& position, match_stage_t stage) {
+ index_t from_index = src_iter.index();
+ index_t& to_index = position.index;
+ assert(from_index <= to_index);
+ if constexpr (IS_BOTTOM) {
+ assert(stage == STAGE);
+ appender.append_until(src_iter, to_index);
+ } else {
+ assert(stage <= STAGE);
+ if (src_iter.index() == to_index) {
+ _append_into<KT>(src_iter, appender, position, stage);
+ } else {
+ if (to_index == INDEX_END) {
+ assert(stage == STAGE);
+ } else if (to_index == INDEX_LAST) {
+ assert(stage < STAGE);
+ }
+ _append_range<KT>(src_iter, appender, to_index);
+ _append_into<KT>(src_iter, appender, position, stage);
+ }
+ }
+ to_index -= from_index;
+ }
+
+ template <KeyT KT>
+ static bool append_insert(
+ const full_key_t<KT>& key, const value_t& value,
+ StagedIterator& src_iter, StagedAppender<KT>& appender,
+ bool is_front_insert, match_stage_t& stage, const value_t*& p_value) {
+ assert(src_iter.valid());
+ if (stage == STAGE) {
+ appender.append(key, value, p_value);
+ if (src_iter.is_end()) {
+ return true;
+ } else {
+ return false;
+ }
+ } else {
+ assert(stage < STAGE);
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_is_end = NXT_STAGE_T::template append_insert<KT>(
+ key, value, src_iter.get_nxt(), appender.get_nxt(),
+ is_front_insert, stage, p_value);
+ if (nxt_is_end) {
+ appender.wrap_nxt();
+ ++src_iter;
+ if (is_front_insert) {
+ stage = STAGE;
+ }
+ if (src_iter.is_end()) {
+ return true;
+ }
+ }
+ return false;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ }
+
+ /* TrimType:
+ * BEFORE: remove the entire container; normally the corresponding higher
+ *   stage iterator needs to be trimmed as-a-whole.
+ * AFTER: retain the entire container; normally the trim should start from
+ *   the next iterator at the higher stage.
+ * AT: the trim happens inside the current container, and the corresponding
+ *   higher stage iterator needs to be adjusted by the trimmed size.
+ */
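+ /* Illustrative example (not part of the original comment): trimming a
+ * 3-stage node at position (2, 0, 0) keeps STAGE_LEFT items 0 and 1 and
+ * drops everything from item 2 onwards; the two lower stages report BEFORE
+ * while STAGE_LEFT reports AT.  Trimming at an end() iterator reports AFTER
+ * and trims nothing in this container.
+ */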
+ static std::tuple<TrimType, node_offset_t>
+ recursively_trim(NodeExtentMutable& mut, StagedIterator& trim_at) {
+ if (!trim_at.valid()) {
+ return {TrimType::BEFORE, 0u};
+ }
+ if (trim_at.is_end()) {
+ return {TrimType::AFTER, 0u};
+ }
+
+ auto& iter = trim_at.get();
+ if constexpr (!IS_BOTTOM) {
+ auto [type, trimmed] = NXT_STAGE_T::recursively_trim(
+ mut, trim_at.get_nxt());
+ node_offset_t trim_size;
+ if (type == TrimType::AFTER) {
+ if (iter.is_last()) {
+ return {TrimType::AFTER, 0u};
+ }
+ ++trim_at;
+ trim_size = iter.trim_until(mut);
+ } else if (type == TrimType::BEFORE) {
+ if (iter.index() == 0) {
+ return {TrimType::BEFORE, 0u};
+ }
+ trim_size = iter.trim_until(mut);
+ } else {
+ trim_size = iter.trim_at(mut, trimmed);
+ }
+ return {TrimType::AT, trim_size};
+ } else {
+ if (iter.index() == 0) {
+ return {TrimType::BEFORE, 0u};
+ } else {
+ auto trimmed = iter.trim_until(mut);
+ return {TrimType::AT, trimmed};
+ }
+ }
+ }
+
+ static void trim(NodeExtentMutable& mut, StagedIterator& trim_at) {
+ auto [type, trimmed] = recursively_trim(mut, trim_at);
+ if (type == TrimType::BEFORE) {
+ assert(trim_at.valid());
+ auto& iter = trim_at.get();
+ iter.trim_until(mut);
+ }
+ }
+};
+
+/**
+ * Configurations for struct staged
+ *
+ * staged_params_* assembles different container_t implementations (defined by
+ * staged::_iterator_t) by STAGE, and constructs the final multi-stage
+ * implementations for different node layouts defined by
+ * node_extent_t<FieldType, NODE_TYPE>.
+ *
+ * The specialized implementations for different layouts are accessible through
+ * the helper type node_to_stage_t<node_extent_t<FieldType, NODE_TYPE>>.
+ *
+ * Specifically, the settings of 8 layouts are:
+ *
+ * The layout (N0, LEAF/INTERNAL) has 3 stages:
+ * - STAGE_LEFT: node_extent_t<node_fields_0_t, LEAF/INTERNAL>
+ * - STAGE_STRING: item_iterator_t<LEAF/INTERNAL>
+ * - STAGE_RIGHT: sub_items_t<LEAF/INTERNAL>
+ *
+ * The layout (N1, LEAF/INTERNAL) has 3 stages:
+ * - STAGE_LEFT: node_extent_t<node_fields_1_t, LEAF/INTERNAL>
+ * - STAGE_STRING: item_iterator_t<LEAF/INTERNAL>
+ * - STAGE_RIGHT: sub_items_t<LEAF/INTERNAL>
+ *
+ * The layout (N2, LEAF/INTERNAL) has 2 stages:
+ * - STAGE_STRING: node_extent_t<node_fields_2_t, LEAF/INTERNAL>
+ * - STAGE_RIGHT: sub_items_t<LEAF/INTERNAL>
+ *
+ * The layout (N3, LEAF) has 1 stage:
+ * - STAGE_RIGHT: node_extent_t<leaf_fields_3_t, LEAF>
+ *
+ * The layout (N3, INTERNAL) has 1 stage:
+ * - STAGE_RIGHT: node_extent_t<internal_fields_3_t, INTERNAL>
+ */
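+/*
+ * Usage sketch (illustrative only; `container`, `key` and `history` are
+ * assumed caller-side values): the multi-stage implementation for a node
+ * layout is obtained through node_to_stage_t declared at the end of this
+ * file, e.g.
+ *
+ *   using layout_t = node_extent_t<node_fields_0_t, node_type_t::LEAF>;
+ *   using stage_t  = node_to_stage_t<layout_t>; // 3 stages: LEFT/STRING/RIGHT
+ *   auto result = stage_t::lower_bound(container, key, history);
+ */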
+
+template <node_type_t _NODE_TYPE>
+struct staged_params_subitems {
+ using container_t = sub_items_t<_NODE_TYPE>;
+ static constexpr auto NODE_TYPE = _NODE_TYPE;
+ static constexpr auto STAGE = STAGE_RIGHT;
+
+ // dummy type in order to make our type system work
+ // any better solution to get rid of this?
+ using next_param_t = staged_params_subitems<NODE_TYPE>;
+};
+
+template <node_type_t _NODE_TYPE>
+struct staged_params_item_iterator {
+ using container_t = item_iterator_t<_NODE_TYPE>;
+ static constexpr auto NODE_TYPE = _NODE_TYPE;
+ static constexpr auto STAGE = STAGE_STRING;
+
+ using next_param_t = staged_params_subitems<NODE_TYPE>;
+};
+
+template <typename NodeType>
+struct staged_params_node_01 {
+ using container_t = NodeType;
+ static constexpr auto NODE_TYPE = NodeType::NODE_TYPE;
+ static constexpr auto STAGE = STAGE_LEFT;
+
+ using next_param_t = staged_params_item_iterator<NODE_TYPE>;
+};
+
+template <typename NodeType>
+struct staged_params_node_2 {
+ using container_t = NodeType;
+ static constexpr auto NODE_TYPE = NodeType::NODE_TYPE;
+ static constexpr auto STAGE = STAGE_STRING;
+
+ using next_param_t = staged_params_subitems<NODE_TYPE>;
+};
+
+template <typename NodeType>
+struct staged_params_node_3 {
+ using container_t = NodeType;
+ static constexpr auto NODE_TYPE = NodeType::NODE_TYPE;
+ static constexpr auto STAGE = STAGE_RIGHT;
+
+ // dummy type in order to make our type system work
+ // any better solution to get rid of this?
+ using next_param_t = staged_params_node_3<NodeType>;
+};
+
+template <typename NodeType, typename Enable = void> struct _node_to_stage_t;
+template <typename NodeType>
+struct _node_to_stage_t<NodeType,
+ std::enable_if_t<NodeType::FIELD_TYPE == field_type_t::N0 ||
+ NodeType::FIELD_TYPE == field_type_t::N1>> {
+ using type = staged<staged_params_node_01<NodeType>>;
+};
+template <typename NodeType>
+struct _node_to_stage_t<NodeType,
+ std::enable_if_t<NodeType::FIELD_TYPE == field_type_t::N2>> {
+ using type = staged<staged_params_node_2<NodeType>>;
+};
+template <typename NodeType>
+struct _node_to_stage_t<NodeType,
+ std::enable_if_t<NodeType::FIELD_TYPE == field_type_t::N3>> {
+ using type = staged<staged_params_node_3<NodeType>>;
+};
+template <typename NodeType>
+using node_to_stage_t = typename _node_to_stage_t<NodeType>::type;
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h
new file mode 100644
index 000000000..a9d5cef3b
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h
@@ -0,0 +1,411 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <optional>
+#include <ostream>
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/fwd.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/tree_types.h"
+
+namespace crimson::os::seastore::onode {
+
+using match_stage_t = int8_t;
+constexpr match_stage_t STAGE_LEFT = 2; // shard/pool/crush
+constexpr match_stage_t STAGE_STRING = 1; // nspace/oid
+constexpr match_stage_t STAGE_RIGHT = 0; // snap/gen
+constexpr auto STAGE_TOP = STAGE_LEFT;
+constexpr auto STAGE_BOTTOM = STAGE_RIGHT;
+constexpr bool is_valid_stage(match_stage_t stage) {
+ return std::clamp(stage, STAGE_BOTTOM, STAGE_TOP) == stage;
+}
+// TODO: replace by
+// using match_history_t = int8_t;
+// left_m, str_m, right_m
+// 3: GT,
+// 2: EQ, GT,
+// 1: EQ, EQ, GT
+// 0: EQ, EQ, EQ
+// -1: EQ, EQ, LT
+// -2: EQ, LT,
+// -3: LT,
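+// (illustration of the proposed encoding, not implemented here: a history of
+//  left=EQ, str=GT would encode as 2, and left=EQ, str=EQ, right=LT as -1)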
+
+struct MatchHistory {
+ template <match_stage_t STAGE>
+ const std::optional<MatchKindCMP>& get() const {
+ static_assert(is_valid_stage(STAGE));
+ if constexpr (STAGE == STAGE_RIGHT) {
+ return right_match;
+ } else if (STAGE == STAGE_STRING) {
+ return string_match;
+ } else {
+ return left_match;
+ }
+ }
+
+ const std::optional<MatchKindCMP>&
+ get_by_stage(match_stage_t stage) const {
+ assert(is_valid_stage(stage));
+ if (stage == STAGE_RIGHT) {
+ return right_match;
+ } else if (stage == STAGE_STRING) {
+ return string_match;
+ } else {
+ return left_match;
+ }
+ }
+
+ template <match_stage_t STAGE = STAGE_TOP>
+ const bool is_GT() const;
+
+ template <match_stage_t STAGE>
+ void set(MatchKindCMP match) {
+ static_assert(is_valid_stage(STAGE));
+ if constexpr (STAGE < STAGE_TOP) {
+ assert(*get<STAGE + 1>() == MatchKindCMP::EQ);
+ }
+ assert(!get<STAGE>().has_value() || *get<STAGE>() != MatchKindCMP::EQ);
+ const_cast<std::optional<MatchKindCMP>&>(get<STAGE>()) = match;
+ }
+
+ std::ostream& dump(std::ostream& os) const {
+ os << "history(";
+ dump_each(os, left_match) << ", ";
+ dump_each(os, string_match) << ", ";
+ dump_each(os, right_match) << ")";
+ return os;
+ }
+
+ std::ostream& dump_each(
+ std::ostream& os, const std::optional<MatchKindCMP>& match) const {
+ if (!match.has_value()) {
+ return os << "--";
+ } else if (*match == MatchKindCMP::LT) {
+ return os << "LT";
+ } else if (*match == MatchKindCMP::EQ) {
+ return os << "EQ";
+ } else if (*match == MatchKindCMP::GT) {
+ return os << "GT";
+ } else {
+ ceph_abort("impossble path");
+ }
+ }
+
+ std::optional<MatchKindCMP> left_match;
+ std::optional<MatchKindCMP> string_match;
+ std::optional<MatchKindCMP> right_match;
+};
+inline std::ostream& operator<<(std::ostream& os, const MatchHistory& pos) {
+ return pos.dump(os);
+}
+
+template <match_stage_t STAGE>
+struct _check_GT_t {
+ static bool eval(const MatchHistory* history) {
+ return history->get<STAGE>() &&
+ (*history->get<STAGE>() == MatchKindCMP::GT ||
+ (*history->get<STAGE>() == MatchKindCMP::EQ &&
+ _check_GT_t<STAGE - 1>::eval(history)));
+ }
+};
+template <>
+struct _check_GT_t<STAGE_RIGHT> {
+ static bool eval(const MatchHistory* history) {
+ return history->get<STAGE_RIGHT>() &&
+ *history->get<STAGE_RIGHT>() == MatchKindCMP::GT;
+ }
+};
+template <match_stage_t STAGE>
+const bool MatchHistory::is_GT() const {
+ static_assert(is_valid_stage(STAGE));
+ if constexpr (STAGE < STAGE_TOP) {
+ assert(get<STAGE + 1>() == MatchKindCMP::EQ);
+ }
+ return _check_GT_t<STAGE>::eval(this);
+}
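+// For example, a history of (left=EQ, str=GT) makes is_GT() true while
+// (left=EQ, str=LT) makes it false: the first non-EQ comparison starting
+// from the top stage decides the result.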
+
+template <match_stage_t STAGE>
+struct staged_position_t {
+ static_assert(is_valid_stage(STAGE));
+ using me_t = staged_position_t<STAGE>;
+ using nxt_t = staged_position_t<STAGE - 1>;
+ bool is_end() const {
+ if (index == INDEX_END) {
+ return true;
+ } else {
+ assert(is_valid_index(index));
+ return false;
+ }
+ }
+ index_t& index_by_stage(match_stage_t stage) {
+ assert(stage <= STAGE);
+ if (STAGE == stage) {
+ return index;
+ } else {
+ return nxt.index_by_stage(stage);
+ }
+ }
+
+ int cmp(const me_t& o) const {
+ if (index > o.index) {
+ return 1;
+ } else if (index < o.index) {
+ return -1;
+ } else {
+ return nxt.cmp(o.nxt);
+ }
+ }
+ bool operator>(const me_t& o) const { return cmp(o) > 0; }
+ bool operator>=(const me_t& o) const { return cmp(o) >= 0; }
+ bool operator<(const me_t& o) const { return cmp(o) < 0; }
+ bool operator<=(const me_t& o) const { return cmp(o) <= 0; }
+ bool operator==(const me_t& o) const { return cmp(o) == 0; }
+ bool operator!=(const me_t& o) const { return cmp(o) != 0; }
+
+ me_t& operator-=(const me_t& o) {
+ assert(is_valid_index(o.index));
+ assert(index >= o.index);
+ if (index != INDEX_END) {
+ assert(is_valid_index(index));
+ index -= o.index;
+ if (index == 0) {
+ nxt -= o.nxt;
+ }
+ }
+ return *this;
+ }
+
+ void encode(ceph::bufferlist& encoded) const {
+ ceph::encode(index, encoded);
+ nxt.encode(encoded);
+ }
+
+ static me_t decode(ceph::bufferlist::const_iterator& delta) {
+ me_t ret;
+ ceph::decode(ret.index, delta);
+ ret.nxt = nxt_t::decode(delta);
+ return ret;
+ }
+
+ static me_t begin() { return {0u, nxt_t::begin()}; }
+ static me_t end() {
+ return {INDEX_END, nxt_t::end()};
+ }
+
+ index_t index;
+ nxt_t nxt;
+};
+template <match_stage_t STAGE>
+std::ostream& operator<<(std::ostream& os, const staged_position_t<STAGE>& pos) {
+ if (pos.index == INDEX_END) {
+ os << "END";
+ } else if (pos.index == INDEX_LAST) {
+ os << "LAST";
+ } else {
+ os << pos.index;
+ assert(is_valid_index(pos.index));
+ }
+ return os << ", " << pos.nxt;
+}
+
+template <>
+struct staged_position_t<STAGE_BOTTOM> {
+ using me_t = staged_position_t<STAGE_BOTTOM>;
+ bool is_end() const {
+ if (index == INDEX_END) {
+ return true;
+ } else {
+ assert(is_valid_index(index));
+ return false;
+ }
+ }
+ index_t& index_by_stage(match_stage_t stage) {
+ assert(stage == STAGE_BOTTOM);
+ return index;
+ }
+
+ int cmp(const staged_position_t<STAGE_BOTTOM>& o) const {
+ if (index > o.index) {
+ return 1;
+ } else if (index < o.index) {
+ return -1;
+ } else {
+ return 0;
+ }
+ }
+ bool operator>(const me_t& o) const { return cmp(o) > 0; }
+ bool operator>=(const me_t& o) const { return cmp(o) >= 0; }
+ bool operator<(const me_t& o) const { return cmp(o) < 0; }
+ bool operator<=(const me_t& o) const { return cmp(o) <= 0; }
+ bool operator==(const me_t& o) const { return cmp(o) == 0; }
+ bool operator!=(const me_t& o) const { return cmp(o) != 0; }
+
+ me_t& operator-=(const me_t& o) {
+ assert(is_valid_index(o.index));
+ assert(index >= o.index);
+ if (index != INDEX_END) {
+ assert(is_valid_index(index));
+ index -= o.index;
+ }
+ return *this;
+ }
+
+ void encode(ceph::bufferlist& encoded) const {
+ ceph::encode(index, encoded);
+ }
+
+ static me_t decode(ceph::bufferlist::const_iterator& delta) {
+ me_t ret;
+ ceph::decode(ret.index, delta);
+ return ret;
+ }
+
+ static me_t begin() { return {0u}; }
+ static me_t end() { return {INDEX_END}; }
+
+ index_t index;
+};
+template <>
+inline std::ostream& operator<<(std::ostream& os, const staged_position_t<STAGE_BOTTOM>& pos) {
+ if (pos.index == INDEX_END) {
+ os << "END";
+ } else if (pos.index == INDEX_LAST) {
+ os << "LAST";
+ } else {
+ os << pos.index;
+ assert(is_valid_index(pos.index));
+ }
+ return os;
+}
+
+using search_position_t = staged_position_t<STAGE_TOP>;
+
+template <match_stage_t STAGE>
+const staged_position_t<STAGE>& cast_down(const search_position_t& pos) {
+ if constexpr (STAGE == STAGE_LEFT) {
+ return pos;
+ } else if constexpr (STAGE == STAGE_STRING) {
+#ifndef NDEBUG
+ if (pos.is_end()) {
+ assert(pos.nxt.is_end());
+ } else {
+ assert(pos.index == 0u);
+ }
+#endif
+ return pos.nxt;
+ } else if constexpr (STAGE == STAGE_RIGHT) {
+#ifndef NDEBUG
+ if (pos.is_end()) {
+ assert(pos.nxt.nxt.is_end());
+ } else {
+ assert(pos.index == 0u);
+ assert(pos.nxt.index == 0u);
+ }
+#endif
+ return pos.nxt.nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+}
+
+template <match_stage_t STAGE>
+staged_position_t<STAGE>& cast_down(search_position_t& pos) {
+ const search_position_t& _pos = pos;
+ return const_cast<staged_position_t<STAGE>&>(cast_down<STAGE>(_pos));
+}
+
+template <match_stage_t STAGE>
+staged_position_t<STAGE>& cast_down_fill_0(search_position_t& pos) {
+ if constexpr (STAGE == STAGE_LEFT) {
+ return pos;
+ } else if constexpr (STAGE == STAGE_STRING) {
+ pos.index = 0;
+ return pos.nxt;
+ } else if constexpr (STAGE == STAGE_RIGHT) {
+ pos.index = 0;
+ pos.nxt.index = 0;
+ return pos.nxt.nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+}
+
+inline search_position_t&& normalize(search_position_t&& pos) { return std::move(pos); }
+
+template <match_stage_t STAGE, typename = std::enable_if_t<STAGE != STAGE_TOP>>
+search_position_t normalize(staged_position_t<STAGE>&& pos) {
+ if (pos.is_end()) {
+ return search_position_t::end();
+ }
+ if constexpr (STAGE == STAGE_STRING) {
+ return {0u, std::move(pos)};
+ } else if (STAGE == STAGE_RIGHT) {
+ return {0u, {0u, std::move(pos)}};
+ } else {
+ ceph_abort("impossible path");
+ }
+}
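+// For instance, normalizing a STAGE_RIGHT position with index 5 yields the
+// full search position {0, {0, {5}}}, i.e. the higher stages are padded with
+// index 0; an end() position normalizes to search_position_t::end().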
+
+struct memory_range_t {
+ const char* p_start;
+ const char* p_end;
+};
+
+enum class ContainerType { ITERATIVE, INDEXABLE };
+
+template <node_type_t> struct value_type;
+template<> struct value_type<node_type_t::INTERNAL> { using type = laddr_packed_t; };
+template<> struct value_type<node_type_t::LEAF> { using type = onode_t; };
+template <node_type_t NODE_TYPE>
+using value_type_t = typename value_type<NODE_TYPE>::type;
+
+template <node_type_t NODE_TYPE, match_stage_t STAGE>
+struct staged_result_t {
+ using me_t = staged_result_t<NODE_TYPE, STAGE>;
+ bool is_end() const { return position.is_end(); }
+
+ static me_t end() {
+ return {staged_position_t<STAGE>::end(), nullptr, MSTAT_END};
+ }
+ template <typename T = me_t>
+ static std::enable_if_t<STAGE != STAGE_BOTTOM, T> from_nxt(
+ index_t index, const staged_result_t<NODE_TYPE, STAGE - 1>& nxt_stage_result) {
+ return {{index, nxt_stage_result.position},
+ nxt_stage_result.p_value,
+ nxt_stage_result.mstat};
+ }
+
+ staged_position_t<STAGE> position;
+ const value_type_t<NODE_TYPE>* p_value;
+ match_stat_t mstat;
+};
+
+template <node_type_t NODE_TYPE>
+using lookup_result_t = staged_result_t<NODE_TYPE, STAGE_TOP>;
+
+template <node_type_t NODE_TYPE>
+lookup_result_t<NODE_TYPE>&& normalize(
+ lookup_result_t<NODE_TYPE>&& result) { return std::move(result); }
+
+template <node_type_t NODE_TYPE, match_stage_t STAGE,
+ typename = std::enable_if_t<STAGE != STAGE_TOP>>
+lookup_result_t<NODE_TYPE> normalize(
+ staged_result_t<NODE_TYPE, STAGE>&& result) {
+ // FIXME: assert result.mstat correct
+ return {normalize(std::move(result.position)), result.p_value, result.mstat};
+}
+
+struct node_stats_t {
+ size_t size_persistent = 0;
+ size_t size_filled = 0;
+ // filled by staged::get_stats()
+ size_t size_logical = 0;
+ size_t size_overhead = 0;
+ size_t size_value = 0;
+ unsigned num_kvs = 0;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc
new file mode 100644
index 000000000..aaca6c3c6
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc
@@ -0,0 +1,208 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "sub_items_stage.h"
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h"
+
+namespace crimson::os::seastore::onode {
+
+template <KeyT KT>
+const laddr_packed_t* internal_sub_items_t::insert_at(
+ NodeExtentMutable& mut, const internal_sub_items_t& sub_items,
+ const full_key_t<KT>& key, const laddr_packed_t& value,
+ index_t index, node_offset_t size, const char* p_left_bound) {
+ assert(index <= sub_items.keys());
+ assert(size == estimate_insert<KT>(key, value));
+ const char* p_shift_start = p_left_bound;
+ const char* p_shift_end = reinterpret_cast<const char*>(
+ sub_items.p_first_item + 1 - index);
+ mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, -(int)size);
+
+ auto p_insert = const_cast<char*>(p_shift_end) - size;
+ auto item = internal_sub_item_t{snap_gen_t::from_key<KT>(key), value};
+ mut.copy_in_absolute(p_insert, item);
+ return &reinterpret_cast<internal_sub_item_t*>(p_insert)->value;
+}
+#define IA_TEMPLATE(KT) \
+ template const laddr_packed_t* internal_sub_items_t::insert_at<KT>( \
+ NodeExtentMutable&, const internal_sub_items_t&, const full_key_t<KT>&, \
+ const laddr_packed_t&, index_t, node_offset_t, const char*)
+IA_TEMPLATE(KeyT::VIEW);
+IA_TEMPLATE(KeyT::HOBJ);
+
+node_offset_t internal_sub_items_t::trim_until(
+ NodeExtentMutable&, internal_sub_items_t& items, index_t index) {
+ assert(index != 0);
+ auto keys = items.keys();
+ assert(index <= keys);
+ size_t ret = sizeof(internal_sub_item_t) * (keys - index);
+ assert(ret < NODE_BLOCK_SIZE);
+ return ret;
+}
+
+template <KeyT KT>
+void internal_sub_items_t::Appender<KT>::append(
+ const internal_sub_items_t& src, index_t from, index_t items) {
+ assert(from <= src.keys());
+ if (items == 0) {
+ return;
+ }
+ assert(from < src.keys());
+ assert(from + items <= src.keys());
+ node_offset_t size = sizeof(internal_sub_item_t) * items;
+ p_append -= size;
+ p_mut->copy_in_absolute(p_append, src.p_first_item + 1 - from - items, size);
+}
+
+template <KeyT KT>
+void internal_sub_items_t::Appender<KT>::append(
+ const full_key_t<KT>& key, const laddr_packed_t& value,
+ const laddr_packed_t*& p_value) {
+ p_append -= sizeof(internal_sub_item_t);
+ auto item = internal_sub_item_t{snap_gen_t::from_key<KT>(key), value};
+ p_mut->copy_in_absolute(p_append, item);
+ p_value = &reinterpret_cast<internal_sub_item_t*>(p_append)->value;
+}
+
+template <KeyT KT>
+const onode_t* leaf_sub_items_t::insert_at(
+ NodeExtentMutable& mut, const leaf_sub_items_t& sub_items,
+ const full_key_t<KT>& key, const onode_t& value,
+ index_t index, node_offset_t size, const char* p_left_bound) {
+ assert(index <= sub_items.keys());
+ assert(size == estimate_insert<KT>(key, value));
+ // a. [... item(index)] << size
+ const char* p_shift_start = p_left_bound;
+ const char* p_shift_end = sub_items.get_item_end(index);
+ mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, -(int)size);
+
+ // b. insert item
+ auto p_insert = const_cast<char*>(p_shift_end - size);
+ auto p_value = reinterpret_cast<const onode_t*>(p_insert);
+ mut.copy_in_absolute(p_insert, &value, value.size);
+ p_insert += value.size;
+ mut.copy_in_absolute(p_insert, snap_gen_t::template from_key<KT>(key));
+ assert(p_insert + sizeof(snap_gen_t) + sizeof(node_offset_t) == p_shift_end);
+
+ // c. compensate affected offsets
+ auto item_size = value.size + sizeof(snap_gen_t);
+ for (auto i = index; i < sub_items.keys(); ++i) {
+ const node_offset_packed_t& offset_i = sub_items.get_offset(i);
+ mut.copy_in_absolute((void*)&offset_i, node_offset_t(offset_i.value + item_size));
+ }
+
+ // d. [item(index-1) ... item(0) ... offset(index)] <<< sizeof(node_offset_t)
+ const char* p_offset = (index == 0 ?
+ (const char*)&sub_items.get_offset(0) + sizeof(node_offset_t) :
+ (const char*)&sub_items.get_offset(index - 1));
+ p_shift_start = p_shift_end;
+ p_shift_end = p_offset;
+ mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, -(int)sizeof(node_offset_t));
+
+ // e. insert offset
+ node_offset_t offset_to_item_start = item_size + sub_items.get_offset_to_end(index);
+ mut.copy_in_absolute(
+ const_cast<char*>(p_shift_end) - sizeof(node_offset_t), offset_to_item_start);
+
+ // f. update num_sub_keys
+ mut.copy_in_absolute((void*)sub_items.p_num_keys, num_keys_t(sub_items.keys() + 1));
+
+ return p_value;
+}
+template const onode_t* leaf_sub_items_t::insert_at<KeyT::HOBJ>(
+ NodeExtentMutable&, const leaf_sub_items_t&, const full_key_t<KeyT::HOBJ>&,
+ const onode_t&, index_t, node_offset_t, const char*);
+
+node_offset_t leaf_sub_items_t::trim_until(
+ NodeExtentMutable& mut, leaf_sub_items_t& items, index_t index) {
+ assert(index != 0);
+ auto keys = items.keys();
+ assert(index <= keys);
+ if (index == keys) {
+ return 0;
+ }
+ index_t trim_items = keys - index;
+ const char* p_items_start = items.p_start();
+ const char* p_shift_start = items.get_item_end(index);
+ const char* p_shift_end = items.get_item_end(0);
+ size_t size_trim_offsets = sizeof(node_offset_t) * trim_items;
+ mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start,
+ size_trim_offsets);
+ mut.copy_in_absolute((void*)items.p_num_keys, num_keys_t(index));
+ size_t ret = size_trim_offsets + (p_shift_start - p_items_start);
+ assert(ret < NODE_BLOCK_SIZE);
+ return ret;
+}
+
+template class internal_sub_items_t::Appender<KeyT::VIEW>;
+template class internal_sub_items_t::Appender<KeyT::HOBJ>;
+
+// helper type for the visitor
+template<class... Ts> struct overloaded : Ts... { using Ts::operator()...; };
+// explicit deduction guide
+template<class... Ts> overloaded(Ts...) -> overloaded<Ts...>;
+
+template <KeyT KT>
+char* leaf_sub_items_t::Appender<KT>::wrap() {
+ auto p_cur = p_append;
+ num_keys_t num_keys = 0;
+ for (auto i = 0u; i < cnt; ++i) {
+ auto& a = appends[i];
+ std::visit(overloaded {
+ [&] (const range_items_t& arg) { num_keys += arg.items; },
+ [&] (const kv_item_t& arg) { ++num_keys; }
+ }, a);
+ }
+ assert(num_keys);
+ p_cur -= sizeof(num_keys_t);
+ p_mut->copy_in_absolute(p_cur, num_keys);
+
+ node_offset_t last_offset = 0;
+ for (auto i = 0u; i < cnt; ++i) {
+ auto& a = appends[i];
+ std::visit(overloaded {
+ [&] (const range_items_t& arg) {
+ int compensate = (last_offset - op_src->get_offset_to_end(arg.from));
+ node_offset_t offset;
+ for (auto i = arg.from; i < arg.from + arg.items; ++i) {
+ offset = op_src->get_offset(i).value + compensate;
+ p_cur -= sizeof(node_offset_t);
+ p_mut->copy_in_absolute(p_cur, offset);
+ }
+ last_offset = offset;
+ },
+ [&] (const kv_item_t& arg) {
+ last_offset += sizeof(snap_gen_t) + arg.p_value->size;
+ p_cur -= sizeof(node_offset_t);
+ p_mut->copy_in_absolute(p_cur, last_offset);
+ }
+ }, a);
+ }
+
+ for (auto i = 0u; i < cnt; ++i) {
+ auto& a = appends[i];
+ std::visit(overloaded {
+ [&] (const range_items_t& arg) {
+ auto _p_start = op_src->get_item_end(arg.from + arg.items);
+ size_t _len = op_src->get_item_end(arg.from) - _p_start;
+ p_cur -= _len;
+ p_mut->copy_in_absolute(p_cur, _p_start, _len);
+ },
+ [&] (const kv_item_t& arg) {
+ assert(pp_value);
+ p_cur -= sizeof(snap_gen_t);
+ p_mut->copy_in_absolute(p_cur, snap_gen_t::template from_key<KT>(*arg.p_key));
+ p_cur -= arg.p_value->size;
+ p_mut->copy_in_absolute(p_cur, arg.p_value, arg.p_value->size);
+ *pp_value = reinterpret_cast<const onode_t*>(p_cur);
+ }
+ }, a);
+ }
+ return p_cur;
+}
+
+template class leaf_sub_items_t::Appender<KeyT::VIEW>;
+template class leaf_sub_items_t::Appender<KeyT::HOBJ>;
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h
new file mode 100644
index 000000000..8ef5f7472
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h
@@ -0,0 +1,341 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <variant>
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h"
+#include "key_layout.h"
+#include "stage_types.h"
+
+namespace crimson::os::seastore::onode {
+
+class NodeExtentMutable;
+
+struct internal_sub_item_t {
+ const snap_gen_t& get_key() const { return key; }
+ const laddr_packed_t* get_p_value() const { return &value; }
+
+ snap_gen_t key;
+ laddr_packed_t value;
+} __attribute__((packed));
+
+/**
+ * internal_sub_items_t
+ *
+ * The STAGE_RIGHT implementation for internal node N0/N1/N2, implements staged
+ * contract as an indexable container to index snap-gen to child node
+ * addresses.
+ *
+ * The layout of the container storing n sub-items:
+ *
+ * # <--------- container range -----------> #
+ * #<~># sub-items [2, n) #
+ * # # <- sub-item 1 -> # <- sub-item 0 -> #
+ * #...# snap-gen | laddr # snap-gen | laddr #
+ * ^
+ * |
+ * p_first_item +
+ */
+class internal_sub_items_t {
+ public:
+ using num_keys_t = index_t;
+
+ internal_sub_items_t(const memory_range_t& range) {
+ assert(range.p_start < range.p_end);
+ assert((range.p_end - range.p_start) % sizeof(internal_sub_item_t) == 0);
+ num_items = (range.p_end - range.p_start) / sizeof(internal_sub_item_t);
+ assert(num_items > 0);
+ auto _p_first_item = range.p_end - sizeof(internal_sub_item_t);
+ p_first_item = reinterpret_cast<const internal_sub_item_t*>(_p_first_item);
+ }
+
+ // container type system
+ using key_get_type = const snap_gen_t&;
+ static constexpr auto CONTAINER_TYPE = ContainerType::INDEXABLE;
+ num_keys_t keys() const { return num_items; }
+ key_get_type operator[](index_t index) const {
+ assert(index < num_items);
+ return (p_first_item - index)->get_key();
+ }
+ node_offset_t size_before(index_t index) const {
+ size_t ret = index * sizeof(internal_sub_item_t);
+ assert(ret < NODE_BLOCK_SIZE);
+ return ret;
+ }
+ const laddr_packed_t* get_p_value(index_t index) const {
+ assert(index < num_items);
+ return (p_first_item - index)->get_p_value();
+ }
+ node_offset_t size_overhead_at(index_t index) const { return 0u; }
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const {
+ auto p_end = reinterpret_cast<const char*>(p_first_item) +
+ sizeof(internal_sub_item_t);
+ auto p_start = p_end - num_items * sizeof(internal_sub_item_t);
+ int start_offset = p_start - p_node_start;
+ int end_offset = p_end - p_node_start;
+ assert(start_offset > 0 &&
+ start_offset < end_offset &&
+ end_offset < NODE_BLOCK_SIZE);
+ ceph::encode(static_cast<node_offset_t>(start_offset), encoded);
+ ceph::encode(static_cast<node_offset_t>(end_offset), encoded);
+ }
+
+ static internal_sub_items_t decode(
+ const char* p_node_start, ceph::bufferlist::const_iterator& delta) {
+ node_offset_t start_offset;
+ ceph::decode(start_offset, delta);
+ node_offset_t end_offset;
+ ceph::decode(end_offset, delta);
+ assert(start_offset < end_offset);
+ assert(end_offset <= NODE_BLOCK_SIZE);
+ return internal_sub_items_t({p_node_start + start_offset,
+ p_node_start + end_offset});
+ }
+
+ static node_offset_t header_size() { return 0u; }
+
+ template <KeyT KT>
+ static node_offset_t estimate_insert(
+ const full_key_t<KT>&, const laddr_packed_t&) {
+ return sizeof(internal_sub_item_t);
+ }
+
+ template <KeyT KT>
+ static const laddr_packed_t* insert_at(
+ NodeExtentMutable&, const internal_sub_items_t&,
+ const full_key_t<KT>&, const laddr_packed_t&,
+ index_t index, node_offset_t size, const char* p_left_bound);
+
+ static node_offset_t trim_until(NodeExtentMutable&, internal_sub_items_t&, index_t);
+
+ template <KeyT KT>
+ class Appender;
+
+ private:
+ index_t num_items;
+ const internal_sub_item_t* p_first_item;
+};
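+
+// Illustrative sketch (hypothetical helper, not used by the node layout code):
+// how the indexable-container contract of internal_sub_items_t is consumed.
+// Given a valid memory_range_t covering the sub-items region, keys() gives the
+// number of sub-items, operator[] yields the snap-gen key of sub-item `index`
+// (counted from the right end of the container), and get_p_value() yields the
+// packed child node address of that sub-item.
+inline const laddr_packed_t* example_internal_child_addr(
+    const memory_range_t& range, index_t index) {
+  internal_sub_items_t items(range);
+  assert(index < items.keys());
+  [[maybe_unused]] const snap_gen_t& key = items[index];
+  return items.get_p_value(index);
+}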
+
+template <KeyT KT>
+class internal_sub_items_t::Appender {
+ public:
+ Appender(NodeExtentMutable* p_mut, char* p_append)
+ : p_mut{p_mut}, p_append{p_append} {}
+ void append(const internal_sub_items_t& src, index_t from, index_t items);
+ void append(const full_key_t<KT>&, const laddr_packed_t&, const laddr_packed_t*&);
+ char* wrap() { return p_append; }
+ private:
+ NodeExtentMutable* p_mut;
+ char* p_append;
+};
+
+/**
+ * leaf_sub_items_t
+ *
+ * The STAGE_RIGHT implementation for leaf node N0/N1/N2, implements staged
+ * contract as an indexable container to index snap-gen to onode_t.
+ *
+ * The layout of the container storing n sub-items:
+ *
+ * # <------------------------ container range -------------------------------> #
+ * # <---------- sub-items ----------------> # <--- offsets ---------# #
+ * #<~># sub-items [2, n) #<~>| offsets [2, n) # #
+ * # # <- sub-item 1 -> # <- sub-item 0 -> # | # #
+ * #...# snap-gen | onode # snap-gen | onode #...| offset1 | offset0 # num_keys #
+ * ^ ^ ^
+ * | | |
+ * p_items_end + p_offsets + |
+ * p_num_keys +
+ */
+class leaf_sub_items_t {
+ public:
+ // TODO: decide by NODE_BLOCK_SIZE, sizeof(snap_gen_t),
+ // and the minimal size of onode_t
+ using num_keys_t = uint8_t;
+
+ leaf_sub_items_t(const memory_range_t& range) {
+ assert(range.p_start < range.p_end);
+ auto _p_num_keys = range.p_end - sizeof(num_keys_t);
+ assert(range.p_start < _p_num_keys);
+ p_num_keys = reinterpret_cast<const num_keys_t*>(_p_num_keys);
+ assert(keys());
+ auto _p_offsets = _p_num_keys - sizeof(node_offset_t);
+ assert(range.p_start < _p_offsets);
+ p_offsets = reinterpret_cast<const node_offset_packed_t*>(_p_offsets);
+ p_items_end = reinterpret_cast<const char*>(&get_offset(keys() - 1));
+ assert(range.p_start < p_items_end);
+ assert(range.p_start == p_start());
+ }
+
+ bool operator==(const leaf_sub_items_t& x) {
+ return (p_num_keys == x.p_num_keys &&
+ p_offsets == x.p_offsets &&
+ p_items_end == x.p_items_end);
+ }
+
+ const char* p_start() const { return get_item_end(keys()); }
+
+ const node_offset_packed_t& get_offset(index_t index) const {
+ assert(index < keys());
+ return *(p_offsets - index);
+ }
+
+ node_offset_t get_offset_to_end(index_t index) const {
+ assert(index <= keys());
+ return index == 0 ? 0 : get_offset(index - 1).value;
+ }
+
+ const char* get_item_start(index_t index) const {
+ return p_items_end - get_offset(index).value;
+ }
+
+ const char* get_item_end(index_t index) const {
+ return p_items_end - get_offset_to_end(index);
+ }
+
+ // container type system
+ using key_get_type = const snap_gen_t&;
+ static constexpr auto CONTAINER_TYPE = ContainerType::INDEXABLE;
+ num_keys_t keys() const { return *p_num_keys; }
+ key_get_type operator[](index_t index) const {
+ assert(index < keys());
+ auto pointer = get_item_end(index);
+ assert(get_item_start(index) < pointer);
+ pointer -= sizeof(snap_gen_t);
+ assert(get_item_start(index) < pointer);
+ return *reinterpret_cast<const snap_gen_t*>(pointer);
+ }
+ node_offset_t size_before(index_t index) const {
+ assert(index <= keys());
+ size_t ret;
+ if (index == 0) {
+ ret = sizeof(num_keys_t);
+ } else {
+ --index;
+ ret = sizeof(num_keys_t) +
+ (index + 1) * sizeof(node_offset_t) +
+ get_offset(index).value;
+ }
+ assert(ret < NODE_BLOCK_SIZE);
+ return ret;
+ }
+ node_offset_t size_overhead_at(index_t index) const { return sizeof(node_offset_t); }
+ const onode_t* get_p_value(index_t index) const {
+ assert(index < keys());
+ auto pointer = get_item_start(index);
+ auto value = reinterpret_cast<const onode_t*>(pointer);
+ assert(pointer + value->size + sizeof(snap_gen_t) == get_item_end(index));
+ return value;
+ }
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const {
+ auto p_end = reinterpret_cast<const char*>(p_num_keys) +
+ sizeof(num_keys_t);
+ int start_offset = p_start() - p_node_start;
+ int end_offset = p_end - p_node_start;
+ assert(start_offset > 0 &&
+ start_offset < end_offset &&
+ end_offset < NODE_BLOCK_SIZE);
+ ceph::encode(static_cast<node_offset_t>(start_offset), encoded);
+ ceph::encode(static_cast<node_offset_t>(end_offset), encoded);
+ }
+
+ static leaf_sub_items_t decode(
+ const char* p_node_start, ceph::bufferlist::const_iterator& delta) {
+ node_offset_t start_offset;
+ ceph::decode(start_offset, delta);
+ node_offset_t end_offset;
+ ceph::decode(end_offset, delta);
+ assert(start_offset < end_offset);
+ assert(end_offset <= NODE_BLOCK_SIZE);
+ return leaf_sub_items_t({p_node_start + start_offset,
+ p_node_start + end_offset});
+ }
+
+ static node_offset_t header_size() { return sizeof(num_keys_t); }
+
+ template <KeyT KT>
+ static node_offset_t estimate_insert(const full_key_t<KT>&, const onode_t& value) {
+ return value.size + sizeof(snap_gen_t) + sizeof(node_offset_t);
+ }
+
+ template <KeyT KT>
+ static const onode_t* insert_at(
+ NodeExtentMutable&, const leaf_sub_items_t&,
+ const full_key_t<KT>&, const onode_t&,
+ index_t index, node_offset_t size, const char* p_left_bound);
+
+ static node_offset_t trim_until(NodeExtentMutable&, leaf_sub_items_t&, index_t index);
+
+ template <KeyT KT>
+ class Appender;
+
+ private:
+ // TODO: support unaligned access
+ const num_keys_t* p_num_keys;
+ const node_offset_packed_t* p_offsets;
+ const char* p_items_end;
+};
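+
+// Illustrative sketch (hypothetical helper, not used by the node layout code):
+// walking the indexable-container contract of leaf_sub_items_t to sum the
+// onode payload sizes it stores, mirroring what get_p_value() exposes per
+// sub-item.
+inline size_t example_leaf_values_size(const memory_range_t& range) {
+  leaf_sub_items_t items(range);
+  size_t total = 0;
+  for (index_t i = 0; i < items.keys(); ++i) {
+    total += items.get_p_value(i)->size;  // onode payload of sub-item i
+  }
+  return total;
+}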
+
+constexpr index_t APPENDER_LIMIT = 3u;
+
+template <KeyT KT>
+class leaf_sub_items_t::Appender {
+ struct range_items_t {
+ index_t from;
+ index_t items;
+ };
+ struct kv_item_t {
+ const full_key_t<KT>* p_key;
+ const onode_t* p_value;
+ };
+ using var_t = std::variant<range_items_t, kv_item_t>;
+
+ public:
+ Appender(NodeExtentMutable* p_mut, char* p_append)
+ : p_mut{p_mut}, p_append{p_append} {
+ }
+
+ void append(const leaf_sub_items_t& src, index_t from, index_t items) {
+ assert(cnt < APPENDER_LIMIT);
+ assert(from <= src.keys());
+ if (items == 0) {
+ return;
+ }
+ if (op_src) {
+ assert(*op_src == src);
+ } else {
+ op_src = src;
+ }
+ assert(from < src.keys());
+ assert(from + items <= src.keys());
+ appends[cnt] = range_items_t{from, items};
+ ++cnt;
+ }
+ void append(const full_key_t<KT>& key,
+ const onode_t& value, const onode_t*& p_value) {
+ assert(pp_value == nullptr);
+ assert(cnt < APPENDER_LIMIT);
+ appends[cnt] = kv_item_t{&key, &value};
+ ++cnt;
+ pp_value = &p_value;
+ }
+ char* wrap();
+
+ private:
+ std::optional<leaf_sub_items_t> op_src;
+ const onode_t** pp_value = nullptr;
+ NodeExtentMutable* p_mut;
+ char* p_append;
+ var_t appends[APPENDER_LIMIT];
+ index_t cnt = 0;
+};
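+
+// Illustrative usage sketch (hypothetical variables; p_mut and p_append are
+// provided by the node layout code that owns the mutable extent). Up to
+// APPENDER_LIMIT copy-ranges and/or one new key-value may be queued, and
+// wrap() then materializes num_keys, the offsets and the items right-to-left,
+// returning the new left boundary of the container:
+//
+//   leaf_sub_items_t::Appender<KeyT::HOBJ> appender(p_mut, p_append);
+//   appender.append(src_sub_items, 0, 2);  // copy sub-items [0, 2) from src
+//   appender.append(key, value, p_value);  // then append one new key-value
+//   char* p_start = appender.wrap();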
+
+template <node_type_t> struct _sub_items_t;
+template<> struct _sub_items_t<node_type_t::INTERNAL> { using type = internal_sub_items_t; };
+template<> struct _sub_items_t<node_type_t::LEAF> { using type = leaf_sub_items_t; };
+template <node_type_t NODE_TYPE>
+using sub_items_t = typename _sub_items_t<NODE_TYPE>::type;
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/super.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/super.cc
new file mode 100644
index 000000000..5a28f5097
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/super.cc
@@ -0,0 +1,26 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "super.h"
+#include "node.h"
+
+namespace crimson::os::seastore::onode {
+
+Ref<Node> RootNodeTrackerIsolated::get_root(Transaction& t) const {
+ auto iter = tracked_supers.find(&t);
+ if (iter == tracked_supers.end()) {
+ return nullptr;
+ } else {
+ return iter->second->get_p_root();
+ }
+}
+
+Ref<Node> RootNodeTrackerShared::get_root(Transaction&) const {
+ if (is_clean()) {
+ return nullptr;
+ } else {
+ return tracked_super->get_p_root();
+ }
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/super.h b/src/crimson/os/seastore/onode_manager/staged-fltree/super.h
new file mode 100644
index 000000000..5eefee9ff
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/super.h
@@ -0,0 +1,143 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <memory>
+
+#include "crimson/common/type_helpers.h"
+
+#include "fwd.h"
+
+namespace crimson::os::seastore::onode {
+
+class Node;
+class Super;
+
+/**
+ * RootNodeTracker
+ *
+ * An abstracted tracker to get the root node by Transaction.
+ */
+class RootNodeTracker {
+ public:
+ virtual ~RootNodeTracker() = default;
+ virtual bool is_clean() const = 0;
+ virtual Ref<Node> get_root(Transaction&) const = 0;
+ static RootNodeTrackerURef create(bool read_isolated);
+ protected:
+ RootNodeTracker() = default;
+ RootNodeTracker(const RootNodeTracker&) = delete;
+ RootNodeTracker(RootNodeTracker&&) = delete;
+ RootNodeTracker& operator=(const RootNodeTracker&) = delete;
+ RootNodeTracker& operator=(RootNodeTracker&&) = delete;
+ virtual void do_track_super(Transaction&, Super&) = 0;
+ virtual void do_untrack_super(Transaction&, Super&) = 0;
+ friend class Super;
+};
+
+/**
+ * Super
+ *
+ * The parent of root node. It contains the relationship between a Transaction
+ * and a root node address.
+ */
+class Super {
+ public:
+ using URef = std::unique_ptr<Super>;
+ Super(const Super&) = delete;
+ Super(Super&&) = delete;
+ Super& operator=(const Super&) = delete;
+ Super& operator=(Super&&) = delete;
+ virtual ~Super() {
+ assert(tracked_root_node == nullptr);
+ tracker.do_untrack_super(t, *this);
+ }
+
+ virtual laddr_t get_root_laddr() const = 0;
+ virtual void write_root_laddr(context_t, laddr_t) = 0;
+
+ void do_track_root(Node& root) {
+ assert(tracked_root_node == nullptr);
+ tracked_root_node = &root;
+ }
+ void do_untrack_root(Node& root) {
+ assert(tracked_root_node == &root);
+ tracked_root_node = nullptr;
+ }
+ Node* get_p_root() const {
+ assert(tracked_root_node != nullptr);
+ return tracked_root_node;
+ }
+
+ protected:
+ Super(Transaction& t, RootNodeTracker& tracker)
+ : t{t}, tracker{tracker} {
+ tracker.do_track_super(t, *this);
+ }
+
+ private:
+ Transaction& t;
+ RootNodeTracker& tracker;
+ Node* tracked_root_node = nullptr;
+};
+
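+// Hypothetical sketch of a minimal Super implementation (for illustration
+// only; real backends provide their own, e.g. the seastore and dummy
+// NodeExtentManagers). Constructing it registers the Super with the tracker
+// for the given Transaction, and destroying it unregisters it:
+class ExampleSuper final : public Super {
+ public:
+  ExampleSuper(Transaction& t, RootNodeTracker& tracker, laddr_t root)
+    : Super(t, tracker), root_addr{root} {}
+  laddr_t get_root_laddr() const override { return root_addr; }
+  void write_root_laddr(context_t, laddr_t addr) override { root_addr = addr; }
+ private:
+  laddr_t root_addr;
+};
+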
+/**
+ * RootNodeTrackerIsolated
+ *
+ * A concrete RootNodeTracker implementation which provides root node isolation
+ * between Transactions for Seastore backend.
+ */
+class RootNodeTrackerIsolated final : public RootNodeTracker {
+ public:
+ ~RootNodeTrackerIsolated() override { assert(is_clean()); }
+ protected:
+ bool is_clean() const override {
+ return tracked_supers.empty();
+ }
+ void do_track_super(Transaction& t, Super& super) override {
+ assert(tracked_supers.find(&t) == tracked_supers.end());
+ tracked_supers[&t] = &super;
+ }
+ void do_untrack_super(Transaction& t, Super& super) override {
+ [[maybe_unused]] auto removed = tracked_supers.erase(&t);
+ assert(removed);
+ }
+ ::Ref<Node> get_root(Transaction& t) const override;
+ std::map<Transaction*, Super*> tracked_supers;
+};
+
+/**
+ * RootNodeTrackerShared
+ *
+ * A concrete RootNodeTracker implementation which has no isolation between
+ * Transactions for Dummy backend.
+ */
+class RootNodeTrackerShared final : public RootNodeTracker {
+ public:
+ ~RootNodeTrackerShared() override { assert(is_clean()); }
+ protected:
+ bool is_clean() const override {
+ return tracked_super == nullptr;
+ }
+ void do_track_super(Transaction&, Super& super) override {
+ assert(is_clean());
+ tracked_super = &super;
+ }
+ void do_untrack_super(Transaction&, Super& super) override {
+ assert(tracked_super == &super);
+ tracked_super = nullptr;
+ }
+ ::Ref<Node> get_root(Transaction&) const override;
+ Super* tracked_super = nullptr;
+};
+
+inline RootNodeTrackerURef RootNodeTracker::create(bool read_isolated) {
+ if (read_isolated) {
+ return RootNodeTrackerURef(new RootNodeTrackerIsolated());
+ } else {
+ return RootNodeTrackerURef(new RootNodeTrackerShared());
+ }
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/tree.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/tree.cc
new file mode 100644
index 000000000..2c8c21652
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/tree.cc
@@ -0,0 +1,235 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tree.h"
+
+#include "node.h"
+#include "node_extent_manager.h"
+#include "stages/key_layout.h"
+#include "super.h"
+
+namespace crimson::os::seastore::onode {
+
+using btree_ertr = Btree::btree_ertr;
+template <class ValueT=void>
+using btree_future = Btree::btree_future<ValueT>;
+using Cursor = Btree::Cursor;
+
+Cursor::Cursor(Btree* p_tree, Ref<tree_cursor_t> _p_cursor)
+ : p_tree(p_tree) {
+ if (_p_cursor->is_end()) {
+ // no need to hold the leaf node
+ } else {
+ p_cursor = _p_cursor;
+ }
+}
+Cursor::Cursor(Btree* p_tree) : p_tree{p_tree} {}
+Cursor::Cursor(const Cursor&) = default;
+Cursor::Cursor(Cursor&&) noexcept = default;
+Cursor& Cursor::operator=(const Cursor&) = default;
+Cursor& Cursor::operator=(Cursor&&) = default;
+Cursor::~Cursor() = default;
+
+bool Cursor::is_end() const {
+ if (p_cursor) {
+ assert(!p_cursor->is_end());
+ return false;
+ } else {
+ return true;
+ }
+}
+
+ghobject_t Cursor::get_ghobj() const {
+ return p_cursor->get_key_view().to_ghobj();
+}
+
+const onode_t* Cursor::value() const {
+ return p_cursor->get_p_value();
+}
+
+bool Cursor::operator==(const Cursor& x) const {
+ return p_cursor == x.p_cursor;
+}
+
+Cursor& Cursor::operator++() {
+ // TODO
+ return *this;
+}
+
+Cursor Cursor::operator++(int) {
+ Cursor tmp = *this;
+ ++*this;
+ return tmp;
+}
+
+Cursor Cursor::make_end(Btree* p_tree) {
+ return {p_tree};
+}
+
+Btree::Btree(NodeExtentManagerURef&& _nm)
+ : nm{std::move(_nm)},
+ root_tracker{RootNodeTracker::create(nm->is_read_isolated())} {}
+
+Btree::~Btree() { assert(root_tracker->is_clean()); }
+
+btree_future<> Btree::mkfs(Transaction& t) {
+ return Node::mkfs(get_context(t), *root_tracker);
+}
+
+btree_future<Cursor> Btree::begin(Transaction& t) {
+ return get_root(t).safe_then([this, &t](auto root) {
+ return root->lookup_smallest(get_context(t));
+ }).safe_then([this](auto cursor) {
+ return Cursor{this, cursor};
+ });
+}
+
+btree_future<Cursor> Btree::last(Transaction& t) {
+ return get_root(t).safe_then([this, &t](auto root) {
+ return root->lookup_largest(get_context(t));
+ }).safe_then([this](auto cursor) {
+ return Cursor(this, cursor);
+ });
+}
+
+Cursor Btree::end() {
+ return Cursor::make_end(this);
+}
+
+btree_future<bool>
+Btree::contains(Transaction& t, const ghobject_t& obj) {
+ return seastar::do_with(
+ full_key_t<KeyT::HOBJ>(obj),
+ [this, &t](auto& key) -> btree_future<bool> {
+ return get_root(t).safe_then([this, &t, &key](auto root) {
+ // TODO: improve lower_bound()
+ return root->lower_bound(get_context(t), key);
+ }).safe_then([](auto result) {
+ return MatchKindBS::EQ == result.match();
+ });
+ }
+ );
+}
+
+btree_future<Cursor>
+Btree::find(Transaction& t, const ghobject_t& obj) {
+ return seastar::do_with(
+ full_key_t<KeyT::HOBJ>(obj),
+ [this, &t](auto& key) -> btree_future<Cursor> {
+ return get_root(t).safe_then([this, &t, &key](auto root) {
+ // TODO: improve lower_bound()
+ return root->lower_bound(get_context(t), key);
+ }).safe_then([this](auto result) {
+ if (result.match() == MatchKindBS::EQ) {
+ return Cursor(this, result.p_cursor);
+ } else {
+ return Cursor::make_end(this);
+ }
+ });
+ }
+ );
+}
+
+btree_future<Cursor>
+Btree::lower_bound(Transaction& t, const ghobject_t& obj) {
+ return seastar::do_with(
+ full_key_t<KeyT::HOBJ>(obj),
+ [this, &t](auto& key) -> btree_future<Cursor> {
+ return get_root(t).safe_then([this, &t, &key](auto root) {
+ return root->lower_bound(get_context(t), key);
+ }).safe_then([this](auto result) {
+ return Cursor(this, result.p_cursor);
+ });
+ }
+ );
+}
+
+btree_future<std::pair<Cursor, bool>>
+Btree::insert(Transaction& t, const ghobject_t& obj, const onode_t& value) {
+ return seastar::do_with(
+ full_key_t<KeyT::HOBJ>(obj),
+ [this, &t, &value](auto& key) -> btree_future<std::pair<Cursor, bool>> {
+ return get_root(t).safe_then([this, &t, &key, &value](auto root) {
+ return root->insert(get_context(t), key, value);
+ }).safe_then([this](auto ret) {
+ auto& [cursor, success] = ret;
+ return std::make_pair(Cursor(this, cursor), success);
+ });
+ }
+ );
+}
+
+btree_future<size_t> Btree::erase(Transaction& t, const ghobject_t& obj) {
+ // TODO
+ return btree_ertr::make_ready_future<size_t>(0u);
+}
+
+btree_future<Cursor> Btree::erase(Cursor& pos) {
+ // TODO
+ return btree_ertr::make_ready_future<Cursor>(
+ Cursor::make_end(this));
+}
+
+btree_future<Cursor>
+Btree::erase(Cursor& first, Cursor& last) {
+ // TODO
+ return btree_ertr::make_ready_future<Cursor>(
+ Cursor::make_end(this));
+}
+
+btree_future<size_t> Btree::height(Transaction& t) {
+ return get_root(t).safe_then([](auto root) {
+ return size_t(root->level() + 1);
+ });
+}
+
+btree_future<tree_stats_t> Btree::get_stats_slow(Transaction& t) {
+ return get_root(t).safe_then([this, &t](auto root) {
+ unsigned height = root->level() + 1;
+ return root->get_tree_stats(get_context(t)
+ ).safe_then([height](auto stats) {
+ stats.height = height;
+ return btree_ertr::make_ready_future<tree_stats_t>(stats);
+ });
+ });
+}
+
+std::ostream& Btree::dump(Transaction& t, std::ostream& os) {
+ auto root = root_tracker->get_root(t);
+ if (root) {
+ root->dump(os);
+ } else {
+ os << "empty tree!";
+ }
+ return os;
+}
+
+std::ostream& Btree::print(std::ostream& os) const {
+ return os << "BTree-" << *nm;
+}
+
+btree_future<Ref<Node>> Btree::get_root(Transaction& t) {
+ auto root = root_tracker->get_root(t);
+ if (root) {
+ return btree_ertr::make_ready_future<Ref<Node>>(root);
+ } else {
+ return Node::load_root(get_context(t), *root_tracker);
+ }
+}
+
+bool Btree::test_is_clean() const {
+ return root_tracker->is_clean();
+}
+
+btree_future<> Btree::test_clone_from(
+ Transaction& t, Transaction& t_from, Btree& from) {
+ // Note: assume the tree to clone is tracked correctly in memory.
+ // In some unit tests, parts of the tree are stubbed out, so they must not
+ // be loaded from NodeExtentManager.
+ return from.get_root(t_from
+ ).safe_then([this, &t](auto root_from) {
+ return root_from->test_clone_root(get_context(t), *root_tracker);
+ });
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/tree.h b/src/crimson/os/seastore/onode_manager/staged-fltree/tree.h
new file mode 100644
index 000000000..7ee618cb3
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/tree.h
@@ -0,0 +1,119 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <ostream>
+
+#include "common/hobject.h"
+#include "crimson/common/type_helpers.h"
+
+#include "fwd.h"
+#include "tree_types.h"
+
+/**
+ * tree.h
+ *
+ * An example implementation to expose tree interfaces to users. The current
+ * interface design is based on:
+ * - ceph::os::Transaction::create/touch/remove()
+ * - ceph::ObjectStore::collection_list()
+ * - ceph::BlueStore::get_onode()
+ * - db->get_iterator(PREFIX_OBJ) by ceph::BlueStore::fsck()
+ *
+ * TODO: Redesign the interfaces based on real onode manager requirements.
+ */
+
+namespace crimson::os::seastore::onode {
+
+class Node;
+class Btree {
+ public:
+ using btree_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg,
+ crimson::ct_error::enoent,
+ crimson::ct_error::erange>;
+ template <class ValueT=void>
+ using btree_future = btree_ertr::future<ValueT>;
+
+ Btree(NodeExtentManagerURef&& nm);
+ Btree(const Btree&) = delete;
+ Btree(Btree&&) = delete;
+ Btree& operator=(const Btree&) = delete;
+ Btree& operator=(Btree&&) = delete;
+ ~Btree();
+
+ btree_future<> mkfs(Transaction&);
+
+ class Cursor;
+ // lookup
+ btree_future<Cursor> begin(Transaction&);
+ btree_future<Cursor> last(Transaction&);
+ Cursor end();
+ btree_future<bool> contains(Transaction&, const ghobject_t&);
+ btree_future<Cursor> find(Transaction&, const ghobject_t&);
+ btree_future<Cursor> lower_bound(Transaction&, const ghobject_t&);
+
+ // modifiers
+ // TODO: replace onode_t
+ btree_future<std::pair<Cursor, bool>>
+ insert(Transaction&, const ghobject_t&, const onode_t&);
+ btree_future<size_t> erase(Transaction&, const ghobject_t& key);
+ btree_future<Cursor> erase(Cursor& pos);
+ btree_future<Cursor> erase(Cursor& first, Cursor& last);
+
+ // stats
+ btree_future<size_t> height(Transaction&);
+ btree_future<tree_stats_t> get_stats_slow(Transaction&);
+ std::ostream& dump(Transaction&, std::ostream&);
+ std::ostream& print(std::ostream& os) const;
+
+ // test_only
+ bool test_is_clean() const;
+ btree_future<> test_clone_from(Transaction& t, Transaction& t_from, Btree& from);
+
+ private:
+ context_t get_context(Transaction& t) { return {*nm, t}; }
+ btree_future<Ref<Node>> get_root(Transaction& t);
+
+ NodeExtentManagerURef nm;
+ RootNodeTrackerURef root_tracker;
+
+ friend class DummyChildPool;
+};
+inline std::ostream& operator<<(std::ostream& os, const Btree& tree) {
+ return tree.print(os);
+}
+
+class tree_cursor_t;
+class Btree::Cursor {
+ public:
+ Cursor(const Cursor&);
+ Cursor(Cursor&&) noexcept;
+ Cursor& operator=(const Cursor&);
+ Cursor& operator=(Cursor&&);
+ ~Cursor();
+
+ bool is_end() const;
+ // XXX: return key_view_t to avoid unnecessary ghobject_t constructions
+ ghobject_t get_ghobj() const;
+ const onode_t* value() const;
+ bool operator==(const Cursor& x) const;
+ bool operator!=(const Cursor& x) const { return !(*this == x); }
+ Cursor& operator++();
+ Cursor operator++(int);
+
+ private:
+ Cursor(Btree*, Ref<tree_cursor_t>);
+ Cursor(Btree*);
+
+ static Cursor make_end(Btree*);
+
+ Btree* p_tree;
+ Ref<tree_cursor_t> p_cursor;
+
+ friend class Btree;
+};
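+
+// Illustrative usage sketch (hypothetical; error handling and transaction
+// management elided). Given a NodeExtentManagerURef `nm`, a Transaction `t`,
+// a ghobject_t `obj` and an onode_t `value`, a typical mkfs/insert/find flow
+// against this interface would look like:
+//
+//   Btree btree(std::move(nm));
+//   return btree.mkfs(t
+//   ).safe_then([&] {
+//     return btree.insert(t, obj, value);
+//   }).safe_then([&](auto ret) {
+//     auto& [cursor, inserted] = ret;
+//     assert(inserted && !cursor.is_end());
+//     return btree.find(t, obj);
+//   }).safe_then([](auto cursor) {
+//     return cursor.value();  // const onode_t*
+//   });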
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/tree_types.h b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_types.h
new file mode 100644
index 000000000..0bb345e0a
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_types.h
@@ -0,0 +1,125 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <ostream>
+
+namespace crimson::os::seastore::onode {
+
+// TODO: Redesign according to real requirement from onode manager
+struct onode_t {
+ // onode should be smaller than a node
+ uint16_t size; // address up to 64 KiB sized node
+ uint16_t id;
+ // omap, extent_map, inline data
+
+ bool operator==(const onode_t& o) const { return size == o.size && id == o.id; }
+ bool operator!=(const onode_t& o) const { return !(*this == o); }
+
+ void encode(ceph::bufferlist& encoded) const {
+ ceph::encode(size, encoded);
+ ceph::encode(id, encoded);
+ }
+ static onode_t decode(ceph::bufferlist::const_iterator& delta) {
+ uint16_t size;
+ ceph::decode(size, delta);
+ uint16_t id;
+ ceph::decode(id, delta);
+ onode_t ret{size, id};
+ return ret;
+ }
+ static void validate_tail_magic(const onode_t& onode) {
+ auto p_target = (const char*)&onode + onode.size - sizeof(uint32_t);
+ uint32_t target;
+ std::memcpy(&target, p_target, sizeof(uint32_t));
+ ceph_assert(target == onode.size * 137);
+ }
+ static std::unique_ptr<char[]> allocate(const onode_t& config) {
+ ceph_assert(config.size >= sizeof(onode_t) + sizeof(uint32_t));
+
+ auto ret = std::make_unique<char[]>(config.size);
+ char* p_mem = ret.get();
+ auto p_onode = reinterpret_cast<onode_t*>(p_mem);
+ *p_onode = config;
+
+ uint32_t tail_magic = config.size * 137;
+ p_mem += (config.size - sizeof(uint32_t));
+ std::memcpy(p_mem, &tail_magic, sizeof(uint32_t));
+ validate_tail_magic(*p_onode);
+
+ return ret;
+ }
+} __attribute__((packed));
+inline std::ostream& operator<<(std::ostream& os, const onode_t& node) {
+ return os << "onode(" << node.id << ", " << node.size << "B)";
+}
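+
+// Worked example (illustration only): onode_t::allocate(onode_t{256, 1})
+// returns a 256-byte buffer whose leading sizeof(onode_t) bytes hold the
+// header {size=256, id=1} and whose trailing 4 bytes hold the tail magic
+// 256 * 137 = 35072; validate_tail_magic() re-reads those trailing bytes and
+// asserts that the product still matches the recorded size.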
+
+struct tree_stats_t {
+ size_t size_persistent_leaf = 0;
+ size_t size_persistent_internal = 0;
+ size_t size_filled_leaf = 0;
+ size_t size_filled_internal = 0;
+ size_t size_logical_leaf = 0;
+ size_t size_logical_internal = 0;
+ size_t size_overhead_leaf = 0;
+ size_t size_overhead_internal = 0;
+ size_t size_value_leaf = 0;
+ size_t size_value_internal = 0;
+ unsigned num_kvs_leaf = 0;
+ unsigned num_kvs_internal = 0;
+ unsigned num_nodes_leaf = 0;
+ unsigned num_nodes_internal = 0;
+ unsigned height = 0;
+
+ size_t size_persistent() const {
+ return size_persistent_leaf + size_persistent_internal; }
+ size_t size_filled() const {
+ return size_filled_leaf + size_filled_internal; }
+ size_t size_logical() const {
+ return size_logical_leaf + size_logical_internal; }
+ size_t size_overhead() const {
+ return size_overhead_leaf + size_overhead_internal; }
+ size_t size_value() const {
+ return size_value_leaf + size_value_internal; }
+ unsigned num_kvs() const {
+ return num_kvs_leaf + num_kvs_internal; }
+ unsigned num_nodes() const {
+ return num_nodes_leaf + num_nodes_internal; }
+
+ double ratio_fullness() const {
+ return (double)size_filled() / size_persistent(); }
+ double ratio_key_compression() const {
+ return (double)(size_filled() - size_value()) / (size_logical() - size_value()); }
+ double ratio_overhead() const {
+ return (double)size_overhead() / size_filled(); }
+ double ratio_keys_leaf() const {
+ return (double)num_kvs_leaf / num_kvs(); }
+ double ratio_nodes_leaf() const {
+ return (double)num_nodes_leaf / num_nodes(); }
+ double ratio_filled_leaf() const {
+ return (double)size_filled_leaf / size_filled(); }
+};
+inline std::ostream& operator<<(std::ostream& os, const tree_stats_t& stats) {
+ os << "Tree stats:"
+ << "\n height = " << stats.height
+ << "\n num values = " << stats.num_kvs_leaf
+ << "\n num nodes = " << stats.num_nodes()
+ << " (leaf=" << stats.num_nodes_leaf
+ << ", internal=" << stats.num_nodes_internal << ")"
+ << "\n size persistent = " << stats.size_persistent() << "B"
+ << "\n size filled = " << stats.size_filled() << "B"
+ << " (value=" << stats.size_value_leaf << "B"
+ << ", rest=" << stats.size_filled() - stats.size_value_leaf << "B)"
+ << "\n size logical = " << stats.size_logical() << "B"
+ << "\n size overhead = " << stats.size_overhead() << "B"
+ << "\n ratio fullness = " << stats.ratio_fullness()
+ << "\n ratio keys leaf = " << stats.ratio_keys_leaf()
+ << "\n ratio nodes leaf = " << stats.ratio_nodes_leaf()
+ << "\n ratio filled leaf = " << stats.ratio_filled_leaf()
+ << "\n ratio key compression = " << stats.ratio_key_compression();
+ assert(stats.num_kvs_internal + 1 == stats.num_nodes());
+ return os;
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h
new file mode 100644
index 000000000..536052003
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h
@@ -0,0 +1,333 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <cstring>
+#include <random>
+#include <string>
+#include <sstream>
+#include <utility>
+#include <vector>
+
+#include "crimson/common/log.h"
+#include "stages/key_layout.h"
+#include "tree.h"
+
+/**
+ * tree_utils.h
+ *
+ * Contains shared logic for unit tests and perf tool.
+ */
+
+namespace crimson::os::seastore::onode {
+
+class Onodes {
+ public:
+ Onodes(size_t n) {
+ for (size_t i = 1; i <= n; ++i) {
+ auto p_onode = &create(i * 8);
+ onodes.push_back(p_onode);
+ }
+ }
+
+ Onodes(std::vector<size_t> sizes) {
+ for (auto& size : sizes) {
+ auto p_onode = &create(size);
+ onodes.push_back(p_onode);
+ }
+ }
+
+ ~Onodes() = default;
+
+ const onode_t& create(size_t size) {
+ ceph_assert(size <= std::numeric_limits<uint16_t>::max());
+ onode_t config{static_cast<uint16_t>(size), id++};
+ auto onode = onode_t::allocate(config);
+ auto p_onode = onode.get();
+ tracked_onodes.push_back(std::move(onode));
+ return *reinterpret_cast<onode_t*>(p_onode);
+ }
+
+ const onode_t& pick() const {
+ auto index = rd() % onodes.size();
+ return *onodes[index];
+ }
+
+ const onode_t& pick_largest() const {
+ return *onodes[onodes.size() - 1];
+ }
+
+ static void validate_cursor(
+ const Btree::Cursor& cursor, const ghobject_t& key, const onode_t& onode) {
+ ceph_assert(!cursor.is_end());
+ ceph_assert(cursor.get_ghobj() == key);
+ ceph_assert(cursor.value());
+ ceph_assert(cursor.value() != &onode);
+ ceph_assert(*cursor.value() == onode);
+ onode_t::validate_tail_magic(*cursor.value());
+ }
+
+ private:
+ uint16_t id = 0;
+ mutable std::random_device rd;
+ std::vector<const onode_t*> onodes;
+ std::vector<std::unique_ptr<char[]>> tracked_onodes;
+};
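+
+// Illustrative usage sketch (hypothetical sizes): build a small pool of test
+// onodes and pick one at random. create() keeps ownership of the allocated
+// buffers in tracked_onodes, so the returned references remain valid for the
+// lifetime of the Onodes instance:
+//
+//   Onodes onodes({16, 64, 256});
+//   const onode_t& o = onodes.pick();
+//   onode_t::validate_tail_magic(o);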
+
+class KVPool {
+ struct kv_conf_t {
+ unsigned index2;
+ unsigned index1;
+ unsigned index0;
+ size_t ns_size;
+ size_t oid_size;
+ const onode_t* p_value;
+
+ ghobject_t get_ghobj() const {
+ assert(index1 < 10);
+ std::ostringstream os_ns;
+ os_ns << "ns" << index1;
+ unsigned current_size = (unsigned)os_ns.tellp();
+ assert(ns_size >= current_size);
+ os_ns << std::string(ns_size - current_size, '_');
+
+ std::ostringstream os_oid;
+ os_oid << "oid" << index1;
+ current_size = (unsigned)os_oid.tellp();
+ assert(oid_size >= current_size);
+ os_oid << std::string(oid_size - current_size, '_');
+
+ return ghobject_t(shard_id_t(index2), index2, index2,
+ os_ns.str(), os_oid.str(), index0, index0);
+ }
+ };
+ using kv_vector_t = std::vector<kv_conf_t>;
+
+ public:
+ using kv_t = std::pair<ghobject_t, const onode_t*>;
+
+ KVPool(const std::vector<size_t>& str_sizes,
+ const std::vector<size_t>& onode_sizes,
+ const std::pair<unsigned, unsigned>& range2,
+ const std::pair<unsigned, unsigned>& range1,
+ const std::pair<unsigned, unsigned>& range0)
+ : str_sizes{str_sizes}, onodes{onode_sizes} {
+ ceph_assert(range2.first < range2.second);
+ ceph_assert(range2.second - 1 <= (unsigned)std::numeric_limits<shard_t>::max());
+ ceph_assert(range2.second - 1 <= std::numeric_limits<crush_hash_t>::max());
+ ceph_assert(range1.first < range1.second);
+ ceph_assert(range1.second - 1 <= 9);
+ ceph_assert(range0.first < range0.second);
+ std::random_device rd;
+ for (unsigned i = range2.first; i < range2.second; ++i) {
+ for (unsigned j = range1.first; j < range1.second; ++j) {
+ auto ns_size = (unsigned)str_sizes[rd() % str_sizes.size()];
+ auto oid_size = (unsigned)str_sizes[rd() % str_sizes.size()];
+ for (unsigned k = range0.first; k < range0.second; ++k) {
+ kvs.emplace_back(kv_conf_t{i, j, k, ns_size, oid_size, &onodes.pick()});
+ }
+ }
+ }
+ random_kvs = kvs;
+ std::shuffle(random_kvs.begin(), random_kvs.end(), std::default_random_engine{rd()});
+ }
+
+ class iterator_t {
+ public:
+ iterator_t() = default;
+ iterator_t(const iterator_t&) = default;
+ iterator_t(iterator_t&&) = default;
+ iterator_t& operator=(const iterator_t&) = default;
+ iterator_t& operator=(iterator_t&&) = default;
+
+ kv_t get_kv() const {
+ assert(!is_end());
+ auto& conf = (*p_kvs)[i];
+ return std::make_pair(conf.get_ghobj(), conf.p_value);
+ }
+ bool is_end() const { return !p_kvs || i >= p_kvs->size(); }
+ size_t index() const { return i; }
+
+ iterator_t& operator++() {
+ assert(!is_end());
+ ++i;
+ return *this;
+ }
+
+ iterator_t operator++(int) {
+ iterator_t tmp = *this;
+ ++*this;
+ return tmp;
+ }
+
+ private:
+ iterator_t(const kv_vector_t& kvs) : p_kvs{&kvs} {}
+
+ const kv_vector_t* p_kvs = nullptr;
+ size_t i = 0;
+ friend class KVPool;
+ };
+
+ iterator_t begin() const {
+ return iterator_t(kvs);
+ }
+
+ iterator_t random_begin() const {
+ return iterator_t(random_kvs);
+ }
+
+ size_t size() const {
+ return kvs.size();
+ }
+
+ private:
+ std::vector<size_t> str_sizes;
+ Onodes onodes;
+ kv_vector_t kvs;
+ kv_vector_t random_kvs;
+};
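+
+// Illustrative usage sketch (hypothetical parameters): a pool covering
+// 2 shards/pools x 2 namespaces x 2 snap/gen values = 8 keys, with ns/oid
+// strings padded to 32 or 64 characters and onode sizes of 256 or 512 bytes;
+// begin() iterates in key order, random_begin() in shuffled order:
+//
+//   KVPool pool({32, 64}, {256, 512}, {0, 2}, {0, 2}, {0, 2});
+//   for (auto it = pool.random_begin(); !it.is_end(); ++it) {
+//     auto [ghobj, p_onode] = it.get_kv();
+//   }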
+
+template <bool TRACK>
+class TreeBuilder {
+ public:
+ using ertr = Btree::btree_ertr;
+ template <class ValueT=void>
+ using future = ertr::future<ValueT>;
+
+ TreeBuilder(KVPool& kvs, NodeExtentManagerURef&& nm)
+ : kvs{kvs} {
+ tree.emplace(std::move(nm));
+ }
+
+ future<> bootstrap(Transaction& t) {
+ std::ostringstream oss;
+#ifndef NDEBUG
+ oss << "debug=on, ";
+#else
+ oss << "debug=off, ";
+#endif
+#ifdef UNIT_TESTS_BUILT
+ oss << "UNIT_TEST_BUILT=on, ";
+#else
+ oss << "UNIT_TEST_BUILT=off, ";
+#endif
+ if constexpr (TRACK) {
+ oss << "track=on, ";
+ } else {
+ oss << "track=off, ";
+ }
+ oss << *tree;
+ logger().warn("TreeBuilder: {}, bootstrapping ...", oss.str());
+ return tree->mkfs(t);
+ }
+
+ future<> insert(Transaction& t) {
+ kv_iter = kvs.random_begin();
+ auto cursors = seastar::make_lw_shared<std::vector<Btree::Cursor>>();
+ logger().warn("start inserting {} kvs ...", kvs.size());
+ auto start_time = mono_clock::now();
+ return crimson::do_until([&t, this, cursors]() -> future<bool> {
+ if (kv_iter.is_end()) {
+ return ertr::make_ready_future<bool>(true);
+ }
+ auto [key, p_value] = kv_iter.get_kv();
+ logger().debug("[{}] {} -> {}", kv_iter.index(), key_hobj_t{key}, *p_value);
+ return tree->insert(t, key, *p_value
+ ).safe_then([&t, this, cursors](auto ret) {
+ auto& [cursor, success] = ret;
+ assert(success == true);
+ if constexpr (TRACK) {
+ cursors->emplace_back(cursor);
+ }
+#ifndef NDEBUG
+ auto [key, p_value] = kv_iter.get_kv();
+ Onodes::validate_cursor(cursor, key, *p_value);
+ return tree->lower_bound(t, key).safe_then([this, cursor](auto cursor_) {
+ auto [key, p_value] = kv_iter.get_kv();
+ ceph_assert(cursor_.get_ghobj() == key);
+ ceph_assert(cursor_.value() == cursor.value());
+ ++kv_iter;
+ return ertr::make_ready_future<bool>(false);
+ });
+#else
+ ++kv_iter;
+ return ertr::make_ready_future<bool>(false);
+#endif
+ });
+ }).safe_then([&t, this, start_time, cursors] {
+ std::chrono::duration<double> duration = mono_clock::now() - start_time;
+ logger().warn("Insert done! {}s", duration.count());
+ if (!cursors->empty()) {
+ logger().info("Verifing tracked cursors ...");
+ kv_iter = kvs.random_begin();
+ return seastar::do_with(
+ cursors->begin(), [&t, this, cursors](auto& c_iter) {
+ return crimson::do_until([&t, this, &c_iter, cursors]() -> future<bool> {
+ if (kv_iter.is_end()) {
+ logger().info("Verify done!");
+ return ertr::make_ready_future<bool>(true);
+ }
+ assert(c_iter != cursors->end());
+ auto [k, v] = kv_iter.get_kv();
+ // validate values in tree keep intact
+ return tree->lower_bound(t, k).safe_then([this, &c_iter](auto cursor) {
+ auto [k, v] = kv_iter.get_kv();
+ Onodes::validate_cursor(cursor, k, *v);
+ // validate values in cursors keep intact
+ Onodes::validate_cursor(*c_iter, k, *v);
+ ++kv_iter;
+ ++c_iter;
+ return ertr::make_ready_future<bool>(false);
+ });
+ });
+ });
+ } else {
+ return ertr::now();
+ }
+ });
+ }
+
+ future<> get_stats(Transaction& t) {
+ return tree->get_stats_slow(t
+ ).safe_then([this](auto stats) {
+ logger().warn("{}", stats);
+ });
+ }
+
+ void reload(NodeExtentManagerURef&& nm) {
+ tree.emplace(std::move(nm));
+ }
+
+ future<> validate(Transaction& t) {
+ logger().info("Verifing insertion ...");
+ return seastar::do_with(
+ kvs.begin(), [&t, this] (auto& kvs_iter) {
+ return crimson::do_until([&t, this, &kvs_iter]() -> future<bool> {
+ if (kvs_iter.is_end()) {
+ logger().info("Verify done!");
+ return ertr::make_ready_future<bool>(true);
+ }
+ auto [k, v] = kvs_iter.get_kv();
+ return tree->lower_bound(t, k
+ ).safe_then([&kvs_iter, k=k, v=v] (auto cursor) {
+ Onodes::validate_cursor(cursor, k, *v);
+ ++kvs_iter;
+ return ertr::make_ready_future<bool>(false);
+ });
+ });
+ });
+ }
+
+ private:
+ static seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+
+ KVPool& kvs;
+ std::optional<Btree> tree;
+ KVPool::iterator_t kv_iter;
+};
+
+}