Diffstat (limited to 'src/crimson/os/seastore/onode_manager/staged-fltree/stages')
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc    165
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h     180
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc              32
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h              846
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc             318
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h              226
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc       96
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h       366
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h                  2186
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h             411
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc        208
-rw-r--r--  src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h         341
12 files changed, 5375 insertions, 0 deletions
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc
new file mode 100644
index 000000000..443c6cabd
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc
@@ -0,0 +1,165 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "item_iterator_stage.h"
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h"
+
+namespace crimson::os::seastore::onode {
+
+#define ITER_T item_iterator_t<NODE_TYPE>
+#define ITER_INST(NT) item_iterator_t<NT>
+
+template <node_type_t NODE_TYPE>
+template <KeyT KT>
+memory_range_t ITER_T::insert_prefix(
+ NodeExtentMutable& mut, const ITER_T& iter, const full_key_t<KT>& key,
+ bool is_end, node_offset_t size, const char* p_left_bound) {
+ // 1. insert range
+ char* p_insert;
+ if (is_end) {
+ assert(!iter.has_next());
+ p_insert = const_cast<char*>(iter.p_start());
+ } else {
+ p_insert = const_cast<char*>(iter.p_end());
+ }
+ char* p_insert_front = p_insert - size;
+
+ // 2. shift memory
+ const char* p_shift_start = p_left_bound;
+ const char* p_shift_end = p_insert;
+ mut.shift_absolute(p_shift_start,
+ p_shift_end - p_shift_start,
+ -(int)size);
+
+ // 3. append header
+ p_insert -= sizeof(node_offset_t);
+ node_offset_t back_offset = (p_insert - p_insert_front);
+ mut.copy_in_absolute(p_insert, back_offset);
+ ns_oid_view_t::append<KT>(mut, key, p_insert);
+
+ return {p_insert_front, p_insert};
+}
+#define IP_TEMPLATE(NT, KT) \
+ template memory_range_t ITER_INST(NT)::insert_prefix<KT>( \
+ NodeExtentMutable&, const ITER_INST(NT)&, const full_key_t<KT>&, \
+ bool, node_offset_t, const char*)
+IP_TEMPLATE(node_type_t::LEAF, KeyT::VIEW);
+IP_TEMPLATE(node_type_t::INTERNAL, KeyT::VIEW);
+IP_TEMPLATE(node_type_t::LEAF, KeyT::HOBJ);
+IP_TEMPLATE(node_type_t::INTERNAL, KeyT::HOBJ);
+
+template <node_type_t NODE_TYPE>
+void ITER_T::update_size(
+ NodeExtentMutable& mut, const ITER_T& iter, int change) {
+ node_offset_t offset = iter.get_back_offset();
+ int new_size = change + offset;
+ assert(new_size > 0 && new_size < NODE_BLOCK_SIZE);
+ mut.copy_in_absolute(
+ (void*)iter.get_item_range().p_end, node_offset_t(new_size));
+}
+
+template <node_type_t NODE_TYPE>
+node_offset_t ITER_T::trim_until(NodeExtentMutable&, const ITER_T& iter) {
+ assert(iter.index() != 0);
+ size_t ret = iter.p_end() - iter.p_items_start;
+ assert(ret < NODE_BLOCK_SIZE);
+ return ret;
+}
+
+template <node_type_t NODE_TYPE>
+node_offset_t ITER_T::trim_at(
+ NodeExtentMutable& mut, const ITER_T& iter, node_offset_t trimmed) {
+ size_t trim_size = iter.p_start() - iter.p_items_start + trimmed;
+ assert(trim_size < NODE_BLOCK_SIZE);
+ assert(iter.get_back_offset() > trimmed);
+ node_offset_t new_offset = iter.get_back_offset() - trimmed;
+ mut.copy_in_absolute((void*)iter.item_range.p_end, new_offset);
+ return trim_size;
+}
+
+#define ITER_TEMPLATE(NT) template class ITER_INST(NT)
+ITER_TEMPLATE(node_type_t::LEAF);
+ITER_TEMPLATE(node_type_t::INTERNAL);
+
+#define APPEND_T ITER_T::Appender<KT>
+
+template <node_type_t NODE_TYPE>
+template <KeyT KT>
+bool APPEND_T::append(const ITER_T& src, index_t& items) {
+ auto p_end = src.p_end();
+ bool append_till_end = false;
+ if (is_valid_index(items)) {
+ for (auto i = 1u; i <= items; ++i) {
+ if (!src.has_next()) {
+ assert(i == items);
+ append_till_end = true;
+ break;
+ }
+ ++src;
+ }
+ } else {
+ if (items == INDEX_END) {
+ append_till_end = true;
+ } else {
+ assert(items == INDEX_LAST);
+ }
+ items = 0;
+ while (src.has_next()) {
+ ++src;
+ ++items;
+ }
+ if (append_till_end) {
+ ++items;
+ }
+ }
+
+ const char* p_start;
+ if (append_till_end) {
+ p_start = src.p_start();
+ } else {
+ p_start = src.p_end();
+ }
+ assert(p_end >= p_start);
+ size_t append_size = p_end - p_start;
+ p_append -= append_size;
+ p_mut->copy_in_absolute(p_append, p_start, append_size);
+ return append_till_end;
+}
+
+template <node_type_t NODE_TYPE>
+template <KeyT KT>
+std::tuple<NodeExtentMutable*, char*>
+APPEND_T::open_nxt(const key_get_type& partial_key) {
+ p_append -= sizeof(node_offset_t);
+ p_offset_while_open = p_append;
+ ns_oid_view_t::append(*p_mut, partial_key, p_append);
+ return {p_mut, p_append};
+}
+
+template <node_type_t NODE_TYPE>
+template <KeyT KT>
+std::tuple<NodeExtentMutable*, char*>
+APPEND_T::open_nxt(const full_key_t<KT>& key) {
+ p_append -= sizeof(node_offset_t);
+ p_offset_while_open = p_append;
+ ns_oid_view_t::append<KT>(*p_mut, key, p_append);
+ return {p_mut, p_append};
+}
+
+template <node_type_t NODE_TYPE>
+template <KeyT KT>
+void APPEND_T::wrap_nxt(char* _p_append) {
+ assert(_p_append < p_append);
+ p_mut->copy_in_absolute(
+ p_offset_while_open, node_offset_t(p_offset_while_open - _p_append));
+ p_append = _p_append;
+}
+
+#define APPEND_TEMPLATE(NT, KT) template class ITER_INST(NT)::Appender<KT>
+APPEND_TEMPLATE(node_type_t::LEAF, KeyT::VIEW);
+APPEND_TEMPLATE(node_type_t::INTERNAL, KeyT::VIEW);
+APPEND_TEMPLATE(node_type_t::LEAF, KeyT::HOBJ);
+APPEND_TEMPLATE(node_type_t::INTERNAL, KeyT::HOBJ);
+
+}
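
For orientation, insert_prefix() above makes room for a new item by shifting the bytes between p_left_bound and the insert position to the left by `size`, then stamping the new item's back_offset and appending the ns/oid prefix backwards into the freed space. Below is a minimal standalone sketch of that move, not part of the patch, assuming shift_absolute() behaves like std::memmove and copy_in_absolute() like std::memcpy over the extent's buffer; make_room_and_stamp, left_bound and insert_at are illustrative names.

#include <cassert>
#include <cstdint>
#include <cstring>

using node_offset_t = uint16_t;

// Model of the memory move in item_iterator_t::insert_prefix(): free `size`
// bytes ending at `insert_at` by shifting [left_bound, insert_at) left, then
// write the new item's back_offset just below `insert_at`. The ns/oid prefix
// is subsequently appended backwards from the returned position.
inline char* make_room_and_stamp(char* left_bound, char* insert_at,
                                 node_offset_t size) {
  assert(left_bound <= insert_at);
  std::memmove(left_bound - size, left_bound,
               static_cast<size_t>(insert_at - left_bound));   // shift_absolute(...)
  char* insert_front = insert_at - size;
  char* p = insert_at - sizeof(node_offset_t);
  node_offset_t back_offset = static_cast<node_offset_t>(p - insert_front);
  std::memcpy(p, &back_offset, sizeof(back_offset));           // copy_in_absolute(...)
  return p;  // corresponds to {insert_front, p} returned as memory_range_t
}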
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h
new file mode 100644
index 000000000..bb68eec8f
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h
@@ -0,0 +1,180 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h"
+#include "key_layout.h"
+#include "stage_types.h"
+
+namespace crimson::os::seastore::onode {
+
+class NodeExtentMutable;
+
+/**
+ * item_iterator_t
+ *
+ * The STAGE_STRING implementation for nodes N0/N1. It implements the staged
+ * contract as an iterative container to resolve crush hash conflicts.
+ *
+ * The layout of the container indexing the ns/oid strings of n items:
+ *
+ * # <--------- container range ---------> #
+ * #<~># items [i+1, n) #
+ * # # items [0, i) #<~>#
+ * # # <------ item i -------------> # #
+ * # # <--- item_range ---> | # #
+ * # # | # #
+ * # # next-stage | ns-oid | back_ # #
+ * # # container | strings | offset # #
+ * #...# range | | #...#
+ * ^ ^ | ^
+ * | | | |
+ * | +---------------------------+ |
+ * + p_items_start p_items_end +
+ */
+template <node_type_t NODE_TYPE>
+class item_iterator_t {
+ using value_t = value_type_t<NODE_TYPE>;
+ public:
+ item_iterator_t(const memory_range_t& range)
+ : p_items_start(range.p_start), p_items_end(range.p_end) {
+ assert(p_items_start < p_items_end);
+ next_item_range(p_items_end);
+ }
+
+ const char* p_start() const { return item_range.p_start; }
+ const char* p_end() const { return item_range.p_end + sizeof(node_offset_t); }
+ const memory_range_t& get_item_range() const { return item_range; }
+ node_offset_t get_back_offset() const { return back_offset; }
+
+ // container type system
+ using key_get_type = const ns_oid_view_t&;
+ static constexpr auto CONTAINER_TYPE = ContainerType::ITERATIVE;
+ index_t index() const { return _index; }
+ key_get_type get_key() const {
+ if (!key.has_value()) {
+ key = ns_oid_view_t(item_range.p_end);
+ assert(item_range.p_start < (*key).p_start());
+ }
+ return *key;
+ }
+ node_offset_t size() const {
+ size_t ret = item_range.p_end - item_range.p_start + sizeof(node_offset_t);
+ assert(ret < NODE_BLOCK_SIZE);
+ return ret;
+ };
+ node_offset_t size_to_nxt() const {
+ size_t ret = get_key().size() + sizeof(node_offset_t);
+ assert(ret < NODE_BLOCK_SIZE);
+ return ret;
+ }
+ node_offset_t size_overhead() const {
+ return sizeof(node_offset_t) + get_key().size_overhead();
+ }
+ memory_range_t get_nxt_container() const {
+ return {item_range.p_start, get_key().p_start()};
+ }
+ bool has_next() const {
+ assert(p_items_start <= item_range.p_start);
+ return p_items_start < item_range.p_start;
+ }
+ const item_iterator_t<NODE_TYPE>& operator++() const {
+ assert(has_next());
+ next_item_range(item_range.p_start);
+ key.reset();
+ ++_index;
+ return *this;
+ }
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const {
+ int start_offset = p_items_start - p_node_start;
+ int end_offset = p_items_end - p_node_start;
+ assert(start_offset > 0 && start_offset < NODE_BLOCK_SIZE);
+ assert(end_offset > 0 && end_offset <= NODE_BLOCK_SIZE);
+ ceph::encode(static_cast<node_offset_t>(start_offset), encoded);
+ ceph::encode(static_cast<node_offset_t>(end_offset), encoded);
+ ceph::encode(_index, encoded);
+ }
+
+ static item_iterator_t decode(const char* p_node_start,
+ ceph::bufferlist::const_iterator& delta) {
+ node_offset_t start_offset;
+ ceph::decode(start_offset, delta);
+ node_offset_t end_offset;
+ ceph::decode(end_offset, delta);
+ assert(start_offset < end_offset);
+ assert(end_offset <= NODE_BLOCK_SIZE);
+ index_t index;
+ ceph::decode(index, delta);
+
+ item_iterator_t ret({p_node_start + start_offset,
+ p_node_start + end_offset});
+ while (index > 0) {
+ ++ret;
+ --index;
+ }
+ return ret;
+ }
+
+ static node_offset_t header_size() { return 0u; }
+
+ template <KeyT KT>
+ static node_offset_t estimate_insert(
+ const full_key_t<KT>& key, const value_t&) {
+ return ns_oid_view_t::estimate_size<KT>(key) + sizeof(node_offset_t);
+ }
+
+ template <KeyT KT>
+ static memory_range_t insert_prefix(
+ NodeExtentMutable& mut, const item_iterator_t<NODE_TYPE>& iter,
+ const full_key_t<KT>& key, bool is_end,
+ node_offset_t size, const char* p_left_bound);
+
+ static void update_size(
+ NodeExtentMutable& mut, const item_iterator_t<NODE_TYPE>& iter, int change);
+
+ static node_offset_t trim_until(NodeExtentMutable&, const item_iterator_t<NODE_TYPE>&);
+ static node_offset_t trim_at(
+ NodeExtentMutable&, const item_iterator_t<NODE_TYPE>&, node_offset_t trimmed);
+
+ template <KeyT KT>
+ class Appender;
+
+ private:
+ void next_item_range(const char* p_end) const {
+ auto p_item_end = p_end - sizeof(node_offset_t);
+ assert(p_items_start < p_item_end);
+ back_offset = reinterpret_cast<const node_offset_packed_t*>(p_item_end)->value;
+ assert(back_offset);
+ const char* p_item_start = p_item_end - back_offset;
+ assert(p_items_start <= p_item_start);
+ item_range = {p_item_start, p_item_end};
+ }
+
+ const char* p_items_start;
+ const char* p_items_end;
+ mutable memory_range_t item_range;
+ mutable node_offset_t back_offset;
+ mutable std::optional<ns_oid_view_t> key;
+ mutable index_t _index = 0u;
+};
+
+template <node_type_t NODE_TYPE>
+template <KeyT KT>
+class item_iterator_t<NODE_TYPE>::Appender {
+ public:
+ Appender(NodeExtentMutable* p_mut, char* p_append)
+ : p_mut{p_mut}, p_append{p_append} {}
+ bool append(const item_iterator_t<NODE_TYPE>& src, index_t& items);
+ char* wrap() { return p_append; }
+ std::tuple<NodeExtentMutable*, char*> open_nxt(const key_get_type&);
+ std::tuple<NodeExtentMutable*, char*> open_nxt(const full_key_t<KT>&);
+ void wrap_nxt(char* _p_append);
+
+ private:
+ NodeExtentMutable* p_mut;
+ char* p_append;
+ char* p_offset_while_open;
+};
+
+}
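
To make the layout above concrete: every item ends with a node_offset_t back_offset, and the iterator walks items from p_items_end back toward p_items_start. The following standalone sketch, not part of the patch, mirrors the decoding done by next_item_range(), using plain memcpy in place of the node_offset_packed_t access; decode_item_range is an illustrative name.

#include <cassert>
#include <cstdint>
#include <cstring>
#include <utility>

using node_offset_t = uint16_t;

// Given the exclusive end of an item (`p_end`), recover [p_item_start, p_item_end):
// the two bytes just below `p_end` hold back_offset, the distance back to the
// item's start. Mirrors item_iterator_t::next_item_range().
inline std::pair<const char*, const char*>
decode_item_range(const char* p_items_start, const char* p_end) {
  const char* p_item_end = p_end - sizeof(node_offset_t);
  assert(p_items_start < p_item_end);
  node_offset_t back_offset;
  std::memcpy(&back_offset, p_item_end, sizeof(back_offset));
  assert(back_offset != 0);
  const char* p_item_start = p_item_end - back_offset;
  assert(p_items_start <= p_item_start);
  return {p_item_start, p_item_end};
}

// Walking all items: start from p_items_end; after visiting an item whose
// range is [start, end), the next (older) item ends at `start`.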
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc
new file mode 100644
index 000000000..d60bb8d09
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc
@@ -0,0 +1,32 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "key_layout.h"
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h"
+
+namespace crimson::os::seastore::onode {
+
+void string_key_view_t::append_str(
+ NodeExtentMutable& mut, std::string_view str, char*& p_append) {
+ assert(is_valid_size(str.length()));
+ p_append -= sizeof(string_size_t);
+ string_size_t len = str.length();
+ mut.copy_in_absolute(p_append, len);
+ p_append -= len;
+ mut.copy_in_absolute(p_append, str.data(), len);
+}
+
+void string_key_view_t::append_dedup(
+ NodeExtentMutable& mut, const Type& dedup_type, char*& p_append) {
+ p_append -= sizeof(string_size_t);
+ if (dedup_type == Type::MIN) {
+ mut.copy_in_absolute(p_append, MIN);
+ } else if (dedup_type == Type::MAX) {
+ mut.copy_in_absolute(p_append, MAX);
+ } else {
+ ceph_abort("impossible path");
+ }
+}
+
+}
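
append_str() above writes backwards into the extent: first the 2-byte length just below p_append, then the characters below that, mirroring test_append_str() in key_layout.h. Here is a minimal round-trip sketch, not part of the patch, with plain memcpy standing in for NodeExtentMutable::copy_in_absolute; append_str_model and read_str_model are illustrative names.

#include <cassert>
#include <cstdint>
#include <cstring>
#include <string_view>

using string_size_t = uint16_t;

// Append `str` backwards at `p_append` (which moves down), as append_str() does.
inline void append_str_model(std::string_view str, char*& p_append) {
  string_size_t len = static_cast<string_size_t>(str.length());
  p_append -= sizeof(string_size_t);
  std::memcpy(p_append, &len, sizeof(len));
  p_append -= len;
  std::memcpy(p_append, str.data(), len);
}

// Read it back the way string_key_view_t(const char* p_end) does.
inline std::string_view read_str_model(const char* p_end) {
  const char* p_length = p_end - sizeof(string_size_t);
  string_size_t len;
  std::memcpy(&len, p_length, sizeof(len));
  return {p_length - len, len};
}

// Usage: char buf[64]; char* p = buf + sizeof(buf);
// append_str_model("oid", p);
// assert(read_str_model(buf + sizeof(buf)) == "oid");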
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h
new file mode 100644
index 000000000..cc1f546c1
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h
@@ -0,0 +1,846 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cassert>
+#include <limits>
+#include <optional>
+#include <ostream>
+
+#include "common/hobject.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/fwd.h"
+
+namespace crimson::os::seastore::onode {
+
+using shard_t = int8_t;
+using pool_t = int64_t;
+using crush_hash_t = uint32_t;
+using snap_t = uint64_t;
+using gen_t = uint64_t;
+static_assert(sizeof(shard_t) == sizeof(ghobject_t().shard_id.id));
+static_assert(sizeof(pool_t) == sizeof(ghobject_t().hobj.pool));
+static_assert(sizeof(crush_hash_t) == sizeof(ghobject_t().hobj.get_hash()));
+static_assert(sizeof(snap_t) == sizeof(ghobject_t().hobj.snap.val));
+static_assert(sizeof(gen_t) == sizeof(ghobject_t().generation));
+
+class NodeExtentMutable;
+class key_view_t;
+class key_hobj_t;
+enum class KeyT { VIEW, HOBJ };
+template <KeyT> struct _full_key_type;
+template<> struct _full_key_type<KeyT::VIEW> { using type = key_view_t; };
+template<> struct _full_key_type<KeyT::HOBJ> { using type = key_hobj_t; };
+template <KeyT type>
+using full_key_t = typename _full_key_type<type>::type;
+
+struct node_offset_packed_t {
+ node_offset_t value;
+} __attribute__((packed));
+
+// TODO: consider alignments
+struct shard_pool_t {
+ bool operator==(const shard_pool_t& x) const {
+ return (shard == x.shard && pool == x.pool);
+ }
+ bool operator!=(const shard_pool_t& x) const { return !(*this == x); }
+
+ template <KeyT KT>
+ static shard_pool_t from_key(const full_key_t<KT>& key);
+
+ shard_t shard;
+ pool_t pool;
+} __attribute__((packed));
+inline std::ostream& operator<<(std::ostream& os, const shard_pool_t& sp) {
+ return os << (unsigned)sp.shard << "," << sp.pool;
+}
+inline MatchKindCMP compare_to(const shard_pool_t& l, const shard_pool_t& r) {
+ auto ret = toMatchKindCMP(l.shard, r.shard);
+ if (ret != MatchKindCMP::EQ)
+ return ret;
+ return toMatchKindCMP(l.pool, r.pool);
+}
+
+struct crush_t {
+ bool operator==(const crush_t& x) const { return crush == x.crush; }
+ bool operator!=(const crush_t& x) const { return !(*this == x); }
+
+ template <KeyT KT>
+ static crush_t from_key(const full_key_t<KT>& key);
+
+ crush_hash_t crush;
+} __attribute__((packed));
+inline std::ostream& operator<<(std::ostream& os, const crush_t& c) {
+ return os << c.crush;
+}
+inline MatchKindCMP compare_to(const crush_t& l, const crush_t& r) {
+ return toMatchKindCMP(l.crush, r.crush);
+}
+
+struct shard_pool_crush_t {
+ bool operator==(const shard_pool_crush_t& x) const {
+ return (shard_pool == x.shard_pool && crush == x.crush);
+ }
+ bool operator!=(const shard_pool_crush_t& x) const { return !(*this == x); }
+
+ template <KeyT KT>
+ static shard_pool_crush_t from_key(const full_key_t<KT>& key);
+
+ shard_pool_t shard_pool;
+ crush_t crush;
+} __attribute__((packed));
+inline std::ostream& operator<<(std::ostream& os, const shard_pool_crush_t& spc) {
+ return os << spc.shard_pool << "," << spc.crush;
+}
+inline MatchKindCMP compare_to(
+ const shard_pool_crush_t& l, const shard_pool_crush_t& r) {
+ auto ret = compare_to(l.shard_pool, r.shard_pool);
+ if (ret != MatchKindCMP::EQ)
+ return ret;
+ return compare_to(l.crush, r.crush);
+}
+
+struct snap_gen_t {
+ bool operator==(const snap_gen_t& x) const {
+ return (snap == x.snap && gen == x.gen);
+ }
+ bool operator!=(const snap_gen_t& x) const { return !(*this == x); }
+
+ template <KeyT KT>
+ static snap_gen_t from_key(const full_key_t<KT>& key);
+
+ snap_t snap;
+ gen_t gen;
+} __attribute__((packed));
+inline std::ostream& operator<<(std::ostream& os, const snap_gen_t& sg) {
+ return os << sg.snap << "," << sg.gen;
+}
+inline MatchKindCMP compare_to(const snap_gen_t& l, const snap_gen_t& r) {
+ auto ret = toMatchKindCMP(l.snap, r.snap);
+ if (ret != MatchKindCMP::EQ)
+ return ret;
+ return toMatchKindCMP(l.gen, r.gen);
+}
+
+/**
+ * string_key_view_t
+ *
+ * The layout to store a char array as an oid or ns string, which may be
+ * compressed.
+ *
+ * If compressed, the physical block only stores a string_size_t length
+ * field, with value 0 denoting Type::MIN and value max() denoting
+ * Type::MAX.
+ *
+ * If not compressed (Type::STR), the physical block stores the char array and
+ * a valid string_size_t value.
+ */
+struct string_key_view_t {
+ enum class Type {MIN, STR, MAX};
+ // presumably the maximum string length is 2KiB
+ using string_size_t = uint16_t;
+ static constexpr auto MAX = std::numeric_limits<string_size_t>::max();
+ static constexpr auto MIN = string_size_t(0u);
+ static auto is_valid_size(size_t size) {
+ return (size > MIN && size < MAX);
+ }
+
+ string_key_view_t(const char* p_end) {
+ p_length = p_end - sizeof(string_size_t);
+ std::memcpy(&length, p_length, sizeof(string_size_t));
+ if (is_valid_size(length)) {
+ auto _p_key = p_length - length;
+ p_key = static_cast<const char*>(_p_key);
+ } else {
+ assert(length == MAX || length == MIN);
+ p_key = nullptr;
+ }
+ }
+ Type type() const {
+ if (length == MIN) {
+ return Type::MIN;
+ } else if (length == MAX) {
+ return Type::MAX;
+ } else {
+ assert(is_valid_size(length));
+ return Type::STR;
+ }
+ }
+ const char* p_start() const {
+ if (p_key) {
+ return p_key;
+ } else {
+ return p_length;
+ }
+ }
+ const char* p_next_end() const {
+ if (p_key) {
+ return p_start();
+ } else {
+ return p_length + sizeof(string_size_t);
+ }
+ }
+ node_offset_t size() const {
+ size_t ret = length + sizeof(string_size_t);
+ assert(ret < NODE_BLOCK_SIZE);
+ return ret;
+ }
+ node_offset_t size_logical() const {
+ assert(type() == Type::STR);
+ assert(is_valid_size(length));
+ return length;
+ }
+ node_offset_t size_overhead() const {
+ assert(type() == Type::STR);
+ return sizeof(string_size_t);
+ }
+
+ std::string_view to_string_view() const {
+ assert(type() == Type::STR);
+ assert(is_valid_size(length));
+ return {p_key, length};
+ }
+ bool operator==(const string_key_view_t& x) const {
+ if (type() == x.type() && type() != Type::STR)
+ return true;
+ if (type() != x.type())
+ return false;
+ if (length != x.length)
+ return false;
+ return (memcmp(p_key, x.p_key, length) == 0);
+ }
+ bool operator!=(const string_key_view_t& x) const { return !(*this == x); }
+
+ static void append_str(
+ NodeExtentMutable&, std::string_view, char*& p_append);
+
+ static void test_append_str(std::string_view str, char*& p_append) {
+ assert(is_valid_size(str.length()));
+ p_append -= sizeof(string_size_t);
+ string_size_t len = str.length();
+ std::memcpy(p_append, &len, sizeof(string_size_t));
+ p_append -= len;
+ std::memcpy(p_append, str.data(), len);
+ }
+
+ static void append_dedup(
+ NodeExtentMutable&, const Type& dedup_type, char*& p_append);
+
+ static void test_append_dedup(const Type& dedup_type, char*& p_append) {
+ p_append -= sizeof(string_size_t);
+ string_size_t len;
+ if (dedup_type == Type::MIN) {
+ len = MIN;
+ } else if (dedup_type == Type::MAX) {
+ len = MAX;
+ } else {
+ ceph_abort("impossible path");
+ }
+ std::memcpy(p_append, &len, sizeof(string_size_t));
+ }
+
+ const char* p_key;
+ const char* p_length;
+ // TODO: remove if p_length is aligned
+ string_size_t length;
+};
+
+/**
+ * string_view_masked_t
+ *
+ * A common class that hides the underlying string representation, whether it
+ * is a string_key_view_t (maybe compressed), a string/string_view, or a
+ * compressed string, and provides consistent compare, print, convert and
+ * append operations.
+ */
+class string_view_masked_t {
+ public:
+ using string_size_t = string_key_view_t::string_size_t;
+ using Type = string_key_view_t::Type;
+ explicit string_view_masked_t(const string_key_view_t& index)
+ : type{index.type()} {
+ if (type == Type::STR) {
+ view = index.to_string_view();
+ }
+ }
+ explicit string_view_masked_t(std::string_view str)
+ : type{Type::STR}, view{str} {
+ assert(string_key_view_t::is_valid_size(view.size()));
+ }
+
+ Type get_type() const { return type; }
+ std::string_view to_string_view() const {
+ assert(get_type() == Type::STR);
+ return view;
+ }
+ string_size_t size() const {
+ assert(get_type() == Type::STR);
+ assert(string_key_view_t::is_valid_size(view.size()));
+ return view.size();
+ }
+ bool operator==(const string_view_masked_t& x) const {
+ if (get_type() == x.get_type() && get_type() != Type::STR)
+ return true;
+ if (get_type() != x.get_type())
+ return false;
+ if (size() != x.size())
+ return false;
+ return (memcmp(view.data(), x.view.data(), size()) == 0);
+ }
+ bool operator!=(const string_view_masked_t& x) const { return !(*this == x); }
+ void encode(ceph::bufferlist& bl) const {
+ if (get_type() == Type::MIN) {
+ ceph::encode(string_key_view_t::MIN, bl);
+ } else if (get_type() == Type::MAX) {
+ ceph::encode(string_key_view_t::MAX, bl);
+ } else {
+ ceph::encode(size(), bl);
+ ceph::encode_nohead(view, bl);
+ }
+ }
+ static auto min() { return string_view_masked_t{Type::MIN}; }
+ static auto max() { return string_view_masked_t{Type::MAX}; }
+ static string_view_masked_t decode(
+ std::string& str_storage, ceph::bufferlist::const_iterator& delta) {
+ string_size_t size;
+ ceph::decode(size, delta);
+ if (size == string_key_view_t::MIN) {
+ return min();
+ } else if (size == string_key_view_t::MAX) {
+ return max();
+ } else {
+ ceph::decode_nohead(size, str_storage, delta);
+ return string_view_masked_t(str_storage);
+ }
+ }
+
+ private:
+ explicit string_view_masked_t(Type type)
+ : type{type} {}
+
+ Type type;
+ std::string_view view;
+};
+inline MatchKindCMP compare_to(const string_view_masked_t& l, const string_view_masked_t& r) {
+ using Type = string_view_masked_t::Type;
+ auto l_type = l.get_type();
+ auto r_type = r.get_type();
+ if (l_type == Type::STR && r_type == Type::STR) {
+ assert(l.size() && r.size());
+ return toMatchKindCMP(l.to_string_view(), r.to_string_view());
+ } else if (l_type == r_type) {
+ return MatchKindCMP::EQ;
+ } else if (l_type == Type::MIN || r_type == Type::MAX) {
+ return MatchKindCMP::LT;
+ } else { // l_type == Type::MAX || r_type == Type::MIN
+ return MatchKindCMP::GT;
+ }
+}
+inline MatchKindCMP compare_to(std::string_view l, const string_view_masked_t& r) {
+ using Type = string_view_masked_t::Type;
+ assert(l.length());
+ auto r_type = r.get_type();
+ if (r_type == Type::MIN) {
+ return MatchKindCMP::GT;
+ } else if (r_type == Type::MAX) {
+ return MatchKindCMP::LT;
+ } else { // r_type == Type::STR
+ assert(r.size());
+ return toMatchKindCMP(l, r.to_string_view());
+ }
+}
+inline MatchKindCMP compare_to(const string_view_masked_t& l, std::string_view r) {
+ return reverse(compare_to(r, l));
+}
+inline std::ostream& operator<<(std::ostream& os, const string_view_masked_t& masked) {
+ using Type = string_view_masked_t::Type;
+ auto type = masked.get_type();
+ if (type == Type::MIN) {
+ return os << "MIN";
+ } else if (type == Type::MAX) {
+ return os << "MAX";
+ } else { // type == Type::STR
+ auto view = masked.to_string_view();
+ if (view.length() <= 12) {
+ os << "\"" << view << "\"";
+ } else {
+ os << "\"" << std::string_view(view.data(), 4) << ".."
+ << std::string_view(view.data() + view.length() - 2, 2)
+ << "/" << view.length() << "B\"";
+ }
+ return os;
+ }
+}
+
+struct ns_oid_view_t {
+ using string_size_t = string_key_view_t::string_size_t;
+ using Type = string_key_view_t::Type;
+
+ ns_oid_view_t(const char* p_end) : nspace(p_end), oid(nspace.p_next_end()) {}
+ Type type() const { return oid.type(); }
+ const char* p_start() const { return oid.p_start(); }
+ node_offset_t size() const {
+ if (type() == Type::STR) {
+ size_t ret = nspace.size() + oid.size();
+ assert(ret < NODE_BLOCK_SIZE);
+ return ret;
+ } else {
+ return sizeof(string_size_t);
+ }
+ }
+ node_offset_t size_logical() const {
+ assert(type() == Type::STR);
+ return nspace.size_logical() + oid.size_logical();
+ }
+ node_offset_t size_overhead() const {
+ assert(type() == Type::STR);
+ return nspace.size_overhead() + oid.size_overhead();
+ }
+ bool operator==(const ns_oid_view_t& x) const {
+ return (string_view_masked_t{nspace} == string_view_masked_t{x.nspace} &&
+ string_view_masked_t{oid} == string_view_masked_t{x.oid});
+ }
+ bool operator!=(const ns_oid_view_t& x) const { return !(*this == x); }
+
+ template <KeyT KT>
+ static node_offset_t estimate_size(const full_key_t<KT>& key);
+
+ template <KeyT KT>
+ static void append(NodeExtentMutable&,
+ const full_key_t<KT>& key,
+ char*& p_append);
+
+ static void append(NodeExtentMutable& mut,
+ const ns_oid_view_t& view,
+ char*& p_append) {
+ if (view.type() == Type::STR) {
+ string_key_view_t::append_str(mut, view.nspace.to_string_view(), p_append);
+ string_key_view_t::append_str(mut, view.oid.to_string_view(), p_append);
+ } else {
+ string_key_view_t::append_dedup(mut, view.type(), p_append);
+ }
+ }
+
+ template <KeyT KT>
+ static void test_append(const full_key_t<KT>& key, char*& p_append);
+
+ string_key_view_t nspace;
+ string_key_view_t oid;
+};
+inline std::ostream& operator<<(std::ostream& os, const ns_oid_view_t& ns_oid) {
+ return os << string_view_masked_t{ns_oid.nspace} << ","
+ << string_view_masked_t{ns_oid.oid};
+}
+inline MatchKindCMP compare_to(const ns_oid_view_t& l, const ns_oid_view_t& r) {
+ auto ret = compare_to(string_view_masked_t{l.nspace},
+ string_view_masked_t{r.nspace});
+ if (ret != MatchKindCMP::EQ)
+ return ret;
+ return compare_to(string_view_masked_t{l.oid},
+ string_view_masked_t{r.oid});
+}
+
+/**
+ * key_hobj_t
+ *
+ * A specialized implementation of a full_key_t storing a ghobject_t passed
+ * in from the user.
+ */
+class key_hobj_t {
+ public:
+ explicit key_hobj_t(const ghobject_t& ghobj) : ghobj{ghobj} {}
+ /*
+ * common interfaces as a full_key_t
+ */
+ shard_t shard() const {
+ return ghobj.shard_id;
+ }
+ pool_t pool() const {
+ return ghobj.hobj.pool;
+ }
+ crush_hash_t crush() const {
+ return ghobj.hobj.get_hash();
+ }
+ std::string_view nspace() const {
+ // TODO(cross-node string dedup)
+ return ghobj.hobj.nspace;
+ }
+ string_view_masked_t nspace_masked() const {
+ // TODO(cross-node string dedup)
+ return string_view_masked_t{nspace()};
+ }
+ std::string_view oid() const {
+ // TODO(cross-node string dedup)
+ return ghobj.hobj.oid.name;
+ }
+ string_view_masked_t oid_masked() const {
+ // TODO(cross-node string dedup)
+ return string_view_masked_t{oid()};
+ }
+ ns_oid_view_t::Type dedup_type() const {
+ return _dedup_type;
+ }
+ snap_t snap() const {
+ return ghobj.hobj.snap;
+ }
+ gen_t gen() const {
+ return ghobj.generation;
+ }
+
+ bool operator==(const full_key_t<KeyT::VIEW>& o) const;
+ bool operator==(const full_key_t<KeyT::HOBJ>& o) const;
+ bool operator!=(const full_key_t<KeyT::VIEW>& o) const {
+ return !operator==(o);
+ }
+ bool operator!=(const full_key_t<KeyT::HOBJ>& o) const {
+ return !operator==(o);
+ }
+
+ std::ostream& dump(std::ostream& os) const {
+ os << "key_hobj(" << (unsigned)shard() << ","
+ << pool() << "," << crush() << "; "
+ << string_view_masked_t{nspace()} << ","
+ << string_view_masked_t{oid()} << "; "
+ << snap() << "," << gen() << ")";
+ return os;
+ }
+
+ static key_hobj_t decode(ceph::bufferlist::const_iterator& delta) {
+ shard_t shard;
+ ceph::decode(shard, delta);
+ pool_t pool;
+ ceph::decode(pool, delta);
+ crush_hash_t crush;
+ ceph::decode(crush, delta);
+ std::string nspace;
+ auto nspace_masked = string_view_masked_t::decode(nspace, delta);
+ // TODO(cross-node string dedup)
+ assert(nspace_masked.get_type() == string_view_masked_t::Type::STR);
+ std::string oid;
+ auto oid_masked = string_view_masked_t::decode(oid, delta);
+ // TODO(cross-node string dedup)
+ assert(oid_masked.get_type() == string_view_masked_t::Type::STR);
+ snap_t snap;
+ ceph::decode(snap, delta);
+ gen_t gen;
+ ceph::decode(gen, delta);
+ return key_hobj_t(ghobject_t(
+ shard_id_t(shard), pool, crush, nspace, oid, snap, gen));
+ }
+
+ private:
+ ns_oid_view_t::Type _dedup_type = ns_oid_view_t::Type::STR;
+ ghobject_t ghobj;
+};
+inline std::ostream& operator<<(std::ostream& os, const key_hobj_t& key) {
+ return key.dump(os);
+}
+
+/**
+ * key_view_t
+ *
+ * A specialized implementation of a full_key_t that points to the in-node
+ * locations where the components of the full key are stored.
+ */
+class key_view_t {
+ public:
+ /**
+ * common interfaces as a full_key_t
+ */
+ shard_t shard() const {
+ return shard_pool_packed().shard;
+ }
+ pool_t pool() const {
+ return shard_pool_packed().pool;
+ }
+ crush_hash_t crush() const {
+ return crush_packed().crush;
+ }
+ std::string_view nspace() const {
+ // TODO(cross-node string dedup)
+ return ns_oid_view().nspace.to_string_view();
+ }
+ string_view_masked_t nspace_masked() const {
+ // TODO(cross-node string dedup)
+ return string_view_masked_t{ns_oid_view().nspace};
+ }
+ std::string_view oid() const {
+ // TODO(cross-node string dedup)
+ return ns_oid_view().oid.to_string_view();
+ }
+ string_view_masked_t oid_masked() const {
+ // TODO(cross-node string dedup)
+ return string_view_masked_t{ns_oid_view().oid};
+ }
+ ns_oid_view_t::Type dedup_type() const {
+ return ns_oid_view().type();
+ }
+ snap_t snap() const {
+ return snap_gen_packed().snap;
+ }
+ gen_t gen() const {
+ return snap_gen_packed().gen;
+ }
+
+ bool operator==(const full_key_t<KeyT::VIEW>& o) const;
+ bool operator==(const full_key_t<KeyT::HOBJ>& o) const;
+ bool operator!=(const full_key_t<KeyT::VIEW>& o) const {
+ return !operator==(o);
+ }
+ bool operator!=(const full_key_t<KeyT::HOBJ>& o) const {
+ return !operator==(o);
+ }
+
+ /**
+ * key_view_t specific interfaces
+ */
+ bool has_shard_pool() const {
+ return p_shard_pool != nullptr;
+ }
+ bool has_crush() const {
+ return p_crush != nullptr;
+ }
+ bool has_ns_oid() const {
+ return p_ns_oid.has_value();
+ }
+ bool has_snap_gen() const {
+ return p_snap_gen != nullptr;
+ }
+
+ const shard_pool_t& shard_pool_packed() const {
+ assert(has_shard_pool());
+ return *p_shard_pool;
+ }
+ const crush_t& crush_packed() const {
+ assert(has_crush());
+ return *p_crush;
+ }
+ const ns_oid_view_t& ns_oid_view() const {
+ assert(has_ns_oid());
+ return *p_ns_oid;
+ }
+ const snap_gen_t& snap_gen_packed() const {
+ assert(has_snap_gen());
+ return *p_snap_gen;
+ }
+
+ size_t size_logical() const {
+ return sizeof(shard_t) + sizeof(pool_t) + sizeof(crush_hash_t) +
+ sizeof(snap_t) + sizeof(gen_t) + ns_oid_view().size_logical();
+ }
+
+ ghobject_t to_ghobj() const {
+ return ghobject_t(
+ shard_id_t(shard()), pool(), crush(),
+ std::string(nspace()), std::string(oid()), snap(), gen());
+ }
+
+ void replace(const crush_t& key) { p_crush = &key; }
+ void set(const crush_t& key) {
+ assert(!has_crush());
+ replace(key);
+ }
+ void replace(const shard_pool_crush_t& key) { p_shard_pool = &key.shard_pool; }
+ void set(const shard_pool_crush_t& key) {
+ set(key.crush);
+ assert(!has_shard_pool());
+ replace(key);
+ }
+ void replace(const ns_oid_view_t& key) { p_ns_oid = key; }
+ void set(const ns_oid_view_t& key) {
+ assert(!has_ns_oid());
+ replace(key);
+ }
+ void replace(const snap_gen_t& key) { p_snap_gen = &key; }
+ void set(const snap_gen_t& key) {
+ assert(!has_snap_gen());
+ replace(key);
+ }
+
+ std::ostream& dump(std::ostream& os) const {
+ os << "key_view(";
+ if (has_shard_pool()) {
+ os << (unsigned)shard() << "," << pool() << ",";
+ } else {
+ os << "X,X,";
+ }
+ if (has_crush()) {
+ os << crush() << "; ";
+ } else {
+ os << "X; ";
+ }
+ if (has_ns_oid()) {
+ os << ns_oid_view() << "; ";
+ } else {
+ os << "X,X; ";
+ }
+ if (has_snap_gen()) {
+ os << snap() << "," << gen() << ")";
+ } else {
+ os << "X,X)";
+ }
+ return os;
+ }
+
+ private:
+ const shard_pool_t* p_shard_pool = nullptr;
+ const crush_t* p_crush = nullptr;
+ std::optional<ns_oid_view_t> p_ns_oid;
+ const snap_gen_t* p_snap_gen = nullptr;
+};
+
+template <KeyT KT>
+void encode_key(const full_key_t<KT>& key, ceph::bufferlist& bl) {
+ ceph::encode(key.shard(), bl);
+ ceph::encode(key.pool(), bl);
+ ceph::encode(key.crush(), bl);
+ key.nspace_masked().encode(bl);
+ key.oid_masked().encode(bl);
+ ceph::encode(key.snap(), bl);
+ ceph::encode(key.gen(), bl);
+}
+
+inline MatchKindCMP compare_to(std::string_view l, std::string_view r) {
+ return toMatchKindCMP(l, r);
+}
+template <KeyT TypeL, KeyT TypeR>
+bool compare_full_key(const full_key_t<TypeL>& l, const full_key_t<TypeR>& r) {
+ if (l.shard() != r.shard())
+ return false;
+ if (l.pool() != r.pool())
+ return false;
+ if (l.crush() != r.crush())
+ return false;
+ if (compare_to(l.nspace(), r.nspace()) != MatchKindCMP::EQ)
+ return false;
+ if (compare_to(l.oid(), r.oid()) != MatchKindCMP::EQ)
+ return false;
+ if (l.snap() != r.snap())
+ return false;
+ if (l.gen() != r.gen())
+ return false;
+ return true;
+}
+
+inline bool key_hobj_t::operator==(const full_key_t<KeyT::VIEW>& o) const {
+ return compare_full_key<KeyT::HOBJ, KeyT::VIEW>(*this, o);
+}
+inline bool key_hobj_t::operator==(const full_key_t<KeyT::HOBJ>& o) const {
+ return compare_full_key<KeyT::HOBJ, KeyT::HOBJ>(*this, o);
+}
+inline bool key_view_t::operator==(const full_key_t<KeyT::VIEW>& o) const {
+ return compare_full_key<KeyT::VIEW, KeyT::VIEW>(*this, o);
+}
+inline bool key_view_t::operator==(const full_key_t<KeyT::HOBJ>& o) const {
+ return compare_full_key<KeyT::VIEW, KeyT::HOBJ>(*this, o);
+}
+
+inline std::ostream& operator<<(std::ostream& os, const key_view_t& key) {
+ return key.dump(os);
+}
+
+template <KeyT Type>
+MatchKindCMP compare_to(const full_key_t<Type>& key, const shard_pool_t& target) {
+ auto ret = toMatchKindCMP(key.shard(), target.shard);
+ if (ret != MatchKindCMP::EQ)
+ return ret;
+ return toMatchKindCMP(key.pool(), target.pool);
+}
+
+template <KeyT Type>
+MatchKindCMP compare_to(const full_key_t<Type>& key, const crush_t& target) {
+ return toMatchKindCMP(key.crush(), target.crush);
+}
+
+template <KeyT Type>
+MatchKindCMP compare_to(const full_key_t<Type>& key, const shard_pool_crush_t& target) {
+ auto ret = compare_to<Type>(key, target.shard_pool);
+ if (ret != MatchKindCMP::EQ)
+ return ret;
+ return compare_to<Type>(key, target.crush);
+}
+
+template <KeyT Type>
+MatchKindCMP compare_to(const full_key_t<Type>& key, const ns_oid_view_t& target) {
+ auto ret = compare_to(key.nspace(), string_view_masked_t{target.nspace});
+ if (ret != MatchKindCMP::EQ)
+ return ret;
+ return compare_to(key.oid(), string_view_masked_t{target.oid});
+}
+
+template <KeyT Type>
+MatchKindCMP compare_to(const full_key_t<Type>& key, const snap_gen_t& target) {
+ auto ret = toMatchKindCMP(key.snap(), target.snap);
+ if (ret != MatchKindCMP::EQ)
+ return ret;
+ return toMatchKindCMP(key.gen(), target.gen);
+}
+
+template <KeyT KT>
+shard_pool_t shard_pool_t::from_key(const full_key_t<KT>& key) {
+ if constexpr (KT == KeyT::VIEW) {
+ return key.shard_pool_packed();
+ } else {
+ return {key.shard(), key.pool()};
+ }
+}
+
+template <KeyT KT>
+crush_t crush_t::from_key(const full_key_t<KT>& key) {
+ if constexpr (KT == KeyT::VIEW) {
+ return key.crush_packed();
+ } else {
+ return {key.crush()};
+ }
+}
+
+template <KeyT KT>
+shard_pool_crush_t shard_pool_crush_t::from_key(const full_key_t<KT>& key) {
+ return {shard_pool_t::from_key<KT>(key), crush_t::from_key<KT>(key)};
+}
+
+template <KeyT KT>
+snap_gen_t snap_gen_t::from_key(const full_key_t<KT>& key) {
+ if constexpr (KT == KeyT::VIEW) {
+ return key.snap_gen_packed();
+ } else {
+ return {key.snap(), key.gen()};
+ }
+}
+
+template <KeyT KT>
+node_offset_t ns_oid_view_t::estimate_size(const full_key_t<KT>& key) {
+ if constexpr (KT == KeyT::VIEW) {
+ return key.ns_oid_view().size();
+ } else {
+ if (key.dedup_type() != Type::STR) {
+ // size after deduplication
+ return sizeof(string_size_t);
+ } else {
+ return 2 * sizeof(string_size_t) + key.nspace().size() + key.oid().size();
+ }
+ }
+}
+
+template <KeyT KT>
+void ns_oid_view_t::append(
+ NodeExtentMutable& mut, const full_key_t<KT>& key, char*& p_append) {
+ if (key.dedup_type() == Type::STR) {
+ string_key_view_t::append_str(mut, key.nspace(), p_append);
+ string_key_view_t::append_str(mut, key.oid(), p_append);
+ } else {
+ string_key_view_t::append_dedup(mut, key.dedup_type(), p_append);
+ }
+}
+
+template <KeyT KT>
+void ns_oid_view_t::test_append(const full_key_t<KT>& key, char*& p_append) {
+ if (key.dedup_type() == Type::STR) {
+ string_key_view_t::test_append_str(key.nspace(), p_append);
+ string_key_view_t::test_append_str(key.oid(), p_append);
+ } else {
+ string_key_view_t::test_append_dedup(key.dedup_type(), p_append);
+ }
+}
+
+}
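
Tying the pieces of this header together: a full key decomposes into the fixed-size slices shard_pool_t, crush_t and snap_gen_t plus the variable-size ns_oid_view_t, and each slice can be compared back against a full key through the compare_to() overloads above. A minimal usage sketch, not part of the patch, assuming key_layout.h and its Crimson dependencies are on the include path; key_slices_example and the surrounding setup are illustrative.

// Depends on key_layout.h (which pulls in <cassert> and common/hobject.h).
using namespace crimson::os::seastore::onode;

void key_slices_example(const ghobject_t& ghobj) {
  key_hobj_t key{ghobj};

  // Fixed-size slices extracted from the full key.
  auto sp = shard_pool_t::from_key<KeyT::HOBJ>(key);   // shard + pool
  auto c  = crush_t::from_key<KeyT::HOBJ>(key);        // crush hash
  auto sg = snap_gen_t::from_key<KeyT::HOBJ>(key);     // snap + generation

  // Each slice compares EQ against the key it was taken from.
  assert(compare_to<KeyT::HOBJ>(key, sp) == MatchKindCMP::EQ);
  assert(compare_to<KeyT::HOBJ>(key, c)  == MatchKindCMP::EQ);
  assert(compare_to<KeyT::HOBJ>(key, sg) == MatchKindCMP::EQ);

  // The variable-size ns/oid part is sized before it is laid out in a node.
  [[maybe_unused]] node_offset_t ns_oid_size =
      ns_oid_view_t::estimate_size<KeyT::HOBJ>(key);
}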
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc
new file mode 100644
index 000000000..4a5988185
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc
@@ -0,0 +1,318 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "node_stage.h"
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h"
+#include "node_stage_layout.h"
+
+namespace crimson::os::seastore::onode {
+
+#define NODE_T node_extent_t<FieldType, NODE_TYPE>
+#define NODE_INST(FT, NT) node_extent_t<FT, NT>
+
+template <typename FieldType, node_type_t NODE_TYPE>
+const char* NODE_T::p_left_bound() const {
+ if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) {
+ // N3 internal node doesn't have the right part
+ return nullptr;
+ } else {
+ auto ret = p_start() + fields().get_item_end_offset(keys());
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ if (is_level_tail()) {
+ ret -= sizeof(laddr_t);
+ }
+ }
+ return ret;
+ }
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+node_offset_t NODE_T::size_to_nxt_at(index_t index) const {
+ assert(index < keys());
+ if constexpr (FIELD_TYPE == field_type_t::N0 ||
+ FIELD_TYPE == field_type_t::N1) {
+ return FieldType::estimate_insert_one();
+ } else if constexpr (FIELD_TYPE == field_type_t::N2) {
+ auto p_end = p_start() + p_fields->get_item_end_offset(index);
+ return FieldType::estimate_insert_one() + ns_oid_view_t(p_end).size();
+ } else {
+ ceph_abort("N3 node is not nested");
+ }
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+memory_range_t NODE_T::get_nxt_container(index_t index) const {
+ if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) {
+ ceph_abort("N3 internal node doesn't have the right part");
+ } else {
+ node_offset_t item_start_offset = p_fields->get_item_start_offset(index);
+ node_offset_t item_end_offset = p_fields->get_item_end_offset(index);
+ assert(item_start_offset < item_end_offset);
+ auto item_p_start = p_start() + item_start_offset;
+ auto item_p_end = p_start() + item_end_offset;
+ if constexpr (FIELD_TYPE == field_type_t::N2) {
+ // range for sub_items_t<NODE_TYPE>
+ item_p_end = ns_oid_view_t(item_p_end).p_start();
+ assert(item_p_start < item_p_end);
+ } else {
+ // range for item_iterator_t<NODE_TYPE>
+ }
+ return {item_p_start, item_p_end};
+ }
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+void NODE_T::bootstrap_extent(
+ NodeExtentMutable& mut,
+ field_type_t field_type, node_type_t node_type,
+ bool is_level_tail, level_t level) {
+ node_header_t::bootstrap_extent(
+ mut, field_type, node_type, is_level_tail, level);
+ mut.copy_in_relative(
+ sizeof(node_header_t), typename FieldType::num_keys_t(0u));
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+void NODE_T::update_is_level_tail(
+ NodeExtentMutable& mut, const node_extent_t& extent, bool value) {
+ node_header_t::update_is_level_tail(mut, extent.p_fields->header, value);
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+memory_range_t NODE_T::insert_prefix_at(
+ NodeExtentMutable& mut, const node_extent_t& node, const full_key_t<KT>& key,
+ index_t index, node_offset_t size, const char* p_left_bound) {
+ if constexpr (FIELD_TYPE == field_type_t::N0 ||
+ FIELD_TYPE == field_type_t::N1) {
+ assert(index <= node.keys());
+ assert(p_left_bound == node.p_left_bound());
+ assert(size > FieldType::estimate_insert_one());
+ auto size_right = size - FieldType::estimate_insert_one();
+ const char* p_insert = node.p_start() + node.fields().get_item_end_offset(index);
+ const char* p_insert_front = p_insert - size_right;
+ FieldType::template insert_at<KT>(mut, key, node.fields(), index, size_right);
+ mut.shift_absolute(p_left_bound,
+ p_insert - p_left_bound,
+ -(int)size_right);
+ return {p_insert_front, p_insert};
+ } else if constexpr (FIELD_TYPE == field_type_t::N2) {
+ ceph_abort("not implemented");
+ } else {
+ ceph_abort("impossible");
+ }
+}
+#define IPA_TEMPLATE(FT, NT, KT) \
+ template memory_range_t NODE_INST(FT, NT)::insert_prefix_at<KT>( \
+ NodeExtentMutable&, const node_extent_t&, const full_key_t<KT>&, \
+ index_t, node_offset_t, const char*)
+IPA_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::VIEW);
+IPA_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::VIEW);
+IPA_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::VIEW);
+IPA_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::VIEW);
+IPA_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::VIEW);
+IPA_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::VIEW);
+IPA_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::HOBJ);
+IPA_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::HOBJ);
+IPA_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::HOBJ);
+IPA_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::HOBJ);
+IPA_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::HOBJ);
+IPA_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::HOBJ);
+
+template <typename FieldType, node_type_t NODE_TYPE>
+void NODE_T::update_size_at(
+ NodeExtentMutable& mut, const node_extent_t& node, index_t index, int change) {
+ assert(index < node.keys());
+ FieldType::update_size_at(mut, node.fields(), index, change);
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+node_offset_t NODE_T::trim_until(
+ NodeExtentMutable& mut, const node_extent_t& node, index_t index) {
+ assert(!node.is_level_tail());
+ auto keys = node.keys();
+ assert(index <= keys);
+ if (index == keys) {
+ return 0;
+ }
+ if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) {
+ ceph_abort("not implemented");
+ } else {
+ mut.copy_in_absolute(
+ (void*)&node.p_fields->num_keys, num_keys_t(index));
+ }
+ // no need to calculate trim size for node
+ return 0;
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+node_offset_t NODE_T::trim_at(
+ NodeExtentMutable& mut, const node_extent_t& node,
+ index_t index, node_offset_t trimmed) {
+ assert(!node.is_level_tail());
+ assert(index < node.keys());
+ if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) {
+ ceph_abort("not implemented");
+ } else {
+ node_offset_t offset = node.p_fields->get_item_start_offset(index);
+ size_t new_offset = offset + trimmed;
+ assert(new_offset < node.p_fields->get_item_end_offset(index));
+ mut.copy_in_absolute(const_cast<void*>(node.p_fields->p_offset(index)),
+ node_offset_t(new_offset));
+ mut.copy_in_absolute(
+ (void*)&node.p_fields->num_keys, num_keys_t(index + 1));
+ }
+ // no need to calculate trim size for node
+ return 0;
+}
+
+#define NODE_TEMPLATE(FT, NT) template class NODE_INST(FT, NT)
+NODE_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL);
+NODE_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL);
+NODE_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL);
+NODE_TEMPLATE(internal_fields_3_t, node_type_t::INTERNAL);
+NODE_TEMPLATE(node_fields_0_t, node_type_t::LEAF);
+NODE_TEMPLATE(node_fields_1_t, node_type_t::LEAF);
+NODE_TEMPLATE(node_fields_2_t, node_type_t::LEAF);
+NODE_TEMPLATE(leaf_fields_3_t, node_type_t::LEAF);
+
+#define APPEND_T node_extent_t<FieldType, NODE_TYPE>::Appender<KT>
+
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+void APPEND_T::append(const node_extent_t& src, index_t from, index_t items) {
+ assert(from <= src.keys());
+ if (p_src == nullptr) {
+ p_src = &src;
+ } else {
+ assert(p_src == &src);
+ }
+ if (items == 0) {
+ return;
+ }
+ assert(from < src.keys());
+ assert(from + items <= src.keys());
+ num_keys += items;
+ if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) {
+ ceph_abort("impossible path");
+ } else {
+ // append left part forwards
+ node_offset_t offset_left_start = src.fields().get_key_start_offset(from);
+ node_offset_t offset_left_end = src.fields().get_key_start_offset(from + items);
+ node_offset_t left_size = offset_left_end - offset_left_start;
+ if (num_keys == 0) {
+ // no need to adjust offset
+ assert(from == 0);
+ assert(p_start + offset_left_start == p_append_left);
+ p_mut->copy_in_absolute(p_append_left,
+ src.p_start() + offset_left_start, left_size);
+ } else {
+ node_offset_t step_size = FieldType::estimate_insert_one();
+ node_offset_t offset_base = src.fields().get_item_end_offset(from);
+ int offset_change = p_append_right - p_start - offset_base;
+ auto p_offset_dst = p_append_left;
+ if constexpr (FIELD_TYPE != field_type_t::N2) {
+ // copy keys
+ p_mut->copy_in_absolute(p_append_left,
+ src.p_start() + offset_left_start, left_size);
+ // point to offset for update
+ p_offset_dst += sizeof(typename FieldType::key_t);
+ }
+ for (auto i = from; i < from + items; ++i) {
+ p_mut->copy_in_absolute(p_offset_dst,
+ node_offset_t(src.fields().get_item_start_offset(i) + offset_change));
+ p_offset_dst += step_size;
+ }
+ assert(p_append_left + left_size + sizeof(typename FieldType::key_t) ==
+ p_offset_dst);
+ }
+ p_append_left += left_size;
+
+ // append right part backwards
+ node_offset_t offset_right_start = src.fields().get_item_end_offset(from + items);
+ node_offset_t offset_right_end = src.fields().get_item_end_offset(from);
+ node_offset_t right_size = offset_right_end - offset_right_start;
+ p_append_right -= right_size;
+ p_mut->copy_in_absolute(p_append_right,
+ src.p_start() + offset_right_start, right_size);
+ }
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+void APPEND_T::append(
+ const full_key_t<KT>& key, const value_t& value, const value_t*& p_value) {
+ if constexpr (FIELD_TYPE == field_type_t::N3) {
+ ceph_abort("not implemented");
+ } else {
+ ceph_abort("should not happen");
+ }
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+std::tuple<NodeExtentMutable*, char*>
+APPEND_T::open_nxt(const key_get_type& partial_key) {
+ if constexpr (FIELD_TYPE == field_type_t::N0 ||
+ FIELD_TYPE == field_type_t::N1) {
+ FieldType::append_key(*p_mut, partial_key, p_append_left);
+ } else if constexpr (FIELD_TYPE == field_type_t::N2) {
+ FieldType::append_key(*p_mut, partial_key, p_append_right);
+ } else {
+ ceph_abort("impossible path");
+ }
+ return {p_mut, p_append_right};
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+std::tuple<NodeExtentMutable*, char*>
+APPEND_T::open_nxt(const full_key_t<KT>& key) {
+ if constexpr (FIELD_TYPE == field_type_t::N0 ||
+ FIELD_TYPE == field_type_t::N1) {
+ FieldType::template append_key<KT>(*p_mut, key, p_append_left);
+ } else if constexpr (FIELD_TYPE == field_type_t::N2) {
+ FieldType::template append_key<KT>(*p_mut, key, p_append_right);
+ } else {
+ ceph_abort("impossible path");
+ }
+ return {p_mut, p_append_right};
+}
+
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+char* APPEND_T::wrap() {
+ assert(p_append_left <= p_append_right);
+ assert(p_src);
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ if (p_src->is_level_tail()) {
+ laddr_t tail_value = p_src->get_end_p_laddr()->value;
+ p_append_right -= sizeof(laddr_t);
+ assert(p_append_left <= p_append_right);
+ p_mut->copy_in_absolute(p_append_right, tail_value);
+ }
+ }
+ p_mut->copy_in_absolute(p_start + offsetof(FieldType, num_keys), num_keys);
+ return p_append_left;
+}
+
+#define APPEND_TEMPLATE(FT, NT, KT) template class node_extent_t<FT, NT>::Appender<KT>
+APPEND_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::VIEW);
+APPEND_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::VIEW);
+APPEND_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::VIEW);
+APPEND_TEMPLATE(internal_fields_3_t, node_type_t::INTERNAL, KeyT::VIEW);
+APPEND_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::VIEW);
+APPEND_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::VIEW);
+APPEND_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::VIEW);
+APPEND_TEMPLATE(leaf_fields_3_t, node_type_t::LEAF, KeyT::VIEW);
+APPEND_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::HOBJ);
+APPEND_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::HOBJ);
+APPEND_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::HOBJ);
+APPEND_TEMPLATE(internal_fields_3_t, node_type_t::INTERNAL, KeyT::HOBJ);
+APPEND_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::HOBJ);
+APPEND_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::HOBJ);
+APPEND_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::HOBJ);
+APPEND_TEMPLATE(leaf_fields_3_t, node_type_t::LEAF, KeyT::HOBJ);
+
+}
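
The Appender above fills the destination node from both ends: keys and per-key offsets grow forward from just after the header (p_append_left), item bodies grow backward from the end of the extent (p_append_right), and offsets copied from the source are rebased by offset_change so they stay relative to the new extent. A toy model of the two cursors over a flat buffer, not part of the patch; TwoEndedAppender and its members are illustrative names.

#include <cassert>
#include <cstdint>
#include <cstring>

using node_offset_t = uint16_t;

// Toy model of the two append cursors used by node_extent_t::Appender.
struct TwoEndedAppender {
  char* base;          // start of the destination extent
  char* append_left;   // grows upward: keys and per-slot item offsets
  char* append_right;  // grows downward: item bodies (ns/oid strings, values)

  TwoEndedAppender(char* buf, size_t header_size, size_t extent_size)
    : base{buf},
      append_left{buf + header_size},
      append_right{buf + extent_size} {}

  void append_slot(const void* key, size_t key_size,
                   const void* item, size_t item_size) {
    append_right -= item_size;                    // item body goes to the right end
    std::memcpy(append_right, item, item_size);
    std::memcpy(append_left, key, key_size);      // key goes to the left end
    append_left += key_size;
    node_offset_t item_offset =                   // body offset, relative to `base`
        static_cast<node_offset_t>(append_right - base);
    std::memcpy(append_left, &item_offset, sizeof(item_offset));
    append_left += sizeof(item_offset);
    assert(append_left <= append_right);          // the two cursors must never cross
  }
};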
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h
new file mode 100644
index 000000000..cf0ca463c
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h
@@ -0,0 +1,226 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h"
+#include "key_layout.h"
+#include "stage_types.h"
+
+namespace crimson::os::seastore::onode {
+
+class NodeExtentMutable;
+
+/**
+ * node_extent_t
+ *
+ * The top indexing stage implementation for nodes N0/N1/N2/N3. It implements
+ * the staged contract as an indexable container and provides access to the
+ * node header.
+ *
+ * The concrete field layout is defined by FieldType, which is one of
+ * node_fields_0_t, node_fields_1_t, node_fields_2_t, internal_fields_3_t and
+ * leaf_fields_3_t. See node_stage_layout.h for layout diagrams.
+ */
+template <typename FieldType, node_type_t _NODE_TYPE>
+class node_extent_t {
+ public:
+ using value_t = value_type_t<_NODE_TYPE>;
+ using num_keys_t = typename FieldType::num_keys_t;
+ static constexpr node_type_t NODE_TYPE = _NODE_TYPE;
+ static constexpr field_type_t FIELD_TYPE = FieldType::FIELD_TYPE;
+ static constexpr node_offset_t EXTENT_SIZE =
+ (FieldType::SIZE + DISK_BLOCK_SIZE - 1u) / DISK_BLOCK_SIZE * DISK_BLOCK_SIZE;
+
+ // TODO: remove
+ node_extent_t() = default;
+
+ node_extent_t(const FieldType* p_fields) : p_fields{p_fields} {
+ validate(*p_fields);
+ }
+
+ const char* p_start() const { return fields_start(*p_fields); }
+
+ const char* off_to_ptr(node_offset_t off) const {
+ assert(off <= FieldType::SIZE);
+ return p_start() + off;
+ }
+
+ node_offset_t ptr_to_off(const void* ptr) const {
+ auto _ptr = static_cast<const char*>(ptr);
+ assert(_ptr >= p_start());
+ auto off = _ptr - p_start();
+ assert(off <= FieldType::SIZE);
+ return off;
+ }
+
+ bool is_level_tail() const { return p_fields->is_level_tail(); }
+ level_t level() const { return p_fields->header.level; }
+ node_offset_t free_size() const {
+ return p_fields->template free_size_before<NODE_TYPE>(keys());
+ }
+ node_offset_t total_size() const { return p_fields->total_size(); }
+ const char* p_left_bound() const;
+ template <node_type_t T = NODE_TYPE>
+ std::enable_if_t<T == node_type_t::INTERNAL, const laddr_packed_t*>
+ get_end_p_laddr() const {
+ assert(is_level_tail());
+ if constexpr (FIELD_TYPE == field_type_t::N3) {
+ return &p_fields->child_addrs[keys()];
+ } else {
+ auto offset_start = p_fields->get_item_end_offset(keys());
+ assert(offset_start <= FieldType::SIZE);
+ offset_start -= sizeof(laddr_packed_t);
+ auto p_addr = p_start() + offset_start;
+ return reinterpret_cast<const laddr_packed_t*>(p_addr);
+ }
+ }
+
+ // container type system
+ using key_get_type = typename FieldType::key_get_type;
+ static constexpr auto CONTAINER_TYPE = ContainerType::INDEXABLE;
+ index_t keys() const { return p_fields->num_keys; }
+ key_get_type operator[] (index_t index) const { return p_fields->get_key(index); }
+ node_offset_t size_before(index_t index) const {
+ auto free_size = p_fields->template free_size_before<NODE_TYPE>(index);
+ assert(total_size() >= free_size);
+ return total_size() - free_size;
+ }
+ node_offset_t size_to_nxt_at(index_t index) const;
+ node_offset_t size_overhead_at(index_t index) const {
+ return FieldType::ITEM_OVERHEAD; }
+ memory_range_t get_nxt_container(index_t index) const;
+
+ template <typename T = FieldType>
+ std::enable_if_t<T::FIELD_TYPE == field_type_t::N3, const value_t*>
+ get_p_value(index_t index) const {
+ assert(index < keys());
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ return &p_fields->child_addrs[index];
+ } else {
+ auto range = get_nxt_container(index);
+ auto ret = reinterpret_cast<const onode_t*>(range.p_start);
+ assert(range.p_start + ret->size == range.p_end);
+ return ret;
+ }
+ }
+
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const {
+ assert(p_node_start == p_start());
+ // nothing to encode as the container range is the entire extent
+ }
+
+ static node_extent_t decode(const char* p_node_start,
+ ceph::bufferlist::const_iterator& delta) {
+ // nothing to decode
+ return node_extent_t(reinterpret_cast<const FieldType*>(p_node_start));
+ }
+
+ static void validate(const FieldType& fields) {
+#ifndef NDEBUG
+ assert(fields.header.get_node_type() == NODE_TYPE);
+ assert(fields.header.get_field_type() == FieldType::FIELD_TYPE);
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ assert(fields.header.level > 0u);
+ } else {
+ assert(fields.header.level == 0u);
+ }
+#endif
+ }
+
+ static void bootstrap_extent(
+ NodeExtentMutable&, field_type_t, node_type_t, bool, level_t);
+
+ static void update_is_level_tail(NodeExtentMutable&, const node_extent_t&, bool);
+
+ static node_offset_t header_size() { return FieldType::HEADER_SIZE; }
+
+ template <KeyT KT>
+ static node_offset_t estimate_insert(
+ const full_key_t<KT>& key, const value_t& value) {
+ auto size = FieldType::estimate_insert_one();
+ if constexpr (FIELD_TYPE == field_type_t::N2) {
+ size += ns_oid_view_t::estimate_size<KT>(key);
+ } else if constexpr (FIELD_TYPE == field_type_t::N3 &&
+ NODE_TYPE == node_type_t::LEAF) {
+ size += value.size;
+ }
+ return size;
+ }
+
+ template <KeyT KT>
+ static const value_t* insert_at(
+ NodeExtentMutable& mut, const node_extent_t&,
+ const full_key_t<KT>& key, const value_t& value,
+ index_t index, node_offset_t size, const char* p_left_bound) {
+ if constexpr (FIELD_TYPE == field_type_t::N3) {
+ ceph_abort("not implemented");
+ } else {
+ ceph_abort("impossible");
+ }
+ }
+
+ template <KeyT KT>
+ static memory_range_t insert_prefix_at(
+ NodeExtentMutable&, const node_extent_t&,
+ const full_key_t<KT>& key,
+ index_t index, node_offset_t size, const char* p_left_bound);
+
+ static void update_size_at(
+ NodeExtentMutable&, const node_extent_t&, index_t index, int change);
+
+ static node_offset_t trim_until(
+ NodeExtentMutable&, const node_extent_t&, index_t index);
+ static node_offset_t trim_at(NodeExtentMutable&, const node_extent_t&,
+ index_t index, node_offset_t trimmed);
+
+ template <KeyT KT>
+ class Appender;
+
+ private:
+ const FieldType& fields() const { return *p_fields; }
+ const FieldType* p_fields;
+};
+
+template <typename FieldType, node_type_t NODE_TYPE>
+template <KeyT KT>
+class node_extent_t<FieldType, NODE_TYPE>::Appender {
+ public:
+ Appender(NodeExtentMutable* p_mut, char* p_append)
+ : p_mut{p_mut}, p_start{p_append} {
+#ifndef NDEBUG
+ auto p_fields = reinterpret_cast<const FieldType*>(p_append);
+ assert(*(p_fields->header.get_field_type()) == FIELD_TYPE);
+ assert(p_fields->header.get_node_type() == NODE_TYPE);
+ assert(p_fields->num_keys == 0);
+#endif
+ p_append_left = p_start + FieldType::HEADER_SIZE;
+ p_append_right = p_start + FieldType::SIZE;
+ }
+ void append(const node_extent_t& src, index_t from, index_t items);
+ void append(const full_key_t<KT>&, const value_t&, const value_t*&);
+ char* wrap();
+ std::tuple<NodeExtentMutable*, char*> open_nxt(const key_get_type&);
+ std::tuple<NodeExtentMutable*, char*> open_nxt(const full_key_t<KT>&);
+ void wrap_nxt(char* p_append) {
+ if constexpr (FIELD_TYPE != field_type_t::N3) {
+ assert(p_append < p_append_right);
+ assert(p_append_left < p_append);
+ p_append_right = p_append;
+ FieldType::append_offset(*p_mut, p_append - p_start, p_append_left);
+ ++num_keys;
+ } else {
+ ceph_abort("not implemented");
+ }
+ }
+
+ private:
+ const node_extent_t* p_src = nullptr;
+ NodeExtentMutable* p_mut;
+ char* p_start;
+ char* p_append_left;
+ char* p_append_right;
+ num_keys_t num_keys = 0;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc
new file mode 100644
index 000000000..81bfac72a
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc
@@ -0,0 +1,96 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "node_stage_layout.h"
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h"
+
+namespace crimson::os::seastore::onode {
+
+void node_header_t::bootstrap_extent(
+ NodeExtentMutable& mut,
+ field_type_t field_type, node_type_t node_type,
+ bool is_level_tail, level_t level) {
+ node_header_t header;
+ header.set_field_type(field_type);
+ header.set_node_type(node_type);
+ header.set_is_level_tail(is_level_tail);
+ header.level = level;
+ mut.copy_in_relative(0, header);
+}
+
+void node_header_t::update_is_level_tail(
+ NodeExtentMutable& mut, const node_header_t& header, bool value) {
+ auto& _header = const_cast<node_header_t&>(header);
+ _header.set_is_level_tail(value);
+ mut.validate_inplace_update(_header);
+}
+
+#define F013_T _node_fields_013_t<SlotType>
+#define F013_INST(ST) _node_fields_013_t<ST>
+
+template <typename SlotType>
+void F013_T::update_size_at(
+ NodeExtentMutable& mut, const me_t& node, index_t index, int change) {
+ assert(index <= node.num_keys);
+ for (const auto* p_slot = &node.slots[index];
+ p_slot < &node.slots[node.num_keys];
+ ++p_slot) {
+ node_offset_t offset = p_slot->right_offset;
+ mut.copy_in_absolute(
+ (void*)&(p_slot->right_offset),
+ node_offset_t(offset - change));
+ }
+}
+
+template <typename SlotType>
+void F013_T::append_key(
+ NodeExtentMutable& mut, const key_t& key, char*& p_append) {
+ mut.copy_in_absolute(p_append, key);
+ p_append += sizeof(key_t);
+}
+
+template <typename SlotType>
+void F013_T::append_offset(
+ NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append) {
+ mut.copy_in_absolute(p_append, offset_to_right);
+ p_append += sizeof(node_offset_t);
+}
+
+template <typename SlotType>
+template <KeyT KT>
+void F013_T::insert_at(
+ NodeExtentMutable& mut, const full_key_t<KT>& key,
+ const me_t& node, index_t index, node_offset_t size_right) {
+ assert(index <= node.num_keys);
+ update_size_at(mut, node, index, size_right);
+ auto p_insert = const_cast<char*>(fields_start(node)) +
+ node.get_key_start_offset(index);
+ auto p_shift_end = fields_start(node) + node.get_key_start_offset(node.num_keys);
+ mut.shift_absolute(p_insert, p_shift_end - p_insert, estimate_insert_one());
+ mut.copy_in_absolute((void*)&node.num_keys, num_keys_t(node.num_keys + 1));
+ append_key(mut, key_t::template from_key<KT>(key), p_insert);
+ append_offset(mut, node.get_item_end_offset(index) - size_right, p_insert);
+}
+#define IA_TEMPLATE(ST, KT) template void F013_INST(ST):: \
+ insert_at<KT>(NodeExtentMutable&, const full_key_t<KT>&, \
+ const F013_INST(ST)&, index_t, node_offset_t)
+IA_TEMPLATE(slot_0_t, KeyT::VIEW);
+IA_TEMPLATE(slot_1_t, KeyT::VIEW);
+IA_TEMPLATE(slot_3_t, KeyT::VIEW);
+IA_TEMPLATE(slot_0_t, KeyT::HOBJ);
+IA_TEMPLATE(slot_1_t, KeyT::HOBJ);
+IA_TEMPLATE(slot_3_t, KeyT::HOBJ);
+
+#define F013_TEMPLATE(ST) template struct F013_INST(ST)
+F013_TEMPLATE(slot_0_t);
+F013_TEMPLATE(slot_1_t);
+F013_TEMPLATE(slot_3_t);
+
+void node_fields_2_t::append_offset(
+ NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append) {
+ mut.copy_in_absolute(p_append, offset_to_right);
+ p_append += sizeof(node_offset_t);
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h
new file mode 100644
index 000000000..14ba95bf4
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h
@@ -0,0 +1,366 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "key_layout.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h"
+
+namespace crimson::os::seastore::onode {
+
+class NodeExtentMutable;
+
+struct node_header_t {
+ static constexpr unsigned FIELD_TYPE_BITS = 6u;
+ static_assert(static_cast<uint8_t>(field_type_t::_MAX) <= 1u << FIELD_TYPE_BITS);
+ static constexpr unsigned NODE_TYPE_BITS = 1u;
+ static constexpr unsigned B_LEVEL_TAIL_BITS = 1u;
+ using bits_t = uint8_t;
+
+ node_header_t() {}
+ std::optional<field_type_t> get_field_type() const {
+ if (field_type >= FIELD_TYPE_MAGIC &&
+ field_type < static_cast<uint8_t>(field_type_t::_MAX)) {
+ return static_cast<field_type_t>(field_type);
+ } else {
+ return std::nullopt;
+ }
+ }
+ node_type_t get_node_type() const {
+ return static_cast<node_type_t>(node_type);
+ }
+ bool get_is_level_tail() const {
+ return is_level_tail;
+ }
+
+ static void bootstrap_extent(
+ NodeExtentMutable&, field_type_t, node_type_t, bool, level_t);
+
+ static void update_is_level_tail(NodeExtentMutable&, const node_header_t&, bool);
+
+ bits_t field_type : FIELD_TYPE_BITS;
+ bits_t node_type : NODE_TYPE_BITS;
+ bits_t is_level_tail : B_LEVEL_TAIL_BITS;
+ static_assert(sizeof(bits_t) * 8 ==
+ FIELD_TYPE_BITS + NODE_TYPE_BITS + B_LEVEL_TAIL_BITS);
+ level_t level;
+
+ private:
+ void set_field_type(field_type_t type) {
+ field_type = static_cast<uint8_t>(type);
+ }
+ void set_node_type(node_type_t type) {
+ node_type = static_cast<uint8_t>(type);
+ }
+ void set_is_level_tail(bool value) {
+ is_level_tail = static_cast<uint8_t>(value);
+ }
+} __attribute__((packed));
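+/*
+ * A minimal sketch of the packed header above (illustrative only: the exact
+ * bit placement follows the compiler's bitfield layout, and the width of
+ * level_t is whatever node_types.h defines; one byte is assumed here):
+ *
+ *   byte 0: [ field_type : 6 | node_type : 1 | is_level_tail : 1 ]
+ *   byte 1: level
+ *
+ * e.g. an internal node at level 2 that is not a level tail stores its
+ * field_type and node_type in byte 0 and level = 2 in the following byte.
+ */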
+
+template <typename FixedKeyType, field_type_t _FIELD_TYPE>
+struct _slot_t {
+ using key_t = FixedKeyType;
+ static constexpr field_type_t FIELD_TYPE = _FIELD_TYPE;
+ static constexpr node_offset_t OVERHEAD = sizeof(_slot_t) - sizeof(key_t);
+
+ key_t key;
+ node_offset_t right_offset;
+} __attribute__((packed));
+using slot_0_t = _slot_t<shard_pool_crush_t, field_type_t::N0>;
+using slot_1_t = _slot_t<crush_t, field_type_t::N1>;
+using slot_3_t = _slot_t<snap_gen_t, field_type_t::N3>;
+
+struct node_range_t {
+ node_offset_t start;
+ node_offset_t end;
+};
+
+template <typename FieldType>
+const char* fields_start(const FieldType& node) {
+ return reinterpret_cast<const char*>(&node);
+}
+
+template <node_type_t NODE_TYPE, typename FieldType>
+node_range_t fields_free_range_before(
+ const FieldType& node, index_t index) {
+ assert(index <= node.num_keys);
+ node_offset_t offset_start = node.get_key_start_offset(index);
+ node_offset_t offset_end =
+ (index == 0 ? FieldType::SIZE
+ : node.get_item_start_offset(index - 1));
+ if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
+ if (node.is_level_tail() && index == node.num_keys) {
+ offset_end -= sizeof(laddr_t);
+ }
+ }
+ assert(offset_start <= offset_end);
+ assert(offset_end - offset_start < FieldType::SIZE);
+ return {offset_start, offset_end};
+}
+
+/**
+ * _node_fields_013_t (node_fields_0_t, node_fields_1_t, leaf_fields_3_t)
+ *
+ * The STAGE_LEFT layout implementation for node N0/N1, or the STAGE_RIGHT
+ * layout implementation for leaf node N3.
+ *
+ * The node layout storing n slots:
+ *
+ * # <----------------------------- node range --------------------------------------> #
+ * # #<~># free space #
+ * # <----- left part -----------------------------> # <~# <----- right slots -------> #
+ * # # <---- left slots -------------> #~> # #
+ * # # slots [2, n) |<~># #<~>| right slots [2, n) #
+ * # # <- slot 0 -> | <- slot 1 -> | # # | <-- s1 --> | <-- s0 --> #
+ * # # | | # # | | #
+ * # | num_ # | right | | right | # # | next-stage | next-stage #
+ * # header | keys # key | offset | key | offset | # # | container | container #
+ * # | # 0 | 0 | 1 | 1 |...#...#...| or onode 1 | or onode 0 #
+ * | | ^ ^
+ * | | | |
+ * | +----------------+ |
+ * +--------------------------------------------+
+ */
+template <typename SlotType>
+struct _node_fields_013_t {
+ // TODO: decide by NODE_BLOCK_SIZE, sizeof(SlotType), sizeof(laddr_t)
+ // and the minimal size of variable_key.
+ using num_keys_t = uint8_t;
+ using key_t = typename SlotType::key_t;
+ using key_get_type = const key_t&;
+ using me_t = _node_fields_013_t<SlotType>;
+ static constexpr field_type_t FIELD_TYPE = SlotType::FIELD_TYPE;
+ static constexpr node_offset_t SIZE = NODE_BLOCK_SIZE;
+ static constexpr node_offset_t HEADER_SIZE =
+ sizeof(node_header_t) + sizeof(num_keys_t);
+ static constexpr node_offset_t ITEM_OVERHEAD = SlotType::OVERHEAD;
+
+ bool is_level_tail() const { return header.get_is_level_tail(); }
+ node_offset_t total_size() const { return SIZE; }
+ key_get_type get_key(index_t index) const {
+ assert(index < num_keys);
+ return slots[index].key;
+ }
+ node_offset_t get_key_start_offset(index_t index) const {
+ assert(index <= num_keys);
+ auto offset = HEADER_SIZE + sizeof(SlotType) * index;
+ assert(offset < SIZE);
+ return offset;
+ }
+ node_offset_t get_item_start_offset(index_t index) const {
+ assert(index < num_keys);
+ auto offset = slots[index].right_offset;
+ assert(offset <= SIZE);
+ return offset;
+ }
+ const void* p_offset(index_t index) const {
+ assert(index < num_keys);
+ return &slots[index].right_offset;
+ }
+ node_offset_t get_item_end_offset(index_t index) const {
+ return index == 0 ? SIZE : get_item_start_offset(index - 1);
+ }
+ template <node_type_t NODE_TYPE>
+ node_offset_t free_size_before(index_t index) const {
+ auto range = fields_free_range_before<NODE_TYPE>(*this, index);
+ return range.end - range.start;
+ }
+
+ static node_offset_t estimate_insert_one() { return sizeof(SlotType); }
+ template <KeyT KT>
+ static void insert_at(
+ NodeExtentMutable&, const full_key_t<KT>& key,
+ const me_t& node, index_t index, node_offset_t size_right);
+ static void update_size_at(
+ NodeExtentMutable&, const me_t& node, index_t index, int change);
+ static void append_key(
+ NodeExtentMutable&, const key_t& key, char*& p_append);
+ template <KeyT KT>
+ static void append_key(
+ NodeExtentMutable& mut, const full_key_t<KT>& key, char*& p_append) {
+ append_key(mut, key_t::template from_key<KT>(key), p_append);
+ }
+ static void append_offset(
+ NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append);
+
+ node_header_t header;
+ num_keys_t num_keys = 0u;
+ SlotType slots[];
+} __attribute__((packed));
+using node_fields_0_t = _node_fields_013_t<slot_0_t>;
+using node_fields_1_t = _node_fields_013_t<slot_1_t>;
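+/*
+ * Worked example of the offset arithmetic above (a standalone model; the
+ * block size, header width and slot width below are illustrative assumptions,
+ * not the real NODE_BLOCK_SIZE/HEADER_SIZE/sizeof(SlotType), and the internal
+ * level-tail laddr adjustment is omitted):
+ *
+ *   #include <cassert>
+ *   #include <cstdint>
+ *
+ *   constexpr uint16_t SIZE   = 4096;  // assumed block size
+ *   constexpr uint16_t HEADER = 4;     // assumed header + num_keys
+ *   constexpr uint16_t SLOT   = 12;    // assumed fixed key + right_offset
+ *
+ *   // free space between the slot array (growing right) and the item space
+ *   // (growing left), i.e. free_size_before(index) with index == num_keys
+ *   uint16_t free_space(uint16_t num_keys, uint16_t last_right_offset) {
+ *     uint16_t left_end  = HEADER + SLOT * num_keys;    // get_key_start_offset
+ *     uint16_t right_beg = (num_keys == 0)
+ *                        ? SIZE                         // get_item_end_offset
+ *                        : last_right_offset;           // slots[n-1].right_offset
+ *     assert(left_end <= right_beg);
+ *     return right_beg - left_end;
+ *   }
+ */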
+
+/**
+ * node_fields_2_t
+ *
+ * The STAGE_STRING layout implementation for node N2.
+ *
+ * The node layout storing n slots:
+ *
+ * # <--------------------------------- node range ----------------------------------------> #
+ * # #<~># free space #
+ * # <------- left part ---------------> # <~# <--------- right slots ---------------------> #
+ * # # <---- offsets ----> #~> #<~>| slots [2, n) #
+ * # # offsets [2, n) |<~># # | <----- slot 1 ----> | <----- slot 0 ----> #
+ * # # | # # | | #
+ * # | num_ # offset | offset | # # | next-stage | ns-oid | next-stage | ns-oid #
+ * # header | keys # 0 | 1 |...#...#...| container1 | 1 | container0 | 0 #
+ * | | ^ ^
+ * | | | |
+ * | +----------------+ |
+ * +-----------------------------------------------+
+ */
+struct node_fields_2_t {
+ // TODO: decide by NODE_BLOCK_SIZE, sizeof(node_offset_t), sizeof(laddr_t)
+ // and the minimal size of variable_key.
+ using num_keys_t = uint8_t;
+ using key_t = ns_oid_view_t;
+ using key_get_type = key_t;
+ static constexpr field_type_t FIELD_TYPE = field_type_t::N2;
+ static constexpr node_offset_t SIZE = NODE_BLOCK_SIZE;
+ static constexpr node_offset_t HEADER_SIZE =
+ sizeof(node_header_t) + sizeof(num_keys_t);
+ static constexpr node_offset_t ITEM_OVERHEAD = sizeof(node_offset_t);
+
+ bool is_level_tail() const { return header.get_is_level_tail(); }
+ node_offset_t total_size() const { return SIZE; }
+ key_get_type get_key(index_t index) const {
+ assert(index < num_keys);
+ node_offset_t item_end_offset =
+ (index == 0 ? SIZE : offsets[index - 1]);
+ assert(item_end_offset <= SIZE);
+ const char* p_start = fields_start(*this);
+ return key_t(p_start + item_end_offset);
+ }
+ node_offset_t get_key_start_offset(index_t index) const {
+ assert(index <= num_keys);
+ auto offset = HEADER_SIZE + sizeof(node_offset_t) * num_keys;
+ assert(offset <= SIZE);
+ return offset;
+ }
+ node_offset_t get_item_start_offset(index_t index) const {
+ assert(index < num_keys);
+ auto offset = offsets[index];
+ assert(offset <= SIZE);
+ return offset;
+ }
+ const void* p_offset(index_t index) const {
+ assert(index < num_keys);
+ return &offsets[index];
+ }
+ node_offset_t get_item_end_offset(index_t index) const {
+ return index == 0 ? SIZE : get_item_start_offset(index - 1);
+ }
+ template <node_type_t NODE_TYPE>
+ node_offset_t free_size_before(index_t index) const {
+ auto range = fields_free_range_before<NODE_TYPE>(*this, index);
+ return range.end - range.start;
+ }
+
+ static node_offset_t estimate_insert_one() { return sizeof(node_offset_t); }
+ template <KeyT KT>
+ static void insert_at(
+ NodeExtentMutable& mut, const full_key_t<KT>& key,
+ const node_fields_2_t& node, index_t index, node_offset_t size_right) {
+ ceph_abort("not implemented");
+ }
+ static void update_size_at(
+ NodeExtentMutable& mut, const node_fields_2_t& node, index_t index, int change) {
+ ceph_abort("not implemented");
+ }
+ static void append_key(
+ NodeExtentMutable& mut, const key_t& key, char*& p_append) {
+ ns_oid_view_t::append(mut, key, p_append);
+ }
+ template <KeyT KT>
+ static void append_key(
+ NodeExtentMutable& mut, const full_key_t<KT>& key, char*& p_append) {
+ ns_oid_view_t::append<KT>(mut, key, p_append);
+ }
+ static void append_offset(
+ NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append);
+
+ node_header_t header;
+ num_keys_t num_keys = 0u;
+ node_offset_t offsets[];
+} __attribute__((packed));
+
+/**
+ * internal_fields_3_t
+ *
+ * The STAGE_RIGHT layout implementation for internal node N3.
+ *
+ * The node layout storing 3 children:
+ *
+ * # <---------------- node range ---------------------------> #
+ * # # <-- keys ---> # <---- laddrs -----------> #
+ * # free space: # |<~># |<~>#
+ * # # | # | #
+ * # | num_ # key | key | # laddr | laddr | laddr | #
+ * # header | keys # 0 | 1 |...# 0 | 1 | 2 |...#
+ */
+// TODO: decide by NODE_BLOCK_SIZE, sizeof(snap_gen_t), sizeof(laddr_t)
+static constexpr unsigned MAX_NUM_KEYS_I3 = 170u;
+template <unsigned MAX_NUM_KEYS>
+struct _internal_fields_3_t {
+ using key_get_type = const snap_gen_t&;
+ using me_t = _internal_fields_3_t<MAX_NUM_KEYS>;
+ // TODO: decide by NODE_BLOCK_SIZE, sizeof(snap_gen_t), sizeof(laddr_t)
+ using num_keys_t = uint8_t;
+ static constexpr field_type_t FIELD_TYPE = field_type_t::N3;
+ static constexpr node_offset_t SIZE = sizeof(me_t);
+ static constexpr node_offset_t HEADER_SIZE =
+ sizeof(node_header_t) + sizeof(num_keys_t);
+ static constexpr node_offset_t ITEM_OVERHEAD = 0u;
+
+ bool is_level_tail() const { return header.get_is_level_tail(); }
+ node_offset_t total_size() const {
+ if (is_level_tail()) {
+ return SIZE - sizeof(snap_gen_t);
+ } else {
+ return SIZE;
+ }
+ }
+ key_get_type get_key(index_t index) const {
+ assert(index < num_keys);
+ return keys[index];
+ }
+ template <node_type_t NODE_TYPE>
+ std::enable_if_t<NODE_TYPE == node_type_t::INTERNAL, node_offset_t>
+ free_size_before(index_t index) const {
+ assert(index <= num_keys);
+ assert(num_keys <= (is_level_tail() ? MAX_NUM_KEYS - 1 : MAX_NUM_KEYS));
+ auto free = (MAX_NUM_KEYS - index) * (sizeof(snap_gen_t) + sizeof(laddr_t));
+ if (is_level_tail() && index == num_keys) {
+ free -= (sizeof(snap_gen_t) + sizeof(laddr_t));
+ }
+ assert(free < SIZE);
+ return free;
+ }
+
+ static node_offset_t estimate_insert_one() {
+ return sizeof(snap_gen_t) + sizeof(laddr_t);
+ }
+ template <KeyT KT>
+ static void insert_at(
+ NodeExtentMutable& mut, const full_key_t<KT>& key,
+ const me_t& node, index_t index, node_offset_t size_right) {
+ ceph_abort("not implemented");
+ }
+ static void update_size_at(
+ NodeExtentMutable& mut, const me_t& node, index_t index, int change) {
+ ceph_abort("not implemented");
+ }
+
+ node_header_t header;
+ num_keys_t num_keys = 0u;
+ snap_gen_t keys[MAX_NUM_KEYS];
+ laddr_packed_t child_addrs[MAX_NUM_KEYS];
+} __attribute__((packed));
+static_assert(_internal_fields_3_t<MAX_NUM_KEYS_I3>::SIZE <= NODE_BLOCK_SIZE &&
+ _internal_fields_3_t<MAX_NUM_KEYS_I3 + 1>::SIZE > NODE_BLOCK_SIZE);
+using internal_fields_3_t = _internal_fields_3_t<MAX_NUM_KEYS_I3>;
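+/*
+ * Arithmetic behind the static_assert above, with illustrative sizes only
+ * (assuming a 4096-byte NODE_BLOCK_SIZE, a 16-byte snap_gen_t and an 8-byte
+ * laddr_packed_t; the real values come from their definitions):
+ *
+ *   per-entry size : sizeof(snap_gen_t) + sizeof(laddr_packed_t) = 16 + 8 = 24
+ *   170 entries    : 170 * 24 = 4080, plus the small packed header, <= 4096
+ *   171 entries    : 171 * 24 = 4104 > 4096 even before the header
+ */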
+
+using leaf_fields_3_t = _node_fields_013_t<slot_3_t>;
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h
new file mode 100644
index 000000000..cac167a98
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h
@@ -0,0 +1,2186 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cassert>
+#include <optional>
+#include <ostream>
+#include <sstream>
+#include <type_traits>
+
+#include "common/likely.h"
+
+#include "sub_items_stage.h"
+#include "item_iterator_stage.h"
+
+namespace crimson::os::seastore::onode {
+
+struct search_result_bs_t {
+ index_t index;
+ MatchKindBS match;
+};
+template <typename FGetKey>
+search_result_bs_t binary_search(
+ const full_key_t<KeyT::HOBJ>& key,
+ index_t begin, index_t end, FGetKey&& f_get_key) {
+ assert(begin <= end);
+ while (begin < end) {
+ auto total = begin + end;
+ auto mid = total >> 1;
+ // do not copy if return value is reference
+ decltype(f_get_key(mid)) target = f_get_key(mid);
+ auto match = compare_to<KeyT::HOBJ>(key, target);
+ if (match == MatchKindCMP::LT) {
+ end = mid;
+ } else if (match == MatchKindCMP::GT) {
+ begin = mid + 1;
+ } else {
+ return {mid, MatchKindBS::EQ};
+ }
+ }
+ return {begin, MatchKindBS::NE};
+}
+
+template <typename PivotType, typename FGet>
+search_result_bs_t binary_search_r(
+ index_t rend, index_t rbegin, FGet&& f_get, const PivotType& key) {
+ assert(rend <= rbegin);
+ while (rend < rbegin) {
+ auto total = rend + rbegin + 1;
+ auto mid = total >> 1;
+ // do not copy if return value is reference
+ decltype(f_get(mid)) target = f_get(mid);
+ int match = target - key;
+ if (match < 0) {
+ rend = mid;
+ } else if (match > 0) {
+ rbegin = mid - 1;
+ } else {
+ return {mid, MatchKindBS::EQ};
+ }
+ }
+ return {rbegin, MatchKindBS::NE};
+}
+
+inline bool matchable(field_type_t type, match_stat_t mstat) {
+ assert(mstat >= MSTAT_MIN && mstat <= MSTAT_MAX);
+ /*
+ * compressed prefix by field type:
+ * N0: NONE
+ * N1: pool/shard
+ * N2: pool/shard crush
+ * N3: pool/shard crush ns/oid
+ *
+ * if key matches the node's compressed prefix, return true
+ * else, return false
+ */
+#ifndef NDEBUG
+ if (mstat == MSTAT_END) {
+ assert(type == field_type_t::N0);
+ }
+#endif
+ return mstat + to_unsigned(type) < 4;
+}
+
+inline void assert_mstat(
+ const full_key_t<KeyT::HOBJ>& key,
+ const full_key_t<KeyT::VIEW>& index,
+ match_stat_t mstat) {
+ assert(mstat >= MSTAT_MIN && mstat <= MSTAT_LT2);
+ // key < index ...
+ switch (mstat) {
+ case MSTAT_EQ:
+ break;
+ case MSTAT_LT0:
+ assert(compare_to<KeyT::HOBJ>(key, index.snap_gen_packed()) == MatchKindCMP::LT);
+ break;
+ case MSTAT_LT1:
+ assert(compare_to<KeyT::HOBJ>(key, index.ns_oid_view()) == MatchKindCMP::LT);
+ break;
+ case MSTAT_LT2:
+ if (index.has_shard_pool()) {
+ assert(compare_to<KeyT::HOBJ>(key, shard_pool_crush_t{
+ index.shard_pool_packed(), index.crush_packed()}) == MatchKindCMP::LT);
+ } else {
+ assert(compare_to<KeyT::HOBJ>(key, index.crush_packed()) == MatchKindCMP::LT);
+ }
+ break;
+ default:
+ ceph_abort("impossible path");
+ }
+ // key == index ...
+ switch (mstat) {
+ case MSTAT_EQ:
+ assert(compare_to<KeyT::HOBJ>(key, index.snap_gen_packed()) == MatchKindCMP::EQ);
+ case MSTAT_LT0:
+ if (!index.has_ns_oid())
+ break;
+ assert(index.ns_oid_view().type() == ns_oid_view_t::Type::MAX ||
+ compare_to<KeyT::HOBJ>(key, index.ns_oid_view()) == MatchKindCMP::EQ);
+ case MSTAT_LT1:
+ if (!index.has_crush())
+ break;
+ assert(compare_to<KeyT::HOBJ>(key, index.crush_packed()) == MatchKindCMP::EQ);
+ if (!index.has_shard_pool())
+ break;
+ assert(compare_to<KeyT::HOBJ>(key, index.shard_pool_packed()) == MatchKindCMP::EQ);
+ default:
+ break;
+ }
+}
+
+#define NXT_STAGE_T staged<next_param_t>
+
+enum class TrimType { BEFORE, AFTER, AT };
+
+/**
+ * staged
+ *
+ * Implements recursive logic that modifies or reads the node layout
+ * (N0/N1/N2/N3 * LEAF/INTERNAL) with the multi-stage design. The specific
+ * stage implementation is flexible. So the implementations for different
+ * stages can be assembled independently, as long as they follow the
+ * definitions of container interfaces.
+ *
+ * Multi-stage is designed to index different portions of onode keys
+ * stage-by-stage. There are at most 3 stages for a node:
+ * - STAGE_LEFT: index shard-pool-crush for N0, or index crush for N1 node;
+ * - STAGE_STRING: index ns-oid for N0/N1/N2 nodes;
+ * - STAGE_RIGHT: index snap-gen for N0/N1/N2/N3 nodes;
+ *
+ * The intention is to consolidate the high-level indexing implementations at
+ * the level of stage, so we don't need to write them repeatedly for every
+ * stage and for every node type.
+ */
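+/*
+ * A simplified standalone model of the recursion implemented below (the
+ * toy_* names and interfaces here are illustrative assumptions, not the real
+ * container/iterator contracts):
+ *
+ *   #include <cstddef>
+ *
+ *   template <int STAGE> struct toy_staged {
+ *     // each stage searches only its own portion of the key, then recurses
+ *     // into the next-stage container embedded at the matched index
+ *     template <typename Container, typename Key>
+ *     static const int* lower_bound(const Container& c, const Key& k) {
+ *       std::size_t i = c.search_this_stage(k, STAGE);
+ *       if constexpr (STAGE == 0) {
+ *         return c.value_at(i);               // bottom stage holds the value
+ *       } else {
+ *         return toy_staged<STAGE - 1>::lower_bound(c.nxt_container_at(i), k);
+ *       }
+ *     }
+ *   };
+ *
+ * The real staged<Params> below follows the same shape: NXT_STAGE_T is the
+ * next-stage staged<>, and IS_BOTTOM terminates the recursion at the lowest
+ * stage, where the value lives.
+ */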
+template <typename Params>
+struct staged {
+ static_assert(Params::STAGE >= STAGE_BOTTOM);
+ static_assert(Params::STAGE <= STAGE_TOP);
+ using container_t = typename Params::container_t;
+ using key_get_type = typename container_t::key_get_type;
+ using next_param_t = typename Params::next_param_t;
+ using position_t = staged_position_t<Params::STAGE>;
+ using result_t = staged_result_t<Params::NODE_TYPE, Params::STAGE>;
+ using value_t = value_type_t<Params::NODE_TYPE>;
+ static constexpr auto CONTAINER_TYPE = container_t::CONTAINER_TYPE;
+ static constexpr bool IS_BOTTOM = (Params::STAGE == STAGE_BOTTOM);
+ static constexpr auto NODE_TYPE = Params::NODE_TYPE;
+ static constexpr auto STAGE = Params::STAGE;
+
+ template <bool is_exclusive>
+ static void _left_or_right(index_t& split_index, index_t insert_index,
+ std::optional<bool>& is_insert_left) {
+ assert(!is_insert_left.has_value());
+ assert(is_valid_index(split_index));
+ if constexpr (is_exclusive) {
+ if (split_index <= insert_index) {
+ // ...[s_index-1] |!| (i_index) [s_index]...
+ // offset i_position to right
+ is_insert_left = false;
+ } else {
+ // ...[s_index-1] (i_index)) |?[s_index]| ...
+ // ...(i_index)...[s_index-1] |?[s_index]| ...
+ is_insert_left = true;
+ --split_index;
+ }
+ } else {
+ if (split_index < insert_index) {
+ // ...[s_index-1] |?[s_index]| ...[(i_index)[s_index_k]...
+ is_insert_left = false;
+ } else if (split_index > insert_index) {
+ // ...[(i_index)s_index-1] |?[s_index]| ...
+ // ...[(i_index)s_index_k]...[s_index-1] |?[s_index]| ...
+ is_insert_left = true;
+ } else {
+ // ...[s_index-1] |?[(i_index)s_index]| ...
+ // i_to_left = std::nullopt;
+ }
+ }
+ }
+
+ template <ContainerType CTYPE, typename Enable = void> class _iterator_t;
+ template <ContainerType CTYPE>
+ class _iterator_t<CTYPE, std::enable_if_t<CTYPE == ContainerType::INDEXABLE>> {
+ /*
+ * indexable container type system:
+ * CONTAINER_TYPE = ContainerType::INDEXABLE
+ * keys() const -> index_t
+ * operator[](index_t) const -> key_get_type
+ * size_before(index_t) const -> node_offset_t
+ * size_overhead_at(index_t) const -> node_offset_t
+ * (IS_BOTTOM) get_p_value(index_t) const -> const value_t*
+ * (!IS_BOTTOM) size_to_nxt_at(index_t) const -> node_offset_t
+ * (!IS_BOTTOM) get_nxt_container(index_t) const
+ * encode(p_node_start, encoded)
+ * decode(p_node_start, delta) -> container_t
+ * static:
+ * header_size() -> node_offset_t
+ * estimate_insert(key, value) -> node_offset_t
+ * (IS_BOTTOM) insert_at(mut, src, key, value,
+ * index, size, p_left_bound) -> const value_t*
+ * (!IS_BOTTOM) insert_prefix_at(mut, src, key,
+ * index, size, p_left_bound) -> memory_range_t
+ * (!IS_BOTTOM) update_size_at(mut, src, index, size)
+ * trim_until(mut, container, index) -> trim_size
+ * (!IS_BOTTOM) trim_at(mut, container, index, trimmed) -> trim_size
+ *
+ * Appender::append(const container_t& src, from, items)
+ */
+ public:
+ using me_t = _iterator_t<CTYPE>;
+
+ _iterator_t(const container_t& container) : container{container} {
+ assert(container.keys());
+ }
+
+ index_t index() const {
+ return _index;
+ }
+ key_get_type get_key() const {
+ assert(!is_end());
+ return container[_index];
+ }
+ node_offset_t size_to_nxt() const {
+ assert(!is_end());
+ return container.size_to_nxt_at(_index);
+ }
+ template <typename T = typename NXT_STAGE_T::container_t>
+ std::enable_if_t<!IS_BOTTOM, T> get_nxt_container() const {
+ assert(!is_end());
+ return container.get_nxt_container(_index);
+ }
+ template <typename T = value_t>
+ std::enable_if_t<IS_BOTTOM, const T*> get_p_value() const {
+ assert(!is_end());
+ return container.get_p_value(_index);
+ }
+ bool is_last() const {
+ return _index + 1 == container.keys();
+ }
+ bool is_end() const { return _index == container.keys(); }
+ node_offset_t size() const {
+ assert(!is_end());
+ assert(header_size() == container.size_before(0));
+ assert(container.size_before(_index + 1) > container.size_before(_index));
+ return container.size_before(_index + 1) -
+ container.size_before(_index);
+ }
+ node_offset_t size_overhead() const {
+ assert(!is_end());
+ return container.size_overhead_at(_index);
+ }
+
+ me_t& operator++() {
+ assert(!is_end());
+ assert(!is_last());
+ ++_index;
+ return *this;
+ }
+ void seek_at(index_t index) {
+ assert(index < container.keys());
+ seek_till_end(index);
+ }
+ void seek_till_end(index_t index) {
+ assert(!is_end());
+ assert(this->index() == 0);
+ assert(index <= container.keys());
+ _index = index;
+ }
+ void seek_last() {
+ assert(!is_end());
+ assert(index() == 0);
+ _index = container.keys() - 1;
+ }
+ void set_end() {
+ assert(!is_end());
+ assert(is_last());
+ ++_index;
+ }
+ // Note: possible to return an end iterator
+ MatchKindBS seek(const full_key_t<KeyT::HOBJ>& key, bool exclude_last) {
+ assert(!is_end());
+ assert(index() == 0);
+ index_t end_index = container.keys();
+ if (exclude_last) {
+ assert(end_index);
+ --end_index;
+ assert(compare_to<KeyT::HOBJ>(key, container[end_index]) == MatchKindCMP::LT);
+ }
+ auto ret = binary_search(key, _index, end_index,
+ [this] (index_t index) { return container[index]; });
+ _index = ret.index;
+ return ret.match;
+ }
+
+ template <KeyT KT, typename T = value_t>
+ std::enable_if_t<IS_BOTTOM, const T*> insert(
+ NodeExtentMutable& mut, const full_key_t<KT>& key,
+ const value_t& value, node_offset_t insert_size, const char* p_left_bound) {
+ return container_t::template insert_at<KT>(
+ mut, container, key, value, _index, insert_size, p_left_bound);
+ }
+
+ template <KeyT KT, typename T = memory_range_t>
+ std::enable_if_t<!IS_BOTTOM, T> insert_prefix(
+ NodeExtentMutable& mut, const full_key_t<KT>& key,
+ node_offset_t size, const char* p_left_bound) {
+ return container_t::template insert_prefix_at<KT>(
+ mut, container, key, _index, size, p_left_bound);
+ }
+
+ template <typename T = void>
+ std::enable_if_t<!IS_BOTTOM, T>
+ update_size(NodeExtentMutable& mut, node_offset_t insert_size) {
+ assert(!is_end());
+ container_t::update_size_at(mut, container, _index, insert_size);
+ }
+
+ // Note: possible to return an end iterator when is_exclusive is true
+ template <bool is_exclusive>
+ size_t seek_split_inserted(
+ size_t start_size, size_t extra_size, size_t target_size,
+ index_t& insert_index, size_t insert_size,
+ std::optional<bool>& is_insert_left) {
+ assert(!is_end());
+ assert(index() == 0);
+ // replace insert_index placeholder
+ if constexpr (!is_exclusive) {
+ if (insert_index == INDEX_LAST) {
+ insert_index = container.keys() - 1;
+ }
+ } else {
+ if (insert_index == INDEX_END) {
+ insert_index = container.keys();
+ }
+ }
+ assert(insert_index <= container.keys());
+
+ auto start_size_1 = start_size + extra_size;
+ auto f_get_used_size = [this, start_size, start_size_1,
+ insert_index, insert_size] (index_t index) {
+ size_t current_size;
+ if (unlikely(index == 0)) {
+ current_size = start_size;
+ } else {
+ current_size = start_size_1;
+ if (index > insert_index) {
+ current_size += insert_size;
+ if constexpr (is_exclusive) {
+ --index;
+ }
+ }
+ // already includes header size
+ current_size += container.size_before(index);
+ }
+ return current_size;
+ };
+ index_t s_end;
+ if constexpr (is_exclusive) {
+ s_end = container.keys();
+ } else {
+ s_end = container.keys() - 1;
+ }
+ _index = binary_search_r(0, s_end, f_get_used_size, target_size).index;
+ size_t current_size = f_get_used_size(_index);
+ assert(current_size <= target_size);
+
+ _left_or_right<is_exclusive>(_index, insert_index, is_insert_left);
+ return current_size;
+ }
+
+ size_t seek_split(size_t start_size, size_t extra_size, size_t target_size) {
+ assert(!is_end());
+ assert(index() == 0);
+ auto start_size_1 = start_size + extra_size;
+ auto f_get_used_size = [this, start_size, start_size_1] (index_t index) {
+ size_t current_size;
+ if (unlikely(index == 0)) {
+ current_size = start_size;
+ } else {
+ // already includes header size
+ current_size = start_size_1 + container.size_before(index);
+ }
+ return current_size;
+ };
+ _index = binary_search_r(
+ 0, container.keys() - 1, f_get_used_size, target_size).index;
+ size_t current_size = f_get_used_size(_index);
+ assert(current_size <= target_size);
+ return current_size;
+ }
+
+ // Note: possible to return an end iterator if to_index == INDEX_END
+ template <KeyT KT>
+ void copy_out_until(
+ typename container_t::template Appender<KT>& appender, index_t& to_index) {
+ auto num_keys = container.keys();
+ index_t items;
+ if (to_index == INDEX_END) {
+ items = num_keys - _index;
+ appender.append(container, _index, items);
+ _index = num_keys;
+ to_index = _index;
+ } else if (to_index == INDEX_LAST) {
+ assert(!is_end());
+ items = num_keys - 1 - _index;
+ appender.append(container, _index, items);
+ _index = num_keys - 1;
+ to_index = _index;
+ } else {
+ assert(_index <= to_index);
+ assert(to_index <= num_keys);
+ items = to_index - _index;
+ appender.append(container, _index, items);
+ _index = to_index;
+ }
+ }
+
+ node_offset_t trim_until(NodeExtentMutable& mut) {
+ return container_t::trim_until(mut, container, _index);
+ }
+
+ template <typename T = node_offset_t>
+ std::enable_if_t<!IS_BOTTOM, T>
+ trim_at(NodeExtentMutable& mut, node_offset_t trimmed) {
+ return container_t::trim_at(mut, container, _index, trimmed);
+ }
+
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const {
+ container.encode(p_node_start, encoded);
+ ceph::encode(_index, encoded);
+ }
+
+ static me_t decode(const char* p_node_start,
+ ceph::bufferlist::const_iterator& delta) {
+ auto container = container_t::decode(p_node_start, delta);
+ auto ret = me_t(container);
+ index_t index;
+ ceph::decode(index, delta);
+ ret.seek_till_end(index);
+ return ret;
+ }
+
+ static node_offset_t header_size() {
+ return container_t::header_size();
+ }
+
+ template <KeyT KT>
+ static node_offset_t estimate_insert(
+ const full_key_t<KT>& key, const value_t& value) {
+ return container_t::template estimate_insert<KT>(key, value);
+ }
+
+ private:
+ container_t container;
+ index_t _index = 0;
+ };
+
+ template <ContainerType CTYPE>
+ class _iterator_t<CTYPE, std::enable_if_t<CTYPE == ContainerType::ITERATIVE>> {
+ /*
+ * iterative container type system (!IS_BOTTOM):
+ * CONTAINER_TYPE = ContainerType::ITERATIVE
+ * index() const -> index_t
+ * get_key() const -> key_get_type
+ * size() const -> node_offset_t
+ * size_to_nxt() const -> node_offset_t
+ * size_overhead() const -> node_offset_t
+ * get_nxt_container() const
+ * has_next() const -> bool
+ * encode(p_node_start, encoded)
+ * decode(p_node_start, delta) -> container_t
+ * operator++()
+ * static:
+ * header_size() -> node_offset_t
+ * estimate_insert(key, value) -> node_offset_t
+ * insert_prefix(mut, src, key, is_end, size, p_left_bound) -> memory_range_t
+ * update_size(mut, src, size)
+ * trim_until(mut, container) -> trim_size
+ * trim_at(mut, container, trimmed) -> trim_size
+ */
+ // currently the iterative iterator is only implemented with STAGE_STRING
+ // for in-node space efficiency
+ static_assert(STAGE == STAGE_STRING);
+ public:
+ using me_t = _iterator_t<CTYPE>;
+
+ _iterator_t(const container_t& container) : container{container} {}
+
+ index_t index() const {
+ if (is_end()) {
+ return container.index() + 1;
+ } else {
+ return container.index();
+ }
+ }
+ key_get_type get_key() const {
+ assert(!is_end());
+ return container.get_key();
+ }
+ node_offset_t size_to_nxt() const {
+ assert(!is_end());
+ return container.size_to_nxt();
+ }
+ const typename NXT_STAGE_T::container_t get_nxt_container() const {
+ assert(!is_end());
+ return container.get_nxt_container();
+ }
+ bool is_last() const {
+ assert(!is_end());
+ return !container.has_next();
+ }
+ bool is_end() const {
+#ifndef NDEBUG
+ if (_is_end) {
+ assert(!container.has_next());
+ }
+#endif
+ return _is_end;
+ }
+ node_offset_t size() const {
+ assert(!is_end());
+ return container.size();
+ }
+ node_offset_t size_overhead() const {
+ assert(!is_end());
+ return container.size_overhead();
+ }
+
+ me_t& operator++() {
+ assert(!is_end());
+ assert(!is_last());
+ ++container;
+ return *this;
+ }
+ void seek_at(index_t index) {
+ assert(!is_end());
+ assert(this->index() == 0);
+ while (index > 0) {
+ assert(container.has_next());
+ ++container;
+ --index;
+ }
+ }
+ void seek_till_end(index_t index) {
+ assert(!is_end());
+ assert(this->index() == 0);
+ while (index > 0) {
+ if (!container.has_next()) {
+ assert(index == 1);
+ set_end();
+ break;
+ }
+ ++container;
+ --index;
+ }
+ }
+ void seek_last() {
+ assert(!is_end());
+ assert(index() == 0);
+ while (container.has_next()) {
+ ++container;
+ }
+ }
+ void set_end() {
+ assert(!is_end());
+ assert(is_last());
+ _is_end = true;
+ }
+ // Note: possible to return an end iterator
+ MatchKindBS seek(const full_key_t<KeyT::HOBJ>& key, bool exclude_last) {
+ assert(!is_end());
+ assert(index() == 0);
+ do {
+ if (exclude_last && is_last()) {
+ assert(compare_to<KeyT::HOBJ>(key, get_key()) == MatchKindCMP::LT);
+ return MatchKindBS::NE;
+ }
+ auto match = compare_to<KeyT::HOBJ>(key, get_key());
+ if (match == MatchKindCMP::LT) {
+ return MatchKindBS::NE;
+ } else if (match == MatchKindCMP::EQ) {
+ return MatchKindBS::EQ;
+ } else {
+ if (container.has_next()) {
+ ++container;
+ } else {
+ // end
+ break;
+ }
+ }
+ } while (true);
+ assert(!exclude_last);
+ set_end();
+ return MatchKindBS::NE;
+ }
+
+ template <KeyT KT>
+ memory_range_t insert_prefix(
+ NodeExtentMutable& mut, const full_key_t<KT>& key,
+ node_offset_t size, const char* p_left_bound) {
+ return container_t::template insert_prefix<KT>(
+ mut, container, key, is_end(), size, p_left_bound);
+ }
+
+ void update_size(NodeExtentMutable& mut, node_offset_t insert_size) {
+ assert(!is_end());
+ container_t::update_size(mut, container, insert_size);
+ }
+
+ // Note: possible to return an end iterator when is_exclusive is true
+ // insert_index can still be INDEX_LAST or INDEX_END
+ template <bool is_exclusive>
+ size_t seek_split_inserted(
+ size_t start_size, size_t extra_size, size_t target_size,
+ index_t& insert_index, size_t insert_size,
+ std::optional<bool>& is_insert_left) {
+ assert(!is_end());
+ assert(index() == 0);
+ size_t current_size = start_size;
+ index_t split_index = 0;
+ extra_size += header_size();
+ do {
+ if constexpr (!is_exclusive) {
+ if (is_last()) {
+ assert(split_index == index());
+ if (insert_index == INDEX_LAST) {
+ insert_index = index();
+ }
+ assert(insert_index <= index());
+ break;
+ }
+ }
+
+ size_t nxt_size = current_size;
+ if (split_index == 0) {
+ nxt_size += extra_size;
+ }
+ if (split_index == insert_index) {
+ nxt_size += insert_size;
+ if constexpr (is_exclusive) {
+ if (nxt_size > target_size) {
+ break;
+ }
+ current_size = nxt_size;
+ ++split_index;
+ }
+ }
+ nxt_size += size();
+ if (nxt_size > target_size) {
+ break;
+ }
+ current_size = nxt_size;
+
+ if constexpr (is_exclusive) {
+ if (is_last()) {
+ assert(split_index == index());
+ set_end();
+ split_index = index();
+ if (insert_index == INDEX_END) {
+ insert_index = index();
+ }
+ assert(insert_index == index());
+ break;
+ } else {
+ ++(*this);
+ ++split_index;
+ }
+ } else {
+ ++(*this);
+ ++split_index;
+ }
+ } while (true);
+ assert(current_size <= target_size);
+
+ _left_or_right<is_exclusive>(split_index, insert_index, is_insert_left);
+ assert(split_index == index());
+ return current_size;
+ }
+
+ size_t seek_split(size_t start_size, size_t extra_size, size_t target_size) {
+ assert(!is_end());
+ assert(index() == 0);
+ size_t current_size = start_size;
+ do {
+ if (is_last()) {
+ break;
+ }
+
+ size_t nxt_size = current_size;
+ if (index() == 0) {
+ nxt_size += extra_size;
+ }
+ nxt_size += size();
+ if (nxt_size > target_size) {
+ break;
+ }
+ current_size = nxt_size;
+ ++(*this);
+ } while (true);
+ assert(current_size <= target_size);
+ return current_size;
+ }
+
+ // Note: possible to return an end iterator if to_index == INDEX_END
+ template <KeyT KT>
+ void copy_out_until(
+ typename container_t::template Appender<KT>& appender, index_t& to_index) {
+ if (is_end()) {
+ assert(!container.has_next());
+ if (to_index == INDEX_END) {
+ to_index = index();
+ }
+ assert(to_index == index());
+ return;
+ }
+ index_t items;
+ if (to_index == INDEX_END || to_index == INDEX_LAST) {
+ items = to_index;
+ } else {
+ assert(is_valid_index(to_index));
+ assert(index() <= to_index);
+ items = to_index - index();
+ }
+ if (appender.append(container, items)) {
+ set_end();
+ }
+ to_index = index();
+ }
+
+ node_offset_t trim_until(NodeExtentMutable& mut) {
+ if (is_end()) {
+ return 0;
+ }
+ return container_t::trim_until(mut, container);
+ }
+
+ node_offset_t trim_at(NodeExtentMutable& mut, node_offset_t trimmed) {
+ assert(!is_end());
+ return container_t::trim_at(mut, container, trimmed);
+ }
+
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const {
+ container.encode(p_node_start, encoded);
+ uint8_t is_end = _is_end;
+ ceph::encode(is_end, encoded);
+ }
+
+ static me_t decode(const char* p_node_start,
+ ceph::bufferlist::const_iterator& delta) {
+ auto container = container_t::decode(p_node_start, delta);
+ auto ret = me_t(container);
+ uint8_t is_end;
+ ceph::decode(is_end, delta);
+ if (is_end) {
+ ret.set_end();
+ }
+ return ret;
+ }
+
+ static node_offset_t header_size() {
+ return container_t::header_size();
+ }
+
+ template <KeyT KT>
+ static node_offset_t estimate_insert(const full_key_t<KT>& key, const value_t& value) {
+ return container_t::template estimate_insert<KT>(key, value);
+ }
+
+ private:
+ container_t container;
+ bool _is_end = false;
+ };
+
+ /*
+ * iterator_t encapsulates both indexable and iterative implementations
+ * from a *non-empty* container.
+ * cstr(const container_t&)
+ * access:
+ * index() -> index_t
+ * get_key() -> key_get_type (const reference or value type)
+ * is_last() -> bool
+ * is_end() -> bool
+ * size() -> node_offset_t
+ * size_overhead() -> node_offset_t
+ * (IS_BOTTOM) get_p_value() -> const value_t*
+ * (!IS_BOTTOM) get_nxt_container() -> nxt_stage::container_t
+ * (!IS_BOTTOM) size_to_nxt() -> node_offset_t
+ * seek:
+ * operator++() -> iterator_t&
+ * seek_at(index)
+ * seek_till_end(index)
+ * seek_last()
+ * set_end()
+ * seek(key, exclude_last) -> MatchKindBS
+ * insert:
+ * (IS_BOTTOM) insert(mut, key, value, size, p_left_bound) -> p_value
+ * (!IS_BOTTOM) insert_prefix(mut, key, size, p_left_bound) -> memory_range_t
+ * (!IS_BOTTOM) update_size(mut, size)
+ * split:
+ * seek_split_inserted<bool is_exclusive>(
+ * start_size, extra_size, target_size, insert_index, insert_size,
+ * std::optional<bool>& is_insert_left)
+ * -> insert to left/right/unknown (!exclusive)
+ * -> insert to left/right (exclusive, can be end)
+ * -> split_size
+ * seek_split(start_size, extra_size, target_size) -> split_size
+ * copy_out_until(appender, to_index) (can be end)
+ * trim_until(mut) -> trim_size
+ * (!IS_BOTTOM) trim_at(mut, trimmed) -> trim_size
+ * denc:
+ * encode(p_node_start, encoded)
+ * decode(p_node_start, delta) -> iterator_t
+ * static:
+ * header_size() -> node_offset_t
+ * estimate_insert(key, value) -> node_offset_t
+ */
+ using iterator_t = _iterator_t<CONTAINER_TYPE>;
+ /* TODO: detailed comments
+ * - trim_until(mut) -> trim_size
+ * * keep 0 to i - 1, and remove the rest, return the size trimmed.
+ * * if this is the end iterator, do nothing and return 0.
+ * * if this is the start iterator, normally needs to go to the higher
+ * stage to trim the entire container.
+ * - trim_at(mut, trimmed) -> trim_size
+ * * trim happens inside the current iterator, causing the size reduced by
+ * <trimmed>, return the total size trimmed.
+ */
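+ /*
+ * Example for trim_until() as described above: with entries
+ * [0] [1] [2] [3] at this stage and the iterator at index 2,
+ * trim_until(mut) keeps [0] [1], removes [2] [3], and returns the size of
+ * [2] + [3]; called on an end iterator it does nothing and returns 0.
+ */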
+
+ /*
+ * Lookup internals (hide?)
+ */
+
+ template <bool GET_KEY>
+ static result_t smallest_result(
+ const iterator_t& iter, full_key_t<KeyT::VIEW>* index_key) {
+ static_assert(!IS_BOTTOM);
+ assert(!iter.is_end());
+ auto pos_smallest = NXT_STAGE_T::position_t::begin();
+ auto nxt_container = iter.get_nxt_container();
+ auto value_ptr = NXT_STAGE_T::template get_p_value<GET_KEY>(
+ nxt_container, pos_smallest, index_key);
+ if constexpr (GET_KEY) {
+ index_key->set(iter.get_key());
+ }
+ return result_t{{iter.index(), pos_smallest}, value_ptr, STAGE};
+ }
+
+ template <bool GET_KEY>
+ static result_t nxt_lower_bound(
+ const full_key_t<KeyT::HOBJ>& key, iterator_t& iter,
+ MatchHistory& history, full_key_t<KeyT::VIEW>* index_key) {
+ static_assert(!IS_BOTTOM);
+ assert(!iter.is_end());
+ auto nxt_container = iter.get_nxt_container();
+ auto nxt_result = NXT_STAGE_T::template lower_bound<GET_KEY>(
+ nxt_container, key, history, index_key);
+ if (nxt_result.is_end()) {
+ if (iter.is_last()) {
+ return result_t::end();
+ } else {
+ return smallest_result<GET_KEY>(++iter, index_key);
+ }
+ } else {
+ if constexpr (GET_KEY) {
+ index_key->set(iter.get_key());
+ }
+ return result_t::from_nxt(iter.index(), nxt_result);
+ }
+ }
+
+ template <bool GET_POS, bool GET_KEY, bool GET_VAL>
+ static void lookup_largest_slot(
+ const container_t& container, position_t* p_position,
+ full_key_t<KeyT::VIEW>* p_index_key, const value_t** pp_value) {
+ auto iter = iterator_t(container);
+ iter.seek_last();
+ if constexpr (GET_KEY) {
+ assert(p_index_key);
+ p_index_key->set(iter.get_key());
+ }
+ if constexpr (GET_POS) {
+ assert(p_position);
+ p_position->index = iter.index();
+ }
+ if constexpr (IS_BOTTOM) {
+ if constexpr (GET_VAL) {
+ assert(pp_value);
+ *pp_value = iter.get_p_value();
+ }
+ } else {
+ auto nxt_container = iter.get_nxt_container();
+ if constexpr (GET_POS) {
+ NXT_STAGE_T::template lookup_largest_slot<true, GET_KEY, GET_VAL>(
+ nxt_container, &p_position->nxt, p_index_key, pp_value);
+ } else {
+ NXT_STAGE_T::template lookup_largest_slot<false, GET_KEY, GET_VAL>(
+ nxt_container, nullptr, p_index_key, pp_value);
+ }
+ }
+ }
+
+ template <bool GET_KEY = false>
+ static const value_t* get_p_value(
+ const container_t& container, const position_t& position,
+ full_key_t<KeyT::VIEW>* index_key = nullptr) {
+ auto iter = iterator_t(container);
+ iter.seek_at(position.index);
+ if constexpr (GET_KEY) {
+ index_key->set(iter.get_key());
+ }
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_container = iter.get_nxt_container();
+ return NXT_STAGE_T::template get_p_value<GET_KEY>(
+ nxt_container, position.nxt, index_key);
+ } else {
+ return iter.get_p_value();
+ }
+ }
+
+ static void get_key_view(
+ const container_t& container,
+ const position_t& position,
+ full_key_t<KeyT::VIEW>& index_key) {
+ auto iter = iterator_t(container);
+ iter.seek_at(position.index);
+ index_key.set(iter.get_key());
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_container = iter.get_nxt_container();
+ return NXT_STAGE_T::get_key_view(nxt_container, position.nxt, index_key);
+ }
+ }
+
+ template <bool GET_KEY = false>
+ static result_t lower_bound(
+ const container_t& container,
+ const full_key_t<KeyT::HOBJ>& key,
+ MatchHistory& history,
+ full_key_t<KeyT::VIEW>* index_key = nullptr) {
+ bool exclude_last = false;
+ if (history.get<STAGE>().has_value()) {
+ if (*history.get<STAGE>() == MatchKindCMP::EQ) {
+ // lookup is short-circuited
+ if constexpr (!IS_BOTTOM) {
+ assert(history.get<STAGE - 1>().has_value());
+ if (history.is_GT<STAGE - 1>()) {
+ auto iter = iterator_t(container);
+ bool test_key_equal;
+ if constexpr (STAGE == STAGE_STRING) {
+ // TODO(cross-node string dedup)
+ // test_key_equal = (iter.get_key().type() == ns_oid_view_t::Type::MIN);
+ auto cmp = compare_to<KeyT::HOBJ>(key, iter.get_key());
+ assert(cmp != MatchKindCMP::GT);
+ test_key_equal = (cmp == MatchKindCMP::EQ);
+ } else {
+ auto cmp = compare_to<KeyT::HOBJ>(key, iter.get_key());
+ // From history, key[stage] == parent[stage][index - 1]
+ // which should be the smallest possible value for all
+ // index[stage][*]
+ assert(cmp != MatchKindCMP::GT);
+ test_key_equal = (cmp == MatchKindCMP::EQ);
+ }
+ if (test_key_equal) {
+ return nxt_lower_bound<GET_KEY>(key, iter, history, index_key);
+ } else {
+ // key[stage] < index[stage][left-most]
+ return smallest_result<GET_KEY>(iter, index_key);
+ }
+ }
+ }
+ // IS_BOTTOM || !history.is_GT<STAGE - 1>()
+ auto iter = iterator_t(container);
+ iter.seek_last();
+ if constexpr (STAGE == STAGE_STRING) {
+ // TODO(cross-node string dedup)
+ // assert(iter.get_key().type() == ns_oid_view_t::Type::MAX);
+ assert(compare_to<KeyT::HOBJ>(key, iter.get_key()) == MatchKindCMP::EQ);
+ } else {
+ assert(compare_to<KeyT::HOBJ>(key, iter.get_key()) == MatchKindCMP::EQ);
+ }
+ if constexpr (GET_KEY) {
+ index_key->set(iter.get_key());
+ }
+ if constexpr (IS_BOTTOM) {
+ auto value_ptr = iter.get_p_value();
+ return result_t{{iter.index()}, value_ptr, MSTAT_EQ};
+ } else {
+ auto nxt_container = iter.get_nxt_container();
+ auto nxt_result = NXT_STAGE_T::template lower_bound<GET_KEY>(
+ nxt_container, key, history, index_key);
+ // !history.is_GT<STAGE - 1>() means
+ // key[stage+1 ...] <= index[stage+1 ...][*]
+ assert(!nxt_result.is_end());
+ return result_t::from_nxt(iter.index(), nxt_result);
+ }
+ } else if (*history.get<STAGE>() == MatchKindCMP::LT) {
+ exclude_last = true;
+ }
+ }
+ auto iter = iterator_t(container);
+ auto bs_match = iter.seek(key, exclude_last);
+ if (iter.is_end()) {
+ assert(!exclude_last);
+ assert(bs_match == MatchKindBS::NE);
+ history.set<STAGE>(MatchKindCMP::GT);
+ return result_t::end();
+ }
+ history.set<STAGE>(bs_match == MatchKindBS::EQ ?
+ MatchKindCMP::EQ : MatchKindCMP::LT);
+ if constexpr (IS_BOTTOM) {
+ if constexpr (GET_KEY) {
+ index_key->set(iter.get_key());
+ }
+ auto value_ptr = iter.get_p_value();
+ return result_t{{iter.index()}, value_ptr,
+ (bs_match == MatchKindBS::EQ ? MSTAT_EQ : MSTAT_LT0)};
+ } else {
+ if (bs_match == MatchKindBS::EQ) {
+ return nxt_lower_bound<GET_KEY>(key, iter, history, index_key);
+ } else {
+ return smallest_result<GET_KEY>(iter, index_key);
+ }
+ }
+ }
+
+ template <KeyT KT>
+ static node_offset_t insert_size(const full_key_t<KT>& key, const value_t& value) {
+ if constexpr (IS_BOTTOM) {
+ return iterator_t::template estimate_insert<KT>(key, value);
+ } else {
+ return iterator_t::template estimate_insert<KT>(key, value) +
+ NXT_STAGE_T::iterator_t::header_size() +
+ NXT_STAGE_T::template insert_size<KT>(key, value);
+ }
+ }
+
+ template <KeyT KT>
+ static node_offset_t insert_size_at(
+ match_stage_t stage, const full_key_t<KeyT::HOBJ>& key, const value_t& value) {
+ if (stage == STAGE) {
+ return insert_size<KT>(key, value);
+ } else {
+ assert(stage < STAGE);
+ return NXT_STAGE_T::template insert_size_at<KT>(stage, key, value);
+ }
+ }
+
+ template <typename T = std::tuple<match_stage_t, node_offset_t>>
+ static std::enable_if_t<NODE_TYPE == node_type_t::INTERNAL, T> evaluate_insert(
+ const container_t& container, const full_key_t<KeyT::VIEW>& key,
+ const value_t& value, position_t& position, bool evaluate_last) {
+ auto iter = iterator_t(container);
+ auto& index = position.index;
+ if (evaluate_last || index == INDEX_END) {
+ iter.seek_last();
+ index = iter.index();
+ // evaluate the previous index
+ } else {
+ assert(is_valid_index(index));
+ // evaluate the current index
+ iter.seek_at(index);
+ auto match = compare_to<KeyT::VIEW>(key, iter.get_key());
+ if (match == MatchKindCMP::EQ) {
+ if constexpr (IS_BOTTOM) {
+ ceph_abort("insert conflict at current index!");
+ } else {
+ // insert into the current index
+ auto nxt_container = iter.get_nxt_container();
+ return NXT_STAGE_T::evaluate_insert(
+ nxt_container, key, value, position.nxt, false);
+ }
+ } else {
+ assert(match == MatchKindCMP::LT);
+ if (index == 0) {
+ // already the first index, so insert at the current index
+ return {STAGE, insert_size<KeyT::VIEW>(key, value)};
+ }
+ --index;
+ iter = iterator_t(container);
+ iter.seek_at(index);
+ // proceed to evaluate the previous index
+ }
+ }
+
+ // XXX(multi-type): when key is from a different type of node
+ auto match = compare_to<KeyT::VIEW>(key, iter.get_key());
+ if (match == MatchKindCMP::GT) {
+ // key doesn't match both indexes, so insert at the current index
+ ++index;
+ return {STAGE, insert_size<KeyT::VIEW>(key, value)};
+ } else {
+ assert(match == MatchKindCMP::EQ);
+ if constexpr (IS_BOTTOM) {
+ // ceph_abort?
+ ceph_abort("insert conflict at the previous index!");
+ } else {
+ // insert into the previous index
+ auto nxt_container = iter.get_nxt_container();
+ return NXT_STAGE_T::evaluate_insert(
+ nxt_container, key, value, position.nxt, true);
+ }
+ }
+ }
+
+ template <typename T = bool>
+ static std::enable_if_t<NODE_TYPE == node_type_t::LEAF, T>
+ compensate_insert_position_at(match_stage_t stage, position_t& position) {
+ auto& index = position.index;
+ if (stage == STAGE) {
+ assert(index == 0);
+ // insert at the end of the current stage
+ index = INDEX_END;
+ return true;
+ } else {
+ if constexpr (IS_BOTTOM) {
+ ceph_abort("impossible path");
+ } else {
+ assert(stage < STAGE);
+ bool compensate = NXT_STAGE_T::
+ compensate_insert_position_at(stage, position.nxt);
+ if (compensate) {
+ assert(is_valid_index(index));
+ if (index == 0) {
+ // insert into the *last* index of the current stage
+ index = INDEX_LAST;
+ return true;
+ } else {
+ --index;
+ return false;
+ }
+ } else {
+ return false;
+ }
+ }
+ }
+ }
+
+ static void patch_insert_end(position_t& insert_pos, match_stage_t insert_stage) {
+ assert(insert_stage <= STAGE);
+ if (insert_stage == STAGE) {
+ insert_pos.index = INDEX_END;
+ } else if constexpr (!IS_BOTTOM) {
+ insert_pos.index = INDEX_LAST;
+ NXT_STAGE_T::patch_insert_end(insert_pos.nxt, insert_stage);
+ }
+ }
+
+ template <typename T = std::tuple<match_stage_t, node_offset_t>>
+ static std::enable_if_t<NODE_TYPE == node_type_t::LEAF, T> evaluate_insert(
+ const full_key_t<KeyT::HOBJ>& key, const onode_t& value,
+ const MatchHistory& history, match_stat_t mstat, position_t& position) {
+ match_stage_t insert_stage = STAGE_TOP;
+ while (*history.get_by_stage(insert_stage) == MatchKindCMP::EQ) {
+ assert(insert_stage != STAGE_BOTTOM && "insert conflict!");
+ --insert_stage;
+ }
+
+ if (history.is_GT()) {
+ if (position.is_end()) {
+ // no need to compensate insert position
+ assert(insert_stage <= STAGE && "impossible insert stage");
+ } else if (position == position_t::begin()) {
+ // this position must have been short-circuited by staged::smallest_result()
+ // in staged::lower_bound(), so we need to rely on mstat instead
+ assert(mstat >= MSTAT_LT0 && mstat <= MSTAT_LT3);
+ if (mstat == MSTAT_LT0) {
+ insert_stage = STAGE_RIGHT;
+ } else if (mstat == MSTAT_LT1) {
+ insert_stage = STAGE_STRING;
+ } else {
+ insert_stage = STAGE_LEFT;
+ }
+ // XXX(multi-type): need to upgrade node type before inserting an
+ // incompatible index at front.
+ assert(insert_stage <= STAGE && "incompatible insert");
+ } else {
+ assert(insert_stage <= STAGE && "impossible insert stage");
+ [[maybe_unused]] bool ret = compensate_insert_position_at(insert_stage, position);
+ assert(!ret);
+ }
+ }
+
+ if (position.is_end()) {
+ patch_insert_end(position, insert_stage);
+ }
+
+ node_offset_t insert_size = insert_size_at<KeyT::HOBJ>(insert_stage, key, value);
+
+ return {insert_stage, insert_size};
+ }
+
+ template <KeyT KT>
+ static const value_t* insert_new(
+ NodeExtentMutable& mut, const memory_range_t& range,
+ const full_key_t<KT>& key, const value_t& value) {
+ char* p_insert = const_cast<char*>(range.p_end);
+ const value_t* p_value = nullptr;
+ StagedAppender<KT> appender;
+ appender.init(&mut, p_insert);
+ appender.append(key, value, p_value);
+ [[maybe_unused]] const char* p_insert_front = appender.wrap();
+ assert(p_insert_front == range.p_start);
+ return p_value;
+ }
+
+ template <KeyT KT, bool SPLIT>
+ static const value_t* proceed_insert_recursively(
+ NodeExtentMutable& mut, const container_t& container,
+ const full_key_t<KT>& key, const value_t& value,
+ position_t& position, match_stage_t& stage,
+ node_offset_t& _insert_size, const char* p_left_bound) {
+ // proceed insert from right to left
+ assert(stage <= STAGE);
+ auto iter = iterator_t(container);
+ auto& index = position.index;
+
+ bool do_insert = false;
+ if (stage == STAGE) {
+ if (index == INDEX_END) {
+ iter.seek_last();
+ iter.set_end();
+ index = iter.index();
+ } else {
+ assert(is_valid_index(index));
+ iter.seek_till_end(index);
+ }
+ do_insert = true;
+ } else { // stage < STAGE
+ if (index == INDEX_LAST) {
+ iter.seek_last();
+ index = iter.index();
+ } else {
+ assert(is_valid_index(index));
+ iter.seek_till_end(index);
+ }
+ if constexpr (SPLIT) {
+ if (iter.is_end()) {
+ // insert at the higher stage due to split
+ do_insert = true;
+ _insert_size = insert_size<KT>(key, value);
+ stage = STAGE;
+ }
+ } else {
+ assert(!iter.is_end());
+ }
+ }
+
+ if (do_insert) {
+ if constexpr (!IS_BOTTOM) {
+ position.nxt = position_t::nxt_t::begin();
+ }
+ assert(_insert_size == insert_size<KT>(key, value));
+ if constexpr (IS_BOTTOM) {
+ return iter.template insert<KT>(
+ mut, key, value, _insert_size, p_left_bound);
+ } else {
+ auto range = iter.template insert_prefix<KT>(
+ mut, key, _insert_size, p_left_bound);
+ return NXT_STAGE_T::template insert_new<KT>(mut, range, key, value);
+ }
+ } else {
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_container = iter.get_nxt_container();
+ auto p_value = NXT_STAGE_T::template proceed_insert_recursively<KT, SPLIT>(
+ mut, nxt_container, key, value,
+ position.nxt, stage, _insert_size, p_left_bound);
+ iter.update_size(mut, _insert_size);
+ return p_value;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ }
+
+ template <KeyT KT, bool SPLIT>
+ static const value_t* proceed_insert(
+ NodeExtentMutable& mut, const container_t& container,
+ const full_key_t<KT>& key, const value_t& value,
+ position_t& position, match_stage_t& stage, node_offset_t& _insert_size) {
+ auto p_left_bound = container.p_left_bound();
+ if (unlikely(!container.keys())) {
+ if (position.is_end()) {
+ position = position_t::begin();
+ assert(stage == STAGE);
+ assert(_insert_size == insert_size<KT>(key, value));
+ } else if (position == position_t::begin()) {
+ // when insert into a trimmed and empty left node
+ stage = STAGE;
+ _insert_size = insert_size<KT>(key, value);
+ } else {
+ ceph_abort("impossible path");
+ }
+ if constexpr (IS_BOTTOM) {
+ return container_t::template insert_at<KT>(
+ mut, container, key, value, 0, _insert_size, p_left_bound);
+ } else {
+ auto range = container_t::template insert_prefix_at<KT>(
+ mut, container, key, 0, _insert_size, p_left_bound);
+ return NXT_STAGE_T::template insert_new<KT>(mut, range, key, value);
+ }
+ } else {
+ return proceed_insert_recursively<KT, SPLIT>(
+ mut, container, key, value,
+ position, stage, _insert_size, p_left_bound);
+ }
+ }
+
+ static std::ostream& dump(const container_t& container,
+ std::ostream& os,
+ const std::string& prefix,
+ size_t& size,
+ const char* p_start) {
+ auto iter = iterator_t(container);
+ assert(!iter.is_end());
+ std::string prefix_blank(prefix.size(), ' ');
+ const std::string* p_prefix = &prefix;
+ size += iterator_t::header_size();
+ do {
+ std::ostringstream sos;
+ sos << *p_prefix << iter.get_key() << ": ";
+ std::string i_prefix = sos.str();
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_container = iter.get_nxt_container();
+ size += iter.size_to_nxt();
+ NXT_STAGE_T::dump(nxt_container, os, i_prefix, size, p_start);
+ } else {
+ auto value_ptr = iter.get_p_value();
+ int offset = reinterpret_cast<const char*>(value_ptr) - p_start;
+ size += iter.size();
+ os << "\n" << i_prefix;
+ if constexpr (NODE_TYPE == node_type_t::LEAF) {
+ os << *value_ptr;
+ } else {
+ os << "0x" << std::hex << value_ptr->value << std::dec;
+ }
+ os << " " << size << "B"
+ << " @" << offset << "B";
+ }
+ if (iter.is_last()) {
+ break;
+ } else {
+ ++iter;
+ p_prefix = &prefix_blank;
+ }
+ } while (true);
+ return os;
+ }
+
+ static void validate(const container_t& container) {
+ auto iter = iterator_t(container);
+ assert(!iter.is_end());
+ auto key = iter.get_key();
+ do {
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_container = iter.get_nxt_container();
+ NXT_STAGE_T::validate(nxt_container);
+ }
+ if (iter.is_last()) {
+ break;
+ } else {
+ ++iter;
+ assert(compare_to(key, iter.get_key()) == MatchKindCMP::LT);
+ key = iter.get_key();
+ }
+ } while (true);
+ }
+
+ static void get_stats(const container_t& container, node_stats_t& stats,
+ full_key_t<KeyT::VIEW>& index_key) {
+ auto iter = iterator_t(container);
+ assert(!iter.is_end());
+ stats.size_overhead += iterator_t::header_size();
+ do {
+ index_key.replace(iter.get_key());
+ stats.size_overhead += iter.size_overhead();
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_container = iter.get_nxt_container();
+ NXT_STAGE_T::get_stats(nxt_container, stats, index_key);
+ } else {
+ ++stats.num_kvs;
+ size_t kv_logical_size = index_key.size_logical();
+ size_t value_size;
+ if constexpr (NODE_TYPE == node_type_t::LEAF) {
+ value_size = iter.get_p_value()->size;
+ } else {
+ value_size = sizeof(value_t);
+ }
+ stats.size_value += value_size;
+ kv_logical_size += value_size;
+ stats.size_logical += kv_logical_size;
+ }
+ if (iter.is_last()) {
+ break;
+ } else {
+ ++iter;
+ }
+ } while (true);
+ }
+
+ static bool next_position(const container_t& container, position_t& pos) {
+ auto iter = iterator_t(container);
+ assert(!iter.is_end());
+ iter.seek_at(pos.index);
+ bool find_next;
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_container = iter.get_nxt_container();
+ find_next = NXT_STAGE_T::next_position(nxt_container, pos.nxt);
+ } else {
+ find_next = true;
+ }
+ if (find_next) {
+ if (iter.is_last()) {
+ return true;
+ } else {
+ pos.index = iter.index() + 1;
+ if constexpr (!IS_BOTTOM) {
+ pos.nxt = NXT_STAGE_T::position_t::begin();
+ }
+ return false;
+ }
+ } else {
+ return false;
+ }
+ }
+
+ struct _BaseEmpty {};
+ class _BaseWithNxtIterator {
+ protected:
+ typename NXT_STAGE_T::StagedIterator _nxt;
+ };
+ class StagedIterator
+ : std::conditional_t<IS_BOTTOM, _BaseEmpty, _BaseWithNxtIterator> {
+ public:
+ StagedIterator() = default;
+ bool valid() const { return iter.has_value(); }
+ index_t index() const {
+ return iter->index();
+ }
+ bool is_end() const { return iter->is_end(); }
+ bool in_progress() const {
+ assert(valid());
+ if constexpr (!IS_BOTTOM) {
+ if (this->_nxt.valid()) {
+ if (this->_nxt.index() == 0) {
+ return this->_nxt.in_progress();
+ } else {
+ return true;
+ }
+ } else {
+ return false;
+ }
+ } else {
+ return false;
+ }
+ }
+ key_get_type get_key() const { return iter->get_key(); }
+
+ iterator_t& get() { return *iter; }
+ void set(const container_t& container) {
+ assert(!valid());
+ iter = iterator_t(container);
+ }
+ void set_end() { iter->set_end(); }
+ typename NXT_STAGE_T::StagedIterator& nxt() {
+ if constexpr (!IS_BOTTOM) {
+ if (!this->_nxt.valid()) {
+ auto nxt_container = iter->get_nxt_container();
+ this->_nxt.set(nxt_container);
+ }
+ return this->_nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ typename NXT_STAGE_T::StagedIterator& get_nxt() {
+ if constexpr (!IS_BOTTOM) {
+ return this->_nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ StagedIterator& operator++() {
+ if (iter->is_last()) {
+ iter->set_end();
+ } else {
+ ++(*iter);
+ }
+ if constexpr (!IS_BOTTOM) {
+ this->_nxt.reset();
+ }
+ return *this;
+ }
+ void reset() {
+ if (valid()) {
+ iter.reset();
+ if constexpr (!IS_BOTTOM) {
+ this->_nxt.reset();
+ }
+ }
+ }
+ std::ostream& print(std::ostream& os, bool is_top) const {
+ if (valid()) {
+ if (iter->is_end()) {
+ return os << "END";
+ } else {
+ os << index();
+ }
+ } else {
+ if (is_top) {
+ return os << "invalid StagedIterator!";
+ } else {
+ os << "0!";
+ }
+ }
+ if constexpr (!IS_BOTTOM) {
+ os << ", ";
+ return this->_nxt.print(os, false);
+ } else {
+ return os;
+ }
+ }
+ position_t get_pos() const {
+ if (valid()) {
+ if constexpr (IS_BOTTOM) {
+ return position_t{index()};
+ } else {
+ return position_t{index(), this->_nxt.get_pos()};
+ }
+ } else {
+ return position_t::begin();
+ }
+ }
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const {
+ uint8_t present = static_cast<bool>(iter);
+ ceph::encode(present, encoded);
+ if (iter.has_value()) {
+ iter->encode(p_node_start, encoded);
+ if constexpr (!IS_BOTTOM) {
+ this->_nxt.encode(p_node_start, encoded);
+ }
+ }
+ }
+ static StagedIterator decode(const char* p_node_start,
+ ceph::bufferlist::const_iterator& delta) {
+ StagedIterator ret;
+ uint8_t present;
+ ceph::decode(present, delta);
+ if (present) {
+ ret.iter = iterator_t::decode(p_node_start, delta);
+ if constexpr (!IS_BOTTOM) {
+ ret._nxt = NXT_STAGE_T::StagedIterator::decode(p_node_start, delta);
+ }
+ }
+ return ret;
+ }
+ friend std::ostream& operator<<(std::ostream& os, const StagedIterator& iter) {
+ return iter.print(os, true);
+ }
+ private:
+ std::optional<iterator_t> iter;
+ };
+
+ static bool recursively_locate_split(
+ size_t& current_size, size_t extra_size,
+ size_t target_size, StagedIterator& split_at) {
+ assert(current_size <= target_size);
+ iterator_t& split_iter = split_at.get();
+ current_size = split_iter.seek_split(current_size, extra_size, target_size);
+ assert(current_size <= target_size);
+ assert(!split_iter.is_end());
+ if (split_iter.index() == 0) {
+ extra_size += iterator_t::header_size();
+ } else {
+ extra_size = 0;
+ }
+ bool locate_nxt;
+ if constexpr (!IS_BOTTOM) {
+ locate_nxt = NXT_STAGE_T::recursively_locate_split(
+ current_size, extra_size + split_iter.size_to_nxt(),
+ target_size, split_at.nxt());
+ } else { // IS_BOTTOM
+ // located upper_bound, fair split strategy
+ size_t nxt_size = split_iter.size() + extra_size;
+ assert(current_size + nxt_size > target_size);
+ if (current_size + nxt_size/2 < target_size) {
+ // include next
+ current_size += nxt_size;
+ locate_nxt = true;
+ } else {
+ // exclude next
+ locate_nxt = false;
+ }
+ }
+ if (locate_nxt) {
+ if (split_iter.is_last()) {
+ return true;
+ } else {
+ ++split_at;
+ return false;
+ }
+ } else {
+ return false;
+ }
+ }
+
+ static bool recursively_locate_split_inserted(
+ size_t& current_size, size_t extra_size, size_t target_size,
+ position_t& insert_pos, match_stage_t insert_stage, size_t insert_size,
+ std::optional<bool>& is_insert_left, StagedIterator& split_at) {
+ assert(current_size <= target_size);
+ assert(!is_insert_left.has_value());
+ iterator_t& split_iter = split_at.get();
+ auto& insert_index = insert_pos.index;
+ if (insert_stage == STAGE) {
+ current_size = split_iter.template seek_split_inserted<true>(
+ current_size, extra_size, target_size,
+ insert_index, insert_size, is_insert_left);
+ assert(is_insert_left.has_value());
+ assert(current_size <= target_size);
+ if (split_iter.index() == 0) {
+ if (insert_index == 0) {
+ if (*is_insert_left == false) {
+ extra_size += iterator_t::header_size();
+ } else {
+ extra_size = 0;
+ }
+ } else {
+ extra_size += iterator_t::header_size();
+ }
+ } else {
+ extra_size = 0;
+ }
+ if (*is_insert_left == false && split_iter.index() == insert_index) {
+ // split_iter can be end
+ // found the lower-bound of target_size
+ // ...[s_index-1] |!| (i_index) [s_index]...
+
+ // located upper-bound, fair split strategy
+ // look at the next slot (the insert item)
+ size_t nxt_size = insert_size + extra_size;
+ assert(current_size + nxt_size > target_size);
+ if (current_size + nxt_size/2 < target_size) {
+ // include next
+ *is_insert_left = true;
+ current_size += nxt_size;
+ if (split_iter.is_end()) {
+ // ...[s_index-1] (i_index) |!|
+ return true;
+ } else {
+ return false;
+ }
+ } else {
+ // exclude next
+ return false;
+ }
+ } else {
+ // Already considered insert effect in the current stage.
+ // Look into the next stage to identify the target_size lower-bound w/o
+ // insert effect.
+ assert(!split_iter.is_end());
+ bool locate_nxt;
+ if constexpr (!IS_BOTTOM) {
+ locate_nxt = NXT_STAGE_T::recursively_locate_split(
+ current_size, extra_size + split_iter.size_to_nxt(),
+ target_size, split_at.nxt());
+ } else { // IS_BOTTOM
+ // located upper-bound, fair split strategy
+ // look at the next slot
+ size_t nxt_size = split_iter.size() + extra_size;
+ assert(current_size + nxt_size > target_size);
+ if (current_size + nxt_size/2 < target_size) {
+ // include next
+ current_size += nxt_size;
+ locate_nxt = true;
+ } else {
+ // exclude next
+ locate_nxt = false;
+ }
+ }
+ if (locate_nxt) {
+ if (split_iter.is_last()) {
+ auto end_index = split_iter.index() + 1;
+ if (insert_index == INDEX_END) {
+ insert_index = end_index;
+ }
+ assert(insert_index <= end_index);
+ if (insert_index == end_index) {
+ assert(*is_insert_left == false);
+ split_iter.set_end();
+ // ...[s_index-1] |!| (i_index)
+ return false;
+ } else {
+ assert(*is_insert_left == true);
+ return true;
+ }
+ } else {
+ ++split_at;
+ return false;
+ }
+ } else {
+ return false;
+ }
+ }
+ } else {
+ if constexpr (!IS_BOTTOM) {
+ assert(insert_stage < STAGE);
+ current_size = split_iter.template seek_split_inserted<false>(
+ current_size, extra_size, target_size,
+ insert_index, insert_size, is_insert_left);
+ assert(!split_iter.is_end());
+ assert(current_size <= target_size);
+ if (split_iter.index() == 0) {
+ extra_size += iterator_t::header_size();
+ } else {
+ extra_size = 0;
+ }
+ bool locate_nxt;
+ if (!is_insert_left.has_value()) {
+ // Considered insert effect in the current stage, and insert happens
+ // in the lower stage.
+ // Look into the next stage to identify the target_size lower-bound w/
+ // insert effect.
+ assert(split_iter.index() == insert_index);
+ locate_nxt = NXT_STAGE_T::recursively_locate_split_inserted(
+ current_size, extra_size + split_iter.size_to_nxt(), target_size,
+ insert_pos.nxt, insert_stage, insert_size,
+ is_insert_left, split_at.nxt());
+ assert(is_insert_left.has_value());
+#ifndef NDEBUG
+ if (locate_nxt) {
+ assert(*is_insert_left == true);
+ }
+#endif
+ } else {
+ // is_insert_left.has_value() == true
+ // Insert will *not* happen in the lower stage.
+ // Need to look into the next stage to identify the target_size
+ // lower-bound w/ insert effect
+ assert(split_iter.index() != insert_index);
+ locate_nxt = NXT_STAGE_T::recursively_locate_split(
+ current_size, extra_size + split_iter.size_to_nxt(),
+ target_size, split_at.nxt());
+#ifndef NDEBUG
+ if (split_iter.index() < insert_index) {
+ assert(*is_insert_left == false);
+ } else {
+ assert(*is_insert_left == true);
+ }
+#endif
+ }
+ if (locate_nxt) {
+ if (split_iter.is_last()) {
+ return true;
+ } else {
+ ++split_at;
+ return false;
+ }
+ } else {
+ return false;
+ }
+ } else {
+ ceph_abort("impossible path");
+ return false;
+ }
+ }
+ }
+
+ /*
+ * container appender type system
+ * container_t::Appender(NodeExtentMutable& mut, char* p_append)
+ * append(const container_t& src, index_t from, index_t items)
+ * wrap() -> char*
+ * IF !IS_BOTTOM:
+ * open_nxt(const key_get_type&)
+ * open_nxt(const full_key_t&)
+ * -> std::tuple<NodeExtentMutable&, char*>
+ * wrap_nxt(char* p_append)
+ * ELSE
+ * append(const full_key_t& key, const value_t& value)
+ */
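+ //
+ // An informal usage sketch of the contract above (the concrete container
+ // Appender implementations are the authoritative reference; the names here
+ // are only illustrative):
+ // appender.append(src, from, items); // bulk-copy whole items from src
+ // appender.open_nxt(partial_key); // start a new item and descend one stage
+ // ... // append into the lower-stage appender
+ // appender.wrap_nxt(p_lower_end); // close the item opened above
+ // char* p_left_bound = appender.wrap(); // finish; returns the front bound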
+ template <KeyT KT>
+ struct _BaseWithNxtAppender {
+ typename NXT_STAGE_T::template StagedAppender<KT> _nxt;
+ };
+ template <KeyT KT>
+ class StagedAppender
+ : std::conditional_t<IS_BOTTOM, _BaseEmpty, _BaseWithNxtAppender<KT>> {
+ public:
+ StagedAppender() = default;
+ ~StagedAppender() {
+ assert(!require_wrap_nxt);
+ assert(!valid());
+ }
+ bool valid() const { return appender.has_value(); }
+ index_t index() const {
+ assert(valid());
+ return _index;
+ }
+ bool in_progress() const { return require_wrap_nxt; }
+ // TODO: pass by reference
+ void init(NodeExtentMutable* p_mut, char* p_start) {
+ assert(!valid());
+ appender = typename container_t::template Appender<KT>(p_mut, p_start);
+ _index = 0;
+ }
+ // possible to make src_iter end if to_index == INDEX_END
+ void append_until(StagedIterator& src_iter, index_t& to_index) {
+ assert(!require_wrap_nxt);
+ auto s_index = src_iter.index();
+ src_iter.get().template copy_out_until<KT>(*appender, to_index);
+ assert(src_iter.index() == to_index);
+ assert(to_index >= s_index);
+ auto increment = (to_index - s_index);
+ if (increment) {
+ _index += increment;
+ if constexpr (!IS_BOTTOM) {
+ src_iter.get_nxt().reset();
+ }
+ }
+ }
+ void append(const full_key_t<KT>& key,
+ const value_t& value, const value_t*& p_value) {
+ assert(!require_wrap_nxt);
+ if constexpr (!IS_BOTTOM) {
+ auto& nxt = open_nxt(key);
+ nxt.append(key, value, p_value);
+ wrap_nxt();
+ } else {
+ appender->append(key, value, p_value);
+ ++_index;
+ }
+ }
+ char* wrap() {
+ assert(valid());
+ assert(_index > 0);
+ if constexpr (!IS_BOTTOM) {
+ if (require_wrap_nxt) {
+ wrap_nxt();
+ }
+ }
+ auto ret = appender->wrap();
+ appender.reset();
+ return ret;
+ }
+ typename NXT_STAGE_T::template StagedAppender<KT>&
+ open_nxt(key_get_type partial_key) {
+ assert(!require_wrap_nxt);
+ if constexpr (!IS_BOTTOM) {
+ require_wrap_nxt = true;
+ auto [p_mut, p_append] = appender->open_nxt(partial_key);
+ this->_nxt.init(p_mut, p_append);
+ return this->_nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ typename NXT_STAGE_T::template StagedAppender<KT>&
+ open_nxt(const full_key_t<KT>& key) {
+ assert(!require_wrap_nxt);
+ if constexpr (!IS_BOTTOM) {
+ require_wrap_nxt = true;
+ auto [p_mut, p_append] = appender->open_nxt(key);
+ this->_nxt.init(p_mut, p_append);
+ return this->_nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ typename NXT_STAGE_T::template StagedAppender<KT>& get_nxt() {
+ if constexpr (!IS_BOTTOM) {
+ assert(require_wrap_nxt);
+ return this->_nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ void wrap_nxt() {
+ if constexpr (!IS_BOTTOM) {
+ assert(require_wrap_nxt);
+ require_wrap_nxt = false;
+ auto p_append = this->_nxt.wrap();
+ appender->wrap_nxt(p_append);
+ ++_index;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ private:
+ std::optional<typename container_t::template Appender<KT>> appender;
+ index_t _index;
+ bool require_wrap_nxt = false;
+ };
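+
+ // Informal note: insert_new() above shows the minimal lifecycle of this
+ // appender -- init(&mut, p_append), one or more append()/append_until()
+ // calls, then wrap() to obtain the front bound of the appended range.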
+
+ template <KeyT KT>
+ static void _append_range(
+ StagedIterator& src_iter, StagedAppender<KT>& appender, index_t& to_index) {
+ if (src_iter.is_end()) {
+ // append done
+ assert(to_index == INDEX_END);
+ to_index = src_iter.index();
+ } else if constexpr (!IS_BOTTOM) {
+ if (appender.in_progress()) {
+ // appender has appended something at the current item,
+ // cannot append the current item as-a-whole
+ index_t to_index_nxt = INDEX_END;
+ NXT_STAGE_T::template _append_range<KT>(
+ src_iter.nxt(), appender.get_nxt(), to_index_nxt);
+ ++src_iter;
+ appender.wrap_nxt();
+ } else if (src_iter.in_progress()) {
+ // src_iter is not at the beginning of the current item,
+ // cannot append the current item as-a-whole
+ index_t to_index_nxt = INDEX_END;
+ NXT_STAGE_T::template _append_range<KT>(
+ src_iter.nxt(), appender.open_nxt(src_iter.get_key()), to_index_nxt);
+ ++src_iter;
+ appender.wrap_nxt();
+ } else {
+ // we can safely append the current item as-a-whole
+ }
+ }
+ appender.append_until(src_iter, to_index);
+ }
+
+ template <KeyT KT>
+ static void _append_into(StagedIterator& src_iter, StagedAppender<KT>& appender,
+ position_t& position, match_stage_t stage) {
+ assert(position.index == src_iter.index());
+ // reaches the last item
+ if (stage == STAGE) {
+ // done, end recursion
+ if constexpr (!IS_BOTTOM) {
+ position.nxt = position_t::nxt_t::begin();
+ }
+ } else {
+ assert(stage < STAGE);
+ // proceed append in the next stage
+ NXT_STAGE_T::template append_until<KT>(
+ src_iter.nxt(), appender.open_nxt(src_iter.get_key()),
+ position.nxt, stage);
+ }
+ }
+
+ template <KeyT KT>
+ static void append_until(StagedIterator& src_iter, StagedAppender<KT>& appender,
+ position_t& position, match_stage_t stage) {
+ index_t from_index = src_iter.index();
+ index_t& to_index = position.index;
+ assert(from_index <= to_index);
+ if constexpr (IS_BOTTOM) {
+ assert(stage == STAGE);
+ appender.append_until(src_iter, to_index);
+ } else {
+ assert(stage <= STAGE);
+ if (src_iter.index() == to_index) {
+ _append_into<KT>(src_iter, appender, position, stage);
+ } else {
+ if (to_index == INDEX_END) {
+ assert(stage == STAGE);
+ } else if (to_index == INDEX_LAST) {
+ assert(stage < STAGE);
+ }
+ _append_range<KT>(src_iter, appender, to_index);
+ _append_into<KT>(src_iter, appender, position, stage);
+ }
+ }
+ to_index -= from_index;
+ }
+
+ template <KeyT KT>
+ static bool append_insert(
+ const full_key_t<KT>& key, const value_t& value,
+ StagedIterator& src_iter, StagedAppender<KT>& appender,
+ bool is_front_insert, match_stage_t& stage, const value_t*& p_value) {
+ assert(src_iter.valid());
+ if (stage == STAGE) {
+ appender.append(key, value, p_value);
+ if (src_iter.is_end()) {
+ return true;
+ } else {
+ return false;
+ }
+ } else {
+ assert(stage < STAGE);
+ if constexpr (!IS_BOTTOM) {
+ auto nxt_is_end = NXT_STAGE_T::template append_insert<KT>(
+ key, value, src_iter.get_nxt(), appender.get_nxt(),
+ is_front_insert, stage, p_value);
+ if (nxt_is_end) {
+ appender.wrap_nxt();
+ ++src_iter;
+ if (is_front_insert) {
+ stage = STAGE;
+ }
+ if (src_iter.is_end()) {
+ return true;
+ }
+ }
+ return false;
+ } else {
+ ceph_abort("impossible path");
+ }
+ }
+ }
+
+ /* TrimType:
+ * BEFORE: remove the entire container, which normally means the
+ * corresponding higher-stage iterator needs to be trimmed as-a-whole.
+ * AFTER: retain the entire container, which normally means the trim should
+ * start from the next iterator at the higher stage.
+ * AT: the trim happens inside the current container, and the corresponding
+ * higher-stage iterator needs to be adjusted by the trimmed size.
+ */
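+ //
+ // Example (informal, following the rules above): trimming a 3-stage layout
+ // at position {2, {0, {0}}} -- the RIGHT and STRING stages both report
+ // BEFORE (their index is 0), so the LEFT stage trims its items from index 2
+ // onwards as-a-whole and reports AT together with the trimmed size.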
+ static std::tuple<TrimType, node_offset_t>
+ recursively_trim(NodeExtentMutable& mut, StagedIterator& trim_at) {
+ if (!trim_at.valid()) {
+ return {TrimType::BEFORE, 0u};
+ }
+ if (trim_at.is_end()) {
+ return {TrimType::AFTER, 0u};
+ }
+
+ auto& iter = trim_at.get();
+ if constexpr (!IS_BOTTOM) {
+ auto [type, trimmed] = NXT_STAGE_T::recursively_trim(
+ mut, trim_at.get_nxt());
+ node_offset_t trim_size;
+ if (type == TrimType::AFTER) {
+ if (iter.is_last()) {
+ return {TrimType::AFTER, 0u};
+ }
+ ++trim_at;
+ trim_size = iter.trim_until(mut);
+ } else if (type == TrimType::BEFORE) {
+ if (iter.index() == 0) {
+ return {TrimType::BEFORE, 0u};
+ }
+ trim_size = iter.trim_until(mut);
+ } else {
+ trim_size = iter.trim_at(mut, trimmed);
+ }
+ return {TrimType::AT, trim_size};
+ } else {
+ if (iter.index() == 0) {
+ return {TrimType::BEFORE, 0u};
+ } else {
+ auto trimmed = iter.trim_until(mut);
+ return {TrimType::AT, trimmed};
+ }
+ }
+ }
+
+ static void trim(NodeExtentMutable& mut, StagedIterator& trim_at) {
+ auto [type, trimmed] = recursively_trim(mut, trim_at);
+ if (type == TrimType::BEFORE) {
+ assert(trim_at.valid());
+ auto& iter = trim_at.get();
+ iter.trim_until(mut);
+ }
+ }
+};
+
+/**
+ * Configurations for struct staged
+ *
+ * staged_params_* assembles different container_t implementations (defined by
+ * staged::_iterator_t) by STAGE, and constructs the final multi-stage
+ * implementations for different node layouts defined by
+ * node_extent_t<FieldType, NODE_TYPE>.
+ *
+ * The specialized implementations for different layouts are accessible through
+ * the helper type node_to_stage_t<node_extent_t<FieldType, NODE_TYPE>>.
+ *
+ * Specifically, the settings of 8 layouts are:
+ *
+ * The layout (N0, LEAF/INTERNAL) has 3 stages:
+ * - STAGE_LEFT: node_extent_t<node_fields_0_t, LEAF/INTERNAL>
+ * - STAGE_STRING: item_iterator_t<LEAF/INTERNAL>
+ * - STAGE_RIGHT: sub_items_t<LEAF/INTERNAL>
+ *
+ * The layout (N1, LEAF/INTERNAL) has 3 stages:
+ * - STAGE_LEFT: node_extent_t<node_fields_1_t, LEAF/INTERNAL>
+ * - STAGE_STRING: item_iterator_t<LEAF/INTERNAL>
+ * - STAGE_RIGHT: sub_items_t<LEAF/INTERNAL>
+ *
+ * The layout (N2, LEAF/INTERNAL) has 2 stages:
+ * - STAGE_STRING: node_extent_t<node_fields_2_t, LEAF/INTERNAL>
+ * - STAGE_RIGHT: sub_items_t<LEAF/INTERNAL>
+ *
+ * The layout (N3, LEAF) has 1 stage:
+ * - STAGE_RIGHT: node_extent_t<leaf_fields_3_t, LEAF>
+ *
+ * The layout (N3, INTERNAL) has 1 stage:
+ * - STAGE_RIGHT: node_extent_t<internal_fields_3_t, INTERNAL>
+ */
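+// For example (informal restatement of the table above): for the (N0, LEAF)
+// layout, node_to_stage_t<node_extent_t<node_fields_0_t, node_type_t::LEAF>>
+// resolves to staged<staged_params_node_01<...>> (STAGE_LEFT), whose
+// next_param_t chain continues with staged_params_item_iterator
+// (STAGE_STRING) and staged_params_subitems (STAGE_RIGHT), matching the
+// 3 stages listed above.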
+
+template <node_type_t _NODE_TYPE>
+struct staged_params_subitems {
+ using container_t = sub_items_t<_NODE_TYPE>;
+ static constexpr auto NODE_TYPE = _NODE_TYPE;
+ static constexpr auto STAGE = STAGE_RIGHT;
+
+ // dummy type in order to make our type system work
+ // any better solution to get rid of this?
+ using next_param_t = staged_params_subitems<NODE_TYPE>;
+};
+
+template <node_type_t _NODE_TYPE>
+struct staged_params_item_iterator {
+ using container_t = item_iterator_t<_NODE_TYPE>;
+ static constexpr auto NODE_TYPE = _NODE_TYPE;
+ static constexpr auto STAGE = STAGE_STRING;
+
+ using next_param_t = staged_params_subitems<NODE_TYPE>;
+};
+
+template <typename NodeType>
+struct staged_params_node_01 {
+ using container_t = NodeType;
+ static constexpr auto NODE_TYPE = NodeType::NODE_TYPE;
+ static constexpr auto STAGE = STAGE_LEFT;
+
+ using next_param_t = staged_params_item_iterator<NODE_TYPE>;
+};
+
+template <typename NodeType>
+struct staged_params_node_2 {
+ using container_t = NodeType;
+ static constexpr auto NODE_TYPE = NodeType::NODE_TYPE;
+ static constexpr auto STAGE = STAGE_STRING;
+
+ using next_param_t = staged_params_subitems<NODE_TYPE>;
+};
+
+template <typename NodeType>
+struct staged_params_node_3 {
+ using container_t = NodeType;
+ static constexpr auto NODE_TYPE = NodeType::NODE_TYPE;
+ static constexpr auto STAGE = STAGE_RIGHT;
+
+ // dummy type in order to make our type system work
+ // any better solution to get rid of this?
+ using next_param_t = staged_params_node_3<NodeType>;
+};
+
+template <typename NodeType, typename Enable = void> struct _node_to_stage_t;
+template <typename NodeType>
+struct _node_to_stage_t<NodeType,
+ std::enable_if_t<NodeType::FIELD_TYPE == field_type_t::N0 ||
+ NodeType::FIELD_TYPE == field_type_t::N1>> {
+ using type = staged<staged_params_node_01<NodeType>>;
+};
+template <typename NodeType>
+struct _node_to_stage_t<NodeType,
+ std::enable_if_t<NodeType::FIELD_TYPE == field_type_t::N2>> {
+ using type = staged<staged_params_node_2<NodeType>>;
+};
+template <typename NodeType>
+struct _node_to_stage_t<NodeType,
+ std::enable_if_t<NodeType::FIELD_TYPE == field_type_t::N3>> {
+ using type = staged<staged_params_node_3<NodeType>>;
+};
+template <typename NodeType>
+using node_to_stage_t = typename _node_to_stage_t<NodeType>::type;
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h
new file mode 100644
index 000000000..a9d5cef3b
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h
@@ -0,0 +1,411 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cassert>
+#include <optional>
+#include <ostream>
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/fwd.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/tree_types.h"
+
+namespace crimson::os::seastore::onode {
+
+using match_stage_t = int8_t;
+constexpr match_stage_t STAGE_LEFT = 2; // shard/pool/crush
+constexpr match_stage_t STAGE_STRING = 1; // nspace/oid
+constexpr match_stage_t STAGE_RIGHT = 0; // snap/gen
+constexpr auto STAGE_TOP = STAGE_LEFT;
+constexpr auto STAGE_BOTTOM = STAGE_RIGHT;
+constexpr bool is_valid_stage(match_stage_t stage) {
+ return std::clamp(stage, STAGE_BOTTOM, STAGE_TOP) == stage;
+}
+// TODO: replace by
+// using match_history_t = int8_t;
+// left_m, str_m, right_m
+// 3: GT,
+// 2: EQ, GT,
+// 1: EQ, EQ, GT
+// 0: EQ, EQ, EQ
+// -1: EQ, EQ, LT
+// -2: EQ, LT,
+// -3: LT,
+
+struct MatchHistory {
+ template <match_stage_t STAGE>
+ const std::optional<MatchKindCMP>& get() const {
+ static_assert(is_valid_stage(STAGE));
+ if constexpr (STAGE == STAGE_RIGHT) {
+ return right_match;
+ } else if (STAGE == STAGE_STRING) {
+ return string_match;
+ } else {
+ return left_match;
+ }
+ }
+
+ const std::optional<MatchKindCMP>&
+ get_by_stage(match_stage_t stage) const {
+ assert(is_valid_stage(stage));
+ if (stage == STAGE_RIGHT) {
+ return right_match;
+ } else if (stage == STAGE_STRING) {
+ return string_match;
+ } else {
+ return left_match;
+ }
+ }
+
+ template <match_stage_t STAGE = STAGE_TOP>
+ bool is_GT() const;
+
+ template <match_stage_t STAGE>
+ void set(MatchKindCMP match) {
+ static_assert(is_valid_stage(STAGE));
+ if constexpr (STAGE < STAGE_TOP) {
+ assert(*get<STAGE + 1>() == MatchKindCMP::EQ);
+ }
+ assert(!get<STAGE>().has_value() || *get<STAGE>() != MatchKindCMP::EQ);
+ const_cast<std::optional<MatchKindCMP>&>(get<STAGE>()) = match;
+ }
+
+ std::ostream& dump(std::ostream& os) const {
+ os << "history(";
+ dump_each(os, left_match) << ", ";
+ dump_each(os, string_match) << ", ";
+ dump_each(os, right_match) << ")";
+ return os;
+ }
+
+ std::ostream& dump_each(
+ std::ostream& os, const std::optional<MatchKindCMP>& match) const {
+ if (!match.has_value()) {
+ return os << "--";
+ } else if (*match == MatchKindCMP::LT) {
+ return os << "LT";
+ } else if (*match == MatchKindCMP::EQ) {
+ return os << "EQ";
+ } else if (*match == MatchKindCMP::GT) {
+ return os << "GT";
+ } else {
+ ceph_abort("impossble path");
+ }
+ }
+
+ std::optional<MatchKindCMP> left_match;
+ std::optional<MatchKindCMP> string_match;
+ std::optional<MatchKindCMP> right_match;
+};
+inline std::ostream& operator<<(std::ostream& os, const MatchHistory& pos) {
+ return pos.dump(os);
+}
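+
+// Informal usage sketch (not part of the interface): a lookup that matches at
+// the left and string stages but goes past the target at the right stage
+// would record
+//   MatchHistory history;
+//   history.set<STAGE_LEFT>(MatchKindCMP::EQ);
+//   history.set<STAGE_STRING>(MatchKindCMP::EQ);
+//   history.set<STAGE_RIGHT>(MatchKindCMP::GT);
+// and history.is_GT() then evaluates to true, because the GT propagates up
+// through the EQ prefix (see _check_GT_t below).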
+
+template <match_stage_t STAGE>
+struct _check_GT_t {
+ static bool eval(const MatchHistory* history) {
+ return history->get<STAGE>() &&
+ (*history->get<STAGE>() == MatchKindCMP::GT ||
+ (*history->get<STAGE>() == MatchKindCMP::EQ &&
+ _check_GT_t<STAGE - 1>::eval(history)));
+ }
+};
+template <>
+struct _check_GT_t<STAGE_RIGHT> {
+ static bool eval(const MatchHistory* history) {
+ return history->get<STAGE_RIGHT>() &&
+ *history->get<STAGE_RIGHT>() == MatchKindCMP::GT;
+ }
+};
+template <match_stage_t STAGE>
+bool MatchHistory::is_GT() const {
+ static_assert(is_valid_stage(STAGE));
+ if constexpr (STAGE < STAGE_TOP) {
+ assert(get<STAGE + 1>() == MatchKindCMP::EQ);
+ }
+ return _check_GT_t<STAGE>::eval(this);
+}
+
+template <match_stage_t STAGE>
+struct staged_position_t {
+ static_assert(is_valid_stage(STAGE));
+ using me_t = staged_position_t<STAGE>;
+ using nxt_t = staged_position_t<STAGE - 1>;
+ bool is_end() const {
+ if (index == INDEX_END) {
+ return true;
+ } else {
+ assert(is_valid_index(index));
+ return false;
+ }
+ }
+ index_t& index_by_stage(match_stage_t stage) {
+ assert(stage <= STAGE);
+ if (STAGE == stage) {
+ return index;
+ } else {
+ return nxt.index_by_stage(stage);
+ }
+ }
+
+ int cmp(const me_t& o) const {
+ if (index > o.index) {
+ return 1;
+ } else if (index < o.index) {
+ return -1;
+ } else {
+ return nxt.cmp(o.nxt);
+ }
+ }
+ bool operator>(const me_t& o) const { return cmp(o) > 0; }
+ bool operator>=(const me_t& o) const { return cmp(o) >= 0; }
+ bool operator<(const me_t& o) const { return cmp(o) < 0; }
+ bool operator<=(const me_t& o) const { return cmp(o) <= 0; }
+ bool operator==(const me_t& o) const { return cmp(o) == 0; }
+ bool operator!=(const me_t& o) const { return cmp(o) != 0; }
+
+ me_t& operator-=(const me_t& o) {
+ assert(is_valid_index(o.index));
+ assert(index >= o.index);
+ if (index != INDEX_END) {
+ assert(is_valid_index(index));
+ index -= o.index;
+ if (index == 0) {
+ nxt -= o.nxt;
+ }
+ }
+ return *this;
+ }
+
+ void encode(ceph::bufferlist& encoded) const {
+ ceph::encode(index, encoded);
+ nxt.encode(encoded);
+ }
+
+ static me_t decode(ceph::bufferlist::const_iterator& delta) {
+ me_t ret;
+ ceph::decode(ret.index, delta);
+ ret.nxt = nxt_t::decode(delta);
+ return ret;
+ }
+
+ static me_t begin() { return {0u, nxt_t::begin()}; }
+ static me_t end() {
+ return {INDEX_END, nxt_t::end()};
+ }
+
+ index_t index;
+ nxt_t nxt;
+};
+template <match_stage_t STAGE>
+std::ostream& operator<<(std::ostream& os, const staged_position_t<STAGE>& pos) {
+ if (pos.index == INDEX_END) {
+ os << "END";
+ } else if (pos.index == INDEX_LAST) {
+ os << "LAST";
+ } else {
+ os << pos.index;
+ assert(is_valid_index(pos.index));
+ }
+ return os << ", " << pos.nxt;
+}
+
+template <>
+struct staged_position_t<STAGE_BOTTOM> {
+ using me_t = staged_position_t<STAGE_BOTTOM>;
+ bool is_end() const {
+ if (index == INDEX_END) {
+ return true;
+ } else {
+ assert(is_valid_index(index));
+ return false;
+ }
+ }
+ index_t& index_by_stage(match_stage_t stage) {
+ assert(stage == STAGE_BOTTOM);
+ return index;
+ }
+
+ int cmp(const staged_position_t<STAGE_BOTTOM>& o) const {
+ if (index > o.index) {
+ return 1;
+ } else if (index < o.index) {
+ return -1;
+ } else {
+ return 0;
+ }
+ }
+ bool operator>(const me_t& o) const { return cmp(o) > 0; }
+ bool operator>=(const me_t& o) const { return cmp(o) >= 0; }
+ bool operator<(const me_t& o) const { return cmp(o) < 0; }
+ bool operator<=(const me_t& o) const { return cmp(o) <= 0; }
+ bool operator==(const me_t& o) const { return cmp(o) == 0; }
+ bool operator!=(const me_t& o) const { return cmp(o) != 0; }
+
+ me_t& operator-=(const me_t& o) {
+ assert(is_valid_index(o.index));
+ assert(index >= o.index);
+ if (index != INDEX_END) {
+ assert(is_valid_index(index));
+ index -= o.index;
+ }
+ return *this;
+ }
+
+ void encode(ceph::bufferlist& encoded) const {
+ ceph::encode(index, encoded);
+ }
+
+ static me_t decode(ceph::bufferlist::const_iterator& delta) {
+ me_t ret;
+ ceph::decode(ret.index, delta);
+ return ret;
+ }
+
+ static me_t begin() { return {0u}; }
+ static me_t end() { return {INDEX_END}; }
+
+ index_t index;
+};
+template <>
+inline std::ostream& operator<<(std::ostream& os, const staged_position_t<STAGE_BOTTOM>& pos) {
+ if (pos.index == INDEX_END) {
+ os << "END";
+ } else if (pos.index == INDEX_LAST) {
+ os << "LAST";
+ } else {
+ os << pos.index;
+ assert(is_valid_index(pos.index));
+ }
+ return os;
+}
+
+using search_position_t = staged_position_t<STAGE_TOP>;
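+
+// For example (informal): search_position_t{1, {0, {2}}} addresses the third
+// snap/gen slot under the first nspace/oid slot under the second
+// shard/pool/crush slot, following the stage order STAGE_LEFT -> STAGE_STRING
+// -> STAGE_RIGHT defined above.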
+
+template <match_stage_t STAGE>
+const staged_position_t<STAGE>& cast_down(const search_position_t& pos) {
+ if constexpr (STAGE == STAGE_LEFT) {
+ return pos;
+ } else if constexpr (STAGE == STAGE_STRING) {
+#ifndef NDEBUG
+ if (pos.is_end()) {
+ assert(pos.nxt.is_end());
+ } else {
+ assert(pos.index == 0u);
+ }
+#endif
+ return pos.nxt;
+ } else if constexpr (STAGE == STAGE_RIGHT) {
+#ifndef NDEBUG
+ if (pos.is_end()) {
+ assert(pos.nxt.nxt.is_end());
+ } else {
+ assert(pos.index == 0u);
+ assert(pos.nxt.index == 0u);
+ }
+#endif
+ return pos.nxt.nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+}
+
+template <match_stage_t STAGE>
+staged_position_t<STAGE>& cast_down(search_position_t& pos) {
+ const search_position_t& _pos = pos;
+ return const_cast<staged_position_t<STAGE>&>(cast_down<STAGE>(_pos));
+}
+
+template <match_stage_t STAGE>
+staged_position_t<STAGE>& cast_down_fill_0(search_position_t& pos) {
+ if constexpr (STAGE == STAGE_LEFT) {
+ return pos;
+ } else if constexpr (STAGE == STAGE_STRING) {
+ pos.index = 0;
+ return pos.nxt;
+ } else if constexpr (STAGE == STAGE_RIGHT) {
+ pos.index = 0;
+ pos.nxt.index = 0;
+ return pos.nxt.nxt;
+ } else {
+ ceph_abort("impossible path");
+ }
+}
+
+inline search_position_t&& normalize(search_position_t&& pos) { return std::move(pos); }
+
+template <match_stage_t STAGE, typename = std::enable_if_t<STAGE != STAGE_TOP>>
+search_position_t normalize(staged_position_t<STAGE>&& pos) {
+ if (pos.is_end()) {
+ return search_position_t::end();
+ }
+ if constexpr (STAGE == STAGE_STRING) {
+ return {0u, std::move(pos)};
+ } else if (STAGE == STAGE_RIGHT) {
+ return {0u, {0u, std::move(pos)}};
+ } else {
+ ceph_abort("impossible path");
+ }
+}
+
+struct memory_range_t {
+ const char* p_start;
+ const char* p_end;
+};
+
+enum class ContainerType { ITERATIVE, INDEXABLE };
+
+template <node_type_t> struct value_type;
+template<> struct value_type<node_type_t::INTERNAL> { using type = laddr_packed_t; };
+template<> struct value_type<node_type_t::LEAF> { using type = onode_t; };
+template <node_type_t NODE_TYPE>
+using value_type_t = typename value_type<NODE_TYPE>::type;
+
+template <node_type_t NODE_TYPE, match_stage_t STAGE>
+struct staged_result_t {
+ using me_t = staged_result_t<NODE_TYPE, STAGE>;
+ bool is_end() const { return position.is_end(); }
+
+ static me_t end() {
+ return {staged_position_t<STAGE>::end(), nullptr, MSTAT_END};
+ }
+ template <typename T = me_t>
+ static std::enable_if_t<STAGE != STAGE_BOTTOM, T> from_nxt(
+ index_t index, const staged_result_t<NODE_TYPE, STAGE - 1>& nxt_stage_result) {
+ return {{index, nxt_stage_result.position},
+ nxt_stage_result.p_value,
+ nxt_stage_result.mstat};
+ }
+
+ staged_position_t<STAGE> position;
+ const value_type_t<NODE_TYPE>* p_value;
+ match_stat_t mstat;
+};
+
+template <node_type_t NODE_TYPE>
+using lookup_result_t = staged_result_t<NODE_TYPE, STAGE_TOP>;
+
+template <node_type_t NODE_TYPE>
+lookup_result_t<NODE_TYPE>&& normalize(
+ lookup_result_t<NODE_TYPE>&& result) { return std::move(result); }
+
+template <node_type_t NODE_TYPE, match_stage_t STAGE,
+ typename = std::enable_if_t<STAGE != STAGE_TOP>>
+lookup_result_t<NODE_TYPE> normalize(
+ staged_result_t<NODE_TYPE, STAGE>&& result) {
+ // FIXME: assert result.mstat correct
+ return {normalize(std::move(result.position)), result.p_value, result.mstat};
+}
+
+struct node_stats_t {
+ size_t size_persistent = 0;
+ size_t size_filled = 0;
+ // filled by staged::get_stats()
+ size_t size_logical = 0;
+ size_t size_overhead = 0;
+ size_t size_value = 0;
+ unsigned num_kvs = 0;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc
new file mode 100644
index 000000000..aaca6c3c6
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc
@@ -0,0 +1,208 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "sub_items_stage.h"
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h"
+
+namespace crimson::os::seastore::onode {
+
+template <KeyT KT>
+const laddr_packed_t* internal_sub_items_t::insert_at(
+ NodeExtentMutable& mut, const internal_sub_items_t& sub_items,
+ const full_key_t<KT>& key, const laddr_packed_t& value,
+ index_t index, node_offset_t size, const char* p_left_bound) {
+ assert(index <= sub_items.keys());
+ assert(size == estimate_insert<KT>(key, value));
+ const char* p_shift_start = p_left_bound;
+ const char* p_shift_end = reinterpret_cast<const char*>(
+ sub_items.p_first_item + 1 - index);
+ mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, -(int)size);
+
+ auto p_insert = const_cast<char*>(p_shift_end) - size;
+ auto item = internal_sub_item_t{snap_gen_t::from_key<KT>(key), value};
+ mut.copy_in_absolute(p_insert, item);
+ return &reinterpret_cast<internal_sub_item_t*>(p_insert)->value;
+}
+#define IA_TEMPLATE(KT) \
+ template const laddr_packed_t* internal_sub_items_t::insert_at<KT>( \
+ NodeExtentMutable&, const internal_sub_items_t&, const full_key_t<KT>&, \
+ const laddr_packed_t&, index_t, node_offset_t, const char*)
+IA_TEMPLATE(KeyT::VIEW);
+IA_TEMPLATE(KeyT::HOBJ);
+
+node_offset_t internal_sub_items_t::trim_until(
+ NodeExtentMutable&, internal_sub_items_t& items, index_t index) {
+ assert(index != 0);
+ auto keys = items.keys();
+ assert(index <= keys);
+ size_t ret = sizeof(internal_sub_item_t) * (keys - index);
+ assert(ret < NODE_BLOCK_SIZE);
+ return ret;
+}
+
+template <KeyT KT>
+void internal_sub_items_t::Appender<KT>::append(
+ const internal_sub_items_t& src, index_t from, index_t items) {
+ assert(from <= src.keys());
+ if (items == 0) {
+ return;
+ }
+ assert(from < src.keys());
+ assert(from + items <= src.keys());
+ node_offset_t size = sizeof(internal_sub_item_t) * items;
+ p_append -= size;
+ p_mut->copy_in_absolute(p_append, src.p_first_item + 1 - from - items, size);
+}
+
+template <KeyT KT>
+void internal_sub_items_t::Appender<KT>::append(
+ const full_key_t<KT>& key, const laddr_packed_t& value,
+ const laddr_packed_t*& p_value) {
+ p_append -= sizeof(internal_sub_item_t);
+ auto item = internal_sub_item_t{snap_gen_t::from_key<KT>(key), value};
+ p_mut->copy_in_absolute(p_append, item);
+ p_value = &reinterpret_cast<internal_sub_item_t*>(p_append)->value;
+}
+
+template <KeyT KT>
+const onode_t* leaf_sub_items_t::insert_at(
+ NodeExtentMutable& mut, const leaf_sub_items_t& sub_items,
+ const full_key_t<KT>& key, const onode_t& value,
+ index_t index, node_offset_t size, const char* p_left_bound) {
+ assert(index <= sub_items.keys());
+ assert(size == estimate_insert<KT>(key, value));
+ // a. [... item(index)] << size
+ const char* p_shift_start = p_left_bound;
+ const char* p_shift_end = sub_items.get_item_end(index);
+ mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, -(int)size);
+
+ // b. insert item
+ auto p_insert = const_cast<char*>(p_shift_end - size);
+ auto p_value = reinterpret_cast<const onode_t*>(p_insert);
+ mut.copy_in_absolute(p_insert, &value, value.size);
+ p_insert += value.size;
+ mut.copy_in_absolute(p_insert, snap_gen_t::template from_key<KT>(key));
+ assert(p_insert + sizeof(snap_gen_t) + sizeof(node_offset_t) == p_shift_end);
+
+ // c. compensate affected offsets
+ auto item_size = value.size + sizeof(snap_gen_t);
+ for (auto i = index; i < sub_items.keys(); ++i) {
+ const node_offset_packed_t& offset_i = sub_items.get_offset(i);
+ mut.copy_in_absolute((void*)&offset_i, node_offset_t(offset_i.value + item_size));
+ }
+
+ // d. [item(index-1) ... item(0) ... offset(index)] <<< sizeof(node_offset_t)
+ const char* p_offset = (index == 0 ?
+ (const char*)&sub_items.get_offset(0) + sizeof(node_offset_t) :
+ (const char*)&sub_items.get_offset(index - 1));
+ p_shift_start = p_shift_end;
+ p_shift_end = p_offset;
+ mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, -(int)sizeof(node_offset_t));
+
+ // e. insert offset
+ node_offset_t offset_to_item_start = item_size + sub_items.get_offset_to_end(index);
+ mut.copy_in_absolute(
+ const_cast<char*>(p_shift_end) - sizeof(node_offset_t), offset_to_item_start);
+
+ // f. update num_sub_keys
+ mut.copy_in_absolute((void*)sub_items.p_num_keys, num_keys_t(sub_items.keys() + 1));
+
+ return p_value;
+}
+template const onode_t* leaf_sub_items_t::insert_at<KeyT::HOBJ>(
+ NodeExtentMutable&, const leaf_sub_items_t&, const full_key_t<KeyT::HOBJ>&,
+ const onode_t&, index_t, node_offset_t, const char*);
+
+node_offset_t leaf_sub_items_t::trim_until(
+ NodeExtentMutable& mut, leaf_sub_items_t& items, index_t index) {
+ assert(index != 0);
+ auto keys = items.keys();
+ assert(index <= keys);
+ if (index == keys) {
+ return 0;
+ }
+ index_t trim_items = keys - index;
+ const char* p_items_start = items.p_start();
+ const char* p_shift_start = items.get_item_end(index);
+ const char* p_shift_end = items.get_item_end(0);
+ size_t size_trim_offsets = sizeof(node_offset_t) * trim_items;
+ mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start,
+ size_trim_offsets);
+ mut.copy_in_absolute((void*)items.p_num_keys, num_keys_t(index));
+ size_t ret = size_trim_offsets + (p_shift_start - p_items_start);
+ assert(ret < NODE_BLOCK_SIZE);
+ return ret;
+}
+
+template class internal_sub_items_t::Appender<KeyT::VIEW>;
+template class internal_sub_items_t::Appender<KeyT::HOBJ>;
+
+// helper type for the visitor
+template<class... Ts> struct overloaded : Ts... { using Ts::operator()...; };
+// explicit deduction guide
+template<class... Ts> overloaded(Ts...) -> overloaded<Ts...>;
+
+template <KeyT KT>
+char* leaf_sub_items_t::Appender<KT>::wrap() {
+ auto p_cur = p_append;
+ num_keys_t num_keys = 0;
+ for (auto i = 0u; i < cnt; ++i) {
+ auto& a = appends[i];
+ std::visit(overloaded {
+ [&] (const range_items_t& arg) { num_keys += arg.items; },
+ [&] (const kv_item_t& arg) { ++num_keys; }
+ }, a);
+ }
+ assert(num_keys);
+ p_cur -= sizeof(num_keys_t);
+ p_mut->copy_in_absolute(p_cur, num_keys);
+
+ node_offset_t last_offset = 0;
+ for (auto i = 0u; i < cnt; ++i) {
+ auto& a = appends[i];
+ std::visit(overloaded {
+ [&] (const range_items_t& arg) {
+ int compensate = (last_offset - op_src->get_offset_to_end(arg.from));
+ node_offset_t offset;
+ for (auto i = arg.from; i < arg.from + arg.items; ++i) {
+ offset = op_src->get_offset(i).value + compensate;
+ p_cur -= sizeof(node_offset_t);
+ p_mut->copy_in_absolute(p_cur, offset);
+ }
+ last_offset = offset;
+ },
+ [&] (const kv_item_t& arg) {
+ last_offset += sizeof(snap_gen_t) + arg.p_value->size;
+ p_cur -= sizeof(node_offset_t);
+ p_mut->copy_in_absolute(p_cur, last_offset);
+ }
+ }, a);
+ }
+
+ for (auto i = 0u; i < cnt; ++i) {
+ auto& a = appends[i];
+ std::visit(overloaded {
+ [&] (const range_items_t& arg) {
+ auto _p_start = op_src->get_item_end(arg.from + arg.items);
+ size_t _len = op_src->get_item_end(arg.from) - _p_start;
+ p_cur -= _len;
+ p_mut->copy_in_absolute(p_cur, _p_start, _len);
+ },
+ [&] (const kv_item_t& arg) {
+ assert(pp_value);
+ p_cur -= sizeof(snap_gen_t);
+ p_mut->copy_in_absolute(p_cur, snap_gen_t::template from_key<KT>(*arg.p_key));
+ p_cur -= arg.p_value->size;
+ p_mut->copy_in_absolute(p_cur, arg.p_value, arg.p_value->size);
+ *pp_value = reinterpret_cast<const onode_t*>(p_cur);
+ }
+ }, a);
+ }
+ return p_cur;
+}
+
+template class leaf_sub_items_t::Appender<KeyT::VIEW>;
+template class leaf_sub_items_t::Appender<KeyT::HOBJ>;
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h
new file mode 100644
index 000000000..8ef5f7472
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h
@@ -0,0 +1,341 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <variant>
+
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h"
+#include "key_layout.h"
+#include "stage_types.h"
+
+namespace crimson::os::seastore::onode {
+
+class NodeExtentMutable;
+
+struct internal_sub_item_t {
+ const snap_gen_t& get_key() const { return key; }
+ const laddr_packed_t* get_p_value() const { return &value; }
+
+ snap_gen_t key;
+ laddr_packed_t value;
+} __attribute__((packed));
+
+/**
+ * internal_sub_items_t
+ *
+ * The STAGE_RIGHT implementation for internal node N0/N1/N2, implements staged
+ * contract as an indexable container to index snap-gen to child node
+ * addresses.
+ *
+ * The layout of the container storing n sub-items:
+ *
+ * # <--------- container range -----------> #
+ * #<~># sub-items [2, n) #
+ * # # <- sub-item 1 -> # <- sub-item 0 -> #
+ * #...# snap-gen | laddr # snap-gen | laddr #
+ * ^
+ * |
+ * p_first_item +
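+ *
+ * For example (informal): with n = 3 sub-items, the container occupies
+ * 3 * sizeof(internal_sub_item_t) bytes ending right after the sub-item at
+ * p_first_item; operator[](0) reads the snap-gen of sub-item 0 at
+ * p_first_item, and operator[](2) reads sub-item 2 at p_first_item - 2,
+ * i.e. indexing walks backwards in memory towards the container start.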
+ */
+class internal_sub_items_t {
+ public:
+ using num_keys_t = index_t;
+
+ internal_sub_items_t(const memory_range_t& range) {
+ assert(range.p_start < range.p_end);
+ assert((range.p_end - range.p_start) % sizeof(internal_sub_item_t) == 0);
+ num_items = (range.p_end - range.p_start) / sizeof(internal_sub_item_t);
+ assert(num_items > 0);
+ auto _p_first_item = range.p_end - sizeof(internal_sub_item_t);
+ p_first_item = reinterpret_cast<const internal_sub_item_t*>(_p_first_item);
+ }
+
+ // container type system
+ using key_get_type = const snap_gen_t&;
+ static constexpr auto CONTAINER_TYPE = ContainerType::INDEXABLE;
+ num_keys_t keys() const { return num_items; }
+ key_get_type operator[](index_t index) const {
+ assert(index < num_items);
+ return (p_first_item - index)->get_key();
+ }
+ node_offset_t size_before(index_t index) const {
+ size_t ret = index * sizeof(internal_sub_item_t);
+ assert(ret < NODE_BLOCK_SIZE);
+ return ret;
+ }
+ const laddr_packed_t* get_p_value(index_t index) const {
+ assert(index < num_items);
+ return (p_first_item - index)->get_p_value();
+ }
+ node_offset_t size_overhead_at(index_t index) const { return 0u; }
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const {
+ auto p_end = reinterpret_cast<const char*>(p_first_item) +
+ sizeof(internal_sub_item_t);
+ auto p_start = p_end - num_items * sizeof(internal_sub_item_t);
+ int start_offset = p_start - p_node_start;
+ int end_offset = p_end - p_node_start;
+ assert(start_offset > 0 &&
+ start_offset < end_offset &&
+ end_offset < NODE_BLOCK_SIZE);
+ ceph::encode(static_cast<node_offset_t>(start_offset), encoded);
+ ceph::encode(static_cast<node_offset_t>(end_offset), encoded);
+ }
+
+ static internal_sub_items_t decode(
+ const char* p_node_start, ceph::bufferlist::const_iterator& delta) {
+ node_offset_t start_offset;
+ ceph::decode(start_offset, delta);
+ node_offset_t end_offset;
+ ceph::decode(end_offset, delta);
+ assert(start_offset < end_offset);
+ assert(end_offset <= NODE_BLOCK_SIZE);
+ return internal_sub_items_t({p_node_start + start_offset,
+ p_node_start + end_offset});
+ }
+
+ static node_offset_t header_size() { return 0u; }
+
+ template <KeyT KT>
+ static node_offset_t estimate_insert(
+ const full_key_t<KT>&, const laddr_packed_t&) {
+ return sizeof(internal_sub_item_t);
+ }
+
+ template <KeyT KT>
+ static const laddr_packed_t* insert_at(
+ NodeExtentMutable&, const internal_sub_items_t&,
+ const full_key_t<KT>&, const laddr_packed_t&,
+ index_t index, node_offset_t size, const char* p_left_bound);
+
+ static node_offset_t trim_until(NodeExtentMutable&, internal_sub_items_t&, index_t);
+
+ template <KeyT KT>
+ class Appender;
+
+ private:
+ index_t num_items;
+ const internal_sub_item_t* p_first_item;
+};
+
+template <KeyT KT>
+class internal_sub_items_t::Appender {
+ public:
+ Appender(NodeExtentMutable* p_mut, char* p_append)
+ : p_mut{p_mut}, p_append{p_append} {}
+ void append(const internal_sub_items_t& src, index_t from, index_t items);
+ void append(const full_key_t<KT>&, const laddr_packed_t&, const laddr_packed_t*&);
+ char* wrap() { return p_append; }
+ private:
+ NodeExtentMutable* p_mut;
+ char* p_append;
+};
+
+/**
+ * leaf_sub_items_t
+ *
+ * The STAGE_RIGHT implementation for leaf node N0/N1/N2, implements staged
+ * contract as an indexable container to index snap-gen to onode_t.
+ *
+ * The layout of the container storing n sub-items:
+ *
+ * # <------------------------ container range -------------------------------> #
+ * # <---------- sub-items ----------------> # <--- offsets ---------# #
+ * #<~># sub-items [2, n) #<~>| offsets [2, n) # #
+ * # # <- sub-item 1 -> # <- sub-item 0 -> # | # #
+ * #...# onode | snap-gen # onode | snap-gen #...| offset1 | offset0 # num_keys #
+ * ^ ^ ^
+ * | | |
+ * p_items_end + p_offsets + |
+ * p_num_keys +
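+ *
+ * For example (informal): get_item_end(0) is p_items_end and
+ * get_item_start(0) is p_items_end - offset0, i.e. offset_i records the
+ * distance from p_items_end back to the start of sub-item i, and the
+ * snap-gen key sits at the tail of each sub-item (see operator[] and
+ * get_p_value below).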
+ */
+class leaf_sub_items_t {
+ public:
+ // TODO: decide by NODE_BLOCK_SIZE, sizeof(snap_gen_t),
+ // and the minimal size of onode_t
+ using num_keys_t = uint8_t;
+
+ leaf_sub_items_t(const memory_range_t& range) {
+ assert(range.p_start < range.p_end);
+ auto _p_num_keys = range.p_end - sizeof(num_keys_t);
+ assert(range.p_start < _p_num_keys);
+ p_num_keys = reinterpret_cast<const num_keys_t*>(_p_num_keys);
+ assert(keys());
+ auto _p_offsets = _p_num_keys - sizeof(node_offset_t);
+ assert(range.p_start < _p_offsets);
+ p_offsets = reinterpret_cast<const node_offset_packed_t*>(_p_offsets);
+ p_items_end = reinterpret_cast<const char*>(&get_offset(keys() - 1));
+ assert(range.p_start < p_items_end);
+ assert(range.p_start == p_start());
+ }
+
+ bool operator==(const leaf_sub_items_t& x) {
+ return (p_num_keys == x.p_num_keys &&
+ p_offsets == x.p_offsets &&
+ p_items_end == x.p_items_end);
+ }
+
+ const char* p_start() const { return get_item_end(keys()); }
+
+ const node_offset_packed_t& get_offset(index_t index) const {
+ assert(index < keys());
+ return *(p_offsets - index);
+ }
+
+ const node_offset_t get_offset_to_end(index_t index) const {
+ assert(index <= keys());
+ return index == 0 ? 0 : get_offset(index - 1).value;
+ }
+
+ const char* get_item_start(index_t index) const {
+ return p_items_end - get_offset(index).value;
+ }
+
+ const char* get_item_end(index_t index) const {
+ return p_items_end - get_offset_to_end(index);
+ }
+
+ // container type system
+ using key_get_type = const snap_gen_t&;
+ static constexpr auto CONTAINER_TYPE = ContainerType::INDEXABLE;
+ num_keys_t keys() const { return *p_num_keys; }
+ key_get_type operator[](index_t index) const {
+ assert(index < keys());
+ auto pointer = get_item_end(index);
+ assert(get_item_start(index) < pointer);
+ pointer -= sizeof(snap_gen_t);
+ assert(get_item_start(index) < pointer);
+ return *reinterpret_cast<const snap_gen_t*>(pointer);
+ }
+ node_offset_t size_before(index_t index) const {
+ assert(index <= keys());
+ size_t ret;
+ if (index == 0) {
+ ret = sizeof(num_keys_t);
+ } else {
+ --index;
+ ret = sizeof(num_keys_t) +
+ (index + 1) * sizeof(node_offset_t) +
+ get_offset(index).value;
+ }
+ assert(ret < NODE_BLOCK_SIZE);
+ return ret;
+ }
+ node_offset_t size_overhead_at(index_t index) const { return sizeof(node_offset_t); }
+ const onode_t* get_p_value(index_t index) const {
+ assert(index < keys());
+ auto pointer = get_item_start(index);
+ auto value = reinterpret_cast<const onode_t*>(pointer);
+ assert(pointer + value->size + sizeof(snap_gen_t) == get_item_end(index));
+ return value;
+ }
+ void encode(const char* p_node_start, ceph::bufferlist& encoded) const {
+ auto p_end = reinterpret_cast<const char*>(p_num_keys) +
+ sizeof(num_keys_t);
+ int start_offset = p_start() - p_node_start;
+ int end_offset = p_end - p_node_start;
+ assert(start_offset > 0 &&
+ start_offset < end_offset &&
+ end_offset < NODE_BLOCK_SIZE);
+ ceph::encode(static_cast<node_offset_t>(start_offset), encoded);
+ ceph::encode(static_cast<node_offset_t>(end_offset), encoded);
+ }
+
+ static leaf_sub_items_t decode(
+ const char* p_node_start, ceph::bufferlist::const_iterator& delta) {
+ node_offset_t start_offset;
+ ceph::decode(start_offset, delta);
+ node_offset_t end_offset;
+ ceph::decode(end_offset, delta);
+ assert(start_offset < end_offset);
+ assert(end_offset <= NODE_BLOCK_SIZE);
+ return leaf_sub_items_t({p_node_start + start_offset,
+ p_node_start + end_offset});
+ }
+
+ static node_offset_t header_size() { return sizeof(num_keys_t); }
+
+ template <KeyT KT>
+ static node_offset_t estimate_insert(const full_key_t<KT>&, const onode_t& value) {
+ return value.size + sizeof(snap_gen_t) + sizeof(node_offset_t);
+ }
+
+ template <KeyT KT>
+ static const onode_t* insert_at(
+ NodeExtentMutable&, const leaf_sub_items_t&,
+ const full_key_t<KT>&, const onode_t&,
+ index_t index, node_offset_t size, const char* p_left_bound);
+
+ static node_offset_t trim_until(NodeExtentMutable&, leaf_sub_items_t&, index_t index);
+
+ template <KeyT KT>
+ class Appender;
+
+ private:
+ // TODO: support unaligned access
+ const num_keys_t* p_num_keys;
+ const node_offset_packed_t* p_offsets;
+ const char* p_items_end;
+};
+
+constexpr index_t APPENDER_LIMIT = 3u;
+
+template <KeyT KT>
+class leaf_sub_items_t::Appender {
+ struct range_items_t {
+ index_t from;
+ index_t items;
+ };
+ struct kv_item_t {
+ const full_key_t<KT>* p_key;
+ const onode_t* p_value;
+ };
+ using var_t = std::variant<range_items_t, kv_item_t>;
+
+ public:
+ Appender(NodeExtentMutable* p_mut, char* p_append)
+ : p_mut{p_mut}, p_append{p_append} {
+ }
+
+ void append(const leaf_sub_items_t& src, index_t from, index_t items) {
+ assert(cnt <= APPENDER_LIMIT);
+ assert(from <= src.keys());
+ if (items == 0) {
+ return;
+ }
+ if (op_src) {
+ assert(*op_src == src);
+ } else {
+ op_src = src;
+ }
+ assert(from < src.keys());
+ assert(from + items <= src.keys());
+ appends[cnt] = range_items_t{from, items};
+ ++cnt;
+ }
+ void append(const full_key_t<KT>& key,
+ const onode_t& value, const onode_t*& p_value) {
+ assert(pp_value == nullptr);
+ assert(cnt <= APPENDER_LIMIT);
+ appends[cnt] = kv_item_t{&key, &value};
+ ++cnt;
+ pp_value = &p_value;
+ }
+ char* wrap();
+
+ private:
+ std::optional<leaf_sub_items_t> op_src;
+ const onode_t** pp_value = nullptr;
+ NodeExtentMutable* p_mut;
+ char* p_append;
+ var_t appends[APPENDER_LIMIT];
+ index_t cnt = 0;
+};
+
+template <node_type_t> struct _sub_items_t;
+template<> struct _sub_items_t<node_type_t::INTERNAL> { using type = internal_sub_items_t; };
+template<> struct _sub_items_t<node_type_t::LEAF> { using type = leaf_sub_items_t; };
+template <node_type_t NODE_TYPE>
+using sub_items_t = typename _sub_items_t<NODE_TYPE>::type;
+
+}