diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:45:59 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:45:59 +0000 |
commit | 19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch) | |
tree | 42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/crimson/os/seastore/onode_manager/staged-fltree/stages | |
parent | Initial commit. (diff) | |
download | ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.tar.xz ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.zip |
Adding upstream version 16.2.11+ds.upstream/16.2.11+dsupstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
12 files changed, 5375 insertions, 0 deletions
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc new file mode 100644 index 000000000..443c6cabd --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc @@ -0,0 +1,165 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "item_iterator_stage.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h" + +namespace crimson::os::seastore::onode { + +#define ITER_T item_iterator_t<NODE_TYPE> +#define ITER_INST(NT) item_iterator_t<NT> + +template <node_type_t NODE_TYPE> +template <KeyT KT> +memory_range_t ITER_T::insert_prefix( + NodeExtentMutable& mut, const ITER_T& iter, const full_key_t<KT>& key, + bool is_end, node_offset_t size, const char* p_left_bound) { + // 1. insert range + char* p_insert; + if (is_end) { + assert(!iter.has_next()); + p_insert = const_cast<char*>(iter.p_start()); + } else { + p_insert = const_cast<char*>(iter.p_end()); + } + char* p_insert_front = p_insert - size; + + // 2. shift memory + const char* p_shift_start = p_left_bound; + const char* p_shift_end = p_insert; + mut.shift_absolute(p_shift_start, + p_shift_end - p_shift_start, + -(int)size); + + // 3. append header + p_insert -= sizeof(node_offset_t); + node_offset_t back_offset = (p_insert - p_insert_front); + mut.copy_in_absolute(p_insert, back_offset); + ns_oid_view_t::append<KT>(mut, key, p_insert); + + return {p_insert_front, p_insert}; +} +#define IP_TEMPLATE(NT, KT) \ + template memory_range_t ITER_INST(NT)::insert_prefix<KT>( \ + NodeExtentMutable&, const ITER_INST(NT)&, const full_key_t<KT>&, \ + bool, node_offset_t, const char*) +IP_TEMPLATE(node_type_t::LEAF, KeyT::VIEW); +IP_TEMPLATE(node_type_t::INTERNAL, KeyT::VIEW); +IP_TEMPLATE(node_type_t::LEAF, KeyT::HOBJ); +IP_TEMPLATE(node_type_t::INTERNAL, KeyT::HOBJ); + +template <node_type_t NODE_TYPE> +void ITER_T::update_size( + NodeExtentMutable& mut, const ITER_T& iter, int change) { + node_offset_t offset = iter.get_back_offset(); + int new_size = change + offset; + assert(new_size > 0 && new_size < NODE_BLOCK_SIZE); + mut.copy_in_absolute( + (void*)iter.get_item_range().p_end, node_offset_t(new_size)); +} + +template <node_type_t NODE_TYPE> +node_offset_t ITER_T::trim_until(NodeExtentMutable&, const ITER_T& iter) { + assert(iter.index() != 0); + size_t ret = iter.p_end() - iter.p_items_start; + assert(ret < NODE_BLOCK_SIZE); + return ret; +} + +template <node_type_t NODE_TYPE> +node_offset_t ITER_T::trim_at( + NodeExtentMutable& mut, const ITER_T& iter, node_offset_t trimmed) { + size_t trim_size = iter.p_start() - iter.p_items_start + trimmed; + assert(trim_size < NODE_BLOCK_SIZE); + assert(iter.get_back_offset() > trimmed); + node_offset_t new_offset = iter.get_back_offset() - trimmed; + mut.copy_in_absolute((void*)iter.item_range.p_end, new_offset); + return trim_size; +} + +#define ITER_TEMPLATE(NT) template class ITER_INST(NT) +ITER_TEMPLATE(node_type_t::LEAF); +ITER_TEMPLATE(node_type_t::INTERNAL); + +#define APPEND_T ITER_T::Appender<KT> + +template <node_type_t NODE_TYPE> +template <KeyT KT> +bool APPEND_T::append(const ITER_T& src, index_t& items) { + auto p_end = src.p_end(); + bool append_till_end = false; + if (is_valid_index(items)) { + for (auto i = 1u; i <= items; ++i) { + if (!src.has_next()) { + assert(i == items); + append_till_end = true; + break; + } + ++src; + } + } else { + if (items == INDEX_END) { + append_till_end = true; + } else { + assert(items == INDEX_LAST); + } + items = 0; + while (src.has_next()) { + ++src; + ++items; + } + if (append_till_end) { + ++items; + } + } + + const char* p_start; + if (append_till_end) { + p_start = src.p_start(); + } else { + p_start = src.p_end(); + } + assert(p_end >= p_start); + size_t append_size = p_end - p_start; + p_append -= append_size; + p_mut->copy_in_absolute(p_append, p_start, append_size); + return append_till_end; +} + +template <node_type_t NODE_TYPE> +template <KeyT KT> +std::tuple<NodeExtentMutable*, char*> +APPEND_T::open_nxt(const key_get_type& partial_key) { + p_append -= sizeof(node_offset_t); + p_offset_while_open = p_append; + ns_oid_view_t::append(*p_mut, partial_key, p_append); + return {p_mut, p_append}; +} + +template <node_type_t NODE_TYPE> +template <KeyT KT> +std::tuple<NodeExtentMutable*, char*> +APPEND_T::open_nxt(const full_key_t<KT>& key) { + p_append -= sizeof(node_offset_t); + p_offset_while_open = p_append; + ns_oid_view_t::append<KT>(*p_mut, key, p_append); + return {p_mut, p_append}; +} + +template <node_type_t NODE_TYPE> +template <KeyT KT> +void APPEND_T::wrap_nxt(char* _p_append) { + assert(_p_append < p_append); + p_mut->copy_in_absolute( + p_offset_while_open, node_offset_t(p_offset_while_open - _p_append)); + p_append = _p_append; +} + +#define APPEND_TEMPLATE(NT, KT) template class ITER_INST(NT)::Appender<KT> +APPEND_TEMPLATE(node_type_t::LEAF, KeyT::VIEW); +APPEND_TEMPLATE(node_type_t::INTERNAL, KeyT::VIEW); +APPEND_TEMPLATE(node_type_t::LEAF, KeyT::HOBJ); +APPEND_TEMPLATE(node_type_t::INTERNAL, KeyT::HOBJ); + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h new file mode 100644 index 000000000..bb68eec8f --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h @@ -0,0 +1,180 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h" +#include "key_layout.h" +#include "stage_types.h" + +namespace crimson::os::seastore::onode { + +class NodeExtentMutable; + +/** + * item_iterator_t + * + * The STAGE_STRING implementation for node N0/N1, implements staged contract + * as an iterative container to resolve crush hash conflicts. + * + * The layout of the contaner to index ns, oid strings storing n items: + * + * # <--------- container range ---------> # + * #<~># items [i+1, n) # + * # # items [0, i) #<~># + * # # <------ item i -------------> # # + * # # <--- item_range ---> | # # + * # # | # # + * # # next-stage | ns-oid | back_ # # + * # # contaner | strings | offset # # + * #...# range | | #...# + * ^ ^ | ^ + * | | | | + * | +---------------------------+ | + * + p_items_start p_items_end + + */ +template <node_type_t NODE_TYPE> +class item_iterator_t { + using value_t = value_type_t<NODE_TYPE>; + public: + item_iterator_t(const memory_range_t& range) + : p_items_start(range.p_start), p_items_end(range.p_end) { + assert(p_items_start < p_items_end); + next_item_range(p_items_end); + } + + const char* p_start() const { return item_range.p_start; } + const char* p_end() const { return item_range.p_end + sizeof(node_offset_t); } + const memory_range_t& get_item_range() const { return item_range; } + node_offset_t get_back_offset() const { return back_offset; } + + // container type system + using key_get_type = const ns_oid_view_t&; + static constexpr auto CONTAINER_TYPE = ContainerType::ITERATIVE; + index_t index() const { return _index; } + key_get_type get_key() const { + if (!key.has_value()) { + key = ns_oid_view_t(item_range.p_end); + assert(item_range.p_start < (*key).p_start()); + } + return *key; + } + node_offset_t size() const { + size_t ret = item_range.p_end - item_range.p_start + sizeof(node_offset_t); + assert(ret < NODE_BLOCK_SIZE); + return ret; + }; + node_offset_t size_to_nxt() const { + size_t ret = get_key().size() + sizeof(node_offset_t); + assert(ret < NODE_BLOCK_SIZE); + return ret; + } + node_offset_t size_overhead() const { + return sizeof(node_offset_t) + get_key().size_overhead(); + } + memory_range_t get_nxt_container() const { + return {item_range.p_start, get_key().p_start()}; + } + bool has_next() const { + assert(p_items_start <= item_range.p_start); + return p_items_start < item_range.p_start; + } + const item_iterator_t<NODE_TYPE>& operator++() const { + assert(has_next()); + next_item_range(item_range.p_start); + key.reset(); + ++_index; + return *this; + } + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + int start_offset = p_items_start - p_node_start; + int end_offset = p_items_end - p_node_start; + assert(start_offset > 0 && start_offset < NODE_BLOCK_SIZE); + assert(end_offset > 0 && end_offset <= NODE_BLOCK_SIZE); + ceph::encode(static_cast<node_offset_t>(start_offset), encoded); + ceph::encode(static_cast<node_offset_t>(end_offset), encoded); + ceph::encode(_index, encoded); + } + + static item_iterator_t decode(const char* p_node_start, + ceph::bufferlist::const_iterator& delta) { + node_offset_t start_offset; + ceph::decode(start_offset, delta); + node_offset_t end_offset; + ceph::decode(end_offset, delta); + assert(start_offset < end_offset); + assert(end_offset <= NODE_BLOCK_SIZE); + index_t index; + ceph::decode(index, delta); + + item_iterator_t ret({p_node_start + start_offset, + p_node_start + end_offset}); + while (index > 0) { + ++ret; + --index; + } + return ret; + } + + static node_offset_t header_size() { return 0u; } + + template <KeyT KT> + static node_offset_t estimate_insert( + const full_key_t<KT>& key, const value_t&) { + return ns_oid_view_t::estimate_size<KT>(key) + sizeof(node_offset_t); + } + + template <KeyT KT> + static memory_range_t insert_prefix( + NodeExtentMutable& mut, const item_iterator_t<NODE_TYPE>& iter, + const full_key_t<KT>& key, bool is_end, + node_offset_t size, const char* p_left_bound); + + static void update_size( + NodeExtentMutable& mut, const item_iterator_t<NODE_TYPE>& iter, int change); + + static node_offset_t trim_until(NodeExtentMutable&, const item_iterator_t<NODE_TYPE>&); + static node_offset_t trim_at( + NodeExtentMutable&, const item_iterator_t<NODE_TYPE>&, node_offset_t trimmed); + + template <KeyT KT> + class Appender; + + private: + void next_item_range(const char* p_end) const { + auto p_item_end = p_end - sizeof(node_offset_t); + assert(p_items_start < p_item_end); + back_offset = reinterpret_cast<const node_offset_packed_t*>(p_item_end)->value; + assert(back_offset); + const char* p_item_start = p_item_end - back_offset; + assert(p_items_start <= p_item_start); + item_range = {p_item_start, p_item_end}; + } + + const char* p_items_start; + const char* p_items_end; + mutable memory_range_t item_range; + mutable node_offset_t back_offset; + mutable std::optional<ns_oid_view_t> key; + mutable index_t _index = 0u; +}; + +template <node_type_t NODE_TYPE> +template <KeyT KT> +class item_iterator_t<NODE_TYPE>::Appender { + public: + Appender(NodeExtentMutable* p_mut, char* p_append) + : p_mut{p_mut}, p_append{p_append} {} + bool append(const item_iterator_t<NODE_TYPE>& src, index_t& items); + char* wrap() { return p_append; } + std::tuple<NodeExtentMutable*, char*> open_nxt(const key_get_type&); + std::tuple<NodeExtentMutable*, char*> open_nxt(const full_key_t<KT>&); + void wrap_nxt(char* _p_append); + + private: + NodeExtentMutable* p_mut; + char* p_append; + char* p_offset_while_open; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc new file mode 100644 index 000000000..d60bb8d09 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc @@ -0,0 +1,32 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "key_layout.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h" + +namespace crimson::os::seastore::onode { + +void string_key_view_t::append_str( + NodeExtentMutable& mut, std::string_view str, char*& p_append) { + assert(is_valid_size(str.length())); + p_append -= sizeof(string_size_t); + string_size_t len = str.length(); + mut.copy_in_absolute(p_append, len); + p_append -= len; + mut.copy_in_absolute(p_append, str.data(), len); +} + +void string_key_view_t::append_dedup( + NodeExtentMutable& mut, const Type& dedup_type, char*& p_append) { + p_append -= sizeof(string_size_t); + if (dedup_type == Type::MIN) { + mut.copy_in_absolute(p_append, MIN); + } else if (dedup_type == Type::MAX) { + mut.copy_in_absolute(p_append, MAX); + } else { + ceph_abort("impossible path"); + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h new file mode 100644 index 000000000..cc1f546c1 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h @@ -0,0 +1,846 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <cassert> +#include <limits> +#include <optional> +#include <ostream> + +#include "common/hobject.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/fwd.h" + +namespace crimson::os::seastore::onode { + +using shard_t = int8_t; +using pool_t = int64_t; +using crush_hash_t = uint32_t; +using snap_t = uint64_t; +using gen_t = uint64_t; +static_assert(sizeof(shard_t) == sizeof(ghobject_t().shard_id.id)); +static_assert(sizeof(pool_t) == sizeof(ghobject_t().hobj.pool)); +static_assert(sizeof(crush_hash_t) == sizeof(ghobject_t().hobj.get_hash())); +static_assert(sizeof(snap_t) == sizeof(ghobject_t().hobj.snap.val)); +static_assert(sizeof(gen_t) == sizeof(ghobject_t().generation)); + +class NodeExtentMutable; +class key_view_t; +class key_hobj_t; +enum class KeyT { VIEW, HOBJ }; +template <KeyT> struct _full_key_type; +template<> struct _full_key_type<KeyT::VIEW> { using type = key_view_t; }; +template<> struct _full_key_type<KeyT::HOBJ> { using type = key_hobj_t; }; +template <KeyT type> +using full_key_t = typename _full_key_type<type>::type; + +struct node_offset_packed_t { + node_offset_t value; +} __attribute__((packed)); + +// TODO: consider alignments +struct shard_pool_t { + bool operator==(const shard_pool_t& x) const { + return (shard == x.shard && pool == x.pool); + } + bool operator!=(const shard_pool_t& x) const { return !(*this == x); } + + template <KeyT KT> + static shard_pool_t from_key(const full_key_t<KT>& key); + + shard_t shard; + pool_t pool; +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const shard_pool_t& sp) { + return os << (unsigned)sp.shard << "," << sp.pool; +} +inline MatchKindCMP compare_to(const shard_pool_t& l, const shard_pool_t& r) { + auto ret = toMatchKindCMP(l.shard, r.shard); + if (ret != MatchKindCMP::EQ) + return ret; + return toMatchKindCMP(l.pool, r.pool); +} + +struct crush_t { + bool operator==(const crush_t& x) const { return crush == x.crush; } + bool operator!=(const crush_t& x) const { return !(*this == x); } + + template <KeyT KT> + static crush_t from_key(const full_key_t<KT>& key); + + crush_hash_t crush; +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const crush_t& c) { + return os << c.crush; +} +inline MatchKindCMP compare_to(const crush_t& l, const crush_t& r) { + return toMatchKindCMP(l.crush, r.crush); +} + +struct shard_pool_crush_t { + bool operator==(const shard_pool_crush_t& x) const { + return (shard_pool == x.shard_pool && crush == x.crush); + } + bool operator!=(const shard_pool_crush_t& x) const { return !(*this == x); } + + template <KeyT KT> + static shard_pool_crush_t from_key(const full_key_t<KT>& key); + + shard_pool_t shard_pool; + crush_t crush; +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const shard_pool_crush_t& spc) { + return os << spc.shard_pool << "," << spc.crush; +} +inline MatchKindCMP compare_to( + const shard_pool_crush_t& l, const shard_pool_crush_t& r) { + auto ret = compare_to(l.shard_pool, r.shard_pool); + if (ret != MatchKindCMP::EQ) + return ret; + return compare_to(l.crush, r.crush); +} + +struct snap_gen_t { + bool operator==(const snap_gen_t& x) const { + return (snap == x.snap && gen == x.gen); + } + bool operator!=(const snap_gen_t& x) const { return !(*this == x); } + + template <KeyT KT> + static snap_gen_t from_key(const full_key_t<KT>& key); + + snap_t snap; + gen_t gen; +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const snap_gen_t& sg) { + return os << sg.snap << "," << sg.gen; +} +inline MatchKindCMP compare_to(const snap_gen_t& l, const snap_gen_t& r) { + auto ret = toMatchKindCMP(l.snap, r.snap); + if (ret != MatchKindCMP::EQ) + return ret; + return toMatchKindCMP(l.gen, r.gen); +} + +/** + * string_key_view_t + * + * The layout to store char array as an oid or an ns string which may be + * compressed. + * + * If compressed, the physical block only stores an unsigned int of + * string_size_t, with value 0 denoting Type::MIN, and value max() denoting + * Type::MAX. + * + * If not compressed (Type::STR), the physical block stores the char array and + * a valid string_size_t value. + */ +struct string_key_view_t { + enum class Type {MIN, STR, MAX}; + // presumably the maximum string length is 2KiB + using string_size_t = uint16_t; + static constexpr auto MAX = std::numeric_limits<string_size_t>::max(); + static constexpr auto MIN = string_size_t(0u); + static auto is_valid_size(size_t size) { + return (size > MIN && size < MAX); + } + + string_key_view_t(const char* p_end) { + p_length = p_end - sizeof(string_size_t); + std::memcpy(&length, p_length, sizeof(string_size_t)); + if (is_valid_size(length)) { + auto _p_key = p_length - length; + p_key = static_cast<const char*>(_p_key); + } else { + assert(length == MAX || length == MIN); + p_key = nullptr; + } + } + Type type() const { + if (length == MIN) { + return Type::MIN; + } else if (length == MAX) { + return Type::MAX; + } else { + assert(is_valid_size(length)); + return Type::STR; + } + } + const char* p_start() const { + if (p_key) { + return p_key; + } else { + return p_length; + } + } + const char* p_next_end() const { + if (p_key) { + return p_start(); + } else { + return p_length + sizeof(string_size_t); + } + } + node_offset_t size() const { + size_t ret = length + sizeof(string_size_t); + assert(ret < NODE_BLOCK_SIZE); + return ret; + } + node_offset_t size_logical() const { + assert(type() == Type::STR); + assert(is_valid_size(length)); + return length; + } + node_offset_t size_overhead() const { + assert(type() == Type::STR); + return sizeof(string_size_t); + } + + std::string_view to_string_view() const { + assert(type() == Type::STR); + assert(is_valid_size(length)); + return {p_key, length}; + } + bool operator==(const string_key_view_t& x) const { + if (type() == x.type() && type() != Type::STR) + return true; + if (type() != x.type()) + return false; + if (length != x.length) + return false; + return (memcmp(p_key, x.p_key, length) == 0); + } + bool operator!=(const string_key_view_t& x) const { return !(*this == x); } + + static void append_str( + NodeExtentMutable&, std::string_view, char*& p_append); + + static void test_append_str(std::string_view str, char*& p_append) { + assert(is_valid_size(str.length())); + p_append -= sizeof(string_size_t); + string_size_t len = str.length(); + std::memcpy(p_append, &len, sizeof(string_size_t)); + p_append -= len; + std::memcpy(p_append, str.data(), len); + } + + static void append_dedup( + NodeExtentMutable&, const Type& dedup_type, char*& p_append); + + static void test_append_dedup(const Type& dedup_type, char*& p_append) { + p_append -= sizeof(string_size_t); + string_size_t len; + if (dedup_type == Type::MIN) { + len = MIN; + } else if (dedup_type == Type::MAX) { + len = MAX; + } else { + ceph_abort("impossible path"); + } + std::memcpy(p_append, &len, sizeof(string_size_t)); + } + + const char* p_key; + const char* p_length; + // TODO: remove if p_length is aligned + string_size_t length; +}; + +/** + * string_view_masked_t + * + * A common class to hide the underlying string implementation regardless of a + * string_key_view_t (maybe compressed), a string/string_view, or a compressed + * string. And leverage this consistant class to do compare, print, convert and + * append operations. + */ +class string_view_masked_t { + public: + using string_size_t = string_key_view_t::string_size_t; + using Type = string_key_view_t::Type; + explicit string_view_masked_t(const string_key_view_t& index) + : type{index.type()} { + if (type == Type::STR) { + view = index.to_string_view(); + } + } + explicit string_view_masked_t(std::string_view str) + : type{Type::STR}, view{str} { + assert(string_key_view_t::is_valid_size(view.size())); + } + + Type get_type() const { return type; } + std::string_view to_string_view() const { + assert(get_type() == Type::STR); + return view; + } + string_size_t size() const { + assert(get_type() == Type::STR); + assert(string_key_view_t::is_valid_size(view.size())); + return view.size(); + } + bool operator==(const string_view_masked_t& x) const { + if (get_type() == x.get_type() && get_type() != Type::STR) + return true; + if (get_type() != x.get_type()) + return false; + if (size() != x.size()) + return false; + return (memcmp(view.data(), x.view.data(), size()) == 0); + } + bool operator!=(const string_view_masked_t& x) const { return !(*this == x); } + void encode(ceph::bufferlist& bl) const { + if (get_type() == Type::MIN) { + ceph::encode(string_key_view_t::MIN, bl); + } else if (get_type() == Type::MAX) { + ceph::encode(string_key_view_t::MAX, bl); + } else { + ceph::encode(size(), bl); + ceph::encode_nohead(view, bl); + } + } + static auto min() { return string_view_masked_t{Type::MIN}; } + static auto max() { return string_view_masked_t{Type::MAX}; } + static string_view_masked_t decode( + std::string& str_storage, ceph::bufferlist::const_iterator& delta) { + string_size_t size; + ceph::decode(size, delta); + if (size == string_key_view_t::MIN) { + return min(); + } else if (size == string_key_view_t::MAX) { + return max(); + } else { + ceph::decode_nohead(size, str_storage, delta); + return string_view_masked_t(str_storage); + } + } + + private: + explicit string_view_masked_t(Type type) + : type{type} {} + + Type type; + std::string_view view; +}; +inline MatchKindCMP compare_to(const string_view_masked_t& l, const string_view_masked_t& r) { + using Type = string_view_masked_t::Type; + auto l_type = l.get_type(); + auto r_type = r.get_type(); + if (l_type == Type::STR && r_type == Type::STR) { + assert(l.size() && r.size()); + return toMatchKindCMP(l.to_string_view(), r.to_string_view()); + } else if (l_type == r_type) { + return MatchKindCMP::EQ; + } else if (l_type == Type::MIN || r_type == Type::MAX) { + return MatchKindCMP::LT; + } else { // l_type == Type::MAX || r_type == Type::MIN + return MatchKindCMP::GT; + } +} +inline MatchKindCMP compare_to(std::string_view l, const string_view_masked_t& r) { + using Type = string_view_masked_t::Type; + assert(l.length()); + auto r_type = r.get_type(); + if (r_type == Type::MIN) { + return MatchKindCMP::GT; + } else if (r_type == Type::MAX) { + return MatchKindCMP::LT; + } else { // r_type == Type::STR + assert(r.size()); + return toMatchKindCMP(l, r.to_string_view()); + } +} +inline MatchKindCMP compare_to(const string_view_masked_t& l, std::string_view r) { + return reverse(compare_to(r, l)); +} +inline std::ostream& operator<<(std::ostream& os, const string_view_masked_t& masked) { + using Type = string_view_masked_t::Type; + auto type = masked.get_type(); + if (type == Type::MIN) { + return os << "MIN"; + } else if (type == Type::MAX) { + return os << "MAX"; + } else { // type == Type::STR + auto view = masked.to_string_view(); + if (view.length() <= 12) { + os << "\"" << view << "\""; + } else { + os << "\"" << std::string_view(view.data(), 4) << ".." + << std::string_view(view.data() + view.length() - 2, 2) + << "/" << view.length() << "B\""; + } + return os; + } +} + +struct ns_oid_view_t { + using string_size_t = string_key_view_t::string_size_t; + using Type = string_key_view_t::Type; + + ns_oid_view_t(const char* p_end) : nspace(p_end), oid(nspace.p_next_end()) {} + Type type() const { return oid.type(); } + const char* p_start() const { return oid.p_start(); } + node_offset_t size() const { + if (type() == Type::STR) { + size_t ret = nspace.size() + oid.size(); + assert(ret < NODE_BLOCK_SIZE); + return ret; + } else { + return sizeof(string_size_t); + } + } + node_offset_t size_logical() const { + assert(type() == Type::STR); + return nspace.size_logical() + oid.size_logical(); + } + node_offset_t size_overhead() const { + assert(type() == Type::STR); + return nspace.size_overhead() + oid.size_overhead(); + } + bool operator==(const ns_oid_view_t& x) const { + return (string_view_masked_t{nspace} == string_view_masked_t{x.nspace} && + string_view_masked_t{oid} == string_view_masked_t{x.oid}); + } + bool operator!=(const ns_oid_view_t& x) const { return !(*this == x); } + + template <KeyT KT> + static node_offset_t estimate_size(const full_key_t<KT>& key); + + template <KeyT KT> + static void append(NodeExtentMutable&, + const full_key_t<KT>& key, + char*& p_append); + + static void append(NodeExtentMutable& mut, + const ns_oid_view_t& view, + char*& p_append) { + if (view.type() == Type::STR) { + string_key_view_t::append_str(mut, view.nspace.to_string_view(), p_append); + string_key_view_t::append_str(mut, view.oid.to_string_view(), p_append); + } else { + string_key_view_t::append_dedup(mut, view.type(), p_append); + } + } + + template <KeyT KT> + static void test_append(const full_key_t<KT>& key, char*& p_append); + + string_key_view_t nspace; + string_key_view_t oid; +}; +inline std::ostream& operator<<(std::ostream& os, const ns_oid_view_t& ns_oid) { + return os << string_view_masked_t{ns_oid.nspace} << "," + << string_view_masked_t{ns_oid.oid}; +} +inline MatchKindCMP compare_to(const ns_oid_view_t& l, const ns_oid_view_t& r) { + auto ret = compare_to(string_view_masked_t{l.nspace}, + string_view_masked_t{r.nspace}); + if (ret != MatchKindCMP::EQ) + return ret; + return compare_to(string_view_masked_t{l.oid}, + string_view_masked_t{r.oid}); +} + +/** + * key_hobj_t + * + * A specialized implementation of a full_key_t storing a ghobject_t passed + * from user. + */ +class key_hobj_t { + public: + explicit key_hobj_t(const ghobject_t& ghobj) : ghobj{ghobj} {} + /* + * common interfaces as a full_key_t + */ + shard_t shard() const { + return ghobj.shard_id; + } + pool_t pool() const { + return ghobj.hobj.pool; + } + crush_hash_t crush() const { + return ghobj.hobj.get_hash(); + } + std::string_view nspace() const { + // TODO(cross-node string dedup) + return ghobj.hobj.nspace; + } + string_view_masked_t nspace_masked() const { + // TODO(cross-node string dedup) + return string_view_masked_t{nspace()}; + } + std::string_view oid() const { + // TODO(cross-node string dedup) + return ghobj.hobj.oid.name; + } + string_view_masked_t oid_masked() const { + // TODO(cross-node string dedup) + return string_view_masked_t{oid()}; + } + ns_oid_view_t::Type dedup_type() const { + return _dedup_type; + } + snap_t snap() const { + return ghobj.hobj.snap; + } + gen_t gen() const { + return ghobj.generation; + } + + bool operator==(const full_key_t<KeyT::VIEW>& o) const; + bool operator==(const full_key_t<KeyT::HOBJ>& o) const; + bool operator!=(const full_key_t<KeyT::VIEW>& o) const { + return !operator==(o); + } + bool operator!=(const full_key_t<KeyT::HOBJ>& o) const { + return !operator==(o); + } + + std::ostream& dump(std::ostream& os) const { + os << "key_hobj(" << (unsigned)shard() << "," + << pool() << "," << crush() << "; " + << string_view_masked_t{nspace()} << "," + << string_view_masked_t{oid()} << "; " + << snap() << "," << gen() << ")"; + return os; + } + + static key_hobj_t decode(ceph::bufferlist::const_iterator& delta) { + shard_t shard; + ceph::decode(shard, delta); + pool_t pool; + ceph::decode(pool, delta); + crush_hash_t crush; + ceph::decode(crush, delta); + std::string nspace; + auto nspace_masked = string_view_masked_t::decode(nspace, delta); + // TODO(cross-node string dedup) + assert(nspace_masked.get_type() == string_view_masked_t::Type::STR); + std::string oid; + auto oid_masked = string_view_masked_t::decode(oid, delta); + // TODO(cross-node string dedup) + assert(oid_masked.get_type() == string_view_masked_t::Type::STR); + snap_t snap; + ceph::decode(snap, delta); + gen_t gen; + ceph::decode(gen, delta); + return key_hobj_t(ghobject_t( + shard_id_t(shard), pool, crush, nspace, oid, snap, gen)); + } + + private: + ns_oid_view_t::Type _dedup_type = ns_oid_view_t::Type::STR; + ghobject_t ghobj; +}; +inline std::ostream& operator<<(std::ostream& os, const key_hobj_t& key) { + return key.dump(os); +} + +/** + * key_view_t + * + * A specialized implementation of a full_key_t pointing to the locations + * storing the full key in a tree node. + */ +class key_view_t { + public: + /** + * common interfaces as a full_key_t + */ + shard_t shard() const { + return shard_pool_packed().shard; + } + pool_t pool() const { + return shard_pool_packed().pool; + } + crush_hash_t crush() const { + return crush_packed().crush; + } + std::string_view nspace() const { + // TODO(cross-node string dedup) + return ns_oid_view().nspace.to_string_view(); + } + string_view_masked_t nspace_masked() const { + // TODO(cross-node string dedup) + return string_view_masked_t{ns_oid_view().nspace}; + } + std::string_view oid() const { + // TODO(cross-node string dedup) + return ns_oid_view().oid.to_string_view(); + } + string_view_masked_t oid_masked() const { + // TODO(cross-node string dedup) + return string_view_masked_t{ns_oid_view().oid}; + } + ns_oid_view_t::Type dedup_type() const { + return ns_oid_view().type(); + } + snap_t snap() const { + return snap_gen_packed().snap; + } + gen_t gen() const { + return snap_gen_packed().gen; + } + + bool operator==(const full_key_t<KeyT::VIEW>& o) const; + bool operator==(const full_key_t<KeyT::HOBJ>& o) const; + bool operator!=(const full_key_t<KeyT::VIEW>& o) const { + return !operator==(o); + } + bool operator!=(const full_key_t<KeyT::HOBJ>& o) const { + return !operator==(o); + } + + /** + * key_view_t specific interfaces + */ + bool has_shard_pool() const { + return p_shard_pool != nullptr; + } + bool has_crush() const { + return p_crush != nullptr; + } + bool has_ns_oid() const { + return p_ns_oid.has_value(); + } + bool has_snap_gen() const { + return p_snap_gen != nullptr; + } + + const shard_pool_t& shard_pool_packed() const { + assert(has_shard_pool()); + return *p_shard_pool; + } + const crush_t& crush_packed() const { + assert(has_crush()); + return *p_crush; + } + const ns_oid_view_t& ns_oid_view() const { + assert(has_ns_oid()); + return *p_ns_oid; + } + const snap_gen_t& snap_gen_packed() const { + assert(has_snap_gen()); + return *p_snap_gen; + } + + size_t size_logical() const { + return sizeof(shard_t) + sizeof(pool_t) + sizeof(crush_hash_t) + + sizeof(snap_t) + sizeof(gen_t) + ns_oid_view().size_logical(); + } + + ghobject_t to_ghobj() const { + return ghobject_t( + shard_id_t(shard()), pool(), crush(), + std::string(nspace()), std::string(oid()), snap(), gen()); + } + + void replace(const crush_t& key) { p_crush = &key; } + void set(const crush_t& key) { + assert(!has_crush()); + replace(key); + } + void replace(const shard_pool_crush_t& key) { p_shard_pool = &key.shard_pool; } + void set(const shard_pool_crush_t& key) { + set(key.crush); + assert(!has_shard_pool()); + replace(key); + } + void replace(const ns_oid_view_t& key) { p_ns_oid = key; } + void set(const ns_oid_view_t& key) { + assert(!has_ns_oid()); + replace(key); + } + void replace(const snap_gen_t& key) { p_snap_gen = &key; } + void set(const snap_gen_t& key) { + assert(!has_snap_gen()); + replace(key); + } + + std::ostream& dump(std::ostream& os) const { + os << "key_view("; + if (has_shard_pool()) { + os << (unsigned)shard() << "," << pool() << ","; + } else { + os << "X,X,"; + } + if (has_crush()) { + os << crush() << "; "; + } else { + os << "X; "; + } + if (has_ns_oid()) { + os << ns_oid_view() << "; "; + } else { + os << "X,X; "; + } + if (has_snap_gen()) { + os << snap() << "," << gen() << ")"; + } else { + os << "X,X)"; + } + return os; + } + + private: + const shard_pool_t* p_shard_pool = nullptr; + const crush_t* p_crush = nullptr; + std::optional<ns_oid_view_t> p_ns_oid; + const snap_gen_t* p_snap_gen = nullptr; +}; + +template <KeyT KT> +void encode_key(const full_key_t<KT>& key, ceph::bufferlist& bl) { + ceph::encode(key.shard(), bl); + ceph::encode(key.pool(), bl); + ceph::encode(key.crush(), bl); + key.nspace_masked().encode(bl); + key.oid_masked().encode(bl); + ceph::encode(key.snap(), bl); + ceph::encode(key.gen(), bl); +} + +inline MatchKindCMP compare_to(std::string_view l, std::string_view r) { + return toMatchKindCMP(l, r); +} +template <KeyT TypeL, KeyT TypeR> +bool compare_full_key(const full_key_t<TypeL>& l, const full_key_t<TypeR>& r) { + if (l.shard() != r.shard()) + return false; + if (l.pool() != r.pool()) + return false; + if (l.crush() != r.crush()) + return false; + if (compare_to(l.nspace(), r.nspace()) != MatchKindCMP::EQ) + return false; + if (compare_to(l.oid(), r.oid()) != MatchKindCMP::EQ) + return false; + if (l.snap() != r.snap()) + return false; + if (l.gen() != r.gen()) + return false; + return true; +} + +inline bool key_hobj_t::operator==(const full_key_t<KeyT::VIEW>& o) const { + return compare_full_key<KeyT::HOBJ, KeyT::VIEW>(*this, o); +} +inline bool key_hobj_t::operator==(const full_key_t<KeyT::HOBJ>& o) const { + return compare_full_key<KeyT::HOBJ, KeyT::HOBJ>(*this, o); +} +inline bool key_view_t::operator==(const full_key_t<KeyT::VIEW>& o) const { + return compare_full_key<KeyT::VIEW, KeyT::VIEW>(*this, o); +} +inline bool key_view_t::operator==(const full_key_t<KeyT::HOBJ>& o) const { + return compare_full_key<KeyT::VIEW, KeyT::HOBJ>(*this, o); +} + +inline std::ostream& operator<<(std::ostream& os, const key_view_t& key) { + return key.dump(os); +} + +template <KeyT Type> +MatchKindCMP compare_to(const full_key_t<Type>& key, const shard_pool_t& target) { + auto ret = toMatchKindCMP(key.shard(), target.shard); + if (ret != MatchKindCMP::EQ) + return ret; + return toMatchKindCMP(key.pool(), target.pool); +} + +template <KeyT Type> +MatchKindCMP compare_to(const full_key_t<Type>& key, const crush_t& target) { + return toMatchKindCMP(key.crush(), target.crush); +} + +template <KeyT Type> +MatchKindCMP compare_to(const full_key_t<Type>& key, const shard_pool_crush_t& target) { + auto ret = compare_to<Type>(key, target.shard_pool); + if (ret != MatchKindCMP::EQ) + return ret; + return compare_to<Type>(key, target.crush); +} + +template <KeyT Type> +MatchKindCMP compare_to(const full_key_t<Type>& key, const ns_oid_view_t& target) { + auto ret = compare_to(key.nspace(), string_view_masked_t{target.nspace}); + if (ret != MatchKindCMP::EQ) + return ret; + return compare_to(key.oid(), string_view_masked_t{target.oid}); +} + +template <KeyT Type> +MatchKindCMP compare_to(const full_key_t<Type>& key, const snap_gen_t& target) { + auto ret = toMatchKindCMP(key.snap(), target.snap); + if (ret != MatchKindCMP::EQ) + return ret; + return toMatchKindCMP(key.gen(), target.gen); +} + +template <KeyT KT> +shard_pool_t shard_pool_t::from_key(const full_key_t<KT>& key) { + if constexpr (KT == KeyT::VIEW) { + return key.shard_pool_packed(); + } else { + return {key.shard(), key.pool()}; + } +} + +template <KeyT KT> +crush_t crush_t::from_key(const full_key_t<KT>& key) { + if constexpr (KT == KeyT::VIEW) { + return key.crush_packed(); + } else { + return {key.crush()}; + } +} + +template <KeyT KT> +shard_pool_crush_t shard_pool_crush_t::from_key(const full_key_t<KT>& key) { + return {shard_pool_t::from_key<KT>(key), crush_t::from_key<KT>(key)}; +} + +template <KeyT KT> +snap_gen_t snap_gen_t::from_key(const full_key_t<KT>& key) { + if constexpr (KT == KeyT::VIEW) { + return key.snap_gen_packed(); + } else { + return {key.snap(), key.gen()}; + } +} + +template <KeyT KT> +node_offset_t ns_oid_view_t::estimate_size(const full_key_t<KT>& key) { + if constexpr (KT == KeyT::VIEW) { + return key.ns_oid_view().size(); + } else { + if (key.dedup_type() != Type::STR) { + // size after deduplication + return sizeof(string_size_t); + } else { + return 2 * sizeof(string_size_t) + key.nspace().size() + key.oid().size(); + } + } +} + +template <KeyT KT> +void ns_oid_view_t::append( + NodeExtentMutable& mut, const full_key_t<KT>& key, char*& p_append) { + if (key.dedup_type() == Type::STR) { + string_key_view_t::append_str(mut, key.nspace(), p_append); + string_key_view_t::append_str(mut, key.oid(), p_append); + } else { + string_key_view_t::append_dedup(mut, key.dedup_type(), p_append); + } +} + +template <KeyT KT> +void ns_oid_view_t::test_append(const full_key_t<KT>& key, char*& p_append) { + if (key.dedup_type() == Type::STR) { + string_key_view_t::test_append_str(key.nspace(), p_append); + string_key_view_t::test_append_str(key.oid(), p_append); + } else { + string_key_view_t::test_append_dedup(key.dedup_type(), p_append); + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc new file mode 100644 index 000000000..4a5988185 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc @@ -0,0 +1,318 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "node_stage.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h" +#include "node_stage_layout.h" + +namespace crimson::os::seastore::onode { + +#define NODE_T node_extent_t<FieldType, NODE_TYPE> +#define NODE_INST(FT, NT) node_extent_t<FT, NT> + +template <typename FieldType, node_type_t NODE_TYPE> +const char* NODE_T::p_left_bound() const { + if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) { + // N3 internal node doesn't have the right part + return nullptr; + } else { + auto ret = p_start() + fields().get_item_end_offset(keys()); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + if (is_level_tail()) { + ret -= sizeof(laddr_t); + } + } + return ret; + } +} + +template <typename FieldType, node_type_t NODE_TYPE> +node_offset_t NODE_T::size_to_nxt_at(index_t index) const { + assert(index < keys()); + if constexpr (FIELD_TYPE == field_type_t::N0 || + FIELD_TYPE == field_type_t::N1) { + return FieldType::estimate_insert_one(); + } else if constexpr (FIELD_TYPE == field_type_t::N2) { + auto p_end = p_start() + p_fields->get_item_end_offset(index); + return FieldType::estimate_insert_one() + ns_oid_view_t(p_end).size(); + } else { + ceph_abort("N3 node is not nested"); + } +} + +template <typename FieldType, node_type_t NODE_TYPE> +memory_range_t NODE_T::get_nxt_container(index_t index) const { + if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) { + ceph_abort("N3 internal node doesn't have the right part"); + } else { + node_offset_t item_start_offset = p_fields->get_item_start_offset(index); + node_offset_t item_end_offset = p_fields->get_item_end_offset(index); + assert(item_start_offset < item_end_offset); + auto item_p_start = p_start() + item_start_offset; + auto item_p_end = p_start() + item_end_offset; + if constexpr (FIELD_TYPE == field_type_t::N2) { + // range for sub_items_t<NODE_TYPE> + item_p_end = ns_oid_view_t(item_p_end).p_start(); + assert(item_p_start < item_p_end); + } else { + // range for item_iterator_t<NODE_TYPE> + } + return {item_p_start, item_p_end}; + } +} + +template <typename FieldType, node_type_t NODE_TYPE> +void NODE_T::bootstrap_extent( + NodeExtentMutable& mut, + field_type_t field_type, node_type_t node_type, + bool is_level_tail, level_t level) { + node_header_t::bootstrap_extent( + mut, field_type, node_type, is_level_tail, level); + mut.copy_in_relative( + sizeof(node_header_t), typename FieldType::num_keys_t(0u)); +} + +template <typename FieldType, node_type_t NODE_TYPE> +void NODE_T::update_is_level_tail( + NodeExtentMutable& mut, const node_extent_t& extent, bool value) { + node_header_t::update_is_level_tail(mut, extent.p_fields->header, value); +} + +template <typename FieldType, node_type_t NODE_TYPE> +template <KeyT KT> +memory_range_t NODE_T::insert_prefix_at( + NodeExtentMutable& mut, const node_extent_t& node, const full_key_t<KT>& key, + index_t index, node_offset_t size, const char* p_left_bound) { + if constexpr (FIELD_TYPE == field_type_t::N0 || + FIELD_TYPE == field_type_t::N1) { + assert(index <= node.keys()); + assert(p_left_bound == node.p_left_bound()); + assert(size > FieldType::estimate_insert_one()); + auto size_right = size - FieldType::estimate_insert_one(); + const char* p_insert = node.p_start() + node.fields().get_item_end_offset(index); + const char* p_insert_front = p_insert - size_right; + FieldType::template insert_at<KT>(mut, key, node.fields(), index, size_right); + mut.shift_absolute(p_left_bound, + p_insert - p_left_bound, + -(int)size_right); + return {p_insert_front, p_insert}; + } else if constexpr (FIELD_TYPE == field_type_t::N2) { + ceph_abort("not implemented"); + } else { + ceph_abort("impossible"); + } +} +#define IPA_TEMPLATE(FT, NT, KT) \ + template memory_range_t NODE_INST(FT, NT)::insert_prefix_at<KT>( \ + NodeExtentMutable&, const node_extent_t&, const full_key_t<KT>&, \ + index_t, node_offset_t, const char*) +IPA_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::VIEW); +IPA_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::VIEW); +IPA_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::VIEW); +IPA_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::VIEW); +IPA_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::VIEW); +IPA_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::VIEW); +IPA_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::HOBJ); +IPA_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::HOBJ); +IPA_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::HOBJ); +IPA_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::HOBJ); +IPA_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::HOBJ); +IPA_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::HOBJ); + +template <typename FieldType, node_type_t NODE_TYPE> +void NODE_T::update_size_at( + NodeExtentMutable& mut, const node_extent_t& node, index_t index, int change) { + assert(index < node.keys()); + FieldType::update_size_at(mut, node.fields(), index, change); +} + +template <typename FieldType, node_type_t NODE_TYPE> +node_offset_t NODE_T::trim_until( + NodeExtentMutable& mut, const node_extent_t& node, index_t index) { + assert(!node.is_level_tail()); + auto keys = node.keys(); + assert(index <= keys); + if (index == keys) { + return 0; + } + if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) { + ceph_abort("not implemented"); + } else { + mut.copy_in_absolute( + (void*)&node.p_fields->num_keys, num_keys_t(index)); + } + // no need to calculate trim size for node + return 0; +} + +template <typename FieldType, node_type_t NODE_TYPE> +node_offset_t NODE_T::trim_at( + NodeExtentMutable& mut, const node_extent_t& node, + index_t index, node_offset_t trimmed) { + assert(!node.is_level_tail()); + assert(index < node.keys()); + if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) { + ceph_abort("not implemented"); + } else { + node_offset_t offset = node.p_fields->get_item_start_offset(index); + size_t new_offset = offset + trimmed; + assert(new_offset < node.p_fields->get_item_end_offset(index)); + mut.copy_in_absolute(const_cast<void*>(node.p_fields->p_offset(index)), + node_offset_t(new_offset)); + mut.copy_in_absolute( + (void*)&node.p_fields->num_keys, num_keys_t(index + 1)); + } + // no need to calculate trim size for node + return 0; +} + +#define NODE_TEMPLATE(FT, NT) template class NODE_INST(FT, NT) +NODE_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL); +NODE_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL); +NODE_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL); +NODE_TEMPLATE(internal_fields_3_t, node_type_t::INTERNAL); +NODE_TEMPLATE(node_fields_0_t, node_type_t::LEAF); +NODE_TEMPLATE(node_fields_1_t, node_type_t::LEAF); +NODE_TEMPLATE(node_fields_2_t, node_type_t::LEAF); +NODE_TEMPLATE(leaf_fields_3_t, node_type_t::LEAF); + +#define APPEND_T node_extent_t<FieldType, NODE_TYPE>::Appender<KT> + +template <typename FieldType, node_type_t NODE_TYPE> +template <KeyT KT> +void APPEND_T::append(const node_extent_t& src, index_t from, index_t items) { + assert(from <= src.keys()); + if (p_src == nullptr) { + p_src = &src; + } else { + assert(p_src == &src); + } + if (items == 0) { + return; + } + assert(from < src.keys()); + assert(from + items <= src.keys()); + num_keys += items; + if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) { + ceph_abort("impossible path"); + } else { + // append left part forwards + node_offset_t offset_left_start = src.fields().get_key_start_offset(from); + node_offset_t offset_left_end = src.fields().get_key_start_offset(from + items); + node_offset_t left_size = offset_left_end - offset_left_start; + if (num_keys == 0) { + // no need to adjust offset + assert(from == 0); + assert(p_start + offset_left_start == p_append_left); + p_mut->copy_in_absolute(p_append_left, + src.p_start() + offset_left_start, left_size); + } else { + node_offset_t step_size = FieldType::estimate_insert_one(); + node_offset_t offset_base = src.fields().get_item_end_offset(from); + int offset_change = p_append_right - p_start - offset_base; + auto p_offset_dst = p_append_left; + if constexpr (FIELD_TYPE != field_type_t::N2) { + // copy keys + p_mut->copy_in_absolute(p_append_left, + src.p_start() + offset_left_start, left_size); + // point to offset for update + p_offset_dst += sizeof(typename FieldType::key_t); + } + for (auto i = from; i < from + items; ++i) { + p_mut->copy_in_absolute(p_offset_dst, + node_offset_t(src.fields().get_item_start_offset(i) + offset_change)); + p_offset_dst += step_size; + } + assert(p_append_left + left_size + sizeof(typename FieldType::key_t) == + p_offset_dst); + } + p_append_left += left_size; + + // append right part backwards + node_offset_t offset_right_start = src.fields().get_item_end_offset(from + items); + node_offset_t offset_right_end = src.fields().get_item_end_offset(from); + node_offset_t right_size = offset_right_end - offset_right_start; + p_append_right -= right_size; + p_mut->copy_in_absolute(p_append_right, + src.p_start() + offset_right_start, right_size); + } +} + +template <typename FieldType, node_type_t NODE_TYPE> +template <KeyT KT> +void APPEND_T::append( + const full_key_t<KT>& key, const value_t& value, const value_t*& p_value) { + if constexpr (FIELD_TYPE == field_type_t::N3) { + ceph_abort("not implemented"); + } else { + ceph_abort("should not happen"); + } +} + +template <typename FieldType, node_type_t NODE_TYPE> +template <KeyT KT> +std::tuple<NodeExtentMutable*, char*> +APPEND_T::open_nxt(const key_get_type& partial_key) { + if constexpr (FIELD_TYPE == field_type_t::N0 || + FIELD_TYPE == field_type_t::N1) { + FieldType::append_key(*p_mut, partial_key, p_append_left); + } else if constexpr (FIELD_TYPE == field_type_t::N2) { + FieldType::append_key(*p_mut, partial_key, p_append_right); + } else { + ceph_abort("impossible path"); + } + return {p_mut, p_append_right}; +} + +template <typename FieldType, node_type_t NODE_TYPE> +template <KeyT KT> +std::tuple<NodeExtentMutable*, char*> +APPEND_T::open_nxt(const full_key_t<KT>& key) { + if constexpr (FIELD_TYPE == field_type_t::N0 || + FIELD_TYPE == field_type_t::N1) { + FieldType::template append_key<KT>(*p_mut, key, p_append_left); + } else if constexpr (FIELD_TYPE == field_type_t::N2) { + FieldType::template append_key<KT>(*p_mut, key, p_append_right); + } else { + ceph_abort("impossible path"); + } + return {p_mut, p_append_right}; +} + +template <typename FieldType, node_type_t NODE_TYPE> +template <KeyT KT> +char* APPEND_T::wrap() { + assert(p_append_left <= p_append_right); + assert(p_src); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + if (p_src->is_level_tail()) { + laddr_t tail_value = p_src->get_end_p_laddr()->value; + p_append_right -= sizeof(laddr_t); + assert(p_append_left <= p_append_right); + p_mut->copy_in_absolute(p_append_right, tail_value); + } + } + p_mut->copy_in_absolute(p_start + offsetof(FieldType, num_keys), num_keys); + return p_append_left; +} + +#define APPEND_TEMPLATE(FT, NT, KT) template class node_extent_t<FT, NT>::Appender<KT> +APPEND_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::VIEW); +APPEND_TEMPLATE(internal_fields_3_t, node_type_t::INTERNAL, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::VIEW); +APPEND_TEMPLATE(leaf_fields_3_t, node_type_t::LEAF, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::HOBJ); +APPEND_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::HOBJ); +APPEND_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::HOBJ); +APPEND_TEMPLATE(internal_fields_3_t, node_type_t::INTERNAL, KeyT::HOBJ); +APPEND_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::HOBJ); +APPEND_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::HOBJ); +APPEND_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::HOBJ); +APPEND_TEMPLATE(leaf_fields_3_t, node_type_t::LEAF, KeyT::HOBJ); + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h new file mode 100644 index 000000000..cf0ca463c --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h @@ -0,0 +1,226 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h" +#include "key_layout.h" +#include "stage_types.h" + +namespace crimson::os::seastore::onode { + +class NodeExtentMutable; + +/** + * node_extent_t + * + * The top indexing stage implementation for node N0/N1/N2/N3, implements + * staged contract as an indexable container, and provides access to node + * header. + * + * The specific field layout are defined by FieldType which are + * node_fields_0_t, node_fields_1_t, node_fields_2_t, internal_fields_3_t and + * leaf_fields_3_t. Diagrams see node_stage_layout.h. + */ +template <typename FieldType, node_type_t _NODE_TYPE> +class node_extent_t { + public: + using value_t = value_type_t<_NODE_TYPE>; + using num_keys_t = typename FieldType::num_keys_t; + static constexpr node_type_t NODE_TYPE = _NODE_TYPE; + static constexpr field_type_t FIELD_TYPE = FieldType::FIELD_TYPE; + static constexpr node_offset_t EXTENT_SIZE = + (FieldType::SIZE + DISK_BLOCK_SIZE - 1u) / DISK_BLOCK_SIZE * DISK_BLOCK_SIZE; + + // TODO: remove + node_extent_t() = default; + + node_extent_t(const FieldType* p_fields) : p_fields{p_fields} { + validate(*p_fields); + } + + const char* p_start() const { return fields_start(*p_fields); } + + const char* off_to_ptr(node_offset_t off) const { + assert(off <= FieldType::SIZE); + return p_start() + off; + } + + node_offset_t ptr_to_off(const void* ptr) const { + auto _ptr = static_cast<const char*>(ptr); + assert(_ptr >= p_start()); + auto off = _ptr - p_start(); + assert(off <= FieldType::SIZE); + return off; + } + + bool is_level_tail() const { return p_fields->is_level_tail(); } + level_t level() const { return p_fields->header.level; } + node_offset_t free_size() const { + return p_fields->template free_size_before<NODE_TYPE>(keys()); + } + node_offset_t total_size() const { return p_fields->total_size(); } + const char* p_left_bound() const; + template <node_type_t T = NODE_TYPE> + std::enable_if_t<T == node_type_t::INTERNAL, const laddr_packed_t*> + get_end_p_laddr() const { + assert(is_level_tail()); + if constexpr (FIELD_TYPE == field_type_t::N3) { + return &p_fields->child_addrs[keys()]; + } else { + auto offset_start = p_fields->get_item_end_offset(keys()); + assert(offset_start <= FieldType::SIZE); + offset_start -= sizeof(laddr_packed_t); + auto p_addr = p_start() + offset_start; + return reinterpret_cast<const laddr_packed_t*>(p_addr); + } + } + + // container type system + using key_get_type = typename FieldType::key_get_type; + static constexpr auto CONTAINER_TYPE = ContainerType::INDEXABLE; + index_t keys() const { return p_fields->num_keys; } + key_get_type operator[] (index_t index) const { return p_fields->get_key(index); } + node_offset_t size_before(index_t index) const { + auto free_size = p_fields->template free_size_before<NODE_TYPE>(index); + assert(total_size() >= free_size); + return total_size() - free_size; + } + node_offset_t size_to_nxt_at(index_t index) const; + node_offset_t size_overhead_at(index_t index) const { + return FieldType::ITEM_OVERHEAD; } + memory_range_t get_nxt_container(index_t index) const; + + template <typename T = FieldType> + std::enable_if_t<T::FIELD_TYPE == field_type_t::N3, const value_t*> + get_p_value(index_t index) const { + assert(index < keys()); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + return &p_fields->child_addrs[index]; + } else { + auto range = get_nxt_container(index); + auto ret = reinterpret_cast<const onode_t*>(range.p_start); + assert(range.p_start + ret->size == range.p_end); + return ret; + } + } + + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + assert(p_node_start == p_start()); + // nothing to encode as the container range is the entire extent + } + + static node_extent_t decode(const char* p_node_start, + ceph::bufferlist::const_iterator& delta) { + // nothing to decode + return node_extent_t(reinterpret_cast<const FieldType*>(p_node_start)); + } + + static void validate(const FieldType& fields) { +#ifndef NDEBUG + assert(fields.header.get_node_type() == NODE_TYPE); + assert(fields.header.get_field_type() == FieldType::FIELD_TYPE); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + assert(fields.header.level > 0u); + } else { + assert(fields.header.level == 0u); + } +#endif + } + + static void bootstrap_extent( + NodeExtentMutable&, field_type_t, node_type_t, bool, level_t); + + static void update_is_level_tail(NodeExtentMutable&, const node_extent_t&, bool); + + static node_offset_t header_size() { return FieldType::HEADER_SIZE; } + + template <KeyT KT> + static node_offset_t estimate_insert( + const full_key_t<KT>& key, const value_t& value) { + auto size = FieldType::estimate_insert_one(); + if constexpr (FIELD_TYPE == field_type_t::N2) { + size += ns_oid_view_t::estimate_size<KT>(key); + } else if constexpr (FIELD_TYPE == field_type_t::N3 && + NODE_TYPE == node_type_t::LEAF) { + size += value.size; + } + return size; + } + + template <KeyT KT> + static const value_t* insert_at( + NodeExtentMutable& mut, const node_extent_t&, + const full_key_t<KT>& key, const value_t& value, + index_t index, node_offset_t size, const char* p_left_bound) { + if constexpr (FIELD_TYPE == field_type_t::N3) { + ceph_abort("not implemented"); + } else { + ceph_abort("impossible"); + } + } + + template <KeyT KT> + static memory_range_t insert_prefix_at( + NodeExtentMutable&, const node_extent_t&, + const full_key_t<KT>& key, + index_t index, node_offset_t size, const char* p_left_bound); + + static void update_size_at( + NodeExtentMutable&, const node_extent_t&, index_t index, int change); + + static node_offset_t trim_until( + NodeExtentMutable&, const node_extent_t&, index_t index); + static node_offset_t trim_at(NodeExtentMutable&, const node_extent_t&, + index_t index, node_offset_t trimmed); + + template <KeyT KT> + class Appender; + + private: + const FieldType& fields() const { return *p_fields; } + const FieldType* p_fields; +}; + +template <typename FieldType, node_type_t NODE_TYPE> +template <KeyT KT> +class node_extent_t<FieldType, NODE_TYPE>::Appender { + public: + Appender(NodeExtentMutable* p_mut, char* p_append) + : p_mut{p_mut}, p_start{p_append} { +#ifndef NDEBUG + auto p_fields = reinterpret_cast<const FieldType*>(p_append); + assert(*(p_fields->header.get_field_type()) == FIELD_TYPE); + assert(p_fields->header.get_node_type() == NODE_TYPE); + assert(p_fields->num_keys == 0); +#endif + p_append_left = p_start + FieldType::HEADER_SIZE; + p_append_right = p_start + FieldType::SIZE; + } + void append(const node_extent_t& src, index_t from, index_t items); + void append(const full_key_t<KT>&, const value_t&, const value_t*&); + char* wrap(); + std::tuple<NodeExtentMutable*, char*> open_nxt(const key_get_type&); + std::tuple<NodeExtentMutable*, char*> open_nxt(const full_key_t<KT>&); + void wrap_nxt(char* p_append) { + if constexpr (FIELD_TYPE != field_type_t::N3) { + assert(p_append < p_append_right); + assert(p_append_left < p_append); + p_append_right = p_append; + FieldType::append_offset(*p_mut, p_append - p_start, p_append_left); + ++num_keys; + } else { + ceph_abort("not implemented"); + } + } + + private: + const node_extent_t* p_src = nullptr; + NodeExtentMutable* p_mut; + char* p_start; + char* p_append_left; + char* p_append_right; + num_keys_t num_keys = 0; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc new file mode 100644 index 000000000..81bfac72a --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc @@ -0,0 +1,96 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "node_stage_layout.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h" + +namespace crimson::os::seastore::onode { + +void node_header_t::bootstrap_extent( + NodeExtentMutable& mut, + field_type_t field_type, node_type_t node_type, + bool is_level_tail, level_t level) { + node_header_t header; + header.set_field_type(field_type); + header.set_node_type(node_type); + header.set_is_level_tail(is_level_tail); + header.level = level; + mut.copy_in_relative(0, header); +} + +void node_header_t::update_is_level_tail( + NodeExtentMutable& mut, const node_header_t& header, bool value) { + auto& _header = const_cast<node_header_t&>(header); + _header.set_is_level_tail(value); + mut.validate_inplace_update(_header); +} + +#define F013_T _node_fields_013_t<SlotType> +#define F013_INST(ST) _node_fields_013_t<ST> + +template <typename SlotType> +void F013_T::update_size_at( + NodeExtentMutable& mut, const me_t& node, index_t index, int change) { + assert(index <= node.num_keys); + for (const auto* p_slot = &node.slots[index]; + p_slot < &node.slots[node.num_keys]; + ++p_slot) { + node_offset_t offset = p_slot->right_offset; + mut.copy_in_absolute( + (void*)&(p_slot->right_offset), + node_offset_t(offset - change)); + } +} + +template <typename SlotType> +void F013_T::append_key( + NodeExtentMutable& mut, const key_t& key, char*& p_append) { + mut.copy_in_absolute(p_append, key); + p_append += sizeof(key_t); +} + +template <typename SlotType> +void F013_T::append_offset( + NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append) { + mut.copy_in_absolute(p_append, offset_to_right); + p_append += sizeof(node_offset_t); +} + +template <typename SlotType> +template <KeyT KT> +void F013_T::insert_at( + NodeExtentMutable& mut, const full_key_t<KT>& key, + const me_t& node, index_t index, node_offset_t size_right) { + assert(index <= node.num_keys); + update_size_at(mut, node, index, size_right); + auto p_insert = const_cast<char*>(fields_start(node)) + + node.get_key_start_offset(index); + auto p_shift_end = fields_start(node) + node.get_key_start_offset(node.num_keys); + mut.shift_absolute(p_insert, p_shift_end - p_insert, estimate_insert_one()); + mut.copy_in_absolute((void*)&node.num_keys, num_keys_t(node.num_keys + 1)); + append_key(mut, key_t::template from_key<KT>(key), p_insert); + append_offset(mut, node.get_item_end_offset(index) - size_right, p_insert); +} +#define IA_TEMPLATE(ST, KT) template void F013_INST(ST):: \ + insert_at<KT>(NodeExtentMutable&, const full_key_t<KT>&, \ + const F013_INST(ST)&, index_t, node_offset_t) +IA_TEMPLATE(slot_0_t, KeyT::VIEW); +IA_TEMPLATE(slot_1_t, KeyT::VIEW); +IA_TEMPLATE(slot_3_t, KeyT::VIEW); +IA_TEMPLATE(slot_0_t, KeyT::HOBJ); +IA_TEMPLATE(slot_1_t, KeyT::HOBJ); +IA_TEMPLATE(slot_3_t, KeyT::HOBJ); + +#define F013_TEMPLATE(ST) template struct F013_INST(ST) +F013_TEMPLATE(slot_0_t); +F013_TEMPLATE(slot_1_t); +F013_TEMPLATE(slot_3_t); + +void node_fields_2_t::append_offset( + NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append) { + mut.copy_in_absolute(p_append, offset_to_right); + p_append += sizeof(node_offset_t); +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h new file mode 100644 index 000000000..14ba95bf4 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h @@ -0,0 +1,366 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "key_layout.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h" + +namespace crimson::os::seastore::onode { + +class NodeExtentMutable; + +struct node_header_t { + static constexpr unsigned FIELD_TYPE_BITS = 6u; + static_assert(static_cast<uint8_t>(field_type_t::_MAX) <= 1u << FIELD_TYPE_BITS); + static constexpr unsigned NODE_TYPE_BITS = 1u; + static constexpr unsigned B_LEVEL_TAIL_BITS = 1u; + using bits_t = uint8_t; + + node_header_t() {} + std::optional<field_type_t> get_field_type() const { + if (field_type >= FIELD_TYPE_MAGIC && + field_type < static_cast<uint8_t>(field_type_t::_MAX)) { + return static_cast<field_type_t>(field_type); + } else { + return std::nullopt; + } + } + node_type_t get_node_type() const { + return static_cast<node_type_t>(node_type); + } + bool get_is_level_tail() const { + return is_level_tail; + } + + static void bootstrap_extent( + NodeExtentMutable&, field_type_t, node_type_t, bool, level_t); + + static void update_is_level_tail(NodeExtentMutable&, const node_header_t&, bool); + + bits_t field_type : FIELD_TYPE_BITS; + bits_t node_type : NODE_TYPE_BITS; + bits_t is_level_tail : B_LEVEL_TAIL_BITS; + static_assert(sizeof(bits_t) * 8 == + FIELD_TYPE_BITS + NODE_TYPE_BITS + B_LEVEL_TAIL_BITS); + level_t level; + + private: + void set_field_type(field_type_t type) { + field_type = static_cast<uint8_t>(type); + } + void set_node_type(node_type_t type) { + node_type = static_cast<uint8_t>(type); + } + void set_is_level_tail(bool value) { + is_level_tail = static_cast<uint8_t>(value); + } +} __attribute__((packed)); + +template <typename FixedKeyType, field_type_t _FIELD_TYPE> +struct _slot_t { + using key_t = FixedKeyType; + static constexpr field_type_t FIELD_TYPE = _FIELD_TYPE; + static constexpr node_offset_t OVERHEAD = sizeof(_slot_t) - sizeof(key_t); + + key_t key; + node_offset_t right_offset; +} __attribute__((packed)); +using slot_0_t = _slot_t<shard_pool_crush_t, field_type_t::N0>; +using slot_1_t = _slot_t<crush_t, field_type_t::N1>; +using slot_3_t = _slot_t<snap_gen_t, field_type_t::N3>; + +struct node_range_t { + node_offset_t start; + node_offset_t end; +}; + +template <typename FieldType> +const char* fields_start(const FieldType& node) { + return reinterpret_cast<const char*>(&node); +} + +template <node_type_t NODE_TYPE, typename FieldType> +node_range_t fields_free_range_before( + const FieldType& node, index_t index) { + assert(index <= node.num_keys); + node_offset_t offset_start = node.get_key_start_offset(index); + node_offset_t offset_end = + (index == 0 ? FieldType::SIZE + : node.get_item_start_offset(index - 1)); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + if (node.is_level_tail() && index == node.num_keys) { + offset_end -= sizeof(laddr_t); + } + } + assert(offset_start <= offset_end); + assert(offset_end - offset_start < FieldType::SIZE); + return {offset_start, offset_end}; +} + +/** + * _node_fields_013_t (node_fields_0_t, node_fields_1_t, leaf_fields_3_t + * + * The STAGE_LEFT layout implementation for node N0/N1, or the STAGE_RIGHT + * layout implementation for leaf node N3. + * + * The node layout storing n slots: + * + * # <----------------------------- node range --------------------------------------> # + * # #<~># free space # + * # <----- left part -----------------------------> # <~# <----- right slots -------> # + * # # <---- left slots -------------> #~> # # + * # # slots [2, n) |<~># #<~>| right slots [2, n) # + * # # <- slot 0 -> | <- slot 1 -> | # # | <-- s1 --> | <-- s0 --> # + * # # | | # # | | # + * # | num_ # | right | | right | # # | next-stage | next-stage # + * # header | keys # key | offset | key | offset | # # | container | container # + * # | # 0 | 0 | 1 | 1 |...#...#...| or onode 1 | or onode 0 # + * | | ^ ^ + * | | | | + * | +----------------+ | + * +--------------------------------------------+ + */ +template <typename SlotType> +struct _node_fields_013_t { + // TODO: decide by NODE_BLOCK_SIZE, sizeof(SlotType), sizeof(laddr_t) + // and the minimal size of variable_key. + using num_keys_t = uint8_t; + using key_t = typename SlotType::key_t; + using key_get_type = const key_t&; + using me_t = _node_fields_013_t<SlotType>; + static constexpr field_type_t FIELD_TYPE = SlotType::FIELD_TYPE; + static constexpr node_offset_t SIZE = NODE_BLOCK_SIZE; + static constexpr node_offset_t HEADER_SIZE = + sizeof(node_header_t) + sizeof(num_keys_t); + static constexpr node_offset_t ITEM_OVERHEAD = SlotType::OVERHEAD; + + bool is_level_tail() const { return header.get_is_level_tail(); } + node_offset_t total_size() const { return SIZE; } + key_get_type get_key(index_t index) const { + assert(index < num_keys); + return slots[index].key; + } + node_offset_t get_key_start_offset(index_t index) const { + assert(index <= num_keys); + auto offset = HEADER_SIZE + sizeof(SlotType) * index; + assert(offset < SIZE); + return offset; + } + node_offset_t get_item_start_offset(index_t index) const { + assert(index < num_keys); + auto offset = slots[index].right_offset; + assert(offset <= SIZE); + return offset; + } + const void* p_offset(index_t index) const { + assert(index < num_keys); + return &slots[index].right_offset; + } + node_offset_t get_item_end_offset(index_t index) const { + return index == 0 ? SIZE : get_item_start_offset(index - 1); + } + template <node_type_t NODE_TYPE> + node_offset_t free_size_before(index_t index) const { + auto range = fields_free_range_before<NODE_TYPE>(*this, index); + return range.end - range.start; + } + + static node_offset_t estimate_insert_one() { return sizeof(SlotType); } + template <KeyT KT> + static void insert_at( + NodeExtentMutable&, const full_key_t<KT>& key, + const me_t& node, index_t index, node_offset_t size_right); + static void update_size_at( + NodeExtentMutable&, const me_t& node, index_t index, int change); + static void append_key( + NodeExtentMutable&, const key_t& key, char*& p_append); + template <KeyT KT> + static void append_key( + NodeExtentMutable& mut, const full_key_t<KT>& key, char*& p_append) { + append_key(mut, key_t::template from_key<KT>(key), p_append); + } + static void append_offset( + NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append); + + node_header_t header; + num_keys_t num_keys = 0u; + SlotType slots[]; +} __attribute__((packed)); +using node_fields_0_t = _node_fields_013_t<slot_0_t>; +using node_fields_1_t = _node_fields_013_t<slot_1_t>; + +/** + * node_fields_2_t + * + * The STAGE_STRING layout implementation for node N2. + * + * The node layout storing n slots: + * + * # <--------------------------------- node range ----------------------------------------> # + * # #<~># free space # + * # <------- left part ---------------> # <~# <--------- right slots ---------------------> # + * # # <---- offsets ----> #~> #<~>| slots [2, n) # + * # # offsets [2, n) |<~># # | <----- slot 1 ----> | <----- slot 0 ----> # + * # # | # # | | # + * # | num_ # offset | offset | # # | next-stage | ns-oid | next-stage | ns-oid # + * # header | keys # 0 | 1 |...#...#...| container1 | 1 | container0 | 0 # + * | | ^ ^ + * | | | | + * | +----------------+ | + * +-----------------------------------------------+ + */ +struct node_fields_2_t { + // TODO: decide by NODE_BLOCK_SIZE, sizeof(node_off_t), sizeof(laddr_t) + // and the minimal size of variable_key. + using num_keys_t = uint8_t; + using key_t = ns_oid_view_t; + using key_get_type = key_t; + static constexpr field_type_t FIELD_TYPE = field_type_t::N2; + static constexpr node_offset_t SIZE = NODE_BLOCK_SIZE; + static constexpr node_offset_t HEADER_SIZE = + sizeof(node_header_t) + sizeof(num_keys_t); + static constexpr node_offset_t ITEM_OVERHEAD = sizeof(node_offset_t); + + bool is_level_tail() const { return header.get_is_level_tail(); } + node_offset_t total_size() const { return SIZE; } + key_get_type get_key(index_t index) const { + assert(index < num_keys); + node_offset_t item_end_offset = + (index == 0 ? SIZE : offsets[index - 1]); + assert(item_end_offset <= SIZE); + const char* p_start = fields_start(*this); + return key_t(p_start + item_end_offset); + } + node_offset_t get_key_start_offset(index_t index) const { + assert(index <= num_keys); + auto offset = HEADER_SIZE + sizeof(node_offset_t) * num_keys; + assert(offset <= SIZE); + return offset; + } + node_offset_t get_item_start_offset(index_t index) const { + assert(index < num_keys); + auto offset = offsets[index]; + assert(offset <= SIZE); + return offset; + } + const void* p_offset(index_t index) const { + assert(index < num_keys); + return &offsets[index]; + } + node_offset_t get_item_end_offset(index_t index) const { + return index == 0 ? SIZE : get_item_start_offset(index - 1); + } + template <node_type_t NODE_TYPE> + node_offset_t free_size_before(index_t index) const { + auto range = fields_free_range_before<NODE_TYPE>(*this, index); + return range.end - range.start; + } + + static node_offset_t estimate_insert_one() { return sizeof(node_offset_t); } + template <KeyT KT> + static void insert_at( + NodeExtentMutable& mut, const full_key_t<KT>& key, + const node_fields_2_t& node, index_t index, node_offset_t size_right) { + ceph_abort("not implemented"); + } + static void update_size_at( + NodeExtentMutable& mut, const node_fields_2_t& node, index_t index, int change) { + ceph_abort("not implemented"); + } + static void append_key( + NodeExtentMutable& mut, const key_t& key, char*& p_append) { + ns_oid_view_t::append(mut, key, p_append); + } + template <KeyT KT> + static void append_key( + NodeExtentMutable& mut, const full_key_t<KT>& key, char*& p_append) { + ns_oid_view_t::append<KT>(mut, key, p_append); + } + static void append_offset( + NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append); + + node_header_t header; + num_keys_t num_keys = 0u; + node_offset_t offsets[]; +} __attribute__((packed)); + +/** + * internal_fields_3_t + * + * The STAGE_RIGHT layout implementation for N2. + * + * The node layout storing 3 children: + * + * # <---------------- node range ---------------------------> # + * # # <-- keys ---> # <---- laddrs -----------> # + * # free space: # |<~># |<~># + * # # | # | # + * # | num_ # key | key | # laddr | laddr | laddr | # + * # header | keys # 0 | 1 |...# 0 | 1 | 2 |...# + */ +// TODO: decide by NODE_BLOCK_SIZE, sizeof(snap_gen_t), sizeof(laddr_t) +static constexpr unsigned MAX_NUM_KEYS_I3 = 170u; +template <unsigned MAX_NUM_KEYS> +struct _internal_fields_3_t { + using key_get_type = const snap_gen_t&; + using me_t = _internal_fields_3_t<MAX_NUM_KEYS>; + // TODO: decide by NODE_BLOCK_SIZE, sizeof(snap_gen_t), sizeof(laddr_t) + using num_keys_t = uint8_t; + static constexpr field_type_t FIELD_TYPE = field_type_t::N3; + static constexpr node_offset_t SIZE = sizeof(me_t); + static constexpr node_offset_t HEADER_SIZE = + sizeof(node_header_t) + sizeof(num_keys_t); + static constexpr node_offset_t ITEM_OVERHEAD = 0u; + + bool is_level_tail() const { return header.get_is_level_tail(); } + node_offset_t total_size() const { + if (is_level_tail()) { + return SIZE - sizeof(snap_gen_t); + } else { + return SIZE; + } + } + key_get_type get_key(index_t index) const { + assert(index < num_keys); + return keys[index]; + } + template <node_type_t NODE_TYPE> + std::enable_if_t<NODE_TYPE == node_type_t::INTERNAL, node_offset_t> + free_size_before(index_t index) const { + assert(index <= num_keys); + assert(num_keys <= (is_level_tail() ? MAX_NUM_KEYS - 1 : MAX_NUM_KEYS)); + auto free = (MAX_NUM_KEYS - index) * (sizeof(snap_gen_t) + sizeof(laddr_t)); + if (is_level_tail() && index == num_keys) { + free -= (sizeof(snap_gen_t) + sizeof(laddr_t)); + } + assert(free < SIZE); + return free; + } + + static node_offset_t estimate_insert_one() { + return sizeof(snap_gen_t) + sizeof(laddr_t); + } + template <KeyT KT> + static void insert_at( + NodeExtentMutable& mut, const full_key_t<KT>& key, + const me_t& node, index_t index, node_offset_t size_right) { + ceph_abort("not implemented"); + } + static void update_size_at( + NodeExtentMutable& mut, const me_t& node, index_t index, int change) { + ceph_abort("not implemented"); + } + + node_header_t header; + num_keys_t num_keys = 0u; + snap_gen_t keys[MAX_NUM_KEYS]; + laddr_packed_t child_addrs[MAX_NUM_KEYS]; +} __attribute__((packed)); +static_assert(_internal_fields_3_t<MAX_NUM_KEYS_I3>::SIZE <= NODE_BLOCK_SIZE && + _internal_fields_3_t<MAX_NUM_KEYS_I3 + 1>::SIZE > NODE_BLOCK_SIZE); +using internal_fields_3_t = _internal_fields_3_t<MAX_NUM_KEYS_I3>; + +using leaf_fields_3_t = _node_fields_013_t<slot_3_t>; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h new file mode 100644 index 000000000..cac167a98 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h @@ -0,0 +1,2186 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <cassert> +#include <optional> +#include <ostream> +#include <sstream> +#include <type_traits> + +#include "common/likely.h" + +#include "sub_items_stage.h" +#include "item_iterator_stage.h" + +namespace crimson::os::seastore::onode { + +struct search_result_bs_t { + index_t index; + MatchKindBS match; +}; +template <typename FGetKey> +search_result_bs_t binary_search( + const full_key_t<KeyT::HOBJ>& key, + index_t begin, index_t end, FGetKey&& f_get_key) { + assert(begin <= end); + while (begin < end) { + auto total = begin + end; + auto mid = total >> 1; + // do not copy if return value is reference + decltype(f_get_key(mid)) target = f_get_key(mid); + auto match = compare_to<KeyT::HOBJ>(key, target); + if (match == MatchKindCMP::LT) { + end = mid; + } else if (match == MatchKindCMP::GT) { + begin = mid + 1; + } else { + return {mid, MatchKindBS::EQ}; + } + } + return {begin , MatchKindBS::NE}; +} + +template <typename PivotType, typename FGet> +search_result_bs_t binary_search_r( + index_t rend, index_t rbegin, FGet&& f_get, const PivotType& key) { + assert(rend <= rbegin); + while (rend < rbegin) { + auto total = rend + rbegin + 1; + auto mid = total >> 1; + // do not copy if return value is reference + decltype(f_get(mid)) target = f_get(mid); + int match = target - key; + if (match < 0) { + rend = mid; + } else if (match > 0) { + rbegin = mid - 1; + } else { + return {mid, MatchKindBS::EQ}; + } + } + return {rbegin, MatchKindBS::NE}; +} + +inline bool matchable(field_type_t type, match_stat_t mstat) { + assert(mstat >= MSTAT_MIN && mstat <= MSTAT_MAX); + /* + * compressed prefix by field type: + * N0: NONE + * N1: pool/shard + * N2: pool/shard crush + * N3: pool/shard crush ns/oid + * + * if key matches the node's compressed prefix, return true + * else, return false + */ +#ifndef NDEBUG + if (mstat == MSTAT_END) { + assert(type == field_type_t::N0); + } +#endif + return mstat + to_unsigned(type) < 4; +} + +inline void assert_mstat( + const full_key_t<KeyT::HOBJ>& key, + const full_key_t<KeyT::VIEW>& index, + match_stat_t mstat) { + assert(mstat >= MSTAT_MIN && mstat <= MSTAT_LT2); + // key < index ... + switch (mstat) { + case MSTAT_EQ: + break; + case MSTAT_LT0: + assert(compare_to<KeyT::HOBJ>(key, index.snap_gen_packed()) == MatchKindCMP::LT); + break; + case MSTAT_LT1: + assert(compare_to<KeyT::HOBJ>(key, index.ns_oid_view()) == MatchKindCMP::LT); + break; + case MSTAT_LT2: + if (index.has_shard_pool()) { + assert(compare_to<KeyT::HOBJ>(key, shard_pool_crush_t{ + index.shard_pool_packed(), index.crush_packed()}) == MatchKindCMP::LT); + } else { + assert(compare_to<KeyT::HOBJ>(key, index.crush_packed()) == MatchKindCMP::LT); + } + break; + default: + ceph_abort("impossible path"); + } + // key == index ... + switch (mstat) { + case MSTAT_EQ: + assert(compare_to<KeyT::HOBJ>(key, index.snap_gen_packed()) == MatchKindCMP::EQ); + case MSTAT_LT0: + if (!index.has_ns_oid()) + break; + assert(index.ns_oid_view().type() == ns_oid_view_t::Type::MAX || + compare_to<KeyT::HOBJ>(key, index.ns_oid_view()) == MatchKindCMP::EQ); + case MSTAT_LT1: + if (!index.has_crush()) + break; + assert(compare_to<KeyT::HOBJ>(key, index.crush_packed()) == MatchKindCMP::EQ); + if (!index.has_shard_pool()) + break; + assert(compare_to<KeyT::HOBJ>(key, index.shard_pool_packed()) == MatchKindCMP::EQ); + default: + break; + } +} + +#define NXT_STAGE_T staged<next_param_t> + +enum class TrimType { BEFORE, AFTER, AT }; + +/** + * staged + * + * Implements recursive logic that modifies or reads the node layout + * (N0/N1/N2/N3 * LEAF/INTERNAL) with the multi-stage design. The specific + * stage implementation is flexible. So the implementations for different + * stages can be assembled independently, as long as they follow the + * definitions of container interfaces. + * + * Multi-stage is designed to index different portions of onode keys + * stage-by-stage. There are at most 3 stages for a node: + * - STAGE_LEFT: index shard-pool-crush for N0, or index crush for N1 node; + * - STAGE_STRING: index ns-oid for N0/N1/N2 nodes; + * - STAGE_RIGHT: index snap-gen for N0/N1/N2/N3 nodes; + * + * The intention is to consolidate the high-level indexing implementations at + * the level of stage, so we don't need to write them repeatedly for every + * stage and for every node type. + */ +template <typename Params> +struct staged { + static_assert(Params::STAGE >= STAGE_BOTTOM); + static_assert(Params::STAGE <= STAGE_TOP); + using container_t = typename Params::container_t; + using key_get_type = typename container_t::key_get_type; + using next_param_t = typename Params::next_param_t; + using position_t = staged_position_t<Params::STAGE>; + using result_t = staged_result_t<Params::NODE_TYPE, Params::STAGE>; + using value_t = value_type_t<Params::NODE_TYPE>; + static constexpr auto CONTAINER_TYPE = container_t::CONTAINER_TYPE; + static constexpr bool IS_BOTTOM = (Params::STAGE == STAGE_BOTTOM); + static constexpr auto NODE_TYPE = Params::NODE_TYPE; + static constexpr auto STAGE = Params::STAGE; + + template <bool is_exclusive> + static void _left_or_right(index_t& split_index, index_t insert_index, + std::optional<bool>& is_insert_left) { + assert(!is_insert_left.has_value()); + assert(is_valid_index(split_index)); + if constexpr (is_exclusive) { + if (split_index <= insert_index) { + // ...[s_index-1] |!| (i_index) [s_index]... + // offset i_position to right + is_insert_left = false; + } else { + // ...[s_index-1] (i_index)) |?[s_index]| ... + // ...(i_index)...[s_index-1] |?[s_index]| ... + is_insert_left = true; + --split_index; + } + } else { + if (split_index < insert_index) { + // ...[s_index-1] |?[s_index]| ...[(i_index)[s_index_k]... + is_insert_left = false; + } else if (split_index > insert_index) { + // ...[(i_index)s_index-1] |?[s_index]| ... + // ...[(i_index)s_index_k]...[s_index-1] |?[s_index]| ... + is_insert_left = true; + } else { + // ...[s_index-1] |?[(i_index)s_index]| ... + // i_to_left = std::nullopt; + } + } + } + + template <ContainerType CTYPE, typename Enable = void> class _iterator_t; + template <ContainerType CTYPE> + class _iterator_t<CTYPE, std::enable_if_t<CTYPE == ContainerType::INDEXABLE>> { + /* + * indexable container type system: + * CONTAINER_TYPE = ContainerType::INDEXABLE + * keys() const -> index_t + * operator[](index_t) const -> key_get_type + * size_before(index_t) const -> node_offset_t + * size_overhead_at(index_t) const -> node_offset_t + * (IS_BOTTOM) get_p_value(index_t) const -> const value_t* + * (!IS_BOTTOM) size_to_nxt_at(index_t) const -> node_offset_t + * (!IS_BOTTOM) get_nxt_container(index_t) const + * encode(p_node_start, encoded) + * decode(p_node_start, delta) -> container_t + * static: + * header_size() -> node_offset_t + * estimate_insert(key, value) -> node_offset_t + * (IS_BOTTOM) insert_at(mut, src, key, value, + * index, size, p_left_bound) -> const value_t* + * (!IS_BOTTOM) insert_prefix_at(mut, src, key, + * index, size, p_left_bound) -> memory_range_t + * (!IS_BOTTOM) update_size_at(mut, src, index, size) + * trim_until(mut, container, index) -> trim_size + * (!IS_BOTTOM) trim_at(mut, container, index, trimmed) -> trim_size + * + * Appender::append(const container_t& src, from, items) + */ + public: + using me_t = _iterator_t<CTYPE>; + + _iterator_t(const container_t& container) : container{container} { + assert(container.keys()); + } + + index_t index() const { + return _index; + } + key_get_type get_key() const { + assert(!is_end()); + return container[_index]; + } + node_offset_t size_to_nxt() const { + assert(!is_end()); + return container.size_to_nxt_at(_index); + } + template <typename T = typename NXT_STAGE_T::container_t> + std::enable_if_t<!IS_BOTTOM, T> get_nxt_container() const { + assert(!is_end()); + return container.get_nxt_container(_index); + } + template <typename T = value_t> + std::enable_if_t<IS_BOTTOM, const T*> get_p_value() const { + assert(!is_end()); + return container.get_p_value(_index); + } + bool is_last() const { + return _index + 1 == container.keys(); + } + bool is_end() const { return _index == container.keys(); } + node_offset_t size() const { + assert(!is_end()); + assert(header_size() == container.size_before(0)); + assert(container.size_before(_index + 1) > container.size_before(_index)); + return container.size_before(_index + 1) - + container.size_before(_index); + } + node_offset_t size_overhead() const { + assert(!is_end()); + return container.size_overhead_at(_index); + } + + me_t& operator++() { + assert(!is_end()); + assert(!is_last()); + ++_index; + return *this; + } + void seek_at(index_t index) { + assert(index < container.keys()); + seek_till_end(index); + } + void seek_till_end(index_t index) { + assert(!is_end()); + assert(this->index() == 0); + assert(index <= container.keys()); + _index = index; + } + void seek_last() { + assert(!is_end()); + assert(index() == 0); + _index = container.keys() - 1; + } + void set_end() { + assert(!is_end()); + assert(is_last()); + ++_index; + } + // Note: possible to return an end iterator + MatchKindBS seek(const full_key_t<KeyT::HOBJ>& key, bool exclude_last) { + assert(!is_end()); + assert(index() == 0); + index_t end_index = container.keys(); + if (exclude_last) { + assert(end_index); + --end_index; + assert(compare_to<KeyT::HOBJ>(key, container[end_index]) == MatchKindCMP::LT); + } + auto ret = binary_search(key, _index, end_index, + [this] (index_t index) { return container[index]; }); + _index = ret.index; + return ret.match; + } + + template <KeyT KT, typename T = value_t> + std::enable_if_t<IS_BOTTOM, const T*> insert( + NodeExtentMutable& mut, const full_key_t<KT>& key, + const value_t& value, node_offset_t insert_size, const char* p_left_bound) { + return container_t::template insert_at<KT>( + mut, container, key, value, _index, insert_size, p_left_bound); + } + + template <KeyT KT, typename T = memory_range_t> + std::enable_if_t<!IS_BOTTOM, T> insert_prefix( + NodeExtentMutable& mut, const full_key_t<KT>& key, + node_offset_t size, const char* p_left_bound) { + return container_t::template insert_prefix_at<KT>( + mut, container, key, _index, size, p_left_bound); + } + + template <typename T = void> + std::enable_if_t<!IS_BOTTOM, T> + update_size(NodeExtentMutable& mut, node_offset_t insert_size) { + assert(!is_end()); + container_t::update_size_at(mut, container, _index, insert_size); + } + + // Note: possible to return an end iterator when is_exclusive is true + template <bool is_exclusive> + size_t seek_split_inserted( + size_t start_size, size_t extra_size, size_t target_size, + index_t& insert_index, size_t insert_size, + std::optional<bool>& is_insert_left) { + assert(!is_end()); + assert(index() == 0); + // replace insert_index placeholder + if constexpr (!is_exclusive) { + if (insert_index == INDEX_LAST) { + insert_index = container.keys() - 1; + } + } else { + if (insert_index == INDEX_END) { + insert_index = container.keys(); + } + } + assert(insert_index <= container.keys()); + + auto start_size_1 = start_size + extra_size; + auto f_get_used_size = [this, start_size, start_size_1, + insert_index, insert_size] (index_t index) { + size_t current_size; + if (unlikely(index == 0)) { + current_size = start_size; + } else { + current_size = start_size_1; + if (index > insert_index) { + current_size += insert_size; + if constexpr (is_exclusive) { + --index; + } + } + // already includes header size + current_size += container.size_before(index); + } + return current_size; + }; + index_t s_end; + if constexpr (is_exclusive) { + s_end = container.keys(); + } else { + s_end = container.keys() - 1; + } + _index = binary_search_r(0, s_end, f_get_used_size, target_size).index; + size_t current_size = f_get_used_size(_index); + assert(current_size <= target_size); + + _left_or_right<is_exclusive>(_index, insert_index, is_insert_left); + return current_size; + } + + size_t seek_split(size_t start_size, size_t extra_size, size_t target_size) { + assert(!is_end()); + assert(index() == 0); + auto start_size_1 = start_size + extra_size; + auto f_get_used_size = [this, start_size, start_size_1] (index_t index) { + size_t current_size; + if (unlikely(index == 0)) { + current_size = start_size; + } else { + // already includes header size + current_size = start_size_1 + container.size_before(index); + } + return current_size; + }; + _index = binary_search_r( + 0, container.keys() - 1, f_get_used_size, target_size).index; + size_t current_size = f_get_used_size(_index); + assert(current_size <= target_size); + return current_size; + } + + // Note: possible to return an end iterater if to_index == INDEX_END + template <KeyT KT> + void copy_out_until( + typename container_t::template Appender<KT>& appender, index_t& to_index) { + auto num_keys = container.keys(); + index_t items; + if (to_index == INDEX_END) { + items = num_keys - _index; + appender.append(container, _index, items); + _index = num_keys; + to_index = _index; + } else if (to_index == INDEX_LAST) { + assert(!is_end()); + items = num_keys - 1 - _index; + appender.append(container, _index, items); + _index = num_keys - 1; + to_index = _index; + } else { + assert(_index <= to_index); + assert(to_index <= num_keys); + items = to_index - _index; + appender.append(container, _index, items); + _index = to_index; + } + } + + node_offset_t trim_until(NodeExtentMutable& mut) { + return container_t::trim_until(mut, container, _index); + } + + template <typename T = node_offset_t> + std::enable_if_t<!IS_BOTTOM, T> + trim_at(NodeExtentMutable& mut, node_offset_t trimmed) { + return container_t::trim_at(mut, container, _index, trimmed); + } + + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + container.encode(p_node_start, encoded); + ceph::encode(_index, encoded); + } + + static me_t decode(const char* p_node_start, + ceph::bufferlist::const_iterator& delta) { + auto container = container_t::decode(p_node_start, delta); + auto ret = me_t(container); + index_t index; + ceph::decode(index, delta); + ret.seek_till_end(index); + return ret; + } + + static node_offset_t header_size() { + return container_t::header_size(); + } + + template <KeyT KT> + static node_offset_t estimate_insert( + const full_key_t<KT>& key, const value_t& value) { + return container_t::template estimate_insert<KT>(key, value); + } + + private: + container_t container; + index_t _index = 0; + }; + + template <ContainerType CTYPE> + class _iterator_t<CTYPE, std::enable_if_t<CTYPE == ContainerType::ITERATIVE>> { + /* + * iterative container type system (!IS_BOTTOM): + * CONTAINER_TYPE = ContainerType::ITERATIVE + * index() const -> index_t + * get_key() const -> key_get_type + * size() const -> node_offset_t + * size_to_nxt() const -> node_offset_t + * size_overhead() const -> node_offset_t + * get_nxt_container() const + * has_next() const -> bool + * encode(p_node_start, encoded) + * decode(p_node_start, delta) -> container_t + * operator++() + * static: + * header_size() -> node_offset_t + * estimate_insert(key, value) -> node_offset_t + * insert_prefix(mut, src, key, is_end, size, p_left_bound) -> memory_range_t + * update_size(mut, src, size) + * trim_until(mut, container) -> trim_size + * trim_at(mut, container, trimmed) -> trim_size + */ + // currently the iterative iterator is only implemented with STAGE_STRING + // for in-node space efficiency + static_assert(STAGE == STAGE_STRING); + public: + using me_t = _iterator_t<CTYPE>; + + _iterator_t(const container_t& container) : container{container} {} + + index_t index() const { + if (is_end()) { + return container.index() + 1; + } else { + return container.index(); + } + } + key_get_type get_key() const { + assert(!is_end()); + return container.get_key(); + } + node_offset_t size_to_nxt() const { + assert(!is_end()); + return container.size_to_nxt(); + } + const typename NXT_STAGE_T::container_t get_nxt_container() const { + assert(!is_end()); + return container.get_nxt_container(); + } + bool is_last() const { + assert(!is_end()); + return !container.has_next(); + } + bool is_end() const { +#ifndef NDEBUG + if (_is_end) { + assert(!container.has_next()); + } +#endif + return _is_end; + } + node_offset_t size() const { + assert(!is_end()); + return container.size(); + } + node_offset_t size_overhead() const { + assert(!is_end()); + return container.size_overhead(); + } + + me_t& operator++() { + assert(!is_end()); + assert(!is_last()); + ++container; + return *this; + } + void seek_at(index_t index) { + assert(!is_end()); + assert(this->index() == 0); + while (index > 0) { + assert(container.has_next()); + ++container; + --index; + } + } + void seek_till_end(index_t index) { + assert(!is_end()); + assert(this->index() == 0); + while (index > 0) { + if (!container.has_next()) { + assert(index == 1); + set_end(); + break; + } + ++container; + --index; + } + } + void seek_last() { + assert(!is_end()); + assert(index() == 0); + while (container.has_next()) { + ++container; + } + } + void set_end() { + assert(!is_end()); + assert(is_last()); + _is_end = true; + } + // Note: possible to return an end iterator + MatchKindBS seek(const full_key_t<KeyT::HOBJ>& key, bool exclude_last) { + assert(!is_end()); + assert(index() == 0); + do { + if (exclude_last && is_last()) { + assert(compare_to<KeyT::HOBJ>(key, get_key()) == MatchKindCMP::LT); + return MatchKindBS::NE; + } + auto match = compare_to<KeyT::HOBJ>(key, get_key()); + if (match == MatchKindCMP::LT) { + return MatchKindBS::NE; + } else if (match == MatchKindCMP::EQ) { + return MatchKindBS::EQ; + } else { + if (container.has_next()) { + ++container; + } else { + // end + break; + } + } + } while (true); + assert(!exclude_last); + set_end(); + return MatchKindBS::NE; + } + + template <KeyT KT> + memory_range_t insert_prefix( + NodeExtentMutable& mut, const full_key_t<KT>& key, + node_offset_t size, const char* p_left_bound) { + return container_t::template insert_prefix<KT>( + mut, container, key, is_end(), size, p_left_bound); + } + + void update_size(NodeExtentMutable& mut, node_offset_t insert_size) { + assert(!is_end()); + container_t::update_size(mut, container, insert_size); + } + + // Note: possible to return an end iterator when is_exclusive is true + // insert_index can still be INDEX_LAST or INDEX_END + template <bool is_exclusive> + size_t seek_split_inserted( + size_t start_size, size_t extra_size, size_t target_size, + index_t& insert_index, size_t insert_size, + std::optional<bool>& is_insert_left) { + assert(!is_end()); + assert(index() == 0); + size_t current_size = start_size; + index_t split_index = 0; + extra_size += header_size(); + do { + if constexpr (!is_exclusive) { + if (is_last()) { + assert(split_index == index()); + if (insert_index == INDEX_LAST) { + insert_index = index(); + } + assert(insert_index <= index()); + break; + } + } + + size_t nxt_size = current_size; + if (split_index == 0) { + nxt_size += extra_size; + } + if (split_index == insert_index) { + nxt_size += insert_size; + if constexpr (is_exclusive) { + if (nxt_size > target_size) { + break; + } + current_size = nxt_size; + ++split_index; + } + } + nxt_size += size(); + if (nxt_size > target_size) { + break; + } + current_size = nxt_size; + + if constexpr (is_exclusive) { + if (is_last()) { + assert(split_index == index()); + set_end(); + split_index = index(); + if (insert_index == INDEX_END) { + insert_index = index(); + } + assert(insert_index == index()); + break; + } else { + ++(*this); + ++split_index; + } + } else { + ++(*this); + ++split_index; + } + } while (true); + assert(current_size <= target_size); + + _left_or_right<is_exclusive>(split_index, insert_index, is_insert_left); + assert(split_index == index()); + return current_size; + } + + size_t seek_split(size_t start_size, size_t extra_size, size_t target_size) { + assert(!is_end()); + assert(index() == 0); + size_t current_size = start_size; + do { + if (is_last()) { + break; + } + + size_t nxt_size = current_size; + if (index() == 0) { + nxt_size += extra_size; + } + nxt_size += size(); + if (nxt_size > target_size) { + break; + } + current_size = nxt_size; + ++(*this); + } while (true); + assert(current_size <= target_size); + return current_size; + } + + // Note: possible to return an end iterater if to_index == INDEX_END + template <KeyT KT> + void copy_out_until( + typename container_t::template Appender<KT>& appender, index_t& to_index) { + if (is_end()) { + assert(!container.has_next()); + if (to_index == INDEX_END) { + to_index = index(); + } + assert(to_index == index()); + return; + } + index_t items; + if (to_index == INDEX_END || to_index == INDEX_LAST) { + items = to_index; + } else { + assert(is_valid_index(to_index)); + assert(index() <= to_index); + items = to_index - index(); + } + if (appender.append(container, items)) { + set_end(); + } + to_index = index(); + } + + node_offset_t trim_until(NodeExtentMutable& mut) { + if (is_end()) { + return 0; + } + return container_t::trim_until(mut, container); + } + + node_offset_t trim_at(NodeExtentMutable& mut, node_offset_t trimmed) { + assert(!is_end()); + return container_t::trim_at(mut, container, trimmed); + } + + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + container.encode(p_node_start, encoded); + uint8_t is_end = _is_end; + ceph::encode(is_end, encoded); + } + + static me_t decode(const char* p_node_start, + ceph::bufferlist::const_iterator& delta) { + auto container = container_t::decode(p_node_start, delta); + auto ret = me_t(container); + uint8_t is_end; + ceph::decode(is_end, delta); + if (is_end) { + ret.set_end(); + } + return ret; + } + + static node_offset_t header_size() { + return container_t::header_size(); + } + + template <KeyT KT> + static node_offset_t estimate_insert(const full_key_t<KT>& key, const value_t& value) { + return container_t::template estimate_insert<KT>(key, value); + } + + private: + container_t container; + bool _is_end = false; + }; + + /* + * iterator_t encapsulates both indexable and iterative implementations + * from a *non-empty* container. + * cstr(const container_t&) + * access: + * index() -> index_t + * get_key() -> key_get_type (const reference or value type) + * is_last() -> bool + * is_end() -> bool + * size() -> node_offset_t + * size_overhead() -> node_offset_t + * (IS_BOTTOM) get_p_value() -> const value_t* + * (!IS_BOTTOM) get_nxt_container() -> nxt_stage::container_t + * (!IS_BOTTOM) size_to_nxt() -> node_offset_t + * seek: + * operator++() -> iterator_t& + * seek_at(index) + * seek_till_end(index) + * seek_last() + * set_end() + * seek(key, exclude_last) -> MatchKindBS + * insert: + * (IS_BOTTOM) insert(mut, key, value, size, p_left_bound) -> p_value + * (!IS_BOTTOM) insert_prefix(mut, key, size, p_left_bound) -> memory_range_t + * (!IS_BOTTOM) update_size(mut, size) + * split: + * seek_split_inserted<bool is_exclusive>( + * start_size, extra_size, target_size, insert_index, insert_size, + * std::optional<bool>& is_insert_left) + * -> insert to left/right/unknown (!exclusive) + * -> insert to left/right (exclusive, can be end) + * -> split_size + * seek_split(start_size, extra_size, target_size) -> split_size + * copy_out_until(appender, to_index) (can be end) + * trim_until(mut) -> trim_size + * (!IS_BOTTOM) trim_at(mut, trimmed) -> trim_size + * denc: + * encode(p_node_start, encoded) + * decode(p_node_start, delta) -> iterator_t + * static: + * header_size() -> node_offset_t + * estimate_insert(key, value) -> node_offset_t + */ + using iterator_t = _iterator_t<CONTAINER_TYPE>; + /* TODO: detailed comments + * - trim_until(mut) -> trim_size + * * keep 0 to i - 1, and remove the rest, return the size trimmed. + * * if this is the end iterator, do nothing and return 0. + * * if this is the start iterator, normally needs to go to the higher + * stage to trim the entire container. + * - trim_at(mut, trimmed) -> trim_size + * * trim happens inside the current iterator, causing the size reduced by + * <trimmed>, return the total size trimmed. + */ + + /* + * Lookup internals (hide?) + */ + + template <bool GET_KEY> + static result_t smallest_result( + const iterator_t& iter, full_key_t<KeyT::VIEW>* index_key) { + static_assert(!IS_BOTTOM); + assert(!iter.is_end()); + auto pos_smallest = NXT_STAGE_T::position_t::begin(); + auto nxt_container = iter.get_nxt_container(); + auto value_ptr = NXT_STAGE_T::template get_p_value<GET_KEY>( + nxt_container, pos_smallest, index_key); + if constexpr (GET_KEY) { + index_key->set(iter.get_key()); + } + return result_t{{iter.index(), pos_smallest}, value_ptr, STAGE}; + } + + template <bool GET_KEY> + static result_t nxt_lower_bound( + const full_key_t<KeyT::HOBJ>& key, iterator_t& iter, + MatchHistory& history, full_key_t<KeyT::VIEW>* index_key) { + static_assert(!IS_BOTTOM); + assert(!iter.is_end()); + auto nxt_container = iter.get_nxt_container(); + auto nxt_result = NXT_STAGE_T::template lower_bound<GET_KEY>( + nxt_container, key, history, index_key); + if (nxt_result.is_end()) { + if (iter.is_last()) { + return result_t::end(); + } else { + return smallest_result<GET_KEY>(++iter, index_key); + } + } else { + if constexpr (GET_KEY) { + index_key->set(iter.get_key()); + } + return result_t::from_nxt(iter.index(), nxt_result); + } + } + + template <bool GET_POS, bool GET_KEY, bool GET_VAL> + static void lookup_largest_slot( + const container_t& container, position_t* p_position, + full_key_t<KeyT::VIEW>* p_index_key, const value_t** pp_value) { + auto iter = iterator_t(container); + iter.seek_last(); + if constexpr (GET_KEY) { + assert(p_index_key); + p_index_key->set(iter.get_key()); + } + if constexpr (GET_POS) { + assert(p_position); + p_position->index = iter.index(); + } + if constexpr (IS_BOTTOM) { + if constexpr (GET_VAL) { + assert(pp_value); + *pp_value = iter.get_p_value(); + } + } else { + auto nxt_container = iter.get_nxt_container(); + if constexpr (GET_POS) { + NXT_STAGE_T::template lookup_largest_slot<true, GET_KEY, GET_VAL>( + nxt_container, &p_position->nxt, p_index_key, pp_value); + } else { + NXT_STAGE_T::template lookup_largest_slot<false, GET_KEY, GET_VAL>( + nxt_container, nullptr, p_index_key, pp_value); + } + } + } + + template <bool GET_KEY = false> + static const value_t* get_p_value( + const container_t& container, const position_t& position, + full_key_t<KeyT::VIEW>* index_key = nullptr) { + auto iter = iterator_t(container); + iter.seek_at(position.index); + if constexpr (GET_KEY) { + index_key->set(iter.get_key()); + } + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + return NXT_STAGE_T::template get_p_value<GET_KEY>( + nxt_container, position.nxt, index_key); + } else { + return iter.get_p_value(); + } + } + + static void get_key_view( + const container_t& container, + const position_t& position, + full_key_t<KeyT::VIEW>& index_key) { + auto iter = iterator_t(container); + iter.seek_at(position.index); + index_key.set(iter.get_key()); + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + return NXT_STAGE_T::get_key_view(nxt_container, position.nxt, index_key); + } + } + + template <bool GET_KEY = false> + static result_t lower_bound( + const container_t& container, + const full_key_t<KeyT::HOBJ>& key, + MatchHistory& history, + full_key_t<KeyT::VIEW>* index_key = nullptr) { + bool exclude_last = false; + if (history.get<STAGE>().has_value()) { + if (*history.get<STAGE>() == MatchKindCMP::EQ) { + // lookup is short-circuited + if constexpr (!IS_BOTTOM) { + assert(history.get<STAGE - 1>().has_value()); + if (history.is_GT<STAGE - 1>()) { + auto iter = iterator_t(container); + bool test_key_equal; + if constexpr (STAGE == STAGE_STRING) { + // TODO(cross-node string dedup) + // test_key_equal = (iter.get_key().type() == ns_oid_view_t::Type::MIN); + auto cmp = compare_to<KeyT::HOBJ>(key, iter.get_key()); + assert(cmp != MatchKindCMP::GT); + test_key_equal = (cmp == MatchKindCMP::EQ); + } else { + auto cmp = compare_to<KeyT::HOBJ>(key, iter.get_key()); + // From history, key[stage] == parent[stage][index - 1] + // which should be the smallest possible value for all + // index[stage][*] + assert(cmp != MatchKindCMP::GT); + test_key_equal = (cmp == MatchKindCMP::EQ); + } + if (test_key_equal) { + return nxt_lower_bound<GET_KEY>(key, iter, history, index_key); + } else { + // key[stage] < index[stage][left-most] + return smallest_result<GET_KEY>(iter, index_key); + } + } + } + // IS_BOTTOM || !history.is_GT<STAGE - 1>() + auto iter = iterator_t(container); + iter.seek_last(); + if constexpr (STAGE == STAGE_STRING) { + // TODO(cross-node string dedup) + // assert(iter.get_key().type() == ns_oid_view_t::Type::MAX); + assert(compare_to<KeyT::HOBJ>(key, iter.get_key()) == MatchKindCMP::EQ); + } else { + assert(compare_to<KeyT::HOBJ>(key, iter.get_key()) == MatchKindCMP::EQ); + } + if constexpr (GET_KEY) { + index_key->set(iter.get_key()); + } + if constexpr (IS_BOTTOM) { + auto value_ptr = iter.get_p_value(); + return result_t{{iter.index()}, value_ptr, MSTAT_EQ}; + } else { + auto nxt_container = iter.get_nxt_container(); + auto nxt_result = NXT_STAGE_T::template lower_bound<GET_KEY>( + nxt_container, key, history, index_key); + // !history.is_GT<STAGE - 1>() means + // key[stage+1 ...] <= index[stage+1 ...][*] + assert(!nxt_result.is_end()); + return result_t::from_nxt(iter.index(), nxt_result); + } + } else if (*history.get<STAGE>() == MatchKindCMP::LT) { + exclude_last = true; + } + } + auto iter = iterator_t(container); + auto bs_match = iter.seek(key, exclude_last); + if (iter.is_end()) { + assert(!exclude_last); + assert(bs_match == MatchKindBS::NE); + history.set<STAGE>(MatchKindCMP::GT); + return result_t::end(); + } + history.set<STAGE>(bs_match == MatchKindBS::EQ ? + MatchKindCMP::EQ : MatchKindCMP::LT); + if constexpr (IS_BOTTOM) { + if constexpr (GET_KEY) { + index_key->set(iter.get_key()); + } + auto value_ptr = iter.get_p_value(); + return result_t{{iter.index()}, value_ptr, + (bs_match == MatchKindBS::EQ ? MSTAT_EQ : MSTAT_LT0)}; + } else { + if (bs_match == MatchKindBS::EQ) { + return nxt_lower_bound<GET_KEY>(key, iter, history, index_key); + } else { + return smallest_result<GET_KEY>(iter, index_key); + } + } + } + + template <KeyT KT> + static node_offset_t insert_size(const full_key_t<KT>& key, const value_t& value) { + if constexpr (IS_BOTTOM) { + return iterator_t::template estimate_insert<KT>(key, value); + } else { + return iterator_t::template estimate_insert<KT>(key, value) + + NXT_STAGE_T::iterator_t::header_size() + + NXT_STAGE_T::template insert_size<KT>(key, value); + } + } + + template <KeyT KT> + static node_offset_t insert_size_at( + match_stage_t stage, const full_key_t<KeyT::HOBJ>& key, const value_t& value) { + if (stage == STAGE) { + return insert_size<KT>(key, value); + } else { + assert(stage < STAGE); + return NXT_STAGE_T::template insert_size_at<KT>(stage, key, value); + } + } + + template <typename T = std::tuple<match_stage_t, node_offset_t>> + static std::enable_if_t<NODE_TYPE == node_type_t::INTERNAL, T> evaluate_insert( + const container_t& container, const full_key_t<KeyT::VIEW>& key, + const value_t& value, position_t& position, bool evaluate_last) { + auto iter = iterator_t(container); + auto& index = position.index; + if (evaluate_last || index == INDEX_END) { + iter.seek_last(); + index = iter.index(); + // evaluate the previous index + } else { + assert(is_valid_index(index)); + // evaluate the current index + iter.seek_at(index); + auto match = compare_to<KeyT::VIEW>(key, iter.get_key()); + if (match == MatchKindCMP::EQ) { + if constexpr (IS_BOTTOM) { + ceph_abort("insert conflict at current index!"); + } else { + // insert into the current index + auto nxt_container = iter.get_nxt_container(); + return NXT_STAGE_T::evaluate_insert( + nxt_container, key, value, position.nxt, false); + } + } else { + assert(match == MatchKindCMP::LT); + if (index == 0) { + // already the first index, so insert at the current index + return {STAGE, insert_size<KeyT::VIEW>(key, value)}; + } + --index; + iter = iterator_t(container); + iter.seek_at(index); + // proceed to evaluate the previous index + } + } + + // XXX(multi-type): when key is from a different type of node + auto match = compare_to<KeyT::VIEW>(key, iter.get_key()); + if (match == MatchKindCMP::GT) { + // key doesn't match both indexes, so insert at the current index + ++index; + return {STAGE, insert_size<KeyT::VIEW>(key, value)}; + } else { + assert(match == MatchKindCMP::EQ); + if constexpr (IS_BOTTOM) { + // ceph_abort? + ceph_abort("insert conflict at the previous index!"); + } else { + // insert into the previous index + auto nxt_container = iter.get_nxt_container(); + return NXT_STAGE_T::evaluate_insert( + nxt_container, key, value, position.nxt, true); + } + } + } + + template <typename T = bool> + static std::enable_if_t<NODE_TYPE == node_type_t::LEAF, T> + compensate_insert_position_at(match_stage_t stage, position_t& position) { + auto& index = position.index; + if (stage == STAGE) { + assert(index == 0); + // insert at the end of the current stage + index = INDEX_END; + return true; + } else { + if constexpr (IS_BOTTOM) { + ceph_abort("impossible path"); + } else { + assert(stage < STAGE); + bool compensate = NXT_STAGE_T:: + compensate_insert_position_at(stage, position.nxt); + if (compensate) { + assert(is_valid_index(index)); + if (index == 0) { + // insert into the *last* index of the current stage + index = INDEX_LAST; + return true; + } else { + --index; + return false; + } + } else { + return false; + } + } + } + } + + static void patch_insert_end(position_t& insert_pos, match_stage_t insert_stage) { + assert(insert_stage <= STAGE); + if (insert_stage == STAGE) { + insert_pos.index = INDEX_END; + } else if constexpr (!IS_BOTTOM) { + insert_pos.index = INDEX_LAST; + NXT_STAGE_T::patch_insert_end(insert_pos.nxt, insert_stage); + } + } + + template <typename T = std::tuple<match_stage_t, node_offset_t>> + static std::enable_if_t<NODE_TYPE == node_type_t::LEAF, T> evaluate_insert( + const full_key_t<KeyT::HOBJ>& key, const onode_t& value, + const MatchHistory& history, match_stat_t mstat, position_t& position) { + match_stage_t insert_stage = STAGE_TOP; + while (*history.get_by_stage(insert_stage) == MatchKindCMP::EQ) { + assert(insert_stage != STAGE_BOTTOM && "insert conflict!"); + --insert_stage; + } + + if (history.is_GT()) { + if (position.is_end()) { + // no need to compensate insert position + assert(insert_stage <= STAGE && "impossible insert stage"); + } else if (position == position_t::begin()) { + // I must be short-circuited by staged::smallest_result() + // in staged::lower_bound(), so we need to rely on mstat instead + assert(mstat >= MSTAT_LT0 && mstat <= MSTAT_LT3); + if (mstat == MSTAT_LT0) { + insert_stage = STAGE_RIGHT; + } else if (mstat == MSTAT_LT1) { + insert_stage = STAGE_STRING; + } else { + insert_stage = STAGE_LEFT; + } + // XXX(multi-type): need to upgrade node type before inserting an + // incompatible index at front. + assert(insert_stage <= STAGE && "incompatible insert"); + } else { + assert(insert_stage <= STAGE && "impossible insert stage"); + [[maybe_unused]] bool ret = compensate_insert_position_at(insert_stage, position); + assert(!ret); + } + } + + if (position.is_end()) { + patch_insert_end(position, insert_stage); + } + + node_offset_t insert_size = insert_size_at<KeyT::HOBJ>(insert_stage, key, value); + + return {insert_stage, insert_size}; + } + + template <KeyT KT> + static const value_t* insert_new( + NodeExtentMutable& mut, const memory_range_t& range, + const full_key_t<KT>& key, const value_t& value) { + char* p_insert = const_cast<char*>(range.p_end); + const value_t* p_value = nullptr; + StagedAppender<KT> appender; + appender.init(&mut, p_insert); + appender.append(key, value, p_value); + [[maybe_unused]] const char* p_insert_front = appender.wrap(); + assert(p_insert_front == range.p_start); + return p_value; + } + + template <KeyT KT, bool SPLIT> + static const value_t* proceed_insert_recursively( + NodeExtentMutable& mut, const container_t& container, + const full_key_t<KT>& key, const value_t& value, + position_t& position, match_stage_t& stage, + node_offset_t& _insert_size, const char* p_left_bound) { + // proceed insert from right to left + assert(stage <= STAGE); + auto iter = iterator_t(container); + auto& index = position.index; + + bool do_insert = false; + if (stage == STAGE) { + if (index == INDEX_END) { + iter.seek_last(); + iter.set_end(); + index = iter.index(); + } else { + assert(is_valid_index(index)); + iter.seek_till_end(index); + } + do_insert = true; + } else { // stage < STAGE + if (index == INDEX_LAST) { + iter.seek_last(); + index = iter.index(); + } else { + assert(is_valid_index(index)); + iter.seek_till_end(index); + } + if constexpr (SPLIT) { + if (iter.is_end()) { + // insert at the higher stage due to split + do_insert = true; + _insert_size = insert_size<KT>(key, value); + stage = STAGE; + } + } else { + assert(!iter.is_end()); + } + } + + if (do_insert) { + if constexpr (!IS_BOTTOM) { + position.nxt = position_t::nxt_t::begin(); + } + assert(_insert_size == insert_size<KT>(key, value)); + if constexpr (IS_BOTTOM) { + return iter.template insert<KT>( + mut, key, value, _insert_size, p_left_bound); + } else { + auto range = iter.template insert_prefix<KT>( + mut, key, _insert_size, p_left_bound); + return NXT_STAGE_T::template insert_new<KT>(mut, range, key, value); + } + } else { + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + auto p_value = NXT_STAGE_T::template proceed_insert_recursively<KT, SPLIT>( + mut, nxt_container, key, value, + position.nxt, stage, _insert_size, p_left_bound); + iter.update_size(mut, _insert_size); + return p_value; + } else { + ceph_abort("impossible path"); + } + } + } + + template <KeyT KT, bool SPLIT> + static const value_t* proceed_insert( + NodeExtentMutable& mut, const container_t& container, + const full_key_t<KT>& key, const value_t& value, + position_t& position, match_stage_t& stage, node_offset_t& _insert_size) { + auto p_left_bound = container.p_left_bound(); + if (unlikely(!container.keys())) { + if (position.is_end()) { + position = position_t::begin(); + assert(stage == STAGE); + assert(_insert_size == insert_size<KT>(key, value)); + } else if (position == position_t::begin()) { + // when insert into a trimmed and empty left node + stage = STAGE; + _insert_size = insert_size<KT>(key, value); + } else { + ceph_abort("impossible path"); + } + if constexpr (IS_BOTTOM) { + return container_t::template insert_at<KT>( + mut, container, key, value, 0, _insert_size, p_left_bound); + } else { + auto range = container_t::template insert_prefix_at<KT>( + mut, container, key, 0, _insert_size, p_left_bound); + return NXT_STAGE_T::template insert_new<KT>(mut, range, key, value); + } + } else { + return proceed_insert_recursively<KT, SPLIT>( + mut, container, key, value, + position, stage, _insert_size, p_left_bound); + } + } + + static std::ostream& dump(const container_t& container, + std::ostream& os, + const std::string& prefix, + size_t& size, + const char* p_start) { + auto iter = iterator_t(container); + assert(!iter.is_end()); + std::string prefix_blank(prefix.size(), ' '); + const std::string* p_prefix = &prefix; + size += iterator_t::header_size(); + do { + std::ostringstream sos; + sos << *p_prefix << iter.get_key() << ": "; + std::string i_prefix = sos.str(); + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + size += iter.size_to_nxt(); + NXT_STAGE_T::dump(nxt_container, os, i_prefix, size, p_start); + } else { + auto value_ptr = iter.get_p_value(); + int offset = reinterpret_cast<const char*>(value_ptr) - p_start; + size += iter.size(); + os << "\n" << i_prefix; + if constexpr (NODE_TYPE == node_type_t::LEAF) { + os << *value_ptr; + } else { + os << "0x" << std::hex << value_ptr->value << std::dec; + } + os << " " << size << "B" + << " @" << offset << "B"; + } + if (iter.is_last()) { + break; + } else { + ++iter; + p_prefix = &prefix_blank; + } + } while (true); + return os; + } + + static void validate(const container_t& container) { + auto iter = iterator_t(container); + assert(!iter.is_end()); + auto key = iter.get_key(); + do { + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + NXT_STAGE_T::validate(nxt_container); + } + if (iter.is_last()) { + break; + } else { + ++iter; + assert(compare_to(key, iter.get_key()) == MatchKindCMP::LT); + key = iter.get_key(); + } + } while (true); + } + + static void get_stats(const container_t& container, node_stats_t& stats, + full_key_t<KeyT::VIEW>& index_key) { + auto iter = iterator_t(container); + assert(!iter.is_end()); + stats.size_overhead += iterator_t::header_size(); + do { + index_key.replace(iter.get_key()); + stats.size_overhead += iter.size_overhead(); + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + NXT_STAGE_T::get_stats(nxt_container, stats, index_key); + } else { + ++stats.num_kvs; + size_t kv_logical_size = index_key.size_logical(); + size_t value_size; + if constexpr (NODE_TYPE == node_type_t::LEAF) { + value_size = iter.get_p_value()->size; + } else { + value_size = sizeof(value_t); + } + stats.size_value += value_size; + kv_logical_size += value_size; + stats.size_logical += kv_logical_size; + } + if (iter.is_last()) { + break; + } else { + ++iter; + } + } while (true); + } + + static bool next_position(const container_t& container, position_t& pos) { + auto iter = iterator_t(container); + assert(!iter.is_end()); + iter.seek_at(pos.index); + bool find_next; + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + find_next = NXT_STAGE_T::next_position(nxt_container, pos.nxt); + } else { + find_next = true; + } + if (find_next) { + if (iter.is_last()) { + return true; + } else { + pos.index = iter.index() + 1; + if constexpr (!IS_BOTTOM) { + pos.nxt = NXT_STAGE_T::position_t::begin(); + } + return false; + } + } else { + return false; + } + } + + struct _BaseEmpty {}; + class _BaseWithNxtIterator { + protected: + typename NXT_STAGE_T::StagedIterator _nxt; + }; + class StagedIterator + : std::conditional_t<IS_BOTTOM, _BaseEmpty, _BaseWithNxtIterator> { + public: + StagedIterator() = default; + bool valid() const { return iter.has_value(); } + index_t index() const { + return iter->index(); + } + bool is_end() const { return iter->is_end(); } + bool in_progress() const { + assert(valid()); + if constexpr (!IS_BOTTOM) { + if (this->_nxt.valid()) { + if (this->_nxt.index() == 0) { + return this->_nxt.in_progress(); + } else { + return true; + } + } else { + return false; + } + } else { + return false; + } + } + key_get_type get_key() const { return iter->get_key(); } + + iterator_t& get() { return *iter; } + void set(const container_t& container) { + assert(!valid()); + iter = iterator_t(container); + } + void set_end() { iter->set_end(); } + typename NXT_STAGE_T::StagedIterator& nxt() { + if constexpr (!IS_BOTTOM) { + if (!this->_nxt.valid()) { + auto nxt_container = iter->get_nxt_container(); + this->_nxt.set(nxt_container); + } + return this->_nxt; + } else { + ceph_abort("impossible path"); + } + } + typename NXT_STAGE_T::StagedIterator& get_nxt() { + if constexpr (!IS_BOTTOM) { + return this->_nxt; + } else { + ceph_abort("impossible path"); + } + } + StagedIterator& operator++() { + if (iter->is_last()) { + iter->set_end(); + } else { + ++(*iter); + } + if constexpr (!IS_BOTTOM) { + this->_nxt.reset(); + } + return *this; + } + void reset() { + if (valid()) { + iter.reset(); + if constexpr (!IS_BOTTOM) { + this->_nxt.reset(); + } + } + } + std::ostream& print(std::ostream& os, bool is_top) const { + if (valid()) { + if (iter->is_end()) { + return os << "END"; + } else { + os << index(); + } + } else { + if (is_top) { + return os << "invalid StagedIterator!"; + } else { + os << "0!"; + } + } + if constexpr (!IS_BOTTOM) { + os << ", "; + return this->_nxt.print(os, false); + } else { + return os; + } + } + position_t get_pos() const { + if (valid()) { + if constexpr (IS_BOTTOM) { + return position_t{index()}; + } else { + return position_t{index(), this->_nxt.get_pos()}; + } + } else { + return position_t::begin(); + } + } + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + uint8_t present = static_cast<bool>(iter); + ceph::encode(present, encoded); + if (iter.has_value()) { + iter->encode(p_node_start, encoded); + if constexpr (!IS_BOTTOM) { + this->_nxt.encode(p_node_start, encoded); + } + } + } + static StagedIterator decode(const char* p_node_start, + ceph::bufferlist::const_iterator& delta) { + StagedIterator ret; + uint8_t present; + ceph::decode(present, delta); + if (present) { + ret.iter = iterator_t::decode(p_node_start, delta); + if constexpr (!IS_BOTTOM) { + ret._nxt = NXT_STAGE_T::StagedIterator::decode(p_node_start, delta); + } + } + return ret; + } + friend std::ostream& operator<<(std::ostream& os, const StagedIterator& iter) { + return iter.print(os, true); + } + private: + std::optional<iterator_t> iter; + }; + + static bool recursively_locate_split( + size_t& current_size, size_t extra_size, + size_t target_size, StagedIterator& split_at) { + assert(current_size <= target_size); + iterator_t& split_iter = split_at.get(); + current_size = split_iter.seek_split(current_size, extra_size, target_size); + assert(current_size <= target_size); + assert(!split_iter.is_end()); + if (split_iter.index() == 0) { + extra_size += iterator_t::header_size(); + } else { + extra_size = 0; + } + bool locate_nxt; + if constexpr (!IS_BOTTOM) { + locate_nxt = NXT_STAGE_T::recursively_locate_split( + current_size, extra_size + split_iter.size_to_nxt(), + target_size, split_at.nxt()); + } else { // IS_BOTTOM + // located upper_bound, fair split strategy + size_t nxt_size = split_iter.size() + extra_size; + assert(current_size + nxt_size > target_size); + if (current_size + nxt_size/2 < target_size) { + // include next + current_size += nxt_size; + locate_nxt = true; + } else { + // exclude next + locate_nxt = false; + } + } + if (locate_nxt) { + if (split_iter.is_last()) { + return true; + } else { + ++split_at; + return false; + } + } else { + return false; + } + } + + static bool recursively_locate_split_inserted( + size_t& current_size, size_t extra_size, size_t target_size, + position_t& insert_pos, match_stage_t insert_stage, size_t insert_size, + std::optional<bool>& is_insert_left, StagedIterator& split_at) { + assert(current_size <= target_size); + assert(!is_insert_left.has_value()); + iterator_t& split_iter = split_at.get(); + auto& insert_index = insert_pos.index; + if (insert_stage == STAGE) { + current_size = split_iter.template seek_split_inserted<true>( + current_size, extra_size, target_size, + insert_index, insert_size, is_insert_left); + assert(is_insert_left.has_value()); + assert(current_size <= target_size); + if (split_iter.index() == 0) { + if (insert_index == 0) { + if (*is_insert_left == false) { + extra_size += iterator_t::header_size(); + } else { + extra_size = 0; + } + } else { + extra_size += iterator_t::header_size(); + } + } else { + extra_size = 0; + } + if (*is_insert_left == false && split_iter.index() == insert_index) { + // split_iter can be end + // found the lower-bound of target_size + // ...[s_index-1] |!| (i_index) [s_index]... + + // located upper-bound, fair split strategy + // look at the next slot (the insert item) + size_t nxt_size = insert_size + extra_size; + assert(current_size + nxt_size > target_size); + if (current_size + nxt_size/2 < target_size) { + // include next + *is_insert_left = true; + current_size += nxt_size; + if (split_iter.is_end()) { + // ...[s_index-1] (i_index) |!| + return true; + } else { + return false; + } + } else { + // exclude next + return false; + } + } else { + // Already considered insert effect in the current stage. + // Look into the next stage to identify the target_size lower-bound w/o + // insert effect. + assert(!split_iter.is_end()); + bool locate_nxt; + if constexpr (!IS_BOTTOM) { + locate_nxt = NXT_STAGE_T::recursively_locate_split( + current_size, extra_size + split_iter.size_to_nxt(), + target_size, split_at.nxt()); + } else { // IS_BOTTOM + // located upper-bound, fair split strategy + // look at the next slot + size_t nxt_size = split_iter.size() + extra_size; + assert(current_size + nxt_size > target_size); + if (current_size + nxt_size/2 < target_size) { + // include next + current_size += nxt_size; + locate_nxt = true; + } else { + // exclude next + locate_nxt = false; + } + } + if (locate_nxt) { + if (split_iter.is_last()) { + auto end_index = split_iter.index() + 1; + if (insert_index == INDEX_END) { + insert_index = end_index; + } + assert(insert_index <= end_index); + if (insert_index == end_index) { + assert(*is_insert_left == false); + split_iter.set_end(); + // ...[s_index-1] |!| (i_index) + return false; + } else { + assert(*is_insert_left == true); + return true; + } + } else { + ++split_at; + return false; + } + } else { + return false; + } + } + } else { + if constexpr (!IS_BOTTOM) { + assert(insert_stage < STAGE); + current_size = split_iter.template seek_split_inserted<false>( + current_size, extra_size, target_size, + insert_index, insert_size, is_insert_left); + assert(!split_iter.is_end()); + assert(current_size <= target_size); + if (split_iter.index() == 0) { + extra_size += iterator_t::header_size(); + } else { + extra_size = 0; + } + bool locate_nxt; + if (!is_insert_left.has_value()) { + // Considered insert effect in the current stage, and insert happens + // in the lower stage. + // Look into the next stage to identify the target_size lower-bound w/ + // insert effect. + assert(split_iter.index() == insert_index); + locate_nxt = NXT_STAGE_T::recursively_locate_split_inserted( + current_size, extra_size + split_iter.size_to_nxt(), target_size, + insert_pos.nxt, insert_stage, insert_size, + is_insert_left, split_at.nxt()); + assert(is_insert_left.has_value()); +#ifndef NDEBUG + if (locate_nxt) { + assert(*is_insert_left == true); + } +#endif + } else { + // is_insert_left.has_value() == true + // Insert will *not* happen in the lower stage. + // Need to look into the next stage to identify the target_size + // lower-bound w/ insert effect + assert(split_iter.index() != insert_index); + locate_nxt = NXT_STAGE_T::recursively_locate_split( + current_size, extra_size + split_iter.size_to_nxt(), + target_size, split_at.nxt()); +#ifndef NDEBUG + if (split_iter.index() < insert_index) { + assert(*is_insert_left == false); + } else { + assert(*is_insert_left == true); + } +#endif + } + if (locate_nxt) { + if (split_iter.is_last()) { + return true; + } else { + ++split_at; + return false; + } + } else { + return false; + } + } else { + ceph_abort("impossible path"); + return false;; + } + } + } + + /* + * container appender type system + * container_t::Appender(NodeExtentMutable& mut, char* p_append) + * append(const container_t& src, index_t from, index_t items) + * wrap() -> char* + * IF !IS_BOTTOM: + * open_nxt(const key_get_type&) + * open_nxt(const full_key_t&) + * -> std::tuple<NodeExtentMutable&, char*> + * wrap_nxt(char* p_append) + * ELSE + * append(const full_key_t& key, const value_t& value) + */ + template <KeyT KT> + struct _BaseWithNxtAppender { + typename NXT_STAGE_T::template StagedAppender<KT> _nxt; + }; + template <KeyT KT> + class StagedAppender + : std::conditional_t<IS_BOTTOM, _BaseEmpty, _BaseWithNxtAppender<KT>> { + public: + StagedAppender() = default; + ~StagedAppender() { + assert(!require_wrap_nxt); + assert(!valid()); + } + bool valid() const { return appender.has_value(); } + index_t index() const { + assert(valid()); + return _index; + } + bool in_progress() const { return require_wrap_nxt; } + // TODO: pass by reference + void init(NodeExtentMutable* p_mut, char* p_start) { + assert(!valid()); + appender = typename container_t::template Appender<KT>(p_mut, p_start); + _index = 0; + } + // possible to make src_iter end if to_index == INDEX_END + void append_until(StagedIterator& src_iter, index_t& to_index) { + assert(!require_wrap_nxt); + auto s_index = src_iter.index(); + src_iter.get().template copy_out_until<KT>(*appender, to_index); + assert(src_iter.index() == to_index); + assert(to_index >= s_index); + auto increment = (to_index - s_index); + if (increment) { + _index += increment; + if constexpr (!IS_BOTTOM) { + src_iter.get_nxt().reset(); + } + } + } + void append(const full_key_t<KT>& key, + const value_t& value, const value_t*& p_value) { + assert(!require_wrap_nxt); + if constexpr (!IS_BOTTOM) { + auto& nxt = open_nxt(key); + nxt.append(key, value, p_value); + wrap_nxt(); + } else { + appender->append(key, value, p_value); + ++_index; + } + } + char* wrap() { + assert(valid()); + assert(_index > 0); + if constexpr (!IS_BOTTOM) { + if (require_wrap_nxt) { + wrap_nxt(); + } + } + auto ret = appender->wrap(); + appender.reset(); + return ret; + } + typename NXT_STAGE_T::template StagedAppender<KT>& + open_nxt(key_get_type paritial_key) { + assert(!require_wrap_nxt); + if constexpr (!IS_BOTTOM) { + require_wrap_nxt = true; + auto [p_mut, p_append] = appender->open_nxt(paritial_key); + this->_nxt.init(p_mut, p_append); + return this->_nxt; + } else { + ceph_abort("impossible path"); + } + } + typename NXT_STAGE_T::template StagedAppender<KT>& + open_nxt(const full_key_t<KT>& key) { + assert(!require_wrap_nxt); + if constexpr (!IS_BOTTOM) { + require_wrap_nxt = true; + auto [p_mut, p_append] = appender->open_nxt(key); + this->_nxt.init(p_mut, p_append); + return this->_nxt; + } else { + ceph_abort("impossible path"); + } + } + typename NXT_STAGE_T::template StagedAppender<KT>& get_nxt() { + if constexpr (!IS_BOTTOM) { + assert(require_wrap_nxt); + return this->_nxt; + } else { + ceph_abort("impossible path"); + } + } + void wrap_nxt() { + if constexpr (!IS_BOTTOM) { + assert(require_wrap_nxt); + require_wrap_nxt = false; + auto p_append = this->_nxt.wrap(); + appender->wrap_nxt(p_append); + ++_index; + } else { + ceph_abort("impossible path"); + } + } + private: + std::optional<typename container_t::template Appender<KT>> appender; + index_t _index; + bool require_wrap_nxt = false; + }; + + template <KeyT KT> + static void _append_range( + StagedIterator& src_iter, StagedAppender<KT>& appender, index_t& to_index) { + if (src_iter.is_end()) { + // append done + assert(to_index == INDEX_END); + to_index = src_iter.index(); + } else if constexpr (!IS_BOTTOM) { + if (appender.in_progress()) { + // appender has appended something at the current item, + // cannot append the current item as-a-whole + index_t to_index_nxt = INDEX_END; + NXT_STAGE_T::template _append_range<KT>( + src_iter.nxt(), appender.get_nxt(), to_index_nxt); + ++src_iter; + appender.wrap_nxt(); + } else if (src_iter.in_progress()) { + // src_iter is not at the beginning of the current item, + // cannot append the current item as-a-whole + index_t to_index_nxt = INDEX_END; + NXT_STAGE_T::template _append_range<KT>( + src_iter.nxt(), appender.open_nxt(src_iter.get_key()), to_index_nxt); + ++src_iter; + appender.wrap_nxt(); + } else { + // we can safely append the current item as-a-whole + } + } + appender.append_until(src_iter, to_index); + } + + template <KeyT KT> + static void _append_into(StagedIterator& src_iter, StagedAppender<KT>& appender, + position_t& position, match_stage_t stage) { + assert(position.index == src_iter.index()); + // reaches the last item + if (stage == STAGE) { + // done, end recursion + if constexpr (!IS_BOTTOM) { + position.nxt = position_t::nxt_t::begin(); + } + } else { + assert(stage < STAGE); + // proceed append in the next stage + NXT_STAGE_T::template append_until<KT>( + src_iter.nxt(), appender.open_nxt(src_iter.get_key()), + position.nxt, stage); + } + } + + template <KeyT KT> + static void append_until(StagedIterator& src_iter, StagedAppender<KT>& appender, + position_t& position, match_stage_t stage) { + index_t from_index = src_iter.index(); + index_t& to_index = position.index; + assert(from_index <= to_index); + if constexpr (IS_BOTTOM) { + assert(stage == STAGE); + appender.append_until(src_iter, to_index); + } else { + assert(stage <= STAGE); + if (src_iter.index() == to_index) { + _append_into<KT>(src_iter, appender, position, stage); + } else { + if (to_index == INDEX_END) { + assert(stage == STAGE); + } else if (to_index == INDEX_LAST) { + assert(stage < STAGE); + } + _append_range<KT>(src_iter, appender, to_index); + _append_into<KT>(src_iter, appender, position, stage); + } + } + to_index -= from_index; + } + + template <KeyT KT> + static bool append_insert( + const full_key_t<KT>& key, const value_t& value, + StagedIterator& src_iter, StagedAppender<KT>& appender, + bool is_front_insert, match_stage_t& stage, const value_t*& p_value) { + assert(src_iter.valid()); + if (stage == STAGE) { + appender.append(key, value, p_value); + if (src_iter.is_end()) { + return true; + } else { + return false; + } + } else { + assert(stage < STAGE); + if constexpr (!IS_BOTTOM) { + auto nxt_is_end = NXT_STAGE_T::template append_insert<KT>( + key, value, src_iter.get_nxt(), appender.get_nxt(), + is_front_insert, stage, p_value); + if (nxt_is_end) { + appender.wrap_nxt(); + ++src_iter; + if (is_front_insert) { + stage = STAGE; + } + if (src_iter.is_end()) { + return true; + } + } + return false; + } else { + ceph_abort("impossible path"); + } + } + } + + /* TrimType: + * BEFORE: remove the entire container, normally means the according higher + * stage iterator needs to be trimmed as-a-whole. + * AFTER: retain the entire container, normally means the trim should be + * start from the next iterator at the higher stage. + * AT: trim happens in the current container, and the according higher + * stage iterator needs to be adjusted by the trimmed size. + */ + static std::tuple<TrimType, node_offset_t> + recursively_trim(NodeExtentMutable& mut, StagedIterator& trim_at) { + if (!trim_at.valid()) { + return {TrimType::BEFORE, 0u}; + } + if (trim_at.is_end()) { + return {TrimType::AFTER, 0u}; + } + + auto& iter = trim_at.get(); + if constexpr (!IS_BOTTOM) { + auto [type, trimmed] = NXT_STAGE_T::recursively_trim( + mut, trim_at.get_nxt()); + node_offset_t trim_size; + if (type == TrimType::AFTER) { + if (iter.is_last()) { + return {TrimType::AFTER, 0u}; + } + ++trim_at; + trim_size = iter.trim_until(mut); + } else if (type == TrimType::BEFORE) { + if (iter.index() == 0) { + return {TrimType::BEFORE, 0u}; + } + trim_size = iter.trim_until(mut); + } else { + trim_size = iter.trim_at(mut, trimmed); + } + return {TrimType::AT, trim_size}; + } else { + if (iter.index() == 0) { + return {TrimType::BEFORE, 0u}; + } else { + auto trimmed = iter.trim_until(mut); + return {TrimType::AT, trimmed}; + } + } + } + + static void trim(NodeExtentMutable& mut, StagedIterator& trim_at) { + auto [type, trimmed] = recursively_trim(mut, trim_at); + if (type == TrimType::BEFORE) { + assert(trim_at.valid()); + auto& iter = trim_at.get(); + iter.trim_until(mut); + } + } +}; + +/** + * Configurations for struct staged + * + * staged_params_* assembles different container_t implementations (defined by + * stated::_iterator_t) by STAGE, and constructs the final multi-stage + * implementations for different node layouts defined by + * node_extent_t<FieldType, NODE_TYPE>. + * + * The specialized implementations for different layouts are accessible through + * the helper type node_to_stage_t<node_extent_t<FieldType, NODE_TYPE>>. + * + * Specifically, the settings of 8 layouts are: + * + * The layout (N0, LEAF/INTERNAL) has 3 stages: + * - STAGE_LEFT: node_extent_t<node_fields_0_t, LEAF/INTERNAL> + * - STAGE_STRING: item_iterator_t<LEAF/INTERNAL> + * - STAGE_RIGHT: sub_items_t<LEAF/INTERNAL> + * + * The layout (N1, LEAF/INTERNAL) has 3 stages: + * - STAGE_LEFT: node_extent_t<node_fields_1_t, LEAF/INTERNAL> + * - STAGE_STRING: item_iterator_t<LEAF/INTERNAL> + * - STAGE_RIGHT: sub_items_t<LEAF/INTERNAL> + * + * The layout (N2, LEAF/INTERNAL) has 2 stages: + * - STAGE_STRING: node_extent_t<node_fields_2_t, LEAF/INTERNAL> + * - STAGE_RIGHT: sub_items_t<LEAF/INTERNAL> + * + * The layout (N3, LEAF) has 1 stage: + * - STAGE_RIGHT: node_extent_t<leaf_fields_3_t, LEAF> + * + * The layout (N3, INTERNAL) has 1 stage: + * - STAGE_RIGHT: node_extent_t<internal_fields_3_t, INTERNAL> + */ + +template <node_type_t _NODE_TYPE> +struct staged_params_subitems { + using container_t = sub_items_t<_NODE_TYPE>; + static constexpr auto NODE_TYPE = _NODE_TYPE; + static constexpr auto STAGE = STAGE_RIGHT; + + // dummy type in order to make our type system work + // any better solution to get rid of this? + using next_param_t = staged_params_subitems<NODE_TYPE>; +}; + +template <node_type_t _NODE_TYPE> +struct staged_params_item_iterator { + using container_t = item_iterator_t<_NODE_TYPE>; + static constexpr auto NODE_TYPE = _NODE_TYPE; + static constexpr auto STAGE = STAGE_STRING; + + using next_param_t = staged_params_subitems<NODE_TYPE>; +}; + +template <typename NodeType> +struct staged_params_node_01 { + using container_t = NodeType; + static constexpr auto NODE_TYPE = NodeType::NODE_TYPE; + static constexpr auto STAGE = STAGE_LEFT; + + using next_param_t = staged_params_item_iterator<NODE_TYPE>; +}; + +template <typename NodeType> +struct staged_params_node_2 { + using container_t = NodeType; + static constexpr auto NODE_TYPE = NodeType::NODE_TYPE; + static constexpr auto STAGE = STAGE_STRING; + + using next_param_t = staged_params_subitems<NODE_TYPE>; +}; + +template <typename NodeType> +struct staged_params_node_3 { + using container_t = NodeType; + static constexpr auto NODE_TYPE = NodeType::NODE_TYPE; + static constexpr auto STAGE = STAGE_RIGHT; + + // dummy type in order to make our type system work + // any better solution to get rid of this? + using next_param_t = staged_params_node_3<NodeType>; +}; + +template <typename NodeType, typename Enable = void> struct _node_to_stage_t; +template <typename NodeType> +struct _node_to_stage_t<NodeType, + std::enable_if_t<NodeType::FIELD_TYPE == field_type_t::N0 || + NodeType::FIELD_TYPE == field_type_t::N1>> { + using type = staged<staged_params_node_01<NodeType>>; +}; +template <typename NodeType> +struct _node_to_stage_t<NodeType, + std::enable_if_t<NodeType::FIELD_TYPE == field_type_t::N2>> { + using type = staged<staged_params_node_2<NodeType>>; +}; +template <typename NodeType> +struct _node_to_stage_t<NodeType, + std::enable_if_t<NodeType::FIELD_TYPE == field_type_t::N3>> { + using type = staged<staged_params_node_3<NodeType>>; +}; +template <typename NodeType> +using node_to_stage_t = typename _node_to_stage_t<NodeType>::type; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h new file mode 100644 index 000000000..a9d5cef3b --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h @@ -0,0 +1,411 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <cassert> +#include <optional> +#include <ostream> + +#include "crimson/os/seastore/onode_manager/staged-fltree/fwd.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/tree_types.h" + +namespace crimson::os::seastore::onode { + +using match_stage_t = int8_t; +constexpr match_stage_t STAGE_LEFT = 2; // shard/pool/crush +constexpr match_stage_t STAGE_STRING = 1; // nspace/oid +constexpr match_stage_t STAGE_RIGHT = 0; // snap/gen +constexpr auto STAGE_TOP = STAGE_LEFT; +constexpr auto STAGE_BOTTOM = STAGE_RIGHT; +constexpr bool is_valid_stage(match_stage_t stage) { + return std::clamp(stage, STAGE_BOTTOM, STAGE_TOP) == stage; +} +// TODO: replace by +// using match_history_t = int8_t; +// left_m, str_m, right_m +// 3: GT, +// 2: EQ, GT, +// 1: EQ, EQ, GT +// 0: EQ, EQ, EQ +// -1: EQ, EQ, LT +// -2: EQ, LT, +// -3: LT, + +struct MatchHistory { + template <match_stage_t STAGE> + const std::optional<MatchKindCMP>& get() const { + static_assert(is_valid_stage(STAGE)); + if constexpr (STAGE == STAGE_RIGHT) { + return right_match; + } else if (STAGE == STAGE_STRING) { + return string_match; + } else { + return left_match; + } + } + + const std::optional<MatchKindCMP>& + get_by_stage(match_stage_t stage) const { + assert(is_valid_stage(stage)); + if (stage == STAGE_RIGHT) { + return right_match; + } else if (stage == STAGE_STRING) { + return string_match; + } else { + return left_match; + } + } + + template <match_stage_t STAGE = STAGE_TOP> + const bool is_GT() const; + + template <match_stage_t STAGE> + void set(MatchKindCMP match) { + static_assert(is_valid_stage(STAGE)); + if constexpr (STAGE < STAGE_TOP) { + assert(*get<STAGE + 1>() == MatchKindCMP::EQ); + } + assert(!get<STAGE>().has_value() || *get<STAGE>() != MatchKindCMP::EQ); + const_cast<std::optional<MatchKindCMP>&>(get<STAGE>()) = match; + } + + std::ostream& dump(std::ostream& os) const { + os << "history("; + dump_each(os, left_match) << ", "; + dump_each(os, string_match) << ", "; + dump_each(os, right_match) << ")"; + return os; + } + + std::ostream& dump_each( + std::ostream& os, const std::optional<MatchKindCMP>& match) const { + if (!match.has_value()) { + return os << "--"; + } else if (*match == MatchKindCMP::LT) { + return os << "LT"; + } else if (*match == MatchKindCMP::EQ) { + return os << "EQ"; + } else if (*match == MatchKindCMP::GT) { + return os << "GT"; + } else { + ceph_abort("impossble path"); + } + } + + std::optional<MatchKindCMP> left_match; + std::optional<MatchKindCMP> string_match; + std::optional<MatchKindCMP> right_match; +}; +inline std::ostream& operator<<(std::ostream& os, const MatchHistory& pos) { + return pos.dump(os); +} + +template <match_stage_t STAGE> +struct _check_GT_t { + static bool eval(const MatchHistory* history) { + return history->get<STAGE>() && + (*history->get<STAGE>() == MatchKindCMP::GT || + (*history->get<STAGE>() == MatchKindCMP::EQ && + _check_GT_t<STAGE - 1>::eval(history))); + } +}; +template <> +struct _check_GT_t<STAGE_RIGHT> { + static bool eval(const MatchHistory* history) { + return history->get<STAGE_RIGHT>() && + *history->get<STAGE_RIGHT>() == MatchKindCMP::GT; + } +}; +template <match_stage_t STAGE> +const bool MatchHistory::is_GT() const { + static_assert(is_valid_stage(STAGE)); + if constexpr (STAGE < STAGE_TOP) { + assert(get<STAGE + 1>() == MatchKindCMP::EQ); + } + return _check_GT_t<STAGE>::eval(this); +} + +template <match_stage_t STAGE> +struct staged_position_t { + static_assert(is_valid_stage(STAGE)); + using me_t = staged_position_t<STAGE>; + using nxt_t = staged_position_t<STAGE - 1>; + bool is_end() const { + if (index == INDEX_END) { + return true; + } else { + assert(is_valid_index(index)); + return false; + } + } + index_t& index_by_stage(match_stage_t stage) { + assert(stage <= STAGE); + if (STAGE == stage) { + return index; + } else { + return nxt.index_by_stage(stage); + } + } + + int cmp(const me_t& o) const { + if (index > o.index) { + return 1; + } else if (index < o.index) { + return -1; + } else { + return nxt.cmp(o.nxt); + } + } + bool operator>(const me_t& o) const { return cmp(o) > 0; } + bool operator>=(const me_t& o) const { return cmp(o) >= 0; } + bool operator<(const me_t& o) const { return cmp(o) < 0; } + bool operator<=(const me_t& o) const { return cmp(o) <= 0; } + bool operator==(const me_t& o) const { return cmp(o) == 0; } + bool operator!=(const me_t& o) const { return cmp(o) != 0; } + + me_t& operator-=(const me_t& o) { + assert(is_valid_index(o.index)); + assert(index >= o.index); + if (index != INDEX_END) { + assert(is_valid_index(index)); + index -= o.index; + if (index == 0) { + nxt -= o.nxt; + } + } + return *this; + } + + void encode(ceph::bufferlist& encoded) const { + ceph::encode(index, encoded); + nxt.encode(encoded); + } + + static me_t decode(ceph::bufferlist::const_iterator& delta) { + me_t ret; + ceph::decode(ret.index, delta); + ret.nxt = nxt_t::decode(delta); + return ret; + } + + static me_t begin() { return {0u, nxt_t::begin()}; } + static me_t end() { + return {INDEX_END, nxt_t::end()}; + } + + index_t index; + nxt_t nxt; +}; +template <match_stage_t STAGE> +std::ostream& operator<<(std::ostream& os, const staged_position_t<STAGE>& pos) { + if (pos.index == INDEX_END) { + os << "END"; + } else if (pos.index == INDEX_LAST) { + os << "LAST"; + } else { + os << pos.index; + assert(is_valid_index(pos.index)); + } + return os << ", " << pos.nxt; +} + +template <> +struct staged_position_t<STAGE_BOTTOM> { + using me_t = staged_position_t<STAGE_BOTTOM>; + bool is_end() const { + if (index == INDEX_END) { + return true; + } else { + assert(is_valid_index(index)); + return false; + } + } + index_t& index_by_stage(match_stage_t stage) { + assert(stage == STAGE_BOTTOM); + return index; + } + + int cmp(const staged_position_t<STAGE_BOTTOM>& o) const { + if (index > o.index) { + return 1; + } else if (index < o.index) { + return -1; + } else { + return 0; + } + } + bool operator>(const me_t& o) const { return cmp(o) > 0; } + bool operator>=(const me_t& o) const { return cmp(o) >= 0; } + bool operator<(const me_t& o) const { return cmp(o) < 0; } + bool operator<=(const me_t& o) const { return cmp(o) <= 0; } + bool operator==(const me_t& o) const { return cmp(o) == 0; } + bool operator!=(const me_t& o) const { return cmp(o) != 0; } + + me_t& operator-=(const me_t& o) { + assert(is_valid_index(o.index)); + assert(index >= o.index); + if (index != INDEX_END) { + assert(is_valid_index(index)); + index -= o.index; + } + return *this; + } + + void encode(ceph::bufferlist& encoded) const { + ceph::encode(index, encoded); + } + + static me_t decode(ceph::bufferlist::const_iterator& delta) { + me_t ret; + ceph::decode(ret.index, delta); + return ret; + } + + static me_t begin() { return {0u}; } + static me_t end() { return {INDEX_END}; } + + index_t index; +}; +template <> +inline std::ostream& operator<<(std::ostream& os, const staged_position_t<STAGE_BOTTOM>& pos) { + if (pos.index == INDEX_END) { + os << "END"; + } else if (pos.index == INDEX_LAST) { + os << "LAST"; + } else { + os << pos.index; + assert(is_valid_index(pos.index)); + } + return os; +} + +using search_position_t = staged_position_t<STAGE_TOP>; + +template <match_stage_t STAGE> +const staged_position_t<STAGE>& cast_down(const search_position_t& pos) { + if constexpr (STAGE == STAGE_LEFT) { + return pos; + } else if constexpr (STAGE == STAGE_STRING) { +#ifndef NDEBUG + if (pos.is_end()) { + assert(pos.nxt.is_end()); + } else { + assert(pos.index == 0u); + } +#endif + return pos.nxt; + } else if constexpr (STAGE == STAGE_RIGHT) { +#ifndef NDEBUG + if (pos.is_end()) { + assert(pos.nxt.nxt.is_end()); + } else { + assert(pos.index == 0u); + assert(pos.nxt.index == 0u); + } +#endif + return pos.nxt.nxt; + } else { + ceph_abort("impossible path"); + } +} + +template <match_stage_t STAGE> +staged_position_t<STAGE>& cast_down(search_position_t& pos) { + const search_position_t& _pos = pos; + return const_cast<staged_position_t<STAGE>&>(cast_down<STAGE>(_pos)); +} + +template <match_stage_t STAGE> +staged_position_t<STAGE>& cast_down_fill_0(search_position_t& pos) { + if constexpr (STAGE == STAGE_LEFT) { + return pos; + } if constexpr (STAGE == STAGE_STRING) { + pos.index = 0; + return pos.nxt; + } else if constexpr (STAGE == STAGE_RIGHT) { + pos.index = 0; + pos.nxt.index = 0; + return pos.nxt.nxt; + } else { + ceph_abort("impossible path"); + } +} + +inline search_position_t&& normalize(search_position_t&& pos) { return std::move(pos); } + +template <match_stage_t STAGE, typename = std::enable_if_t<STAGE != STAGE_TOP>> +search_position_t normalize(staged_position_t<STAGE>&& pos) { + if (pos.is_end()) { + return search_position_t::end(); + } + if constexpr (STAGE == STAGE_STRING) { + return {0u, std::move(pos)}; + } else if (STAGE == STAGE_RIGHT) { + return {0u, {0u, std::move(pos)}}; + } else { + ceph_abort("impossible path"); + } +} + +struct memory_range_t { + const char* p_start; + const char* p_end; +}; + +enum class ContainerType { ITERATIVE, INDEXABLE }; + +template <node_type_t> struct value_type; +template<> struct value_type<node_type_t::INTERNAL> { using type = laddr_packed_t; }; +template<> struct value_type<node_type_t::LEAF> { using type = onode_t; }; +template <node_type_t NODE_TYPE> +using value_type_t = typename value_type<NODE_TYPE>::type; + +template <node_type_t NODE_TYPE, match_stage_t STAGE> +struct staged_result_t { + using me_t = staged_result_t<NODE_TYPE, STAGE>; + bool is_end() const { return position.is_end(); } + + static me_t end() { + return {staged_position_t<STAGE>::end(), nullptr, MSTAT_END}; + } + template <typename T = me_t> + static std::enable_if_t<STAGE != STAGE_BOTTOM, T> from_nxt( + index_t index, const staged_result_t<NODE_TYPE, STAGE - 1>& nxt_stage_result) { + return {{index, nxt_stage_result.position}, + nxt_stage_result.p_value, + nxt_stage_result.mstat}; + } + + staged_position_t<STAGE> position; + const value_type_t<NODE_TYPE>* p_value; + match_stat_t mstat; +}; + +template <node_type_t NODE_TYPE> +using lookup_result_t = staged_result_t<NODE_TYPE, STAGE_TOP>; + +template <node_type_t NODE_TYPE> +lookup_result_t<NODE_TYPE>&& normalize( + lookup_result_t<NODE_TYPE>&& result) { return std::move(result); } + +template <node_type_t NODE_TYPE, match_stage_t STAGE, + typename = std::enable_if_t<STAGE != STAGE_TOP>> +lookup_result_t<NODE_TYPE> normalize( + staged_result_t<NODE_TYPE, STAGE>&& result) { + // FIXME: assert result.mstat correct + return {normalize(std::move(result.position)), result.p_value, result.mstat}; +} + +struct node_stats_t { + size_t size_persistent = 0; + size_t size_filled = 0; + // filled by staged::get_stats() + size_t size_logical = 0; + size_t size_overhead = 0; + size_t size_value = 0; + unsigned num_kvs = 0; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc new file mode 100644 index 000000000..aaca6c3c6 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc @@ -0,0 +1,208 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "sub_items_stage.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h" + +namespace crimson::os::seastore::onode { + +template <KeyT KT> +const laddr_packed_t* internal_sub_items_t::insert_at( + NodeExtentMutable& mut, const internal_sub_items_t& sub_items, + const full_key_t<KT>& key, const laddr_packed_t& value, + index_t index, node_offset_t size, const char* p_left_bound) { + assert(index <= sub_items.keys()); + assert(size == estimate_insert<KT>(key, value)); + const char* p_shift_start = p_left_bound; + const char* p_shift_end = reinterpret_cast<const char*>( + sub_items.p_first_item + 1 - index); + mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, -(int)size); + + auto p_insert = const_cast<char*>(p_shift_end) - size; + auto item = internal_sub_item_t{snap_gen_t::from_key<KT>(key), value}; + mut.copy_in_absolute(p_insert, item); + return &reinterpret_cast<internal_sub_item_t*>(p_insert)->value; +} +#define IA_TEMPLATE(KT) \ + template const laddr_packed_t* internal_sub_items_t::insert_at<KT>( \ + NodeExtentMutable&, const internal_sub_items_t&, const full_key_t<KT>&, \ + const laddr_packed_t&, index_t, node_offset_t, const char*) +IA_TEMPLATE(KeyT::VIEW); +IA_TEMPLATE(KeyT::HOBJ); + +node_offset_t internal_sub_items_t::trim_until( + NodeExtentMutable&, internal_sub_items_t& items, index_t index) { + assert(index != 0); + auto keys = items.keys(); + assert(index <= keys); + size_t ret = sizeof(internal_sub_item_t) * (keys - index); + assert(ret < NODE_BLOCK_SIZE); + return ret; +} + +template <KeyT KT> +void internal_sub_items_t::Appender<KT>::append( + const internal_sub_items_t& src, index_t from, index_t items) { + assert(from <= src.keys()); + if (items == 0) { + return; + } + assert(from < src.keys()); + assert(from + items <= src.keys()); + node_offset_t size = sizeof(internal_sub_item_t) * items; + p_append -= size; + p_mut->copy_in_absolute(p_append, src.p_first_item + 1 - from - items, size); +} + +template <KeyT KT> +void internal_sub_items_t::Appender<KT>::append( + const full_key_t<KT>& key, const laddr_packed_t& value, + const laddr_packed_t*& p_value) { + p_append -= sizeof(internal_sub_item_t); + auto item = internal_sub_item_t{snap_gen_t::from_key<KT>(key), value}; + p_mut->copy_in_absolute(p_append, item); + p_value = &reinterpret_cast<internal_sub_item_t*>(p_append)->value; +} + +template <KeyT KT> +const onode_t* leaf_sub_items_t::insert_at( + NodeExtentMutable& mut, const leaf_sub_items_t& sub_items, + const full_key_t<KT>& key, const onode_t& value, + index_t index, node_offset_t size, const char* p_left_bound) { + assert(index <= sub_items.keys()); + assert(size == estimate_insert<KT>(key, value)); + // a. [... item(index)] << size + const char* p_shift_start = p_left_bound; + const char* p_shift_end = sub_items.get_item_end(index); + mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, -(int)size); + + // b. insert item + auto p_insert = const_cast<char*>(p_shift_end - size); + auto p_value = reinterpret_cast<const onode_t*>(p_insert); + mut.copy_in_absolute(p_insert, &value, value.size); + p_insert += value.size; + mut.copy_in_absolute(p_insert, snap_gen_t::template from_key<KT>(key)); + assert(p_insert + sizeof(snap_gen_t) + sizeof(node_offset_t) == p_shift_end); + + // c. compensate affected offsets + auto item_size = value.size + sizeof(snap_gen_t); + for (auto i = index; i < sub_items.keys(); ++i) { + const node_offset_packed_t& offset_i = sub_items.get_offset(i); + mut.copy_in_absolute((void*)&offset_i, node_offset_t(offset_i.value + item_size)); + } + + // d. [item(index-1) ... item(0) ... offset(index)] <<< sizeof(node_offset_t) + const char* p_offset = (index == 0 ? + (const char*)&sub_items.get_offset(0) + sizeof(node_offset_t) : + (const char*)&sub_items.get_offset(index - 1)); + p_shift_start = p_shift_end; + p_shift_end = p_offset; + mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, -(int)sizeof(node_offset_t)); + + // e. insert offset + node_offset_t offset_to_item_start = item_size + sub_items.get_offset_to_end(index); + mut.copy_in_absolute( + const_cast<char*>(p_shift_end) - sizeof(node_offset_t), offset_to_item_start); + + // f. update num_sub_keys + mut.copy_in_absolute((void*)sub_items.p_num_keys, num_keys_t(sub_items.keys() + 1)); + + return p_value; +} +template const onode_t* leaf_sub_items_t::insert_at<KeyT::HOBJ>( + NodeExtentMutable&, const leaf_sub_items_t&, const full_key_t<KeyT::HOBJ>&, + const onode_t&, index_t, node_offset_t, const char*); + +node_offset_t leaf_sub_items_t::trim_until( + NodeExtentMutable& mut, leaf_sub_items_t& items, index_t index) { + assert(index != 0); + auto keys = items.keys(); + assert(index <= keys); + if (index == keys) { + return 0; + } + index_t trim_items = keys - index; + const char* p_items_start = items.p_start(); + const char* p_shift_start = items.get_item_end(index); + const char* p_shift_end = items.get_item_end(0); + size_t size_trim_offsets = sizeof(node_offset_t) * trim_items; + mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, + size_trim_offsets); + mut.copy_in_absolute((void*)items.p_num_keys, num_keys_t(index)); + size_t ret = size_trim_offsets + (p_shift_start - p_items_start); + assert(ret < NODE_BLOCK_SIZE); + return ret; +} + +template class internal_sub_items_t::Appender<KeyT::VIEW>; +template class internal_sub_items_t::Appender<KeyT::HOBJ>; + +// helper type for the visitor +template<class... Ts> struct overloaded : Ts... { using Ts::operator()...; }; +// explicit deduction guide +template<class... Ts> overloaded(Ts...) -> overloaded<Ts...>; + +template <KeyT KT> +char* leaf_sub_items_t::Appender<KT>::wrap() { + auto p_cur = p_append; + num_keys_t num_keys = 0; + for (auto i = 0u; i < cnt; ++i) { + auto& a = appends[i]; + std::visit(overloaded { + [&] (const range_items_t& arg) { num_keys += arg.items; }, + [&] (const kv_item_t& arg) { ++num_keys; } + }, a); + } + assert(num_keys); + p_cur -= sizeof(num_keys_t); + p_mut->copy_in_absolute(p_cur, num_keys); + + node_offset_t last_offset = 0; + for (auto i = 0u; i < cnt; ++i) { + auto& a = appends[i]; + std::visit(overloaded { + [&] (const range_items_t& arg) { + int compensate = (last_offset - op_src->get_offset_to_end(arg.from)); + node_offset_t offset; + for (auto i = arg.from; i < arg.from + arg.items; ++i) { + offset = op_src->get_offset(i).value + compensate; + p_cur -= sizeof(node_offset_t); + p_mut->copy_in_absolute(p_cur, offset); + } + last_offset = offset; + }, + [&] (const kv_item_t& arg) { + last_offset += sizeof(snap_gen_t) + arg.p_value->size; + p_cur -= sizeof(node_offset_t); + p_mut->copy_in_absolute(p_cur, last_offset); + } + }, a); + } + + for (auto i = 0u; i < cnt; ++i) { + auto& a = appends[i]; + std::visit(overloaded { + [&] (const range_items_t& arg) { + auto _p_start = op_src->get_item_end(arg.from + arg.items); + size_t _len = op_src->get_item_end(arg.from) - _p_start; + p_cur -= _len; + p_mut->copy_in_absolute(p_cur, _p_start, _len); + }, + [&] (const kv_item_t& arg) { + assert(pp_value); + p_cur -= sizeof(snap_gen_t); + p_mut->copy_in_absolute(p_cur, snap_gen_t::template from_key<KT>(*arg.p_key)); + p_cur -= arg.p_value->size; + p_mut->copy_in_absolute(p_cur, arg.p_value, arg.p_value->size); + *pp_value = reinterpret_cast<const onode_t*>(p_cur); + } + }, a); + } + return p_cur; +} + +template class leaf_sub_items_t::Appender<KeyT::VIEW>; +template class leaf_sub_items_t::Appender<KeyT::HOBJ>; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h new file mode 100644 index 000000000..8ef5f7472 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h @@ -0,0 +1,341 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <variant> + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h" +#include "key_layout.h" +#include "stage_types.h" + +namespace crimson::os::seastore::onode { + +class NodeExtentMutable; + +struct internal_sub_item_t { + const snap_gen_t& get_key() const { return key; } + const laddr_packed_t* get_p_value() const { return &value; } + + snap_gen_t key; + laddr_packed_t value; +} __attribute__((packed)); + +/** + * internal_sub_items_t + * + * The STAGE_RIGHT implementation for internal node N0/N1/N2, implements staged + * contract as an indexable container to index snap-gen to child node + * addresses. + * + * The layout of the contaner storing n sub-items: + * + * # <--------- container range -----------> # + * #<~># sub-items [2, n) # + * # # <- sub-item 1 -> # <- sub-item 0 -> # + * #...# snap-gen | laddr # snap-gen | laddr # + * ^ + * | + * p_first_item + + */ +class internal_sub_items_t { + public: + using num_keys_t = index_t; + + internal_sub_items_t(const memory_range_t& range) { + assert(range.p_start < range.p_end); + assert((range.p_end - range.p_start) % sizeof(internal_sub_item_t) == 0); + num_items = (range.p_end - range.p_start) / sizeof(internal_sub_item_t); + assert(num_items > 0); + auto _p_first_item = range.p_end - sizeof(internal_sub_item_t); + p_first_item = reinterpret_cast<const internal_sub_item_t*>(_p_first_item); + } + + // container type system + using key_get_type = const snap_gen_t&; + static constexpr auto CONTAINER_TYPE = ContainerType::INDEXABLE; + num_keys_t keys() const { return num_items; } + key_get_type operator[](index_t index) const { + assert(index < num_items); + return (p_first_item - index)->get_key(); + } + node_offset_t size_before(index_t index) const { + size_t ret = index * sizeof(internal_sub_item_t); + assert(ret < NODE_BLOCK_SIZE); + return ret; + } + const laddr_packed_t* get_p_value(index_t index) const { + assert(index < num_items); + return (p_first_item - index)->get_p_value(); + } + node_offset_t size_overhead_at(index_t index) const { return 0u; } + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + auto p_end = reinterpret_cast<const char*>(p_first_item) + + sizeof(internal_sub_item_t); + auto p_start = p_end - num_items * sizeof(internal_sub_item_t); + int start_offset = p_start - p_node_start; + int end_offset = p_end - p_node_start; + assert(start_offset > 0 && + start_offset < end_offset && + end_offset < NODE_BLOCK_SIZE); + ceph::encode(static_cast<node_offset_t>(start_offset), encoded); + ceph::encode(static_cast<node_offset_t>(end_offset), encoded); + } + + static internal_sub_items_t decode( + const char* p_node_start, ceph::bufferlist::const_iterator& delta) { + node_offset_t start_offset; + ceph::decode(start_offset, delta); + node_offset_t end_offset; + ceph::decode(end_offset, delta); + assert(start_offset < end_offset); + assert(end_offset <= NODE_BLOCK_SIZE); + return internal_sub_items_t({p_node_start + start_offset, + p_node_start + end_offset}); + } + + static node_offset_t header_size() { return 0u; } + + template <KeyT KT> + static node_offset_t estimate_insert( + const full_key_t<KT>&, const laddr_packed_t&) { + return sizeof(internal_sub_item_t); + } + + template <KeyT KT> + static const laddr_packed_t* insert_at( + NodeExtentMutable&, const internal_sub_items_t&, + const full_key_t<KT>&, const laddr_packed_t&, + index_t index, node_offset_t size, const char* p_left_bound); + + static node_offset_t trim_until(NodeExtentMutable&, internal_sub_items_t&, index_t); + + template <KeyT KT> + class Appender; + + private: + index_t num_items; + const internal_sub_item_t* p_first_item; +}; + +template <KeyT KT> +class internal_sub_items_t::Appender { + public: + Appender(NodeExtentMutable* p_mut, char* p_append) + : p_mut{p_mut}, p_append{p_append} {} + void append(const internal_sub_items_t& src, index_t from, index_t items); + void append(const full_key_t<KT>&, const laddr_packed_t&, const laddr_packed_t*&); + char* wrap() { return p_append; } + private: + NodeExtentMutable* p_mut; + char* p_append; +}; + +/** + * leaf_sub_items_t + * + * The STAGE_RIGHT implementation for leaf node N0/N1/N2, implements staged + * contract as an indexable container to index snap-gen to onode_t. + * + * The layout of the contaner storing n sub-items: + * + * # <------------------------ container range -------------------------------> # + * # <---------- sub-items ----------------> # <--- offsets ---------# # + * #<~># sub-items [2, n) #<~>| offsets [2, n) # # + * # # <- sub-item 1 -> # <- sub-item 0 -> # | # # + * #...# snap-gen | onode # snap-gen | onode #...| offset1 | offset0 # num_keys # + * ^ ^ ^ + * | | | + * p_items_end + p_offsets + | + * p_num_keys + + */ +class leaf_sub_items_t { + public: + // TODO: decide by NODE_BLOCK_SIZE, sizeof(snap_gen_t), + // and the minimal size of onode_t + using num_keys_t = uint8_t; + + leaf_sub_items_t(const memory_range_t& range) { + assert(range.p_start < range.p_end); + auto _p_num_keys = range.p_end - sizeof(num_keys_t); + assert(range.p_start < _p_num_keys); + p_num_keys = reinterpret_cast<const num_keys_t*>(_p_num_keys); + assert(keys()); + auto _p_offsets = _p_num_keys - sizeof(node_offset_t); + assert(range.p_start < _p_offsets); + p_offsets = reinterpret_cast<const node_offset_packed_t*>(_p_offsets); + p_items_end = reinterpret_cast<const char*>(&get_offset(keys() - 1)); + assert(range.p_start < p_items_end); + assert(range.p_start == p_start()); + } + + bool operator==(const leaf_sub_items_t& x) { + return (p_num_keys == x.p_num_keys && + p_offsets == x.p_offsets && + p_items_end == x.p_items_end); + } + + const char* p_start() const { return get_item_end(keys()); } + + const node_offset_packed_t& get_offset(index_t index) const { + assert(index < keys()); + return *(p_offsets - index); + } + + const node_offset_t get_offset_to_end(index_t index) const { + assert(index <= keys()); + return index == 0 ? 0 : get_offset(index - 1).value; + } + + const char* get_item_start(index_t index) const { + return p_items_end - get_offset(index).value; + } + + const char* get_item_end(index_t index) const { + return p_items_end - get_offset_to_end(index); + } + + // container type system + using key_get_type = const snap_gen_t&; + static constexpr auto CONTAINER_TYPE = ContainerType::INDEXABLE; + num_keys_t keys() const { return *p_num_keys; } + key_get_type operator[](index_t index) const { + assert(index < keys()); + auto pointer = get_item_end(index); + assert(get_item_start(index) < pointer); + pointer -= sizeof(snap_gen_t); + assert(get_item_start(index) < pointer); + return *reinterpret_cast<const snap_gen_t*>(pointer); + } + node_offset_t size_before(index_t index) const { + assert(index <= keys()); + size_t ret; + if (index == 0) { + ret = sizeof(num_keys_t); + } else { + --index; + ret = sizeof(num_keys_t) + + (index + 1) * sizeof(node_offset_t) + + get_offset(index).value; + } + assert(ret < NODE_BLOCK_SIZE); + return ret; + } + node_offset_t size_overhead_at(index_t index) const { return sizeof(node_offset_t); } + const onode_t* get_p_value(index_t index) const { + assert(index < keys()); + auto pointer = get_item_start(index); + auto value = reinterpret_cast<const onode_t*>(pointer); + assert(pointer + value->size + sizeof(snap_gen_t) == get_item_end(index)); + return value; + } + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + auto p_end = reinterpret_cast<const char*>(p_num_keys) + + sizeof(num_keys_t); + int start_offset = p_start() - p_node_start; + int end_offset = p_end - p_node_start; + assert(start_offset > 0 && + start_offset < end_offset && + end_offset < NODE_BLOCK_SIZE); + ceph::encode(static_cast<node_offset_t>(start_offset), encoded); + ceph::encode(static_cast<node_offset_t>(end_offset), encoded); + } + + static leaf_sub_items_t decode( + const char* p_node_start, ceph::bufferlist::const_iterator& delta) { + node_offset_t start_offset; + ceph::decode(start_offset, delta); + node_offset_t end_offset; + ceph::decode(end_offset, delta); + assert(start_offset < end_offset); + assert(end_offset <= NODE_BLOCK_SIZE); + return leaf_sub_items_t({p_node_start + start_offset, + p_node_start + end_offset}); + } + + static node_offset_t header_size() { return sizeof(num_keys_t); } + + template <KeyT KT> + static node_offset_t estimate_insert(const full_key_t<KT>&, const onode_t& value) { + return value.size + sizeof(snap_gen_t) + sizeof(node_offset_t); + } + + template <KeyT KT> + static const onode_t* insert_at( + NodeExtentMutable&, const leaf_sub_items_t&, + const full_key_t<KT>&, const onode_t&, + index_t index, node_offset_t size, const char* p_left_bound); + + static node_offset_t trim_until(NodeExtentMutable&, leaf_sub_items_t&, index_t index); + + template <KeyT KT> + class Appender; + + private: + // TODO: support unaligned access + const num_keys_t* p_num_keys; + const node_offset_packed_t* p_offsets; + const char* p_items_end; +}; + +constexpr index_t APPENDER_LIMIT = 3u; + +template <KeyT KT> +class leaf_sub_items_t::Appender { + struct range_items_t { + index_t from; + index_t items; + }; + struct kv_item_t { + const full_key_t<KT>* p_key; + const onode_t* p_value; + }; + using var_t = std::variant<range_items_t, kv_item_t>; + + public: + Appender(NodeExtentMutable* p_mut, char* p_append) + : p_mut{p_mut}, p_append{p_append} { + } + + void append(const leaf_sub_items_t& src, index_t from, index_t items) { + assert(cnt <= APPENDER_LIMIT); + assert(from <= src.keys()); + if (items == 0) { + return; + } + if (op_src) { + assert(*op_src == src); + } else { + op_src = src; + } + assert(from < src.keys()); + assert(from + items <= src.keys()); + appends[cnt] = range_items_t{from, items}; + ++cnt; + } + void append(const full_key_t<KT>& key, + const onode_t& value, const onode_t*& p_value) { + assert(pp_value == nullptr); + assert(cnt <= APPENDER_LIMIT); + appends[cnt] = kv_item_t{&key, &value}; + ++cnt; + pp_value = &p_value; + } + char* wrap(); + + private: + std::optional<leaf_sub_items_t> op_src; + const onode_t** pp_value = nullptr; + NodeExtentMutable* p_mut; + char* p_append; + var_t appends[APPENDER_LIMIT]; + index_t cnt = 0; +}; + +template <node_type_t> struct _sub_items_t; +template<> struct _sub_items_t<node_type_t::INTERNAL> { using type = internal_sub_items_t; }; +template<> struct _sub_items_t<node_type_t::LEAF> { using type = leaf_sub_items_t; }; +template <node_type_t NODE_TYPE> +using sub_items_t = typename _sub_items_t<NODE_TYPE>::type; + +} |