From 19fcec84d8d7d21e796c7624e521b60d28ee21ed Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 20:45:59 +0200 Subject: Adding upstream version 16.2.11+ds. Signed-off-by: Daniel Baumann --- .../onode_manager/simple-fltree/onode_block.cc | 71 + .../onode_manager/simple-fltree/onode_block.h | 65 + .../onode_manager/simple-fltree/onode_delta.cc | 188 ++ .../onode_manager/simple-fltree/onode_delta.h | 70 + .../onode_manager/simple-fltree/onode_node.cc | 567 +++++ .../onode_manager/simple-fltree/onode_node.h | 942 +++++++++ .../os/seastore/onode_manager/staged-fltree/fwd.h | 93 + .../seastore/onode_manager/staged-fltree/node.cc | 809 ++++++++ .../os/seastore/onode_manager/staged-fltree/node.h | 476 +++++ .../staged-fltree/node_delta_recorder.h | 42 + .../staged-fltree/node_extent_accessor.h | 413 ++++ .../staged-fltree/node_extent_manager.cc | 35 + .../staged-fltree/node_extent_manager.h | 86 + .../staged-fltree/node_extent_manager/dummy.h | 156 ++ .../staged-fltree/node_extent_manager/seastore.cc | 88 + .../staged-fltree/node_extent_manager/seastore.h | 126 ++ .../node_extent_manager/test_replay.h | 67 + .../staged-fltree/node_extent_mutable.cc | 39 + .../staged-fltree/node_extent_mutable.h | 80 + .../onode_manager/staged-fltree/node_impl.cc | 76 + .../onode_manager/staged-fltree/node_impl.h | 197 ++ .../onode_manager/staged-fltree/node_layout.h | 613 ++++++ .../staged-fltree/node_layout_replayable.h | 75 + .../onode_manager/staged-fltree/node_types.h | 64 + .../staged-fltree/stages/item_iterator_stage.cc | 165 ++ .../staged-fltree/stages/item_iterator_stage.h | 180 ++ .../staged-fltree/stages/key_layout.cc | 32 + .../staged-fltree/stages/key_layout.h | 846 ++++++++ .../staged-fltree/stages/node_stage.cc | 318 +++ .../staged-fltree/stages/node_stage.h | 226 ++ .../staged-fltree/stages/node_stage_layout.cc | 96 + .../staged-fltree/stages/node_stage_layout.h | 366 ++++ .../onode_manager/staged-fltree/stages/stage.h | 2186 ++++++++++++++++++++ 
.../staged-fltree/stages/stage_types.h | 411 ++++ .../staged-fltree/stages/sub_items_stage.cc | 208 ++ .../staged-fltree/stages/sub_items_stage.h | 341 +++ .../seastore/onode_manager/staged-fltree/super.cc | 26 + .../seastore/onode_manager/staged-fltree/super.h | 143 ++ .../seastore/onode_manager/staged-fltree/tree.cc | 235 +++ .../os/seastore/onode_manager/staged-fltree/tree.h | 119 ++ .../onode_manager/staged-fltree/tree_types.h | 125 ++ .../onode_manager/staged-fltree/tree_utils.h | 333 +++ 42 files changed, 11794 insertions(+) create mode 100644 src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.cc create mode 100644 src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.h create mode 100644 src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.cc create mode 100644 src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.h create mode 100644 src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.cc create mode 100644 src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.h create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/fwd.h create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/node.cc create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/node.h create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.cc create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.cc create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h create mode 100644 
src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.cc create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.cc create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/node_layout_replayable.h create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/super.cc create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/super.h 
create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/tree.cc
create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/tree.h
create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/tree_types.h
create mode 100644 src/crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h
(limited to 'src/crimson/os/seastore/onode_manager')
diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.cc b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.cc
new file mode 100644
index 000000000..b05ea76a3
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.cc
@@ -0,0 +1,71 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "onode_block.h"
+
+namespace crimson::os::seastore {
+
+// Serialize the stashed deltas: a one-byte count followed by the encoding
+// of every delta, in the order they were recorded.
+ceph::bufferlist OnodeBlock::get_delta()
+{
+  bufferlist bl;
+  // the count is written as a single byte, so it has to fit into one
+  assert(deltas.size() <= std::numeric_limits<uint8_t>::max());
+  uint8_t n_deltas = deltas.size();
+  ceph::encode(n_deltas, bl);
+  for (auto& delta : deltas) {
+    delta->encode(bl);
+  }
+  return bl;
+}
+
+void OnodeBlock::logical_on_delta_write()
+{
+  // journal submitted to disk, now update the memory
+  apply_pending_changes(true);
+}
+
+// Decode the buffer produced by get_delta() (count byte, then the deltas),
+// stash every decoded delta via mutate() and apply them to the buffer.
+// Expects that no delta is stashed when it is called.
+void OnodeBlock::apply_delta(const ceph::bufferlist &bl)
+{
+  assert(deltas.empty());
+
+  auto p = bl.cbegin();
+  uint8_t n_deltas = 0;
+  ceph::decode(n_deltas, p);
+  for (uint8_t i = 0; i < n_deltas; i++) {
+    delta_t delta;
+    delta.decode(p);
+    mutate(std::move(delta));
+  }
+  apply_pending_changes(true);
+}
+
+// Record a delta. When is_initial_pending() holds the delta is also applied
+// to the buffer right away; in every case it is stashed in `deltas`.
+void OnodeBlock::mutate(delta_t&& d)
+{
+  if (is_initial_pending()) {
+    char* const p = get_bptr().c_str();
+    mutate_func(p, d);
+  }
+  deltas.push_back(std::make_unique<delta_t>(std::move(d)));
+}
+
+// Apply every stashed delta to this block's buffer through mutate_func.
+// Does nothing unless is_mutation_pending() holds.
+// @param do_cleanup drop the stashed deltas once they have been applied
+void OnodeBlock::apply_pending_changes(bool do_cleanup)
+{
+  if (!is_mutation_pending()) {
+    return;
+  }
+  if (share_buffer) {
+    // do a deep copy so i can change my own copy
+    get_bptr() = ceph::bufferptr{get_bptr().c_str(),
+                                 get_bptr().length()};
+    share_buffer = false;
+  }
+  assert(mutate_func);
+  char* const p = get_bptr().c_str();
+  for (auto& delta : deltas) {
+    mutate_func(p, *delta);
+  }
+  if (do_cleanup) {
+    // remove the entries instead of only resetting the pointers: null
+    // entries left in the vector would be dereferenced by the next
+    // get_delta()/apply_pending_changes() and would trip the
+    // deltas.empty() assertion in apply_delta()
+    deltas.clear();
+  }
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.h b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.h
new file mode 100644
index 000000000..0025d9847
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.h
@@ -0,0 +1,65 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include
+#include
+
+#include "crimson/os/seastore/transaction_manager.h"
+#include "onode_delta.h"
+
+namespace crimson::os::seastore {
+
+// TODO s/CachedExtent/LogicalCachedExtent/
+struct OnodeBlock final : LogicalCachedExtent {
+  using Ref = TCachedExtentRef;
+
+  template
+  OnodeBlock(T&&... t) : LogicalCachedExtent(std::forward(t)...) {}
+  OnodeBlock(OnodeBlock&& block) = delete;
+  OnodeBlock(const OnodeBlock& block, CachedExtent::share_buffer_t tag) noexcept
+    : LogicalCachedExtent{block, tag},
+      share_buffer{true}
+  {}
+
+  CachedExtentRef duplicate_for_write() final {
+    return new OnodeBlock{*this, CachedExtent::share_buffer_t{}};
+  }
+
+  // could materialize the pending changes to the underlying buffer here,
+  // but since we write the change to the buffer immediately, let skip
+  // this for now.
+  void prepare_write() final {}
+
+  // queries
+  static constexpr extent_types_t TYPE = extent_types_t::ONODE_BLOCK;
+  extent_types_t get_type() const final {
+    return TYPE;
+  }
+
+  // have to stash all the changes before on_delta_write() is called,
+  // otherwise we could pollute the extent with pending mutations
+  // before the transaction carrying these mutations is committed to
+  // disk
+  ceph::bufferlist get_delta() final;
+  void logical_on_delta_write() final;
+  void apply_delta(const ceph::bufferlist &bl) final;
+
+  void sync() {
+    apply_pending_changes(false);
+  }
+  void mutate(delta_t&& d);
+  using mutate_func_t = std::function;
+  void set_delta_applier(mutate_func_t&& func) {
+    mutate_func = std::move(func);
+  }
+private:
+  // before looking at the extent, we need to make sure the content is up to date
+  void apply_pending_changes(bool do_cleanup);
+  // assuming we don't stash too many deltas to a single block
+  // otherwise a fullwrite op is necessary
+  boost::container::small_vector, 2> deltas;
+  mutate_func_t mutate_func;
+  bool share_buffer = false;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.cc b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.cc
new file mode 100644
index 000000000..869685d45
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.cc
@@ -0,0 +1,188 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "onode_delta.h"
+
+// Move-construct: take over every field of `delta` and turn it into a nop.
+delta_t::delta_t(delta_t&& delta)
+{
+  // the members start from their in-class defaults, so op is nop here
+  assert(op == op_t::nop);
+  op = delta.op;
+  n = delta.n;
+  oid = std::move(delta.oid);
+  // addr is the payload of insert_child; without this copy the child
+  // address silently becomes 0 whenever such a delta is moved
+  addr = delta.addr;
+  onode = std::move(delta.onode);
+  keys = std::move(delta.keys);
+  cells = std::move(delta.cells);
+  delta.op = op_t::nop;
+}
+
+// Move-assign: only a nop delta may be assigned to (asserted); `delta` is
+// turned into a nop afterwards.
+delta_t& delta_t::operator=(delta_t&& delta)
+{
+  assert(op == op_t::nop);
+  op = delta.op;
+  n = delta.n;
+  oid = std::move(delta.oid);
+  // see the move constructor: addr has to travel with the other fields
+  addr = delta.addr;
+  onode = std::move(delta.onode);
+  keys = std::move(delta.keys);
+  cells = std::move(delta.cells);
+  delta.op = op_t::nop;
+  return *this;
+}
+
+// The factories below build a delta for one op and fill in only the fields
+// that op carries.
+
+delta_t delta_t::nop()
+{
+  return delta_t{op_t::nop};
+}
+
+delta_t delta_t::insert_onode(unsigned slot, const ghobject_t& oid, OnodeRef onode)
+{
+  delta_t delta{op_t::insert_onode};
+  delta.n = slot;
+  delta.oid = oid;
+  delta.onode = onode;
+  return delta;
+}
+
+delta_t delta_t::update_onode(unsigned slot, const ghobject_t& oid, OnodeRef onode)
+{
+  delta_t delta{op_t::update_onode};
+  delta.n = slot;
+  delta.oid = oid;
+  delta.onode = onode;
+  return delta;
+}
+
+delta_t delta_t::insert_child(unsigned slot,
+                              const ghobject_t& oid,
+                              crimson::os::seastore::laddr_t addr)
+{
+  delta_t delta{op_t::insert_child};
+  delta.n = slot;
+  delta.oid = oid;
+  delta.addr = addr;
+  return delta;
+}
+
+delta_t delta_t::update_key(unsigned slot, const ghobject_t& oid)
+{
+  delta_t delta{op_t::update_key};
+  delta.n = slot;
+  delta.oid = oid;
+  return delta;
+}
+
+delta_t delta_t::shift_left(unsigned n)
+{
+  delta_t delta{op_t::shift_left};
+  delta.n = n;
+  return delta;
+}
+
+delta_t delta_t::trim_right(unsigned n)
+{
+  delta_t delta{op_t::trim_right};
+  delta.n = n;
+  return delta;
+}
+
+delta_t delta_t::insert_front(ceph::buffer::ptr keys,
+                              ceph::buffer::ptr cells)
+{
+  delta_t delta{op_t::insert_front};
+  delta.keys = std::move(keys);
+  delta.cells = std::move(cells);
+  return delta;
+}
+
+delta_t delta_t::insert_back(ceph::buffer::ptr keys,
+                             ceph::buffer::ptr cells)
+{
+  delta_t delta{op_t::insert_back};
+  delta.keys = std::move(keys);
+  delta.cells = std::move(cells);
+  return delta;
+}
+
+delta_t delta_t::remove_from(unsigned slot)
+{
+  delta_t delta{op_t::remove_from};
+  delta.n = slot;
+  return delta;
+}
+
+// Append this delta to bl: the op code first, then the fields of that op.
+// The field order of every case has to mirror decode().
+void delta_t::encode(ceph::bufferlist& bl)
+{
+  using ceph::encode;
+  // decode() reads the op before anything else, so it is written first
+  encode(op, bl);
+  switch (op) {
+  case op_t::insert_onode:
+    [[fallthrough]];
+  case op_t::update_onode:
+    // the slot # is not encoded, because we can always figure it out
+    // when we have to replay the delta by looking the oid up in the
+    // node block
+    encode(oid, bl);
+    encode(*onode, bl);
+    break;
+  case op_t::insert_child:
+    encode(oid, bl);
+    encode(addr, bl);
+    // must not run into update_key: that would append n and a second
+    // copy of oid to the insert_child record
+    break;
+  case op_t::update_key:
+    encode(n, bl);
+    encode(oid, bl);
+    break;
+  case op_t::shift_left:
+    encode(n, bl);
+    break;
+  case op_t::trim_right:
+    encode(n, bl);
+    break;
+  case op_t::insert_front:
+    [[fallthrough]];
+  case op_t::insert_back:
+    encode(n, bl);
+    encode(keys, bl);
+    encode(cells, bl);
+    break;
+  case op_t::remove_from:
+    encode(n, bl);
+    break;
+  default:
+    assert(0 == "unknown onode op");
+  }
+}
+
+// Read one delta from p: the op code, then the fields encode() wrote for
+// that op, in the same order.
+void delta_t::decode(ceph::bufferlist::const_iterator& p) {
+  using ceph::decode;
+  decode(op, p);
+  switch (op) {
+  case op_t::insert_onode:
+    [[fallthrough]];
+  case op_t::update_onode:
+    decode(oid, p);
+    // NOTE(review): onode is dereferenced here but nothing in this file
+    // allocates it; a default-constructed delta_t presumably holds a null
+    // OnodeRef -- confirm and allocate before decoding into it
+    decode(*onode, p);
+    break;
+  case op_t::insert_child:
+    // mirrors encode(): oid followed by the child address
+    decode(oid, p);
+    decode(addr, p);
+    break;
+  case op_t::update_key:
+    decode(n, p);
+    decode(oid, p);
+    break;
+  case op_t::shift_left:
+    decode(n, p);
+    break;
+  case op_t::trim_right:
+    decode(n, p);
+    break;
+  case op_t::insert_front:
+    [[fallthrough]];
+  case op_t::insert_back:
+    decode(n, p);
+    decode(keys, p);
+    decode(cells, p);
+    break;
+  case op_t::remove_from:
+    decode(n, p);
+    break;
+  default:
+    assert(0 == "unknown onode op");
+  }
+}
diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.h b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.h
new file mode 100644
index 000000000..3e7e7315e
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.h
@@ -0,0 +1,70 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include
+
+#include "common/hobject.h"
+#include "include/buffer_fwd.h"
+
+#include "crimson/os/seastore/onode.h"
+#include "crimson/os/seastore/seastore_types.h"
+
+using crimson::os::seastore::OnodeRef;
+
+struct delta_t {
+  enum class op_t : uint8_t {
+    nop,
+    insert_onode,
+    update_onode,
+    insert_child,
+    update_key,
+    shift_left,
+
trim_right, + insert_front, + insert_back, + remove_from, + // finer grained op? + // - changing the embedded extent map of given oid + // - mutating the embedded xattrs of given oid + } op = op_t::nop; + + unsigned n = 0; + ghobject_t oid; + crimson::os::seastore::laddr_t addr = 0; + OnodeRef onode; + ceph::bufferptr keys; + ceph::bufferptr cells; + + delta_t() = default; + delta_t(op_t op) + : op{op} + {} + delta_t(delta_t&& delta); + delta_t& operator=(delta_t&& delta); + + static delta_t nop(); + static delta_t insert_onode(unsigned slot, const ghobject_t& oid, OnodeRef onode); + static delta_t update_onode(unsigned slot, const ghobject_t& oid, OnodeRef onode); + static delta_t insert_child(unsigned slot, const ghobject_t& oid, crimson::os::seastore::laddr_t addr); + static delta_t update_key(unsigned slot, const ghobject_t& oid); + static delta_t shift_left(unsigned n); + static delta_t trim_right(unsigned n); + static delta_t insert_front(ceph::buffer::ptr keys, + ceph::buffer::ptr cells); + static delta_t insert_back(ceph::buffer::ptr keys, + ceph::buffer::ptr cells); + static delta_t remove_from(unsigned slot); + + // shortcuts + static delta_t insert_item(unsigned slot, const ghobject_t& oid, OnodeRef onode) { + return insert_onode(slot, oid, onode); + } + static delta_t insert_item(unsigned slot, const ghobject_t& oid, crimson::os::seastore::laddr_t addr) { + return insert_child(slot, oid, addr); + } + + void encode(ceph::bufferlist& bl); + void decode(ceph::bufferlist::const_iterator& p); +}; diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.cc b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.cc new file mode 100644 index 000000000..fdcaa2fcb --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.cc @@ -0,0 +1,567 @@ +#include "onode_node.h" + +template +auto node_t::key_at(unsigned slot) const + -> std::pair +{ + auto& key = keys[slot]; + if constexpr (item_in_key) { + return {key, 
key_suffix_t{}}; + } else { + auto p = from_end(key.offset); + return {key, *reinterpret_cast(p)}; + } +} + +// update an existing oid with the specified item +template +ghobject_t +node_t::get_oid_at(unsigned slot, + const ghobject_t& oid) const +{ + auto [prefix, suffix] = key_at(slot); + ghobject_t updated = oid; + prefix.update_oid(updated); + suffix.update_oid(updated); + return updated; +} + +template +auto node_t::item_at(const key_prefix_t& key) const + -> const_item_t +{ + if constexpr (item_in_key) { + return key.child_addr; + } else { + assert(key.offset < BlockSize); + auto p = from_end(key.offset); + auto partial_key = reinterpret_cast(p); + p += size_of(*partial_key); + return *reinterpret_cast(p); + } +} + +template +void node_t::dump(std::ostream& os) const +{ + for (uint16_t i = 0; i < count; i++) { + const auto& [prefix, suffix] = key_at(i); + os << " [" << i << '/' << count - 1 << "]\n" + << " key1 = (" << prefix << ")\n" + << " key2 = (" << suffix << ")\n"; + const auto& item = item_at(prefix); + if (_is_leaf()) { + os << " item = " << item << "\n"; + } else { + os << " child = " << std::hex << item << std::dec << "\n"; + } + } +} + +template +char* node_t::from_end(uint16_t offset) +{ + auto end = reinterpret_cast(this) + BlockSize; + return end - static_cast(offset); +} + +template +const char* node_t::from_end(uint16_t offset) const +{ + auto end = reinterpret_cast(this) + BlockSize; + return end - static_cast(offset); +} + +template +uint16_t node_t::used_space() const +{ + if constexpr (item_in_key) { + return count * sizeof(key_prefix_t); + } else { + if (count) { + return keys[count - 1].offset + count * sizeof(key_prefix_t); + } else { + return 0; + } + } +} + +template +uint16_t node_t::capacity() +{ + auto p = reinterpret_cast(0); + return BlockSize - (reinterpret_cast(p->keys) - + reinterpret_cast(p)); +} + +// TODO: if it's allowed to update 2 siblings at the same time, we can have +// B* tree +template +constexpr uint16_t 
node_t::min_size() +{ + return capacity() / 2; +} + +template +constexpr std::pair +node_t::bytes_to_add(uint16_t size) +{ + assert(size < min_size()); + return {min_size() - size, capacity() - size}; +} + +template +constexpr std::pair +node_t::bytes_to_remove(uint16_t size) +{ + assert(size > capacity()); + return {size - capacity(), size - min_size()}; +} + +template +size_state_t node_t::size_state(uint16_t size) const +{ + if (size > capacity()) { + return size_state_t::overflow; + } else if (size < capacity() / 2) { + return size_state_t::underflow; + } else { + return size_state_t::okay; + } +} + +template +bool node_t::is_underflow(uint16_t size) const +{ + switch (size_state(size)) { + case size_state_t::underflow: + return true; + case size_state_t::okay: + return false; + default: + assert(0); + return false; + } +} + +template +int16_t node_t::size_with_key(unsigned slot, + const ghobject_t& oid) const +{ + if constexpr (item_in_key) { + return capacity(); + } else { + // the size of fixed key does not change + [[maybe_unused]] const auto& [prefix, suffix] = key_at(slot); + return capacity() + key_suffix_t::size_from(oid) - suffix.size(); + } +} + +template +ordering_t node_t::compare_with_slot(unsigned slot, + const ghobject_t& oid) const +{ + const auto& [prefix, suffix] = key_at(slot); + if (auto result = prefix.compare(oid); result != ordering_t::equivalent) { + return result; + } else { + return suffix.compare(oid); + } +} + +/// return the slot number of the first slot that is greater or equal to +/// key +template +std::pair node_t::lower_bound(const ghobject_t& oid) const +{ + unsigned s = 0, e = count; + while (s != e) { + unsigned mid = (s + e) / 2; + switch (compare_with_slot(mid, oid)) { + case ordering_t::less: + s = ++mid; + break; + case ordering_t::greater: + e = mid; + break; + case ordering_t::equivalent: + assert(mid == 0 || mid < count); + return {mid, true}; + } + } + return {s, false}; +} + +template +uint16_t 
node_t::size_of_item(const ghobject_t& oid,
+                     const item_t& item)
+{
+  if constexpr (item_in_key) {
+    return sizeof(key_prefix_t);
+  } else {
+    return (sizeof(key_prefix_t) +
+            key_suffix_t::size_from(oid) + size_of(item));
+  }
+}
+
+template
+bool node_t::is_overflow(const ghobject_t& oid,
+                         const item_t& item) const
+{
+  return free_space() < size_of_item(oid, item);
+}
+
+template
+bool node_t::is_overflow(const ghobject_t& oid,
+                         const OnodeRef& item) const
+{
+  return free_space() < (sizeof(key_prefix_t) + key_suffix_t::size_from(oid) + item->size());
+}
+
+// inserts an item into the given slot, pushing all subsequent keys forward
+// @note if the item is not embedded in key, shift the right half as well
+//
+// When the item is not embedded, every key stores in `offset` the distance
+// from the end of the block to the start of its cell, so a cell inserted at
+// `slot` starts `size` bytes beyond the cell of `slot - 1` (or beyond the end
+// of the block when slot is 0), and all cells from `slot` on move by `size`.
+// The node must have room for the item and slot must not exceed count
+// (both asserted).
+template
+void node_t::insert_at(unsigned slot,
+                       const ghobject_t& oid,
+                       const item_t& item)
+{
+  assert(!is_overflow(oid, item));
+  assert(slot <= count);
+  if constexpr (item_in_key) {
+    // shift the keys right
+    key_prefix_t* key = keys + slot;
+    key_prefix_t* last_key = keys + count;
+    std::copy_backward(key, last_key, last_key + 1);
+    key->set(oid, item);
+  } else {
+    const uint16_t size = key_suffix_t::size_from(oid) + size_of(item);
+    uint16_t offset = size;
+    if (slot > 0) {
+      offset += keys[slot - 1].offset;
+    }
+    if (slot < count) {
+      // V
+      // | |... // ...|//////|| |
+      // | |... // ...|//////| | |
+      // shift the partial keys and items left
+      // the cells to move are bounded by the cell of slot - 1; when
+      // inserting at slot 0 there is no such cell and they reach up to the
+      // end of the block, so do not read keys[slot - 1] in that case
+      const uint16_t first = slot > 0 ? keys[slot - 1].offset : 0;
+      auto last = keys[count - 1].offset;
+      std::memmove(from_end(last + size), from_end(last), last - first);
+      // shift the keys right and update the pointers
+      for (key_prefix_t* dst = keys + count; dst > keys + slot; dst--) {
+        key_prefix_t* src = dst - 1;
+        *dst = *src;
+        dst->offset += size;
+      }
+    }
+    keys[slot].set(oid, offset);
+    auto p = from_end(offset);
+    auto partial_key = reinterpret_cast(p);
+    partial_key->set(oid);
+    p += size_of(*partial_key);
+    auto item_ptr = reinterpret_cast(p);
+    *item_ptr = item;
+  }
+  count++;
+  assert(used_space() <= capacity());
+}
+
+// used by InnerNode for updating the keys indexing its children when their lower boundaries
+// is updated
+template
+void node_t::update_key_at(unsigned slot, const ghobject_t& oid)
+{
+  if constexpr (is_leaf()) {
+    assert(0);
+  } else if constexpr (item_in_key) {
+    keys[slot].update(oid);
+  } else {
+    const auto& [prefix, suffix] = key_at(slot);
+    int16_t delta = key_suffix_t::size_from(oid) - suffix.size();
+    if (delta > 0) {
+      // shift the cells sitting at its left side
+      auto first = keys[slot].offset;
+      auto last = keys[count - 1].offset;
+      std::memmove(from_end(last + delta), from_end(last), last - first);
+      // update the pointers
+      for (key_prefix_t* key = keys + slot; key < keys + count; key++) {
+        key->offset += delta;
+      }
+    }
+    keys[slot].update(oid);
+    auto p = from_end(keys[slot].offset);
+    auto partial_key = reinterpret_cast(p);
+    partial_key->set(oid);
+    // we don't update item here
+  }
+}
+
+template
+std::pair
+node_t::calc_grab_front(uint16_t min_grab,
+                        uint16_t max_grab) const
+{
+  // TODO: split by likeness
+  uint16_t grabbed = 0;
+  uint16_t used = used_space();
+  int n = 0;
+  for (; n < count; n++) {
+    const auto& [prefix, suffix] = key_at(n);
+    uint16_t to_grab = sizeof(prefix) + size_of(suffix);
+    if constexpr (!item_in_key) {
+      const auto& item = item_at(prefix);
+      to_grab
+= size_of(item); + } + if (grabbed + to_grab > max_grab) { + break; + } + grabbed += to_grab; + } + if (grabbed >= min_grab) { + if (n == count) { + return {n, grabbed}; + } else if (!is_underflow(used - grabbed)) { + return {n, grabbed}; + } + } + return {0, 0}; +} + +template +std::pair +node_t::calc_grab_back(uint16_t min_grab, + uint16_t max_grab) const +{ + // TODO: split by likeness + uint16_t grabbed = 0; + uint16_t used = used_space(); + for (int i = count - 1; i >= 0; i--) { + const auto& [prefix, suffix] = key_at(i); + uint16_t to_grab = sizeof(prefix) + size_of(suffix); + if constexpr (!item_in_key) { + const auto& item = item_at(prefix); + to_grab += size_of(item); + } + grabbed += to_grab; + if (is_underflow(used - grabbed)) { + return {0, 0}; + } else if (grabbed > max_grab) { + return {0, 0}; + } else if (grabbed >= min_grab) { + return {i + 1, grabbed}; + } + } + return {0, 0}; +} + +template +template +void node_t::grab_from_left(node_t& left, + unsigned n, uint16_t bytes, + Mover& mover) +{ + // TODO: rebuild keys if moving across different layouts + // group by likeness + shift_right(n, bytes); + mover.move_from(left.count - n, 0, n); +} + +template +template +delta_t node_t::acquire_right(node_t& right, + unsigned whoami, Mover& mover) +{ + mover.move_from(0, count, right.count); + return mover.to_delta(); +} + +template +template +void node_t::grab_from_right(node_t& right, + unsigned n, uint16_t bytes, + Mover& mover) +{ + mover.move_from(0, count, n); + right.shift_left(n, 0); +} + +template +template +void node_t::push_to_left(node_t& left, + unsigned n, uint16_t bytes, + Mover& mover) +{ + left.grab_from_right(*this, n, bytes, mover); +} + +template +template +void node_t::push_to_right(node_t& right, + unsigned n, uint16_t bytes, + Mover& mover) +{ + right.grab_from_left(*this, n, bytes, mover); +} + +// [to, from) are removed, so we need to shift left +// actually there are only two use cases: +// - to = 0: for giving elements in bulk 
+// - to = from - 1: for removing a single element +// old: |////|.....| |.....|/|........| +// new: |.....| |.....||........| +template +void node_t::shift_left(unsigned from, unsigned to) +{ + assert(from < count); + assert(to < from); + if constexpr (item_in_key) { + std::copy(keys + from, keys + count, keys + to); + } else { + const uint16_t cell_hi = keys[count - 1].offset; + const uint16_t cell_lo = keys[from - 1].offset; + const uint16_t offset_delta = keys[from].offset - keys[to].offset; + for (auto src_key = keys + from, dst_key = keys + to; + src_key != keys + count; + ++src_key, ++dst_key) { + // shift the keys left + *dst_key = *src_key; + // update the pointers + dst_key->offset -= offset_delta; + } + // and cells + auto dst = from_end(cell_hi); + std::memmove(dst + offset_delta, dst, cell_hi - cell_lo); + } + count -= (from - to); +} + +template +void node_t::insert_front(const ceph::bufferptr& keys_buf, + const ceph::bufferptr& cells_buf) +{ + unsigned n = keys_buf.length() / sizeof(key_prefix_t); + shift_right(n, cells_buf.length()); + keys_buf.copy_out(0, keys_buf.length(), reinterpret_cast(keys)); + if constexpr (item_in_key) { + assert(cells_buf.length() == 0); + } else { + cells_buf.copy_out(0, cells_buf.length(), from_end(keys[n - 1].offset)); + } +} + +template +void node_t::insert_back(const ceph::bufferptr& keys_buf, + const ceph::bufferptr& cells_buf) +{ + keys_buf.copy_out(0, keys_buf.length(), reinterpret_cast(keys + count)); + count += keys_buf.length() / sizeof(key_prefix_t); + if constexpr (item_in_key) { + assert(cells_buf.length() == 0); + } else { + cells_buf.copy_out(0, cells_buf.length(), from_end(keys[count - 1].offset)); + } +} + +// one or more elements are inserted, so we need to shift the elements right +// actually there are only two use cases: +// - bytes != 0: for inserting bytes before from +// - bytes = 0: for inserting a single element before from +// old: ||.....| +// new: |/////|.....| +template +void 
node_t::shift_right(unsigned n, unsigned bytes)
+{
+  assert(bytes + used_space() < capacity());
+  // shift the keys right
+  std::copy_backward(keys, keys + count, keys + count + n);
+  count += n;
+  if constexpr (!item_in_key) {
+    uint16_t cells = keys[count - 1].offset;
+    // copy the partial keys and items
+    std::memmove(from_end(cells + bytes), from_end(cells), cells);
+    // update the pointers
+    for (auto key = keys + n; key < keys + count; ++key) {
+      key->offset += bytes;
+    }
+  }
+}
+
+// shift all keys after slot is removed.
+// @note if the item is not embedded in key, all items sitting at the left
+//       side of it will be shifted right
+template
+void node_t::remove_from(unsigned slot)
+{
+  assert(slot < count);
+  if (unsigned next = slot + 1; next < count) {
+    shift_left(next, slot);
+  } else {
+    // slot is the last one
+    count--;
+  }
+}
+
+template
+void node_t::trim_right(unsigned n)
+{
+  count = n;
+}
+
+// Apply a single recorded delta to this node by dispatching on delta.op.
+// insert_onode is only valid on a leaf node, insert_child and update_key
+// only on an inner node; the wrong combination throws
+// std::invalid_argument. For the two insert ops the slot is looked up from
+// delta.oid and the oid must not be present yet (asserted).
+template
+void node_t::play_delta(const delta_t& delta)
+{
+  switch (delta.op) {
+  case delta_t::op_t::insert_onode:
+    if constexpr (is_leaf()) {
+      auto [slot, found] = lower_bound(delta.oid);
+      assert(!found);
+      assert(delta.onode->size() <= std::numeric_limits::max());
+      ceph::bufferptr buf{static_cast(delta.onode->size())};
+      delta.onode->encode(buf.c_str(), buf.length());
+      auto onode = reinterpret_cast(buf.c_str());
+      return insert_at(slot, delta.oid, *onode);
+    } else {
+      throw std::invalid_argument("wrong node type");
+    }
+  case delta_t::op_t::update_onode:
+    // TODO
+    assert(0 == "not implemented");
+    break;
+  case delta_t::op_t::insert_child:
+    if constexpr (is_leaf()) {
+      throw std::invalid_argument("wrong node type");
+    } else {
+      auto [slot, found] = lower_bound(delta.oid);
+      assert(!found);
+      // return here like every other case does; otherwise control runs
+      // into update_key below and rewrites the key at delta.n as well
+      return insert_at(slot, delta.oid, delta.addr);
+    }
+  case delta_t::op_t::update_key:
+    if constexpr (is_leaf()) {
+      throw std::invalid_argument("wrong node type");
+    } else {
+      return update_key_at(delta.n, delta.oid);
+    }
+  case delta_t::op_t::shift_left:
+    return shift_left(delta.n, 0);
+  case delta_t::op_t::trim_right:
+    return trim_right(delta.n);
+  case delta_t::op_t::insert_front:
+    return insert_front(delta.keys, delta.cells);
+  case delta_t::op_t::insert_back:
+    return insert_back(delta.keys, delta.cells);
+  case delta_t::op_t::remove_from:
+    return remove_from(delta.n);
+  default:
+    assert(0 == "unknown onode delta");
+  }
+}
+
+// explicit instantiate the node_t classes used by test_node.cc
+template class node_t<512, 0, ntype_t::inner>;
+template class node_t<512, 0, ntype_t::leaf>;
+template class node_t<512, 1, ntype_t::inner>;
+template class node_t<512, 1, ntype_t::leaf>;
+template class node_t<512, 2, ntype_t::inner>;
+template class node_t<512, 2, ntype_t::leaf>;
+template class node_t<512, 3, ntype_t::inner>;
+template class node_t<512, 3, ntype_t::leaf>;
diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.h b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.h
new file mode 100644
index 000000000..d833a6682
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.h
@@ -0,0 +1,942 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include
+#include
+#include
+#include
+
+#include "common/hobject.h"
+#include "crimson/common/layout.h"
+#include "crimson/os/seastore/onode.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "onode_delta.h"
+
+namespace asci = absl::container_internal;
+
+namespace boost::beast {
+  template
+  bool operator==(const span& lhs, const span& rhs) {
+    return std::equal(
+      lhs.begin(), lhs.end(),
+      rhs.begin(), rhs.end());
+  }
+}
+
+// on-disk onode
+// it only keeps the bits necessary to rebuild an in-memory onode
+struct [[gnu::packed]] onode_t {
+  onode_t& operator=(const onode_t& onode) {
+    len = onode.len;
+    std::memcpy(data, onode.data, len);
+    return *this;
+  }
+  size_t size() const {
+    return sizeof(*this) + len;
+  }
+  OnodeRef decode() const
{ + return new crimson::os::seastore::Onode(std::string_view{data, len}); + } + uint8_t struct_v = 1; + uint8_t struct_compat = 1; + // TODO: + // - use uint16_t for length, as the size of an onode should be less + // than a block (16K for now) + // - drop struct_len + uint32_t struct_len = 0; + uint32_t len; + char data[]; +}; + +static inline std::ostream& operator<<(std::ostream& os, const onode_t& onode) { + return os << *onode.decode(); +} + +using crimson::os::seastore::laddr_t; + +struct [[gnu::packed]] child_addr_t { + laddr_t data; + child_addr_t(laddr_t data) + : data{data} + {} + child_addr_t& operator=(laddr_t addr) { + data = addr; + return *this; + } + laddr_t get() const { + return data; + } + operator laddr_t() const { + return data; + } + size_t size() const { + return sizeof(laddr_t); + } +}; + +// poor man's operator<=> +enum class ordering_t { + less, + equivalent, + greater, +}; + +template +ordering_t compare_element(const L& x, const R& y) +{ + if constexpr (std::is_arithmetic_v) { + static_assert(std::is_arithmetic_v); + if (x < y) { + return ordering_t::less; + } else if (x > y) { + return ordering_t::greater; + } else { + return ordering_t::equivalent; + } + } else { + // string_view::compare(), string::compare(), ... 
+ auto result = x.compare(y); + if (result < 0) { + return ordering_t::less; + } else if (result > 0) { + return ordering_t::greater; + } else { + return ordering_t::equivalent; + } + } +} + +template +constexpr ordering_t tuple_cmp(const L&, const R&, std::index_sequence<>) +{ + return ordering_t::equivalent; +} + +template +constexpr ordering_t tuple_cmp(const L& x, const R& y, + std::index_sequence) +{ + auto ordering = compare_element(std::get(x), std::get(y)); + if (ordering != ordering_t::equivalent) { + return ordering; + } else { + return tuple_cmp(x, y, std::index_sequence()); + } +} + +template +constexpr ordering_t cmp(const std::tuple& x, + const std::tuple& y) +{ + static_assert(sizeof...(Ls) == sizeof...(Rs)); + return tuple_cmp(x, y, std::index_sequence_for()); +} + +enum class likes_t { + yes, + no, + maybe, +}; + +struct [[gnu::packed]] variable_key_suffix { + uint64_t snap; + uint64_t gen; + uint8_t nspace_len; + uint8_t name_len; + char data[]; + struct index_t { + enum { + nspace_data = 0, + name_data = 1, + }; + }; + using layout_type = asci::Layout; + layout_type cell_layout() const { + return layout_type{nspace_len, name_len}; + } + void set(const ghobject_t& oid) { + snap = oid.hobj.snap; + gen = oid.generation; + nspace_len = oid.hobj.nspace.size(); + name_len = oid.hobj.oid.name.size(); + auto layout = cell_layout(); + std::memcpy(layout.Pointer(data), + oid.hobj.nspace.data(), oid.hobj.nspace.size()); + std::memcpy(layout.Pointer(data), + oid.hobj.oid.name.data(), oid.hobj.oid.name.size()); + } + + void update_oid(ghobject_t& oid) const { + oid.hobj.snap = snap; + oid.generation = gen; + oid.hobj.nspace = nspace(); + oid.hobj.oid.name = name(); + } + + variable_key_suffix& operator=(const variable_key_suffix& key) { + snap = key.snap; + gen = key.gen; + // take over the source's lengths before building the layout: cell_layout() + // is computed from this object's nspace_len and name_len, exactly as set() + // assigns them before copying the bytes + nspace_len = key.nspace_len; + name_len = key.name_len; + auto layout = cell_layout(); + auto nspace = key.nspace(); + std::copy_n(nspace.data(), nspace.size(), + layout.Pointer(data)); + auto name = key.name(); + std::copy_n(name.data(), 
name.size(), + layout.Pointer(data)); + return *this; + } + const std::string_view nspace() const { + auto layout = cell_layout(); + auto nspace = layout.Slice(data); + return {nspace.data(), nspace.size()}; + } + const std::string_view name() const { + auto layout = cell_layout(); + auto name = layout.Slice(data); + return {name.data(), name.size()}; + } + size_t size() const { + return sizeof(*this) + nspace_len + name_len; + } + static size_t size_from(const ghobject_t& oid) { + return (sizeof(variable_key_suffix) + + oid.hobj.nspace.size() + + oid.hobj.oid.name.size()); + } + ordering_t compare(const ghobject_t& oid) const { + return cmp(std::tie(nspace(), name(), snap, gen), + std::tie(oid.hobj.nspace, oid.hobj.oid.name, oid.hobj.snap.val, + oid.generation)); + } + bool likes(const variable_key_suffix& key) const { + return nspace() == key.nspace() && name() == key.name(); + } +}; + +static inline std::ostream& operator<<(std::ostream& os, const variable_key_suffix& k) { + if (k.snap != CEPH_NOSNAP) { + os << "s" << k.snap << ","; + } + if (k.gen != ghobject_t::NO_GEN) { + os << "g" << k.gen << ","; + } + return os << k.nspace() << "/" << k.name(); +} + +// should use [[no_unique_address]] in C++20 +struct empty_key_suffix { + static constexpr ordering_t compare(const ghobject_t&) { + return ordering_t::equivalent; + } + static void set(const ghobject_t&) {} + static constexpr size_t size() { + return 0; + } + static size_t size_from(const ghobject_t&) { + return 0; + } + static void update_oid(ghobject_t&) {} +}; + +static inline std::ostream& operator<<(std::ostream& os, const empty_key_suffix&) +{ + return os; +} + +enum class ntype_t : uint8_t { + leaf = 0u, + inner, +}; + +constexpr ntype_t flip_ntype(ntype_t ntype) noexcept +{ + if (ntype == ntype_t::leaf) { + return ntype_t::inner; + } else { + return ntype_t::leaf; + } +} + +template +struct FixedKeyPrefix {}; + +template +struct FixedKeyPrefix<0, NodeType> +{ + static constexpr bool item_in_key = 
false; + int8_t shard = -1; + int64_t pool = -1; + uint32_t hash = 0; + uint16_t offset = 0; + + FixedKeyPrefix() = default; + FixedKeyPrefix(const ghobject_t& oid, uint16_t offset) + : shard{oid.shard_id}, + pool{oid.hobj.pool}, + hash{oid.hobj.get_hash()}, + offset{offset} + {} + + void set(const ghobject_t& oid, uint16_t new_offset) { + shard = oid.shard_id; + pool = oid.hobj.pool; + hash = oid.hobj.get_hash(); + offset = new_offset; + } + + void set(const FixedKeyPrefix& k, uint16_t new_offset) { + shard = k.shard; + pool = k.pool; + hash = k.hash; + offset = new_offset; + } + + void update(const ghobject_t& oid) { + shard = oid.shard_id; + pool = oid.hobj.pool; + hash = oid.hobj.get_hash(); + } + + void update_oid(ghobject_t& oid) const { + oid.set_shard(shard_id_t{shard}); + oid.hobj.pool = pool; + oid.hobj.set_hash(hash); + } + + ordering_t compare(const ghobject_t& oid) const { + // so std::tie() can bind them by reference + int8_t rhs_shard = oid.shard_id; + uint32_t rhs_hash = oid.hobj.get_hash(); + return cmp(std::tie(shard, pool, hash), + std::tie(rhs_shard, oid.hobj.pool, rhs_hash)); + } + // @return true if i likes @c k, we will can be pushed down to next level + // in the same node + likes_t likes(const FixedKeyPrefix& k) const { + if (shard == k.shard && pool == k.pool) { + return likes_t::yes; + } else { + return likes_t::no; + } + } +}; + +template +std::ostream& operator<<(std::ostream& os, const FixedKeyPrefix<0, NodeType>& k) { + if (k.shard != shard_id_t::NO_SHARD) { + os << "s" << k.shard; + } + return os << "p=" << k.pool << "," + << "h=" << std::hex << k.hash << std::dec << "," + << ">" << k.offset; +} + +// all elements in this node share the same +template +struct FixedKeyPrefix<1, NodeType> { + static constexpr bool item_in_key = false; + uint32_t hash = 0; + uint16_t offset = 0; + + FixedKeyPrefix() = default; + FixedKeyPrefix(uint32_t hash, uint16_t offset) + : hash{hash}, + offset{offset} + {} + FixedKeyPrefix(const ghobject_t& oid, 
uint16_t offset) + : FixedKeyPrefix(oid.hobj.get_hash(), offset) + {} + void set(const ghobject_t& oid, uint16_t new_offset) { + hash = oid.hobj.get_hash(); + offset = new_offset; + } + template + void set(const FixedKeyPrefix& k, uint16_t new_offset) { + static_assert(N < 2, "only N0, N1 have hash"); + hash = k.hash; + offset = new_offset; + } + void update_oid(ghobject_t& oid) const { + oid.hobj.set_hash(hash); + } + void update(const ghobject_t& oid) { + hash = oid.hobj.get_hash(); + } + ordering_t compare(const ghobject_t& oid) const { + return compare_element(hash, oid.hobj.get_hash()); + } + likes_t likes(const FixedKeyPrefix& k) const { + return hash == k.hash ? likes_t::yes : likes_t::no; + } +}; + +template +std::ostream& operator<<(std::ostream& os, const FixedKeyPrefix<1, NodeType>& k) { + return os << "0x" << std::hex << k.hash << std::dec << "," + << ">" << k.offset; +} + +// all elements in this node must share the same +template +struct FixedKeyPrefix<2, NodeType> { + static constexpr bool item_in_key = false; + uint16_t offset = 0; + + FixedKeyPrefix() = default; + + static constexpr ordering_t compare(const ghobject_t& oid) { + // need to compare the cell + return ordering_t::equivalent; + } + // always defer to my cell for likeness + constexpr likes_t likes(const FixedKeyPrefix&) const { + return likes_t::maybe; + } + void set(const ghobject_t&, uint16_t new_offset) { + offset = new_offset; + } + template + void set(const FixedKeyPrefix&, uint16_t new_offset) { + offset = new_offset; + } + void update(const ghobject_t&) {} + void update_oid(ghobject_t&) const {} +}; + +template +std::ostream& operator<<(std::ostream& os, const FixedKeyPrefix<2, NodeType>& k) { + return os << ">" << k.offset; +} + +struct fixed_key_3 { + uint64_t snap = 0; + uint64_t gen = 0; + + fixed_key_3() = default; + fixed_key_3(const ghobject_t& oid) + : snap{oid.hobj.snap}, gen{oid.generation} + {} + ordering_t compare(const ghobject_t& oid) const { + return 
cmp(std::tie(snap, gen), + std::tie(oid.hobj.snap.val, oid.generation)); + } + // no object likes each other at this level + constexpr likes_t likes(const fixed_key_3&) const { + return likes_t::no; + } + void update_with_oid(const ghobject_t& oid) { + snap = oid.hobj.snap; + gen = oid.generation; + } + void update_oid(ghobject_t& oid) const { + oid.hobj.snap = snap; + oid.generation = gen; + } +}; + +static inline std::ostream& operator<<(std::ostream& os, const fixed_key_3& k) { + if (k.snap != CEPH_NOSNAP) { + os << "s" << k.snap << ","; + } + if (k.gen != ghobject_t::NO_GEN) { + os << "g" << k.gen << ","; + } + return os; +} + +// all elements in this node must share the same +// but the unlike other FixedKeyPrefix<>, a node with FixedKeyPrefix<3> does not have +// variable_sized_key, so if it is an inner node, we can just embed the child +// addr right in the key. +template<> +struct FixedKeyPrefix<3, ntype_t::inner> : public fixed_key_3 { + // the item is embedded in the key + static constexpr bool item_in_key = true; + laddr_t child_addr = 0; + + FixedKeyPrefix() = default; + void set(const ghobject_t& oid, laddr_t new_child_addr) { + update_with_oid(oid); + child_addr = new_child_addr; + } + // unlikely get called, though.. 
+ void update(const ghobject_t& oid) {} + template + std::enable_if_t set(const FixedKeyPrefix&, + laddr_t new_child_addr) { + child_addr = new_child_addr; + } + void set(const FixedKeyPrefix& k, laddr_t new_child_addr) { + snap = k.snap; + gen = k.gen; + child_addr = new_child_addr; + } + void set(const variable_key_suffix& k, laddr_t new_child_addr) { + snap = k.snap; + gen = k.gen; + child_addr = new_child_addr; + } +}; + +template<> +struct FixedKeyPrefix<3, ntype_t::leaf> : public fixed_key_3 { + static constexpr bool item_in_key = false; + uint16_t offset = 0; + + FixedKeyPrefix() = default; + void set(const ghobject_t& oid, uint16_t new_offset) { + update_with_oid(oid); + offset = new_offset; + } + void set(const FixedKeyPrefix& k, uint16_t new_offset) { + snap = k.snap; + gen = k.gen; + offset = new_offset; + } + template + std::enable_if_t set(const FixedKeyPrefix&, + uint16_t new_offset) { + offset = new_offset; + } +}; + +struct tag_t { + template + static constexpr tag_t create() { + static_assert(std::clamp(N, 0, 3) == N); + return tag_t{N, static_cast(node_type)}; + } + bool is_leaf() const { + return type() == ntype_t::leaf; + } + int layout() const { + return layout_type; + } + ntype_t type() const { + return ntype_t{node_type}; + } + int layout_type : 4; + uint8_t node_type : 4; +}; + +static inline std::ostream& operator<<(std::ostream& os, const tag_t& tag) { + return os << "n=" << tag.layout() << ", leaf=" << tag.is_leaf(); +} + +// for calculating size of variable-sized item/key +template +size_t size_of(const T& t) { + using decayed_t = std::decay_t; + if constexpr (std::is_scalar_v) { + return sizeof(decayed_t); + } else { + return t.size(); + } +} + +enum class size_state_t { + okay, + underflow, + overflow, +}; + +// layout of a node of B+ tree +// +// it is different from a typical B+ tree in following ways +// - the size of keys is not necessarily fixed, neither is the size of value. 
+// - the max number of elements in a node is determined by the total size of +// the keys and values in the node +// - in internal nodes, each key maps to the logical address of the child +// node whose minimum key is greater or equal to that key. +template +struct node_t { + static_assert(std::clamp(N, 0, 3) == N); + constexpr static ntype_t node_type = NodeType; + constexpr static int node_n = N; + + using key_prefix_t = FixedKeyPrefix; + using item_t = std::conditional_t; + using const_item_t = std::conditional_t; + static constexpr bool item_in_key = key_prefix_t::item_in_key; + using key_suffix_t = std::conditional_t; + + std::pair + key_at(unsigned slot) const; + + // update an existing oid with the specified item + ghobject_t get_oid_at(unsigned slot, const ghobject_t& oid) const; + const_item_t item_at(const key_prefix_t& key) const; + void dump(std::ostream& os) const; + + // for debugging only. + static constexpr bool is_leaf() { + return node_type == ntype_t::leaf; + } + + bool _is_leaf() const { + return tag.is_leaf(); + } + + char* from_end(uint16_t offset); + const char* from_end(uint16_t offset) const; + uint16_t used_space() const; + uint16_t free_space() const { + return capacity() - used_space(); + } + static uint16_t capacity(); + // TODO: if it's allowed to update 2 siblings at the same time, we can have + // B* tree + static constexpr uint16_t min_size(); + + + // calculate the allowable bounds on bytes to remove from an overflow node + // with specified size + // @param size the overflowed size + // @return + static constexpr std::pair bytes_to_remove(uint16_t size); + + // calculate the allowable bounds on bytes to add to an underflow node + // with specified size + // @param size the underflowed size + // @return + static constexpr std::pair bytes_to_add(uint16_t size); + + size_state_t size_state(uint16_t size) const; + bool is_underflow(uint16_t size) const; + int16_t size_with_key(unsigned slot, const ghobject_t& oid) const; + ordering_t 
compare_with_slot(unsigned slot, const ghobject_t& oid) const; + /// return the slot number of the first slot that is greater or equal to + /// key + std::pair lower_bound(const ghobject_t& oid) const; + static uint16_t size_of_item(const ghobject_t& oid, const item_t& item); + bool is_overflow(const ghobject_t& oid, const item_t& item) const; + bool is_overflow(const ghobject_t& oid, const OnodeRef& item) const; + + // inserts an item into the given slot, pushing all subsequent keys forward + // @note if the item is not embedded in key, shift the right half as well + void insert_at(unsigned slot, const ghobject_t& oid, const item_t& item); + // used by InnerNode for updating the keys indexing its children when their lower boundaries + // is updated + void update_key_at(unsigned slot, const ghobject_t& oid); + // try to figure out the number of elements and total size when trying to + // rebalance by moving the elements from the front of this node when its + // left sibling node is underflow + // + // @param min_grab lower bound of the number of bytes to move + // @param max_grab upper bound of the number of bytes to move + // @return the number of element to grab + // @note return {0, 0} if current node would be underflow if + // @c min_grab bytes of elements are taken from it + std::pair calc_grab_front(uint16_t min_grab, uint16_t max_grab) const; + // try to figure out the number of elements and their total size when trying to + // rebalance by moving the elements from the end of this node when its right + // sibling node is underflow + // + // @param min_grab lower bound of the number of bytes to move + // @param max_grab upper bound of the number of bytes to move + // @return the number of element to grab + // @note return {0, 0} if current node would be underflow if + // @c min_grab bytes of elements are taken from it + std::pair calc_grab_back(uint16_t min_grab, uint16_t max_grab) const; + template void grab_from_left( + node_t& left, + unsigned n, uint16_t 
bytes, + Mover& mover); + template + delta_t acquire_right(node_t& right, + unsigned whoami, Mover& mover); + // transfer n elements at the front of given node to me + template + void grab_from_right(node_t& right, + unsigned n, uint16_t bytes, + Mover& mover); + template + void push_to_left(node_t& left, + unsigned n, uint16_t bytes, + Mover& mover); + template + void push_to_right(node_t& right, + unsigned n, uint16_t bytes, + Mover& mover); + // [to, from) are removed, so we need to shift left + // actually there are only two use cases: + // - to = 0: for giving elements in bulk + // - to = from - 1: for removing a single element + // old: |////|.....| |.....|/|........| + // new: |.....| |.....||........| + void shift_left(unsigned from, unsigned to); + void insert_front(const ceph::bufferptr& keys_buf, const ceph::bufferptr& cells_buf); + void insert_back(const ceph::bufferptr& keys_buf, const ceph::bufferptr& cells_buf); + // one or more elements are inserted, so we need to shift the elements right + // actually there are only two use cases: + // - bytes != 0: for inserting bytes before from + // - bytes = 0: for inserting a single element before from + // old: ||.....| + // new: |/////|.....| + void shift_right(unsigned n, unsigned bytes); + // shift all keys after slot is removed. + // @note if the item is not embdedded in key, all items sitting at the left + // side of it will be shifted right + void remove_from(unsigned slot); + void trim_right(unsigned n); + void play_delta(const delta_t& delta); + // /-------------------------------| + // | V + // |header|k0|k1|k2|... 
| / / |k2'v2|k1'v1|k0'.v0| v_m | + // |<-- count -->| + tag_t tag = tag_t::create(); + // the count of values in the node + uint16_t count = 0; + key_prefix_t keys[]; +}; + +template +class EntryMover { +public: + // a "trap" mover + EntryMover(const parent_t&, from_t&, to_t& dst, unsigned) { + assert(0); + } + void move_from(unsigned, unsigned, unsigned) { + assert(0); + } + delta_t get_delta() { + return delta_t::nop(); + } +}; + +// lower the layout, for instance, from L0 to L1, no reference oid is used +template +class EntryMover> +{ +public: + EntryMover(const parent_t&, from_t& src, to_t& dst, unsigned) + : src{src}, dst{dst} + {} + void move_from(unsigned src_first, unsigned dst_first, unsigned n) + { + ceph::bufferptr keys_buf{n * sizeof(to_t::key_prefix_t)}; + ceph::bufferptr cells_buf; + auto dst_keys = reinterpret_cast(keys_buf.c_str()); + if constexpr (to_t::item_in_key) { + for (unsigned i = 0; i < n; i++) { + const auto& [prefix, suffix] = src.key_at(src_first + i); + dst_keys[i].set(suffix, src.item_at(prefix)); + } + } else { + // copy keys + uint16_t src_offset = src_first > 0 ? src.keys[src_first - 1].offset : 0; + uint16_t dst_offset = dst_first > 0 ? dst.keys[dst_first - 1].offset : 0; + for (unsigned i = 0; i < n; i++) { + auto& src_key = src.keys[src_first + i]; + uint16_t offset = src_key.offset - src_offset + dst_offset; + dst_keys[i].set(src_key, offset); + } + // copy cells in bulk, yay! 
+ auto src_end = src.keys[src_first + n - 1].offset; + uint16_t total_cell_size = src_end - src_offset; + cells_buf = ceph::bufferptr{total_cell_size}; + cells_buf.copy_in(0, total_cell_size, src.from_end(src_end)); + } + if (dst_first == dst.count) { + dst_delta = delta_t::insert_back(keys_buf, cells_buf); + } else { + dst_delta = delta_t::insert_front(keys_buf, cells_buf); + } + if (src_first > 0 && src_first + n == src.count) { + src_delta = delta_t::trim_right(src_first); + } else if (src_first == 0 && n < src.count) { + src_delta = delta_t::shift_left(n); + } else if (src_first == 0 && n == src.count) { + // the caller will retire the src extent + } else { + // grab in the middle? + assert(0); + } + } + + delta_t from_delta() { + return std::move(src_delta); + } + delta_t to_delta() { + return std::move(dst_delta); + } +private: + const from_t& src; + const to_t& dst; + delta_t dst_delta; + delta_t src_delta; +}; + +// lift the layout, for instance, from L2 to L0, need a reference oid +template +class EntryMover to_t::node_n)>> +{ +public: + EntryMover(const parent_t& parent, from_t& src, to_t& dst, unsigned from_slot) + : src{src}, dst{dst}, ref_oid{parent->get_oid_at(from_slot, {})} + {} + void move_from(unsigned src_first, unsigned dst_first, unsigned n) + { + ceph::bufferptr keys_buf{n * sizeof(to_t::key_prefix_t)}; + ceph::bufferptr cells_buf; + auto dst_keys = reinterpret_cast(keys_buf.c_str()); + uint16_t in_node_offset = dst_first > 0 ? 
dst.keys[dst_first - 1].offset : 0; + static_assert(!std::is_same_v); + // copy keys + uint16_t buf_offset = 0; + for (unsigned i = 0; i < n; i++) { + auto& src_key = src.keys[src_first + i]; + if constexpr (std::is_same_v) { + // heterogeneous partial key, have to rebuild dst partial key from oid + src_key.update_oid(ref_oid); + const auto& src_item = src.item_at(src_key); + size_t key2_size = to_t::key_suffix_t::size_from(ref_oid); + buf_offset += key2_size + size_of(src_item); + dst_keys[i].set(ref_oid, in_node_offset + buf_offset); + auto p = from_end(cells_buf, buf_offset); + auto partial_key = reinterpret_cast(p); + partial_key->set(ref_oid); + p += key2_size; + auto dst_item = reinterpret_cast(p); + *dst_item = src_item; + } else { + // homogeneous partial key, just update the pointers + uint16_t src_offset = src_first > 0 ? src.keys[src_first - 1].offset : 0; + uint16_t dst_offset = dst_first > 0 ? dst.keys[dst_first - 1].offset : 0; + uint16_t offset = src_key.offset - src_offset + dst_offset; + dst_keys[i].set(ref_oid, in_node_offset + offset); + } + } + if constexpr (std::is_same_v) { + // copy cells in bulk, yay! + uint16_t src_offset = src_first > 0 ? 
src.keys[src_first - 1].offset : 0; + uint16_t src_end = src.keys[src_first + n - 1].offset; + uint16_t total_cell_size = src_end - src_offset; + // cells_buf is default-constructed in this function and never sized; + // allocate it before copy_in(), as the other EntryMover specializations do + cells_buf = ceph::bufferptr{total_cell_size}; + cells_buf.copy_in(0, total_cell_size, src.from_end(src_end)); + } + if (dst_first == dst.count) { + dst_delta = delta_t::insert_back(keys_buf, cells_buf); + } else { + dst_delta = delta_t::insert_front(keys_buf, cells_buf); + } + if (src_first + n == src.count && src_first > 0) { + src_delta = delta_t::trim_right(src_first); + } else { + // the caller will retire the src extent + // NOTE(review): the other EntryMover specializations emit shift_left(n) + // when src_first == 0 and n < src.count; no such delta is produced here -- + // confirm that is intended + assert(src_first == 0); + } + } + + delta_t from_delta() { + return std::move(src_delta); + } + delta_t to_delta() { + return std::move(dst_delta); + } +private: + char* from_end(ceph::bufferptr& ptr, uint16_t offset) { + return ptr.end_c_str() - static_cast(offset); + } +private: + const from_t& src; + const to_t& dst; + delta_t dst_delta; + delta_t src_delta; + ghobject_t ref_oid; +}; + +// identical layout, yay! +template +class EntryMover +{ +public: + EntryMover(const parent_t&, child_t& src, child_t& dst, unsigned) + : src{src}, dst{dst} + {} + + void move_from(unsigned src_first, unsigned dst_first, unsigned n) + { + ceph::bufferptr keys_buf{static_cast(n * sizeof(typename child_t::key_prefix_t))}; + ceph::bufferptr cells_buf; + auto dst_keys = reinterpret_cast(keys_buf.c_str()); + + // copy keys + std::copy(src.keys + src_first, src.keys + src_first + n, + dst_keys); + if constexpr (!child_t::item_in_key) { + uint16_t src_offset = src_first > 0 ? src.keys[src_first - 1].offset : 0; + uint16_t dst_offset = dst_first > 0 ? dst.keys[dst_first - 1].offset : 0; + const int offset_delta = dst_offset - src_offset; + // update the pointers + for (unsigned i = 0; i < n; i++) { + dst_keys[i].offset += offset_delta; + } + // copy cells in bulk, yay! 
+ auto src_end = src.keys[src_first + n - 1].offset; + uint16_t total_cell_size = src_end - src_offset; + cells_buf = ceph::bufferptr{total_cell_size}; + cells_buf.copy_in(0, total_cell_size, src.from_end(src_end)); + } + if (dst_first == dst.count) { + dst_delta = delta_t::insert_back(std::move(keys_buf), std::move(cells_buf)); + } else { + dst_delta = delta_t::insert_front(std::move(keys_buf), std::move(cells_buf)); + } + if (src_first + n == src.count && src_first > 0) { + // pass src_first, the same argument the other EntryMover specializations + // give trim_right for this case; n equals it only by coincidence + src_delta = delta_t::trim_right(src_first); + } else if (src_first == 0 && n < src.count) { + src_delta = delta_t::shift_left(n); + } else if (src_first == 0 && n == src.count) { + // the caller will retire the src extent + } else { + // grab in the middle? + assert(0); + } + } + + delta_t from_delta() { + return std::move(src_delta); + } + + delta_t to_delta() { + return std::move(dst_delta); + } +private: + char* from_end(ceph::bufferptr& ptr, uint16_t offset) { + return ptr.end_c_str() - static_cast(offset); + } +private: + const child_t& src; + const child_t& dst; + delta_t src_delta; + delta_t dst_delta; +}; + +template +EntryMover +make_mover(const parent_t& parent, from_t& src, to_t& dst, unsigned from_slot) { + return EntryMover(parent, src, dst, from_slot); +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/fwd.h b/src/crimson/os/seastore/onode_manager/staged-fltree/fwd.h new file mode 100644 index 000000000..4908c691f --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/fwd.h @@ -0,0 +1,93 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include +#include +#include +#include +#include + +#include "crimson/common/errorator.h" +#include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/transaction.h" + +namespace crimson::os::seastore::onode { + +using crimson::os::seastore::Transaction; +using 
crimson::os::seastore::TransactionRef; +using crimson::os::seastore::make_transaction; +using crimson::os::seastore::laddr_t; +using crimson::os::seastore::L_ADDR_MIN; +using crimson::os::seastore::L_ADDR_NULL; +using crimson::os::seastore::extent_len_t; + +class DeltaRecorder; +class NodeExtent; +class NodeExtentManager; +class RootNodeTracker; +using DeltaRecorderURef = std::unique_ptr; +using NodeExtentRef = crimson::os::seastore::TCachedExtentRef; +using NodeExtentManagerURef = std::unique_ptr; +using RootNodeTrackerURef = std::unique_ptr; +struct context_t { + NodeExtentManager& nm; + Transaction& t; +}; + +class LeafNodeImpl; +class InternalNodeImpl; +class NodeImpl; +using LeafNodeImplURef = std::unique_ptr; +using InternalNodeImplURef = std::unique_ptr; +using NodeImplURef = std::unique_ptr; + +using level_t = uint8_t; +// a type only to index within a node, 32 bits should be enough +using index_t = uint32_t; +constexpr auto INDEX_END = std::numeric_limits::max(); +constexpr auto INDEX_LAST = INDEX_END - 0x4; +constexpr auto INDEX_UPPER_BOUND = INDEX_END - 0x8; +inline bool is_valid_index(index_t index) { return index < INDEX_UPPER_BOUND; } + +// TODO: decide by NODE_BLOCK_SIZE +using node_offset_t = uint16_t; +constexpr node_offset_t DISK_BLOCK_SIZE = 1u << 12; +constexpr node_offset_t NODE_BLOCK_SIZE = DISK_BLOCK_SIZE * 1u; + +enum class MatchKindBS : int8_t { NE = -1, EQ = 0 }; + +enum class MatchKindCMP : int8_t { LT = -1, EQ = 0, GT }; +inline MatchKindCMP toMatchKindCMP(int value) { + if (value > 0) { + return MatchKindCMP::GT; + } else if (value < 0) { + return MatchKindCMP::LT; + } else { + return MatchKindCMP::EQ; + } +} +// three-way compare of two values of the same type +template +MatchKindCMP toMatchKindCMP(const Type& l, const Type& r) { + // compare directly instead of narrowing `l - r` to int: the difference wraps + // for unsigned operands and is truncated for 64-bit ones, so its sign can + // flip or collapse to zero + if (l < r) { + return MatchKindCMP::LT; + } else if (r < l) { + return MatchKindCMP::GT; + } else { + return MatchKindCMP::EQ; + } +} + +inline MatchKindCMP toMatchKindCMP( + std::string_view l, std::string_view r) { + return toMatchKindCMP(l.compare(r)); +} + +inline MatchKindCMP reverse(MatchKindCMP cmp) { + if (cmp == MatchKindCMP::LT) 
{ + return MatchKindCMP::GT; + } else if (cmp == MatchKindCMP::GT) { + return MatchKindCMP::LT; + } else { + return cmp; + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc new file mode 100644 index 000000000..3df458f08 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc @@ -0,0 +1,809 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "node.h" + +#include +#include +#include + +#include "common/likely.h" +#include "crimson/common/log.h" +#include "node_extent_manager.h" +#include "node_impl.h" +#include "stages/node_stage_layout.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + +namespace crimson::os::seastore::onode { + +using node_ertr = Node::node_ertr; +template +using node_future = Node::node_future; + +/* + * tree_cursor_t + */ + +tree_cursor_t::tree_cursor_t(Ref node, const search_position_t& pos) + : leaf_node{node}, position{pos} { + assert(!is_end()); + leaf_node->do_track_cursor(*this); +} + +tree_cursor_t::tree_cursor_t( + Ref node, const search_position_t& pos, + const key_view_t& key, const onode_t* _p_value, layout_version_t v) + : leaf_node{node}, position{pos} { + assert(!is_end()); + update_kv(key, _p_value, v); + leaf_node->do_track_cursor(*this); +} + +tree_cursor_t::tree_cursor_t(Ref node) + : leaf_node{node}, position{search_position_t::end()} { + assert(is_end()); + assert(leaf_node->is_level_tail()); +} + +tree_cursor_t::~tree_cursor_t() { + if (!is_end()) { + leaf_node->do_untrack_cursor(*this); + } +} + +const key_view_t& tree_cursor_t::get_key_view() const { + ensure_kv(); + return *key_view; +} + +const onode_t* tree_cursor_t::get_p_value() const { + ensure_kv(); + return p_value; +} + +template +void tree_cursor_t::update_track( + Ref node, const search_position_t& pos) { + // the cursor 
must be already untracked + // track the new node and new pos + assert(!pos.is_end()); + assert(!is_end()); + leaf_node = node; + position = pos; + key_view.reset(); + p_value = nullptr; + leaf_node->do_track_cursor(*this); +} +template void tree_cursor_t::update_track(Ref, const search_position_t&); +template void tree_cursor_t::update_track(Ref, const search_position_t&); + +void tree_cursor_t::update_kv( + const key_view_t& key, const onode_t* _p_value, layout_version_t v) const { + assert(!is_end()); + assert(_p_value); + assert(std::make_tuple(key, _p_value, v) == leaf_node->get_kv(position)); + key_view = key; + p_value = _p_value; + node_version = v; +} + +void tree_cursor_t::ensure_kv() const { + assert(!is_end()); + if (!p_value || node_version != leaf_node->get_layout_version()) { + // NOTE: the leaf node is always present when we hold its reference. + std::tie(key_view, p_value, node_version) = leaf_node->get_kv(position); + } + assert(p_value); +} + +/* + * Node + */ + +Node::Node(NodeImplURef&& impl) : impl{std::move(impl)} {} + +Node::~Node() { + // XXX: tolerate failure between allocate() and as_child() + if (is_root()) { + super->do_untrack_root(*this); + } else { + _parent_info->ptr->do_untrack_child(*this); + } +} + +level_t Node::level() const { + return impl->level(); +} + +node_future Node::lower_bound( + context_t c, const key_hobj_t& key) { + return seastar::do_with( + MatchHistory(), [this, c, &key](auto& history) { + return lower_bound_tracked(c, key, history); + } + ); +} + +node_future, bool>> Node::insert( + context_t c, const key_hobj_t& key, const onode_t& value) { + return seastar::do_with( + MatchHistory(), [this, c, &key, &value](auto& history) { + return lower_bound_tracked(c, key, history + ).safe_then([c, &key, &value, &history](auto result) { + if (result.match() == MatchKindBS::EQ) { + return node_ertr::make_ready_future, bool>>( + std::make_pair(result.p_cursor, false)); + } else { + auto leaf_node = 
result.p_cursor->get_leaf_node(); + return leaf_node->insert_value( + c, key, value, result.p_cursor->get_position(), history, result.mstat + ).safe_then([](auto p_cursor) { + return node_ertr::make_ready_future, bool>>( + std::make_pair(p_cursor, true)); + }); + } + }); + } + ); +} + +node_future Node::get_tree_stats(context_t c) { + return seastar::do_with( + tree_stats_t(), [this, c](auto& stats) { + return do_get_tree_stats(c, stats).safe_then([&stats] { + return stats; + }); + } + ); +} + +std::ostream& Node::dump(std::ostream& os) const { + return impl->dump(os); +} + +std::ostream& Node::dump_brief(std::ostream& os) const { + return impl->dump_brief(os); +} + +void Node::test_make_destructable( + context_t c, NodeExtentMutable& mut, Super::URef&& _super) { + impl->test_set_tail(mut); + make_root(c, std::move(_super)); +} + +node_future<> Node::mkfs(context_t c, RootNodeTracker& root_tracker) { + return LeafNode::allocate_root(c, root_tracker + ).safe_then([](auto ret) { /* FIXME: discard_result(); */ }); +} + +node_future> Node::load_root(context_t c, RootNodeTracker& root_tracker) { + return c.nm.get_super(c.t, root_tracker + ).safe_then([c, &root_tracker](auto&& _super) { + auto root_addr = _super->get_root_laddr(); + assert(root_addr != L_ADDR_NULL); + return Node::load(c, root_addr, true + ).safe_then([c, _super = std::move(_super), + &root_tracker](auto root) mutable { + assert(root->impl->field_type() == field_type_t::N0); + root->as_root(std::move(_super)); + std::ignore = c; // as only used in an assert + std::ignore = root_tracker; + assert(root == root_tracker.get_root(c.t)); + return node_ertr::make_ready_future>(root); + }); + }); +} + +void Node::make_root(context_t c, Super::URef&& _super) { + _super->write_root_laddr(c, impl->laddr()); + as_root(std::move(_super)); +} + +void Node::as_root(Super::URef&& _super) { + assert(!super && !_parent_info); + assert(_super->get_root_laddr() == impl->laddr()); + assert(impl->is_level_tail()); + super = 
std::move(_super); + super->do_track_root(*this); +} + +node_future<> Node::upgrade_root(context_t c) { + assert(is_root()); + assert(impl->is_level_tail()); + assert(impl->field_type() == field_type_t::N0); + super->do_untrack_root(*this); + return InternalNode::allocate_root(c, impl->level(), impl->laddr(), std::move(super) + ).safe_then([this](auto new_root) { + as_child(search_position_t::end(), new_root); + }); +} + +template +void Node::as_child(const search_position_t& pos, Ref parent_node) { + assert(!super); + _parent_info = parent_info_t{pos, parent_node}; + parent_info().ptr->do_track_child(*this); +} +template void Node::as_child(const search_position_t&, Ref); +template void Node::as_child(const search_position_t&, Ref); + +node_future<> Node::insert_parent(context_t c, Ref right_node) { + assert(!is_root()); + // TODO(cross-node string dedup) + return parent_info().ptr->apply_child_split( + c, parent_info().position, this, right_node); +} + +node_future> Node::load( + context_t c, laddr_t addr, bool expect_is_level_tail) { + // NOTE: + // *option1: all types of node have the same length; + // option2: length is defined by node/field types; + // option3: length is totally flexible; + return c.nm.read_extent(c.t, addr, NODE_BLOCK_SIZE + ).safe_then([expect_is_level_tail](auto extent) { + auto [node_type, field_type] = extent->get_types(); + if (node_type == node_type_t::LEAF) { + auto impl = LeafNodeImpl::load(extent, field_type, expect_is_level_tail); + return Ref(new LeafNode(impl.get(), std::move(impl))); + } else if (node_type == node_type_t::INTERNAL) { + auto impl = InternalNodeImpl::load(extent, field_type, expect_is_level_tail); + return Ref(new InternalNode(impl.get(), std::move(impl))); + } else { + ceph_abort("impossible path"); + } + }); +} + +/* + * InternalNode + */ + +InternalNode::InternalNode(InternalNodeImpl* impl, NodeImplURef&& impl_ref) + : Node(std::move(impl_ref)), impl{impl} {} + +node_future<> InternalNode::apply_child_split( + 
context_t c, const search_position_t& pos, + Ref left_child, Ref right_child) { +#ifndef NDEBUG + if (pos.is_end()) { + assert(impl->is_level_tail()); + } +#endif + impl->prepare_mutate(c); + + auto left_key = left_child->impl->get_largest_key_view(); + auto left_child_addr = left_child->impl->laddr(); + auto left_child_addr_packed = laddr_packed_t{left_child_addr}; + auto right_key = right_child->impl->get_largest_key_view(); + auto right_child_addr = right_child->impl->laddr(); + logger().debug("OTree::Internal::Insert: " + "pos({}), left_child({}, {:#x}), right_child({}, {:#x}) ...", + pos, left_key, left_child_addr, right_key, right_child_addr); + // update pos => left_child to pos => right_child + impl->replace_child_addr(pos, right_child_addr, left_child_addr); + replace_track(pos, right_child, left_child); + + search_position_t insert_pos = pos; + auto [insert_stage, insert_size] = impl->evaluate_insert( + left_key, left_child_addr, insert_pos); + auto free_size = impl->free_size(); + if (free_size >= insert_size) { + // insert + [[maybe_unused]] auto p_value = impl->insert( + left_key, left_child_addr_packed, insert_pos, insert_stage, insert_size); + assert(impl->free_size() == free_size - insert_size); + assert(insert_pos <= pos); + assert(p_value->value == left_child_addr); + track_insert(insert_pos, insert_stage, left_child, right_child); + validate_tracked_children(); + return node_ertr::now(); + } + // split and insert + Ref this_ref = this; + return (is_root() ? 
upgrade_root(c) : node_ertr::now() + ).safe_then([this, c] { + return InternalNode::allocate( + c, impl->field_type(), impl->is_level_tail(), impl->level()); + }).safe_then([this_ref, this, c, left_key, left_child, right_child, + insert_pos, insert_stage=insert_stage, insert_size=insert_size](auto fresh_right) mutable { + auto right_node = fresh_right.node; + auto left_child_addr = left_child->impl->laddr(); + auto left_child_addr_packed = laddr_packed_t{left_child_addr}; + auto [split_pos, is_insert_left, p_value] = impl->split_insert( + fresh_right.mut, *right_node->impl, left_key, left_child_addr_packed, + insert_pos, insert_stage, insert_size); + assert(p_value->value == left_child_addr); + track_split(split_pos, right_node); + if (is_insert_left) { + track_insert(insert_pos, insert_stage, left_child); + } else { + right_node->track_insert(insert_pos, insert_stage, left_child); + } + validate_tracked_children(); + right_node->validate_tracked_children(); + + // propagate index to parent + return insert_parent(c, right_node); + // TODO (optimize) + // try to acquire space from siblings before split... 
see btrfs + }); +} + +node_future> InternalNode::allocate_root( + context_t c, level_t old_root_level, + laddr_t old_root_addr, Super::URef&& super) { + return InternalNode::allocate(c, field_type_t::N0, true, old_root_level + 1 + ).safe_then([c, old_root_addr, + super = std::move(super)](auto fresh_node) mutable { + auto root = fresh_node.node; + auto p_value = root->impl->get_p_value(search_position_t::end()); + fresh_node.mut.copy_in_absolute( + const_cast(p_value), old_root_addr); + root->make_root_from(c, std::move(super), old_root_addr); + return root; + }); +} + +node_future> +InternalNode::lookup_smallest(context_t c) { + auto position = search_position_t::begin(); + laddr_t child_addr = impl->get_p_value(position)->value; + return get_or_track_child(c, position, child_addr + ).safe_then([c](auto child) { + return child->lookup_smallest(c); + }); +} + +node_future> +InternalNode::lookup_largest(context_t c) { + // NOTE: unlike LeafNode::lookup_largest(), this only works for the tail + // internal node to return the tail child address. 
+ auto position = search_position_t::end(); + laddr_t child_addr = impl->get_p_value(position)->value; + return get_or_track_child(c, position, child_addr).safe_then([c](auto child) { + return child->lookup_largest(c); + }); +} + +node_future +InternalNode::lower_bound_tracked( + context_t c, const key_hobj_t& key, MatchHistory& history) { + auto result = impl->lower_bound(key, history); + return get_or_track_child(c, result.position, result.p_value->value + ).safe_then([c, &key, &history](auto child) { + // XXX(multi-type): pass result.mstat to child + return child->lower_bound_tracked(c, key, history); + }); +} + +node_future<> InternalNode::do_get_tree_stats( + context_t c, tree_stats_t& stats) { + auto nstats = impl->get_stats(); + stats.size_persistent_internal += nstats.size_persistent; + stats.size_filled_internal += nstats.size_filled; + stats.size_logical_internal += nstats.size_logical; + stats.size_overhead_internal += nstats.size_overhead; + stats.size_value_internal += nstats.size_value; + stats.num_kvs_internal += nstats.num_kvs; + stats.num_nodes_internal += 1; + + Ref this_ref = this; + return seastar::do_with( + search_position_t(), [this, this_ref, c, &stats](auto& pos) { + pos = search_position_t::begin(); + return crimson::do_until( + [this, this_ref, c, &stats, &pos]() -> node_future { + auto child_addr = impl->get_p_value(pos)->value; + return get_or_track_child(c, pos, child_addr + ).safe_then([c, &stats](auto child) { + return child->do_get_tree_stats(c, stats); + }).safe_then([this, this_ref, &pos] { + if (pos.is_end()) { + return node_ertr::make_ready_future(true); + } else { + impl->next_position(pos); + if (pos.is_end()) { + if (impl->is_level_tail()) { + return node_ertr::make_ready_future(false); + } else { + return node_ertr::make_ready_future(true); + } + } else { + return node_ertr::make_ready_future(false); + } + } + }); + }); + } + ); +} + +node_future<> InternalNode::test_clone_root( + context_t c_other, RootNodeTracker& 
tracker_other) const { + assert(is_root()); + assert(impl->is_level_tail()); + assert(impl->field_type() == field_type_t::N0); + Ref this_ref = this; + return InternalNode::allocate(c_other, field_type_t::N0, true, impl->level() + ).safe_then([this, c_other, &tracker_other](auto fresh_other) { + impl->test_copy_to(fresh_other.mut); + auto cloned_root = fresh_other.node; + return c_other.nm.get_super(c_other.t, tracker_other + ).safe_then([c_other, cloned_root](auto&& super_other) { + cloned_root->make_root_new(c_other, std::move(super_other)); + return cloned_root; + }); + }).safe_then([this_ref, this, c_other](auto cloned_root) { + // clone tracked children + // In some unit tests, the children are stubbed out that they + // don't exist in NodeExtentManager, and are only tracked in memory. + return crimson::do_for_each( + tracked_child_nodes.begin(), + tracked_child_nodes.end(), + [this_ref, c_other, cloned_root](auto& kv) { + assert(kv.first == kv.second->parent_info().position); + return kv.second->test_clone_non_root(c_other, cloned_root); + } + ); + }); +} + +node_future> InternalNode::get_or_track_child( + context_t c, const search_position_t& position, laddr_t child_addr) { + bool level_tail = position.is_end(); + Ref child; + auto found = tracked_child_nodes.find(position); + Ref this_ref = this; + return (found == tracked_child_nodes.end() + ? 
(logger().trace("OTree::Internal: load child untracked at {:#x}, pos({}), level={}", + child_addr, position, level() - 1), + Node::load(c, child_addr, level_tail + ).safe_then([this, position] (auto child) { + child->as_child(position, this); + return child; + })) + : (logger().trace("OTree::Internal: load child tracked at {:#x}, pos({}), level={}", + child_addr, position, level() - 1), + node_ertr::make_ready_future>(found->second)) + ).safe_then([this_ref, this, position, child_addr] (auto child) { + assert(child_addr == child->impl->laddr()); + assert(position == child->parent_info().position); + std::ignore = position; + std::ignore = child_addr; + validate_child(*child); + return child; + }); +} + +void InternalNode::track_insert( + const search_position_t& insert_pos, match_stage_t insert_stage, + Ref insert_child, Ref nxt_child) { + // update tracks + auto pos_upper_bound = insert_pos; + pos_upper_bound.index_by_stage(insert_stage) = INDEX_UPPER_BOUND; + auto first = tracked_child_nodes.lower_bound(insert_pos); + auto last = tracked_child_nodes.lower_bound(pos_upper_bound); + std::vector nodes; + std::for_each(first, last, [&nodes](auto& kv) { + nodes.push_back(kv.second); + }); + tracked_child_nodes.erase(first, last); + for (auto& node : nodes) { + auto _pos = node->parent_info().position; + assert(!_pos.is_end()); + ++_pos.index_by_stage(insert_stage); + node->as_child(_pos, this); + } + // track insert + insert_child->as_child(insert_pos, this); + +#ifndef NDEBUG + // validate left_child is before right_child + if (nxt_child) { + auto iter = tracked_child_nodes.find(insert_pos); + ++iter; + assert(iter->second == nxt_child); + } +#endif +} + +void InternalNode::replace_track( + const search_position_t& position, Ref new_child, Ref old_child) { + assert(tracked_child_nodes[position] == old_child); + tracked_child_nodes.erase(position); + new_child->as_child(position, this); + assert(tracked_child_nodes[position] == new_child); +} + +void 
InternalNode::track_split( + const search_position_t& split_pos, Ref right_node) { + auto first = tracked_child_nodes.lower_bound(split_pos); + auto iter = first; + while (iter != tracked_child_nodes.end()) { + search_position_t new_pos = iter->first; + new_pos -= split_pos; + iter->second->as_child(new_pos, right_node); + ++iter; + } + tracked_child_nodes.erase(first, tracked_child_nodes.end()); +} + +void InternalNode::validate_child(const Node& child) const { +#ifndef NDEBUG + assert(impl->level() - 1 == child.impl->level()); + assert(this == child.parent_info().ptr); + auto& child_pos = child.parent_info().position; + assert(impl->get_p_value(child_pos)->value == child.impl->laddr()); + if (child_pos.is_end()) { + assert(impl->is_level_tail()); + assert(child.impl->is_level_tail()); + } else { + assert(!child.impl->is_level_tail()); + assert(impl->get_key_view(child_pos) == child.impl->get_largest_key_view()); + } + // XXX(multi-type) + assert(impl->field_type() <= child.impl->field_type()); +#endif +} + +node_future InternalNode::allocate( + context_t c, field_type_t field_type, bool is_level_tail, level_t level) { + return InternalNodeImpl::allocate(c, field_type, is_level_tail, level + ).safe_then([](auto&& fresh_impl) { + auto node = Ref(new InternalNode( + fresh_impl.impl.get(), std::move(fresh_impl.impl))); + return fresh_node_t{node, fresh_impl.mut}; + }); +} + +/* + * LeafNode + */ + +LeafNode::LeafNode(LeafNodeImpl* impl, NodeImplURef&& impl_ref) + : Node(std::move(impl_ref)), impl{impl} {} + +bool LeafNode::is_level_tail() const { + return impl->is_level_tail(); +} + +std::tuple LeafNode::get_kv( + const search_position_t& pos) const { + key_view_t key_view; + auto p_value = impl->get_p_value(pos, &key_view); + return {key_view, p_value, layout_version}; +} + +node_future> +LeafNode::lookup_smallest(context_t) { + if (unlikely(impl->is_empty())) { + assert(is_root()); + return node_ertr::make_ready_future>( + new tree_cursor_t(this)); + } + auto pos = 
search_position_t::begin(); + key_view_t index_key; + auto p_value = impl->get_p_value(pos, &index_key); + return node_ertr::make_ready_future>( + get_or_track_cursor(pos, index_key, p_value)); +} + +node_future> +LeafNode::lookup_largest(context_t) { + if (unlikely(impl->is_empty())) { + assert(is_root()); + return node_ertr::make_ready_future>( + new tree_cursor_t(this)); + } + search_position_t pos; + const onode_t* p_value = nullptr; + key_view_t index_key; + impl->get_largest_slot(pos, index_key, &p_value); + return node_ertr::make_ready_future>( + get_or_track_cursor(pos, index_key, p_value)); +} + +node_future +LeafNode::lower_bound_tracked( + context_t c, const key_hobj_t& key, MatchHistory& history) { + key_view_t index_key; + auto result = impl->lower_bound(key, history, &index_key); + Ref cursor; + if (result.position.is_end()) { + assert(!result.p_value); + cursor = new tree_cursor_t(this); + } else { + cursor = get_or_track_cursor(result.position, index_key, result.p_value); + } + return node_ertr::make_ready_future( + search_result_t{cursor, result.mstat}); +} + +node_future<> LeafNode::do_get_tree_stats(context_t, tree_stats_t& stats) { + auto nstats = impl->get_stats(); + stats.size_persistent_leaf += nstats.size_persistent; + stats.size_filled_leaf += nstats.size_filled; + stats.size_logical_leaf += nstats.size_logical; + stats.size_overhead_leaf += nstats.size_overhead; + stats.size_value_leaf += nstats.size_value; + stats.num_kvs_leaf += nstats.num_kvs; + stats.num_nodes_leaf += 1; + return node_ertr::now(); +} + +node_future<> LeafNode::test_clone_root( + context_t c_other, RootNodeTracker& tracker_other) const { + assert(is_root()); + assert(impl->is_level_tail()); + assert(impl->field_type() == field_type_t::N0); + Ref this_ref = this; + return LeafNode::allocate(c_other, field_type_t::N0, true + ).safe_then([this, c_other, &tracker_other](auto fresh_other) { + impl->test_copy_to(fresh_other.mut); + auto cloned_root = fresh_other.node; + 
return c_other.nm.get_super(c_other.t, tracker_other + ).safe_then([c_other, cloned_root](auto&& super_other) { + cloned_root->make_root_new(c_other, std::move(super_other)); + }); + }).safe_then([this_ref]{}); +} + +node_future> LeafNode::insert_value( + context_t c, const key_hobj_t& key, const onode_t& value, + const search_position_t& pos, const MatchHistory& history, + match_stat_t mstat) { +#ifndef NDEBUG + if (pos.is_end()) { + assert(impl->is_level_tail()); + } +#endif + logger().debug("OTree::Leaf::Insert: " + "pos({}), {}, {}, {}, mstat({}) ...", + pos, key, value, history, mstat); + search_position_t insert_pos = pos; + auto [insert_stage, insert_size] = impl->evaluate_insert( + key, value, history, mstat, insert_pos); + auto free_size = impl->free_size(); + if (free_size >= insert_size) { + // insert + on_layout_change(); + impl->prepare_mutate(c); + auto p_value = impl->insert(key, value, insert_pos, insert_stage, insert_size); + assert(impl->free_size() == free_size - insert_size); + assert(insert_pos <= pos); + assert(p_value->size == value.size); + auto ret = track_insert(insert_pos, insert_stage, p_value); + validate_tracked_cursors(); + return node_ertr::make_ready_future>(ret); + } + // split and insert + Ref this_ref = this; + return (is_root() ? 
upgrade_root(c) : node_ertr::now() + ).safe_then([this, c] { + return LeafNode::allocate(c, impl->field_type(), impl->is_level_tail()); + }).safe_then([this_ref, this, c, &key, &value, + insert_pos, insert_stage=insert_stage, insert_size=insert_size](auto fresh_right) mutable { + auto right_node = fresh_right.node; + // no need to bump version for right node, as it is fresh + on_layout_change(); + impl->prepare_mutate(c); + auto [split_pos, is_insert_left, p_value] = impl->split_insert( + fresh_right.mut, *right_node->impl, key, value, + insert_pos, insert_stage, insert_size); + assert(p_value->size == value.size); + track_split(split_pos, right_node); + Ref ret; + if (is_insert_left) { + ret = track_insert(insert_pos, insert_stage, p_value); + } else { + ret = right_node->track_insert(insert_pos, insert_stage, p_value); + } + validate_tracked_cursors(); + right_node->validate_tracked_cursors(); + + // propagate insert to parent + return insert_parent(c, right_node).safe_then([ret] { + return ret; + }); + // TODO (optimize) + // try to acquire space from siblings before split... 
see btrfs + }); +} + +node_future> LeafNode::allocate_root( + context_t c, RootNodeTracker& root_tracker) { + return LeafNode::allocate(c, field_type_t::N0, true + ).safe_then([c, &root_tracker](auto fresh_node) { + auto root = fresh_node.node; + return c.nm.get_super(c.t, root_tracker + ).safe_then([c, root](auto&& super) { + root->make_root_new(c, std::move(super)); + return root; + }); + }); +} + +Ref LeafNode::get_or_track_cursor( + const search_position_t& position, + const key_view_t& key, const onode_t* p_value) { + assert(!position.is_end()); + assert(p_value); + Ref p_cursor; + auto found = tracked_cursors.find(position); + if (found == tracked_cursors.end()) { + p_cursor = new tree_cursor_t(this, position, key, p_value, layout_version); + } else { + p_cursor = found->second; + assert(p_cursor->get_leaf_node() == this); + assert(p_cursor->get_position() == position); + p_cursor->update_kv(key, p_value, layout_version); + } + return p_cursor; +} + +void LeafNode::validate_cursor(tree_cursor_t& cursor) const { +#ifndef NDEBUG + assert(this == cursor.get_leaf_node().get()); + assert(!cursor.is_end()); + auto [key, val, ver] = get_kv(cursor.get_position()); + assert(key == cursor.get_key_view()); + assert(val == cursor.get_p_value()); +#endif +} + +Ref LeafNode::track_insert( + const search_position_t& insert_pos, match_stage_t insert_stage, + const onode_t* p_onode) { + // update cursor position + auto pos_upper_bound = insert_pos; + pos_upper_bound.index_by_stage(insert_stage) = INDEX_UPPER_BOUND; + auto first = tracked_cursors.lower_bound(insert_pos); + auto last = tracked_cursors.lower_bound(pos_upper_bound); + std::vector p_cursors; + std::for_each(first, last, [&p_cursors](auto& kv) { + p_cursors.push_back(kv.second); + }); + tracked_cursors.erase(first, last); + for (auto& p_cursor : p_cursors) { + search_position_t new_pos = p_cursor->get_position(); + ++new_pos.index_by_stage(insert_stage); + p_cursor->update_track(this, new_pos); + } + + // track 
insert + // TODO: getting key_view_t from stage::proceed_insert() and + // stage::append_insert() has not supported yet + return new tree_cursor_t(this, insert_pos); +} + +void LeafNode::track_split( + const search_position_t& split_pos, Ref right_node) { + // update cursor ownership and position + auto first = tracked_cursors.lower_bound(split_pos); + auto iter = first; + while (iter != tracked_cursors.end()) { + search_position_t new_pos = iter->first; + new_pos -= split_pos; + iter->second->update_track(right_node, new_pos); + ++iter; + } + tracked_cursors.erase(first, tracked_cursors.end()); +} + +node_future LeafNode::allocate( + context_t c, field_type_t field_type, bool is_level_tail) { + return LeafNodeImpl::allocate(c, field_type, is_level_tail + ).safe_then([](auto&& fresh_impl) { + auto node = Ref(new LeafNode( + fresh_impl.impl.get(), std::move(fresh_impl.impl))); + return fresh_node_t{node, fresh_impl.mut}; + }); +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node.h new file mode 100644 index 000000000..d6af489e7 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node.h @@ -0,0 +1,476 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include +#include +#include +#include + +#include "crimson/common/type_helpers.h" + +#include "node_extent_mutable.h" +#include "stages/key_layout.h" +#include "stages/stage_types.h" +#include "super.h" +#include "tree_types.h" + +/** + * Tree example (2 levels): + * + * Root node keys: [ 3 7 ] + * values: [p1 p2 p3] + * / | \ + * ------- | ------- + * | | | + * V V V + * Leaf node keys: [ 1 2 3] [ 4 5 7] [ 9 11 12] + * values: [v1 v2 v3] [v4 v5 v6] [v7 v8 v9] + * + * Tree structure properties: + * - As illustrated above, the parent key is strictly equal to its left child's + * largest key; + * - If a tree is indexing multiple seastore 
transactions, each transaction + * will be mapped to a Super which points to a distinct root node. So the + * transactions are isolated at tree level. However, tree nodes from + * different transactions can reference the same seastore CachedExtent before + * modification; + * - The resources of the transactional tree are tracked by tree_cursor_ts held + * by users. As long as any cursor is alive, the corresponding tree hierarchy is + * alive and stays tracked. See the reversed resource management sections + * below; + */ + +namespace crimson::os::seastore::onode { + +class LeafNode; +class InternalNode; + +/** + * tree_cursor_t + * + * A cursor points to a position (LeafNode and search_position_t) of the tree + * where it can find the corresponding key and value pair. The position is updated + * by LeafNode insert/split/delete/merge internally and is kept valid. It also + * caches the key-value information for a specific node layout version. + * + * Exposes public interfaces for Btree::Cursor. + */ +using layout_version_t = uint32_t; +class tree_cursor_t final + : public boost::intrusive_ref_counter< + tree_cursor_t, boost::thread_unsafe_counter> { + public: + // public to Btree + ~tree_cursor_t(); + tree_cursor_t(const tree_cursor_t&) = delete; + tree_cursor_t(tree_cursor_t&&) = delete; + tree_cursor_t& operator=(const tree_cursor_t&) = delete; + tree_cursor_t& operator=(tree_cursor_t&&) = delete; + + /** + * is_end + * + * Represents one-past-the-last of all the sorted key-value + * pairs in the tree. An end cursor won't contain valid key-value + * information. + */ + bool is_end() const { return position.is_end(); } + + /// Returns the key view in tree if it is not an end cursor. + const key_view_t& get_key_view() const; + + /// Returns the value pointer in tree if it is not an end cursor. 
+ const onode_t* get_p_value() const; + + private: + tree_cursor_t(Ref, const search_position_t&); + tree_cursor_t(Ref, const search_position_t&, + const key_view_t& key, const onode_t*, layout_version_t); + // lookup reaches the end, contains the leaf node for further insert + tree_cursor_t(Ref); + const search_position_t& get_position() const { return position; } + Ref get_leaf_node() { return leaf_node; } + template + void update_track(Ref, const search_position_t&); + void update_kv(const key_view_t&, const onode_t*, layout_version_t) const; + void ensure_kv() const; + + private: + /** + * Reversed resource management (tree_cursor_t) + * + * tree_cursor_t holds a reference to the LeafNode, so the LeafNode will be + * alive as long as any of its cursors is still referenced by the user. + */ + Ref leaf_node; + search_position_t position; + + // cached information + mutable std::optional key_view; + mutable const onode_t* p_value; + mutable layout_version_t node_version; + + friend class LeafNode; + friend class Node; // get_position(), get_leaf_node() +}; + +/** + * Node + * + * An abstracted class for both InternalNode and LeafNode. + * + * Exposes public interfaces for Btree. + */ +class Node + : public boost::intrusive_ref_counter< + Node, boost::thread_unsafe_counter> { + public: + // public to Btree + using node_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::enoent, + crimson::ct_error::erange>; + template + using node_future = node_ertr::future; + + struct search_result_t { + bool is_end() const { return p_cursor->is_end(); } + Ref p_cursor; + match_stat_t mstat; + + MatchKindBS match() const { + assert(mstat >= MSTAT_MIN && mstat <= MSTAT_MAX); + return (mstat == MSTAT_EQ ? 
MatchKindBS::EQ : MatchKindBS::NE); + } + }; + + virtual ~Node(); + Node(const Node&) = delete; + Node(Node&&) = delete; + Node& operator=(const Node&) = delete; + Node& operator=(Node&&) = delete; + + /** + * level + * + * A positive value denotes the level (or height) of this node in tree. + * 0 means LeafNode, positive means InternalNode. + */ + level_t level() const; + + /** + * lookup_smallest + * + * Returns a cursor pointing to the smallest key in the sub-tree formed by + * this node. + * + * Returns an end cursor if it is an empty root node. + */ + virtual node_future> lookup_smallest(context_t) = 0; + + /** + * lookup_largest + * + * Returns a cursor pointing to the largest key in the sub-tree formed by + * this node. + * + * Returns an end cursor if it is an empty root node. + */ + virtual node_future> lookup_largest(context_t) = 0; + + /** + * lower_bound + * + * Returns a cursor pointing to the first element in the range [first, last) + * of the sub-tree which does not compare less than the input key. The + * result also denotes whether the pointed key is equal to the input key. + * + * Returns an end cursor with MatchKindBS::NE if: + * - It is an empty root node; + * - Or the input key is larger than all the keys in the sub-tree; + */ + node_future lower_bound(context_t c, const key_hobj_t& key); + + /** + * insert + * + * Try to insert a key-value pair into the sub-tree formed by this node. + * + * Returns a boolean denoting whether the insertion is successful: + * - If true, the returned cursor points to the inserted element in tree; + * - If false, the returned cursor points to the conflicting element in tree; + */ + node_future, bool>> insert( + context_t, const key_hobj_t&, const onode_t&); + + /// Recursively collects the statistics of the sub-tree formed by this node + node_future get_tree_stats(context_t); + + /// Returns an ostream containing a dump of all the elements in the node. 
+ std::ostream& dump(std::ostream&) const; + + /// Returns an ostream containing a one-line summary of this node. + std::ostream& dump_brief(std::ostream&) const; + + /// Initializes the tree by allocating an empty root node. + static node_future<> mkfs(context_t, RootNodeTracker&); + + /// Loads the tree root. The tree must be initialized. + static node_future> load_root(context_t, RootNodeTracker&); + + // Only for unit test purposes. + void test_make_destructable(context_t, NodeExtentMutable&, Super::URef&&); + virtual node_future<> test_clone_root(context_t, RootNodeTracker&) const = 0; + + protected: + virtual node_future<> test_clone_non_root(context_t, Ref) const { + ceph_abort("impossible path"); + } + virtual node_future lower_bound_tracked( + context_t, const key_hobj_t&, MatchHistory&) = 0; + virtual node_future<> do_get_tree_stats(context_t, tree_stats_t&) = 0; + + protected: + Node(NodeImplURef&&); + bool is_root() const { + assert((super && !_parent_info.has_value()) || + (!super && _parent_info.has_value())); + return !_parent_info.has_value(); + } + + // as root + void make_root(context_t c, Super::URef&& _super); + void make_root_new(context_t c, Super::URef&& _super) { + assert(_super->get_root_laddr() == L_ADDR_NULL); + make_root(c, std::move(_super)); + } + void make_root_from(context_t c, Super::URef&& _super, laddr_t from_addr) { + assert(_super->get_root_laddr() == from_addr); + make_root(c, std::move(_super)); + } + void as_root(Super::URef&& _super); + node_future<> upgrade_root(context_t); + + // as child/non-root + template + void as_child(const search_position_t&, Ref); + struct parent_info_t { + search_position_t position; + Ref ptr; + }; + const parent_info_t& parent_info() const { return *_parent_info; } + node_future<> insert_parent(context_t, Ref right_node); + + private: + /** + * Reversed resource management (Node) + * + * Root Node holds a reference to its parent Super class, so its parent + * will be alive as long as this root 
node is alive. + * + * Non-root Node holds a reference to its parent Node, so its parent will + * be alive as long as any of its children is alive. + */ + // as root + Super::URef super; + // as child/non-root + std::optional _parent_info; + + private: + static node_future> load(context_t, laddr_t, bool expect_is_level_tail); + + NodeImplURef impl; + friend class InternalNode; +}; +inline std::ostream& operator<<(std::ostream& os, const Node& node) { + return node.dump_brief(os); +} + +/** + * InternalNode + * + * A concrete implementation of Node class that represents an internal tree + * node. Its level is always positive and its values are logical block + * addresses to its child nodes. An internal node cannot be empty. + */ +class InternalNode final : public Node { + public: + // public to Node + InternalNode(InternalNodeImpl*, NodeImplURef&&); + ~InternalNode() override { assert(tracked_child_nodes.empty()); } + InternalNode(const InternalNode&) = delete; + InternalNode(InternalNode&&) = delete; + InternalNode& operator=(const InternalNode&) = delete; + InternalNode& operator=(InternalNode&&) = delete; + + node_future<> apply_child_split( + context_t, const search_position_t&, Ref left, Ref right); + template + void do_track_child(Node& child) { + if constexpr (VALIDATE) { + validate_child(child); + } + auto& child_pos = child.parent_info().position; + assert(tracked_child_nodes.find(child_pos) == tracked_child_nodes.end()); + tracked_child_nodes[child_pos] = &child; + } + void do_untrack_child(const Node& child) { + auto& child_pos = child.parent_info().position; + assert(tracked_child_nodes.find(child_pos)->second == &child); + [[maybe_unused]] auto removed = tracked_child_nodes.erase(child_pos); + assert(removed); + } + + static node_future> allocate_root( + context_t, level_t, laddr_t, Super::URef&&); + + protected: + node_future> lookup_smallest(context_t) override; + node_future> lookup_largest(context_t) override; + node_future lower_bound_tracked( + 
context_t, const key_hobj_t&, MatchHistory&) override; + node_future<> do_get_tree_stats(context_t, tree_stats_t&) override; + + node_future<> test_clone_root(context_t, RootNodeTracker&) const override; + + private: + // XXX: extract a common tracker for InternalNode to track Node, + // and LeafNode to track tree_cursor_t. + node_future> get_or_track_child(context_t, const search_position_t&, laddr_t); + void track_insert( + const search_position_t&, match_stage_t, Ref, Ref nxt_child = nullptr); + void replace_track(const search_position_t&, Ref new_child, Ref old_child); + void track_split(const search_position_t&, Ref); + void validate_tracked_children() const { +#ifndef NDEBUG + for (auto& kv : tracked_child_nodes) { + assert(kv.first == kv.second->parent_info().position); + validate_child(*kv.second); + } +#endif + } + void validate_child(const Node& child) const; + + struct fresh_node_t { + Ref node; + NodeExtentMutable mut; + std::pair, NodeExtentMutable> make_pair() { + return std::make_pair(Ref(node), mut); + } + }; + static node_future allocate(context_t, field_type_t, bool, level_t); + + private: + /** + * Reversed resource management (InternalNode) + * + * InternalNode keeps track of its child nodes which are still alive in + * memory, and their positions will be updated throughout + * insert/split/delete/merge operations of this node. + */ + // XXX: leverage intrusive data structure to control memory overhead + std::map tracked_child_nodes; + InternalNodeImpl* impl; +}; + +/** + * LeafNode + * + * A concrete implementation of Node class that represents a leaf tree node. + * Its level is always 0. A leaf node can only be empty if it is root. 
+ */ +class LeafNode final : public Node { + public: + // public to tree_cursor_t + ~LeafNode() override { assert(tracked_cursors.empty()); } + LeafNode(const LeafNode&) = delete; + LeafNode(LeafNode&&) = delete; + LeafNode& operator=(const LeafNode&) = delete; + LeafNode& operator=(LeafNode&&) = delete; + + bool is_level_tail() const; + layout_version_t get_layout_version() const { return layout_version; } + std::tuple get_kv( + const search_position_t&) const; + template + void do_track_cursor(tree_cursor_t& cursor) { + if constexpr (VALIDATE) { + validate_cursor(cursor); + } + auto& cursor_pos = cursor.get_position(); + assert(tracked_cursors.find(cursor_pos) == tracked_cursors.end()); + tracked_cursors[cursor_pos] = &cursor; + } + void do_untrack_cursor(tree_cursor_t& cursor) { + validate_cursor(cursor); + auto& cursor_pos = cursor.get_position(); + assert(tracked_cursors.find(cursor_pos)->second == &cursor); + [[maybe_unused]] auto removed = tracked_cursors.erase(cursor_pos); + assert(removed); + } + + protected: + node_future> lookup_smallest(context_t) override; + node_future> lookup_largest(context_t) override; + node_future lower_bound_tracked( + context_t, const key_hobj_t&, MatchHistory&) override; + node_future<> do_get_tree_stats(context_t, tree_stats_t&) override; + + node_future<> test_clone_root(context_t, RootNodeTracker&) const override; + + private: + LeafNode(LeafNodeImpl*, NodeImplURef&&); + node_future> insert_value( + context_t, const key_hobj_t&, const onode_t&, + const search_position_t&, const MatchHistory&, + match_stat_t mstat); + static node_future> allocate_root(context_t, RootNodeTracker&); + friend class Node; + + private: + // XXX: extract a common tracker for InternalNode to track Node, + // and LeafNode to track tree_cursor_t. 
+ Ref get_or_track_cursor( + const search_position_t&, const key_view_t&, const onode_t*); + Ref track_insert( + const search_position_t&, match_stage_t, const onode_t*); + void track_split(const search_position_t&, Ref); + void validate_tracked_cursors() const { +#ifndef NDEBUG + for (auto& kv : tracked_cursors) { + assert(kv.first == kv.second->get_position()); + validate_cursor(*kv.second); + } +#endif + } + void validate_cursor(tree_cursor_t& cursor) const; + // invalidate p_value pointers in tree_cursor_t + void on_layout_change() { ++layout_version; } + + struct fresh_node_t { + Ref node; + NodeExtentMutable mut; + std::pair, NodeExtentMutable> make_pair() { + return std::make_pair(Ref(node), mut); + } + }; + static node_future allocate(context_t, field_type_t, bool); + + private: + /** + * Reversed resource management (LeafNode) + * + * LeafNode keeps track of the referencing cursors which are still alive in + * memory, and their positions will be updated throughout + * insert/split/delete/merge operations of this node. + */ + // XXX: leverage intrusive data structure to control memory overhead + std::map tracked_cursors; + LeafNodeImpl* impl; + layout_version_t layout_version = 0; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h new file mode 100644 index 000000000..d08a99015 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h @@ -0,0 +1,42 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "include/buffer.h" +#include "node_types.h" + +namespace crimson::os::seastore::onode { + +/** + * DeltaRecorder + * + * An abstracted class to encapsulate different implementations to apply delta + * to a specific node layout. 
+ */ +class DeltaRecorder { + public: + virtual ~DeltaRecorder() { + assert(is_empty()); + } + + bool is_empty() const { + return encoded.length() == 0; + } + + ceph::bufferlist get_delta() { + assert(!is_empty()); + return std::move(encoded); + } + + virtual node_type_t node_type() const = 0; + virtual field_type_t field_type() const = 0; + virtual void apply_delta(ceph::bufferlist::const_iterator&, + NodeExtentMutable&) = 0; + + protected: + DeltaRecorder() = default; + ceph::bufferlist encoded; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h new file mode 100644 index 000000000..94782f50d --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h @@ -0,0 +1,413 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/common/log.h" +#include "node_extent_manager.h" +#include "node_delta_recorder.h" +#include "node_layout_replayable.h" + +#ifndef NDEBUG +#include "node_extent_manager/test_replay.h" +#endif + +namespace crimson::os::seastore::onode { + +/** + * DeltaRecorderT + * + * Responsible to encode and decode delta, and apply delta for a specific node + * layout. 
+ */ +template +class DeltaRecorderT final: public DeltaRecorder { + enum class op_t : uint8_t { + INSERT, + SPLIT, + SPLIT_INSERT, + UPDATE_CHILD_ADDR, + }; + + public: + using layout_t = NodeLayoutReplayableT; + using node_stage_t = typename layout_t::node_stage_t; + using position_t = typename layout_t::position_t; + using StagedIterator = typename layout_t::StagedIterator; + using value_t = typename layout_t::value_t; + static constexpr auto FIELD_TYPE = layout_t::FIELD_TYPE; + + ~DeltaRecorderT() override = default; + + template + void encode_insert( + const full_key_t& key, + const value_t& value, + const position_t& insert_pos, + const match_stage_t& insert_stage, + const node_offset_t& insert_size) { + ceph::encode(op_t::INSERT, encoded); + encode_key(key, encoded); + encode_value(value, encoded); + insert_pos.encode(encoded); + ceph::encode(insert_stage, encoded); + ceph::encode(insert_size, encoded); + } + + void encode_split( + const StagedIterator& split_at, + const char* p_node_start) { + ceph::encode(op_t::SPLIT, encoded); + split_at.encode(p_node_start, encoded); + } + + template + void encode_split_insert( + const StagedIterator& split_at, + const full_key_t& key, + const value_t& value, + const position_t& insert_pos, + const match_stage_t& insert_stage, + const node_offset_t& insert_size, + const char* p_node_start) { + ceph::encode(op_t::SPLIT_INSERT, encoded); + split_at.encode(p_node_start, encoded); + encode_key(key, encoded); + encode_value(value, encoded); + insert_pos.encode(encoded); + ceph::encode(insert_stage, encoded); + ceph::encode(insert_size, encoded); + } + + void encode_update_child_addr( + const laddr_t new_addr, + const laddr_packed_t* p_addr, + const char* p_node_start) { + ceph::encode(op_t::UPDATE_CHILD_ADDR, encoded); + ceph::encode(new_addr, encoded); + int node_offset = reinterpret_cast(p_addr) - p_node_start; + assert(node_offset > 0 && node_offset <= NODE_BLOCK_SIZE); + ceph::encode(static_cast(node_offset), encoded); + 
} + + static DeltaRecorderURef create() { + return std::unique_ptr(new DeltaRecorderT()); + } + + protected: + DeltaRecorderT() = default; + node_type_t node_type() const override { return NODE_TYPE; } + field_type_t field_type() const override { return FIELD_TYPE; } + void apply_delta(ceph::bufferlist::const_iterator& delta, + NodeExtentMutable& node) override { + assert(is_empty()); + node_stage_t stage(reinterpret_cast(node.get_read())); + op_t op; + try { + ceph::decode(op, delta); + switch (op) { + case op_t::INSERT: { + logger().debug("OTree::Extent::Replay: decoding INSERT ..."); + auto key = key_hobj_t::decode(delta); + + std::unique_ptr value_storage_heap; + value_t value_storage_stack; + auto p_value = decode_value(delta, value_storage_heap, value_storage_stack); + + auto insert_pos = position_t::decode(delta); + match_stage_t insert_stage; + ceph::decode(insert_stage, delta); + node_offset_t insert_size; + ceph::decode(insert_size, delta); + logger().debug("OTree::Extent::Replay: apply {}, {}, " + "insert_pos({}), insert_stage={}, insert_size={}B ...", + key, *p_value, insert_pos, insert_stage, insert_size); + layout_t::template insert( + node, stage, key, *p_value, insert_pos, insert_stage, insert_size); + break; + } + case op_t::SPLIT: { + logger().debug("OTree::Extent::Replay: decoding SPLIT ..."); + auto split_at = StagedIterator::decode(stage.p_start(), delta); + logger().debug("OTree::Extent::Replay: apply split_at={} ...", split_at); + layout_t::split(node, stage, split_at); + break; + } + case op_t::SPLIT_INSERT: { + logger().debug("OTree::Extent::Replay: decoding SPLIT_INSERT ..."); + auto split_at = StagedIterator::decode(stage.p_start(), delta); + auto key = key_hobj_t::decode(delta); + + std::unique_ptr value_storage_heap; + value_t value_storage_stack; + auto p_value = decode_value(delta, value_storage_heap, value_storage_stack); + + auto insert_pos = position_t::decode(delta); + match_stage_t insert_stage; + ceph::decode(insert_stage, 
delta); + node_offset_t insert_size; + ceph::decode(insert_size, delta); + logger().debug("OTree::Extent::Replay: apply split_at={}, {}, {}, " + "insert_pos({}), insert_stage={}, insert_size={}B ...", + split_at, key, *p_value, insert_pos, insert_stage, insert_size); + layout_t::template split_insert( + node, stage, split_at, key, *p_value, insert_pos, insert_stage, insert_size); + break; + } + case op_t::UPDATE_CHILD_ADDR: { + logger().debug("OTree::Extent::Replay: decoding UPDATE_CHILD_ADDR ..."); + laddr_t new_addr; + ceph::decode(new_addr, delta); + node_offset_t update_offset; + ceph::decode(update_offset, delta); + auto p_addr = reinterpret_cast( + node.get_write() + update_offset); + logger().debug("OTree::Extent::Replay: apply {:#x} to offset {:#x} ...", + new_addr, update_offset); + layout_t::update_child_addr(node, new_addr, p_addr); + break; + } + default: + logger().error("OTree::Extent::Replay: got unknown op {} when replay {:#x}", + op, node.get_laddr()); + ceph_abort(); + } + } catch (buffer::error& e) { + logger().error("OTree::Extent::Replay: got decode error {} when replay {:#x}", + e, node.get_laddr()); + ceph_abort(); + } + } + + private: + static void encode_value(const value_t& value, ceph::bufferlist& encoded) { + if constexpr (std::is_same_v) { + // NODE_TYPE == node_type_t::INTERNAL + ceph::encode(value.value, encoded); + } else if constexpr (std::is_same_v) { + // NODE_TYPE == node_type_t::LEAF + value.encode(encoded); + } else { + ceph_abort("impossible path"); + } + } + + static value_t* decode_value(ceph::bufferlist::const_iterator& delta, + std::unique_ptr& value_storage_heap, + value_t& value_storage_stack) { + if constexpr (std::is_same_v) { + // NODE_TYPE == node_type_t::INTERNAL + laddr_t value; + ceph::decode(value, delta); + value_storage_stack.value = value; + return &value_storage_stack; + } else if constexpr (std::is_same_v) { + // NODE_TYPE == node_type_t::LEAF + auto value_config = onode_t::decode(delta); + 
value_storage_heap = onode_t::allocate(value_config); + return reinterpret_cast(value_storage_heap.get()); + } else { + ceph_abort("impossible path"); + } + } + + static seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +}; + +/** + * NodeExtentAccessorT + * + * This component is responsible to reference and mutate the underlying + * NodeExtent, record mutation parameters when needed, and apply the recorded + * modifications for a specific node layout. + */ +template +class NodeExtentAccessorT { + public: + using layout_t = NodeLayoutReplayableT; + using node_stage_t = typename layout_t::node_stage_t; + using position_t = typename layout_t::position_t; + using recorder_t = DeltaRecorderT; + using StagedIterator = typename layout_t::StagedIterator; + using value_t = typename layout_t::value_t; + static constexpr auto FIELD_TYPE = layout_t::FIELD_TYPE; + + NodeExtentAccessorT(NodeExtentRef extent) + : extent{extent}, + node_stage{reinterpret_cast(extent->get_read())} { + if (no_recording()) { + mut.emplace(extent->get_mutable()); + assert(extent->get_recorder() == nullptr); + recorder = nullptr; + } else if (needs_recording()) { + mut.emplace(extent->get_mutable()); + auto p_recorder = extent->get_recorder(); + assert(p_recorder != nullptr); + assert(p_recorder->node_type() == NODE_TYPE); + assert(p_recorder->field_type() == FIELD_TYPE); + recorder = static_cast(p_recorder); + } else if (needs_mutate()) { + // mut is empty + assert(extent->get_recorder() == nullptr || + extent->get_recorder()->is_empty()); + recorder = nullptr; + } else { + ceph_abort("impossible path"); + } +#ifndef NDEBUG + auto ref_recorder = recorder_t::create(); + test_recorder = static_cast(ref_recorder.get()); + test_extent = TestReplayExtent::create( + extent->get_length(), std::move(ref_recorder)); +#endif + } + ~NodeExtentAccessorT() = default; + NodeExtentAccessorT(const NodeExtentAccessorT&) = delete; + NodeExtentAccessorT(NodeExtentAccessorT&&) = 
delete; + NodeExtentAccessorT& operator=(const NodeExtentAccessorT&) = delete; + NodeExtentAccessorT& operator=(NodeExtentAccessorT&&) = delete; + + const node_stage_t& read() const { return node_stage; } + laddr_t get_laddr() const { return extent->get_laddr(); } + + // must be called before any mutate attempts. + // for the safety of mixed read and mutate, call before read. + void prepare_mutate(context_t c) { + if (needs_mutate()) { + auto ref_recorder = recorder_t::create(); + recorder = static_cast(ref_recorder.get()); + extent = extent->mutate(c, std::move(ref_recorder)); + assert(needs_recording()); + node_stage = node_stage_t( + reinterpret_cast(extent->get_read())); + assert(recorder == static_cast(extent->get_recorder())); + mut.emplace(extent->get_mutable()); + } + } + + template + const value_t* insert_replayable( + const full_key_t& key, + const value_t& value, + position_t& insert_pos, + match_stage_t& insert_stage, + node_offset_t& insert_size) { + assert(!needs_mutate()); + if (needs_recording()) { + recorder->template encode_insert( + key, value, insert_pos, insert_stage, insert_size); + } +#ifndef NDEBUG + test_extent->prepare_replay(extent); + test_recorder->template encode_insert( + key, value, insert_pos, insert_stage, insert_size); +#endif + auto ret = layout_t::template insert( + *mut, read(), key, value, + insert_pos, insert_stage, insert_size); +#ifndef NDEBUG + test_extent->replay_and_verify(extent); +#endif + return ret; + } + + void split_replayable(StagedIterator& split_at) { + assert(!needs_mutate()); + if (needs_recording()) { + recorder->encode_split(split_at, read().p_start()); + } +#ifndef NDEBUG + test_extent->prepare_replay(extent); + test_recorder->template encode_split(split_at, read().p_start()); +#endif + layout_t::split(*mut, read(), split_at); +#ifndef NDEBUG + test_extent->replay_and_verify(extent); +#endif + } + + template + const value_t* split_insert_replayable( + StagedIterator& split_at, + const full_key_t& key, +
const value_t& value, + position_t& insert_pos, + match_stage_t& insert_stage, + node_offset_t& insert_size) { + assert(!needs_mutate()); + if (needs_recording()) { + recorder->template encode_split_insert( + split_at, key, value, insert_pos, insert_stage, insert_size, + read().p_start()); + } +#ifndef NDEBUG + test_extent->prepare_replay(extent); + test_recorder->template encode_split_insert( + split_at, key, value, insert_pos, insert_stage, insert_size, + read().p_start()); +#endif + auto ret = layout_t::template split_insert( + *mut, read(), split_at, key, value, + insert_pos, insert_stage, insert_size); +#ifndef NDEBUG + test_extent->replay_and_verify(extent); +#endif + return ret; + } + + void update_child_addr_replayable( + const laddr_t new_addr, laddr_packed_t* p_addr) { + assert(!needs_mutate()); + if (needs_recording()) { + recorder->encode_update_child_addr(new_addr, p_addr, read().p_start()); + } +#ifndef NDEBUG + test_extent->prepare_replay(extent); + test_recorder->encode_update_child_addr(new_addr, p_addr, read().p_start()); +#endif + layout_t::update_child_addr(*mut, new_addr, p_addr); +#ifndef NDEBUG + test_extent->replay_and_verify(extent); +#endif + } + + void test_copy_to(NodeExtentMutable& to) const { + assert(extent->get_length() == to.get_length()); + std::memcpy(to.get_write(), extent->get_read(), extent->get_length()); + } + + private: + /** + * Possible states with CachedExtent::extent_state_t: + * INITIAL_WRITE_PENDING -- can mutate, no recording + * MUTATION_PENDING -- can mutate, needs recording + * CLEAN/DIRTY -- pending mutate + * INVALID -- impossible + */ + bool no_recording() const { + return extent->is_initial_pending(); + } + bool needs_recording() const { + return extent->is_mutation_pending(); + } + bool needs_mutate() const { + assert(extent->is_valid()); + return !extent->is_pending(); + } + + NodeExtentRef extent; + node_stage_t node_stage; + std::optional mut; + // owned by extent + recorder_t* recorder; + +#ifndef NDEBUG + 
// verify record replay using a different memory block + TestReplayExtent::Ref test_extent; + recorder_t* test_recorder; +#endif +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.cc new file mode 100644 index 000000000..bd22d4b67 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.cc @@ -0,0 +1,35 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "node_extent_manager.h" + +#include "node_extent_manager/dummy.h" +#include "node_extent_manager/seastore.h" +#include "stages/node_stage_layout.h" + +namespace crimson::os::seastore::onode { + +std::pair NodeExtent::get_types() const { + const auto header = reinterpret_cast(get_read()); + auto node_type = header->get_node_type(); + auto field_type = header->get_field_type(); + if (!field_type.has_value()) { + throw std::runtime_error("load failed: bad field type"); + } + return {node_type, *field_type}; +} + +NodeExtentManagerURef NodeExtentManager::create_dummy(bool is_sync) { + if (is_sync) { + return NodeExtentManagerURef(new DummyNodeExtentManager()); + } else { + return NodeExtentManagerURef(new DummyNodeExtentManager()); + } +} + +NodeExtentManagerURef NodeExtentManager::create_seastore( + TransactionManager& tm, laddr_t min_laddr) { + return NodeExtentManagerURef(new SeastoreNodeExtentManager(tm, min_laddr)); +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h new file mode 100644 index 000000000..77b230e03 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h @@ -0,0 +1,86 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/common/type_helpers.h" 
+#include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/transaction_manager.h" + +#include "fwd.h" +#include "super.h" +#include "node_extent_mutable.h" +#include "node_types.h" + +/** + * node_extent_manager.h + * + * Contains general interfaces for different backends (Dummy and Seastore). + */ + +namespace crimson::os::seastore::onode { + +using crimson::os::seastore::LogicalCachedExtent; +class NodeExtent : public LogicalCachedExtent { + public: + virtual ~NodeExtent() = default; + std::pair get_types() const; + const char* get_read() const { + return get_bptr().c_str(); + } + NodeExtentMutable get_mutable() { + assert(is_pending()); + return do_get_mutable(); + } + + virtual DeltaRecorder* get_recorder() const = 0; + virtual NodeExtentRef mutate(context_t, DeltaRecorderURef&&) = 0; + + protected: + template + NodeExtent(T&&... t) : LogicalCachedExtent(std::forward(t)...) {} + + NodeExtentMutable do_get_mutable() { + return NodeExtentMutable(*this); + } + + /** + * Abstracted interfaces to implement: + * - CachedExtent::duplicate_for_write() -> CachedExtentRef + * - CachedExtent::get_type() -> extent_types_t + * - CachedExtent::get_delta() -> ceph::bufferlist + * - LogicalCachedExtent::apply_delta(const ceph::bufferlist) -> void + */ + + private: + friend class NodeExtentMutable; +}; + +using crimson::os::seastore::TransactionManager; +class NodeExtentManager { + public: + virtual ~NodeExtentManager() = default; + using tm_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::enoent, + crimson::ct_error::erange>; + template + using tm_future = tm_ertr::future; + + virtual bool is_read_isolated() const = 0; + virtual tm_future read_extent( + Transaction&, laddr_t, extent_len_t) = 0; + virtual tm_future alloc_extent(Transaction&, extent_len_t) = 0; + virtual tm_future get_super(Transaction&, RootNodeTracker&) = 0; + virtual std::ostream& print(std::ostream& os) const = 0; + +
static NodeExtentManagerURef create_dummy(bool is_sync); + static NodeExtentManagerURef create_seastore( + TransactionManager& tm, laddr_t min_laddr = L_ADDR_MIN); +}; +inline std::ostream& operator<<(std::ostream& os, const NodeExtentManager& nm) { + return nm.print(os); +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h new file mode 100644 index 000000000..830ea4a7d --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h @@ -0,0 +1,156 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include +#include + +#include "include/buffer_raw.h" + +#include "crimson/common/log.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h" + +/** + * dummy.h + * + * Dummy backend implementations for test purposes. + */ + +namespace crimson::os::seastore::onode { + +class DummySuper final: public Super { + public: + DummySuper(Transaction& t, RootNodeTracker& tracker, laddr_t* p_root_laddr) + : Super(t, tracker), p_root_laddr{p_root_laddr} {} + ~DummySuper() override = default; + protected: + laddr_t get_root_laddr() const override { return *p_root_laddr; } + void write_root_laddr(context_t, laddr_t addr) override { + logger().info("OTree::Dummy: update root {:#x} ...", addr); + *p_root_laddr = addr; + } + private: + static seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } + laddr_t* p_root_laddr; +}; + +class DummyNodeExtent final: public NodeExtent { + public: + DummyNodeExtent(ceph::bufferptr &&ptr) : NodeExtent(std::move(ptr)) { + state = extent_state_t::INITIAL_WRITE_PENDING; + } + ~DummyNodeExtent() override = default; + protected: + NodeExtentRef mutate(context_t, DeltaRecorderURef&&) override { + ceph_abort("impossible path"); } + DeltaRecorder* get_recorder() 
const override { + return nullptr; } + CachedExtentRef duplicate_for_write() override { + ceph_abort("impossible path"); } + extent_types_t get_type() const override { + return extent_types_t::TEST_BLOCK; } + ceph::bufferlist get_delta() override { + ceph_abort("impossible path"); } + void apply_delta(const ceph::bufferlist&) override { + ceph_abort("impossible path"); } +}; + +template +class DummyNodeExtentManager final: public NodeExtentManager { + static constexpr size_t ALIGNMENT = 4096; + public: + ~DummyNodeExtentManager() override = default; + protected: + bool is_read_isolated() const override { return false; } + + tm_future read_extent( + Transaction& t, laddr_t addr, extent_len_t len) override { + logger().trace("OTree::Dummy: reading {}B at {:#x} ...", len, addr); + if constexpr (SYNC) { + return read_extent_sync(t, addr, len); + } else { + using namespace std::chrono_literals; + return seastar::sleep(1us).then([this, &t, addr, len] { + return read_extent_sync(t, addr, len); + }); + } + } + + tm_future alloc_extent( + Transaction& t, extent_len_t len) override { + logger().trace("OTree::Dummy: allocating {}B ...", len); + if constexpr (SYNC) { + return alloc_extent_sync(t, len); + } else { + using namespace std::chrono_literals; + return seastar::sleep(1us).then([this, &t, len] { + return alloc_extent_sync(t, len); + }); + } + } + + tm_future get_super( + Transaction& t, RootNodeTracker& tracker) override { + logger().trace("OTree::Dummy: get root ..."); + if constexpr (SYNC) { + return get_super_sync(t, tracker); + } else { + using namespace std::chrono_literals; + return seastar::sleep(1us).then([this, &t, &tracker] { + return get_super_sync(t, tracker); + }); + } + } + + std::ostream& print(std::ostream& os) const override { + return os << "DummyNodeExtentManager(sync=" << SYNC << ")"; + } + + private: + tm_future read_extent_sync( + Transaction& t, laddr_t addr, extent_len_t len) { + auto iter = allocate_map.find(addr); + assert(iter != 
allocate_map.end()); + auto extent = iter->second; + logger().trace("OTree::Dummy: read {}B at {:#x}", + extent->get_length(), extent->get_laddr()); + assert(extent->get_laddr() == addr); + assert(extent->get_length() == len); + return tm_ertr::make_ready_future(extent); + } + + tm_future alloc_extent_sync( + Transaction& t, extent_len_t len) { + assert(len % ALIGNMENT == 0); + auto r = ceph::buffer::create_aligned(len, ALIGNMENT); + auto addr = reinterpret_cast(r->get_data()); + auto bp = ceph::bufferptr(std::move(r)); + auto extent = Ref(new DummyNodeExtent(std::move(bp))); + extent->set_laddr(addr); + assert(allocate_map.find(extent->get_laddr()) == allocate_map.end()); + allocate_map.insert({extent->get_laddr(), extent}); + logger().debug("OTree::Dummy: allocated {}B at {:#x}", + extent->get_length(), extent->get_laddr()); + assert(extent->get_length() == len); + return tm_ertr::make_ready_future(extent); + } + + tm_future get_super_sync( + Transaction& t, RootNodeTracker& tracker) { + logger().debug("OTree::Dummy: got root {:#x}", root_laddr); + return tm_ertr::make_ready_future( + Super::URef(new DummySuper(t, tracker, &root_laddr))); + } + + static seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } + + std::map> allocate_map; + laddr_t root_laddr = L_ADDR_NULL; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.cc new file mode 100644 index 000000000..8d88485bf --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.cc @@ -0,0 +1,88 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "seastore.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h" + +namespace { + 
+seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); +} + +} + +namespace crimson::os::seastore::onode { + +static DeltaRecorderURef create_recorder( + node_type_t node_type, field_type_t field_type) { + if (node_type == node_type_t::LEAF) { + if (field_type == field_type_t::N0) { + return DeltaRecorderT::create(); + } else if (field_type == field_type_t::N1) { + return DeltaRecorderT::create(); + } else if (field_type == field_type_t::N2) { + return DeltaRecorderT::create(); + } else if (field_type == field_type_t::N3) { + return DeltaRecorderT::create(); + } else { + ceph_abort("impossible path"); + } + } else if (node_type == node_type_t::INTERNAL) { + if (field_type == field_type_t::N0) { + return DeltaRecorderT::create(); + } else if (field_type == field_type_t::N1) { + return DeltaRecorderT::create(); + } else if (field_type == field_type_t::N2) { + return DeltaRecorderT::create(); + } else if (field_type == field_type_t::N3) { + return DeltaRecorderT::create(); + } else { + ceph_abort("impossible path"); + } + } else { + ceph_abort("impossible path"); + } +} + +void SeastoreSuper::write_root_laddr(context_t c, laddr_t addr) { + logger().info("OTree::Seastore: update root {:#x} ...", addr); + root_addr = addr; + auto nm = static_cast(&c.nm); + nm->get_tm().write_onode_root(c.t, addr); +} + +NodeExtentRef SeastoreNodeExtent::mutate( + context_t c, DeltaRecorderURef&& _recorder) { + logger().debug("OTree::Seastore: mutate {:#x} ...", get_laddr()); + auto nm = static_cast(&c.nm); + auto extent = nm->get_tm().get_mutable_extent(c.t, this); + auto ret = extent->cast(); + assert(!ret->recorder || ret->recorder->is_empty()); + ret->recorder = std::move(_recorder); + return ret; +} + +void SeastoreNodeExtent::apply_delta(const ceph::bufferlist& bl) { + logger().debug("OTree::Seastore: replay {:#x} ...", get_laddr()); + if (!recorder) { + auto [node_type, field_type] = get_types(); + recorder = create_recorder(node_type, field_type); + } 
else { +#ifndef NDEBUG + auto [node_type, field_type] = get_types(); + assert(recorder->node_type() == node_type); + assert(recorder->field_type() == field_type); +#endif + } + assert(is_clean()); + auto node = do_get_mutable(); + auto p = bl.cbegin(); + while (p != bl.end()) { + recorder->apply_delta(p, node); + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h new file mode 100644 index 000000000..f80b99fab --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h @@ -0,0 +1,126 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/common/log.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h" + +/** + * seastore.h + * + * Seastore backend implementations. 
+ */ + +namespace crimson::os::seastore::onode { + +class SeastoreSuper final: public Super { + public: + SeastoreSuper(Transaction& t, RootNodeTracker& tracker, + laddr_t root_addr, TransactionManager& tm) + : Super(t, tracker), root_addr{root_addr}, tm{tm} {} + ~SeastoreSuper() override = default; + protected: + laddr_t get_root_laddr() const override { + return root_addr; + } + void write_root_laddr(context_t c, laddr_t addr) override; + private: + static seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } + laddr_t root_addr; + TransactionManager& tm; +}; + +class SeastoreNodeExtent final: public NodeExtent { + public: + SeastoreNodeExtent(ceph::bufferptr &&ptr) + : NodeExtent(std::move(ptr)) {} + SeastoreNodeExtent(const SeastoreNodeExtent& other) + : NodeExtent(other) {} + ~SeastoreNodeExtent() override = default; + protected: + NodeExtentRef mutate(context_t, DeltaRecorderURef&&) override; + + DeltaRecorder* get_recorder() const override { + return recorder.get(); + } + + CachedExtentRef duplicate_for_write() override { + return CachedExtentRef(new SeastoreNodeExtent(*this)); + } + extent_types_t get_type() const override { + return extent_types_t::ONODE_BLOCK_STAGED; + } + ceph::bufferlist get_delta() override { + assert(recorder); + return recorder->get_delta(); + } + void apply_delta(const ceph::bufferlist&) override; + private: + DeltaRecorderURef recorder; +}; + +class SeastoreNodeExtentManager final: public NodeExtentManager { + public: + SeastoreNodeExtentManager(TransactionManager& tm, laddr_t min) + : tm{tm}, addr_min{min} {}; + ~SeastoreNodeExtentManager() override = default; + TransactionManager& get_tm() { return tm; } + protected: + bool is_read_isolated() const override { return true; } + + tm_future read_extent( + Transaction& t, laddr_t addr, extent_len_t len) override { + logger().debug("OTree::Seastore: reading {}B at {:#x} ...", len, addr); + return tm.read_extents(t, addr, len + ).safe_then([addr, 
len](auto&& extents) { + assert(extents.size() == 1); + [[maybe_unused]] auto [laddr, e] = extents.front(); + logger().trace("OTree::Seastore: read {}B at {:#x}", + e->get_length(), e->get_laddr()); + assert(e->get_laddr() == addr); + assert(e->get_length() == len); + std::ignore = addr; + std::ignore = len; + return NodeExtentRef(e); + }); + } + + tm_future alloc_extent( + Transaction& t, extent_len_t len) override { + logger().debug("OTree::Seastore: allocating {}B ...", len); + return tm.alloc_extent(t, addr_min, len + ).safe_then([len](auto extent) { + logger().debug("OTree::Seastore: allocated {}B at {:#x}", + extent->get_length(), extent->get_laddr()); + assert(extent->get_length() == len); + std::ignore = len; + return NodeExtentRef(extent); + }); + } + + tm_future get_super( + Transaction& t, RootNodeTracker& tracker) override { + logger().trace("OTree::Seastore: get root ..."); + return tm.read_onode_root(t).safe_then([this, &t, &tracker](auto root_addr) { + logger().debug("OTree::Seastore: got root {:#x}", root_addr); + return Super::URef(new SeastoreSuper(t, tracker, root_addr, tm)); + }); + } + + std::ostream& print(std::ostream& os) const override { + return os << "SeastoreNodeExtentManager"; + } + + private: + static seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } + TransactionManager& tm; + const laddr_t addr_min; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h new file mode 100644 index 000000000..240c88932 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h @@ -0,0 +1,67 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h" +#include 
"crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h" + +/** test_replay.h + * + * A special version of NodeExtent to help verify delta encode, decode and + * replay in recorder_t under debug build. + */ + +namespace crimson::os::seastore::onode { + +class TestReplayExtent final: public NodeExtent { + public: + using Ref = crimson::os::seastore::TCachedExtentRef; + + void prepare_replay(NodeExtentRef from_extent) { + assert(get_length() == from_extent->get_length()); + auto mut = do_get_mutable(); + std::memcpy(mut.get_write(), from_extent->get_read(), get_length()); + } + + void replay_and_verify(NodeExtentRef replayed_extent) { + assert(get_length() == replayed_extent->get_length()); + auto mut = do_get_mutable(); + auto bl = recorder->get_delta(); + assert(bl.length()); + auto p = bl.cbegin(); + recorder->apply_delta(p, mut); + assert(p == bl.end()); + auto cmp = std::memcmp(get_read(), replayed_extent->get_read(), get_length()); + ceph_assert(cmp == 0 && "replay mismatch!"); + } + + static Ref create(extent_len_t length, DeltaRecorderURef&& recorder) { + auto r = ceph::buffer::create_aligned(length, 4096); + auto bp = ceph::bufferptr(std::move(r)); + return new TestReplayExtent(std::move(bp), std::move(recorder)); + } + + protected: + NodeExtentRef mutate(context_t, DeltaRecorderURef&&) override { + ceph_abort("impossible path"); } + DeltaRecorder* get_recorder() const override { + ceph_abort("impossible path"); } + CachedExtentRef duplicate_for_write() override { + ceph_abort("impossible path"); } + extent_types_t get_type() const override { + return extent_types_t::TEST_BLOCK; } + ceph::bufferlist get_delta() override { + ceph_abort("impossible path"); } + void apply_delta(const ceph::bufferlist&) override { + ceph_abort("impossible path"); } + + private: + TestReplayExtent(ceph::bufferptr&& ptr, DeltaRecorderURef&& recorder) + : NodeExtent(std::move(ptr)), recorder(std::move(recorder)) { + state = extent_state_t::MUTATION_PENDING; + } + 
DeltaRecorderURef recorder; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.cc new file mode 100644 index 000000000..048c4000d --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.cc @@ -0,0 +1,39 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "node_extent_mutable.h" +#include "node_extent_manager.h" + +namespace crimson::os::seastore::onode { + +NodeExtentMutable::NodeExtentMutable(NodeExtent& extent) + : extent{extent} { + assert(extent.is_pending() || // during mutation + extent.is_clean()); // during replay +} + +const char* NodeExtentMutable::get_read() const { + assert(extent.is_pending() || // during mutation + extent.is_clean()); // during replay + return extent.get_bptr().c_str(); +} + +char* NodeExtentMutable::get_write() { + assert(extent.is_pending() || // during mutation + extent.is_clean()); // during replay + return extent.get_bptr().c_str(); +} + +extent_len_t NodeExtentMutable::get_length() const { + return extent.get_length(); +} + +laddr_t NodeExtentMutable::get_laddr() const { + return extent.get_laddr(); +} + +const char* NodeExtentMutable::buf_upper_bound() const { + return get_read() + get_length(); +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h new file mode 100644 index 000000000..52f10a013 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h @@ -0,0 +1,80 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include + +#include "fwd.h" + +#pragma once + +namespace crimson::os::seastore::onode { + +class NodeExtent; + +/** + * NodeExtentMutable + * + * A thin wrapper of NodeExtent to make sure 
that only the newly allocated + * or the duplicated NodeExtent is mutable, and the memory modifications are + * safe within the extent range. + */ +class NodeExtentMutable { + public: + void copy_in_absolute(void* dst, const void* src, extent_len_t len) { + assert((char*)dst >= get_write()); + assert((char*)dst + len <= buf_upper_bound()); + std::memcpy(dst, src, len); + } + template + void copy_in_absolute(void* dst, const T& src) { + copy_in_absolute(dst, &src, sizeof(T)); + } + + const void* copy_in_relative( + extent_len_t dst_offset, const void* src, extent_len_t len) { + auto dst = get_write() + dst_offset; + copy_in_absolute(dst, src, len); + return dst; + } + template + const T* copy_in_relative( + extent_len_t dst_offset, const T& src) { + auto dst = copy_in_relative(dst_offset, &src, sizeof(T)); + return static_cast(dst); + } + + void shift_absolute(const void* src, extent_len_t len, int offset) { + assert((const char*)src >= get_write()); + assert((const char*)src + len <= buf_upper_bound()); + char* to = (char*)src + offset; + assert(to >= get_write()); + assert(to + len <= buf_upper_bound()); + if (len != 0) { + std::memmove(to, src, len); + } + } + void shift_relative(extent_len_t src_offset, extent_len_t len, int offset) { + shift_absolute(get_write() + src_offset, len, offset); + } + + template + void validate_inplace_update(const T& updated) { + assert((const char*)&updated >= get_write()); + assert((const char*)&updated + sizeof(T) <= buf_upper_bound()); + } + + const char* get_read() const; + char* get_write(); + extent_len_t get_length() const; + laddr_t get_laddr() const; + + private: + explicit NodeExtentMutable(NodeExtent&); + const char* buf_upper_bound() const; + + NodeExtent& extent; + + friend class NodeExtent; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.cc new file mode 100644 index 000000000..59d792b1a --- /dev/null +++ 
b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.cc @@ -0,0 +1,76 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "node_impl.h" +#include "node_layout.h" + +namespace crimson::os::seastore::onode { + +#ifdef UNIT_TESTS_BUILT +last_split_info_t last_split = {}; +#endif + +// XXX: branchless allocation +InternalNodeImpl::alloc_ertr::future +InternalNodeImpl::allocate( + context_t c, field_type_t type, bool is_level_tail, level_t level) { + if (type == field_type_t::N0) { + return InternalNode0::allocate(c, is_level_tail, level); + } else if (type == field_type_t::N1) { + return InternalNode1::allocate(c, is_level_tail, level); + } else if (type == field_type_t::N2) { + return InternalNode2::allocate(c, is_level_tail, level); + } else if (type == field_type_t::N3) { + return InternalNode3::allocate(c, is_level_tail, level); + } else { + ceph_abort("impossible path"); + } +} + +LeafNodeImpl::alloc_ertr::future +LeafNodeImpl::allocate( + context_t c, field_type_t type, bool is_level_tail) { + if (type == field_type_t::N0) { + return LeafNode0::allocate(c, is_level_tail, 0); + } else if (type == field_type_t::N1) { + return LeafNode1::allocate(c, is_level_tail, 0); + } else if (type == field_type_t::N2) { + return LeafNode2::allocate(c, is_level_tail, 0); + } else if (type == field_type_t::N3) { + return LeafNode3::allocate(c, is_level_tail, 0); + } else { + ceph_abort("impossible path"); + } +} + +InternalNodeImplURef InternalNodeImpl::load( + NodeExtentRef extent, field_type_t type, bool expect_is_level_tail) { + if (type == field_type_t::N0) { + return InternalNode0::load(extent, expect_is_level_tail); + } else if (type == field_type_t::N1) { + return InternalNode1::load(extent, expect_is_level_tail); + } else if (type == field_type_t::N2) { + return InternalNode2::load(extent, expect_is_level_tail); + } else if (type == field_type_t::N3) { + return InternalNode3::load(extent, 
expect_is_level_tail); + } else { + ceph_abort("impossible path"); + } +} + +LeafNodeImplURef LeafNodeImpl::load( + NodeExtentRef extent, field_type_t type, bool expect_is_level_tail) { + if (type == field_type_t::N0) { + return LeafNode0::load(extent, expect_is_level_tail); + } else if (type == field_type_t::N1) { + return LeafNode1::load(extent, expect_is_level_tail); + } else if (type == field_type_t::N2) { + return LeafNode2::load(extent, expect_is_level_tail); + } else if (type == field_type_t::N3) { + return LeafNode3::load(extent, expect_is_level_tail); + } else { + ceph_abort("impossible path"); + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h new file mode 100644 index 000000000..3267cda2b --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h @@ -0,0 +1,197 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include + +#include "node_extent_mutable.h" +#include "node_types.h" +#include "stages/stage_types.h" + +namespace crimson::os::seastore::onode { + +#ifdef UNIT_TESTS_BUILT +enum class InsertType { BEGIN, LAST, MID }; +struct split_expectation_t { + match_stage_t split_stage; + match_stage_t insert_stage; + bool is_insert_left; + InsertType insert_type; +}; +struct last_split_info_t { + search_position_t split_pos; + match_stage_t insert_stage; + bool is_insert_left; + InsertType insert_type; + bool match(const split_expectation_t& e) const { + match_stage_t split_stage; + if (split_pos.nxt.nxt.index == 0) { + if (split_pos.nxt.index == 0) { + split_stage = 2; + } else { + split_stage = 1; + } + } else { + split_stage = 0; + } + return split_stage == e.split_stage && + insert_stage == e.insert_stage && + is_insert_left == e.is_insert_left && + insert_type == e.insert_type; + } + bool match_split_pos(const search_position_t& pos) const { + return 
split_pos == pos; + } +}; +extern last_split_info_t last_split; +#endif + +struct key_hobj_t; +struct key_view_t; +class NodeExtentMutable; + +/** + * NodeImpl + * + * Hides type specific node layout implementations for Node. + */ +class NodeImpl { + public: + using alloc_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::enoent, + crimson::ct_error::erange>; + virtual ~NodeImpl() = default; + + virtual field_type_t field_type() const = 0; + virtual laddr_t laddr() const = 0; + virtual void prepare_mutate(context_t) = 0; + virtual bool is_level_tail() const = 0; + virtual bool is_empty() const = 0; + virtual level_t level() const = 0; + virtual node_offset_t free_size() const = 0; + virtual key_view_t get_key_view(const search_position_t&) const = 0; + virtual key_view_t get_largest_key_view() const = 0; + virtual void next_position(search_position_t&) const = 0; + + virtual node_stats_t get_stats() const = 0; + virtual std::ostream& dump(std::ostream&) const = 0; + virtual std::ostream& dump_brief(std::ostream&) const = 0; + virtual void validate_layout() const = 0; + + virtual void test_copy_to(NodeExtentMutable&) const = 0; + virtual void test_set_tail(NodeExtentMutable&) = 0; + + protected: + NodeImpl() = default; +}; + +/** + * InternalNodeImpl + * + * Hides type specific node layout implementations for InternalNode. 
+ */ +class InternalNodeImpl : public NodeImpl { + public: + struct internal_marker_t {}; + virtual ~InternalNodeImpl() = default; + + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual const laddr_packed_t* get_p_value( + const search_position_t&, + key_view_t* = nullptr, internal_marker_t = {}) const { + ceph_abort("impossible path"); + } + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual lookup_result_t lower_bound( + const key_hobj_t&, MatchHistory&, + key_view_t* = nullptr, internal_marker_t = {}) const { + ceph_abort("impossible path"); + } + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual const laddr_packed_t* insert( + const key_view_t&, const laddr_packed_t&, search_position_t&, match_stage_t&, node_offset_t&) { + ceph_abort("impossible path"); + } + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual std::tuple split_insert( + NodeExtentMutable&, NodeImpl&, const key_view_t&, const laddr_packed_t&, + search_position_t&, match_stage_t&, node_offset_t&) { + ceph_abort("impossible path"); + } + + virtual void replace_child_addr(const search_position_t&, laddr_t dst, laddr_t src) = 0; + virtual std::tuple evaluate_insert( + const key_view_t&, const laddr_t&, search_position_t&) const = 0; + + struct fresh_impl_t { + InternalNodeImplURef impl; + NodeExtentMutable mut; + std::pair make_pair() { + return {std::move(impl), mut}; + } + }; + static alloc_ertr::future allocate(context_t, field_type_t, bool, level_t); + static InternalNodeImplURef load(NodeExtentRef, field_type_t, bool); + + protected: + InternalNodeImpl() = default; +}; + +/** + * LeafNodeImpl + * + * Hides type specific node layout implementations for LeafNode. 
+ */ +class LeafNodeImpl : public NodeImpl { + public: + struct leaf_marker_t {}; + virtual ~LeafNodeImpl() = default; + + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual const onode_t* get_p_value( + const search_position_t&, + key_view_t* = nullptr, leaf_marker_t={}) const { + ceph_abort("impossible path"); + } + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual lookup_result_t lower_bound( + const key_hobj_t&, MatchHistory&, + key_view_t* = nullptr, leaf_marker_t = {}) const { + ceph_abort("impossible path"); + } + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual const onode_t* insert( + const key_hobj_t&, const onode_t&, search_position_t&, match_stage_t&, node_offset_t&) { + ceph_abort("impossible path"); + } + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual std::tuple split_insert( + NodeExtentMutable&, NodeImpl&, const key_hobj_t&, const onode_t&, + search_position_t&, match_stage_t&, node_offset_t&) { + ceph_abort("impossible path"); + } + + virtual void get_largest_slot( + search_position_t&, key_view_t&, const onode_t**) const = 0; + virtual std::tuple evaluate_insert( + const key_hobj_t&, const onode_t&, + const MatchHistory&, match_stat_t, search_position_t&) const = 0; + + struct fresh_impl_t { + LeafNodeImplURef impl; + NodeExtentMutable mut; + std::pair make_pair() { + return {std::move(impl), mut}; + } + }; + static alloc_ertr::future allocate(context_t, field_type_t, bool); + static LeafNodeImplURef load(NodeExtentRef, field_type_t, bool); + + protected: + LeafNodeImpl() = default; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h new file mode 100644 index 000000000..916d17424 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h @@ -0,0 +1,613 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + 
+#pragma once + +#include +#include + +#include "common/likely.h" +#include "crimson/common/log.h" +#include "node_extent_accessor.h" +#include "node_impl.h" +#include "stages/node_stage_layout.h" + +namespace crimson::os::seastore::onode { + +template struct insert_key_type; +template <> struct insert_key_type { + static constexpr auto type = KeyT::VIEW; }; +template <> struct insert_key_type { + static constexpr auto type = KeyT::HOBJ; }; + +template struct node_impl_type; +template <> struct node_impl_type { + using type = InternalNodeImpl; }; +template <> struct node_impl_type { + using type = LeafNodeImpl; }; + +template struct node_marker_type; +template <> struct node_marker_type { + using type = InternalNodeImpl::internal_marker_t; }; +template <> struct node_marker_type { + using type = LeafNodeImpl::leaf_marker_t; }; + +/** + * NodeLayoutT + * + * Contains templated and concrete implementations for both InternalNodeImpl + * and LeafNodeImpl under a specific node layout. + */ +template +class NodeLayoutT final : public InternalNodeImpl, public LeafNodeImpl { + public: + using URef = std::unique_ptr; + using extent_t = NodeExtentAccessorT; + using parent_t = typename node_impl_type::type; + using marker_t = typename node_marker_type::type; + using node_stage_t = typename extent_t::node_stage_t; + using position_t = typename extent_t::position_t; + using value_t = typename extent_t::value_t; + static constexpr auto FIELD_TYPE = extent_t::FIELD_TYPE; + static constexpr auto KEY_TYPE = insert_key_type::type; + static constexpr auto STAGE = STAGE_T::STAGE; + + NodeLayoutT(const NodeLayoutT&) = delete; + NodeLayoutT(NodeLayoutT&&) = delete; + NodeLayoutT& operator=(const NodeLayoutT&) = delete; + NodeLayoutT& operator=(NodeLayoutT&&) = delete; + ~NodeLayoutT() override = default; + + static URef load(NodeExtentRef extent, bool expect_is_level_tail) { + std::unique_ptr ret(new NodeLayoutT(extent)); + assert(ret->is_level_tail() == expect_is_level_tail); + return 
ret; + } + + using alloc_ertr = NodeExtentManager::tm_ertr; + static alloc_ertr::future allocate( + context_t c, bool is_level_tail, level_t level) { + // NOTE: Currently, all the node types have the same size for simplicity. + // But depending on the requirement, we may need to make node size + // configurable by field_type_t and node_type_t, or totally flexible. + return c.nm.alloc_extent(c.t, node_stage_t::EXTENT_SIZE + ).safe_then([is_level_tail, level](auto extent) { + assert(extent->is_initial_pending()); + auto mut = extent->get_mutable(); + node_stage_t::bootstrap_extent( + mut, FIELD_TYPE, NODE_TYPE, is_level_tail, level); + return typename parent_t::fresh_impl_t{ + std::unique_ptr(new NodeLayoutT(extent)), mut}; + }); + } + + protected: + /* + * NodeImpl + */ + field_type_t field_type() const override { return FIELD_TYPE; } + laddr_t laddr() const override { return extent.get_laddr(); } + void prepare_mutate(context_t c) override { return extent.prepare_mutate(c); } + bool is_level_tail() const override { return extent.read().is_level_tail(); } + bool is_empty() const override { return extent.read().keys() == 0; } + level_t level() const override { return extent.read().level(); } + node_offset_t free_size() const override { return extent.read().free_size(); } + + key_view_t get_key_view(const search_position_t& position) const override { + key_view_t ret; + STAGE_T::get_key_view(extent.read(), cast_down(position), ret); + return ret; + } + + key_view_t get_largest_key_view() const override { + key_view_t index_key; + STAGE_T::template lookup_largest_slot( + extent.read(), nullptr, &index_key, nullptr); + return index_key; + } + + void next_position(search_position_t& pos) const override { + assert(!pos.is_end()); + bool find_next = STAGE_T::next_position(extent.read(), cast_down(pos)); + if (find_next) { + pos = search_position_t::end(); + } + } + + node_stats_t get_stats() const override { + node_stats_t stats; + auto& node_stage = extent.read(); + 
key_view_t index_key; + if (node_stage.keys()) { + STAGE_T::get_stats(node_stage, stats, index_key); + } + stats.size_persistent = node_stage_t::EXTENT_SIZE; + stats.size_filled = filled_size(); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + if (is_level_tail()) { + stats.size_logical += sizeof(value_t); + stats.size_value += sizeof(value_t); + stats.num_kvs += 1; + } + } + return stats; + } + + std::ostream& dump(std::ostream& os) const override { + auto& node_stage = extent.read(); + auto p_start = node_stage.p_start(); + dump_brief(os); + auto stats = get_stats(); + os << " num_kvs=" << stats.num_kvs + << ", logical=" << stats.size_logical + << "B, overhead=" << stats.size_overhead + << "B, value=" << stats.size_value << "B"; + os << ":\n header: " << node_stage_t::header_size() << "B"; + size_t size = 0u; + if (node_stage.keys()) { + STAGE_T::dump(node_stage, os, " ", size, p_start); + } else { + size += node_stage_t::header_size(); + if (NODE_TYPE == node_type_t::LEAF || !node_stage.is_level_tail()) { + os << " empty!"; + } + } + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + if (node_stage.is_level_tail()) { + size += sizeof(laddr_t); + auto value_ptr = node_stage.get_end_p_laddr(); + int offset = reinterpret_cast(value_ptr) - p_start; + os << "\n tail value: 0x" + << std::hex << value_ptr->value << std::dec + << " " << size << "B" + << " @" << offset << "B"; + } + } + assert(size == filled_size()); + return os; + } + + std::ostream& dump_brief(std::ostream& os) const override { + auto& node_stage = extent.read(); + os << "Node" << NODE_TYPE << FIELD_TYPE + << "@0x" << std::hex << extent.get_laddr() + << "+" << node_stage_t::EXTENT_SIZE << std::dec + << (node_stage.is_level_tail() ? 
"$" : "") + << "(level=" << (unsigned)node_stage.level() + << ", filled=" << filled_size() << "B" + << ", free=" << node_stage.free_size() << "B" + << ")"; + return os; + } + + void validate_layout() const override { +#ifndef NDEBUG + STAGE_T::validate(extent.read()); +#endif + } + + void test_copy_to(NodeExtentMutable& to) const override { + extent.test_copy_to(to); + } + + void test_set_tail(NodeExtentMutable& mut) override { + node_stage_t::update_is_level_tail(mut, extent.read(), true); + } + + /* + * Common + */ + const value_t* get_p_value(const search_position_t& position, + key_view_t* index_key=nullptr, marker_t={}) const override { + auto& node_stage = extent.read(); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + assert(!index_key); + if (position.is_end()) { + assert(is_level_tail()); + return node_stage.get_end_p_laddr(); + } + } else { + assert(!position.is_end()); + } + if (index_key) { + return STAGE_T::template get_p_value( + node_stage, cast_down(position), index_key); + } else { + return STAGE_T::get_p_value(node_stage, cast_down(position)); + } + } + + lookup_result_t lower_bound( + const key_hobj_t& key, MatchHistory& history, + key_view_t* index_key=nullptr, marker_t={}) const override { + auto& node_stage = extent.read(); + if constexpr (NODE_TYPE == node_type_t::LEAF) { + if (unlikely(node_stage.keys() == 0)) { + history.set(MatchKindCMP::LT); + return lookup_result_t::end(); + } + } + + typename STAGE_T::result_t result_raw; + if (index_key) { + result_raw = STAGE_T::template lower_bound( + node_stage, key, history, index_key); +#ifndef NDEBUG + if (!result_raw.is_end()) { + full_key_t index; + STAGE_T::get_key_view(node_stage, result_raw.position, index); + assert(index == *index_key); + } +#endif + } else { + result_raw = STAGE_T::lower_bound(node_stage, key, history); + } +#ifndef NDEBUG + if (result_raw.is_end()) { + assert(result_raw.mstat == MSTAT_END); + } else { + full_key_t index; + STAGE_T::get_key_view(node_stage, 
result_raw.position, index); + assert_mstat(key, index, result_raw.mstat); + } +#endif + + // calculate MSTAT_LT3 + if constexpr (FIELD_TYPE == field_type_t::N0) { + // currently only internal node checks mstat + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + if (result_raw.mstat == MSTAT_LT2) { + auto cmp = compare_to( + key, node_stage[result_raw.position.index].shard_pool); + assert(cmp != MatchKindCMP::GT); + if (cmp != MatchKindCMP::EQ) { + result_raw.mstat = MSTAT_LT3; + } + } + } + } + + auto result = normalize(std::move(result_raw)); + if (result.is_end()) { + assert(node_stage.is_level_tail()); + assert(result.p_value == nullptr); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + result.p_value = node_stage.get_end_p_laddr(); + } + } else { + assert(result.p_value != nullptr); + } + return result; + } + + const value_t* insert( + const full_key_t& key, const value_t& value, + search_position_t& insert_pos, match_stage_t& insert_stage, + node_offset_t& insert_size) override { + logger().debug("OTree::Layout::Insert: begin at " + "insert_pos({}), insert_stage={}, insert_size={}B ...", + insert_pos, insert_stage, insert_size); + if (unlikely(logger().is_enabled(seastar::log_level::trace))) { + std::ostringstream sos; + dump(sos); + logger().trace("OTree::Layout::Insert: -- dump\n{}", sos.str()); + } + auto ret = extent.template insert_replayable( + key, value, cast_down(insert_pos), insert_stage, insert_size); + logger().debug("OTree::Layout::Insert: done at " + "insert_pos({}), insert_stage={}, insert_size={}B", + insert_pos, insert_stage, insert_size); + if (unlikely(logger().is_enabled(seastar::log_level::trace))) { + std::ostringstream sos; + dump(sos); + logger().trace("OTree::Layout::Insert: -- dump\n{}", sos.str()); + } + validate_layout(); + assert(get_key_view(insert_pos) == key); + return ret; + } + + std::tuple split_insert( + NodeExtentMutable& right_mut, NodeImpl& right_impl, + const full_key_t& key, const value_t& value, + 
search_position_t& _insert_pos, match_stage_t& insert_stage, + node_offset_t& insert_size) override { + logger().info("OTree::Layout::Split: begin at " + "insert_pos({}), insert_stage={}, insert_size={}B, " + "{:#x}=>{:#x} ...", + _insert_pos, insert_stage, insert_size, + laddr(), right_impl.laddr()); + if (unlikely(logger().is_enabled(seastar::log_level::debug))) { + std::ostringstream sos; + dump(sos); + logger().debug("OTree::Layout::Split: -- dump\n{}", sos.str()); + } +#ifdef UNIT_TESTS_BUILT + auto insert_stage_pre = insert_stage; +#endif + + auto& insert_pos = cast_down(_insert_pos); + auto& node_stage = extent.read(); + typename STAGE_T::StagedIterator split_at; + bool is_insert_left; + size_t split_size; + size_t target_split_size; + { + size_t empty_size = node_stage.size_before(0); + size_t filled_kv_size = filled_size() - empty_size; + /** NODE_BLOCK_SIZE considerations + * + * Generally, + * target_split_size = (filled_size + insert_size) / 2 + * We can have two locate_split() strategies: + * A. the simpler one is to locate the largest split position where + * the estimated left_node_size <= target_split_size; + * B. the fair one takes a further step to calculate the next slot of + * P KiB, and if left_node_size + P/2 < target_split_size, compensate + * the split position to include the next slot; (TODO) + * + * Say that the node_block_size = N KiB, the largest allowed + * insert_size = 1/I * N KiB (I > 1). We want to identify the minimal 'I' + * that won't lead to "double split" effect, meaning after a split, + * the right node size is still larger than N KiB and need to split + * again. I think "double split" makes split much more complicated and + * we can no longer identify whether the node is safe under concurrent + * operations. + * + * We need to evaluate the worst case in order to identify 'I'. 
This means: + * - filled_size ~= N KiB + * - insert_size == N/I KiB + * - target_split_size ~= (I+1)/2I * N KiB + * To simplify the below calculations, node_block_size is normalized to 1. + * + * With strategy A, the worst case is when left_node_size cannot include + * the next slot that will just overflow the target_split_size: + * - left_node_size + 1/I ~= (I+1)/2I + * - left_node_size ~= (I-1)/2I + * - right_node_size ~= 1 + 1/I - left_node_size ~= (I+3)/2I + * The right_node_size cannot be larger than the node_block_size in the + * worst case, which means (I+3)/2I < 1, so I > 3, meaning the largest + * possible insert_size must be smaller than 1/3 of the node_block_size. + * + * With strategy B, the worst case is when left_node_size cannot include + * the next slot that will just overflow the threshold + * target_split_size - 1/2I, thus: + * - left_node_size ~= (I+1)/2I - 1/2I ~= 1/2 + * - right_node_size ~= 1 + 1/I - 1/2 ~= (I+2)/2I < node_block_size(1) + * - I > 2 + * This means the largest possible insert_size must be smaller than 1/2 of + * the node_block_size, which is better than strategy A. + + * In order to avoid "double split", there is another side-effect we need + * to take into consideration: if split happens with snap-gen indexes, the + * corresponding ns-oid string needs to be copied to the right node. That is + * to say: right_node_size + string_size < node_block_size. + * + * Say that the largest allowed string size is 1/S of the largest allowed + * insert_size N/I KiB.
If we go with strategy B, the equation should be + * changed to: + * - right_node_size ~= (I+2)/2I + 1/(I*S) < 1 + * - I > 2 + 2/S (S > 1) + * + * Now back to NODE_BLOCK_SIZE calculation, if we have limits of at most + * X KiB ns-oid string and Y KiB of onode_t to store in this BTree, then: + * - largest_insert_size ~= X+Y KiB + * - 1/S == X/(X+Y) + * - I > (4X+2Y)/(X+Y) + * - node_block_size(N) == I * insert_size > 4X+2Y KiB + * + * In conclusion, + * (TODO) the current node block size (4 KiB) is too small to + * store an entire 2 KiB ns-oid string. We need to consider a larger + * node_block_size. + * + * We are setting X = Y = 640 B in order not to break the current + * implementations with 4KiB node. + * + * (TODO) Implement smarter logic to check when "double split" happens. + */ + target_split_size = empty_size + (filled_kv_size + insert_size) / 2; + assert(insert_size < (node_stage.total_size() - empty_size) / 2); + + std::optional _is_insert_left; + split_at.set(node_stage); + split_size = 0; + bool locate_nxt = STAGE_T::recursively_locate_split_inserted( + split_size, 0, target_split_size, insert_pos, + insert_stage, insert_size, _is_insert_left, split_at); + is_insert_left = *_is_insert_left; + logger().debug("OTree::Layout::Split: -- located " + "split_at({}), insert_pos({}), is_insert_left={}, " + "split_size={}B(target={}B, current={}B)", + split_at, insert_pos, is_insert_left, + split_size, target_split_size, filled_size()); + // split_size can be larger than target_split_size in strategy B + // assert(split_size <= target_split_size); + if (locate_nxt) { + assert(insert_stage == STAGE); + assert(split_at.get().is_last()); + split_at.set_end(); + assert(insert_pos.index == split_at.index()); + } + } + + auto append_at = split_at; + // TODO(cross-node string dedup) + typename STAGE_T::template StagedAppender right_appender; + right_appender.init(&right_mut, right_mut.get_write()); + const value_t* p_value = nullptr; + if (!is_insert_left) { + // right
node: append [start(append_at), insert_pos) + STAGE_T::template append_until( + append_at, right_appender, insert_pos, insert_stage); + logger().debug("OTree::Layout::Split: -- right appended until " + "insert_pos({}), insert_stage={}, insert/append the rest ...", + insert_pos, insert_stage); + // right node: append [insert_pos(key, value)] + bool is_front_insert = (insert_pos == position_t::begin()); + [[maybe_unused]] bool is_end = STAGE_T::template append_insert( + key, value, append_at, right_appender, + is_front_insert, insert_stage, p_value); + assert(append_at.is_end() == is_end); + } else { + logger().debug("OTree::Layout::Split: -- right appending ..."); + } + + // right node: append (insert_pos, end) + auto pos_end = position_t::end(); + STAGE_T::template append_until( + append_at, right_appender, pos_end, STAGE); + assert(append_at.is_end()); + right_appender.wrap(); + if (unlikely(logger().is_enabled(seastar::log_level::debug))) { + std::ostringstream sos; + right_impl.dump(sos); + logger().debug("OTree::Layout::Split: -- right node dump\n{}", sos.str()); + } + right_impl.validate_layout(); + + // mutate left node + if (is_insert_left) { + logger().debug("OTree::Layout::Split: -- left trim/insert at " + "insert_pos({}), insert_stage={} ...", + insert_pos, insert_stage); + p_value = extent.template split_insert_replayable( + split_at, key, value, insert_pos, insert_stage, insert_size); + assert(get_key_view(_insert_pos) == key); + } else { + logger().debug("OTree::Layout::Split: -- left trim ..."); + assert(right_impl.get_key_view(_insert_pos) == key); + extent.split_replayable(split_at); + } + if (unlikely(logger().is_enabled(seastar::log_level::debug))) { + std::ostringstream sos; + dump(sos); + logger().debug("OTree::Layout::Split: -- left node dump\n{}", sos.str()); + } + validate_layout(); + assert(p_value); + + auto split_pos = normalize(split_at.get_pos()); + logger().info("OTree::Layout::Split: done at " + "insert_pos({}), insert_stage={}, 
insert_size={}B, split_at({}), " + "is_insert_left={}, split_size={}B(target={}B)", + _insert_pos, insert_stage, insert_size, split_pos, + is_insert_left, split_size, target_split_size); + assert(split_size == filled_size()); + +#ifdef UNIT_TESTS_BUILT + InsertType insert_type; + search_position_t last_pos; + if (is_insert_left) { + STAGE_T::template lookup_largest_slot( + extent.read(), &cast_down_fill_0(last_pos), nullptr, nullptr); + } else { + node_stage_t right_stage{reinterpret_cast(right_mut.get_write())}; + STAGE_T::template lookup_largest_slot( + right_stage, &cast_down_fill_0(last_pos), nullptr, nullptr); + } + if (_insert_pos == search_position_t::begin()) { + insert_type = InsertType::BEGIN; + } else if (_insert_pos == last_pos) { + insert_type = InsertType::LAST; + } else { + insert_type = InsertType::MID; + } + last_split = {split_pos, insert_stage_pre, is_insert_left, insert_type}; +#endif + return {split_pos, is_insert_left, p_value}; + } + + /* + * InternalNodeImpl + */ + void replace_child_addr( + const search_position_t& pos, laddr_t dst, laddr_t src) override { + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + const laddr_packed_t* p_value = get_p_value(pos); + assert(p_value->value == src); + extent.update_child_addr_replayable(dst, const_cast(p_value)); + } else { + ceph_abort("impossible path"); + } + } + + std::tuple evaluate_insert( + const key_view_t& key, const laddr_t& value, + search_position_t& insert_pos) const override { + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + auto packed_value = laddr_packed_t{value}; + auto& node_stage = extent.read(); + match_stage_t insert_stage; + node_offset_t insert_size; + if (unlikely(!node_stage.keys())) { + assert(insert_pos.is_end()); + insert_stage = STAGE; + insert_size = STAGE_T::template insert_size(key, packed_value); + } else { + std::tie(insert_stage, insert_size) = STAGE_T::evaluate_insert( + node_stage, key, packed_value, cast_down(insert_pos), false); + } + return 
{insert_stage, insert_size}; + } else { + ceph_abort("impossible path"); + } + } + + /* + * LeafNodeImpl + */ + void get_largest_slot(search_position_t& pos, + key_view_t& index_key, const onode_t** pp_value) const override { + if constexpr (NODE_TYPE == node_type_t::LEAF) { + STAGE_T::template lookup_largest_slot( + extent.read(), &cast_down_fill_0(pos), &index_key, pp_value); + } else { + ceph_abort("impossible path"); + } + } + + std::tuple evaluate_insert( + const key_hobj_t& key, const onode_t& value, + const MatchHistory& history, match_stat_t mstat, + search_position_t& insert_pos) const override { + if constexpr (NODE_TYPE == node_type_t::LEAF) { + if (unlikely(is_empty())) { + assert(insert_pos.is_end()); + return {STAGE, STAGE_T::template insert_size(key, value)}; + } else { + return STAGE_T::evaluate_insert( + key, value, history, mstat, cast_down(insert_pos)); + } + } else { + ceph_abort("impossible path"); + } + } + + private: + NodeLayoutT(NodeExtentRef extent) : extent{extent} {} + + node_offset_t filled_size() const { + auto& node_stage = extent.read(); + auto ret = node_stage.size_before(node_stage.keys()); + assert(ret == node_stage.total_size() - node_stage.free_size()); + return ret; + } + + static seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } + + extent_t extent; +}; + +using InternalNode0 = NodeLayoutT; +using InternalNode1 = NodeLayoutT; +using InternalNode2 = NodeLayoutT; +using InternalNode3 = NodeLayoutT; +using LeafNode0 = NodeLayoutT; +using LeafNode1 = NodeLayoutT; +using LeafNode2 = NodeLayoutT; +using LeafNode3 = NodeLayoutT; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout_replayable.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout_replayable.h new file mode 100644 index 000000000..c1499d609 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout_replayable.h @@ -0,0 +1,75 @@ +// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "node_extent_mutable.h" +#include "stages/node_stage.h" +#include "stages/stage.h" + +#define STAGE_T node_to_stage_t + +namespace crimson::os::seastore::onode { + +/** + * NodeLayoutReplayableT + * + * Contains templated logics to modify the layout of a NodeExtent which are + * also replayable. Used by NodeExtentAccessorT at runtime and by + * DeltaRecorderT during replay. + */ +template +struct NodeLayoutReplayableT { + using node_stage_t = node_extent_t; + using position_t = typename STAGE_T::position_t; + using StagedIterator = typename STAGE_T::StagedIterator; + using value_t = value_type_t; + static constexpr auto FIELD_TYPE = FieldType::FIELD_TYPE; + + template + static const value_t* insert( + NodeExtentMutable& mut, + const node_stage_t& node_stage, + const full_key_t& key, + const value_t& value, + position_t& insert_pos, + match_stage_t& insert_stage, + node_offset_t& insert_size) { + auto p_value = STAGE_T::template proceed_insert( + mut, node_stage, key, value, insert_pos, insert_stage, insert_size); + return p_value; + } + + static void split( + NodeExtentMutable& mut, + const node_stage_t& node_stage, + StagedIterator& split_at) { + node_stage_t::update_is_level_tail(mut, node_stage, false); + STAGE_T::trim(mut, split_at); + } + + template + static const value_t* split_insert( + NodeExtentMutable& mut, + const node_stage_t& node_stage, + StagedIterator& split_at, + const full_key_t& key, + const value_t& value, + position_t& insert_pos, + match_stage_t& insert_stage, + node_offset_t& insert_size) { + node_stage_t::update_is_level_tail(mut, node_stage, false); + STAGE_T::trim(mut, split_at); + auto p_value = STAGE_T::template proceed_insert( + mut, node_stage, key, value, insert_pos, insert_stage, insert_size); + return p_value; + } + + static void update_child_addr( + NodeExtentMutable& mut, const laddr_t new_addr, laddr_packed_t* p_addr) { +
assert(NODE_TYPE == node_type_t::INTERNAL); + mut.copy_in_absolute(p_addr, new_addr); + } +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h new file mode 100644 index 000000000..6774544c7 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h @@ -0,0 +1,64 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include +#include + +#include "fwd.h" + +namespace crimson::os::seastore::onode { + +constexpr uint8_t FIELD_TYPE_MAGIC = 0x25; +enum class field_type_t : uint8_t { + N0 = FIELD_TYPE_MAGIC, + N1, + N2, + N3, + _MAX +}; +inline uint8_t to_unsigned(field_type_t type) { + auto value = static_cast(type); + assert(value >= FIELD_TYPE_MAGIC); + assert(value < static_cast(field_type_t::_MAX)); + return value - FIELD_TYPE_MAGIC; +} +inline std::ostream& operator<<(std::ostream &os, field_type_t type) { + const char* const names[] = {"0", "1", "2", "3"}; + auto index = to_unsigned(type); + os << names[index]; + return os; +} + +enum class node_type_t : uint8_t { + LEAF = 0, + INTERNAL +}; +inline std::ostream& operator<<(std::ostream &os, const node_type_t& type) { + const char* const names[] = {"L", "I"}; + auto index = static_cast(type); + assert(index <= 1u); + os << names[index]; + return os; +} + +struct laddr_packed_t { + laddr_t value; +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const laddr_packed_t& laddr) { + return os << "laddr_packed(0x" << std::hex << laddr.value << std::dec << ")"; +} + +using match_stat_t = int8_t; +constexpr match_stat_t MSTAT_END = -2; // index is search_position_t::end() +constexpr match_stat_t MSTAT_EQ = -1; // key == index +constexpr match_stat_t MSTAT_LT0 = 0; // key == index [pool/shard crush ns/oid]; key < index [snap/gen] +constexpr match_stat_t MSTAT_LT1 = 1; // key == index [pool/shard 
crush]; key < index [ns/oid] +constexpr match_stat_t MSTAT_LT2 = 2; // key < index [pool/shard crush ns/oid] || + // key == index [pool/shard]; key < index [crush] +constexpr match_stat_t MSTAT_LT3 = 3; // key < index [pool/shard] +constexpr match_stat_t MSTAT_MIN = MSTAT_END; +constexpr match_stat_t MSTAT_MAX = MSTAT_LT3; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc new file mode 100644 index 000000000..443c6cabd --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc @@ -0,0 +1,165 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "item_iterator_stage.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h" + +namespace crimson::os::seastore::onode { + +#define ITER_T item_iterator_t +#define ITER_INST(NT) item_iterator_t + +template +template +memory_range_t ITER_T::insert_prefix( + NodeExtentMutable& mut, const ITER_T& iter, const full_key_t& key, + bool is_end, node_offset_t size, const char* p_left_bound) { + // 1. insert range + char* p_insert; + if (is_end) { + assert(!iter.has_next()); + p_insert = const_cast(iter.p_start()); + } else { + p_insert = const_cast(iter.p_end()); + } + char* p_insert_front = p_insert - size; + + // 2. shift memory + const char* p_shift_start = p_left_bound; + const char* p_shift_end = p_insert; + mut.shift_absolute(p_shift_start, + p_shift_end - p_shift_start, + -(int)size); + + // 3. 
append header + p_insert -= sizeof(node_offset_t); + node_offset_t back_offset = (p_insert - p_insert_front); + mut.copy_in_absolute(p_insert, back_offset); + ns_oid_view_t::append(mut, key, p_insert); + + return {p_insert_front, p_insert}; +} +#define IP_TEMPLATE(NT, KT) \ + template memory_range_t ITER_INST(NT)::insert_prefix( \ + NodeExtentMutable&, const ITER_INST(NT)&, const full_key_t&, \ + bool, node_offset_t, const char*) +IP_TEMPLATE(node_type_t::LEAF, KeyT::VIEW); +IP_TEMPLATE(node_type_t::INTERNAL, KeyT::VIEW); +IP_TEMPLATE(node_type_t::LEAF, KeyT::HOBJ); +IP_TEMPLATE(node_type_t::INTERNAL, KeyT::HOBJ); + +template +void ITER_T::update_size( + NodeExtentMutable& mut, const ITER_T& iter, int change) { + node_offset_t offset = iter.get_back_offset(); + int new_size = change + offset; + assert(new_size > 0 && new_size < NODE_BLOCK_SIZE); + mut.copy_in_absolute( + (void*)iter.get_item_range().p_end, node_offset_t(new_size)); +} + +template +node_offset_t ITER_T::trim_until(NodeExtentMutable&, const ITER_T& iter) { + assert(iter.index() != 0); + size_t ret = iter.p_end() - iter.p_items_start; + assert(ret < NODE_BLOCK_SIZE); + return ret; +} + +template +node_offset_t ITER_T::trim_at( + NodeExtentMutable& mut, const ITER_T& iter, node_offset_t trimmed) { + size_t trim_size = iter.p_start() - iter.p_items_start + trimmed; + assert(trim_size < NODE_BLOCK_SIZE); + assert(iter.get_back_offset() > trimmed); + node_offset_t new_offset = iter.get_back_offset() - trimmed; + mut.copy_in_absolute((void*)iter.item_range.p_end, new_offset); + return trim_size; +} + +#define ITER_TEMPLATE(NT) template class ITER_INST(NT) +ITER_TEMPLATE(node_type_t::LEAF); +ITER_TEMPLATE(node_type_t::INTERNAL); + +#define APPEND_T ITER_T::Appender + +template +template +bool APPEND_T::append(const ITER_T& src, index_t& items) { + auto p_end = src.p_end(); + bool append_till_end = false; + if (is_valid_index(items)) { + for (auto i = 1u; i <= items; ++i) { + if (!src.has_next()) { + 
assert(i == items); + append_till_end = true; + break; + } + ++src; + } + } else { + if (items == INDEX_END) { + append_till_end = true; + } else { + assert(items == INDEX_LAST); + } + items = 0; + while (src.has_next()) { + ++src; + ++items; + } + if (append_till_end) { + ++items; + } + } + + const char* p_start; + if (append_till_end) { + p_start = src.p_start(); + } else { + p_start = src.p_end(); + } + assert(p_end >= p_start); + size_t append_size = p_end - p_start; + p_append -= append_size; + p_mut->copy_in_absolute(p_append, p_start, append_size); + return append_till_end; +} + +template +template +std::tuple +APPEND_T::open_nxt(const key_get_type& partial_key) { + p_append -= sizeof(node_offset_t); + p_offset_while_open = p_append; + ns_oid_view_t::append(*p_mut, partial_key, p_append); + return {p_mut, p_append}; +} + +template +template +std::tuple +APPEND_T::open_nxt(const full_key_t& key) { + p_append -= sizeof(node_offset_t); + p_offset_while_open = p_append; + ns_oid_view_t::append(*p_mut, key, p_append); + return {p_mut, p_append}; +} + +template +template +void APPEND_T::wrap_nxt(char* _p_append) { + assert(_p_append < p_append); + p_mut->copy_in_absolute( + p_offset_while_open, node_offset_t(p_offset_while_open - _p_append)); + p_append = _p_append; +} + +#define APPEND_TEMPLATE(NT, KT) template class ITER_INST(NT)::Appender +APPEND_TEMPLATE(node_type_t::LEAF, KeyT::VIEW); +APPEND_TEMPLATE(node_type_t::INTERNAL, KeyT::VIEW); +APPEND_TEMPLATE(node_type_t::LEAF, KeyT::HOBJ); +APPEND_TEMPLATE(node_type_t::INTERNAL, KeyT::HOBJ); + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h new file mode 100644 index 000000000..bb68eec8f --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h @@ -0,0 +1,180 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// 
vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h" +#include "key_layout.h" +#include "stage_types.h" + +namespace crimson::os::seastore::onode { + +class NodeExtentMutable; + +/** + * item_iterator_t + * + * The STAGE_STRING implementation for node N0/N1, implements staged contract + * as an iterative container to resolve crush hash conflicts. + * + * The layout of the container to index ns, oid strings storing n items: + * + * # <--------- container range ---------> # + * #<~># items [i+1, n) # + * # # items [0, i) #<~># + * # # <------ item i -------------> # # + * # # <--- item_range ---> | # # + * # # | # # + * # # next-stage | ns-oid | back_ # # + * # # container | strings | offset # # + * #...# range | | #...# + * ^ ^ | ^ + * | | | | + * | +---------------------------+ | + * + p_items_start p_items_end + + */ +template +class item_iterator_t { + using value_t = value_type_t; + public: + item_iterator_t(const memory_range_t& range) + : p_items_start(range.p_start), p_items_end(range.p_end) { + assert(p_items_start < p_items_end); + next_item_range(p_items_end); + } + + const char* p_start() const { return item_range.p_start; } + const char* p_end() const { return item_range.p_end + sizeof(node_offset_t); } + const memory_range_t& get_item_range() const { return item_range; } + node_offset_t get_back_offset() const { return back_offset; } + + // container type system + using key_get_type = const ns_oid_view_t&; + static constexpr auto CONTAINER_TYPE = ContainerType::ITERATIVE; + index_t index() const { return _index; } + key_get_type get_key() const { + if (!key.has_value()) { + key = ns_oid_view_t(item_range.p_end); + assert(item_range.p_start < (*key).p_start()); + } + return *key; + } + node_offset_t size() const { + size_t ret = item_range.p_end - item_range.p_start + sizeof(node_offset_t); + assert(ret < NODE_BLOCK_SIZE); + return ret; + }; + node_offset_t size_to_nxt() const { + size_t ret =
get_key().size() + sizeof(node_offset_t); + assert(ret < NODE_BLOCK_SIZE); + return ret; + } + node_offset_t size_overhead() const { + return sizeof(node_offset_t) + get_key().size_overhead(); + } + memory_range_t get_nxt_container() const { + return {item_range.p_start, get_key().p_start()}; + } + bool has_next() const { + assert(p_items_start <= item_range.p_start); + return p_items_start < item_range.p_start; + } + const item_iterator_t& operator++() const { + assert(has_next()); + next_item_range(item_range.p_start); + key.reset(); + ++_index; + return *this; + } + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + int start_offset = p_items_start - p_node_start; + int end_offset = p_items_end - p_node_start; + assert(start_offset > 0 && start_offset < NODE_BLOCK_SIZE); + assert(end_offset > 0 && end_offset <= NODE_BLOCK_SIZE); + ceph::encode(static_cast(start_offset), encoded); + ceph::encode(static_cast(end_offset), encoded); + ceph::encode(_index, encoded); + } + + static item_iterator_t decode(const char* p_node_start, + ceph::bufferlist::const_iterator& delta) { + node_offset_t start_offset; + ceph::decode(start_offset, delta); + node_offset_t end_offset; + ceph::decode(end_offset, delta); + assert(start_offset < end_offset); + assert(end_offset <= NODE_BLOCK_SIZE); + index_t index; + ceph::decode(index, delta); + + item_iterator_t ret({p_node_start + start_offset, + p_node_start + end_offset}); + while (index > 0) { + ++ret; + --index; + } + return ret; + } + + static node_offset_t header_size() { return 0u; } + + template + static node_offset_t estimate_insert( + const full_key_t& key, const value_t&) { + return ns_oid_view_t::estimate_size(key) + sizeof(node_offset_t); + } + + template + static memory_range_t insert_prefix( + NodeExtentMutable& mut, const item_iterator_t& iter, + const full_key_t& key, bool is_end, + node_offset_t size, const char* p_left_bound); + + static void update_size( + NodeExtentMutable& mut, const 
item_iterator_t& iter, int change); + + static node_offset_t trim_until(NodeExtentMutable&, const item_iterator_t&); + static node_offset_t trim_at( + NodeExtentMutable&, const item_iterator_t&, node_offset_t trimmed); + + template + class Appender; + + private: + void next_item_range(const char* p_end) const { + auto p_item_end = p_end - sizeof(node_offset_t); + assert(p_items_start < p_item_end); + back_offset = reinterpret_cast(p_item_end)->value; + assert(back_offset); + const char* p_item_start = p_item_end - back_offset; + assert(p_items_start <= p_item_start); + item_range = {p_item_start, p_item_end}; + } + + const char* p_items_start; + const char* p_items_end; + mutable memory_range_t item_range; + mutable node_offset_t back_offset; + mutable std::optional key; + mutable index_t _index = 0u; +}; + +template +template +class item_iterator_t::Appender { + public: + Appender(NodeExtentMutable* p_mut, char* p_append) + : p_mut{p_mut}, p_append{p_append} {} + bool append(const item_iterator_t& src, index_t& items); + char* wrap() { return p_append; } + std::tuple open_nxt(const key_get_type&); + std::tuple open_nxt(const full_key_t&); + void wrap_nxt(char* _p_append); + + private: + NodeExtentMutable* p_mut; + char* p_append; + char* p_offset_while_open; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc new file mode 100644 index 000000000..d60bb8d09 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc @@ -0,0 +1,32 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "key_layout.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h" + +namespace crimson::os::seastore::onode { + +void string_key_view_t::append_str( + NodeExtentMutable& mut, std::string_view str, char*& p_append) { + 
assert(is_valid_size(str.length())); + p_append -= sizeof(string_size_t); + string_size_t len = str.length(); + mut.copy_in_absolute(p_append, len); + p_append -= len; + mut.copy_in_absolute(p_append, str.data(), len); +} + +void string_key_view_t::append_dedup( + NodeExtentMutable& mut, const Type& dedup_type, char*& p_append) { + p_append -= sizeof(string_size_t); + if (dedup_type == Type::MIN) { + mut.copy_in_absolute(p_append, MIN); + } else if (dedup_type == Type::MAX) { + mut.copy_in_absolute(p_append, MAX); + } else { + ceph_abort("impossible path"); + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h new file mode 100644 index 000000000..cc1f546c1 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h @@ -0,0 +1,846 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include +#include +#include +#include + +#include "common/hobject.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/fwd.h" + +namespace crimson::os::seastore::onode { + +using shard_t = int8_t; +using pool_t = int64_t; +using crush_hash_t = uint32_t; +using snap_t = uint64_t; +using gen_t = uint64_t; +static_assert(sizeof(shard_t) == sizeof(ghobject_t().shard_id.id)); +static_assert(sizeof(pool_t) == sizeof(ghobject_t().hobj.pool)); +static_assert(sizeof(crush_hash_t) == sizeof(ghobject_t().hobj.get_hash())); +static_assert(sizeof(snap_t) == sizeof(ghobject_t().hobj.snap.val)); +static_assert(sizeof(gen_t) == sizeof(ghobject_t().generation)); + +class NodeExtentMutable; +class key_view_t; +class key_hobj_t; +enum class KeyT { VIEW, HOBJ }; +template struct _full_key_type; +template<> struct _full_key_type { using type = key_view_t; }; +template<> struct _full_key_type { using type = key_hobj_t; }; +template +using full_key_t = typename _full_key_type::type; 
+ +struct node_offset_packed_t { + node_offset_t value; +} __attribute__((packed)); + +// TODO: consider alignments +struct shard_pool_t { + bool operator==(const shard_pool_t& x) const { + return (shard == x.shard && pool == x.pool); + } + bool operator!=(const shard_pool_t& x) const { return !(*this == x); } + + template + static shard_pool_t from_key(const full_key_t& key); + + shard_t shard; + pool_t pool; +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const shard_pool_t& sp) { + return os << (int)sp.shard << "," << sp.pool; // shard_t is a signed int8_t: widen to int so a negative shard prints as e.g. -1 (not 4294967295) and not as a raw char +} +inline MatchKindCMP compare_to(const shard_pool_t& l, const shard_pool_t& r) { + auto ret = toMatchKindCMP(l.shard, r.shard); + if (ret != MatchKindCMP::EQ) + return ret; + return toMatchKindCMP(l.pool, r.pool); +} + +struct crush_t { + bool operator==(const crush_t& x) const { return crush == x.crush; } + bool operator!=(const crush_t& x) const { return !(*this == x); } + + template + static crush_t from_key(const full_key_t& key); + + crush_hash_t crush; +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const crush_t& c) { + return os << c.crush; +} +inline MatchKindCMP compare_to(const crush_t& l, const crush_t& r) { + return toMatchKindCMP(l.crush, r.crush); +} + +struct shard_pool_crush_t { + bool operator==(const shard_pool_crush_t& x) const { + return (shard_pool == x.shard_pool && crush == x.crush); + } + bool operator!=(const shard_pool_crush_t& x) const { return !(*this == x); } + + template + static shard_pool_crush_t from_key(const full_key_t& key); + + shard_pool_t shard_pool; + crush_t crush; +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const shard_pool_crush_t& spc) { + return os << spc.shard_pool << "," << spc.crush; +} +inline MatchKindCMP compare_to( + const shard_pool_crush_t& l, const shard_pool_crush_t& r) { + auto ret = compare_to(l.shard_pool, r.shard_pool); + if (ret != MatchKindCMP::EQ) + return ret; + return
compare_to(l.crush, r.crush); +} + +struct snap_gen_t { + bool operator==(const snap_gen_t& x) const { + return (snap == x.snap && gen == x.gen); + } + bool operator!=(const snap_gen_t& x) const { return !(*this == x); } + + template + static snap_gen_t from_key(const full_key_t& key); + + snap_t snap; + gen_t gen; +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const snap_gen_t& sg) { + return os << sg.snap << "," << sg.gen; +} +inline MatchKindCMP compare_to(const snap_gen_t& l, const snap_gen_t& r) { + auto ret = toMatchKindCMP(l.snap, r.snap); + if (ret != MatchKindCMP::EQ) + return ret; + return toMatchKindCMP(l.gen, r.gen); +} + +/** + * string_key_view_t + * + * The layout to store char array as an oid or an ns string which may be + * compressed. + * + * If compressed, the physical block only stores an unsigned int of + * string_size_t, with value 0 denoting Type::MIN, and value max() denoting + * Type::MAX. + * + * If not compressed (Type::STR), the physical block stores the char array and + * a valid string_size_t value. 
+ */ +struct string_key_view_t { + enum class Type {MIN, STR, MAX}; + // presumably the maximum string length is 2KiB + using string_size_t = uint16_t; + static constexpr auto MAX = std::numeric_limits::max(); + static constexpr auto MIN = string_size_t(0u); + static auto is_valid_size(size_t size) { + return (size > MIN && size < MAX); + } + + string_key_view_t(const char* p_end) { + p_length = p_end - sizeof(string_size_t); + std::memcpy(&length, p_length, sizeof(string_size_t)); + if (is_valid_size(length)) { + auto _p_key = p_length - length; + p_key = static_cast(_p_key); + } else { + assert(length == MAX || length == MIN); + p_key = nullptr; + } + } + Type type() const { + if (length == MIN) { + return Type::MIN; + } else if (length == MAX) { + return Type::MAX; + } else { + assert(is_valid_size(length)); + return Type::STR; + } + } + const char* p_start() const { + if (p_key) { + return p_key; + } else { + return p_length; + } + } + const char* p_next_end() const { + if (p_key) { + return p_start(); + } else { + return p_length + sizeof(string_size_t); + } + } + node_offset_t size() const { + size_t ret = length + sizeof(string_size_t); + assert(ret < NODE_BLOCK_SIZE); + return ret; + } + node_offset_t size_logical() const { + assert(type() == Type::STR); + assert(is_valid_size(length)); + return length; + } + node_offset_t size_overhead() const { + assert(type() == Type::STR); + return sizeof(string_size_t); + } + + std::string_view to_string_view() const { + assert(type() == Type::STR); + assert(is_valid_size(length)); + return {p_key, length}; + } + bool operator==(const string_key_view_t& x) const { + if (type() == x.type() && type() != Type::STR) + return true; + if (type() != x.type()) + return false; + if (length != x.length) + return false; + return (memcmp(p_key, x.p_key, length) == 0); + } + bool operator!=(const string_key_view_t& x) const { return !(*this == x); } + + static void append_str( + NodeExtentMutable&, std::string_view, char*& 
p_append); + + static void test_append_str(std::string_view str, char*& p_append) { + assert(is_valid_size(str.length())); + p_append -= sizeof(string_size_t); + string_size_t len = str.length(); + std::memcpy(p_append, &len, sizeof(string_size_t)); + p_append -= len; + std::memcpy(p_append, str.data(), len); + } + + static void append_dedup( + NodeExtentMutable&, const Type& dedup_type, char*& p_append); + + static void test_append_dedup(const Type& dedup_type, char*& p_append) { + p_append -= sizeof(string_size_t); + string_size_t len; + if (dedup_type == Type::MIN) { + len = MIN; + } else if (dedup_type == Type::MAX) { + len = MAX; + } else { + ceph_abort("impossible path"); + } + std::memcpy(p_append, &len, sizeof(string_size_t)); + } + + const char* p_key; + const char* p_length; + // TODO: remove if p_length is aligned + string_size_t length; +}; + +/** + * string_view_masked_t + * + * A common class to hide the underlying string implementation regardless of a + * string_key_view_t (maybe compressed), a string/string_view, or a compressed + * string. And leverage this consistent class to do compare, print, convert and + * append operations.
+ */ +class string_view_masked_t { + public: + using string_size_t = string_key_view_t::string_size_t; + using Type = string_key_view_t::Type; + explicit string_view_masked_t(const string_key_view_t& index) + : type{index.type()} { + if (type == Type::STR) { + view = index.to_string_view(); + } + } + explicit string_view_masked_t(std::string_view str) + : type{Type::STR}, view{str} { + assert(string_key_view_t::is_valid_size(view.size())); + } + + Type get_type() const { return type; } + std::string_view to_string_view() const { + assert(get_type() == Type::STR); + return view; + } + string_size_t size() const { + assert(get_type() == Type::STR); + assert(string_key_view_t::is_valid_size(view.size())); + return view.size(); + } + bool operator==(const string_view_masked_t& x) const { + if (get_type() == x.get_type() && get_type() != Type::STR) + return true; + if (get_type() != x.get_type()) + return false; + if (size() != x.size()) + return false; + return (memcmp(view.data(), x.view.data(), size()) == 0); + } + bool operator!=(const string_view_masked_t& x) const { return !(*this == x); } + void encode(ceph::bufferlist& bl) const { + if (get_type() == Type::MIN) { + ceph::encode(string_key_view_t::MIN, bl); + } else if (get_type() == Type::MAX) { + ceph::encode(string_key_view_t::MAX, bl); + } else { + ceph::encode(size(), bl); + ceph::encode_nohead(view, bl); + } + } + static auto min() { return string_view_masked_t{Type::MIN}; } + static auto max() { return string_view_masked_t{Type::MAX}; } + static string_view_masked_t decode( + std::string& str_storage, ceph::bufferlist::const_iterator& delta) { + string_size_t size; + ceph::decode(size, delta); + if (size == string_key_view_t::MIN) { + return min(); + } else if (size == string_key_view_t::MAX) { + return max(); + } else { + ceph::decode_nohead(size, str_storage, delta); + return string_view_masked_t(str_storage); + } + } + + private: + explicit string_view_masked_t(Type type) + : type{type} {} + + Type 
type; + std::string_view view; +}; +inline MatchKindCMP compare_to(const string_view_masked_t& l, const string_view_masked_t& r) { + using Type = string_view_masked_t::Type; + auto l_type = l.get_type(); + auto r_type = r.get_type(); + if (l_type == Type::STR && r_type == Type::STR) { + assert(l.size() && r.size()); + return toMatchKindCMP(l.to_string_view(), r.to_string_view()); + } else if (l_type == r_type) { + return MatchKindCMP::EQ; + } else if (l_type == Type::MIN || r_type == Type::MAX) { + return MatchKindCMP::LT; + } else { // l_type == Type::MAX || r_type == Type::MIN + return MatchKindCMP::GT; + } +} +inline MatchKindCMP compare_to(std::string_view l, const string_view_masked_t& r) { + using Type = string_view_masked_t::Type; + assert(l.length()); + auto r_type = r.get_type(); + if (r_type == Type::MIN) { + return MatchKindCMP::GT; + } else if (r_type == Type::MAX) { + return MatchKindCMP::LT; + } else { // r_type == Type::STR + assert(r.size()); + return toMatchKindCMP(l, r.to_string_view()); + } +} +inline MatchKindCMP compare_to(const string_view_masked_t& l, std::string_view r) { + return reverse(compare_to(r, l)); +} +inline std::ostream& operator<<(std::ostream& os, const string_view_masked_t& masked) { + using Type = string_view_masked_t::Type; + auto type = masked.get_type(); + if (type == Type::MIN) { + return os << "MIN"; + } else if (type == Type::MAX) { + return os << "MAX"; + } else { // type == Type::STR + auto view = masked.to_string_view(); + if (view.length() <= 12) { + os << "\"" << view << "\""; + } else { + os << "\"" << std::string_view(view.data(), 4) << ".." 
+ << std::string_view(view.data() + view.length() - 2, 2) + << "/" << view.length() << "B\""; + } + return os; + } +} + +struct ns_oid_view_t { + using string_size_t = string_key_view_t::string_size_t; + using Type = string_key_view_t::Type; + + ns_oid_view_t(const char* p_end) : nspace(p_end), oid(nspace.p_next_end()) {} + Type type() const { return oid.type(); } + const char* p_start() const { return oid.p_start(); } + node_offset_t size() const { + if (type() == Type::STR) { + size_t ret = nspace.size() + oid.size(); + assert(ret < NODE_BLOCK_SIZE); + return ret; + } else { + return sizeof(string_size_t); + } + } + node_offset_t size_logical() const { + assert(type() == Type::STR); + return nspace.size_logical() + oid.size_logical(); + } + node_offset_t size_overhead() const { + assert(type() == Type::STR); + return nspace.size_overhead() + oid.size_overhead(); + } + bool operator==(const ns_oid_view_t& x) const { + return (string_view_masked_t{nspace} == string_view_masked_t{x.nspace} && + string_view_masked_t{oid} == string_view_masked_t{x.oid}); + } + bool operator!=(const ns_oid_view_t& x) const { return !(*this == x); } + + template + static node_offset_t estimate_size(const full_key_t& key); + + template + static void append(NodeExtentMutable&, + const full_key_t& key, + char*& p_append); + + static void append(NodeExtentMutable& mut, + const ns_oid_view_t& view, + char*& p_append) { + if (view.type() == Type::STR) { + string_key_view_t::append_str(mut, view.nspace.to_string_view(), p_append); + string_key_view_t::append_str(mut, view.oid.to_string_view(), p_append); + } else { + string_key_view_t::append_dedup(mut, view.type(), p_append); + } + } + + template + static void test_append(const full_key_t& key, char*& p_append); + + string_key_view_t nspace; + string_key_view_t oid; +}; +inline std::ostream& operator<<(std::ostream& os, const ns_oid_view_t& ns_oid) { + return os << string_view_masked_t{ns_oid.nspace} << "," + << 
string_view_masked_t{ns_oid.oid}; +} +inline MatchKindCMP compare_to(const ns_oid_view_t& l, const ns_oid_view_t& r) { + auto ret = compare_to(string_view_masked_t{l.nspace}, + string_view_masked_t{r.nspace}); + if (ret != MatchKindCMP::EQ) + return ret; + return compare_to(string_view_masked_t{l.oid}, + string_view_masked_t{r.oid}); +} + +/** + * key_hobj_t + * + * A specialized implementation of a full_key_t storing a ghobject_t passed + * from user. + */ +class key_hobj_t { + public: + explicit key_hobj_t(const ghobject_t& ghobj) : ghobj{ghobj} {} + /* + * common interfaces as a full_key_t + */ + shard_t shard() const { + return ghobj.shard_id; + } + pool_t pool() const { + return ghobj.hobj.pool; + } + crush_hash_t crush() const { + return ghobj.hobj.get_hash(); + } + std::string_view nspace() const { + // TODO(cross-node string dedup) + return ghobj.hobj.nspace; + } + string_view_masked_t nspace_masked() const { + // TODO(cross-node string dedup) + return string_view_masked_t{nspace()}; + } + std::string_view oid() const { + // TODO(cross-node string dedup) + return ghobj.hobj.oid.name; + } + string_view_masked_t oid_masked() const { + // TODO(cross-node string dedup) + return string_view_masked_t{oid()}; + } + ns_oid_view_t::Type dedup_type() const { + return _dedup_type; + } + snap_t snap() const { + return ghobj.hobj.snap; + } + gen_t gen() const { + return ghobj.generation; + } + + bool operator==(const full_key_t& o) const; + bool operator==(const full_key_t& o) const; + bool operator!=(const full_key_t& o) const { + return !operator==(o); + } + bool operator!=(const full_key_t& o) const { + return !operator==(o); + } + + std::ostream& dump(std::ostream& os) const { + os << "key_hobj(" << (unsigned)shard() << "," + << pool() << "," << crush() << "; " + << string_view_masked_t{nspace()} << "," + << string_view_masked_t{oid()} << "; " + << snap() << "," << gen() << ")"; + return os; + } + + static key_hobj_t decode(ceph::bufferlist::const_iterator& delta) { 
+ shard_t shard; + ceph::decode(shard, delta); + pool_t pool; + ceph::decode(pool, delta); + crush_hash_t crush; + ceph::decode(crush, delta); + std::string nspace; + auto nspace_masked = string_view_masked_t::decode(nspace, delta); + // TODO(cross-node string dedup) + assert(nspace_masked.get_type() == string_view_masked_t::Type::STR); + std::string oid; + auto oid_masked = string_view_masked_t::decode(oid, delta); + // TODO(cross-node string dedup) + assert(oid_masked.get_type() == string_view_masked_t::Type::STR); + snap_t snap; + ceph::decode(snap, delta); + gen_t gen; + ceph::decode(gen, delta); + return key_hobj_t(ghobject_t( + shard_id_t(shard), pool, crush, nspace, oid, snap, gen)); + } + + private: + ns_oid_view_t::Type _dedup_type = ns_oid_view_t::Type::STR; + ghobject_t ghobj; +}; +inline std::ostream& operator<<(std::ostream& os, const key_hobj_t& key) { + return key.dump(os); +} + +/** + * key_view_t + * + * A specialized implementation of a full_key_t pointing to the locations + * storing the full key in a tree node. 
+ */ +class key_view_t { + public: + /** + * common interfaces as a full_key_t + */ + shard_t shard() const { + return shard_pool_packed().shard; + } + pool_t pool() const { + return shard_pool_packed().pool; + } + crush_hash_t crush() const { + return crush_packed().crush; + } + std::string_view nspace() const { + // TODO(cross-node string dedup) + return ns_oid_view().nspace.to_string_view(); + } + string_view_masked_t nspace_masked() const { + // TODO(cross-node string dedup) + return string_view_masked_t{ns_oid_view().nspace}; + } + std::string_view oid() const { + // TODO(cross-node string dedup) + return ns_oid_view().oid.to_string_view(); + } + string_view_masked_t oid_masked() const { + // TODO(cross-node string dedup) + return string_view_masked_t{ns_oid_view().oid}; + } + ns_oid_view_t::Type dedup_type() const { + return ns_oid_view().type(); + } + snap_t snap() const { + return snap_gen_packed().snap; + } + gen_t gen() const { + return snap_gen_packed().gen; + } + + bool operator==(const full_key_t& o) const; + bool operator==(const full_key_t& o) const; + bool operator!=(const full_key_t& o) const { + return !operator==(o); + } + bool operator!=(const full_key_t& o) const { + return !operator==(o); + } + + /** + * key_view_t specific interfaces + */ + bool has_shard_pool() const { + return p_shard_pool != nullptr; + } + bool has_crush() const { + return p_crush != nullptr; + } + bool has_ns_oid() const { + return p_ns_oid.has_value(); + } + bool has_snap_gen() const { + return p_snap_gen != nullptr; + } + + const shard_pool_t& shard_pool_packed() const { + assert(has_shard_pool()); + return *p_shard_pool; + } + const crush_t& crush_packed() const { + assert(has_crush()); + return *p_crush; + } + const ns_oid_view_t& ns_oid_view() const { + assert(has_ns_oid()); + return *p_ns_oid; + } + const snap_gen_t& snap_gen_packed() const { + assert(has_snap_gen()); + return *p_snap_gen; + } + + size_t size_logical() const { + return sizeof(shard_t) + 
sizeof(pool_t) + sizeof(crush_hash_t) + + sizeof(snap_t) + sizeof(gen_t) + ns_oid_view().size_logical(); + } + + ghobject_t to_ghobj() const { + return ghobject_t( + shard_id_t(shard()), pool(), crush(), + std::string(nspace()), std::string(oid()), snap(), gen()); + } + + void replace(const crush_t& key) { p_crush = &key; } + void set(const crush_t& key) { + assert(!has_crush()); + replace(key); + } + void replace(const shard_pool_crush_t& key) { p_shard_pool = &key.shard_pool; } + void set(const shard_pool_crush_t& key) { + set(key.crush); + assert(!has_shard_pool()); + replace(key); + } + void replace(const ns_oid_view_t& key) { p_ns_oid = key; } + void set(const ns_oid_view_t& key) { + assert(!has_ns_oid()); + replace(key); + } + void replace(const snap_gen_t& key) { p_snap_gen = &key; } + void set(const snap_gen_t& key) { + assert(!has_snap_gen()); + replace(key); + } + + std::ostream& dump(std::ostream& os) const { + os << "key_view("; + if (has_shard_pool()) { + os << (unsigned)shard() << "," << pool() << ","; + } else { + os << "X,X,"; + } + if (has_crush()) { + os << crush() << "; "; + } else { + os << "X; "; + } + if (has_ns_oid()) { + os << ns_oid_view() << "; "; + } else { + os << "X,X; "; + } + if (has_snap_gen()) { + os << snap() << "," << gen() << ")"; + } else { + os << "X,X)"; + } + return os; + } + + private: + const shard_pool_t* p_shard_pool = nullptr; + const crush_t* p_crush = nullptr; + std::optional p_ns_oid; + const snap_gen_t* p_snap_gen = nullptr; +}; + +template +void encode_key(const full_key_t& key, ceph::bufferlist& bl) { + ceph::encode(key.shard(), bl); + ceph::encode(key.pool(), bl); + ceph::encode(key.crush(), bl); + key.nspace_masked().encode(bl); + key.oid_masked().encode(bl); + ceph::encode(key.snap(), bl); + ceph::encode(key.gen(), bl); +} + +inline MatchKindCMP compare_to(std::string_view l, std::string_view r) { + return toMatchKindCMP(l, r); +} +template +bool compare_full_key(const full_key_t& l, const full_key_t& r) { + if 
(l.shard() != r.shard()) + return false; + if (l.pool() != r.pool()) + return false; + if (l.crush() != r.crush()) + return false; + if (compare_to(l.nspace(), r.nspace()) != MatchKindCMP::EQ) + return false; + if (compare_to(l.oid(), r.oid()) != MatchKindCMP::EQ) + return false; + if (l.snap() != r.snap()) + return false; + if (l.gen() != r.gen()) + return false; + return true; +} + +inline bool key_hobj_t::operator==(const full_key_t& o) const { + return compare_full_key(*this, o); +} +inline bool key_hobj_t::operator==(const full_key_t& o) const { + return compare_full_key(*this, o); +} +inline bool key_view_t::operator==(const full_key_t& o) const { + return compare_full_key(*this, o); +} +inline bool key_view_t::operator==(const full_key_t& o) const { + return compare_full_key(*this, o); +} + +inline std::ostream& operator<<(std::ostream& os, const key_view_t& key) { + return key.dump(os); +} + +template +MatchKindCMP compare_to(const full_key_t& key, const shard_pool_t& target) { + auto ret = toMatchKindCMP(key.shard(), target.shard); + if (ret != MatchKindCMP::EQ) + return ret; + return toMatchKindCMP(key.pool(), target.pool); +} + +template +MatchKindCMP compare_to(const full_key_t& key, const crush_t& target) { + return toMatchKindCMP(key.crush(), target.crush); +} + +template +MatchKindCMP compare_to(const full_key_t& key, const shard_pool_crush_t& target) { + auto ret = compare_to(key, target.shard_pool); + if (ret != MatchKindCMP::EQ) + return ret; + return compare_to(key, target.crush); +} + +template +MatchKindCMP compare_to(const full_key_t& key, const ns_oid_view_t& target) { + auto ret = compare_to(key.nspace(), string_view_masked_t{target.nspace}); + if (ret != MatchKindCMP::EQ) + return ret; + return compare_to(key.oid(), string_view_masked_t{target.oid}); +} + +template +MatchKindCMP compare_to(const full_key_t& key, const snap_gen_t& target) { + auto ret = toMatchKindCMP(key.snap(), target.snap); + if (ret != MatchKindCMP::EQ) + return ret; + 
return toMatchKindCMP(key.gen(), target.gen); +} + +template +shard_pool_t shard_pool_t::from_key(const full_key_t& key) { + if constexpr (KT == KeyT::VIEW) { + return key.shard_pool_packed(); + } else { + return {key.shard(), key.pool()}; + } +} + +template +crush_t crush_t::from_key(const full_key_t& key) { + if constexpr (KT == KeyT::VIEW) { + return key.crush_packed(); + } else { + return {key.crush()}; + } +} + +template +shard_pool_crush_t shard_pool_crush_t::from_key(const full_key_t& key) { + return {shard_pool_t::from_key(key), crush_t::from_key(key)}; +} + +template +snap_gen_t snap_gen_t::from_key(const full_key_t& key) { + if constexpr (KT == KeyT::VIEW) { + return key.snap_gen_packed(); + } else { + return {key.snap(), key.gen()}; + } +} + +template +node_offset_t ns_oid_view_t::estimate_size(const full_key_t& key) { + if constexpr (KT == KeyT::VIEW) { + return key.ns_oid_view().size(); + } else { + if (key.dedup_type() != Type::STR) { + // size after deduplication + return sizeof(string_size_t); + } else { + return 2 * sizeof(string_size_t) + key.nspace().size() + key.oid().size(); + } + } +} + +template +void ns_oid_view_t::append( + NodeExtentMutable& mut, const full_key_t& key, char*& p_append) { + if (key.dedup_type() == Type::STR) { + string_key_view_t::append_str(mut, key.nspace(), p_append); + string_key_view_t::append_str(mut, key.oid(), p_append); + } else { + string_key_view_t::append_dedup(mut, key.dedup_type(), p_append); + } +} + +template +void ns_oid_view_t::test_append(const full_key_t& key, char*& p_append) { + if (key.dedup_type() == Type::STR) { + string_key_view_t::test_append_str(key.nspace(), p_append); + string_key_view_t::test_append_str(key.oid(), p_append); + } else { + string_key_view_t::test_append_dedup(key.dedup_type(), p_append); + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc new file mode 100644 
index 000000000..4a5988185 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc @@ -0,0 +1,318 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "node_stage.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h" +#include "node_stage_layout.h" + +namespace crimson::os::seastore::onode { + +#define NODE_T node_extent_t +#define NODE_INST(FT, NT) node_extent_t + +template +const char* NODE_T::p_left_bound() const { + if constexpr (std::is_same_v) { + // N3 internal node doesn't have the right part + return nullptr; + } else { + auto ret = p_start() + fields().get_item_end_offset(keys()); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + if (is_level_tail()) { + ret -= sizeof(laddr_t); + } + } + return ret; + } +} + +template +node_offset_t NODE_T::size_to_nxt_at(index_t index) const { + assert(index < keys()); + if constexpr (FIELD_TYPE == field_type_t::N0 || + FIELD_TYPE == field_type_t::N1) { + return FieldType::estimate_insert_one(); + } else if constexpr (FIELD_TYPE == field_type_t::N2) { + auto p_end = p_start() + p_fields->get_item_end_offset(index); + return FieldType::estimate_insert_one() + ns_oid_view_t(p_end).size(); + } else { + ceph_abort("N3 node is not nested"); + } +} + +template +memory_range_t NODE_T::get_nxt_container(index_t index) const { + if constexpr (std::is_same_v) { + ceph_abort("N3 internal node doesn't have the right part"); + } else { + node_offset_t item_start_offset = p_fields->get_item_start_offset(index); + node_offset_t item_end_offset = p_fields->get_item_end_offset(index); + assert(item_start_offset < item_end_offset); + auto item_p_start = p_start() + item_start_offset; + auto item_p_end = p_start() + item_end_offset; + if constexpr (FIELD_TYPE == field_type_t::N2) { + // range for sub_items_t + item_p_end = ns_oid_view_t(item_p_end).p_start(); + assert(item_p_start < item_p_end); + } 
else { + // range for item_iterator_t + } + return {item_p_start, item_p_end}; + } +} + +template +void NODE_T::bootstrap_extent( + NodeExtentMutable& mut, + field_type_t field_type, node_type_t node_type, + bool is_level_tail, level_t level) { + node_header_t::bootstrap_extent( + mut, field_type, node_type, is_level_tail, level); + mut.copy_in_relative( + sizeof(node_header_t), typename FieldType::num_keys_t(0u)); +} + +template +void NODE_T::update_is_level_tail( + NodeExtentMutable& mut, const node_extent_t& extent, bool value) { + node_header_t::update_is_level_tail(mut, extent.p_fields->header, value); +} + +template +template +memory_range_t NODE_T::insert_prefix_at( + NodeExtentMutable& mut, const node_extent_t& node, const full_key_t& key, + index_t index, node_offset_t size, const char* p_left_bound) { + if constexpr (FIELD_TYPE == field_type_t::N0 || + FIELD_TYPE == field_type_t::N1) { + assert(index <= node.keys()); + assert(p_left_bound == node.p_left_bound()); + assert(size > FieldType::estimate_insert_one()); + auto size_right = size - FieldType::estimate_insert_one(); + const char* p_insert = node.p_start() + node.fields().get_item_end_offset(index); + const char* p_insert_front = p_insert - size_right; + FieldType::template insert_at(mut, key, node.fields(), index, size_right); + mut.shift_absolute(p_left_bound, + p_insert - p_left_bound, + -(int)size_right); + return {p_insert_front, p_insert}; + } else if constexpr (FIELD_TYPE == field_type_t::N2) { + ceph_abort("not implemented"); + } else { + ceph_abort("impossible"); + } +} +#define IPA_TEMPLATE(FT, NT, KT) \ + template memory_range_t NODE_INST(FT, NT)::insert_prefix_at( \ + NodeExtentMutable&, const node_extent_t&, const full_key_t&, \ + index_t, node_offset_t, const char*) +IPA_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::VIEW); +IPA_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::VIEW); +IPA_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::VIEW); 
+IPA_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::VIEW); +IPA_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::VIEW); +IPA_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::VIEW); +IPA_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::HOBJ); +IPA_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::HOBJ); +IPA_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::HOBJ); +IPA_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::HOBJ); +IPA_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::HOBJ); +IPA_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::HOBJ); + +template +void NODE_T::update_size_at( + NodeExtentMutable& mut, const node_extent_t& node, index_t index, int change) { + assert(index < node.keys()); + FieldType::update_size_at(mut, node.fields(), index, change); +} + +template +node_offset_t NODE_T::trim_until( + NodeExtentMutable& mut, const node_extent_t& node, index_t index) { + assert(!node.is_level_tail()); + auto keys = node.keys(); + assert(index <= keys); + if (index == keys) { + return 0; + } + if constexpr (std::is_same_v) { + ceph_abort("not implemented"); + } else { + mut.copy_in_absolute( + (void*)&node.p_fields->num_keys, num_keys_t(index)); + } + // no need to calculate trim size for node + return 0; +} + +template +node_offset_t NODE_T::trim_at( + NodeExtentMutable& mut, const node_extent_t& node, + index_t index, node_offset_t trimmed) { + assert(!node.is_level_tail()); + assert(index < node.keys()); + if constexpr (std::is_same_v) { + ceph_abort("not implemented"); + } else { + node_offset_t offset = node.p_fields->get_item_start_offset(index); + size_t new_offset = offset + trimmed; + assert(new_offset < node.p_fields->get_item_end_offset(index)); + mut.copy_in_absolute(const_cast(node.p_fields->p_offset(index)), + node_offset_t(new_offset)); + mut.copy_in_absolute( + (void*)&node.p_fields->num_keys, num_keys_t(index + 1)); + } + // no need to calculate trim size for node + return 0; +} + +#define NODE_TEMPLATE(FT, NT) 
template class NODE_INST(FT, NT) +NODE_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL); +NODE_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL); +NODE_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL); +NODE_TEMPLATE(internal_fields_3_t, node_type_t::INTERNAL); +NODE_TEMPLATE(node_fields_0_t, node_type_t::LEAF); +NODE_TEMPLATE(node_fields_1_t, node_type_t::LEAF); +NODE_TEMPLATE(node_fields_2_t, node_type_t::LEAF); +NODE_TEMPLATE(leaf_fields_3_t, node_type_t::LEAF); + +#define APPEND_T node_extent_t::Appender + +template +template +void APPEND_T::append(const node_extent_t& src, index_t from, index_t items) { + assert(from <= src.keys()); + if (p_src == nullptr) { + p_src = &src; + } else { + assert(p_src == &src); + } + if (items == 0) { + return; + } + assert(from < src.keys()); + assert(from + items <= src.keys()); + num_keys += items; + if constexpr (std::is_same_v) { + ceph_abort("impossible path"); + } else { + // append left part forwards + node_offset_t offset_left_start = src.fields().get_key_start_offset(from); + node_offset_t offset_left_end = src.fields().get_key_start_offset(from + items); + node_offset_t left_size = offset_left_end - offset_left_start; + if (num_keys == 0) { + // no need to adjust offset + assert(from == 0); + assert(p_start + offset_left_start == p_append_left); + p_mut->copy_in_absolute(p_append_left, + src.p_start() + offset_left_start, left_size); + } else { + node_offset_t step_size = FieldType::estimate_insert_one(); + node_offset_t offset_base = src.fields().get_item_end_offset(from); + int offset_change = p_append_right - p_start - offset_base; + auto p_offset_dst = p_append_left; + if constexpr (FIELD_TYPE != field_type_t::N2) { + // copy keys + p_mut->copy_in_absolute(p_append_left, + src.p_start() + offset_left_start, left_size); + // point to offset for update + p_offset_dst += sizeof(typename FieldType::key_t); + } + for (auto i = from; i < from + items; ++i) { + p_mut->copy_in_absolute(p_offset_dst, + 
node_offset_t(src.fields().get_item_start_offset(i) + offset_change)); + p_offset_dst += step_size; + } + assert(p_append_left + left_size + sizeof(typename FieldType::key_t) == + p_offset_dst); + } + p_append_left += left_size; + + // append right part backwards + node_offset_t offset_right_start = src.fields().get_item_end_offset(from + items); + node_offset_t offset_right_end = src.fields().get_item_end_offset(from); + node_offset_t right_size = offset_right_end - offset_right_start; + p_append_right -= right_size; + p_mut->copy_in_absolute(p_append_right, + src.p_start() + offset_right_start, right_size); + } +} + +template +template +void APPEND_T::append( + const full_key_t& key, const value_t& value, const value_t*& p_value) { + if constexpr (FIELD_TYPE == field_type_t::N3) { + ceph_abort("not implemented"); + } else { + ceph_abort("should not happen"); + } +} + +template +template +std::tuple +APPEND_T::open_nxt(const key_get_type& partial_key) { + if constexpr (FIELD_TYPE == field_type_t::N0 || + FIELD_TYPE == field_type_t::N1) { + FieldType::append_key(*p_mut, partial_key, p_append_left); + } else if constexpr (FIELD_TYPE == field_type_t::N2) { + FieldType::append_key(*p_mut, partial_key, p_append_right); + } else { + ceph_abort("impossible path"); + } + return {p_mut, p_append_right}; +} + +template +template +std::tuple +APPEND_T::open_nxt(const full_key_t& key) { + if constexpr (FIELD_TYPE == field_type_t::N0 || + FIELD_TYPE == field_type_t::N1) { + FieldType::template append_key(*p_mut, key, p_append_left); + } else if constexpr (FIELD_TYPE == field_type_t::N2) { + FieldType::template append_key(*p_mut, key, p_append_right); + } else { + ceph_abort("impossible path"); + } + return {p_mut, p_append_right}; +} + +template +template +char* APPEND_T::wrap() { + assert(p_append_left <= p_append_right); + assert(p_src); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + if (p_src->is_level_tail()) { + laddr_t tail_value = 
p_src->get_end_p_laddr()->value; + p_append_right -= sizeof(laddr_t); + assert(p_append_left <= p_append_right); + p_mut->copy_in_absolute(p_append_right, tail_value); + } + } + p_mut->copy_in_absolute(p_start + offsetof(FieldType, num_keys), num_keys); + return p_append_left; +} + +#define APPEND_TEMPLATE(FT, NT, KT) template class node_extent_t::Appender +APPEND_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::VIEW); +APPEND_TEMPLATE(internal_fields_3_t, node_type_t::INTERNAL, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::VIEW); +APPEND_TEMPLATE(leaf_fields_3_t, node_type_t::LEAF, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::HOBJ); +APPEND_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::HOBJ); +APPEND_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::HOBJ); +APPEND_TEMPLATE(internal_fields_3_t, node_type_t::INTERNAL, KeyT::HOBJ); +APPEND_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::HOBJ); +APPEND_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::HOBJ); +APPEND_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::HOBJ); +APPEND_TEMPLATE(leaf_fields_3_t, node_type_t::LEAF, KeyT::HOBJ); + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h new file mode 100644 index 000000000..cf0ca463c --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h @@ -0,0 +1,226 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h" +#include "key_layout.h" 
+#include "stage_types.h" + +namespace crimson::os::seastore::onode { + +class NodeExtentMutable; + +/** + * node_extent_t + * + * The top indexing stage implementation for node N0/N1/N2/N3, implements + * staged contract as an indexable container, and provides access to node + * header. + * + * The specific field layout are defined by FieldType which are + * node_fields_0_t, node_fields_1_t, node_fields_2_t, internal_fields_3_t and + * leaf_fields_3_t. Diagrams see node_stage_layout.h. + */ +template +class node_extent_t { + public: + using value_t = value_type_t<_NODE_TYPE>; + using num_keys_t = typename FieldType::num_keys_t; + static constexpr node_type_t NODE_TYPE = _NODE_TYPE; + static constexpr field_type_t FIELD_TYPE = FieldType::FIELD_TYPE; + static constexpr node_offset_t EXTENT_SIZE = + (FieldType::SIZE + DISK_BLOCK_SIZE - 1u) / DISK_BLOCK_SIZE * DISK_BLOCK_SIZE; + + // TODO: remove + node_extent_t() = default; + + node_extent_t(const FieldType* p_fields) : p_fields{p_fields} { + validate(*p_fields); + } + + const char* p_start() const { return fields_start(*p_fields); } + + const char* off_to_ptr(node_offset_t off) const { + assert(off <= FieldType::SIZE); + return p_start() + off; + } + + node_offset_t ptr_to_off(const void* ptr) const { + auto _ptr = static_cast(ptr); + assert(_ptr >= p_start()); + auto off = _ptr - p_start(); + assert(off <= FieldType::SIZE); + return off; + } + + bool is_level_tail() const { return p_fields->is_level_tail(); } + level_t level() const { return p_fields->header.level; } + node_offset_t free_size() const { + return p_fields->template free_size_before(keys()); + } + node_offset_t total_size() const { return p_fields->total_size(); } + const char* p_left_bound() const; + template + std::enable_if_t + get_end_p_laddr() const { + assert(is_level_tail()); + if constexpr (FIELD_TYPE == field_type_t::N3) { + return &p_fields->child_addrs[keys()]; + } else { + auto offset_start = p_fields->get_item_end_offset(keys()); + 
assert(offset_start <= FieldType::SIZE); + offset_start -= sizeof(laddr_packed_t); + auto p_addr = p_start() + offset_start; + return reinterpret_cast(p_addr); + } + } + + // container type system + using key_get_type = typename FieldType::key_get_type; + static constexpr auto CONTAINER_TYPE = ContainerType::INDEXABLE; + index_t keys() const { return p_fields->num_keys; } + key_get_type operator[] (index_t index) const { return p_fields->get_key(index); } + node_offset_t size_before(index_t index) const { + auto free_size = p_fields->template free_size_before(index); + assert(total_size() >= free_size); + return total_size() - free_size; + } + node_offset_t size_to_nxt_at(index_t index) const; + node_offset_t size_overhead_at(index_t index) const { + return FieldType::ITEM_OVERHEAD; } + memory_range_t get_nxt_container(index_t index) const; + + template + std::enable_if_t + get_p_value(index_t index) const { + assert(index < keys()); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + return &p_fields->child_addrs[index]; + } else { + auto range = get_nxt_container(index); + auto ret = reinterpret_cast(range.p_start); + assert(range.p_start + ret->size == range.p_end); + return ret; + } + } + + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + assert(p_node_start == p_start()); + // nothing to encode as the container range is the entire extent + } + + static node_extent_t decode(const char* p_node_start, + ceph::bufferlist::const_iterator& delta) { + // nothing to decode + return node_extent_t(reinterpret_cast(p_node_start)); + } + + static void validate(const FieldType& fields) { +#ifndef NDEBUG + assert(fields.header.get_node_type() == NODE_TYPE); + assert(fields.header.get_field_type() == FieldType::FIELD_TYPE); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + assert(fields.header.level > 0u); + } else { + assert(fields.header.level == 0u); + } +#endif + } + + static void bootstrap_extent( + NodeExtentMutable&, field_type_t, 
node_type_t, bool, level_t); + + static void update_is_level_tail(NodeExtentMutable&, const node_extent_t&, bool); + + static node_offset_t header_size() { return FieldType::HEADER_SIZE; } + + template + static node_offset_t estimate_insert( + const full_key_t& key, const value_t& value) { + auto size = FieldType::estimate_insert_one(); + if constexpr (FIELD_TYPE == field_type_t::N2) { + size += ns_oid_view_t::estimate_size(key); + } else if constexpr (FIELD_TYPE == field_type_t::N3 && + NODE_TYPE == node_type_t::LEAF) { + size += value.size; + } + return size; + } + + template + static const value_t* insert_at( + NodeExtentMutable& mut, const node_extent_t&, + const full_key_t& key, const value_t& value, + index_t index, node_offset_t size, const char* p_left_bound) { + if constexpr (FIELD_TYPE == field_type_t::N3) { + ceph_abort("not implemented"); + } else { + ceph_abort("impossible"); + } + } + + template + static memory_range_t insert_prefix_at( + NodeExtentMutable&, const node_extent_t&, + const full_key_t& key, + index_t index, node_offset_t size, const char* p_left_bound); + + static void update_size_at( + NodeExtentMutable&, const node_extent_t&, index_t index, int change); + + static node_offset_t trim_until( + NodeExtentMutable&, const node_extent_t&, index_t index); + static node_offset_t trim_at(NodeExtentMutable&, const node_extent_t&, + index_t index, node_offset_t trimmed); + + template + class Appender; + + private: + const FieldType& fields() const { return *p_fields; } + const FieldType* p_fields; +}; + +template +template +class node_extent_t::Appender { + public: + Appender(NodeExtentMutable* p_mut, char* p_append) + : p_mut{p_mut}, p_start{p_append} { +#ifndef NDEBUG + auto p_fields = reinterpret_cast(p_append); + assert(*(p_fields->header.get_field_type()) == FIELD_TYPE); + assert(p_fields->header.get_node_type() == NODE_TYPE); + assert(p_fields->num_keys == 0); +#endif + p_append_left = p_start + FieldType::HEADER_SIZE; + p_append_right = 
p_start + FieldType::SIZE; + } + void append(const node_extent_t& src, index_t from, index_t items); + void append(const full_key_t&, const value_t&, const value_t*&); + char* wrap(); + std::tuple open_nxt(const key_get_type&); + std::tuple open_nxt(const full_key_t&); + void wrap_nxt(char* p_append) { + if constexpr (FIELD_TYPE != field_type_t::N3) { + assert(p_append < p_append_right); + assert(p_append_left < p_append); + p_append_right = p_append; + FieldType::append_offset(*p_mut, p_append - p_start, p_append_left); + ++num_keys; + } else { + ceph_abort("not implemented"); + } + } + + private: + const node_extent_t* p_src = nullptr; + NodeExtentMutable* p_mut; + char* p_start; + char* p_append_left; + char* p_append_right; + num_keys_t num_keys = 0; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc new file mode 100644 index 000000000..81bfac72a --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc @@ -0,0 +1,96 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "node_stage_layout.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h" + +namespace crimson::os::seastore::onode { + +void node_header_t::bootstrap_extent( + NodeExtentMutable& mut, + field_type_t field_type, node_type_t node_type, + bool is_level_tail, level_t level) { + node_header_t header; + header.set_field_type(field_type); + header.set_node_type(node_type); + header.set_is_level_tail(is_level_tail); + header.level = level; + mut.copy_in_relative(0, header); +} + +void node_header_t::update_is_level_tail( + NodeExtentMutable& mut, const node_header_t& header, bool value) { + auto& _header = const_cast(header); + _header.set_is_level_tail(value); + mut.validate_inplace_update(_header); +} + +#define F013_T 
_node_fields_013_t +#define F013_INST(ST) _node_fields_013_t + +template +void F013_T::update_size_at( + NodeExtentMutable& mut, const me_t& node, index_t index, int change) { + assert(index <= node.num_keys); + for (const auto* p_slot = &node.slots[index]; + p_slot < &node.slots[node.num_keys]; + ++p_slot) { + node_offset_t offset = p_slot->right_offset; + mut.copy_in_absolute( + (void*)&(p_slot->right_offset), + node_offset_t(offset - change)); + } +} + +template +void F013_T::append_key( + NodeExtentMutable& mut, const key_t& key, char*& p_append) { + mut.copy_in_absolute(p_append, key); + p_append += sizeof(key_t); +} + +template +void F013_T::append_offset( + NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append) { + mut.copy_in_absolute(p_append, offset_to_right); + p_append += sizeof(node_offset_t); +} + +template +template +void F013_T::insert_at( + NodeExtentMutable& mut, const full_key_t& key, + const me_t& node, index_t index, node_offset_t size_right) { + assert(index <= node.num_keys); + update_size_at(mut, node, index, size_right); + auto p_insert = const_cast(fields_start(node)) + + node.get_key_start_offset(index); + auto p_shift_end = fields_start(node) + node.get_key_start_offset(node.num_keys); + mut.shift_absolute(p_insert, p_shift_end - p_insert, estimate_insert_one()); + mut.copy_in_absolute((void*)&node.num_keys, num_keys_t(node.num_keys + 1)); + append_key(mut, key_t::template from_key(key), p_insert); + append_offset(mut, node.get_item_end_offset(index) - size_right, p_insert); +} +#define IA_TEMPLATE(ST, KT) template void F013_INST(ST):: \ + insert_at(NodeExtentMutable&, const full_key_t&, \ + const F013_INST(ST)&, index_t, node_offset_t) +IA_TEMPLATE(slot_0_t, KeyT::VIEW); +IA_TEMPLATE(slot_1_t, KeyT::VIEW); +IA_TEMPLATE(slot_3_t, KeyT::VIEW); +IA_TEMPLATE(slot_0_t, KeyT::HOBJ); +IA_TEMPLATE(slot_1_t, KeyT::HOBJ); +IA_TEMPLATE(slot_3_t, KeyT::HOBJ); + +#define F013_TEMPLATE(ST) template struct F013_INST(ST) 
+F013_TEMPLATE(slot_0_t); +F013_TEMPLATE(slot_1_t); +F013_TEMPLATE(slot_3_t); + +void node_fields_2_t::append_offset( + NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append) { + mut.copy_in_absolute(p_append, offset_to_right); + p_append += sizeof(node_offset_t); +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h new file mode 100644 index 000000000..14ba95bf4 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h @@ -0,0 +1,366 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "key_layout.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h" + +namespace crimson::os::seastore::onode { + +class NodeExtentMutable; + +struct node_header_t { + static constexpr unsigned FIELD_TYPE_BITS = 6u; + static_assert(static_cast(field_type_t::_MAX) <= 1u << FIELD_TYPE_BITS); + static constexpr unsigned NODE_TYPE_BITS = 1u; + static constexpr unsigned B_LEVEL_TAIL_BITS = 1u; + using bits_t = uint8_t; + + node_header_t() {} + std::optional get_field_type() const { + if (field_type >= FIELD_TYPE_MAGIC && + field_type < static_cast(field_type_t::_MAX)) { + return static_cast(field_type); + } else { + return std::nullopt; + } + } + node_type_t get_node_type() const { + return static_cast(node_type); + } + bool get_is_level_tail() const { + return is_level_tail; + } + + static void bootstrap_extent( + NodeExtentMutable&, field_type_t, node_type_t, bool, level_t); + + static void update_is_level_tail(NodeExtentMutable&, const node_header_t&, bool); + + bits_t field_type : FIELD_TYPE_BITS; + bits_t node_type : NODE_TYPE_BITS; + bits_t is_level_tail : B_LEVEL_TAIL_BITS; + static_assert(sizeof(bits_t) * 8 == + FIELD_TYPE_BITS + NODE_TYPE_BITS + B_LEVEL_TAIL_BITS); + level_t level; + + private: + 
void set_field_type(field_type_t type) { + field_type = static_cast(type); + } + void set_node_type(node_type_t type) { + node_type = static_cast(type); + } + void set_is_level_tail(bool value) { + is_level_tail = static_cast(value); + } +} __attribute__((packed)); + +template +struct _slot_t { + using key_t = FixedKeyType; + static constexpr field_type_t FIELD_TYPE = _FIELD_TYPE; + static constexpr node_offset_t OVERHEAD = sizeof(_slot_t) - sizeof(key_t); + + key_t key; + node_offset_t right_offset; +} __attribute__((packed)); +using slot_0_t = _slot_t; +using slot_1_t = _slot_t; +using slot_3_t = _slot_t; + +struct node_range_t { + node_offset_t start; + node_offset_t end; +}; + +template +const char* fields_start(const FieldType& node) { + return reinterpret_cast(&node); +} + +template +node_range_t fields_free_range_before( + const FieldType& node, index_t index) { + assert(index <= node.num_keys); + node_offset_t offset_start = node.get_key_start_offset(index); + node_offset_t offset_end = + (index == 0 ? FieldType::SIZE + : node.get_item_start_offset(index - 1)); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + if (node.is_level_tail() && index == node.num_keys) { + offset_end -= sizeof(laddr_t); + } + } + assert(offset_start <= offset_end); + assert(offset_end - offset_start < FieldType::SIZE); + return {offset_start, offset_end}; +} + +/** + * _node_fields_013_t (node_fields_0_t, node_fields_1_t, leaf_fields_3_t) + * + * The STAGE_LEFT layout implementation for node N0/N1, or the STAGE_RIGHT + * layout implementation for leaf node N3. 
+ * + * The node layout storing n slots: + * + * # <----------------------------- node range --------------------------------------> # + * # #<~># free space # + * # <----- left part -----------------------------> # <~# <----- right slots -------> # + * # # <---- left slots -------------> #~> # # + * # # slots [2, n) |<~># #<~>| right slots [2, n) # + * # # <- slot 0 -> | <- slot 1 -> | # # | <-- s1 --> | <-- s0 --> # + * # # | | # # | | # + * # | num_ # | right | | right | # # | next-stage | next-stage # + * # header | keys # key | offset | key | offset | # # | container | container # + * # | # 0 | 0 | 1 | 1 |...#...#...| or onode 1 | or onode 0 # + * | | ^ ^ + * | | | | + * | +----------------+ | + * +--------------------------------------------+ + */ +template +struct _node_fields_013_t { + // TODO: decide by NODE_BLOCK_SIZE, sizeof(SlotType), sizeof(laddr_t) + // and the minimal size of variable_key. + using num_keys_t = uint8_t; + using key_t = typename SlotType::key_t; + using key_get_type = const key_t&; + using me_t = _node_fields_013_t; + static constexpr field_type_t FIELD_TYPE = SlotType::FIELD_TYPE; + static constexpr node_offset_t SIZE = NODE_BLOCK_SIZE; + static constexpr node_offset_t HEADER_SIZE = + sizeof(node_header_t) + sizeof(num_keys_t); + static constexpr node_offset_t ITEM_OVERHEAD = SlotType::OVERHEAD; + + bool is_level_tail() const { return header.get_is_level_tail(); } + node_offset_t total_size() const { return SIZE; } + key_get_type get_key(index_t index) const { + assert(index < num_keys); + return slots[index].key; + } + node_offset_t get_key_start_offset(index_t index) const { + assert(index <= num_keys); + auto offset = HEADER_SIZE + sizeof(SlotType) * index; + assert(offset < SIZE); + return offset; + } + node_offset_t get_item_start_offset(index_t index) const { + assert(index < num_keys); + auto offset = slots[index].right_offset; + assert(offset <= SIZE); + return offset; + } + const void* p_offset(index_t index) const { + 
assert(index < num_keys); + return &slots[index].right_offset; + } + node_offset_t get_item_end_offset(index_t index) const { + return index == 0 ? SIZE : get_item_start_offset(index - 1); + } + template + node_offset_t free_size_before(index_t index) const { + auto range = fields_free_range_before(*this, index); + return range.end - range.start; + } + + static node_offset_t estimate_insert_one() { return sizeof(SlotType); } + template + static void insert_at( + NodeExtentMutable&, const full_key_t& key, + const me_t& node, index_t index, node_offset_t size_right); + static void update_size_at( + NodeExtentMutable&, const me_t& node, index_t index, int change); + static void append_key( + NodeExtentMutable&, const key_t& key, char*& p_append); + template + static void append_key( + NodeExtentMutable& mut, const full_key_t& key, char*& p_append) { + append_key(mut, key_t::template from_key(key), p_append); + } + static void append_offset( + NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append); + + node_header_t header; + num_keys_t num_keys = 0u; + SlotType slots[]; +} __attribute__((packed)); +using node_fields_0_t = _node_fields_013_t; +using node_fields_1_t = _node_fields_013_t; + +/** + * node_fields_2_t + * + * The STAGE_STRING layout implementation for node N2. 
+ * + * The node layout storing n slots: + * + * # <--------------------------------- node range ----------------------------------------> # + * # #<~># free space # + * # <------- left part ---------------> # <~# <--------- right slots ---------------------> # + * # # <---- offsets ----> #~> #<~>| slots [2, n) # + * # # offsets [2, n) |<~># # | <----- slot 1 ----> | <----- slot 0 ----> # + * # # | # # | | # + * # | num_ # offset | offset | # # | next-stage | ns-oid | next-stage | ns-oid # + * # header | keys # 0 | 1 |...#...#...| container1 | 1 | container0 | 0 # + * | | ^ ^ + * | | | | + * | +----------------+ | + * +-----------------------------------------------+ + */ +struct node_fields_2_t { + // TODO: decide by NODE_BLOCK_SIZE, sizeof(node_off_t), sizeof(laddr_t) + // and the minimal size of variable_key. + using num_keys_t = uint8_t; + using key_t = ns_oid_view_t; + using key_get_type = key_t; + static constexpr field_type_t FIELD_TYPE = field_type_t::N2; + static constexpr node_offset_t SIZE = NODE_BLOCK_SIZE; + static constexpr node_offset_t HEADER_SIZE = + sizeof(node_header_t) + sizeof(num_keys_t); + static constexpr node_offset_t ITEM_OVERHEAD = sizeof(node_offset_t); + + bool is_level_tail() const { return header.get_is_level_tail(); } + node_offset_t total_size() const { return SIZE; } + key_get_type get_key(index_t index) const { + assert(index < num_keys); + node_offset_t item_end_offset = + (index == 0 ? 
SIZE : offsets[index - 1]); + assert(item_end_offset <= SIZE); + const char* p_start = fields_start(*this); + return key_t(p_start + item_end_offset); + } + node_offset_t get_key_start_offset(index_t index) const { + assert(index <= num_keys); + auto offset = HEADER_SIZE + sizeof(node_offset_t) * num_keys; + assert(offset <= SIZE); + return offset; + } + node_offset_t get_item_start_offset(index_t index) const { + assert(index < num_keys); + auto offset = offsets[index]; + assert(offset <= SIZE); + return offset; + } + const void* p_offset(index_t index) const { + assert(index < num_keys); + return &offsets[index]; + } + node_offset_t get_item_end_offset(index_t index) const { + return index == 0 ? SIZE : get_item_start_offset(index - 1); + } + template + node_offset_t free_size_before(index_t index) const { + auto range = fields_free_range_before(*this, index); + return range.end - range.start; + } + + static node_offset_t estimate_insert_one() { return sizeof(node_offset_t); } + template + static void insert_at( + NodeExtentMutable& mut, const full_key_t& key, + const node_fields_2_t& node, index_t index, node_offset_t size_right) { + ceph_abort("not implemented"); + } + static void update_size_at( + NodeExtentMutable& mut, const node_fields_2_t& node, index_t index, int change) { + ceph_abort("not implemented"); + } + static void append_key( + NodeExtentMutable& mut, const key_t& key, char*& p_append) { + ns_oid_view_t::append(mut, key, p_append); + } + template + static void append_key( + NodeExtentMutable& mut, const full_key_t& key, char*& p_append) { + ns_oid_view_t::append(mut, key, p_append); + } + static void append_offset( + NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append); + + node_header_t header; + num_keys_t num_keys = 0u; + node_offset_t offsets[]; +} __attribute__((packed)); + +/** + * internal_fields_3_t + * + * The STAGE_RIGHT layout implementation for internal node N3. 
+ * + * The node layout storing 3 children: + * + * # <---------------- node range ---------------------------> # + * # # <-- keys ---> # <---- laddrs -----------> # + * # free space: # |<~># |<~># + * # # | # | # + * # | num_ # key | key | # laddr | laddr | laddr | # + * # header | keys # 0 | 1 |...# 0 | 1 | 2 |...# + */ +// TODO: decide by NODE_BLOCK_SIZE, sizeof(snap_gen_t), sizeof(laddr_t) +static constexpr unsigned MAX_NUM_KEYS_I3 = 170u; +template +struct _internal_fields_3_t { + using key_get_type = const snap_gen_t&; + using me_t = _internal_fields_3_t; + // TODO: decide by NODE_BLOCK_SIZE, sizeof(snap_gen_t), sizeof(laddr_t) + using num_keys_t = uint8_t; + static constexpr field_type_t FIELD_TYPE = field_type_t::N3; + static constexpr node_offset_t SIZE = sizeof(me_t); + static constexpr node_offset_t HEADER_SIZE = + sizeof(node_header_t) + sizeof(num_keys_t); + static constexpr node_offset_t ITEM_OVERHEAD = 0u; + + bool is_level_tail() const { return header.get_is_level_tail(); } + node_offset_t total_size() const { + if (is_level_tail()) { + return SIZE - sizeof(snap_gen_t); + } else { + return SIZE; + } + } + key_get_type get_key(index_t index) const { + assert(index < num_keys); + return keys[index]; + } + template + std::enable_if_t + free_size_before(index_t index) const { + assert(index <= num_keys); + assert(num_keys <= (is_level_tail() ? 
MAX_NUM_KEYS - 1 : MAX_NUM_KEYS)); + auto free = (MAX_NUM_KEYS - index) * (sizeof(snap_gen_t) + sizeof(laddr_t)); + if (is_level_tail() && index == num_keys) { + free -= (sizeof(snap_gen_t) + sizeof(laddr_t)); + } + assert(free < SIZE); + return free; + } + + static node_offset_t estimate_insert_one() { + return sizeof(snap_gen_t) + sizeof(laddr_t); + } + template + static void insert_at( + NodeExtentMutable& mut, const full_key_t& key, + const me_t& node, index_t index, node_offset_t size_right) { + ceph_abort("not implemented"); + } + static void update_size_at( + NodeExtentMutable& mut, const me_t& node, index_t index, int change) { + ceph_abort("not implemented"); + } + + node_header_t header; + num_keys_t num_keys = 0u; + snap_gen_t keys[MAX_NUM_KEYS]; + laddr_packed_t child_addrs[MAX_NUM_KEYS]; +} __attribute__((packed)); +static_assert(_internal_fields_3_t::SIZE <= NODE_BLOCK_SIZE && + _internal_fields_3_t::SIZE > NODE_BLOCK_SIZE); +using internal_fields_3_t = _internal_fields_3_t; + +using leaf_fields_3_t = _node_fields_013_t; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h new file mode 100644 index 000000000..cac167a98 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h @@ -0,0 +1,2186 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include +#include +#include +#include +#include + +#include "common/likely.h" + +#include "sub_items_stage.h" +#include "item_iterator_stage.h" + +namespace crimson::os::seastore::onode { + +struct search_result_bs_t { + index_t index; + MatchKindBS match; +}; +template +search_result_bs_t binary_search( + const full_key_t& key, + index_t begin, index_t end, FGetKey&& f_get_key) { + assert(begin <= end); + while (begin < end) { + auto total = begin + end; + auto mid = total >> 1; + // do not copy if return value 
is reference + decltype(f_get_key(mid)) target = f_get_key(mid); + auto match = compare_to(key, target); + if (match == MatchKindCMP::LT) { + end = mid; + } else if (match == MatchKindCMP::GT) { + begin = mid + 1; + } else { + return {mid, MatchKindBS::EQ}; + } + } + return {begin , MatchKindBS::NE}; +} + +template +search_result_bs_t binary_search_r( + index_t rend, index_t rbegin, FGet&& f_get, const PivotType& key) { + assert(rend <= rbegin); + while (rend < rbegin) { + auto total = rend + rbegin + 1; + auto mid = total >> 1; + // do not copy if return value is reference + decltype(f_get(mid)) target = f_get(mid); + int match = target - key; + if (match < 0) { + rend = mid; + } else if (match > 0) { + rbegin = mid - 1; + } else { + return {mid, MatchKindBS::EQ}; + } + } + return {rbegin, MatchKindBS::NE}; +} + +inline bool matchable(field_type_t type, match_stat_t mstat) { + assert(mstat >= MSTAT_MIN && mstat <= MSTAT_MAX); + /* + * compressed prefix by field type: + * N0: NONE + * N1: pool/shard + * N2: pool/shard crush + * N3: pool/shard crush ns/oid + * + * if key matches the node's compressed prefix, return true + * else, return false + */ +#ifndef NDEBUG + if (mstat == MSTAT_END) { + assert(type == field_type_t::N0); + } +#endif + return mstat + to_unsigned(type) < 4; +} + +inline void assert_mstat( + const full_key_t& key, + const full_key_t& index, + match_stat_t mstat) { + assert(mstat >= MSTAT_MIN && mstat <= MSTAT_LT2); + // key < index ... 
+ switch (mstat) { + case MSTAT_EQ: + break; + case MSTAT_LT0: + assert(compare_to(key, index.snap_gen_packed()) == MatchKindCMP::LT); + break; + case MSTAT_LT1: + assert(compare_to(key, index.ns_oid_view()) == MatchKindCMP::LT); + break; + case MSTAT_LT2: + if (index.has_shard_pool()) { + assert(compare_to(key, shard_pool_crush_t{ + index.shard_pool_packed(), index.crush_packed()}) == MatchKindCMP::LT); + } else { + assert(compare_to(key, index.crush_packed()) == MatchKindCMP::LT); + } + break; + default: + ceph_abort("impossible path"); + } + // key == index ... + switch (mstat) { + case MSTAT_EQ: + assert(compare_to(key, index.snap_gen_packed()) == MatchKindCMP::EQ); + case MSTAT_LT0: + if (!index.has_ns_oid()) + break; + assert(index.ns_oid_view().type() == ns_oid_view_t::Type::MAX || + compare_to(key, index.ns_oid_view()) == MatchKindCMP::EQ); + case MSTAT_LT1: + if (!index.has_crush()) + break; + assert(compare_to(key, index.crush_packed()) == MatchKindCMP::EQ); + if (!index.has_shard_pool()) + break; + assert(compare_to(key, index.shard_pool_packed()) == MatchKindCMP::EQ); + default: + break; + } +} + +#define NXT_STAGE_T staged + +enum class TrimType { BEFORE, AFTER, AT }; + +/** + * staged + * + * Implements recursive logic that modifies or reads the node layout + * (N0/N1/N2/N3 * LEAF/INTERNAL) with the multi-stage design. The specific + * stage implementation is flexible. So the implementations for different + * stages can be assembled independently, as long as they follow the + * definitions of container interfaces. + * + * Multi-stage is designed to index different portions of onode keys + * stage-by-stage. 
There are at most 3 stages for a node: + * - STAGE_LEFT: index shard-pool-crush for N0, or index crush for N1 node; + * - STAGE_STRING: index ns-oid for N0/N1/N2 nodes; + * - STAGE_RIGHT: index snap-gen for N0/N1/N2/N3 nodes; + * + * The intention is to consolidate the high-level indexing implementations at + * the level of stage, so we don't need to write them repeatedly for every + * stage and for every node type. + */ +template +struct staged { + static_assert(Params::STAGE >= STAGE_BOTTOM); + static_assert(Params::STAGE <= STAGE_TOP); + using container_t = typename Params::container_t; + using key_get_type = typename container_t::key_get_type; + using next_param_t = typename Params::next_param_t; + using position_t = staged_position_t; + using result_t = staged_result_t; + using value_t = value_type_t; + static constexpr auto CONTAINER_TYPE = container_t::CONTAINER_TYPE; + static constexpr bool IS_BOTTOM = (Params::STAGE == STAGE_BOTTOM); + static constexpr auto NODE_TYPE = Params::NODE_TYPE; + static constexpr auto STAGE = Params::STAGE; + + template + static void _left_or_right(index_t& split_index, index_t insert_index, + std::optional& is_insert_left) { + assert(!is_insert_left.has_value()); + assert(is_valid_index(split_index)); + if constexpr (is_exclusive) { + if (split_index <= insert_index) { + // ...[s_index-1] |!| (i_index) [s_index]... + // offset i_position to right + is_insert_left = false; + } else { + // ...[s_index-1] (i_index)) |?[s_index]| ... + // ...(i_index)...[s_index-1] |?[s_index]| ... + is_insert_left = true; + --split_index; + } + } else { + if (split_index < insert_index) { + // ...[s_index-1] |?[s_index]| ...[(i_index)[s_index_k]... + is_insert_left = false; + } else if (split_index > insert_index) { + // ...[(i_index)s_index-1] |?[s_index]| ... + // ...[(i_index)s_index_k]...[s_index-1] |?[s_index]| ... + is_insert_left = true; + } else { + // ...[s_index-1] |?[(i_index)s_index]| ... 
+ // i_to_left = std::nullopt; + } + } + } + + template class _iterator_t; + template + class _iterator_t> { + /* + * indexable container type system: + * CONTAINER_TYPE = ContainerType::INDEXABLE + * keys() const -> index_t + * operator[](index_t) const -> key_get_type + * size_before(index_t) const -> node_offset_t + * size_overhead_at(index_t) const -> node_offset_t + * (IS_BOTTOM) get_p_value(index_t) const -> const value_t* + * (!IS_BOTTOM) size_to_nxt_at(index_t) const -> node_offset_t + * (!IS_BOTTOM) get_nxt_container(index_t) const + * encode(p_node_start, encoded) + * decode(p_node_start, delta) -> container_t + * static: + * header_size() -> node_offset_t + * estimate_insert(key, value) -> node_offset_t + * (IS_BOTTOM) insert_at(mut, src, key, value, + * index, size, p_left_bound) -> const value_t* + * (!IS_BOTTOM) insert_prefix_at(mut, src, key, + * index, size, p_left_bound) -> memory_range_t + * (!IS_BOTTOM) update_size_at(mut, src, index, size) + * trim_until(mut, container, index) -> trim_size + * (!IS_BOTTOM) trim_at(mut, container, index, trimmed) -> trim_size + * + * Appender::append(const container_t& src, from, items) + */ + public: + using me_t = _iterator_t; + + _iterator_t(const container_t& container) : container{container} { + assert(container.keys()); + } + + index_t index() const { + return _index; + } + key_get_type get_key() const { + assert(!is_end()); + return container[_index]; + } + node_offset_t size_to_nxt() const { + assert(!is_end()); + return container.size_to_nxt_at(_index); + } + template + std::enable_if_t get_nxt_container() const { + assert(!is_end()); + return container.get_nxt_container(_index); + } + template + std::enable_if_t get_p_value() const { + assert(!is_end()); + return container.get_p_value(_index); + } + bool is_last() const { + return _index + 1 == container.keys(); + } + bool is_end() const { return _index == container.keys(); } + node_offset_t size() const { + assert(!is_end()); + assert(header_size() == 
container.size_before(0)); + assert(container.size_before(_index + 1) > container.size_before(_index)); + return container.size_before(_index + 1) - + container.size_before(_index); + } + node_offset_t size_overhead() const { + assert(!is_end()); + return container.size_overhead_at(_index); + } + + me_t& operator++() { + assert(!is_end()); + assert(!is_last()); + ++_index; + return *this; + } + void seek_at(index_t index) { + assert(index < container.keys()); + seek_till_end(index); + } + void seek_till_end(index_t index) { + assert(!is_end()); + assert(this->index() == 0); + assert(index <= container.keys()); + _index = index; + } + void seek_last() { + assert(!is_end()); + assert(index() == 0); + _index = container.keys() - 1; + } + void set_end() { + assert(!is_end()); + assert(is_last()); + ++_index; + } + // Note: possible to return an end iterator + MatchKindBS seek(const full_key_t& key, bool exclude_last) { + assert(!is_end()); + assert(index() == 0); + index_t end_index = container.keys(); + if (exclude_last) { + assert(end_index); + --end_index; + assert(compare_to(key, container[end_index]) == MatchKindCMP::LT); + } + auto ret = binary_search(key, _index, end_index, + [this] (index_t index) { return container[index]; }); + _index = ret.index; + return ret.match; + } + + template + std::enable_if_t insert( + NodeExtentMutable& mut, const full_key_t& key, + const value_t& value, node_offset_t insert_size, const char* p_left_bound) { + return container_t::template insert_at( + mut, container, key, value, _index, insert_size, p_left_bound); + } + + template + std::enable_if_t insert_prefix( + NodeExtentMutable& mut, const full_key_t& key, + node_offset_t size, const char* p_left_bound) { + return container_t::template insert_prefix_at( + mut, container, key, _index, size, p_left_bound); + } + + template + std::enable_if_t + update_size(NodeExtentMutable& mut, node_offset_t insert_size) { + assert(!is_end()); + container_t::update_size_at(mut, container, 
_index, insert_size); + } + + // Note: possible to return an end iterator when is_exclusive is true + template + size_t seek_split_inserted( + size_t start_size, size_t extra_size, size_t target_size, + index_t& insert_index, size_t insert_size, + std::optional& is_insert_left) { + assert(!is_end()); + assert(index() == 0); + // replace insert_index placeholder + if constexpr (!is_exclusive) { + if (insert_index == INDEX_LAST) { + insert_index = container.keys() - 1; + } + } else { + if (insert_index == INDEX_END) { + insert_index = container.keys(); + } + } + assert(insert_index <= container.keys()); + + auto start_size_1 = start_size + extra_size; + auto f_get_used_size = [this, start_size, start_size_1, + insert_index, insert_size] (index_t index) { + size_t current_size; + if (unlikely(index == 0)) { + current_size = start_size; + } else { + current_size = start_size_1; + if (index > insert_index) { + current_size += insert_size; + if constexpr (is_exclusive) { + --index; + } + } + // already includes header size + current_size += container.size_before(index); + } + return current_size; + }; + index_t s_end; + if constexpr (is_exclusive) { + s_end = container.keys(); + } else { + s_end = container.keys() - 1; + } + _index = binary_search_r(0, s_end, f_get_used_size, target_size).index; + size_t current_size = f_get_used_size(_index); + assert(current_size <= target_size); + + _left_or_right(_index, insert_index, is_insert_left); + return current_size; + } + + size_t seek_split(size_t start_size, size_t extra_size, size_t target_size) { + assert(!is_end()); + assert(index() == 0); + auto start_size_1 = start_size + extra_size; + auto f_get_used_size = [this, start_size, start_size_1] (index_t index) { + size_t current_size; + if (unlikely(index == 0)) { + current_size = start_size; + } else { + // already includes header size + current_size = start_size_1 + container.size_before(index); + } + return current_size; + }; + _index = binary_search_r( + 0, 
container.keys() - 1, f_get_used_size, target_size).index; + size_t current_size = f_get_used_size(_index); + assert(current_size <= target_size); + return current_size; + } + + // Note: possible to return an end iterator if to_index == INDEX_END + template + void copy_out_until( + typename container_t::template Appender& appender, index_t& to_index) { + auto num_keys = container.keys(); + index_t items; + if (to_index == INDEX_END) { + items = num_keys - _index; + appender.append(container, _index, items); + _index = num_keys; + to_index = _index; + } else if (to_index == INDEX_LAST) { + assert(!is_end()); + items = num_keys - 1 - _index; + appender.append(container, _index, items); + _index = num_keys - 1; + to_index = _index; + } else { + assert(_index <= to_index); + assert(to_index <= num_keys); + items = to_index - _index; + appender.append(container, _index, items); + _index = to_index; + } + } + + node_offset_t trim_until(NodeExtentMutable& mut) { + return container_t::trim_until(mut, container, _index); + } + + template + std::enable_if_t + trim_at(NodeExtentMutable& mut, node_offset_t trimmed) { + return container_t::trim_at(mut, container, _index, trimmed); + } + + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + container.encode(p_node_start, encoded); + ceph::encode(_index, encoded); + } + + static me_t decode(const char* p_node_start, + ceph::bufferlist::const_iterator& delta) { + auto container = container_t::decode(p_node_start, delta); + auto ret = me_t(container); + index_t index; + ceph::decode(index, delta); + ret.seek_till_end(index); + return ret; + } + + static node_offset_t header_size() { + return container_t::header_size(); + } + + template + static node_offset_t estimate_insert( + const full_key_t& key, const value_t& value) { + return container_t::template estimate_insert(key, value); + } + + private: + container_t container; + index_t _index = 0; + }; + + template + class _iterator_t> { + /* + * iterative 
container type system (!IS_BOTTOM): + * CONTAINER_TYPE = ContainerType::ITERATIVE + * index() const -> index_t + * get_key() const -> key_get_type + * size() const -> node_offset_t + * size_to_nxt() const -> node_offset_t + * size_overhead() const -> node_offset_t + * get_nxt_container() const + * has_next() const -> bool + * encode(p_node_start, encoded) + * decode(p_node_start, delta) -> container_t + * operator++() + * static: + * header_size() -> node_offset_t + * estimate_insert(key, value) -> node_offset_t + * insert_prefix(mut, src, key, is_end, size, p_left_bound) -> memory_range_t + * update_size(mut, src, size) + * trim_until(mut, container) -> trim_size + * trim_at(mut, container, trimmed) -> trim_size + */ + // currently the iterative iterator is only implemented with STAGE_STRING + // for in-node space efficiency + static_assert(STAGE == STAGE_STRING); + public: + using me_t = _iterator_t; + + _iterator_t(const container_t& container) : container{container} {} + + index_t index() const { + if (is_end()) { + return container.index() + 1; + } else { + return container.index(); + } + } + key_get_type get_key() const { + assert(!is_end()); + return container.get_key(); + } + node_offset_t size_to_nxt() const { + assert(!is_end()); + return container.size_to_nxt(); + } + const typename NXT_STAGE_T::container_t get_nxt_container() const { + assert(!is_end()); + return container.get_nxt_container(); + } + bool is_last() const { + assert(!is_end()); + return !container.has_next(); + } + bool is_end() const { +#ifndef NDEBUG + if (_is_end) { + assert(!container.has_next()); + } +#endif + return _is_end; + } + node_offset_t size() const { + assert(!is_end()); + return container.size(); + } + node_offset_t size_overhead() const { + assert(!is_end()); + return container.size_overhead(); + } + + me_t& operator++() { + assert(!is_end()); + assert(!is_last()); + ++container; + return *this; + } + void seek_at(index_t index) { + assert(!is_end()); + 
assert(this->index() == 0); + while (index > 0) { + assert(container.has_next()); + ++container; + --index; + } + } + void seek_till_end(index_t index) { + assert(!is_end()); + assert(this->index() == 0); + while (index > 0) { + if (!container.has_next()) { + assert(index == 1); + set_end(); + break; + } + ++container; + --index; + } + } + void seek_last() { + assert(!is_end()); + assert(index() == 0); + while (container.has_next()) { + ++container; + } + } + void set_end() { + assert(!is_end()); + assert(is_last()); + _is_end = true; + } + // Note: possible to return an end iterator + MatchKindBS seek(const full_key_t& key, bool exclude_last) { + assert(!is_end()); + assert(index() == 0); + do { + if (exclude_last && is_last()) { + assert(compare_to(key, get_key()) == MatchKindCMP::LT); + return MatchKindBS::NE; + } + auto match = compare_to(key, get_key()); + if (match == MatchKindCMP::LT) { + return MatchKindBS::NE; + } else if (match == MatchKindCMP::EQ) { + return MatchKindBS::EQ; + } else { + if (container.has_next()) { + ++container; + } else { + // end + break; + } + } + } while (true); + assert(!exclude_last); + set_end(); + return MatchKindBS::NE; + } + + template + memory_range_t insert_prefix( + NodeExtentMutable& mut, const full_key_t& key, + node_offset_t size, const char* p_left_bound) { + return container_t::template insert_prefix( + mut, container, key, is_end(), size, p_left_bound); + } + + void update_size(NodeExtentMutable& mut, node_offset_t insert_size) { + assert(!is_end()); + container_t::update_size(mut, container, insert_size); + } + + // Note: possible to return an end iterator when is_exclusive is true + // insert_index can still be INDEX_LAST or INDEX_END + template + size_t seek_split_inserted( + size_t start_size, size_t extra_size, size_t target_size, + index_t& insert_index, size_t insert_size, + std::optional& is_insert_left) { + assert(!is_end()); + assert(index() == 0); + size_t current_size = start_size; + index_t split_index = 
0; + extra_size += header_size(); + do { + if constexpr (!is_exclusive) { + if (is_last()) { + assert(split_index == index()); + if (insert_index == INDEX_LAST) { + insert_index = index(); + } + assert(insert_index <= index()); + break; + } + } + + size_t nxt_size = current_size; + if (split_index == 0) { + nxt_size += extra_size; + } + if (split_index == insert_index) { + nxt_size += insert_size; + if constexpr (is_exclusive) { + if (nxt_size > target_size) { + break; + } + current_size = nxt_size; + ++split_index; + } + } + nxt_size += size(); + if (nxt_size > target_size) { + break; + } + current_size = nxt_size; + + if constexpr (is_exclusive) { + if (is_last()) { + assert(split_index == index()); + set_end(); + split_index = index(); + if (insert_index == INDEX_END) { + insert_index = index(); + } + assert(insert_index == index()); + break; + } else { + ++(*this); + ++split_index; + } + } else { + ++(*this); + ++split_index; + } + } while (true); + assert(current_size <= target_size); + + _left_or_right(split_index, insert_index, is_insert_left); + assert(split_index == index()); + return current_size; + } + + size_t seek_split(size_t start_size, size_t extra_size, size_t target_size) { + assert(!is_end()); + assert(index() == 0); + size_t current_size = start_size; + do { + if (is_last()) { + break; + } + + size_t nxt_size = current_size; + if (index() == 0) { + nxt_size += extra_size; + } + nxt_size += size(); + if (nxt_size > target_size) { + break; + } + current_size = nxt_size; + ++(*this); + } while (true); + assert(current_size <= target_size); + return current_size; + } + + // Note: possible to return an end iterator if to_index == INDEX_END + template + void copy_out_until( + typename container_t::template Appender& appender, index_t& to_index) { + if (is_end()) { + assert(!container.has_next()); + if (to_index == INDEX_END) { + to_index = index(); + } + assert(to_index == index()); + return; + } + index_t items; + if (to_index == INDEX_END || 
to_index == INDEX_LAST) { + items = to_index; + } else { + assert(is_valid_index(to_index)); + assert(index() <= to_index); + items = to_index - index(); + } + if (appender.append(container, items)) { + set_end(); + } + to_index = index(); + } + + node_offset_t trim_until(NodeExtentMutable& mut) { + if (is_end()) { + return 0; + } + return container_t::trim_until(mut, container); + } + + node_offset_t trim_at(NodeExtentMutable& mut, node_offset_t trimmed) { + assert(!is_end()); + return container_t::trim_at(mut, container, trimmed); + } + + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + container.encode(p_node_start, encoded); + uint8_t is_end = _is_end; + ceph::encode(is_end, encoded); + } + + static me_t decode(const char* p_node_start, + ceph::bufferlist::const_iterator& delta) { + auto container = container_t::decode(p_node_start, delta); + auto ret = me_t(container); + uint8_t is_end; + ceph::decode(is_end, delta); + if (is_end) { + ret.set_end(); + } + return ret; + } + + static node_offset_t header_size() { + return container_t::header_size(); + } + + template + static node_offset_t estimate_insert(const full_key_t& key, const value_t& value) { + return container_t::template estimate_insert(key, value); + } + + private: + container_t container; + bool _is_end = false; + }; + + /* + * iterator_t encapsulates both indexable and iterative implementations + * from a *non-empty* container. 
+ * cstr(const container_t&) + * access: + * index() -> index_t + * get_key() -> key_get_type (const reference or value type) + * is_last() -> bool + * is_end() -> bool + * size() -> node_offset_t + * size_overhead() -> node_offset_t + * (IS_BOTTOM) get_p_value() -> const value_t* + * (!IS_BOTTOM) get_nxt_container() -> nxt_stage::container_t + * (!IS_BOTTOM) size_to_nxt() -> node_offset_t + * seek: + * operator++() -> iterator_t& + * seek_at(index) + * seek_till_end(index) + * seek_last() + * set_end() + * seek(key, exclude_last) -> MatchKindBS + * insert: + * (IS_BOTTOM) insert(mut, key, value, size, p_left_bound) -> p_value + * (!IS_BOTTOM) insert_prefix(mut, key, size, p_left_bound) -> memory_range_t + * (!IS_BOTTOM) update_size(mut, size) + * split: + * seek_split_inserted( + * start_size, extra_size, target_size, insert_index, insert_size, + * std::optional<bool>& is_insert_left) + * -> insert to left/right/unknown (!exclusive) + * -> insert to left/right (exclusive, can be end) + * -> split_size + * seek_split(start_size, extra_size, target_size) -> split_size + * copy_out_until(appender, to_index) (can be end) + * trim_until(mut) -> trim_size + * (!IS_BOTTOM) trim_at(mut, trimmed) -> trim_size + * denc: + * encode(p_node_start, encoded) + * decode(p_node_start, delta) -> iterator_t + * static: + * header_size() -> node_offset_t + * estimate_insert(key, value) -> node_offset_t + */ + using iterator_t = _iterator_t; + /* TODO: detailed comments + * - trim_until(mut) -> trim_size + * * keep 0 to i - 1, and remove the rest, return the size trimmed. + * * if this is the end iterator, do nothing and return 0. + * * if this is the start iterator, normally needs to go to the higher + * stage to trim the entire container. + * - trim_at(mut, trimmed) -> trim_size + * * trim happens inside the current iterator, causing the size reduced by + * <trimmed>, return the total size trimmed. + */ + + /* + * Lookup internals (hide?) 
+ */ + + template + static result_t smallest_result( + const iterator_t& iter, full_key_t* index_key) { + static_assert(!IS_BOTTOM); + assert(!iter.is_end()); + auto pos_smallest = NXT_STAGE_T::position_t::begin(); + auto nxt_container = iter.get_nxt_container(); + auto value_ptr = NXT_STAGE_T::template get_p_value( + nxt_container, pos_smallest, index_key); + if constexpr (GET_KEY) { + index_key->set(iter.get_key()); + } + return result_t{{iter.index(), pos_smallest}, value_ptr, STAGE}; + } + + template + static result_t nxt_lower_bound( + const full_key_t& key, iterator_t& iter, + MatchHistory& history, full_key_t* index_key) { + static_assert(!IS_BOTTOM); + assert(!iter.is_end()); + auto nxt_container = iter.get_nxt_container(); + auto nxt_result = NXT_STAGE_T::template lower_bound( + nxt_container, key, history, index_key); + if (nxt_result.is_end()) { + if (iter.is_last()) { + return result_t::end(); + } else { + return smallest_result(++iter, index_key); + } + } else { + if constexpr (GET_KEY) { + index_key->set(iter.get_key()); + } + return result_t::from_nxt(iter.index(), nxt_result); + } + } + + template + static void lookup_largest_slot( + const container_t& container, position_t* p_position, + full_key_t* p_index_key, const value_t** pp_value) { + auto iter = iterator_t(container); + iter.seek_last(); + if constexpr (GET_KEY) { + assert(p_index_key); + p_index_key->set(iter.get_key()); + } + if constexpr (GET_POS) { + assert(p_position); + p_position->index = iter.index(); + } + if constexpr (IS_BOTTOM) { + if constexpr (GET_VAL) { + assert(pp_value); + *pp_value = iter.get_p_value(); + } + } else { + auto nxt_container = iter.get_nxt_container(); + if constexpr (GET_POS) { + NXT_STAGE_T::template lookup_largest_slot( + nxt_container, &p_position->nxt, p_index_key, pp_value); + } else { + NXT_STAGE_T::template lookup_largest_slot( + nxt_container, nullptr, p_index_key, pp_value); + } + } + } + + template + static const value_t* get_p_value( + const 
container_t& container, const position_t& position, + full_key_t* index_key = nullptr) { + auto iter = iterator_t(container); + iter.seek_at(position.index); + if constexpr (GET_KEY) { + index_key->set(iter.get_key()); + } + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + return NXT_STAGE_T::template get_p_value( + nxt_container, position.nxt, index_key); + } else { + return iter.get_p_value(); + } + } + + static void get_key_view( + const container_t& container, + const position_t& position, + full_key_t& index_key) { + auto iter = iterator_t(container); + iter.seek_at(position.index); + index_key.set(iter.get_key()); + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + return NXT_STAGE_T::get_key_view(nxt_container, position.nxt, index_key); + } + } + + template + static result_t lower_bound( + const container_t& container, + const full_key_t& key, + MatchHistory& history, + full_key_t* index_key = nullptr) { + bool exclude_last = false; + if (history.get().has_value()) { + if (*history.get() == MatchKindCMP::EQ) { + // lookup is short-circuited + if constexpr (!IS_BOTTOM) { + assert(history.get().has_value()); + if (history.is_GT()) { + auto iter = iterator_t(container); + bool test_key_equal; + if constexpr (STAGE == STAGE_STRING) { + // TODO(cross-node string dedup) + // test_key_equal = (iter.get_key().type() == ns_oid_view_t::Type::MIN); + auto cmp = compare_to(key, iter.get_key()); + assert(cmp != MatchKindCMP::GT); + test_key_equal = (cmp == MatchKindCMP::EQ); + } else { + auto cmp = compare_to(key, iter.get_key()); + // From history, key[stage] == parent[stage][index - 1] + // which should be the smallest possible value for all + // index[stage][*] + assert(cmp != MatchKindCMP::GT); + test_key_equal = (cmp == MatchKindCMP::EQ); + } + if (test_key_equal) { + return nxt_lower_bound(key, iter, history, index_key); + } else { + // key[stage] < index[stage][left-most] + return smallest_result(iter, 
index_key); + } + } + } + // IS_BOTTOM || !history.is_GT() + auto iter = iterator_t(container); + iter.seek_last(); + if constexpr (STAGE == STAGE_STRING) { + // TODO(cross-node string dedup) + // assert(iter.get_key().type() == ns_oid_view_t::Type::MAX); + assert(compare_to(key, iter.get_key()) == MatchKindCMP::EQ); + } else { + assert(compare_to(key, iter.get_key()) == MatchKindCMP::EQ); + } + if constexpr (GET_KEY) { + index_key->set(iter.get_key()); + } + if constexpr (IS_BOTTOM) { + auto value_ptr = iter.get_p_value(); + return result_t{{iter.index()}, value_ptr, MSTAT_EQ}; + } else { + auto nxt_container = iter.get_nxt_container(); + auto nxt_result = NXT_STAGE_T::template lower_bound( + nxt_container, key, history, index_key); + // !history.is_GT() means + // key[stage+1 ...] <= index[stage+1 ...][*] + assert(!nxt_result.is_end()); + return result_t::from_nxt(iter.index(), nxt_result); + } + } else if (*history.get() == MatchKindCMP::LT) { + exclude_last = true; + } + } + auto iter = iterator_t(container); + auto bs_match = iter.seek(key, exclude_last); + if (iter.is_end()) { + assert(!exclude_last); + assert(bs_match == MatchKindBS::NE); + history.set(MatchKindCMP::GT); + return result_t::end(); + } + history.set(bs_match == MatchKindBS::EQ ? + MatchKindCMP::EQ : MatchKindCMP::LT); + if constexpr (IS_BOTTOM) { + if constexpr (GET_KEY) { + index_key->set(iter.get_key()); + } + auto value_ptr = iter.get_p_value(); + return result_t{{iter.index()}, value_ptr, + (bs_match == MatchKindBS::EQ ? 
MSTAT_EQ : MSTAT_LT0)}; + } else { + if (bs_match == MatchKindBS::EQ) { + return nxt_lower_bound(key, iter, history, index_key); + } else { + return smallest_result(iter, index_key); + } + } + } + + template + static node_offset_t insert_size(const full_key_t& key, const value_t& value) { + if constexpr (IS_BOTTOM) { + return iterator_t::template estimate_insert(key, value); + } else { + return iterator_t::template estimate_insert(key, value) + + NXT_STAGE_T::iterator_t::header_size() + + NXT_STAGE_T::template insert_size(key, value); + } + } + + template + static node_offset_t insert_size_at( + match_stage_t stage, const full_key_t& key, const value_t& value) { + if (stage == STAGE) { + return insert_size(key, value); + } else { + assert(stage < STAGE); + return NXT_STAGE_T::template insert_size_at(stage, key, value); + } + } + + template > + static std::enable_if_t evaluate_insert( + const container_t& container, const full_key_t& key, + const value_t& value, position_t& position, bool evaluate_last) { + auto iter = iterator_t(container); + auto& index = position.index; + if (evaluate_last || index == INDEX_END) { + iter.seek_last(); + index = iter.index(); + // evaluate the previous index + } else { + assert(is_valid_index(index)); + // evaluate the current index + iter.seek_at(index); + auto match = compare_to(key, iter.get_key()); + if (match == MatchKindCMP::EQ) { + if constexpr (IS_BOTTOM) { + ceph_abort("insert conflict at current index!"); + } else { + // insert into the current index + auto nxt_container = iter.get_nxt_container(); + return NXT_STAGE_T::evaluate_insert( + nxt_container, key, value, position.nxt, false); + } + } else { + assert(match == MatchKindCMP::LT); + if (index == 0) { + // already the first index, so insert at the current index + return {STAGE, insert_size(key, value)}; + } + --index; + iter = iterator_t(container); + iter.seek_at(index); + // proceed to evaluate the previous index + } + } + + // XXX(multi-type): when key is from a 
different type of node + auto match = compare_to(key, iter.get_key()); + if (match == MatchKindCMP::GT) { + // key doesn't match both indexes, so insert at the current index + ++index; + return {STAGE, insert_size(key, value)}; + } else { + assert(match == MatchKindCMP::EQ); + if constexpr (IS_BOTTOM) { + // ceph_abort? + ceph_abort("insert conflict at the previous index!"); + } else { + // insert into the previous index + auto nxt_container = iter.get_nxt_container(); + return NXT_STAGE_T::evaluate_insert( + nxt_container, key, value, position.nxt, true); + } + } + } + + template + static std::enable_if_t + compensate_insert_position_at(match_stage_t stage, position_t& position) { + auto& index = position.index; + if (stage == STAGE) { + assert(index == 0); + // insert at the end of the current stage + index = INDEX_END; + return true; + } else { + if constexpr (IS_BOTTOM) { + ceph_abort("impossible path"); + } else { + assert(stage < STAGE); + bool compensate = NXT_STAGE_T:: + compensate_insert_position_at(stage, position.nxt); + if (compensate) { + assert(is_valid_index(index)); + if (index == 0) { + // insert into the *last* index of the current stage + index = INDEX_LAST; + return true; + } else { + --index; + return false; + } + } else { + return false; + } + } + } + } + + static void patch_insert_end(position_t& insert_pos, match_stage_t insert_stage) { + assert(insert_stage <= STAGE); + if (insert_stage == STAGE) { + insert_pos.index = INDEX_END; + } else if constexpr (!IS_BOTTOM) { + insert_pos.index = INDEX_LAST; + NXT_STAGE_T::patch_insert_end(insert_pos.nxt, insert_stage); + } + } + + template > + static std::enable_if_t evaluate_insert( + const full_key_t& key, const onode_t& value, + const MatchHistory& history, match_stat_t mstat, position_t& position) { + match_stage_t insert_stage = STAGE_TOP; + while (*history.get_by_stage(insert_stage) == MatchKindCMP::EQ) { + assert(insert_stage != STAGE_BOTTOM && "insert conflict!"); + --insert_stage; + } + + 
if (history.is_GT()) { + if (position.is_end()) { + // no need to compensate insert position + assert(insert_stage <= STAGE && "impossible insert stage"); + } else if (position == position_t::begin()) { + // I must be short-circuited by staged::smallest_result() + // in staged::lower_bound(), so we need to rely on mstat instead + assert(mstat >= MSTAT_LT0 && mstat <= MSTAT_LT3); + if (mstat == MSTAT_LT0) { + insert_stage = STAGE_RIGHT; + } else if (mstat == MSTAT_LT1) { + insert_stage = STAGE_STRING; + } else { + insert_stage = STAGE_LEFT; + } + // XXX(multi-type): need to upgrade node type before inserting an + // incompatible index at front. + assert(insert_stage <= STAGE && "incompatible insert"); + } else { + assert(insert_stage <= STAGE && "impossible insert stage"); + [[maybe_unused]] bool ret = compensate_insert_position_at(insert_stage, position); + assert(!ret); + } + } + + if (position.is_end()) { + patch_insert_end(position, insert_stage); + } + + node_offset_t insert_size = insert_size_at(insert_stage, key, value); + + return {insert_stage, insert_size}; + } + + template + static const value_t* insert_new( + NodeExtentMutable& mut, const memory_range_t& range, + const full_key_t& key, const value_t& value) { + char* p_insert = const_cast(range.p_end); + const value_t* p_value = nullptr; + StagedAppender appender; + appender.init(&mut, p_insert); + appender.append(key, value, p_value); + [[maybe_unused]] const char* p_insert_front = appender.wrap(); + assert(p_insert_front == range.p_start); + return p_value; + } + + template + static const value_t* proceed_insert_recursively( + NodeExtentMutable& mut, const container_t& container, + const full_key_t& key, const value_t& value, + position_t& position, match_stage_t& stage, + node_offset_t& _insert_size, const char* p_left_bound) { + // proceed insert from right to left + assert(stage <= STAGE); + auto iter = iterator_t(container); + auto& index = position.index; + + bool do_insert = false; + if (stage == 
STAGE) { + if (index == INDEX_END) { + iter.seek_last(); + iter.set_end(); + index = iter.index(); + } else { + assert(is_valid_index(index)); + iter.seek_till_end(index); + } + do_insert = true; + } else { // stage < STAGE + if (index == INDEX_LAST) { + iter.seek_last(); + index = iter.index(); + } else { + assert(is_valid_index(index)); + iter.seek_till_end(index); + } + if constexpr (SPLIT) { + if (iter.is_end()) { + // insert at the higher stage due to split + do_insert = true; + _insert_size = insert_size(key, value); + stage = STAGE; + } + } else { + assert(!iter.is_end()); + } + } + + if (do_insert) { + if constexpr (!IS_BOTTOM) { + position.nxt = position_t::nxt_t::begin(); + } + assert(_insert_size == insert_size(key, value)); + if constexpr (IS_BOTTOM) { + return iter.template insert( + mut, key, value, _insert_size, p_left_bound); + } else { + auto range = iter.template insert_prefix( + mut, key, _insert_size, p_left_bound); + return NXT_STAGE_T::template insert_new(mut, range, key, value); + } + } else { + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + auto p_value = NXT_STAGE_T::template proceed_insert_recursively( + mut, nxt_container, key, value, + position.nxt, stage, _insert_size, p_left_bound); + iter.update_size(mut, _insert_size); + return p_value; + } else { + ceph_abort("impossible path"); + } + } + } + + template + static const value_t* proceed_insert( + NodeExtentMutable& mut, const container_t& container, + const full_key_t& key, const value_t& value, + position_t& position, match_stage_t& stage, node_offset_t& _insert_size) { + auto p_left_bound = container.p_left_bound(); + if (unlikely(!container.keys())) { + if (position.is_end()) { + position = position_t::begin(); + assert(stage == STAGE); + assert(_insert_size == insert_size(key, value)); + } else if (position == position_t::begin()) { + // when insert into a trimmed and empty left node + stage = STAGE; + _insert_size = insert_size(key, value); + } 
else { + ceph_abort("impossible path"); + } + if constexpr (IS_BOTTOM) { + return container_t::template insert_at( + mut, container, key, value, 0, _insert_size, p_left_bound); + } else { + auto range = container_t::template insert_prefix_at( + mut, container, key, 0, _insert_size, p_left_bound); + return NXT_STAGE_T::template insert_new(mut, range, key, value); + } + } else { + return proceed_insert_recursively( + mut, container, key, value, + position, stage, _insert_size, p_left_bound); + } + } + + static std::ostream& dump(const container_t& container, + std::ostream& os, + const std::string& prefix, + size_t& size, + const char* p_start) { + auto iter = iterator_t(container); + assert(!iter.is_end()); + std::string prefix_blank(prefix.size(), ' '); + const std::string* p_prefix = &prefix; + size += iterator_t::header_size(); + do { + std::ostringstream sos; + sos << *p_prefix << iter.get_key() << ": "; + std::string i_prefix = sos.str(); + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + size += iter.size_to_nxt(); + NXT_STAGE_T::dump(nxt_container, os, i_prefix, size, p_start); + } else { + auto value_ptr = iter.get_p_value(); + int offset = reinterpret_cast(value_ptr) - p_start; + size += iter.size(); + os << "\n" << i_prefix; + if constexpr (NODE_TYPE == node_type_t::LEAF) { + os << *value_ptr; + } else { + os << "0x" << std::hex << value_ptr->value << std::dec; + } + os << " " << size << "B" + << " @" << offset << "B"; + } + if (iter.is_last()) { + break; + } else { + ++iter; + p_prefix = &prefix_blank; + } + } while (true); + return os; + } + + static void validate(const container_t& container) { + auto iter = iterator_t(container); + assert(!iter.is_end()); + auto key = iter.get_key(); + do { + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + NXT_STAGE_T::validate(nxt_container); + } + if (iter.is_last()) { + break; + } else { + ++iter; + assert(compare_to(key, iter.get_key()) == 
MatchKindCMP::LT); + key = iter.get_key(); + } + } while (true); + } + + static void get_stats(const container_t& container, node_stats_t& stats, + full_key_t& index_key) { + auto iter = iterator_t(container); + assert(!iter.is_end()); + stats.size_overhead += iterator_t::header_size(); + do { + index_key.replace(iter.get_key()); + stats.size_overhead += iter.size_overhead(); + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + NXT_STAGE_T::get_stats(nxt_container, stats, index_key); + } else { + ++stats.num_kvs; + size_t kv_logical_size = index_key.size_logical(); + size_t value_size; + if constexpr (NODE_TYPE == node_type_t::LEAF) { + value_size = iter.get_p_value()->size; + } else { + value_size = sizeof(value_t); + } + stats.size_value += value_size; + kv_logical_size += value_size; + stats.size_logical += kv_logical_size; + } + if (iter.is_last()) { + break; + } else { + ++iter; + } + } while (true); + } + + static bool next_position(const container_t& container, position_t& pos) { + auto iter = iterator_t(container); + assert(!iter.is_end()); + iter.seek_at(pos.index); + bool find_next; + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + find_next = NXT_STAGE_T::next_position(nxt_container, pos.nxt); + } else { + find_next = true; + } + if (find_next) { + if (iter.is_last()) { + return true; + } else { + pos.index = iter.index() + 1; + if constexpr (!IS_BOTTOM) { + pos.nxt = NXT_STAGE_T::position_t::begin(); + } + return false; + } + } else { + return false; + } + } + + struct _BaseEmpty {}; + class _BaseWithNxtIterator { + protected: + typename NXT_STAGE_T::StagedIterator _nxt; + }; + class StagedIterator + : std::conditional_t { + public: + StagedIterator() = default; + bool valid() const { return iter.has_value(); } + index_t index() const { + return iter->index(); + } + bool is_end() const { return iter->is_end(); } + bool in_progress() const { + assert(valid()); + if constexpr (!IS_BOTTOM) { + if 
(this->_nxt.valid()) { + if (this->_nxt.index() == 0) { + return this->_nxt.in_progress(); + } else { + return true; + } + } else { + return false; + } + } else { + return false; + } + } + key_get_type get_key() const { return iter->get_key(); } + + iterator_t& get() { return *iter; } + void set(const container_t& container) { + assert(!valid()); + iter = iterator_t(container); + } + void set_end() { iter->set_end(); } + typename NXT_STAGE_T::StagedIterator& nxt() { + if constexpr (!IS_BOTTOM) { + if (!this->_nxt.valid()) { + auto nxt_container = iter->get_nxt_container(); + this->_nxt.set(nxt_container); + } + return this->_nxt; + } else { + ceph_abort("impossible path"); + } + } + typename NXT_STAGE_T::StagedIterator& get_nxt() { + if constexpr (!IS_BOTTOM) { + return this->_nxt; + } else { + ceph_abort("impossible path"); + } + } + StagedIterator& operator++() { + if (iter->is_last()) { + iter->set_end(); + } else { + ++(*iter); + } + if constexpr (!IS_BOTTOM) { + this->_nxt.reset(); + } + return *this; + } + void reset() { + if (valid()) { + iter.reset(); + if constexpr (!IS_BOTTOM) { + this->_nxt.reset(); + } + } + } + std::ostream& print(std::ostream& os, bool is_top) const { + if (valid()) { + if (iter->is_end()) { + return os << "END"; + } else { + os << index(); + } + } else { + if (is_top) { + return os << "invalid StagedIterator!"; + } else { + os << "0!"; + } + } + if constexpr (!IS_BOTTOM) { + os << ", "; + return this->_nxt.print(os, false); + } else { + return os; + } + } + position_t get_pos() const { + if (valid()) { + if constexpr (IS_BOTTOM) { + return position_t{index()}; + } else { + return position_t{index(), this->_nxt.get_pos()}; + } + } else { + return position_t::begin(); + } + } + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + uint8_t present = static_cast(iter); + ceph::encode(present, encoded); + if (iter.has_value()) { + iter->encode(p_node_start, encoded); + if constexpr (!IS_BOTTOM) { + 
this->_nxt.encode(p_node_start, encoded); + } + } + } + static StagedIterator decode(const char* p_node_start, + ceph::bufferlist::const_iterator& delta) { + StagedIterator ret; + uint8_t present; + ceph::decode(present, delta); + if (present) { + ret.iter = iterator_t::decode(p_node_start, delta); + if constexpr (!IS_BOTTOM) { + ret._nxt = NXT_STAGE_T::StagedIterator::decode(p_node_start, delta); + } + } + return ret; + } + friend std::ostream& operator<<(std::ostream& os, const StagedIterator& iter) { + return iter.print(os, true); + } + private: + std::optional iter; + }; + + static bool recursively_locate_split( + size_t& current_size, size_t extra_size, + size_t target_size, StagedIterator& split_at) { + assert(current_size <= target_size); + iterator_t& split_iter = split_at.get(); + current_size = split_iter.seek_split(current_size, extra_size, target_size); + assert(current_size <= target_size); + assert(!split_iter.is_end()); + if (split_iter.index() == 0) { + extra_size += iterator_t::header_size(); + } else { + extra_size = 0; + } + bool locate_nxt; + if constexpr (!IS_BOTTOM) { + locate_nxt = NXT_STAGE_T::recursively_locate_split( + current_size, extra_size + split_iter.size_to_nxt(), + target_size, split_at.nxt()); + } else { // IS_BOTTOM + // located upper_bound, fair split strategy + size_t nxt_size = split_iter.size() + extra_size; + assert(current_size + nxt_size > target_size); + if (current_size + nxt_size/2 < target_size) { + // include next + current_size += nxt_size; + locate_nxt = true; + } else { + // exclude next + locate_nxt = false; + } + } + if (locate_nxt) { + if (split_iter.is_last()) { + return true; + } else { + ++split_at; + return false; + } + } else { + return false; + } + } + + static bool recursively_locate_split_inserted( + size_t& current_size, size_t extra_size, size_t target_size, + position_t& insert_pos, match_stage_t insert_stage, size_t insert_size, + std::optional& is_insert_left, StagedIterator& split_at) { + 
assert(current_size <= target_size); + assert(!is_insert_left.has_value()); + iterator_t& split_iter = split_at.get(); + auto& insert_index = insert_pos.index; + if (insert_stage == STAGE) { + current_size = split_iter.template seek_split_inserted( + current_size, extra_size, target_size, + insert_index, insert_size, is_insert_left); + assert(is_insert_left.has_value()); + assert(current_size <= target_size); + if (split_iter.index() == 0) { + if (insert_index == 0) { + if (*is_insert_left == false) { + extra_size += iterator_t::header_size(); + } else { + extra_size = 0; + } + } else { + extra_size += iterator_t::header_size(); + } + } else { + extra_size = 0; + } + if (*is_insert_left == false && split_iter.index() == insert_index) { + // split_iter can be end + // found the lower-bound of target_size + // ...[s_index-1] |!| (i_index) [s_index]... + + // located upper-bound, fair split strategy + // look at the next slot (the insert item) + size_t nxt_size = insert_size + extra_size; + assert(current_size + nxt_size > target_size); + if (current_size + nxt_size/2 < target_size) { + // include next + *is_insert_left = true; + current_size += nxt_size; + if (split_iter.is_end()) { + // ...[s_index-1] (i_index) |!| + return true; + } else { + return false; + } + } else { + // exclude next + return false; + } + } else { + // Already considered insert effect in the current stage. + // Look into the next stage to identify the target_size lower-bound w/o + // insert effect. 
+ assert(!split_iter.is_end()); + bool locate_nxt; + if constexpr (!IS_BOTTOM) { + locate_nxt = NXT_STAGE_T::recursively_locate_split( + current_size, extra_size + split_iter.size_to_nxt(), + target_size, split_at.nxt()); + } else { // IS_BOTTOM + // located upper-bound, fair split strategy + // look at the next slot + size_t nxt_size = split_iter.size() + extra_size; + assert(current_size + nxt_size > target_size); + if (current_size + nxt_size/2 < target_size) { + // include next + current_size += nxt_size; + locate_nxt = true; + } else { + // exclude next + locate_nxt = false; + } + } + if (locate_nxt) { + if (split_iter.is_last()) { + auto end_index = split_iter.index() + 1; + if (insert_index == INDEX_END) { + insert_index = end_index; + } + assert(insert_index <= end_index); + if (insert_index == end_index) { + assert(*is_insert_left == false); + split_iter.set_end(); + // ...[s_index-1] |!| (i_index) + return false; + } else { + assert(*is_insert_left == true); + return true; + } + } else { + ++split_at; + return false; + } + } else { + return false; + } + } + } else { + if constexpr (!IS_BOTTOM) { + assert(insert_stage < STAGE); + current_size = split_iter.template seek_split_inserted( + current_size, extra_size, target_size, + insert_index, insert_size, is_insert_left); + assert(!split_iter.is_end()); + assert(current_size <= target_size); + if (split_iter.index() == 0) { + extra_size += iterator_t::header_size(); + } else { + extra_size = 0; + } + bool locate_nxt; + if (!is_insert_left.has_value()) { + // Considered insert effect in the current stage, and insert happens + // in the lower stage. + // Look into the next stage to identify the target_size lower-bound w/ + // insert effect. 
+ assert(split_iter.index() == insert_index); + locate_nxt = NXT_STAGE_T::recursively_locate_split_inserted( + current_size, extra_size + split_iter.size_to_nxt(), target_size, + insert_pos.nxt, insert_stage, insert_size, + is_insert_left, split_at.nxt()); + assert(is_insert_left.has_value()); +#ifndef NDEBUG + if (locate_nxt) { + assert(*is_insert_left == true); + } +#endif + } else { + // is_insert_left.has_value() == true + // Insert will *not* happen in the lower stage. + // Need to look into the next stage to identify the target_size + // lower-bound w/ insert effect + assert(split_iter.index() != insert_index); + locate_nxt = NXT_STAGE_T::recursively_locate_split( + current_size, extra_size + split_iter.size_to_nxt(), + target_size, split_at.nxt()); +#ifndef NDEBUG + if (split_iter.index() < insert_index) { + assert(*is_insert_left == false); + } else { + assert(*is_insert_left == true); + } +#endif + } + if (locate_nxt) { + if (split_iter.is_last()) { + return true; + } else { + ++split_at; + return false; + } + } else { + return false; + } + } else { + ceph_abort("impossible path"); + return false;; + } + } + } + + /* + * container appender type system + * container_t::Appender(NodeExtentMutable& mut, char* p_append) + * append(const container_t& src, index_t from, index_t items) + * wrap() -> char* + * IF !IS_BOTTOM: + * open_nxt(const key_get_type&) + * open_nxt(const full_key_t&) + * -> std::tuple + * wrap_nxt(char* p_append) + * ELSE + * append(const full_key_t& key, const value_t& value) + */ + template + struct _BaseWithNxtAppender { + typename NXT_STAGE_T::template StagedAppender _nxt; + }; + template + class StagedAppender + : std::conditional_t> { + public: + StagedAppender() = default; + ~StagedAppender() { + assert(!require_wrap_nxt); + assert(!valid()); + } + bool valid() const { return appender.has_value(); } + index_t index() const { + assert(valid()); + return _index; + } + bool in_progress() const { return require_wrap_nxt; } + // TODO: pass 
by reference + void init(NodeExtentMutable* p_mut, char* p_start) { + assert(!valid()); + appender = typename container_t::template Appender(p_mut, p_start); + _index = 0; + } + // possible to make src_iter end if to_index == INDEX_END + void append_until(StagedIterator& src_iter, index_t& to_index) { + assert(!require_wrap_nxt); + auto s_index = src_iter.index(); + src_iter.get().template copy_out_until(*appender, to_index); + assert(src_iter.index() == to_index); + assert(to_index >= s_index); + auto increment = (to_index - s_index); + if (increment) { + _index += increment; + if constexpr (!IS_BOTTOM) { + src_iter.get_nxt().reset(); + } + } + } + void append(const full_key_t& key, + const value_t& value, const value_t*& p_value) { + assert(!require_wrap_nxt); + if constexpr (!IS_BOTTOM) { + auto& nxt = open_nxt(key); + nxt.append(key, value, p_value); + wrap_nxt(); + } else { + appender->append(key, value, p_value); + ++_index; + } + } + char* wrap() { + assert(valid()); + assert(_index > 0); + if constexpr (!IS_BOTTOM) { + if (require_wrap_nxt) { + wrap_nxt(); + } + } + auto ret = appender->wrap(); + appender.reset(); + return ret; + } + typename NXT_STAGE_T::template StagedAppender& + open_nxt(key_get_type paritial_key) { + assert(!require_wrap_nxt); + if constexpr (!IS_BOTTOM) { + require_wrap_nxt = true; + auto [p_mut, p_append] = appender->open_nxt(paritial_key); + this->_nxt.init(p_mut, p_append); + return this->_nxt; + } else { + ceph_abort("impossible path"); + } + } + typename NXT_STAGE_T::template StagedAppender& + open_nxt(const full_key_t& key) { + assert(!require_wrap_nxt); + if constexpr (!IS_BOTTOM) { + require_wrap_nxt = true; + auto [p_mut, p_append] = appender->open_nxt(key); + this->_nxt.init(p_mut, p_append); + return this->_nxt; + } else { + ceph_abort("impossible path"); + } + } + typename NXT_STAGE_T::template StagedAppender& get_nxt() { + if constexpr (!IS_BOTTOM) { + assert(require_wrap_nxt); + return this->_nxt; + } else { + 
ceph_abort("impossible path"); + } + } + void wrap_nxt() { + if constexpr (!IS_BOTTOM) { + assert(require_wrap_nxt); + require_wrap_nxt = false; + auto p_append = this->_nxt.wrap(); + appender->wrap_nxt(p_append); + ++_index; + } else { + ceph_abort("impossible path"); + } + } + private: + std::optional> appender; + index_t _index; + bool require_wrap_nxt = false; + }; + + template + static void _append_range( + StagedIterator& src_iter, StagedAppender& appender, index_t& to_index) { + if (src_iter.is_end()) { + // append done + assert(to_index == INDEX_END); + to_index = src_iter.index(); + } else if constexpr (!IS_BOTTOM) { + if (appender.in_progress()) { + // appender has appended something at the current item, + // cannot append the current item as-a-whole + index_t to_index_nxt = INDEX_END; + NXT_STAGE_T::template _append_range( + src_iter.nxt(), appender.get_nxt(), to_index_nxt); + ++src_iter; + appender.wrap_nxt(); + } else if (src_iter.in_progress()) { + // src_iter is not at the beginning of the current item, + // cannot append the current item as-a-whole + index_t to_index_nxt = INDEX_END; + NXT_STAGE_T::template _append_range( + src_iter.nxt(), appender.open_nxt(src_iter.get_key()), to_index_nxt); + ++src_iter; + appender.wrap_nxt(); + } else { + // we can safely append the current item as-a-whole + } + } + appender.append_until(src_iter, to_index); + } + + template + static void _append_into(StagedIterator& src_iter, StagedAppender& appender, + position_t& position, match_stage_t stage) { + assert(position.index == src_iter.index()); + // reaches the last item + if (stage == STAGE) { + // done, end recursion + if constexpr (!IS_BOTTOM) { + position.nxt = position_t::nxt_t::begin(); + } + } else { + assert(stage < STAGE); + // proceed append in the next stage + NXT_STAGE_T::template append_until( + src_iter.nxt(), appender.open_nxt(src_iter.get_key()), + position.nxt, stage); + } + } + + template + static void append_until(StagedIterator& src_iter, 
StagedAppender& appender, + position_t& position, match_stage_t stage) { + index_t from_index = src_iter.index(); + index_t& to_index = position.index; + assert(from_index <= to_index); + if constexpr (IS_BOTTOM) { + assert(stage == STAGE); + appender.append_until(src_iter, to_index); + } else { + assert(stage <= STAGE); + if (src_iter.index() == to_index) { + _append_into(src_iter, appender, position, stage); + } else { + if (to_index == INDEX_END) { + assert(stage == STAGE); + } else if (to_index == INDEX_LAST) { + assert(stage < STAGE); + } + _append_range(src_iter, appender, to_index); + _append_into(src_iter, appender, position, stage); + } + } + to_index -= from_index; + } + + template + static bool append_insert( + const full_key_t& key, const value_t& value, + StagedIterator& src_iter, StagedAppender& appender, + bool is_front_insert, match_stage_t& stage, const value_t*& p_value) { + assert(src_iter.valid()); + if (stage == STAGE) { + appender.append(key, value, p_value); + if (src_iter.is_end()) { + return true; + } else { + return false; + } + } else { + assert(stage < STAGE); + if constexpr (!IS_BOTTOM) { + auto nxt_is_end = NXT_STAGE_T::template append_insert( + key, value, src_iter.get_nxt(), appender.get_nxt(), + is_front_insert, stage, p_value); + if (nxt_is_end) { + appender.wrap_nxt(); + ++src_iter; + if (is_front_insert) { + stage = STAGE; + } + if (src_iter.is_end()) { + return true; + } + } + return false; + } else { + ceph_abort("impossible path"); + } + } + } + + /* TrimType: + * BEFORE: remove the entire container, normally means the according higher + * stage iterator needs to be trimmed as-a-whole. + * AFTER: retain the entire container, normally means the trim should be + * start from the next iterator at the higher stage. + * AT: trim happens in the current container, and the according higher + * stage iterator needs to be adjusted by the trimmed size. 
+ */ + static std::tuple + recursively_trim(NodeExtentMutable& mut, StagedIterator& trim_at) { + if (!trim_at.valid()) { + return {TrimType::BEFORE, 0u}; + } + if (trim_at.is_end()) { + return {TrimType::AFTER, 0u}; + } + + auto& iter = trim_at.get(); + if constexpr (!IS_BOTTOM) { + auto [type, trimmed] = NXT_STAGE_T::recursively_trim( + mut, trim_at.get_nxt()); + node_offset_t trim_size; + if (type == TrimType::AFTER) { + if (iter.is_last()) { + return {TrimType::AFTER, 0u}; + } + ++trim_at; + trim_size = iter.trim_until(mut); + } else if (type == TrimType::BEFORE) { + if (iter.index() == 0) { + return {TrimType::BEFORE, 0u}; + } + trim_size = iter.trim_until(mut); + } else { + trim_size = iter.trim_at(mut, trimmed); + } + return {TrimType::AT, trim_size}; + } else { + if (iter.index() == 0) { + return {TrimType::BEFORE, 0u}; + } else { + auto trimmed = iter.trim_until(mut); + return {TrimType::AT, trimmed}; + } + } + } + + static void trim(NodeExtentMutable& mut, StagedIterator& trim_at) { + auto [type, trimmed] = recursively_trim(mut, trim_at); + if (type == TrimType::BEFORE) { + assert(trim_at.valid()); + auto& iter = trim_at.get(); + iter.trim_until(mut); + } + } +}; + +/** + * Configurations for struct staged + * + * staged_params_* assembles different container_t implementations (defined by + * staged::_iterator_t) by STAGE, and constructs the final multi-stage + * implementations for different node layouts defined by + * node_extent_t. + * + * The specialized implementations for different layouts are accessible through + * the helper type node_to_stage_t>.
+ * + * Specifically, the settings of 8 layouts are: + * + * The layout (N0, LEAF/INTERNAL) has 3 stages: + * - STAGE_LEFT: node_extent_t + * - STAGE_STRING: item_iterator_t + * - STAGE_RIGHT: sub_items_t + * + * The layout (N1, LEAF/INTERNAL) has 3 stages: + * - STAGE_LEFT: node_extent_t + * - STAGE_STRING: item_iterator_t + * - STAGE_RIGHT: sub_items_t + * + * The layout (N2, LEAF/INTERNAL) has 2 stages: + * - STAGE_STRING: node_extent_t + * - STAGE_RIGHT: sub_items_t + * + * The layout (N3, LEAF) has 1 stage: + * - STAGE_RIGHT: node_extent_t + * + * The layout (N3, INTERNAL) has 1 stage: + * - STAGE_RIGHT: node_extent_t + */ + +template +struct staged_params_subitems { + using container_t = sub_items_t<_NODE_TYPE>; + static constexpr auto NODE_TYPE = _NODE_TYPE; + static constexpr auto STAGE = STAGE_RIGHT; + + // dummy type in order to make our type system work + // any better solution to get rid of this? + using next_param_t = staged_params_subitems; +}; + +template +struct staged_params_item_iterator { + using container_t = item_iterator_t<_NODE_TYPE>; + static constexpr auto NODE_TYPE = _NODE_TYPE; + static constexpr auto STAGE = STAGE_STRING; + + using next_param_t = staged_params_subitems; +}; + +template +struct staged_params_node_01 { + using container_t = NodeType; + static constexpr auto NODE_TYPE = NodeType::NODE_TYPE; + static constexpr auto STAGE = STAGE_LEFT; + + using next_param_t = staged_params_item_iterator; +}; + +template +struct staged_params_node_2 { + using container_t = NodeType; + static constexpr auto NODE_TYPE = NodeType::NODE_TYPE; + static constexpr auto STAGE = STAGE_STRING; + + using next_param_t = staged_params_subitems; +}; + +template +struct staged_params_node_3 { + using container_t = NodeType; + static constexpr auto NODE_TYPE = NodeType::NODE_TYPE; + static constexpr auto STAGE = STAGE_RIGHT; + + // dummy type in order to make our type system work + // any better solution to get rid of this? 
+ using next_param_t = staged_params_node_3; +}; + +template struct _node_to_stage_t; +template +struct _node_to_stage_t> { + using type = staged>; +}; +template +struct _node_to_stage_t> { + using type = staged>; +}; +template +struct _node_to_stage_t> { + using type = staged>; +}; +template +using node_to_stage_t = typename _node_to_stage_t::type; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h new file mode 100644 index 000000000..a9d5cef3b --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h @@ -0,0 +1,411 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include +#include +#include + +#include "crimson/os/seastore/onode_manager/staged-fltree/fwd.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/tree_types.h" + +namespace crimson::os::seastore::onode { + +using match_stage_t = int8_t; +constexpr match_stage_t STAGE_LEFT = 2; // shard/pool/crush +constexpr match_stage_t STAGE_STRING = 1; // nspace/oid +constexpr match_stage_t STAGE_RIGHT = 0; // snap/gen +constexpr auto STAGE_TOP = STAGE_LEFT; +constexpr auto STAGE_BOTTOM = STAGE_RIGHT; +constexpr bool is_valid_stage(match_stage_t stage) { + return std::clamp(stage, STAGE_BOTTOM, STAGE_TOP) == stage; +} +// TODO: replace by +// using match_history_t = int8_t; +// left_m, str_m, right_m +// 3: GT, +// 2: EQ, GT, +// 1: EQ, EQ, GT +// 0: EQ, EQ, EQ +// -1: EQ, EQ, LT +// -2: EQ, LT, +// -3: LT, + +struct MatchHistory { + template + const std::optional& get() const { + static_assert(is_valid_stage(STAGE)); + if constexpr (STAGE == STAGE_RIGHT) { + return right_match; + } else if (STAGE == STAGE_STRING) { + return string_match; + } else { + return left_match; + } + } + + const std::optional& + 
get_by_stage(match_stage_t stage) const { + assert(is_valid_stage(stage)); + if (stage == STAGE_RIGHT) { + return right_match; + } else if (stage == STAGE_STRING) { + return string_match; + } else { + return left_match; + } + } + + template + const bool is_GT() const; + + template + void set(MatchKindCMP match) { + static_assert(is_valid_stage(STAGE)); + if constexpr (STAGE < STAGE_TOP) { + assert(*get() == MatchKindCMP::EQ); + } + assert(!get().has_value() || *get() != MatchKindCMP::EQ); + const_cast&>(get()) = match; + } + + std::ostream& dump(std::ostream& os) const { + os << "history("; + dump_each(os, left_match) << ", "; + dump_each(os, string_match) << ", "; + dump_each(os, right_match) << ")"; + return os; + } + + std::ostream& dump_each( + std::ostream& os, const std::optional& match) const { + if (!match.has_value()) { + return os << "--"; + } else if (*match == MatchKindCMP::LT) { + return os << "LT"; + } else if (*match == MatchKindCMP::EQ) { + return os << "EQ"; + } else if (*match == MatchKindCMP::GT) { + return os << "GT"; + } else { + ceph_abort("impossble path"); + } + } + + std::optional left_match; + std::optional string_match; + std::optional right_match; +}; +inline std::ostream& operator<<(std::ostream& os, const MatchHistory& pos) { + return pos.dump(os); +} + +template +struct _check_GT_t { + static bool eval(const MatchHistory* history) { + return history->get() && + (*history->get() == MatchKindCMP::GT || + (*history->get() == MatchKindCMP::EQ && + _check_GT_t::eval(history))); + } +}; +template <> +struct _check_GT_t { + static bool eval(const MatchHistory* history) { + return history->get() && + *history->get() == MatchKindCMP::GT; + } +}; +template +const bool MatchHistory::is_GT() const { + static_assert(is_valid_stage(STAGE)); + if constexpr (STAGE < STAGE_TOP) { + assert(get() == MatchKindCMP::EQ); + } + return _check_GT_t::eval(this); +} + +template +struct staged_position_t { + static_assert(is_valid_stage(STAGE)); + using me_t = 
staged_position_t; + using nxt_t = staged_position_t; + bool is_end() const { + if (index == INDEX_END) { + return true; + } else { + assert(is_valid_index(index)); + return false; + } + } + index_t& index_by_stage(match_stage_t stage) { + assert(stage <= STAGE); + if (STAGE == stage) { + return index; + } else { + return nxt.index_by_stage(stage); + } + } + + int cmp(const me_t& o) const { + if (index > o.index) { + return 1; + } else if (index < o.index) { + return -1; + } else { + return nxt.cmp(o.nxt); + } + } + bool operator>(const me_t& o) const { return cmp(o) > 0; } + bool operator>=(const me_t& o) const { return cmp(o) >= 0; } + bool operator<(const me_t& o) const { return cmp(o) < 0; } + bool operator<=(const me_t& o) const { return cmp(o) <= 0; } + bool operator==(const me_t& o) const { return cmp(o) == 0; } + bool operator!=(const me_t& o) const { return cmp(o) != 0; } + + me_t& operator-=(const me_t& o) { + assert(is_valid_index(o.index)); + assert(index >= o.index); + if (index != INDEX_END) { + assert(is_valid_index(index)); + index -= o.index; + if (index == 0) { + nxt -= o.nxt; + } + } + return *this; + } + + void encode(ceph::bufferlist& encoded) const { + ceph::encode(index, encoded); + nxt.encode(encoded); + } + + static me_t decode(ceph::bufferlist::const_iterator& delta) { + me_t ret; + ceph::decode(ret.index, delta); + ret.nxt = nxt_t::decode(delta); + return ret; + } + + static me_t begin() { return {0u, nxt_t::begin()}; } + static me_t end() { + return {INDEX_END, nxt_t::end()}; + } + + index_t index; + nxt_t nxt; +}; +template +std::ostream& operator<<(std::ostream& os, const staged_position_t& pos) { + if (pos.index == INDEX_END) { + os << "END"; + } else if (pos.index == INDEX_LAST) { + os << "LAST"; + } else { + os << pos.index; + assert(is_valid_index(pos.index)); + } + return os << ", " << pos.nxt; +} + +template <> +struct staged_position_t { + using me_t = staged_position_t; + bool is_end() const { + if (index == INDEX_END) { + 
return true; + } else { + assert(is_valid_index(index)); + return false; + } + } + index_t& index_by_stage(match_stage_t stage) { + assert(stage == STAGE_BOTTOM); + return index; + } + + int cmp(const staged_position_t& o) const { + if (index > o.index) { + return 1; + } else if (index < o.index) { + return -1; + } else { + return 0; + } + } + bool operator>(const me_t& o) const { return cmp(o) > 0; } + bool operator>=(const me_t& o) const { return cmp(o) >= 0; } + bool operator<(const me_t& o) const { return cmp(o) < 0; } + bool operator<=(const me_t& o) const { return cmp(o) <= 0; } + bool operator==(const me_t& o) const { return cmp(o) == 0; } + bool operator!=(const me_t& o) const { return cmp(o) != 0; } + + me_t& operator-=(const me_t& o) { + assert(is_valid_index(o.index)); + assert(index >= o.index); + if (index != INDEX_END) { + assert(is_valid_index(index)); + index -= o.index; + } + return *this; + } + + void encode(ceph::bufferlist& encoded) const { + ceph::encode(index, encoded); + } + + static me_t decode(ceph::bufferlist::const_iterator& delta) { + me_t ret; + ceph::decode(ret.index, delta); + return ret; + } + + static me_t begin() { return {0u}; } + static me_t end() { return {INDEX_END}; } + + index_t index; +}; +template <> +inline std::ostream& operator<<(std::ostream& os, const staged_position_t& pos) { + if (pos.index == INDEX_END) { + os << "END"; + } else if (pos.index == INDEX_LAST) { + os << "LAST"; + } else { + os << pos.index; + assert(is_valid_index(pos.index)); + } + return os; +} + +using search_position_t = staged_position_t; + +template +const staged_position_t& cast_down(const search_position_t& pos) { + if constexpr (STAGE == STAGE_LEFT) { + return pos; + } else if constexpr (STAGE == STAGE_STRING) { +#ifndef NDEBUG + if (pos.is_end()) { + assert(pos.nxt.is_end()); + } else { + assert(pos.index == 0u); + } +#endif + return pos.nxt; + } else if constexpr (STAGE == STAGE_RIGHT) { +#ifndef NDEBUG + if (pos.is_end()) { + 
assert(pos.nxt.nxt.is_end()); + } else { + assert(pos.index == 0u); + assert(pos.nxt.index == 0u); + } +#endif + return pos.nxt.nxt; + } else { + ceph_abort("impossible path"); + } +} + +template +staged_position_t& cast_down(search_position_t& pos) { + const search_position_t& _pos = pos; + return const_cast&>(cast_down(_pos)); +} + +/** + * cast_down_fill_0 + * + * Descend from the top-stage position to the sub-position at STAGE, resetting + * the index of every stage above STAGE to 0 on the way down, and return a + * reference to that sub-position. + * + * The branches form one if-constexpr chain, so only the branch matching STAGE + * is instantiated. + */ +template +staged_position_t& cast_down_fill_0(search_position_t& pos) { + if constexpr (STAGE == STAGE_LEFT) { + return pos; + } else if constexpr (STAGE == STAGE_STRING) { + pos.index = 0; + return pos.nxt; + } else if constexpr (STAGE == STAGE_RIGHT) { + pos.index = 0; + pos.nxt.index = 0; + return pos.nxt.nxt; + } else { + ceph_abort("impossible path"); + } +} + +inline search_position_t&& normalize(search_position_t&& pos) { return std::move(pos); } + +template > +search_position_t normalize(staged_position_t&& pos) { + if (pos.is_end()) { + return search_position_t::end(); + } + if constexpr (STAGE == STAGE_STRING) { + return {0u, std::move(pos)}; + } else if (STAGE == STAGE_RIGHT) { + return {0u, {0u, std::move(pos)}}; + } else { + ceph_abort("impossible path"); + } +} + +struct memory_range_t { + const char* p_start; + const char* p_end; +}; + +enum class ContainerType { ITERATIVE, INDEXABLE }; + +template struct value_type; +template<> struct value_type { using type = laddr_packed_t; }; +template<> struct value_type { using type = onode_t; }; +template +using value_type_t = typename value_type::type; + +template +struct staged_result_t { + using me_t = staged_result_t; + bool is_end() const { return position.is_end(); } + + static me_t end() { + return {staged_position_t::end(), nullptr, MSTAT_END}; + } + template + static std::enable_if_t from_nxt( + index_t index, const staged_result_t& nxt_stage_result) { + return {{index, nxt_stage_result.position}, + nxt_stage_result.p_value, + nxt_stage_result.mstat}; + } + + staged_position_t position; + const value_type_t* p_value; + match_stat_t mstat; +}; + +template +using
lookup_result_t = staged_result_t; + +template +lookup_result_t&& normalize( + lookup_result_t&& result) { return std::move(result); } + +template > +lookup_result_t normalize( + staged_result_t&& result) { + // FIXME: assert result.mstat correct + return {normalize(std::move(result.position)), result.p_value, result.mstat}; +} + +struct node_stats_t { + size_t size_persistent = 0; + size_t size_filled = 0; + // filled by staged::get_stats() + size_t size_logical = 0; + size_t size_overhead = 0; + size_t size_value = 0; + unsigned num_kvs = 0; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc new file mode 100644 index 000000000..aaca6c3c6 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc @@ -0,0 +1,208 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "sub_items_stage.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h" + +namespace crimson::os::seastore::onode { + +template +const laddr_packed_t* internal_sub_items_t::insert_at( + NodeExtentMutable& mut, const internal_sub_items_t& sub_items, + const full_key_t& key, const laddr_packed_t& value, + index_t index, node_offset_t size, const char* p_left_bound) { + assert(index <= sub_items.keys()); + assert(size == estimate_insert(key, value)); + const char* p_shift_start = p_left_bound; + const char* p_shift_end = reinterpret_cast( + sub_items.p_first_item + 1 - index); + mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, -(int)size); + + auto p_insert = const_cast(p_shift_end) - size; + auto item = internal_sub_item_t{snap_gen_t::from_key(key), value}; + mut.copy_in_absolute(p_insert, item); + return &reinterpret_cast(p_insert)->value; +} +#define IA_TEMPLATE(KT) \ + template const laddr_packed_t* 
internal_sub_items_t::insert_at( \ + NodeExtentMutable&, const internal_sub_items_t&, const full_key_t&, \ + const laddr_packed_t&, index_t, node_offset_t, const char*) +IA_TEMPLATE(KeyT::VIEW); +IA_TEMPLATE(KeyT::HOBJ); + +node_offset_t internal_sub_items_t::trim_until( + NodeExtentMutable&, internal_sub_items_t& items, index_t index) { + assert(index != 0); + auto keys = items.keys(); + assert(index <= keys); + size_t ret = sizeof(internal_sub_item_t) * (keys - index); + assert(ret < NODE_BLOCK_SIZE); + return ret; +} + +template +void internal_sub_items_t::Appender::append( + const internal_sub_items_t& src, index_t from, index_t items) { + assert(from <= src.keys()); + if (items == 0) { + return; + } + assert(from < src.keys()); + assert(from + items <= src.keys()); + node_offset_t size = sizeof(internal_sub_item_t) * items; + p_append -= size; + p_mut->copy_in_absolute(p_append, src.p_first_item + 1 - from - items, size); +} + +template +void internal_sub_items_t::Appender::append( + const full_key_t& key, const laddr_packed_t& value, + const laddr_packed_t*& p_value) { + p_append -= sizeof(internal_sub_item_t); + auto item = internal_sub_item_t{snap_gen_t::from_key(key), value}; + p_mut->copy_in_absolute(p_append, item); + p_value = &reinterpret_cast(p_append)->value; +} + +template +const onode_t* leaf_sub_items_t::insert_at( + NodeExtentMutable& mut, const leaf_sub_items_t& sub_items, + const full_key_t& key, const onode_t& value, + index_t index, node_offset_t size, const char* p_left_bound) { + assert(index <= sub_items.keys()); + assert(size == estimate_insert(key, value)); + // a. [... item(index)] << size + const char* p_shift_start = p_left_bound; + const char* p_shift_end = sub_items.get_item_end(index); + mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, -(int)size); + + // b. 
insert item + auto p_insert = const_cast(p_shift_end - size); + auto p_value = reinterpret_cast(p_insert); + mut.copy_in_absolute(p_insert, &value, value.size); + p_insert += value.size; + mut.copy_in_absolute(p_insert, snap_gen_t::template from_key(key)); + assert(p_insert + sizeof(snap_gen_t) + sizeof(node_offset_t) == p_shift_end); + + // c. compensate affected offsets + auto item_size = value.size + sizeof(snap_gen_t); + for (auto i = index; i < sub_items.keys(); ++i) { + const node_offset_packed_t& offset_i = sub_items.get_offset(i); + mut.copy_in_absolute((void*)&offset_i, node_offset_t(offset_i.value + item_size)); + } + + // d. [item(index-1) ... item(0) ... offset(index)] <<< sizeof(node_offset_t) + const char* p_offset = (index == 0 ? + (const char*)&sub_items.get_offset(0) + sizeof(node_offset_t) : + (const char*)&sub_items.get_offset(index - 1)); + p_shift_start = p_shift_end; + p_shift_end = p_offset; + mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, -(int)sizeof(node_offset_t)); + + // e. insert offset + node_offset_t offset_to_item_start = item_size + sub_items.get_offset_to_end(index); + mut.copy_in_absolute( + const_cast(p_shift_end) - sizeof(node_offset_t), offset_to_item_start); + + // f. 
update num_sub_keys + mut.copy_in_absolute((void*)sub_items.p_num_keys, num_keys_t(sub_items.keys() + 1)); + + return p_value; +} +template const onode_t* leaf_sub_items_t::insert_at( + NodeExtentMutable&, const leaf_sub_items_t&, const full_key_t&, + const onode_t&, index_t, node_offset_t, const char*); + +node_offset_t leaf_sub_items_t::trim_until( + NodeExtentMutable& mut, leaf_sub_items_t& items, index_t index) { + assert(index != 0); + auto keys = items.keys(); + assert(index <= keys); + if (index == keys) { + return 0; + } + index_t trim_items = keys - index; + const char* p_items_start = items.p_start(); + const char* p_shift_start = items.get_item_end(index); + const char* p_shift_end = items.get_item_end(0); + size_t size_trim_offsets = sizeof(node_offset_t) * trim_items; + mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, + size_trim_offsets); + mut.copy_in_absolute((void*)items.p_num_keys, num_keys_t(index)); + size_t ret = size_trim_offsets + (p_shift_start - p_items_start); + assert(ret < NODE_BLOCK_SIZE); + return ret; +} + +template class internal_sub_items_t::Appender; +template class internal_sub_items_t::Appender; + +// helper type for the visitor +template struct overloaded : Ts... { using Ts::operator()...; }; +// explicit deduction guide +template overloaded(Ts...) 
-> overloaded; + +template +char* leaf_sub_items_t::Appender::wrap() { + auto p_cur = p_append; + num_keys_t num_keys = 0; + for (auto i = 0u; i < cnt; ++i) { + auto& a = appends[i]; + std::visit(overloaded { + [&] (const range_items_t& arg) { num_keys += arg.items; }, + [&] (const kv_item_t& arg) { ++num_keys; } + }, a); + } + assert(num_keys); + p_cur -= sizeof(num_keys_t); + p_mut->copy_in_absolute(p_cur, num_keys); + + node_offset_t last_offset = 0; + for (auto i = 0u; i < cnt; ++i) { + auto& a = appends[i]; + std::visit(overloaded { + [&] (const range_items_t& arg) { + int compensate = (last_offset - op_src->get_offset_to_end(arg.from)); + node_offset_t offset; + for (auto i = arg.from; i < arg.from + arg.items; ++i) { + offset = op_src->get_offset(i).value + compensate; + p_cur -= sizeof(node_offset_t); + p_mut->copy_in_absolute(p_cur, offset); + } + last_offset = offset; + }, + [&] (const kv_item_t& arg) { + last_offset += sizeof(snap_gen_t) + arg.p_value->size; + p_cur -= sizeof(node_offset_t); + p_mut->copy_in_absolute(p_cur, last_offset); + } + }, a); + } + + for (auto i = 0u; i < cnt; ++i) { + auto& a = appends[i]; + std::visit(overloaded { + [&] (const range_items_t& arg) { + auto _p_start = op_src->get_item_end(arg.from + arg.items); + size_t _len = op_src->get_item_end(arg.from) - _p_start; + p_cur -= _len; + p_mut->copy_in_absolute(p_cur, _p_start, _len); + }, + [&] (const kv_item_t& arg) { + assert(pp_value); + p_cur -= sizeof(snap_gen_t); + p_mut->copy_in_absolute(p_cur, snap_gen_t::template from_key(*arg.p_key)); + p_cur -= arg.p_value->size; + p_mut->copy_in_absolute(p_cur, arg.p_value, arg.p_value->size); + *pp_value = reinterpret_cast(p_cur); + } + }, a); + } + return p_cur; +} + +template class leaf_sub_items_t::Appender; +template class leaf_sub_items_t::Appender; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h new 
file mode 100644 index 000000000..8ef5f7472 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h @@ -0,0 +1,341 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h" +#include "key_layout.h" +#include "stage_types.h" + +namespace crimson::os::seastore::onode { + +class NodeExtentMutable; + +struct internal_sub_item_t { + const snap_gen_t& get_key() const { return key; } + const laddr_packed_t* get_p_value() const { return &value; } + + snap_gen_t key; + laddr_packed_t value; +} __attribute__((packed)); + +/** + * internal_sub_items_t + * + * The STAGE_RIGHT implementation for internal node N0/N1/N2, implements staged + * contract as an indexable container to index snap-gen to child node + * addresses. + * + * The layout of the container storing n sub-items: + * + * # <--------- container range -----------> # + * #<~># sub-items [2, n) # + * # # <- sub-item 1 -> # <- sub-item 0 -> # + * #...# snap-gen | laddr # snap-gen | laddr # + * ^ + * | + * p_first_item + + */ +class internal_sub_items_t { + public: + using num_keys_t = index_t; + + internal_sub_items_t(const memory_range_t& range) { + assert(range.p_start < range.p_end); + assert((range.p_end - range.p_start) % sizeof(internal_sub_item_t) == 0); + num_items = (range.p_end - range.p_start) / sizeof(internal_sub_item_t); + assert(num_items > 0); + auto _p_first_item = range.p_end - sizeof(internal_sub_item_t); + p_first_item = reinterpret_cast(_p_first_item); + } + + // container type system + using key_get_type = const snap_gen_t&; + static constexpr auto CONTAINER_TYPE = ContainerType::INDEXABLE; + num_keys_t keys() const { return num_items; } + key_get_type operator[](index_t index) const { + assert(index < num_items); + return (p_first_item - index)->get_key(); + } + node_offset_t size_before(index_t index) const {
+ size_t ret = index * sizeof(internal_sub_item_t); + assert(ret < NODE_BLOCK_SIZE); + return ret; + } + const laddr_packed_t* get_p_value(index_t index) const { + assert(index < num_items); + return (p_first_item - index)->get_p_value(); + } + node_offset_t size_overhead_at(index_t index) const { return 0u; } + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + auto p_end = reinterpret_cast(p_first_item) + + sizeof(internal_sub_item_t); + auto p_start = p_end - num_items * sizeof(internal_sub_item_t); + int start_offset = p_start - p_node_start; + int end_offset = p_end - p_node_start; + assert(start_offset > 0 && + start_offset < end_offset && + end_offset < NODE_BLOCK_SIZE); + ceph::encode(static_cast(start_offset), encoded); + ceph::encode(static_cast(end_offset), encoded); + } + + static internal_sub_items_t decode( + const char* p_node_start, ceph::bufferlist::const_iterator& delta) { + node_offset_t start_offset; + ceph::decode(start_offset, delta); + node_offset_t end_offset; + ceph::decode(end_offset, delta); + assert(start_offset < end_offset); + assert(end_offset <= NODE_BLOCK_SIZE); + return internal_sub_items_t({p_node_start + start_offset, + p_node_start + end_offset}); + } + + static node_offset_t header_size() { return 0u; } + + template + static node_offset_t estimate_insert( + const full_key_t&, const laddr_packed_t&) { + return sizeof(internal_sub_item_t); + } + + template + static const laddr_packed_t* insert_at( + NodeExtentMutable&, const internal_sub_items_t&, + const full_key_t&, const laddr_packed_t&, + index_t index, node_offset_t size, const char* p_left_bound); + + static node_offset_t trim_until(NodeExtentMutable&, internal_sub_items_t&, index_t); + + template + class Appender; + + private: + index_t num_items; + const internal_sub_item_t* p_first_item; +}; + +template +class internal_sub_items_t::Appender { + public: + Appender(NodeExtentMutable* p_mut, char* p_append) + : p_mut{p_mut}, p_append{p_append} {} + 
void append(const internal_sub_items_t& src, index_t from, index_t items); + void append(const full_key_t&, const laddr_packed_t&, const laddr_packed_t*&); + char* wrap() { return p_append; } + private: + NodeExtentMutable* p_mut; + char* p_append; +}; + +/** + * leaf_sub_items_t + * + * The STAGE_RIGHT implementation for leaf node N0/N1/N2, implements staged + * contract as an indexable container to index snap-gen to onode_t. + * + * The layout of the container storing n sub-items: + * + * # <------------------------ container range -------------------------------> # + * # <---------- sub-items ----------------> # <--- offsets ---------# # + * #<~># sub-items [2, n) #<~>| offsets [2, n) # # + * # # <- sub-item 1 -> # <- sub-item 0 -> # | # # + * #...# snap-gen | onode # snap-gen | onode #...| offset1 | offset0 # num_keys # + * ^ ^ ^ + * | | | + * p_items_end + p_offsets + | + * p_num_keys + + */ +class leaf_sub_items_t { + public: + // TODO: decide by NODE_BLOCK_SIZE, sizeof(snap_gen_t), + // and the minimal size of onode_t + using num_keys_t = uint8_t; + + leaf_sub_items_t(const memory_range_t& range) { + assert(range.p_start < range.p_end); + auto _p_num_keys = range.p_end - sizeof(num_keys_t); + assert(range.p_start < _p_num_keys); + p_num_keys = reinterpret_cast(_p_num_keys); + assert(keys()); + auto _p_offsets = _p_num_keys - sizeof(node_offset_t); + assert(range.p_start < _p_offsets); + p_offsets = reinterpret_cast(_p_offsets); + p_items_end = reinterpret_cast(&get_offset(keys() - 1)); + assert(range.p_start < p_items_end); + assert(range.p_start == p_start()); + } + + // Identity comparison: equal iff all three internal pointers match, i.e. + // both views reference the same container memory. Const so that const views + // can be compared as well. + bool operator==(const leaf_sub_items_t& x) const { + return (p_num_keys == x.p_num_keys && + p_offsets == x.p_offsets && + p_items_end == x.p_items_end); + } + + const char* p_start() const { return get_item_end(keys()); } + + const node_offset_packed_t& get_offset(index_t index) const { + assert(index < keys()); + return *(p_offsets - index); + } + + const node_offset_t get_offset_to_end(index_t
index) const { + assert(index <= keys()); + return index == 0 ? 0 : get_offset(index - 1).value; + } + + const char* get_item_start(index_t index) const { + return p_items_end - get_offset(index).value; + } + + const char* get_item_end(index_t index) const { + return p_items_end - get_offset_to_end(index); + } + + // container type system + using key_get_type = const snap_gen_t&; + static constexpr auto CONTAINER_TYPE = ContainerType::INDEXABLE; + num_keys_t keys() const { return *p_num_keys; } + key_get_type operator[](index_t index) const { + assert(index < keys()); + auto pointer = get_item_end(index); + assert(get_item_start(index) < pointer); + pointer -= sizeof(snap_gen_t); + assert(get_item_start(index) < pointer); + return *reinterpret_cast(pointer); + } + node_offset_t size_before(index_t index) const { + assert(index <= keys()); + size_t ret; + if (index == 0) { + ret = sizeof(num_keys_t); + } else { + --index; + ret = sizeof(num_keys_t) + + (index + 1) * sizeof(node_offset_t) + + get_offset(index).value; + } + assert(ret < NODE_BLOCK_SIZE); + return ret; + } + node_offset_t size_overhead_at(index_t index) const { return sizeof(node_offset_t); } + const onode_t* get_p_value(index_t index) const { + assert(index < keys()); + auto pointer = get_item_start(index); + auto value = reinterpret_cast(pointer); + assert(pointer + value->size + sizeof(snap_gen_t) == get_item_end(index)); + return value; + } + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + auto p_end = reinterpret_cast(p_num_keys) + + sizeof(num_keys_t); + int start_offset = p_start() - p_node_start; + int end_offset = p_end - p_node_start; + assert(start_offset > 0 && + start_offset < end_offset && + end_offset < NODE_BLOCK_SIZE); + ceph::encode(static_cast(start_offset), encoded); + ceph::encode(static_cast(end_offset), encoded); + } + + static leaf_sub_items_t decode( + const char* p_node_start, ceph::bufferlist::const_iterator& delta) { + node_offset_t start_offset; + 
ceph::decode(start_offset, delta); + node_offset_t end_offset; + ceph::decode(end_offset, delta); + assert(start_offset < end_offset); + assert(end_offset <= NODE_BLOCK_SIZE); + return leaf_sub_items_t({p_node_start + start_offset, + p_node_start + end_offset}); + } + + static node_offset_t header_size() { return sizeof(num_keys_t); } + + template + static node_offset_t estimate_insert(const full_key_t&, const onode_t& value) { + return value.size + sizeof(snap_gen_t) + sizeof(node_offset_t); + } + + template + static const onode_t* insert_at( + NodeExtentMutable&, const leaf_sub_items_t&, + const full_key_t&, const onode_t&, + index_t index, node_offset_t size, const char* p_left_bound); + + static node_offset_t trim_until(NodeExtentMutable&, leaf_sub_items_t&, index_t index); + + template + class Appender; + + private: + // TODO: support unaligned access + const num_keys_t* p_num_keys; + const node_offset_packed_t* p_offsets; + const char* p_items_end; +}; + +constexpr index_t APPENDER_LIMIT = 3u; + +template +class leaf_sub_items_t::Appender { + struct range_items_t { + index_t from; + index_t items; + }; + struct kv_item_t { + const full_key_t* p_key; + const onode_t* p_value; + }; + using var_t = std::variant; + + public: + Appender(NodeExtentMutable* p_mut, char* p_append) + : p_mut{p_mut}, p_append{p_append} { + } + + // Queue a range of `items` sub-items starting at `from` in `src`. + // An empty range is a no-op and does not consume a slot. + void append(const leaf_sub_items_t& src, index_t from, index_t items) { + assert(cnt <= APPENDER_LIMIT); + assert(from <= src.keys()); + if (items == 0) { + return; + } + if (op_src) { + assert(*op_src == src); + } else { + op_src = src; + } + assert(from < src.keys()); + assert(from + items <= src.keys()); + // cnt indexes appends[], so it must be strictly below the capacity here + assert(cnt < APPENDER_LIMIT); + appends[cnt] = range_items_t{from, items}; + ++cnt; + } + // Queue a single new key/value; at most one such item may be queued, and + // the address of p_value is remembered in pp_value. + void append(const full_key_t& key, + const onode_t& value, const onode_t*& p_value) { + assert(pp_value == nullptr); + // this overload always writes appends[cnt], so cnt == APPENDER_LIMIT is out of bounds + assert(cnt < APPENDER_LIMIT); + appends[cnt] = kv_item_t{&key, &value}; + ++cnt; + pp_value = &p_value; + } + char* wrap(); + + private: + std::optional op_src; +
const onode_t** pp_value = nullptr; + NodeExtentMutable* p_mut; + char* p_append; + var_t appends[APPENDER_LIMIT]; + index_t cnt = 0; +}; + +template struct _sub_items_t; +template<> struct _sub_items_t { using type = internal_sub_items_t; }; +template<> struct _sub_items_t { using type = leaf_sub_items_t; }; +template +using sub_items_t = typename _sub_items_t::type; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/super.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/super.cc new file mode 100644 index 000000000..5a28f5097 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/super.cc @@ -0,0 +1,26 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "super.h" +#include "node.h" + +namespace crimson::os::seastore::onode { + +Ref RootNodeTrackerIsolated::get_root(Transaction& t) const { + auto iter = tracked_supers.find(&t); + if (iter == tracked_supers.end()) { + return nullptr; + } else { + return iter->second->get_p_root(); + } +} + +Ref RootNodeTrackerShared::get_root(Transaction&) const { + if (is_clean()) { + return nullptr; + } else { + return tracked_super->get_p_root(); + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/super.h b/src/crimson/os/seastore/onode_manager/staged-fltree/super.h new file mode 100644 index 000000000..5eefee9ff --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/super.h @@ -0,0 +1,143 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include + +#include "crimson/common/type_helpers.h" + +#include "fwd.h" + +namespace crimson::os::seastore::onode { + +class Node; +class Super; + +/** + * RootNodeTracker + * + * An abstracted tracker to get the root node by Transaction. 
+ */ +class RootNodeTracker { + public: + virtual ~RootNodeTracker() = default; + virtual bool is_clean() const = 0; + virtual Ref get_root(Transaction&) const = 0; + static RootNodeTrackerURef create(bool read_isolated); + protected: + RootNodeTracker() = default; + RootNodeTracker(const RootNodeTracker&) = delete; + RootNodeTracker(RootNodeTracker&&) = delete; + RootNodeTracker& operator=(const RootNodeTracker&) = delete; + RootNodeTracker& operator=(RootNodeTracker&&) = delete; + virtual void do_track_super(Transaction&, Super&) = 0; + virtual void do_untrack_super(Transaction&, Super&) = 0; + friend class Super; +}; + +/** + * Super + * + * The parent of root node. It contains the relationship between a Transaction + * and a root node address. + */ +class Super { + public: + using URef = std::unique_ptr; + Super(const Super&) = delete; + Super(Super&&) = delete; + Super& operator=(const Super&) = delete; + Super& operator=(Super&&) = delete; + virtual ~Super() { + assert(tracked_root_node == nullptr); + tracker.do_untrack_super(t, *this); + } + + virtual laddr_t get_root_laddr() const = 0; + virtual void write_root_laddr(context_t, laddr_t) = 0; + + void do_track_root(Node& root) { + assert(tracked_root_node == nullptr); + tracked_root_node = &root; + } + void do_untrack_root(Node& root) { + assert(tracked_root_node == &root); + tracked_root_node = nullptr; + } + Node* get_p_root() const { + assert(tracked_root_node != nullptr); + return tracked_root_node; + } + + protected: + Super(Transaction& t, RootNodeTracker& tracker) + : t{t}, tracker{tracker} { + tracker.do_track_super(t, *this); + } + + private: + Transaction& t; + RootNodeTracker& tracker; + Node* tracked_root_node = nullptr; +}; + +/** + * RootNodeTrackerIsolated + * + * A concrete RootNodeTracker implementation which provides root node isolation + * between Transactions for Seastore backend. 
+ */ +class RootNodeTrackerIsolated final : public RootNodeTracker { + public: + ~RootNodeTrackerIsolated() override { assert(is_clean()); } + protected: + bool is_clean() const override { + return tracked_supers.empty(); + } + void do_track_super(Transaction& t, Super& super) override { + assert(tracked_supers.find(&t) == tracked_supers.end()); + tracked_supers[&t] = &super; + } + void do_untrack_super(Transaction& t, Super& super) override { + [[maybe_unused]] auto removed = tracked_supers.erase(&t); + assert(removed); + } + ::Ref get_root(Transaction& t) const override; + std::map tracked_supers; +}; + +/** + * RootNodeTrackerShared + * + * A concrete RootNodeTracker implementation which has no isolation between + * Transactions for Dummy backend. + */ +class RootNodeTrackerShared final : public RootNodeTracker { + public: + ~RootNodeTrackerShared() override { assert(is_clean()); } + protected: + bool is_clean() const override { + return tracked_super == nullptr; + } + void do_track_super(Transaction&, Super& super) override { + assert(is_clean()); + tracked_super = &super; + } + void do_untrack_super(Transaction&, Super& super) override { + assert(tracked_super == &super); + tracked_super = nullptr; + } + ::Ref get_root(Transaction&) const override; + Super* tracked_super = nullptr; +}; + +inline RootNodeTrackerURef RootNodeTracker::create(bool read_isolated) { + if (read_isolated) { + return RootNodeTrackerURef(new RootNodeTrackerIsolated()); + } else { + return RootNodeTrackerURef(new RootNodeTrackerShared()); + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/tree.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/tree.cc new file mode 100644 index 000000000..2c8c21652 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/tree.cc @@ -0,0 +1,235 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "tree.h" + +#include "node.h" +#include 
"node_extent_manager.h" +#include "stages/key_layout.h" +#include "super.h" + +namespace crimson::os::seastore::onode { + +using btree_ertr = Btree::btree_ertr; +template +using btree_future = Btree::btree_future; +using Cursor = Btree::Cursor; + +Cursor::Cursor(Btree* p_tree, Ref _p_cursor) + : p_tree(p_tree) { + if (_p_cursor->is_end()) { + // no need to hold the leaf node + } else { + p_cursor = _p_cursor; + } +} +Cursor::Cursor(Btree* p_tree) : p_tree{p_tree} {} +Cursor::Cursor(const Cursor&) = default; +Cursor::Cursor(Cursor&&) noexcept = default; +Cursor& Cursor::operator=(const Cursor&) = default; +Cursor& Cursor::operator=(Cursor&&) = default; +Cursor::~Cursor() = default; + +bool Cursor::is_end() const { + if (p_cursor) { + assert(!p_cursor->is_end()); + return false; + } else { + return true; + } +} + +ghobject_t Cursor::get_ghobj() const { + // an end cursor holds no p_cursor, so there is nothing to dereference + assert(!is_end()); + return p_cursor->get_key_view().to_ghobj(); +} + +const onode_t* Cursor::value() const { + // an end cursor holds no p_cursor, so there is nothing to dereference + assert(!is_end()); + return p_cursor->get_p_value(); +} + +bool Cursor::operator==(const Cursor& x) const { + return p_cursor == x.p_cursor; +} + +Cursor& Cursor::operator++() { + // TODO + return *this; +} + +Cursor Cursor::operator++(int) { + Cursor tmp = *this; + ++*this; + return tmp; +} + +Cursor Cursor::make_end(Btree* p_tree) { + return {p_tree}; +} + +Btree::Btree(NodeExtentManagerURef&& _nm) + : nm{std::move(_nm)}, + root_tracker{RootNodeTracker::create(nm->is_read_isolated())} {} + +Btree::~Btree() { assert(root_tracker->is_clean()); } + +btree_future<> Btree::mkfs(Transaction& t) { + return Node::mkfs(get_context(t), *root_tracker); +} + +btree_future Btree::begin(Transaction& t) { + return get_root(t).safe_then([this, &t](auto root) { + return root->lookup_smallest(get_context(t)); + }).safe_then([this](auto cursor) { + return Cursor{this, cursor}; + }); +} + +btree_future Btree::last(Transaction& t) { + return get_root(t).safe_then([this, &t](auto root) { + return root->lookup_largest(get_context(t)); + }).safe_then([this](auto cursor)
{ + return Cursor(this, cursor); + }); +} + +Cursor Btree::end() { + return Cursor::make_end(this); +} + +btree_future +Btree::contains(Transaction& t, const ghobject_t& obj) { + return seastar::do_with( + full_key_t(obj), + [this, &t](auto& key) -> btree_future { + return get_root(t).safe_then([this, &t, &key](auto root) { + // TODO: improve lower_bound() + return root->lower_bound(get_context(t), key); + }).safe_then([](auto result) { + return MatchKindBS::EQ == result.match(); + }); + } + ); +} + +btree_future +Btree::find(Transaction& t, const ghobject_t& obj) { + return seastar::do_with( + full_key_t(obj), + [this, &t](auto& key) -> btree_future { + return get_root(t).safe_then([this, &t, &key](auto root) { + // TODO: improve lower_bound() + return root->lower_bound(get_context(t), key); + }).safe_then([this](auto result) { + if (result.match() == MatchKindBS::EQ) { + return Cursor(this, result.p_cursor); + } else { + return Cursor::make_end(this); + } + }); + } + ); +} + +btree_future +Btree::lower_bound(Transaction& t, const ghobject_t& obj) { + return seastar::do_with( + full_key_t(obj), + [this, &t](auto& key) -> btree_future { + return get_root(t).safe_then([this, &t, &key](auto root) { + return root->lower_bound(get_context(t), key); + }).safe_then([this](auto result) { + return Cursor(this, result.p_cursor); + }); + } + ); +} + +btree_future> +Btree::insert(Transaction& t, const ghobject_t& obj, const onode_t& value) { + return seastar::do_with( + full_key_t(obj), + [this, &t, &value](auto& key) -> btree_future> { + return get_root(t).safe_then([this, &t, &key, &value](auto root) { + return root->insert(get_context(t), key, value); + }).safe_then([this](auto ret) { + auto& [cursor, success] = ret; + return std::make_pair(Cursor(this, cursor), success); + }); + } + ); +} + +btree_future Btree::erase(Transaction& t, const ghobject_t& obj) { + // TODO + return btree_ertr::make_ready_future(0u); +} + +btree_future Btree::erase(Cursor& pos) { + // TODO + 
return btree_ertr::make_ready_future( + Cursor::make_end(this)); +} + +btree_future +Btree::erase(Cursor& first, Cursor& last) { + // TODO + return btree_ertr::make_ready_future( + Cursor::make_end(this)); +} + +btree_future Btree::height(Transaction& t) { + return get_root(t).safe_then([](auto root) { + return size_t(root->level() + 1); + }); +} + +btree_future Btree::get_stats_slow(Transaction& t) { + return get_root(t).safe_then([this, &t](auto root) { + unsigned height = root->level() + 1; + return root->get_tree_stats(get_context(t) + ).safe_then([height](auto stats) { + stats.height = height; + return btree_ertr::make_ready_future(stats); + }); + }); +} + +std::ostream& Btree::dump(Transaction& t, std::ostream& os) { + auto root = root_tracker->get_root(t); + if (root) { + root->dump(os); + } else { + os << "empty tree!"; + } + return os; +} + +std::ostream& Btree::print(std::ostream& os) const { + return os << "BTree-" << *nm; +} + +btree_future> Btree::get_root(Transaction& t) { + auto root = root_tracker->get_root(t); + if (root) { + return btree_ertr::make_ready_future>(root); + } else { + return Node::load_root(get_context(t), *root_tracker); + } +} + +bool Btree::test_is_clean() const { + return root_tracker->is_clean(); +} + +btree_future<> Btree::test_clone_from( + Transaction& t, Transaction& t_from, Btree& from) { + // Note: assume the tree to clone is tracked correctly in memory. + // In some unit tests, parts of the tree are stubbed out that they + // should not be loaded from NodeExtentManager. 
+ return from.get_root(t_from + ).safe_then([this, &t](auto root_from) { + return root_from->test_clone_root(get_context(t), *root_tracker); + }); +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/tree.h b/src/crimson/os/seastore/onode_manager/staged-fltree/tree.h new file mode 100644 index 000000000..7ee618cb3 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/tree.h @@ -0,0 +1,119 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include + +#include "common/hobject.h" +#include "crimson/common/type_helpers.h" + +#include "fwd.h" +#include "tree_types.h" + +/** + * tree.h + * + * An example implementation to expose tree interfaces to users. The current + * interface design is based on: + * - ceph::os::Transaction::create/touch/remove() + * - ceph::ObjectStore::collection_list() + * - ceph::BlueStore::get_onode() + * - db->get_iterator(PREFIX_OBJ) by ceph::BlueStore::fsck() + * + * TODO: Redesign the interfaces based on real onode manager requirements.
+ */ + +namespace crimson::os::seastore::onode { + +class Node; +class Btree { + public: + using btree_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::enoent, + crimson::ct_error::erange>; + template + using btree_future = btree_ertr::future; + + Btree(NodeExtentManagerURef&& nm); + Btree(const Btree&) = delete; + Btree(Btree&&) = delete; + Btree& operator=(const Btree&) = delete; + Btree& operator=(Btree&&) = delete; + ~Btree(); + + btree_future<> mkfs(Transaction&); + + class Cursor; + // lookup + btree_future begin(Transaction&); + btree_future last(Transaction&); + Cursor end(); + btree_future contains(Transaction&, const ghobject_t&); + btree_future find(Transaction&, const ghobject_t&); + btree_future lower_bound(Transaction&, const ghobject_t&); + + // modifiers + // TODO: replace onode_t + btree_future> + insert(Transaction&, const ghobject_t&, const onode_t&); + btree_future erase(Transaction&, const ghobject_t& key); + btree_future erase(Cursor& pos); + btree_future erase(Cursor& first, Cursor& last); + + // stats + btree_future height(Transaction&); + btree_future get_stats_slow(Transaction&); + std::ostream& dump(Transaction&, std::ostream&); + std::ostream& print(std::ostream& os) const; + + // test_only + bool test_is_clean() const; + btree_future<> test_clone_from(Transaction& t, Transaction& t_from, Btree& from); + + private: + context_t get_context(Transaction& t) { return {*nm, t}; } + btree_future> get_root(Transaction& t); + + NodeExtentManagerURef nm; + RootNodeTrackerURef root_tracker; + + friend class DummyChildPool; +}; +inline std::ostream& operator<<(std::ostream& os, const Btree& tree) { + return tree.print(os); +} + +class tree_cursor_t; +class Btree::Cursor { + public: + Cursor(const Cursor&); + Cursor(Cursor&&) noexcept; + Cursor& operator=(const Cursor&); + Cursor& operator=(Cursor&&); + ~Cursor(); + + bool is_end() const; + // XXX: return key_view_t to avoid 
unnecessary ghobject_t constructions + ghobject_t get_ghobj() const; + const onode_t* value() const; + bool operator==(const Cursor& x) const; + bool operator!=(const Cursor& x) const { return !(*this == x); } + Cursor& operator++(); + Cursor operator++(int); + + private: + Cursor(Btree*, Ref); + Cursor(Btree*); + + static Cursor make_end(Btree*); + + Btree* p_tree; + Ref p_cursor; + + friend class Btree; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/tree_types.h b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_types.h new file mode 100644 index 000000000..0bb345e0a --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_types.h @@ -0,0 +1,125 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include + +namespace crimson::os::seastore::onode { + +// TODO: Redesign according to real requirement from onode manager +struct onode_t { + // onode should be smaller than a node + uint16_t size; // address up to 64 KiB sized node + uint16_t id; + // omap, extent_map, inline data + + bool operator==(const onode_t& o) const { return size == o.size && id == o.id; } + bool operator!=(const onode_t& o) const { return !(*this == o); } + + void encode(ceph::bufferlist& encoded) const { + ceph::encode(size, encoded); + ceph::encode(id, encoded); + } + static onode_t decode(ceph::bufferlist::const_iterator& delta) { + uint16_t size; + ceph::decode(size, delta); + uint16_t id; + ceph::decode(id, delta); + onode_t ret{size, id}; + return ret; + } + static void validate_tail_magic(const onode_t& onode) { + auto p_target = (const char*)&onode + onode.size - sizeof(uint32_t); + uint32_t target; + std::memcpy(&target, p_target, sizeof(uint32_t)); + ceph_assert(target == onode.size * 137); + } + static std::unique_ptr allocate(const onode_t& config) { + ceph_assert(config.size >= sizeof(onode_t) + sizeof(uint32_t)); + + auto ret =
std::make_unique(config.size); + char* p_mem = ret.get(); + auto p_onode = reinterpret_cast(p_mem); + *p_onode = config; + + uint32_t tail_magic = config.size * 137; + p_mem += (config.size - sizeof(uint32_t)); + std::memcpy(p_mem, &tail_magic, sizeof(uint32_t)); + validate_tail_magic(*p_onode); + + return ret; + } +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const onode_t& node) { + return os << "onode(" << node.id << ", " << node.size << "B)"; +} + +struct tree_stats_t { + size_t size_persistent_leaf = 0; + size_t size_persistent_internal = 0; + size_t size_filled_leaf = 0; + size_t size_filled_internal = 0; + size_t size_logical_leaf = 0; + size_t size_logical_internal = 0; + size_t size_overhead_leaf = 0; + size_t size_overhead_internal = 0; + size_t size_value_leaf = 0; + size_t size_value_internal = 0; + unsigned num_kvs_leaf = 0; + unsigned num_kvs_internal = 0; + unsigned num_nodes_leaf = 0; + unsigned num_nodes_internal = 0; + unsigned height = 0; + + size_t size_persistent() const { + return size_persistent_leaf + size_persistent_internal; } + size_t size_filled() const { + return size_filled_leaf + size_filled_internal; } + size_t size_logical() const { + return size_logical_leaf + size_logical_internal; } + size_t size_overhead() const { + return size_overhead_leaf + size_overhead_internal; } + size_t size_value() const { + return size_value_leaf + size_value_internal; } + unsigned num_kvs() const { + return num_kvs_leaf + num_kvs_internal; } + unsigned num_nodes() const { + return num_nodes_leaf + num_nodes_internal; } + + double ratio_fullness() const { + return (double)size_filled() / size_persistent(); } + double ratio_key_compression() const { + return (double)(size_filled() - size_value()) / (size_logical() - size_value()); } + double ratio_overhead() const { + return (double)size_overhead() / size_filled(); } + double ratio_keys_leaf() const { + return (double)num_kvs_leaf / num_kvs(); } + double 
ratio_nodes_leaf() const { + return (double)num_nodes_leaf / num_nodes(); } + double ratio_filled_leaf() const { + return (double)size_filled_leaf / size_filled(); } +}; +inline std::ostream& operator<<(std::ostream& os, const tree_stats_t& stats) { + os << "Tree stats:" + << "\n height = " << stats.height + << "\n num values = " << stats.num_kvs_leaf + << "\n num nodes = " << stats.num_nodes() + << " (leaf=" << stats.num_nodes_leaf + << ", internal=" << stats.num_nodes_internal << ")" + << "\n size persistent = " << stats.size_persistent() << "B" + << "\n size filled = " << stats.size_filled() << "B" + << " (value=" << stats.size_value_leaf << "B" + << ", rest=" << stats.size_filled() - stats.size_value_leaf << "B)" + << "\n size logical = " << stats.size_logical() << "B" + << "\n size overhead = " << stats.size_overhead() << "B" + << "\n ratio fullness = " << stats.ratio_fullness() + << "\n ratio keys leaf = " << stats.ratio_keys_leaf() + << "\n ratio nodes leaf = " << stats.ratio_nodes_leaf() + << "\n ratio filled leaf = " << stats.ratio_filled_leaf() + << "\n ratio key compression = " << stats.ratio_key_compression(); + assert(stats.num_kvs_internal + 1 == stats.num_nodes()); + return os; +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h new file mode 100644 index 000000000..536052003 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h @@ -0,0 +1,333 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "crimson/common/log.h" +#include "stages/key_layout.h" +#include "tree.h" + +/** + * tree_utils.h + * + * Contains shared logic for unit tests and perf tool. 
+ */ + +namespace crimson::os::seastore::onode { + +class Onodes { + public: + Onodes(size_t n) { + for (size_t i = 1; i <= n; ++i) { + auto p_onode = &create(i * 8); + onodes.push_back(p_onode); + } + } + + Onodes(std::vector sizes) { + for (auto& size : sizes) { + auto p_onode = &create(size); + onodes.push_back(p_onode); + } + } + + ~Onodes() = default; + + const onode_t& create(size_t size) { + ceph_assert(size <= std::numeric_limits::max()); + onode_t config{static_cast(size), id++}; + auto onode = onode_t::allocate(config); + auto p_onode = onode.get(); + tracked_onodes.push_back(std::move(onode)); + return *reinterpret_cast(p_onode); + } + + const onode_t& pick() const { + auto index = rd() % onodes.size(); + return *onodes[index]; + } + + const onode_t& pick_largest() const { + return *onodes[onodes.size() - 1]; + } + + static void validate_cursor( + const Btree::Cursor& cursor, const ghobject_t& key, const onode_t& onode) { + ceph_assert(!cursor.is_end()); + ceph_assert(cursor.get_ghobj() == key); + ceph_assert(cursor.value()); + ceph_assert(cursor.value() != &onode); + ceph_assert(*cursor.value() == onode); + onode_t::validate_tail_magic(*cursor.value()); + } + + private: + uint16_t id = 0; + mutable std::random_device rd; + std::vector onodes; + std::vector> tracked_onodes; +}; + +class KVPool { + struct kv_conf_t { + unsigned index2; + unsigned index1; + unsigned index0; + size_t ns_size; + size_t oid_size; + const onode_t* p_value; + + ghobject_t get_ghobj() const { + assert(index1 < 10); + std::ostringstream os_ns; + os_ns << "ns" << index1; + unsigned current_size = (unsigned)os_ns.tellp(); + assert(ns_size >= current_size); + os_ns << std::string(ns_size - current_size, '_'); + + std::ostringstream os_oid; + os_oid << "oid" << index1; + current_size = (unsigned)os_oid.tellp(); + assert(oid_size >= current_size); + os_oid << std::string(oid_size - current_size, '_'); + + return ghobject_t(shard_id_t(index2), index2, index2, + os_ns.str(), 
os_oid.str(), index0, index0); + } + }; + using kv_vector_t = std::vector; + + public: + using kv_t = std::pair; + + // Build the pool: one kv per (i, j, k) in range2 x range1 x range0 (each + // range is half-open [first, second)). ns/oid string sizes are drawn at + // random from str_sizes once per (i, j); each kv gets a random onode. + KVPool(const std::vector& str_sizes, + const std::vector& onode_sizes, + const std::pair& range2, + const std::pair& range1, + const std::pair& range0) + : str_sizes{str_sizes}, onodes{onode_sizes} { + ceph_assert(range2.first < range2.second); + ceph_assert(range2.second - 1 <= (unsigned)std::numeric_limits::max()); + ceph_assert(range2.second - 1 <= std::numeric_limits::max()); + ceph_assert(range1.first < range1.second); + ceph_assert(range1.second - 1 <= 9); + ceph_assert(range0.first < range0.second); + std::random_device rd; + for (unsigned i = range2.first; i < range2.second; ++i) { + for (unsigned j = range1.first; j < range1.second; ++j) { + auto ns_size = (unsigned)str_sizes[rd() % str_sizes.size()]; + auto oid_size = (unsigned)str_sizes[rd() % str_sizes.size()]; + for (unsigned k = range0.first; k < range0.second; ++k) { + kvs.emplace_back(kv_conf_t{i, j, k, ns_size, oid_size, &onodes.pick()}); + } + } + } + random_kvs = kvs; + // std::random_shuffle was removed in C++17; std::shuffle needs an explicit + // engine. A default-constructed engine keeps the order reproducible. + std::shuffle(random_kvs.begin(), random_kvs.end(), std::default_random_engine{}); + } + + class iterator_t { + public: + iterator_t() = default; + iterator_t(const iterator_t&) = default; + iterator_t(iterator_t&&) = default; + iterator_t& operator=(const iterator_t&) = default; + iterator_t& operator=(iterator_t&&) = default; + + kv_t get_kv() const { + assert(!is_end()); + auto& conf = (*p_kvs)[i]; + return std::make_pair(conf.get_ghobj(), conf.p_value); + } + bool is_end() const { return !p_kvs || i >= p_kvs->size(); } + size_t index() const { return i; } + + iterator_t& operator++() { + assert(!is_end()); + ++i; + return *this; + } + + iterator_t operator++(int) { + iterator_t tmp = *this; + ++*this; + return tmp; + } + + private: + iterator_t(const kv_vector_t& kvs) : p_kvs{&kvs} {} + + const kv_vector_t* p_kvs = nullptr; + size_t i = 0; + friend class KVPool; + }; + + iterator_t begin() const { + return iterator_t(kvs); + } + +
iterator_t random_begin() const { + return iterator_t(random_kvs); + } + + size_t size() const { + return kvs.size(); + } + + private: + std::vector str_sizes; + Onodes onodes; + kv_vector_t kvs; + kv_vector_t random_kvs; +}; + +template +class TreeBuilder { + public: + using ertr = Btree::btree_ertr; + template + using future = ertr::future; + + TreeBuilder(KVPool& kvs, NodeExtentManagerURef&& nm) + : kvs{kvs} { + tree.emplace(std::move(nm)); + } + + future<> bootstrap(Transaction& t) { + std::ostringstream oss; +#ifndef NDEBUG + oss << "debug=on, "; +#else + oss << "debug=off, "; +#endif +#ifdef UNIT_TESTS_BUILT + oss << "UNIT_TEST_BUILT=on, "; +#else + oss << "UNIT_TEST_BUILT=off, "; +#endif + if constexpr (TRACK) { + oss << "track=on, "; + } else { + oss << "track=off, "; + } + oss << *tree; + logger().warn("TreeBuilder: {}, bootstrapping ...", oss.str()); + return tree->mkfs(t); + } + + future<> insert(Transaction& t) { + kv_iter = kvs.random_begin(); + auto cursors = seastar::make_lw_shared>(); + logger().warn("start inserting {} kvs ...", kvs.size()); + auto start_time = mono_clock::now(); + return crimson::do_until([&t, this, cursors]() -> future { + if (kv_iter.is_end()) { + return ertr::make_ready_future(true); + } + auto [key, p_value] = kv_iter.get_kv(); + logger().debug("[{}] {} -> {}", kv_iter.index(), key_hobj_t{key}, *p_value); + return tree->insert(t, key, *p_value + ).safe_then([&t, this, cursors](auto ret) { + auto& [cursor, success] = ret; + assert(success == true); + if constexpr (TRACK) { + cursors->emplace_back(cursor); + } +#ifndef NDEBUG + auto [key, p_value] = kv_iter.get_kv(); + Onodes::validate_cursor(cursor, key, *p_value); + return tree->lower_bound(t, key).safe_then([this, cursor](auto cursor_) { + auto [key, p_value] = kv_iter.get_kv(); + ceph_assert(cursor_.get_ghobj() == key); + ceph_assert(cursor_.value() == cursor.value()); + ++kv_iter; + return ertr::make_ready_future(false); + }); +#else + ++kv_iter; + return 
ertr::make_ready_future(false); +#endif + }); + }).safe_then([&t, this, start_time, cursors] { + std::chrono::duration duration = mono_clock::now() - start_time; + logger().warn("Insert done! {}s", duration.count()); + if (!cursors->empty()) { + logger().info("Verifing tracked cursors ..."); + kv_iter = kvs.random_begin(); + return seastar::do_with( + cursors->begin(), [&t, this, cursors](auto& c_iter) { + return crimson::do_until([&t, this, &c_iter, cursors]() -> future { + if (kv_iter.is_end()) { + logger().info("Verify done!"); + return ertr::make_ready_future(true); + } + assert(c_iter != cursors->end()); + auto [k, v] = kv_iter.get_kv(); + // validate values in tree keep intact + return tree->lower_bound(t, k).safe_then([this, &c_iter](auto cursor) { + auto [k, v] = kv_iter.get_kv(); + Onodes::validate_cursor(cursor, k, *v); + // validate values in cursors keep intact + Onodes::validate_cursor(*c_iter, k, *v); + ++kv_iter; + ++c_iter; + return ertr::make_ready_future(false); + }); + }); + }); + } else { + return ertr::now(); + } + }); + } + + future<> get_stats(Transaction& t) { + return tree->get_stats_slow(t + ).safe_then([this](auto stats) { + logger().warn("{}", stats); + }); + } + + void reload(NodeExtentManagerURef&& nm) { + tree.emplace(std::move(nm)); + } + + future<> validate(Transaction& t) { + logger().info("Verifing insertion ..."); + return seastar::do_with( + kvs.begin(), [&t, this] (auto& kvs_iter) { + return crimson::do_until([&t, this, &kvs_iter]() -> future { + if (kvs_iter.is_end()) { + logger().info("Verify done!"); + return ertr::make_ready_future(true); + } + auto [k, v] = kvs_iter.get_kv(); + return tree->lower_bound(t, k + ).safe_then([&kvs_iter, k=k, v=v] (auto cursor) { + Onodes::validate_cursor(cursor, k, *v); + ++kvs_iter; + return ertr::make_ready_future(false); + }); + }); + }); + } + + private: + static seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } + + KVPool& kvs; + std::optional tree; + 
KVPool::iterator_t kv_iter; +}; + +} -- cgit v1.2.3