summaryrefslogtreecommitdiffstats
path: root/src/crimson/os/seastore/onode_manager/simple-fltree
diff options
context:
space:
mode:
Diffstat (limited to 'src/crimson/os/seastore/onode_manager/simple-fltree')
-rw-r--r--src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.cc71
-rw-r--r--src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.h65
-rw-r--r--src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.cc188
-rw-r--r--src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.h70
-rw-r--r--src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.cc567
-rw-r--r--src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.h942
6 files changed, 1903 insertions, 0 deletions
diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.cc b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.cc
new file mode 100644
index 000000000..b05ea76a3
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.cc
@@ -0,0 +1,71 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "onode_block.h"
+
+namespace crimson::os::seastore {
+
+ceph::bufferlist OnodeBlock::get_delta()
+{
+ bufferlist bl;
+ assert(deltas.size() <= std::numeric_limits<uint8_t>::max());
+ uint8_t n_deltas = deltas.size();
+ ceph::encode(n_deltas, bl);
+ for (auto& delta : deltas) {
+ delta->encode(bl);
+ }
+ return bl;
+}
+
+void OnodeBlock::logical_on_delta_write()
+{
+ // journal submitted to disk, now update the memory
+ apply_pending_changes(true);
+}
+
+void OnodeBlock::apply_delta(const ceph::bufferlist &bl)
+{
+ assert(deltas.empty());
+
+ auto p = bl.cbegin();
+ uint8_t n_deltas = 0;
+ ceph::decode(n_deltas, p);
+ for (uint8_t i = 0; i < n_deltas; i++) {
+ delta_t delta;
+ delta.decode(p);
+ mutate(std::move(delta));
+ }
+ apply_pending_changes(true);
+}
+
+void OnodeBlock::mutate(delta_t&& d)
+{
+ if (is_initial_pending()) {
+ char* const p = get_bptr().c_str();
+ mutate_func(p, d);
+ }
+ deltas.push_back(std::make_unique<delta_t>(std::move(d)));
+}
+
+void OnodeBlock::apply_pending_changes(bool do_cleanup)
+{
+ if (!is_mutation_pending()) {
+ return;
+ }
+ if (share_buffer) {
+ // do a deep copy so i can change my own copy
+ get_bptr() = ceph::bufferptr{get_bptr().c_str(),
+ get_bptr().length()};
+ share_buffer = false;
+ }
+ assert(mutate_func);
+ char* const p = get_bptr().c_str();
+ for (auto& delta : deltas) {
+ mutate_func(p, *delta);
+ if (do_cleanup) {
+ delta.reset();
+ }
+ }
+}
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.h b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.h
new file mode 100644
index 000000000..0025d9847
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.h
@@ -0,0 +1,65 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <cstdint>
+#include <boost/container/small_vector.hpp>
+
+#include "crimson/os/seastore/transaction_manager.h"
+#include "onode_delta.h"
+
+namespace crimson::os::seastore {
+
+// TODO s/CachedExtent/LogicalCachedExtent/
+struct OnodeBlock final : LogicalCachedExtent {
+ using Ref = TCachedExtentRef<OnodeBlock>;
+
+ template <typename... T>
+ OnodeBlock(T&&... t) : LogicalCachedExtent(std::forward<T>(t)...) {}
+ OnodeBlock(OnodeBlock&& block) = delete;
+ OnodeBlock(const OnodeBlock& block, CachedExtent::share_buffer_t tag) noexcept
+ : LogicalCachedExtent{block, tag},
+ share_buffer{true}
+ {}
+
+ CachedExtentRef duplicate_for_write() final {
+ return new OnodeBlock{*this, CachedExtent::share_buffer_t{}};
+ }
+
+ // could materialize the pending changes to the underlying buffer here,
+ // but since we write the change to the buffer immediately, let skip
+ // this for now.
+ void prepare_write() final {}
+
+ // queries
+ static constexpr extent_types_t TYPE = extent_types_t::ONODE_BLOCK;
+ extent_types_t get_type() const final {
+ return TYPE;
+ }
+
+ // have to stash all the changes before on_delta_write() is called,
+ // otherwise we could pollute the extent with pending mutations
+ // before the transaction carrying these mutations is committed to
+ // disk
+ ceph::bufferlist get_delta() final;
+ void logical_on_delta_write() final;
+ void apply_delta(const ceph::bufferlist &bl) final;
+
+ void sync() {
+ apply_pending_changes(false);
+ }
+ void mutate(delta_t&& d);
+ using mutate_func_t = std::function<void (char*, const delta_t&)>;
+ void set_delta_applier(mutate_func_t&& func) {
+ mutate_func = std::move(func);
+ }
+private:
+ // before looking at the extent, we need to make sure the content is up to date
+ void apply_pending_changes(bool do_cleanup);
+ // assuming we don't stash too many deltas to a single block
+ // otherwise a fullwrite op is necessary
+ boost::container::small_vector<std::unique_ptr<delta_t>, 2> deltas;
+ mutate_func_t mutate_func;
+ bool share_buffer = false;
+};
+
+}
diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.cc b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.cc
new file mode 100644
index 000000000..869685d45
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.cc
@@ -0,0 +1,188 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "onode_delta.h"
+
+delta_t::delta_t(delta_t&& delta)
+{
+ assert(op == op_t::nop);
+ op = delta.op;
+ n = delta.n;
+ oid = std::move(delta.oid);
+ onode = std::move(delta.onode);
+ keys = std::move(delta.keys);
+ cells = std::move(delta.cells);
+ delta.op = op_t::nop;
+}
+
+delta_t& delta_t::operator=(delta_t&& delta)
+{
+ assert(op == op_t::nop);
+ op = delta.op;
+ n = delta.n;
+ oid = std::move(delta.oid);
+ onode = std::move(delta.onode);
+ keys = std::move(delta.keys);
+ cells = std::move(delta.cells);
+ delta.op = op_t::nop;
+ return *this;
+}
+
+delta_t delta_t::nop()
+{
+ return delta_t{op_t::nop};
+}
+
+delta_t delta_t::insert_onode(unsigned slot, const ghobject_t& oid, OnodeRef onode)
+{
+ delta_t delta{op_t::insert_onode};
+ delta.n = slot;
+ delta.oid = oid;
+ delta.onode = onode;
+ return delta;
+}
+
+delta_t delta_t::update_onode(unsigned slot, const ghobject_t& oid, OnodeRef onode)
+{
+ delta_t delta{op_t::update_onode};
+ delta.n = slot;
+ delta.oid = oid;
+ delta.onode = onode;
+ return delta;
+}
+
+delta_t delta_t::insert_child(unsigned slot,
+ const ghobject_t& oid,
+ crimson::os::seastore::laddr_t addr)
+{
+ delta_t delta{op_t::insert_child};
+ delta.n = slot;
+ delta.oid = oid;
+ delta.addr = addr;
+ return delta;
+}
+
+delta_t delta_t::update_key(unsigned slot, const ghobject_t& oid)
+{
+ delta_t delta{op_t::update_key};
+ delta.n = slot;
+ delta.oid = oid;
+ return delta;
+}
+
+delta_t delta_t::shift_left(unsigned n)
+{
+ delta_t delta{op_t::shift_left};
+ delta.n = n;
+ return delta;
+}
+
+delta_t delta_t::trim_right(unsigned n)
+{
+ delta_t delta{op_t::trim_right};
+ delta.n = n;
+ return delta;
+}
+
+delta_t delta_t::insert_front(ceph::buffer::ptr keys,
+ ceph::buffer::ptr cells)
+{
+ delta_t delta{op_t::insert_front};
+ delta.keys = std::move(keys);
+ delta.cells = std::move(cells);
+ return delta;
+}
+
+delta_t delta_t::insert_back(ceph::buffer::ptr keys,
+ ceph::buffer::ptr cells)
+{
+ delta_t delta{op_t::insert_back};
+ delta.keys = std::move(keys);
+ delta.cells = std::move(cells);
+ return delta;
+}
+
+delta_t delta_t::remove_from(unsigned slot)
+{
+ delta_t delta{op_t::remove_from};
+ delta.n = slot;
+ return delta;
+}
+
+void delta_t::encode(ceph::bufferlist& bl)
+{
+ using ceph::encode;
+ switch (op) {
+ case op_t::insert_onode:
+ [[fallthrough]];
+ case op_t::update_onode:
+ // the slot # is not encoded, because we can alway figure it out
+ // when we have to replay the delta by looking the oid up in the
+ // node block
+ encode(oid, bl);
+ encode(*onode, bl);
+ break;
+ case op_t::insert_child:
+ encode(oid, bl);
+ encode(addr, bl);
+ case op_t::update_key:
+ encode(n, bl);
+ encode(oid, bl);
+ break;
+ case op_t::shift_left:
+ encode(n, bl);
+ break;
+ case op_t::trim_right:
+ encode(n, bl);
+ break;
+ case op_t::insert_front:
+ [[fallthrough]];
+ case op_t::insert_back:
+ encode(n, bl);
+ encode(keys, bl);
+ encode(cells, bl);
+ break;
+ case op_t::remove_from:
+ encode(n, bl);
+ break;
+ default:
+ assert(0 == "unknown onode op");
+ }
+}
+
+void delta_t::decode(ceph::bufferlist::const_iterator& p) {
+ using ceph::decode;
+ decode(op, p);
+ switch (op) {
+ case op_t::insert_onode:
+ [[fallthrough]];
+ case op_t::update_onode:
+ decode(oid, p);
+ decode(*onode, p);
+ break;
+ case op_t::insert_child:
+ [[fallthrough]];
+ case op_t::update_key:
+ decode(n, p);
+ decode(oid, p);
+ break;
+ case op_t::shift_left:
+ decode(n, p);
+ break;
+ case op_t::trim_right:
+ decode(n, p);
+ break;
+ case op_t::insert_front:
+ [[fallthrough]];
+ case op_t::insert_back:
+ decode(n, p);
+ decode(keys, p);
+ decode(cells, p);
+ break;
+ case op_t::remove_from:
+ decode(n, p);
+ break;
+ default:
+ assert(0 == "unknown onode op");
+ }
+}
diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.h b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.h
new file mode 100644
index 000000000..3e7e7315e
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.h
@@ -0,0 +1,70 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cstdint>
+
+#include "common/hobject.h"
+#include "include/buffer_fwd.h"
+
+#include "crimson/os/seastore/onode.h"
+#include "crimson/os/seastore/seastore_types.h"
+
+using crimson::os::seastore::OnodeRef;
+
+struct delta_t {
+ enum class op_t : uint8_t {
+ nop,
+ insert_onode,
+ update_onode,
+ insert_child,
+ update_key,
+ shift_left,
+ trim_right,
+ insert_front,
+ insert_back,
+ remove_from,
+ // finer grained op?
+ // - changing the embedded extent map of given oid
+ // - mutating the embedded xattrs of given oid
+ } op = op_t::nop;
+
+ unsigned n = 0;
+ ghobject_t oid;
+ crimson::os::seastore::laddr_t addr = 0;
+ OnodeRef onode;
+ ceph::bufferptr keys;
+ ceph::bufferptr cells;
+
+ delta_t() = default;
+ delta_t(op_t op)
+ : op{op}
+ {}
+ delta_t(delta_t&& delta);
+ delta_t& operator=(delta_t&& delta);
+
+ static delta_t nop();
+ static delta_t insert_onode(unsigned slot, const ghobject_t& oid, OnodeRef onode);
+ static delta_t update_onode(unsigned slot, const ghobject_t& oid, OnodeRef onode);
+ static delta_t insert_child(unsigned slot, const ghobject_t& oid, crimson::os::seastore::laddr_t addr);
+ static delta_t update_key(unsigned slot, const ghobject_t& oid);
+ static delta_t shift_left(unsigned n);
+ static delta_t trim_right(unsigned n);
+ static delta_t insert_front(ceph::buffer::ptr keys,
+ ceph::buffer::ptr cells);
+ static delta_t insert_back(ceph::buffer::ptr keys,
+ ceph::buffer::ptr cells);
+ static delta_t remove_from(unsigned slot);
+
+ // shortcuts
+ static delta_t insert_item(unsigned slot, const ghobject_t& oid, OnodeRef onode) {
+ return insert_onode(slot, oid, onode);
+ }
+ static delta_t insert_item(unsigned slot, const ghobject_t& oid, crimson::os::seastore::laddr_t addr) {
+ return insert_child(slot, oid, addr);
+ }
+
+ void encode(ceph::bufferlist& bl);
+ void decode(ceph::bufferlist::const_iterator& p);
+};
diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.cc b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.cc
new file mode 100644
index 000000000..fdcaa2fcb
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.cc
@@ -0,0 +1,567 @@
+#include "onode_node.h"
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+auto node_t<BlockSize, N, NodeType>::key_at(unsigned slot) const
+ -> std::pair<const key_prefix_t&, const key_suffix_t&>
+{
+ auto& key = keys[slot];
+ if constexpr (item_in_key) {
+ return {key, key_suffix_t{}};
+ } else {
+ auto p = from_end(key.offset);
+ return {key, *reinterpret_cast<const key_suffix_t*>(p)};
+ }
+}
+
+// update an existing oid with the specified item
+template<size_t BlockSize, int N, ntype_t NodeType>
+ghobject_t
+node_t<BlockSize, N, NodeType>::get_oid_at(unsigned slot,
+ const ghobject_t& oid) const
+{
+ auto [prefix, suffix] = key_at(slot);
+ ghobject_t updated = oid;
+ prefix.update_oid(updated);
+ suffix.update_oid(updated);
+ return updated;
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+auto node_t<BlockSize, N, NodeType>::item_at(const key_prefix_t& key) const
+ -> const_item_t
+{
+ if constexpr (item_in_key) {
+ return key.child_addr;
+ } else {
+ assert(key.offset < BlockSize);
+ auto p = from_end(key.offset);
+ auto partial_key = reinterpret_cast<const key_suffix_t*>(p);
+ p += size_of(*partial_key);
+ return *reinterpret_cast<const item_t*>(p);
+ }
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+void node_t<BlockSize, N, NodeType>::dump(std::ostream& os) const
+{
+ for (uint16_t i = 0; i < count; i++) {
+ const auto& [prefix, suffix] = key_at(i);
+ os << " [" << i << '/' << count - 1 << "]\n"
+ << " key1 = (" << prefix << ")\n"
+ << " key2 = (" << suffix << ")\n";
+ const auto& item = item_at(prefix);
+ if (_is_leaf()) {
+ os << " item = " << item << "\n";
+ } else {
+ os << " child = " << std::hex << item << std::dec << "\n";
+ }
+ }
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+char* node_t<BlockSize, N, NodeType>::from_end(uint16_t offset)
+{
+ auto end = reinterpret_cast<char*>(this) + BlockSize;
+ return end - static_cast<int>(offset);
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+const char* node_t<BlockSize, N, NodeType>::from_end(uint16_t offset) const
+{
+ auto end = reinterpret_cast<const char*>(this) + BlockSize;
+ return end - static_cast<int>(offset);
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+uint16_t node_t<BlockSize, N, NodeType>::used_space() const
+{
+ if constexpr (item_in_key) {
+ return count * sizeof(key_prefix_t);
+ } else {
+ if (count) {
+ return keys[count - 1].offset + count * sizeof(key_prefix_t);
+ } else {
+ return 0;
+ }
+ }
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+uint16_t node_t<BlockSize, N, NodeType>::capacity()
+{
+ auto p = reinterpret_cast<node_t*>(0);
+ return BlockSize - (reinterpret_cast<char*>(p->keys) -
+ reinterpret_cast<char*>(p));
+}
+
+// TODO: if it's allowed to update 2 siblings at the same time, we can have
+// B* tree
+template<size_t BlockSize, int N, ntype_t NodeType>
+constexpr uint16_t node_t<BlockSize, N, NodeType>::min_size()
+{
+ return capacity() / 2;
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+constexpr std::pair<int16_t, int16_t>
+node_t<BlockSize, N, NodeType>::bytes_to_add(uint16_t size)
+{
+ assert(size < min_size());
+ return {min_size() - size, capacity() - size};
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+constexpr std::pair<int16_t, int16_t>
+node_t<BlockSize, N, NodeType>::bytes_to_remove(uint16_t size)
+{
+ assert(size > capacity());
+ return {size - capacity(), size - min_size()};
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+size_state_t node_t<BlockSize, N, NodeType>::size_state(uint16_t size) const
+{
+ if (size > capacity()) {
+ return size_state_t::overflow;
+ } else if (size < capacity() / 2) {
+ return size_state_t::underflow;
+ } else {
+ return size_state_t::okay;
+ }
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+bool node_t<BlockSize, N, NodeType>::is_underflow(uint16_t size) const
+{
+ switch (size_state(size)) {
+ case size_state_t::underflow:
+ return true;
+ case size_state_t::okay:
+ return false;
+ default:
+ assert(0);
+ return false;
+ }
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+int16_t node_t<BlockSize, N, NodeType>::size_with_key(unsigned slot,
+ const ghobject_t& oid) const
+{
+ if constexpr (item_in_key) {
+ return capacity();
+ } else {
+ // the size of fixed key does not change
+ [[maybe_unused]] const auto& [prefix, suffix] = key_at(slot);
+ return capacity() + key_suffix_t::size_from(oid) - suffix.size();
+ }
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+ordering_t node_t<BlockSize, N, NodeType>::compare_with_slot(unsigned slot,
+ const ghobject_t& oid) const
+{
+ const auto& [prefix, suffix] = key_at(slot);
+ if (auto result = prefix.compare(oid); result != ordering_t::equivalent) {
+ return result;
+ } else {
+ return suffix.compare(oid);
+ }
+}
+
+/// return the slot number of the first slot that is greater or equal to
+/// key
+template<size_t BlockSize, int N, ntype_t NodeType>
+std::pair<unsigned, bool> node_t<BlockSize, N, NodeType>::lower_bound(const ghobject_t& oid) const
+{
+ unsigned s = 0, e = count;
+ while (s != e) {
+ unsigned mid = (s + e) / 2;
+ switch (compare_with_slot(mid, oid)) {
+ case ordering_t::less:
+ s = ++mid;
+ break;
+ case ordering_t::greater:
+ e = mid;
+ break;
+ case ordering_t::equivalent:
+ assert(mid == 0 || mid < count);
+ return {mid, true};
+ }
+ }
+ return {s, false};
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+uint16_t node_t<BlockSize, N, NodeType>::size_of_item(const ghobject_t& oid,
+ const item_t& item)
+{
+ if constexpr (item_in_key) {
+ return sizeof(key_prefix_t);
+ } else {
+ return (sizeof(key_prefix_t) +
+ key_suffix_t::size_from(oid) + size_of(item));
+ }
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+bool node_t<BlockSize, N, NodeType>::is_overflow(const ghobject_t& oid,
+ const item_t& item) const
+{
+ return free_space() < size_of_item(oid, item);
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+bool node_t<BlockSize, N, NodeType>::is_overflow(const ghobject_t& oid,
+ const OnodeRef& item) const
+{
+ return free_space() < (sizeof(key_prefix_t) + key_suffix_t::size_from(oid) + item->size());
+}
+
+// inserts an item into the given slot, pushing all subsequent keys forward
+// @note if the item is not embedded in key, shift the right half as well
+template<size_t BlockSize, int N, ntype_t NodeType>
+void node_t<BlockSize, N, NodeType>::insert_at(unsigned slot,
+ const ghobject_t& oid,
+ const item_t& item)
+{
+ assert(!is_overflow(oid, item));
+ assert(slot <= count);
+ if constexpr (item_in_key) {
+ // shift the keys right
+ key_prefix_t* key = keys + slot;
+ key_prefix_t* last_key = keys + count;
+ std::copy_backward(key, last_key, last_key + 1);
+ key->set(oid, item);
+ } else {
+ const uint16_t size = key_suffix_t::size_from(oid) + size_of(item);
+ uint16_t offset = size;
+ if (slot > 0) {
+ offset += keys[slot - 1].offset;
+ }
+ if (slot < count) {
+ // V
+ // | |... // ...|//////|| |
+ // | |... // ...|//////| | |
+ // shift the partial keys and items left
+ auto first = keys[slot - 1].offset;
+ auto last = keys[count - 1].offset;
+ std::memmove(from_end(last + size), from_end(last), last - first);
+ // shift the keys right and update the pointers
+ for (key_prefix_t* dst = keys + count; dst > keys + slot; dst--) {
+ key_prefix_t* src = dst - 1;
+ *dst = *src;
+ dst->offset += size;
+ }
+ }
+ keys[slot].set(oid, offset);
+ auto p = from_end(offset);
+ auto partial_key = reinterpret_cast<key_suffix_t*>(p);
+ partial_key->set(oid);
+ p += size_of(*partial_key);
+ auto item_ptr = reinterpret_cast<item_t*>(p);
+ *item_ptr = item;
+ }
+ count++;
+ assert(used_space() <= capacity());
+}
+
+// used by InnerNode for updating the keys indexing its children when their lower boundaries
+// is updated
+template<size_t BlockSize, int N, ntype_t NodeType>
+void node_t<BlockSize, N, NodeType>::update_key_at(unsigned slot, const ghobject_t& oid)
+{
+ if constexpr (is_leaf()) {
+ assert(0);
+ } else if constexpr (item_in_key) {
+ keys[slot].update(oid);
+ } else {
+ const auto& [prefix, suffix] = key_at(slot);
+ int16_t delta = key_suffix_t::size_from(oid) - suffix.size();
+ if (delta > 0) {
+ // shift the cells sitting at its left side
+ auto first = keys[slot].offset;
+ auto last = keys[count - 1].offset;
+ std::memmove(from_end(last + delta), from_end(last), last - first);
+ // update the pointers
+ for (key_prefix_t* key = keys + slot; key < keys + count; key++) {
+ key->offset += delta;
+ }
+ }
+ keys[slot].update(oid);
+ auto p = from_end(keys[slot].offset);
+ auto partial_key = reinterpret_cast<key_suffix_t*>(p);
+ partial_key->set(oid);
+ // we don't update item here
+ }
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+std::pair<unsigned, uint16_t>
+node_t<BlockSize, N, NodeType>::calc_grab_front(uint16_t min_grab,
+ uint16_t max_grab) const
+{
+ // TODO: split by likeness
+ uint16_t grabbed = 0;
+ uint16_t used = used_space();
+ int n = 0;
+ for (; n < count; n++) {
+ const auto& [prefix, suffix] = key_at(n);
+ uint16_t to_grab = sizeof(prefix) + size_of(suffix);
+ if constexpr (!item_in_key) {
+ const auto& item = item_at(prefix);
+ to_grab += size_of(item);
+ }
+ if (grabbed + to_grab > max_grab) {
+ break;
+ }
+ grabbed += to_grab;
+ }
+ if (grabbed >= min_grab) {
+ if (n == count) {
+ return {n, grabbed};
+ } else if (!is_underflow(used - grabbed)) {
+ return {n, grabbed};
+ }
+ }
+ return {0, 0};
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+std::pair<unsigned, uint16_t>
+node_t<BlockSize, N, NodeType>::calc_grab_back(uint16_t min_grab,
+ uint16_t max_grab) const
+{
+ // TODO: split by likeness
+ uint16_t grabbed = 0;
+ uint16_t used = used_space();
+ for (int i = count - 1; i >= 0; i--) {
+ const auto& [prefix, suffix] = key_at(i);
+ uint16_t to_grab = sizeof(prefix) + size_of(suffix);
+ if constexpr (!item_in_key) {
+ const auto& item = item_at(prefix);
+ to_grab += size_of(item);
+ }
+ grabbed += to_grab;
+ if (is_underflow(used - grabbed)) {
+ return {0, 0};
+ } else if (grabbed > max_grab) {
+ return {0, 0};
+ } else if (grabbed >= min_grab) {
+ return {i + 1, grabbed};
+ }
+ }
+ return {0, 0};
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+template<int LeftN, class Mover>
+void node_t<BlockSize, N, NodeType>::grab_from_left(node_t<BlockSize, LeftN, NodeType>& left,
+ unsigned n, uint16_t bytes,
+ Mover& mover)
+{
+ // TODO: rebuild keys if moving across different layouts
+ // group by likeness
+ shift_right(n, bytes);
+ mover.move_from(left.count - n, 0, n);
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+template<int RightN, class Mover>
+delta_t node_t<BlockSize, N, NodeType>::acquire_right(node_t<BlockSize, RightN, NodeType>& right,
+ unsigned whoami, Mover& mover)
+{
+ mover.move_from(0, count, right.count);
+ return mover.to_delta();
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+template<int RightN, class Mover>
+void node_t<BlockSize, N, NodeType>::grab_from_right(node_t<BlockSize, RightN, NodeType>& right,
+ unsigned n, uint16_t bytes,
+ Mover& mover)
+{
+ mover.move_from(0, count, n);
+ right.shift_left(n, 0);
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+template<int LeftN, class Mover>
+void node_t<BlockSize, N, NodeType>::push_to_left(node_t<BlockSize, LeftN, NodeType>& left,
+ unsigned n, uint16_t bytes,
+ Mover& mover)
+{
+ left.grab_from_right(*this, n, bytes, mover);
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+template<int RightN, class Mover>
+void node_t<BlockSize, N, NodeType>::push_to_right(node_t<BlockSize, RightN, NodeType>& right,
+ unsigned n, uint16_t bytes,
+ Mover& mover)
+{
+ right.grab_from_left(*this, n, bytes, mover);
+}
+
+// [to, from) are removed, so we need to shift left
+// actually there are only two use cases:
+// - to = 0: for giving elements in bulk
+// - to = from - 1: for removing a single element
+// old: |////|.....| |.....|/|........|
+// new: |.....| |.....||........|
+template<size_t BlockSize, int N, ntype_t NodeType>
+void node_t<BlockSize, N, NodeType>::shift_left(unsigned from, unsigned to)
+{
+ assert(from < count);
+ assert(to < from);
+ if constexpr (item_in_key) {
+ std::copy(keys + from, keys + count, keys + to);
+ } else {
+ const uint16_t cell_hi = keys[count - 1].offset;
+ const uint16_t cell_lo = keys[from - 1].offset;
+ const uint16_t offset_delta = keys[from].offset - keys[to].offset;
+ for (auto src_key = keys + from, dst_key = keys + to;
+ src_key != keys + count;
+ ++src_key, ++dst_key) {
+ // shift the keys left
+ *dst_key = *src_key;
+ // update the pointers
+ dst_key->offset -= offset_delta;
+ }
+ // and cells
+ auto dst = from_end(cell_hi);
+ std::memmove(dst + offset_delta, dst, cell_hi - cell_lo);
+ }
+ count -= (from - to);
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+void node_t<BlockSize, N, NodeType>::insert_front(const ceph::bufferptr& keys_buf,
+ const ceph::bufferptr& cells_buf)
+{
+ unsigned n = keys_buf.length() / sizeof(key_prefix_t);
+ shift_right(n, cells_buf.length());
+ keys_buf.copy_out(0, keys_buf.length(), reinterpret_cast<char*>(keys));
+ if constexpr (item_in_key) {
+ assert(cells_buf.length() == 0);
+ } else {
+ cells_buf.copy_out(0, cells_buf.length(), from_end(keys[n - 1].offset));
+ }
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+void node_t<BlockSize, N, NodeType>::insert_back(const ceph::bufferptr& keys_buf,
+ const ceph::bufferptr& cells_buf)
+{
+ keys_buf.copy_out(0, keys_buf.length(), reinterpret_cast<char*>(keys + count));
+ count += keys_buf.length() / sizeof(key_prefix_t);
+ if constexpr (item_in_key) {
+ assert(cells_buf.length() == 0);
+ } else {
+ cells_buf.copy_out(0, cells_buf.length(), from_end(keys[count - 1].offset));
+ }
+}
+
+// one or more elements are inserted, so we need to shift the elements right
+// actually there are only two use cases:
+// - bytes != 0: for inserting bytes before from
+// - bytes = 0: for inserting a single element before from
+// old: ||.....|
+// new: |/////|.....|
+template<size_t BlockSize, int N, ntype_t NodeType>
+void node_t<BlockSize, N, NodeType>::shift_right(unsigned n, unsigned bytes)
+{
+ assert(bytes + used_space() < capacity());
+ // shift the keys left
+ std::copy_backward(keys, keys + count, keys + count + n);
+ count += n;
+ if constexpr (!item_in_key) {
+ uint16_t cells = keys[count - 1].offset;
+ // copy the partial keys and items
+ std::memmove(from_end(cells + bytes), from_end(cells), cells);
+ // update the pointers
+ for (auto key = keys + n; key < keys + count; ++key) {
+ key->offset += bytes;
+ }
+ }
+}
+
+// shift all keys after slot is removed.
+// @note if the item is not embdedded in key, all items sitting at the left
+// side of it will be shifted right
+template<size_t BlockSize, int N, ntype_t NodeType>
+void node_t<BlockSize, N, NodeType>::remove_from(unsigned slot)
+{
+ assert(slot < count);
+ if (unsigned next = slot + 1; next < count) {
+ shift_left(next, slot);
+ } else {
+ // slot is the last one
+ count--;
+ }
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+void node_t<BlockSize, N, NodeType>::trim_right(unsigned n)
+{
+ count = n;
+}
+
+template<size_t BlockSize, int N, ntype_t NodeType>
+void node_t<BlockSize, N, NodeType>::play_delta(const delta_t& delta)
+{
+ switch (delta.op) {
+ case delta_t::op_t::insert_onode:
+ if constexpr (is_leaf()) {
+ auto [slot, found] = lower_bound(delta.oid);
+ assert(!found);
+ assert(delta.onode->size() <= std::numeric_limits<unsigned>::max());
+ ceph::bufferptr buf{static_cast<unsigned>(delta.onode->size())};
+ delta.onode->encode(buf.c_str(), buf.length());
+ auto onode = reinterpret_cast<const onode_t*>(buf.c_str());
+ return insert_at(slot, delta.oid, *onode);
+ } else {
+ throw std::invalid_argument("wrong node type");
+ }
+ case delta_t::op_t::update_onode:
+ // TODO
+ assert(0 == "not implemented");
+ break;
+ case delta_t::op_t::insert_child:
+ if constexpr (is_leaf()) {
+ throw std::invalid_argument("wrong node type");
+ } else {
+ auto [slot, found] = lower_bound(delta.oid);
+ assert(!found);
+ insert_at(slot, delta.oid, delta.addr);
+ }
+ case delta_t::op_t::update_key:
+ if constexpr (is_leaf()) {
+ throw std::invalid_argument("wrong node type");
+ } else {
+ return update_key_at(delta.n, delta.oid);
+ }
+ case delta_t::op_t::shift_left:
+ return shift_left(delta.n, 0);
+ case delta_t::op_t::trim_right:
+ return trim_right(delta.n);
+ case delta_t::op_t::insert_front:
+ return insert_front(delta.keys, delta.cells);
+ case delta_t::op_t::insert_back:
+ return insert_back(delta.keys, delta.cells);
+ case delta_t::op_t::remove_from:
+ return remove_from(delta.n);
+ default:
+ assert(0 == "unknown onode delta");
+ }
+}
+
+// explicit instantiate the node_t classes used by test_node.cc
+template class node_t<512, 0, ntype_t::inner>;
+template class node_t<512, 0, ntype_t::leaf>;
+template class node_t<512, 1, ntype_t::inner>;
+template class node_t<512, 1, ntype_t::leaf>;
+template class node_t<512, 2, ntype_t::inner>;
+template class node_t<512, 2, ntype_t::leaf>;
+template class node_t<512, 3, ntype_t::inner>;
+template class node_t<512, 3, ntype_t::leaf>;
diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.h b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.h
new file mode 100644
index 000000000..d833a6682
--- /dev/null
+++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.h
@@ -0,0 +1,942 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <algorithm>
+#include <cstdint>
+#include <type_traits>
+#include <variant>
+
+#include "common/hobject.h"
+#include "crimson/common/layout.h"
+#include "crimson/os/seastore/onode.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "onode_delta.h"
+
+namespace asci = absl::container_internal;
+
+namespace boost::beast {
+ template<class T>
+ bool operator==(const span<T>& lhs, const span<T>& rhs) {
+ return std::equal(
+ lhs.begin(), lhs.end(),
+ rhs.begin(), rhs.end());
+ }
+}
+
+// on-disk onode
+// it only keeps the bits necessary to rebuild an in-memory onode
+struct [[gnu::packed]] onode_t {
+ onode_t& operator=(const onode_t& onode) {
+ len = onode.len;
+ std::memcpy(data, onode.data, len);
+ return *this;
+ }
+ size_t size() const {
+ return sizeof(*this) + len;
+ }
+ OnodeRef decode() const {
+ return new crimson::os::seastore::Onode(std::string_view{data, len});
+ }
+ uint8_t struct_v = 1;
+ uint8_t struct_compat = 1;
+ // TODO:
+ // - use uint16_t for length, as the size of an onode should be less
+ // than a block (16K for now)
+ // - drop struct_len
+ uint32_t struct_len = 0;
+ uint32_t len;
+ char data[];
+};
+
+static inline std::ostream& operator<<(std::ostream& os, const onode_t& onode) {
+ return os << *onode.decode();
+}
+
+using crimson::os::seastore::laddr_t;
+
+struct [[gnu::packed]] child_addr_t {
+ laddr_t data;
+ child_addr_t(laddr_t data)
+ : data{data}
+ {}
+ child_addr_t& operator=(laddr_t addr) {
+ data = addr;
+ return *this;
+ }
+ laddr_t get() const {
+ return data;
+ }
+ operator laddr_t() const {
+ return data;
+ }
+ size_t size() const {
+ return sizeof(laddr_t);
+ }
+};
+
+// poor man's operator<=>
+enum class ordering_t {
+ less,
+ equivalent,
+ greater,
+};
+
+template<class L, class R>
+ordering_t compare_element(const L& x, const R& y)
+{
+ if constexpr (std::is_arithmetic_v<L>) {
+ static_assert(std::is_arithmetic_v<R>);
+ if (x < y) {
+ return ordering_t::less;
+ } else if (x > y) {
+ return ordering_t::greater;
+ } else {
+ return ordering_t::equivalent;
+ }
+ } else {
+ // string_view::compare(), string::compare(), ...
+ auto result = x.compare(y);
+ if (result < 0) {
+ return ordering_t::less;
+ } else if (result > 0) {
+ return ordering_t::greater;
+ } else {
+ return ordering_t::equivalent;
+ }
+ }
+}
+
+template<typename L, typename R>
+constexpr ordering_t tuple_cmp(const L&, const R&, std::index_sequence<>)
+{
+ return ordering_t::equivalent;
+}
+
+template<typename L, typename R,
+ size_t Head, size_t... Tail>
+constexpr ordering_t tuple_cmp(const L& x, const R& y,
+ std::index_sequence<Head, Tail...>)
+{
+ auto ordering = compare_element(std::get<Head>(x), std::get<Head>(y));
+ if (ordering != ordering_t::equivalent) {
+ return ordering;
+ } else {
+ return tuple_cmp(x, y, std::index_sequence<Tail...>());
+ }
+}
+
+template<typename... Ls, typename... Rs>
+constexpr ordering_t cmp(const std::tuple<Ls...>& x,
+ const std::tuple<Rs...>& y)
+{
+ static_assert(sizeof...(Ls) == sizeof...(Rs));
+ return tuple_cmp(x, y, std::index_sequence_for<Ls...>());
+}
+
+enum class likes_t {
+ yes,
+ no,
+ maybe,
+};
+
+struct [[gnu::packed]] variable_key_suffix {
+ uint64_t snap;
+ uint64_t gen;
+ uint8_t nspace_len;
+ uint8_t name_len;
+ char data[];
+ struct index_t {
+ enum {
+ nspace_data = 0,
+ name_data = 1,
+ };
+ };
+ using layout_type = asci::Layout<char, char>;
+ layout_type cell_layout() const {
+ return layout_type{nspace_len, name_len};
+ }
+ void set(const ghobject_t& oid) {
+ snap = oid.hobj.snap;
+ gen = oid.generation;
+ nspace_len = oid.hobj.nspace.size();
+ name_len = oid.hobj.oid.name.size();
+ auto layout = cell_layout();
+ std::memcpy(layout.Pointer<index_t::nspace_data>(data),
+ oid.hobj.nspace.data(), oid.hobj.nspace.size());
+ std::memcpy(layout.Pointer<index_t::name_data>(data),
+ oid.hobj.oid.name.data(), oid.hobj.oid.name.size());
+ }
+
+ void update_oid(ghobject_t& oid) const {
+ oid.hobj.snap = snap;
+ oid.generation = gen;
+ oid.hobj.nspace = nspace();
+ oid.hobj.oid.name = name();
+ }
+
+ variable_key_suffix& operator=(const variable_key_suffix& key) {
+ snap = key.snap;
+ gen = key.gen;
+ auto layout = cell_layout();
+ auto nspace = key.nspace();
+ std::copy_n(nspace.data(), nspace.size(),
+ layout.Pointer<index_t::nspace_data>(data));
+ auto name = key.name();
+ std::copy_n(name.data(), name.size(),
+ layout.Pointer<index_t::name_data>(data));
+ return *this;
+ }
+ const std::string_view nspace() const {
+ auto layout = cell_layout();
+ auto nspace = layout.Slice<index_t::nspace_data>(data);
+ return {nspace.data(), nspace.size()};
+ }
+ const std::string_view name() const {
+ auto layout = cell_layout();
+ auto name = layout.Slice<index_t::name_data>(data);
+ return {name.data(), name.size()};
+ }
+ size_t size() const {
+ return sizeof(*this) + nspace_len + name_len;
+ }
+ static size_t size_from(const ghobject_t& oid) {
+ return (sizeof(variable_key_suffix) +
+ oid.hobj.nspace.size() +
+ oid.hobj.oid.name.size());
+ }
+ ordering_t compare(const ghobject_t& oid) const {
+ return cmp(std::tie(nspace(), name(), snap, gen),
+ std::tie(oid.hobj.nspace, oid.hobj.oid.name, oid.hobj.snap.val,
+ oid.generation));
+ }
+ bool likes(const variable_key_suffix& key) const {
+ return nspace() == key.nspace() && name() == key.name();
+ }
+};
+
+static inline std::ostream& operator<<(std::ostream& os, const variable_key_suffix& k) {
+ if (k.snap != CEPH_NOSNAP) {
+ os << "s" << k.snap << ",";
+ }
+ if (k.gen != ghobject_t::NO_GEN) {
+ os << "g" << k.gen << ",";
+ }
+ return os << k.nspace() << "/" << k.name();
+}
+
+// should use [[no_unique_address]] in C++20
+struct empty_key_suffix {
+ static constexpr ordering_t compare(const ghobject_t&) {
+ return ordering_t::equivalent;
+ }
+ static void set(const ghobject_t&) {}
+ static constexpr size_t size() {
+ return 0;
+ }
+ static size_t size_from(const ghobject_t&) {
+ return 0;
+ }
+ static void update_oid(ghobject_t&) {}
+};
+
+static inline std::ostream& operator<<(std::ostream& os, const empty_key_suffix&)
+{
+ return os;
+}
+
+enum class ntype_t : uint8_t {
+ leaf = 0u,
+ inner,
+};
+
+constexpr ntype_t flip_ntype(ntype_t ntype) noexcept
+{
+ if (ntype == ntype_t::leaf) {
+ return ntype_t::inner;
+ } else {
+ return ntype_t::leaf;
+ }
+}
+
+template<int N, ntype_t NodeType>
+struct FixedKeyPrefix {};
+
+template<ntype_t NodeType>
+struct FixedKeyPrefix<0, NodeType>
+{
+ static constexpr bool item_in_key = false;
+ int8_t shard = -1;
+ int64_t pool = -1;
+ uint32_t hash = 0;
+ uint16_t offset = 0;
+
+ FixedKeyPrefix() = default;
+ FixedKeyPrefix(const ghobject_t& oid, uint16_t offset)
+ : shard{oid.shard_id},
+ pool{oid.hobj.pool},
+ hash{oid.hobj.get_hash()},
+ offset{offset}
+ {}
+
+ void set(const ghobject_t& oid, uint16_t new_offset) {
+ shard = oid.shard_id;
+ pool = oid.hobj.pool;
+ hash = oid.hobj.get_hash();
+ offset = new_offset;
+ }
+
+ void set(const FixedKeyPrefix& k, uint16_t new_offset) {
+ shard = k.shard;
+ pool = k.pool;
+ hash = k.hash;
+ offset = new_offset;
+ }
+
+ void update(const ghobject_t& oid) {
+ shard = oid.shard_id;
+ pool = oid.hobj.pool;
+ hash = oid.hobj.get_hash();
+ }
+
+ void update_oid(ghobject_t& oid) const {
+ oid.set_shard(shard_id_t{shard});
+ oid.hobj.pool = pool;
+ oid.hobj.set_hash(hash);
+ }
+
+ ordering_t compare(const ghobject_t& oid) const {
+ // so std::tie() can bind them by reference
+ int8_t rhs_shard = oid.shard_id;
+ uint32_t rhs_hash = oid.hobj.get_hash();
+ return cmp(std::tie(shard, pool, hash),
+ std::tie(rhs_shard, oid.hobj.pool, rhs_hash));
+ }
+ // @return true if i likes @c k, we will can be pushed down to next level
+ // in the same node
+ likes_t likes(const FixedKeyPrefix& k) const {
+ if (shard == k.shard && pool == k.pool) {
+ return likes_t::yes;
+ } else {
+ return likes_t::no;
+ }
+ }
+};
+
+template<ntype_t NodeType>
+std::ostream& operator<<(std::ostream& os, const FixedKeyPrefix<0, NodeType>& k) {
+ if (k.shard != shard_id_t::NO_SHARD) {
+ os << "s" << k.shard;
+ }
+ return os << "p=" << k.pool << ","
+ << "h=" << std::hex << k.hash << std::dec << ","
+ << ">" << k.offset;
+}
+
+// all elements in this node share the same <shard, pool>
+template<ntype_t NodeType>
+struct FixedKeyPrefix<1, NodeType> {
+ static constexpr bool item_in_key = false;
+ uint32_t hash = 0;
+ uint16_t offset = 0;
+
+ FixedKeyPrefix() = default;
+ FixedKeyPrefix(uint32_t hash, uint16_t offset)
+ : hash{hash},
+ offset{offset}
+ {}
+ FixedKeyPrefix(const ghobject_t& oid, uint16_t offset)
+ : FixedKeyPrefix(oid.hobj.get_hash(), offset)
+ {}
+ void set(const ghobject_t& oid, uint16_t new_offset) {
+ hash = oid.hobj.get_hash();
+ offset = new_offset;
+ }
+ template<int N>
+ void set(const FixedKeyPrefix<N, NodeType>& k, uint16_t new_offset) {
+ static_assert(N < 2, "only N0, N1 have hash");
+ hash = k.hash;
+ offset = new_offset;
+ }
+ void update_oid(ghobject_t& oid) const {
+ oid.hobj.set_hash(hash);
+ }
+ void update(const ghobject_t& oid) {
+ hash = oid.hobj.get_hash();
+ }
+ ordering_t compare(const ghobject_t& oid) const {
+ return compare_element(hash, oid.hobj.get_hash());
+ }
+ likes_t likes(const FixedKeyPrefix& k) const {
+ return hash == k.hash ? likes_t::yes : likes_t::no;
+ }
+};
+
+template<ntype_t NodeType>
+std::ostream& operator<<(std::ostream& os, const FixedKeyPrefix<1, NodeType>& k) {
+ return os << "0x" << std::hex << k.hash << std::dec << ","
+ << ">" << k.offset;
+}
+
+// all elements in this node must share the same <shard, pool, hash>
+template<ntype_t NodeType>
+struct FixedKeyPrefix<2, NodeType> {
+ static constexpr bool item_in_key = false;
+ uint16_t offset = 0;
+
+ FixedKeyPrefix() = default;
+
+ static constexpr ordering_t compare(const ghobject_t& oid) {
+ // need to compare the cell
+ return ordering_t::equivalent;
+ }
+ // always defer to my cell for likeness
+ constexpr likes_t likes(const FixedKeyPrefix&) const {
+ return likes_t::maybe;
+ }
+ void set(const ghobject_t&, uint16_t new_offset) {
+ offset = new_offset;
+ }
+ template<int N>
+ void set(const FixedKeyPrefix<N, NodeType>&, uint16_t new_offset) {
+ offset = new_offset;
+ }
+ void update(const ghobject_t&) {}
+ void update_oid(ghobject_t&) const {}
+};
+
+template<ntype_t NodeType>
+std::ostream& operator<<(std::ostream& os, const FixedKeyPrefix<2, NodeType>& k) {
+ return os << ">" << k.offset;
+}
+
+struct fixed_key_3 {
+ uint64_t snap = 0;
+ uint64_t gen = 0;
+
+ fixed_key_3() = default;
+ fixed_key_3(const ghobject_t& oid)
+ : snap{oid.hobj.snap}, gen{oid.generation}
+ {}
+ ordering_t compare(const ghobject_t& oid) const {
+ return cmp(std::tie(snap, gen),
+ std::tie(oid.hobj.snap.val, oid.generation));
+ }
+ // no object likes each other at this level
+ constexpr likes_t likes(const fixed_key_3&) const {
+ return likes_t::no;
+ }
+ void update_with_oid(const ghobject_t& oid) {
+ snap = oid.hobj.snap;
+ gen = oid.generation;
+ }
+ void update_oid(ghobject_t& oid) const {
+ oid.hobj.snap = snap;
+ oid.generation = gen;
+ }
+};
+
+static inline std::ostream& operator<<(std::ostream& os, const fixed_key_3& k) {
+ if (k.snap != CEPH_NOSNAP) {
+ os << "s" << k.snap << ",";
+ }
+ if (k.gen != ghobject_t::NO_GEN) {
+ os << "g" << k.gen << ",";
+ }
+ return os;
+}
+
+// all elements in this node must share the same <shard, pool, hash, namespace, oid>
+// but the unlike other FixedKeyPrefix<>, a node with FixedKeyPrefix<3> does not have
+// variable_sized_key, so if it is an inner node, we can just embed the child
+// addr right in the key.
+template<>
+struct FixedKeyPrefix<3, ntype_t::inner> : public fixed_key_3 {
+ // the item is embedded in the key
+ static constexpr bool item_in_key = true;
+ laddr_t child_addr = 0;
+
+ FixedKeyPrefix() = default;
+ void set(const ghobject_t& oid, laddr_t new_child_addr) {
+ update_with_oid(oid);
+ child_addr = new_child_addr;
+ }
+ // unlikely get called, though..
+ void update(const ghobject_t& oid) {}
+ template<int N>
+ std::enable_if_t<N < 3> set(const FixedKeyPrefix<N, ntype_t::inner>&,
+ laddr_t new_child_addr) {
+ child_addr = new_child_addr;
+ }
+ void set(const FixedKeyPrefix& k, laddr_t new_child_addr) {
+ snap = k.snap;
+ gen = k.gen;
+ child_addr = new_child_addr;
+ }
+ void set(const variable_key_suffix& k, laddr_t new_child_addr) {
+ snap = k.snap;
+ gen = k.gen;
+ child_addr = new_child_addr;
+ }
+};
+
+template<>
+struct FixedKeyPrefix<3, ntype_t::leaf> : public fixed_key_3 {
+ static constexpr bool item_in_key = false;
+ uint16_t offset = 0;
+
+ FixedKeyPrefix() = default;
+ void set(const ghobject_t& oid, uint16_t new_offset) {
+ update_with_oid(oid);
+ offset = new_offset;
+ }
+ void set(const FixedKeyPrefix& k, uint16_t new_offset) {
+ snap = k.snap;
+ gen = k.gen;
+ offset = new_offset;
+ }
+ template<int N>
+ std::enable_if_t<N < 3> set(const FixedKeyPrefix<N, ntype_t::leaf>&,
+ uint16_t new_offset) {
+ offset = new_offset;
+ }
+};
+
+struct tag_t {
+ template<int N, ntype_t node_type>
+ static constexpr tag_t create() {
+ static_assert(std::clamp(N, 0, 3) == N);
+ return tag_t{N, static_cast<uint8_t>(node_type)};
+ }
+ bool is_leaf() const {
+ return type() == ntype_t::leaf;
+ }
+ int layout() const {
+ return layout_type;
+ }
+ ntype_t type() const {
+ return ntype_t{node_type};
+ }
+ int layout_type : 4;
+ uint8_t node_type : 4;
+};
+
+static inline std::ostream& operator<<(std::ostream& os, const tag_t& tag) {
+ return os << "n=" << tag.layout() << ", leaf=" << tag.is_leaf();
+}
+
+// for calculating size of variable-sized item/key
+template<class T>
+size_t size_of(const T& t) {
+ using decayed_t = std::decay_t<T>;
+ if constexpr (std::is_scalar_v<decayed_t>) {
+ return sizeof(decayed_t);
+ } else {
+ return t.size();
+ }
+}
+
+enum class size_state_t {
+ okay,
+ underflow,
+ overflow,
+};
+
+// layout of a node of B+ tree
+//
+// it is different from a typical B+ tree in following ways
+// - the size of keys is not necessarily fixed, neither is the size of value.
+// - the max number of elements in a node is determined by the total size of
+// the keys and values in the node
+// - in internal nodes, each key maps to the logical address of the child
+// node whose minimum key is greater or equal to that key.
+template<size_t BlockSize,
+ int N,
+ ntype_t NodeType>
+struct node_t {
+ static_assert(std::clamp(N, 0, 3) == N);
+ constexpr static ntype_t node_type = NodeType;
+ constexpr static int node_n = N;
+
+ using key_prefix_t = FixedKeyPrefix<N, NodeType>;
+ using item_t = std::conditional_t<NodeType == ntype_t::leaf,
+ onode_t,
+ child_addr_t>;
+ using const_item_t = std::conditional_t<NodeType == ntype_t::leaf,
+ const onode_t&,
+ child_addr_t>;
+ static constexpr bool item_in_key = key_prefix_t::item_in_key;
+ using key_suffix_t = std::conditional_t<N < 3,
+ variable_key_suffix,
+ empty_key_suffix>;
+
+ std::pair<const key_prefix_t&, const key_suffix_t&>
+ key_at(unsigned slot) const;
+
+ // update an existing oid with the specified item
+ ghobject_t get_oid_at(unsigned slot, const ghobject_t& oid) const;
+ const_item_t item_at(const key_prefix_t& key) const;
+ void dump(std::ostream& os) const;
+
+ // for debugging only.
+ static constexpr bool is_leaf() {
+ return node_type == ntype_t::leaf;
+ }
+
+ bool _is_leaf() const {
+ return tag.is_leaf();
+ }
+
+ char* from_end(uint16_t offset);
+ const char* from_end(uint16_t offset) const;
+ uint16_t used_space() const;
+ uint16_t free_space() const {
+ return capacity() - used_space();
+ }
+ static uint16_t capacity();
+ // TODO: if it's allowed to update 2 siblings at the same time, we can have
+ // B* tree
+ static constexpr uint16_t min_size();
+
+
+ // calculate the allowable bounds on bytes to remove from an overflow node
+ // with specified size
+ // @param size the overflowed size
+ // @return <minimum bytes to grab, maximum bytes to grab>
+ static constexpr std::pair<int16_t, int16_t> bytes_to_remove(uint16_t size);
+
+ // calculate the allowable bounds on bytes to add to an underflow node
+ // with specified size
+ // @param size the underflowed size
+ // @return <minimum bytes to push, maximum bytes to push>
+ static constexpr std::pair<int16_t, int16_t> bytes_to_add(uint16_t size);
+
+ size_state_t size_state(uint16_t size) const;
+ bool is_underflow(uint16_t size) const;
+ int16_t size_with_key(unsigned slot, const ghobject_t& oid) const;
+ ordering_t compare_with_slot(unsigned slot, const ghobject_t& oid) const;
+ /// return the slot number of the first slot that is greater or equal to
+ /// key
+ std::pair<unsigned, bool> lower_bound(const ghobject_t& oid) const;
+ static uint16_t size_of_item(const ghobject_t& oid, const item_t& item);
+ bool is_overflow(const ghobject_t& oid, const item_t& item) const;
+ bool is_overflow(const ghobject_t& oid, const OnodeRef& item) const;
+
+ // inserts an item into the given slot, pushing all subsequent keys forward
+ // @note if the item is not embedded in key, shift the right half as well
+ void insert_at(unsigned slot, const ghobject_t& oid, const item_t& item);
+ // used by InnerNode for updating the keys indexing its children when their lower boundaries
+ // is updated
+ void update_key_at(unsigned slot, const ghobject_t& oid);
+ // try to figure out the number of elements and total size when trying to
+ // rebalance by moving the elements from the front of this node when its
+ // left sibling node is underflow
+ //
+ // @param min_grab lower bound of the number of bytes to move
+ // @param max_grab upper bound of the number of bytes to move
+ // @return the number of element to grab
+ // @note return {0, 0} if current node would be underflow if
+ // @c min_grab bytes of elements are taken from it
+ std::pair<unsigned, uint16_t> calc_grab_front(uint16_t min_grab, uint16_t max_grab) const;
+ // try to figure out the number of elements and their total size when trying to
+ // rebalance by moving the elements from the end of this node when its right
+ // sibling node is underflow
+ //
+ // @param min_grab lower bound of the number of bytes to move
+ // @param max_grab upper bound of the number of bytes to move
+ // @return the number of element to grab
+ // @note return {0, 0} if current node would be underflow if
+ // @c min_grab bytes of elements are taken from it
+ std::pair<unsigned, uint16_t> calc_grab_back(uint16_t min_grab, uint16_t max_grab) const;
+ template<int LeftN, class Mover> void grab_from_left(
+ node_t<BlockSize, LeftN, NodeType>& left,
+ unsigned n, uint16_t bytes,
+ Mover& mover);
+ template<int RightN, class Mover>
+ delta_t acquire_right(node_t<BlockSize, RightN, NodeType>& right,
+ unsigned whoami, Mover& mover);
+ // transfer n elements at the front of given node to me
+ template<int RightN, class Mover>
+ void grab_from_right(node_t<BlockSize, RightN, NodeType>& right,
+ unsigned n, uint16_t bytes,
+ Mover& mover);
+ template<int LeftN, class Mover>
+ void push_to_left(node_t<BlockSize, LeftN, NodeType>& left,
+ unsigned n, uint16_t bytes,
+ Mover& mover);
+ template<int RightN, class Mover>
+ void push_to_right(node_t<BlockSize, RightN, NodeType>& right,
+ unsigned n, uint16_t bytes,
+ Mover& mover);
+ // [to, from) are removed, so we need to shift left
+ // actually there are only two use cases:
+ // - to = 0: for giving elements in bulk
+ // - to = from - 1: for removing a single element
+ // old: |////|.....| |.....|/|........|
+ // new: |.....| |.....||........|
+ void shift_left(unsigned from, unsigned to);
+ void insert_front(const ceph::bufferptr& keys_buf, const ceph::bufferptr& cells_buf);
+ void insert_back(const ceph::bufferptr& keys_buf, const ceph::bufferptr& cells_buf);
+ // one or more elements are inserted, so we need to shift the elements right
+ // actually there are only two use cases:
+ // - bytes != 0: for inserting bytes before from
+ // - bytes = 0: for inserting a single element before from
+ // old: ||.....|
+ // new: |/////|.....|
+ void shift_right(unsigned n, unsigned bytes);
+ // shift all keys after slot is removed.
+ // @note if the item is not embdedded in key, all items sitting at the left
+ // side of it will be shifted right
+ void remove_from(unsigned slot);
+ void trim_right(unsigned n);
+ void play_delta(const delta_t& delta);
+ // /-------------------------------|
+ // | V
+ // |header|k0|k1|k2|... | / / |k2'v2|k1'v1|k0'.v0| v_m |
+ // |<-- count -->|
+ tag_t tag = tag_t::create<N, NodeType>();
+ // the count of values in the node
+ uint16_t count = 0;
+ key_prefix_t keys[];
+};
+
+template<class parent_t,
+ class from_t,
+ class to_t,
+ typename=void>
+class EntryMover {
+public:
+ // a "trap" mover
+ EntryMover(const parent_t&, from_t&, to_t& dst, unsigned) {
+ assert(0);
+ }
+ void move_from(unsigned, unsigned, unsigned) {
+ assert(0);
+ }
+ delta_t get_delta() {
+ return delta_t::nop();
+ }
+};
+
+// lower the layout, for instance, from L0 to L1, no reference oid is used
+template<class parent_t,
+ class from_t,
+ class to_t>
+class EntryMover<parent_t,
+ from_t,
+ to_t,
+ std::enable_if_t<from_t::node_n < to_t::node_n>>
+{
+public:
+ EntryMover(const parent_t&, from_t& src, to_t& dst, unsigned)
+ : src{src}, dst{dst}
+ {}
+ void move_from(unsigned src_first, unsigned dst_first, unsigned n)
+ {
+ ceph::bufferptr keys_buf{n * sizeof(to_t::key_prefix_t)};
+ ceph::bufferptr cells_buf;
+ auto dst_keys = reinterpret_cast<typename to_t::key_prefix_t*>(keys_buf.c_str());
+ if constexpr (to_t::item_in_key) {
+ for (unsigned i = 0; i < n; i++) {
+ const auto& [prefix, suffix] = src.key_at(src_first + i);
+ dst_keys[i].set(suffix, src.item_at(prefix));
+ }
+ } else {
+ // copy keys
+ uint16_t src_offset = src_first > 0 ? src.keys[src_first - 1].offset : 0;
+ uint16_t dst_offset = dst_first > 0 ? dst.keys[dst_first - 1].offset : 0;
+ for (unsigned i = 0; i < n; i++) {
+ auto& src_key = src.keys[src_first + i];
+ uint16_t offset = src_key.offset - src_offset + dst_offset;
+ dst_keys[i].set(src_key, offset);
+ }
+ // copy cells in bulk, yay!
+ auto src_end = src.keys[src_first + n - 1].offset;
+ uint16_t total_cell_size = src_end - src_offset;
+ cells_buf = ceph::bufferptr{total_cell_size};
+ cells_buf.copy_in(0, total_cell_size, src.from_end(src_end));
+ }
+ if (dst_first == dst.count) {
+ dst_delta = delta_t::insert_back(keys_buf, cells_buf);
+ } else {
+ dst_delta = delta_t::insert_front(keys_buf, cells_buf);
+ }
+ if (src_first > 0 && src_first + n == src.count) {
+ src_delta = delta_t::trim_right(src_first);
+ } else if (src_first == 0 && n < src.count) {
+ src_delta = delta_t::shift_left(n);
+ } else if (src_first == 0 && n == src.count) {
+ // the caller will retire the src extent
+ } else {
+ // grab in the middle?
+ assert(0);
+ }
+ }
+
+ delta_t from_delta() {
+ return std::move(src_delta);
+ }
+ delta_t to_delta() {
+ return std::move(dst_delta);
+ }
+private:
+ const from_t& src;
+ const to_t& dst;
+ delta_t dst_delta;
+ delta_t src_delta;
+};
+
+// lift the layout, for instance, from L2 to L0, need a reference oid
+template<class parent_t,
+ class from_t,
+ class to_t>
+class EntryMover<parent_t, from_t, to_t,
+ std::enable_if_t<(from_t::node_n > to_t::node_n)>>
+{
+public:
+ EntryMover(const parent_t& parent, from_t& src, to_t& dst, unsigned from_slot)
+ : src{src}, dst{dst}, ref_oid{parent->get_oid_at(from_slot, {})}
+ {}
+ void move_from(unsigned src_first, unsigned dst_first, unsigned n)
+ {
+ ceph::bufferptr keys_buf{n * sizeof(to_t::key_prefix_t)};
+ ceph::bufferptr cells_buf;
+ auto dst_keys = reinterpret_cast<typename to_t::key_prefix_t*>(keys_buf.c_str());
+ uint16_t in_node_offset = dst_first > 0 ? dst.keys[dst_first - 1].offset : 0;
+ static_assert(!std::is_same_v<typename to_t::key_suffix_t, empty_key_suffix>);
+ // copy keys
+ uint16_t buf_offset = 0;
+ for (unsigned i = 0; i < n; i++) {
+ auto& src_key = src.keys[src_first + i];
+ if constexpr (std::is_same_v<typename from_t::key_suffix_t, empty_key_suffix>) {
+ // heterogeneous partial key, have to rebuild dst partial key from oid
+ src_key.update_oid(ref_oid);
+ const auto& src_item = src.item_at(src_key);
+ size_t key2_size = to_t::key_suffix_t::size_from(ref_oid);
+ buf_offset += key2_size + size_of(src_item);
+ dst_keys[i].set(ref_oid, in_node_offset + buf_offset);
+ auto p = from_end(cells_buf, buf_offset);
+ auto partial_key = reinterpret_cast<typename to_t::key_suffix_t*>(p);
+ partial_key->set(ref_oid);
+ p += key2_size;
+ auto dst_item = reinterpret_cast<typename to_t::item_t*>(p);
+ *dst_item = src_item;
+ } else {
+ // homogeneous partial key, just update the pointers
+ uint16_t src_offset = src_first > 0 ? src.keys[src_first - 1].offset : 0;
+ uint16_t dst_offset = dst_first > 0 ? dst.keys[dst_first - 1].offset : 0;
+ uint16_t offset = src_key.offset - src_offset + dst_offset;
+ dst_keys[i].set(ref_oid, in_node_offset + offset);
+ }
+ }
+ if constexpr (std::is_same_v<typename to_t::key_suffix_t,
+ typename from_t::key_suffix_t>) {
+ // copy cells in bulk, yay!
+ uint16_t src_offset = src_first > 0 ? src.keys[src_first - 1].offset : 0;
+ uint16_t src_end = src.keys[src_first + n - 1].offset;
+ uint16_t total_cell_size = src_end - src_offset;
+ cells_buf.copy_in(0, total_cell_size, src.from_end(src_end));
+ }
+ if (dst_first == dst.count) {
+ dst_delta = delta_t::insert_back(keys_buf, cells_buf);
+ } else {
+ dst_delta = delta_t::insert_front(keys_buf, cells_buf);
+ }
+ if (src_first + n == src.count && src_first > 0) {
+ src_delta = delta_t::trim_right(src_first);
+ } else {
+ // the caller will retire the src extent
+ assert(src_first == 0);
+ }
+ }
+
+ delta_t from_delta() {
+ return std::move(src_delta);
+ }
+ delta_t to_delta() {
+ return std::move(dst_delta);
+ }
+private:
+ char* from_end(ceph::bufferptr& ptr, uint16_t offset) {
+ return ptr.end_c_str() - static_cast<int>(offset);
+ }
+private:
+ const from_t& src;
+ const to_t& dst;
+ delta_t dst_delta;
+ delta_t src_delta;
+ ghobject_t ref_oid;
+};
+
+// identical layout, yay!
+template<class parent_t,
+ class child_t>
+class EntryMover<parent_t, child_t, child_t>
+{
+public:
+ EntryMover(const parent_t&, child_t& src, child_t& dst, unsigned)
+ : src{src}, dst{dst}
+ {}
+
+ void move_from(unsigned src_first, unsigned dst_first, unsigned n)
+ {
+ ceph::bufferptr keys_buf{static_cast<unsigned>(n * sizeof(typename child_t::key_prefix_t))};
+ ceph::bufferptr cells_buf;
+ auto dst_keys = reinterpret_cast<typename child_t::key_prefix_t*>(keys_buf.c_str());
+
+ // copy keys
+ std::copy(src.keys + src_first, src.keys + src_first + n,
+ dst_keys);
+ if constexpr (!child_t::item_in_key) {
+ uint16_t src_offset = src_first > 0 ? src.keys[src_first - 1].offset : 0;
+ uint16_t dst_offset = dst_first > 0 ? dst.keys[dst_first - 1].offset : 0;
+ const int offset_delta = dst_offset - src_offset;
+ // update the pointers
+ for (unsigned i = 0; i < n; i++) {
+ dst_keys[i].offset += offset_delta;
+ }
+ // copy cells in bulk, yay!
+ auto src_end = src.keys[src_first + n - 1].offset;
+ uint16_t total_cell_size = src_end - src_offset;
+ cells_buf = ceph::bufferptr{total_cell_size};
+ cells_buf.copy_in(0, total_cell_size, src.from_end(src_end));
+ }
+ if (dst_first == dst.count) {
+ dst_delta = delta_t::insert_back(std::move(keys_buf), std::move(cells_buf));
+ } else {
+ dst_delta = delta_t::insert_front(std::move(keys_buf), std::move(cells_buf));
+ }
+ if (src_first + n == src.count && src_first > 0) {
+ src_delta = delta_t::trim_right(n);
+ } else if (src_first == 0 && n < src.count) {
+ src_delta = delta_t::shift_left(n);
+ } else if (src_first == 0 && n == src.count) {
+ // the caller will retire the src extent
+ } else {
+ // grab in the middle?
+ assert(0);
+ }
+ }
+
+ delta_t from_delta() {
+ return std::move(src_delta);
+ }
+
+ delta_t to_delta() {
+ return std::move(dst_delta);
+ }
+private:
+ char* from_end(ceph::bufferptr& ptr, uint16_t offset) {
+ return ptr.end_c_str() - static_cast<int>(offset);
+ }
+private:
+ const child_t& src;
+ const child_t& dst;
+ delta_t src_delta;
+ delta_t dst_delta;
+};
+
+template<class parent_t, class from_t, class to_t>
+EntryMover<parent_t, from_t, to_t>
+make_mover(const parent_t& parent, from_t& src, to_t& dst, unsigned from_slot) {
+ return EntryMover<parent_t, from_t, to_t>(parent, src, dst, from_slot);
+}