summaryrefslogtreecommitdiffstats
path: root/src/crimson/os/seastore/omap_manager
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
commite6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree64f88b554b444a49f656b6c656111a145cbbaa28 /src/crimson/os/seastore/omap_manager
parentInitial commit. (diff)
downloadceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz
ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/crimson/os/seastore/omap_manager')
-rw-r--r--src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.cc293
-rw-r--r--src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.h111
-rw-r--r--src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h122
-rw-r--r--src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc738
-rw-r--r--src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h250
-rw-r--r--src/crimson/os/seastore/omap_manager/btree/omap_types.h157
-rw-r--r--src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h1550
7 files changed, 3221 insertions, 0 deletions
diff --git a/src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.cc b/src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.cc
new file mode 100644
index 000000000..1782d7ee6
--- /dev/null
+++ b/src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.cc
@@ -0,0 +1,293 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <string.h>
+
+#include "crimson/common/log.h"
+
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/omap_manager/btree/btree_omap_manager.h"
+#include "crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h"
+
+SET_SUBSYS(seastore_omap);
+
+namespace crimson::os::seastore::omap_manager {
+
+BtreeOMapManager::BtreeOMapManager(
+ TransactionManager &tm)
+ : tm(tm) {}
+
+BtreeOMapManager::initialize_omap_ret
+BtreeOMapManager::initialize_omap(Transaction &t, laddr_t hint)
+{
+ LOG_PREFIX(BtreeOMapManager::initialize_omap);
+ DEBUGT("hint: {}", t, hint);
+ return tm.alloc_extent<OMapLeafNode>(t, hint, OMAP_LEAF_BLOCK_SIZE)
+ .si_then([hint, &t](auto&& root_extent) {
+ root_extent->set_size(0);
+ omap_node_meta_t meta{1};
+ root_extent->set_meta(meta);
+ omap_root_t omap_root;
+ omap_root.update(root_extent->get_laddr(), 1, hint);
+ t.get_omap_tree_stats().depth = 1u;
+ t.get_omap_tree_stats().extents_num_delta++;
+ return initialize_omap_iertr::make_ready_future<omap_root_t>(omap_root);
+ });
+}
+
+BtreeOMapManager::get_root_ret
+BtreeOMapManager::get_omap_root(omap_context_t oc, const omap_root_t &omap_root)
+{
+ assert(omap_root.get_location() != L_ADDR_NULL);
+ laddr_t laddr = omap_root.get_location();
+ return omap_load_extent(oc, laddr, omap_root.get_depth());
+}
+
+BtreeOMapManager::handle_root_split_ret
+BtreeOMapManager::handle_root_split(
+ omap_context_t oc,
+ omap_root_t &omap_root,
+ const OMapNode::mutation_result_t& mresult)
+{
+ LOG_PREFIX(BtreeOMapManager::handle_root_split);
+ DEBUGT("{}", oc.t, omap_root);
+ return oc.tm.alloc_extent<OMapInnerNode>(oc.t, omap_root.hint,
+ OMAP_INNER_BLOCK_SIZE)
+ .si_then([&omap_root, mresult, oc](auto&& nroot) -> handle_root_split_ret {
+ auto [left, right, pivot] = *(mresult.split_tuple);
+ omap_node_meta_t meta{omap_root.depth + 1};
+ nroot->set_meta(meta);
+ nroot->journal_inner_insert(nroot->iter_begin(), left->get_laddr(),
+ "", nroot->maybe_get_delta_buffer());
+ nroot->journal_inner_insert(nroot->iter_begin() + 1, right->get_laddr(),
+ pivot, nroot->maybe_get_delta_buffer());
+ omap_root.update(nroot->get_laddr(), omap_root.get_depth() + 1, omap_root.hint);
+ oc.t.get_omap_tree_stats().depth = omap_root.depth;
+ ++(oc.t.get_omap_tree_stats().extents_num_delta);
+ return seastar::now();
+ });
+}
+
+BtreeOMapManager::handle_root_merge_ret
+BtreeOMapManager::handle_root_merge(
+ omap_context_t oc,
+ omap_root_t &omap_root,
+ OMapNode::mutation_result_t mresult)
+{
+ LOG_PREFIX(BtreeOMapManager::handle_root_merge);
+ DEBUGT("{}", oc.t, omap_root);
+ auto root = *(mresult.need_merge);
+ auto iter = root->cast<OMapInnerNode>()->iter_begin();
+ omap_root.update(
+ iter->get_val(),
+ omap_root.depth -= 1,
+ omap_root.hint);
+ oc.t.get_omap_tree_stats().depth = omap_root.depth;
+ oc.t.get_omap_tree_stats().extents_num_delta--;
+ return oc.tm.dec_ref(oc.t, root->get_laddr()
+ ).si_then([](auto &&ret) -> handle_root_merge_ret {
+ return seastar::now();
+ }).handle_error_interruptible(
+ handle_root_merge_iertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "Invalid error in handle_root_merge"
+ }
+ );
+}
+
+BtreeOMapManager::omap_get_value_ret
+BtreeOMapManager::omap_get_value(
+ const omap_root_t &omap_root,
+ Transaction &t,
+ const std::string &key)
+{
+ LOG_PREFIX(BtreeOMapManager::omap_get_value);
+ DEBUGT("key={}", t, key);
+ return get_omap_root(
+ get_omap_context(t, omap_root.hint),
+ omap_root
+ ).si_then([this, &t, &key, &omap_root](auto&& extent) {
+ return extent->get_value(get_omap_context(t, omap_root.hint), key);
+ }).si_then([](auto &&e) {
+ return omap_get_value_ret(
+ interruptible::ready_future_marker{},
+ std::move(e));
+ });
+}
+
+BtreeOMapManager::omap_set_keys_ret
+BtreeOMapManager::omap_set_keys(
+ omap_root_t &omap_root,
+ Transaction &t,
+ std::map<std::string, ceph::bufferlist>&& keys)
+{
+ return seastar::do_with(std::move(keys), [&, this](auto& keys) {
+ return trans_intr::do_for_each(
+ keys.begin(),
+ keys.end(),
+ [&, this](auto &p) {
+ return omap_set_key(omap_root, t, p.first, p.second);
+ });
+ });
+}
+
+BtreeOMapManager::omap_set_key_ret
+BtreeOMapManager::omap_set_key(
+ omap_root_t &omap_root,
+ Transaction &t,
+ const std::string &key,
+ const ceph::bufferlist &value)
+{
+ LOG_PREFIX(BtreeOMapManager::omap_set_key);
+ DEBUGT("{} -> {}", t, key, value);
+ return get_omap_root(
+ get_omap_context(t, omap_root.hint),
+ omap_root
+ ).si_then([this, &t, &key, &value, &omap_root](auto root) {
+ return root->insert(get_omap_context(t, omap_root.hint), key, value);
+ }).si_then([this, &omap_root, &t](auto mresult) -> omap_set_key_ret {
+ if (mresult.status == mutation_status_t::SUCCESS)
+ return seastar::now();
+ else if (mresult.status == mutation_status_t::WAS_SPLIT)
+ return handle_root_split(get_omap_context(t, omap_root.hint), omap_root, mresult);
+ else
+ return seastar::now();
+ });
+}
+
+BtreeOMapManager::omap_rm_key_ret
+BtreeOMapManager::omap_rm_key(
+ omap_root_t &omap_root,
+ Transaction &t,
+ const std::string &key)
+{
+ LOG_PREFIX(BtreeOMapManager::omap_rm_key);
+ DEBUGT("{}", t, key);
+ return get_omap_root(
+ get_omap_context(t, omap_root.hint),
+ omap_root
+ ).si_then([this, &t, &key, &omap_root](auto root) {
+ return root->rm_key(get_omap_context(t, omap_root.hint), key);
+ }).si_then([this, &omap_root, &t](auto mresult) -> omap_rm_key_ret {
+ if (mresult.status == mutation_status_t::SUCCESS) {
+ return seastar::now();
+ } else if (mresult.status == mutation_status_t::WAS_SPLIT) {
+ return handle_root_split(get_omap_context(t, omap_root.hint), omap_root, mresult);
+ } else if (mresult.status == mutation_status_t::NEED_MERGE) {
+ auto root = *(mresult.need_merge);
+ if (root->get_node_size() == 1 && omap_root.depth != 1) {
+ return handle_root_merge(get_omap_context(t, omap_root.hint), omap_root, mresult);
+ } else {
+ return seastar::now();
+ }
+ } else {
+ return seastar::now();
+ }
+ });
+
+}
+
+BtreeOMapManager::omap_rm_key_range_ret
+BtreeOMapManager::omap_rm_key_range(
+ omap_root_t &omap_root,
+ Transaction &t,
+ const std::string &first,
+ const std::string &last,
+ omap_list_config_t config)
+{
+ LOG_PREFIX(BtreeOMapManager::omap_rm_key_range);
+ DEBUGT("{} ~ {}", t, first, last);
+ assert(first <= last);
+ return seastar::do_with(
+ std::make_optional<std::string>(first),
+ std::make_optional<std::string>(last),
+ [this, &omap_root, &t, config](auto &first, auto &last) {
+ return omap_list(
+ omap_root,
+ t,
+ first,
+ last,
+ config);
+ }).si_then([this, &omap_root, &t](auto results) {
+ LOG_PREFIX(BtreeOMapManager::omap_rm_key_range);
+ auto &[complete, kvs] = results;
+ std::vector<std::string> keys;
+ for (const auto& [k, _] : kvs) {
+ keys.push_back(k);
+ }
+ DEBUGT("total {} keys to remove", t, keys.size());
+ return seastar::do_with(
+ std::move(keys),
+ [this, &omap_root, &t](auto& keys) {
+ return trans_intr::do_for_each(
+ keys.begin(),
+ keys.end(),
+ [this, &omap_root, &t](auto& key) {
+ return omap_rm_key(omap_root, t, key);
+ });
+ });
+ });
+}
+
+BtreeOMapManager::omap_list_ret
+BtreeOMapManager::omap_list(
+ const omap_root_t &omap_root,
+ Transaction &t,
+ const std::optional<std::string> &first,
+ const std::optional<std::string> &last,
+ omap_list_config_t config)
+{
+ LOG_PREFIX(BtreeOMapManager::omap_list);
+ if (first && last) {
+ DEBUGT("{}, first: {}, last: {}", t, omap_root, *first, *last);
+ assert(last >= first);
+ } else if (first) {
+ DEBUGT("{}, first: {}", t, omap_root, *first);
+ } else if (last) {
+ DEBUGT("{}, last: {}", t, omap_root, *last);
+ } else {
+ DEBUGT("{}", t, omap_root);
+ }
+
+ return get_omap_root(
+ get_omap_context(t, omap_root.hint),
+ omap_root
+ ).si_then([this, config, &t, &first, &last, &omap_root](auto extent) {
+ return extent->list(
+ get_omap_context(t, omap_root.hint),
+ first,
+ last,
+ config);
+ });
+}
+
+BtreeOMapManager::omap_clear_ret
+BtreeOMapManager::omap_clear(
+ omap_root_t &omap_root,
+ Transaction &t)
+{
+ LOG_PREFIX(BtreeOMapManager::omap_clear);
+ DEBUGT("{}", t, omap_root);
+ return get_omap_root(
+ get_omap_context(t, omap_root.hint),
+ omap_root
+ ).si_then([this, &t, &omap_root](auto extent) {
+ return extent->clear(get_omap_context(t, omap_root.hint));
+ }).si_then([this, &omap_root, &t] {
+ return tm.dec_ref(
+ t, omap_root.get_location()
+ ).si_then([&omap_root] (auto ret) {
+ omap_root.update(
+ L_ADDR_NULL,
+ 0, L_ADDR_MIN);
+ return omap_clear_iertr::now();
+ });
+ }).handle_error_interruptible(
+ omap_clear_iertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "Invalid error in BtreeOMapManager::omap_clear"
+ }
+ );
+}
+
+}
diff --git a/src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.h b/src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.h
new file mode 100644
index 000000000..7fcba64c0
--- /dev/null
+++ b/src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.h
@@ -0,0 +1,111 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <seastar/core/future.hh>
+
+#include "include/ceph_assert.h"
+#include "crimson/osd/exceptions.h"
+
+#include "crimson/os/seastore/omap_manager.h"
+#include "crimson/os/seastore/omap_manager/btree/omap_btree_node.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/transaction_manager.h"
+
+namespace crimson::os::seastore::omap_manager {
+/**
+ * BtreeOMapManager
+ *
+ * Uses a btree to track :
+ * string -> string mapping for each onode omap
+ */
+
+class BtreeOMapManager : public OMapManager {
+ TransactionManager &tm;
+
+ omap_context_t get_omap_context(
+ Transaction &t, laddr_t addr_min) {
+ return omap_context_t{tm, t, addr_min};
+ }
+
+ /* get_omap_root
+ *
+ * load omap tree root node
+ */
+ using get_root_iertr = base_iertr;
+ using get_root_ret = get_root_iertr::future<OMapNodeRef>;
+ static get_root_ret get_omap_root(
+ omap_context_t c,
+ const omap_root_t &omap_root);
+
+ /* handle_root_split
+ *
+ * root has been split and needs to update omap_root_t
+ */
+ using handle_root_split_iertr = base_iertr;
+ using handle_root_split_ret = handle_root_split_iertr::future<>;
+ handle_root_split_ret handle_root_split(
+ omap_context_t c,
+ omap_root_t &omap_root,
+ const OMapNode::mutation_result_t& mresult);
+
+ /* handle_root_merge
+ *
+ * root node has only one item and it is not leaf node, need remove a layer
+ */
+ using handle_root_merge_iertr = base_iertr;
+ using handle_root_merge_ret = handle_root_merge_iertr::future<>;
+ handle_root_merge_ret handle_root_merge(
+ omap_context_t oc,
+ omap_root_t &omap_root,
+ OMapNode:: mutation_result_t mresult);
+
+public:
+ explicit BtreeOMapManager(TransactionManager &tm);
+
+ initialize_omap_ret initialize_omap(Transaction &t, laddr_t hint) final;
+
+ omap_get_value_ret omap_get_value(
+ const omap_root_t &omap_root,
+ Transaction &t,
+ const std::string &key) final;
+
+ omap_set_key_ret omap_set_key(
+ omap_root_t &omap_root,
+ Transaction &t,
+ const std::string &key, const ceph::bufferlist &value) final;
+
+ omap_set_keys_ret omap_set_keys(
+ omap_root_t &omap_root,
+ Transaction &t,
+ std::map<std::string, ceph::bufferlist>&& keys) final;
+
+ omap_rm_key_ret omap_rm_key(
+ omap_root_t &omap_root,
+ Transaction &t,
+ const std::string &key) final;
+
+ omap_rm_key_range_ret omap_rm_key_range(
+ omap_root_t &omap_root,
+ Transaction &t,
+ const std::string &first,
+ const std::string &last,
+ omap_list_config_t config) final;
+
+ omap_list_ret omap_list(
+ const omap_root_t &omap_root,
+ Transaction &t,
+ const std::optional<std::string> &first,
+ const std::optional<std::string> &last,
+ omap_list_config_t config = omap_list_config_t()) final;
+
+ omap_clear_ret omap_clear(
+ omap_root_t &omap_root,
+ Transaction &t) final;
+
+};
+using BtreeOMapManagerRef = std::unique_ptr<BtreeOMapManager>;
+
+}
diff --git a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h
new file mode 100644
index 000000000..795daeddb
--- /dev/null
+++ b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h
@@ -0,0 +1,122 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+#include <string>
+#include <vector>
+
+//#include <boost/iterator/counting_iterator.hpp>
+
+#include "crimson/common/log.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/transaction_manager.h"
+#include "crimson/os/seastore/omap_manager.h"
+#include "crimson/os/seastore/omap_manager/btree/omap_types.h"
+
+namespace crimson::os::seastore::omap_manager{
+
+struct omap_context_t {
+ TransactionManager &tm;
+ Transaction &t;
+ laddr_t hint;
+};
+
+enum class mutation_status_t : uint8_t {
+ SUCCESS = 0,
+ WAS_SPLIT = 1,
+ NEED_MERGE = 2,
+ FAIL = 3
+};
+
+struct OMapNode : LogicalCachedExtent {
+ using base_iertr = OMapManager::base_iertr;
+
+ using OMapNodeRef = TCachedExtentRef<OMapNode>;
+
+ struct mutation_result_t {
+ mutation_status_t status;
+ /// Only populated if WAS_SPLIT, indicates the newly created left and right nodes
+ /// from splitting the target entry during insertion.
+ std::optional<std::tuple<OMapNodeRef, OMapNodeRef, std::string>> split_tuple;
+ /// only sopulated if need merged, indicate which entry need be doing merge in upper layer.
+ std::optional<OMapNodeRef> need_merge;
+
+ mutation_result_t(mutation_status_t s, std::optional<std::tuple<OMapNodeRef,
+ OMapNodeRef, std::string>> tuple, std::optional<OMapNodeRef> n_merge)
+ : status(s),
+ split_tuple(tuple),
+ need_merge(n_merge) {}
+ };
+
+ OMapNode(ceph::bufferptr &&ptr) : LogicalCachedExtent(std::move(ptr)) {}
+ OMapNode(const OMapNode &other)
+ : LogicalCachedExtent(other) {}
+
+ using get_value_iertr = base_iertr;
+ using get_value_ret = OMapManager::omap_get_value_ret;
+ virtual get_value_ret get_value(
+ omap_context_t oc,
+ const std::string &key) = 0;
+
+ using insert_iertr = base_iertr;
+ using insert_ret = insert_iertr::future<mutation_result_t>;
+ virtual insert_ret insert(
+ omap_context_t oc,
+ const std::string &key,
+ const ceph::bufferlist &value) = 0;
+
+ using rm_key_iertr = base_iertr;
+ using rm_key_ret = rm_key_iertr::future<mutation_result_t>;
+ virtual rm_key_ret rm_key(
+ omap_context_t oc,
+ const std::string &key) = 0;
+
+ using omap_list_config_t = OMapManager::omap_list_config_t;
+ using list_iertr = base_iertr;
+ using list_bare_ret = OMapManager::omap_list_bare_ret;
+ using list_ret = OMapManager::omap_list_ret;
+ virtual list_ret list(
+ omap_context_t oc,
+ const std::optional<std::string> &first,
+ const std::optional<std::string> &last,
+ omap_list_config_t config) = 0;
+
+ using clear_iertr = base_iertr;
+ using clear_ret = clear_iertr::future<>;
+ virtual clear_ret clear(omap_context_t oc) = 0;
+
+ using full_merge_iertr = base_iertr;
+ using full_merge_ret = full_merge_iertr::future<OMapNodeRef>;
+ virtual full_merge_ret make_full_merge(
+ omap_context_t oc,
+ OMapNodeRef right) = 0;
+
+ using make_balanced_iertr = base_iertr;
+ using make_balanced_ret = make_balanced_iertr::future
+ <std::tuple<OMapNodeRef, OMapNodeRef, std::string>>;
+ virtual make_balanced_ret make_balanced(
+ omap_context_t oc,
+ OMapNodeRef _right) = 0;
+
+ virtual omap_node_meta_t get_node_meta() const = 0;
+ virtual bool extent_will_overflow(
+ size_t ksize,
+ std::optional<size_t> vsize) const = 0;
+ virtual bool can_merge(OMapNodeRef right) const = 0;
+ virtual bool extent_is_below_min() const = 0;
+ virtual uint32_t get_node_size() = 0;
+
+ virtual ~OMapNode() = default;
+};
+
+using OMapNodeRef = OMapNode::OMapNodeRef;
+
+using omap_load_extent_iertr = OMapNode::base_iertr;
+omap_load_extent_iertr::future<OMapNodeRef>
+omap_load_extent(omap_context_t oc, laddr_t laddr, depth_t depth);
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::os::seastore::omap_manager::OMapNode> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc
new file mode 100644
index 000000000..4db58414a
--- /dev/null
+++ b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc
@@ -0,0 +1,738 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <algorithm>
+#include <string.h>
+#include "include/buffer.h"
+#include "include/byteorder.h"
+#include "crimson/os/seastore/transaction_manager.h"
+#include "crimson/os/seastore/omap_manager/btree/omap_btree_node.h"
+#include "crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h"
+#include "seastar/core/thread.hh"
+
+SET_SUBSYS(seastore_omap);
+
+namespace crimson::os::seastore::omap_manager {
+
+std::ostream &operator<<(std::ostream &out, const omap_inner_key_t &rhs)
+{
+ return out << "omap_inner_key (" << rhs.key_off<< " - " << rhs.key_len
+ << " - " << rhs.laddr << ")";
+}
+
+std::ostream &operator<<(std::ostream &out, const omap_leaf_key_t &rhs)
+{
+ return out << "omap_leaf_key_t (" << rhs.key_off<< " - " << rhs.key_len
+ << " - " << rhs.val_len << ")";
+}
+
+std::ostream &OMapInnerNode::print_detail_l(std::ostream &out) const
+{
+ return out << ", size=" << get_size()
+ << ", depth=" << get_meta().depth;
+}
+
+using dec_ref_iertr = OMapInnerNode::base_iertr;
+using dec_ref_ret = dec_ref_iertr::future<>;
+template <typename T>
+dec_ref_ret dec_ref(omap_context_t oc, T&& addr) {
+ return oc.tm.dec_ref(oc.t, std::forward<T>(addr)).handle_error_interruptible(
+ dec_ref_iertr::pass_further{},
+ crimson::ct_error::assert_all{
+ "Invalid error in OMapInnerNode helper dec_ref"
+ }
+ ).si_then([](auto &&e) {});
+}
+
+/**
+ * make_split_insert
+ *
+ * insert an entry at iter, with the address of key.
+ * will result in a split outcome encoded in the returned mutation_result_t
+ */
+OMapInnerNode::make_split_insert_ret
+OMapInnerNode::make_split_insert(
+ omap_context_t oc,
+ internal_iterator_t iter,
+ std::string key,
+ laddr_t laddr)
+{
+ LOG_PREFIX(OMapInnerNode::make_split_insert);
+ DEBUGT("this: {}, key: {}", oc.t, *this, key);
+ return make_split_children(oc).si_then([=] (auto tuple) {
+ auto [left, right, pivot] = tuple;
+ if (pivot > key) {
+ auto liter = left->iter_idx(iter.get_index());
+ left->journal_inner_insert(liter, laddr, key,
+ left->maybe_get_delta_buffer());
+ } else { //right
+ auto riter = right->iter_idx(iter.get_index() - left->get_node_size());
+ right->journal_inner_insert(riter, laddr, key,
+ right->maybe_get_delta_buffer());
+ }
+ ++(oc.t.get_omap_tree_stats().extents_num_delta);
+ return make_split_insert_ret(
+ interruptible::ready_future_marker{},
+ mutation_result_t(mutation_status_t::WAS_SPLIT, tuple, std::nullopt));
+ });
+
+}
+
+
+OMapInnerNode::handle_split_ret
+OMapInnerNode::handle_split(
+ omap_context_t oc,
+ internal_iterator_t iter,
+ mutation_result_t mresult)
+{
+ LOG_PREFIX(OMapInnerNode::handle_split);
+ DEBUGT("this: {}", oc.t, *this);
+ if (!is_mutable()) {
+ auto mut = oc.tm.get_mutable_extent(oc.t, this)->cast<OMapInnerNode>();
+ auto mut_iter = mut->iter_idx(iter.get_index());
+ return mut->handle_split(oc, mut_iter, mresult);
+ }
+ auto [left, right, pivot] = *(mresult.split_tuple);
+ //update operation will not cause node overflow, so we can do it first.
+ journal_inner_update(iter, left->get_laddr(), maybe_get_delta_buffer());
+ bool overflow = extent_will_overflow(pivot.size(), std::nullopt);
+ if (!overflow) {
+ journal_inner_insert(iter + 1, right->get_laddr(), pivot,
+ maybe_get_delta_buffer());
+ return insert_ret(
+ interruptible::ready_future_marker{},
+ mutation_result_t(mutation_status_t::SUCCESS, std::nullopt, std::nullopt));
+ } else {
+ return make_split_insert(oc, iter + 1, pivot, right->get_laddr())
+ .si_then([this, oc] (auto m_result) {
+ return dec_ref(oc, get_laddr())
+ .si_then([m_result = std::move(m_result)] {
+ return insert_ret(
+ interruptible::ready_future_marker{},
+ m_result);
+ });
+ });
+ }
+}
+
+OMapInnerNode::get_value_ret
+OMapInnerNode::get_value(
+ omap_context_t oc,
+ const std::string &key)
+{
+ LOG_PREFIX(OMapInnerNode::get_value);
+ DEBUGT("key = {}, this: {}", oc.t, key, *this);
+ auto child_pt = get_containing_child(key);
+ assert(child_pt != iter_cend());
+ auto laddr = child_pt->get_val();
+ return omap_load_extent(oc, laddr, get_meta().depth - 1).si_then(
+ [oc, &key] (auto extent) {
+ return extent->get_value(oc, key);
+ }).finally([ref = OMapNodeRef(this)] {});
+}
+
+OMapInnerNode::insert_ret
+OMapInnerNode::insert(
+ omap_context_t oc,
+ const std::string &key,
+ const ceph::bufferlist &value)
+{
+ LOG_PREFIX(OMapInnerNode::insert);
+ DEBUGT("{}->{}, this: {}", oc.t, key, value, *this);
+ auto child_pt = get_containing_child(key);
+ assert(child_pt != iter_cend());
+ auto laddr = child_pt->get_val();
+ return omap_load_extent(oc, laddr, get_meta().depth - 1).si_then(
+ [oc, &key, &value] (auto extent) {
+ return extent->insert(oc, key, value);
+ }).si_then([this, oc, child_pt] (auto mresult) {
+ if (mresult.status == mutation_status_t::SUCCESS) {
+ return insert_iertr::make_ready_future<mutation_result_t>(mresult);
+ } else if (mresult.status == mutation_status_t::WAS_SPLIT) {
+ return handle_split(oc, child_pt, mresult);
+ } else {
+ return insert_ret(
+ interruptible::ready_future_marker{},
+ mutation_result_t(mutation_status_t::SUCCESS, std::nullopt, std::nullopt));
+ }
+ });
+}
+
+OMapInnerNode::rm_key_ret
+OMapInnerNode::rm_key(omap_context_t oc, const std::string &key)
+{
+ LOG_PREFIX(OMapInnerNode::rm_key);
+ DEBUGT("key={}, this: {}", oc.t, key, *this);
+ auto child_pt = get_containing_child(key);
+ assert(child_pt != iter_cend());
+ auto laddr = child_pt->get_val();
+ return omap_load_extent(oc, laddr, get_meta().depth - 1).si_then(
+ [this, oc, &key, child_pt] (auto extent) {
+ return extent->rm_key(oc, key)
+ .si_then([this, oc, child_pt, extent = std::move(extent)] (auto mresult) {
+ switch (mresult.status) {
+ case mutation_status_t::SUCCESS:
+ case mutation_status_t::FAIL:
+ return rm_key_iertr::make_ready_future<mutation_result_t>(mresult);
+ case mutation_status_t::NEED_MERGE: {
+ if (get_node_size() >1)
+ return merge_entry(oc, child_pt, *(mresult.need_merge));
+ else
+ return rm_key_ret(
+ interruptible::ready_future_marker{},
+ mutation_result_t(mutation_status_t::SUCCESS,
+ std::nullopt, std::nullopt));
+ }
+ case mutation_status_t::WAS_SPLIT:
+ return handle_split(oc, child_pt, mresult);
+ default:
+ return rm_key_iertr::make_ready_future<mutation_result_t>(mresult);
+ }
+ });
+ });
+}
+
+OMapInnerNode::list_ret
+OMapInnerNode::list(
+ omap_context_t oc,
+ const std::optional<std::string> &first,
+ const std::optional<std::string> &last,
+ omap_list_config_t config)
+{
+ LOG_PREFIX(OMapInnerNode::list);
+ if (first && last) {
+ DEBUGT("first: {}, last: {}, this: {}", oc.t, *first, *last, *this);
+ assert(*first <= *last);
+ } else if (first) {
+ DEBUGT("first: {}, this: {}", oc.t, *first, *this);
+ } else if (last) {
+ DEBUGT("last: {}, this: {}", oc.t, *last, *this);
+ } else {
+ DEBUGT("this: {}", oc.t, *this);
+ }
+
+ auto first_iter = first ?
+ get_containing_child(*first) :
+ iter_cbegin();
+ auto last_iter = last ?
+ get_containing_child(*last) + 1:
+ iter_cend();
+ assert(first_iter != iter_cend());
+
+ return seastar::do_with(
+ first_iter,
+ last_iter,
+ iter_t(first_iter),
+ list_bare_ret(false, {}),
+ [this, &first, &last, oc, config](
+ auto &fiter,
+ auto &liter,
+ auto &iter,
+ auto &ret)
+ {
+ auto &complete = std::get<0>(ret);
+ auto &result = std::get<1>(ret);
+ return trans_intr::repeat(
+ [&, config, oc, this]() -> list_iertr::future<seastar::stop_iteration>
+ {
+ if (iter == liter) {
+ complete = true;
+ return list_iertr::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::yes);
+ }
+ assert(result.size() < config.max_result_size);
+ auto laddr = iter->get_val();
+ return omap_load_extent(
+ oc, laddr,
+ get_meta().depth - 1
+ ).si_then([&, config, oc](auto &&extent) {
+ return seastar::do_with(
+ iter == fiter ? first : std::optional<std::string>(std::nullopt),
+ iter == liter - 1 ? last : std::optional<std::string>(std::nullopt),
+ [&result, extent = std::move(extent), config, oc](
+ auto &nfirst,
+ auto &nlast) {
+ return extent->list(
+ oc,
+ nfirst,
+ nlast,
+ config.with_reduced_max(result.size()));
+ }).si_then([&, config](auto &&child_ret) mutable {
+ boost::ignore_unused(config); // avoid clang warning;
+ auto &[child_complete, child_result] = child_ret;
+ if (result.size() && child_result.size()) {
+ assert(child_result.begin()->first > result.rbegin()->first);
+ }
+ if (child_result.size() && first && iter == fiter) {
+ if (config.first_inclusive) {
+ assert(child_result.begin()->first >= *first);
+ } else {
+ assert(child_result.begin()->first > *first);
+ }
+ }
+ if (child_result.size() && last && iter == liter - 1) {
+ auto biter = --(child_result.end());
+ if (config.last_inclusive) {
+ assert(biter->first <= *last);
+ } else {
+ assert(biter->first < *last);
+ }
+ }
+ result.merge(std::move(child_result));
+ if (result.size() == config.max_result_size) {
+ return list_iertr::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::yes);
+ }
+ ++iter;
+ assert(child_complete);
+ return list_iertr::make_ready_future<seastar::stop_iteration>(
+ seastar::stop_iteration::no);
+ });
+ });
+ }).si_then([&ret, ref = OMapNodeRef(this)] {
+ return list_iertr::make_ready_future<list_bare_ret>(std::move(ret));
+ });
+ });
+}
+
+OMapInnerNode::clear_ret
+OMapInnerNode::clear(omap_context_t oc)
+{
+ LOG_PREFIX(OMapInnerNode::clear);
+ DEBUGT("this: {}", oc.t, *this);
+ return trans_intr::do_for_each(iter_begin(), iter_end(),
+ [oc, this](auto iter) {
+ auto laddr = iter->get_val();
+ auto ndepth = get_meta().depth - 1;
+ if (ndepth > 1) {
+ return omap_load_extent(oc, laddr, ndepth
+ ).si_then([oc](auto &&extent) {
+ return extent->clear(oc);
+ }).si_then([oc, laddr] {
+ return dec_ref(oc, laddr);
+ }).si_then([ref = OMapNodeRef(this)] {
+ return clear_iertr::now();
+ });
+ } else {
+ assert(ndepth == 1);
+ return dec_ref(oc, laddr
+ ).si_then([ref = OMapNodeRef(this)] {
+ return clear_iertr::now();
+ });
+ }
+ });
+}
+
+OMapInnerNode::split_children_ret
+OMapInnerNode:: make_split_children(omap_context_t oc)
+{
+ LOG_PREFIX(OMapInnerNode::make_split_children);
+ DEBUGT("this: {}", oc.t, *this);
+ return oc.tm.alloc_extents<OMapInnerNode>(oc.t, oc.hint,
+ OMAP_INNER_BLOCK_SIZE, 2)
+ .si_then([this, oc] (auto &&ext_pair) {
+ LOG_PREFIX(OMapInnerNode::make_split_children);
+ auto left = ext_pair.front();
+ auto right = ext_pair.back();
+ DEBUGT("this: {}, split into: l {} r {}", oc.t, *this, *left, *right);
+ return split_children_ret(
+ interruptible::ready_future_marker{},
+ std::make_tuple(left, right, split_into(*left, *right)));
+ });
+}
+
+OMapInnerNode::full_merge_ret
+OMapInnerNode::make_full_merge(omap_context_t oc, OMapNodeRef right)
+{
+ LOG_PREFIX(OMapInnerNode::make_full_merge);
+ DEBUGT("", oc.t);
+ return oc.tm.alloc_extent<OMapInnerNode>(oc.t, oc.hint,
+ OMAP_INNER_BLOCK_SIZE)
+ .si_then([this, right] (auto &&replacement) {
+ replacement->merge_from(*this, *right->cast<OMapInnerNode>());
+ return full_merge_ret(
+ interruptible::ready_future_marker{},
+ std::move(replacement));
+ });
+}
+
+OMapInnerNode::make_balanced_ret
+OMapInnerNode::make_balanced(omap_context_t oc, OMapNodeRef _right)
+{
+ LOG_PREFIX(OMapInnerNode::make_balanced);
+ DEBUGT("l: {}, r: {}", oc.t, *this, *_right);
+ ceph_assert(_right->get_type() == TYPE);
+ return oc.tm.alloc_extents<OMapInnerNode>(oc.t, oc.hint,
+ OMAP_INNER_BLOCK_SIZE, 2)
+ .si_then([this, _right] (auto &&replacement_pair){
+ auto replacement_left = replacement_pair.front();
+ auto replacement_right = replacement_pair.back();
+ auto &right = *_right->cast<OMapInnerNode>();
+ return make_balanced_ret(
+ interruptible::ready_future_marker{},
+ std::make_tuple(replacement_left, replacement_right,
+ balance_into_new_nodes(*this, right,
+ *replacement_left, *replacement_right)));
+ });
+}
+
+OMapInnerNode::merge_entry_ret
+OMapInnerNode::merge_entry(
+ omap_context_t oc,
+ internal_iterator_t iter,
+ OMapNodeRef entry)
+{
+ LOG_PREFIX(OMapInnerNode::merge_entry);
+ DEBUGT("{}, parent: {}", oc.t, *entry, *this);
+ if (!is_mutable()) {
+ auto mut = oc.tm.get_mutable_extent(oc.t, this)->cast<OMapInnerNode>();
+ auto mut_iter = mut->iter_idx(iter->get_index());
+ return mut->merge_entry(oc, mut_iter, entry);
+ }
+ auto is_left = (iter + 1) == iter_cend();
+ auto donor_iter = is_left ? iter - 1 : iter + 1;
+ return omap_load_extent(oc, donor_iter->get_val(), get_meta().depth - 1
+ ).si_then([=, this](auto &&donor) mutable {
+ LOG_PREFIX(OMapInnerNode::merge_entry);
+ auto [l, r] = is_left ?
+ std::make_pair(donor, entry) : std::make_pair(entry, donor);
+ auto [liter, riter] = is_left ?
+ std::make_pair(donor_iter, iter) : std::make_pair(iter, donor_iter);
+ if (l->can_merge(r)) {
+ DEBUGT("make_full_merge l {} r {}", oc.t, *l, *r);
+ assert(entry->extent_is_below_min());
+ return l->make_full_merge(oc, r
+ ).si_then([liter=liter, riter=riter, l=l, r=r, oc, this]
+ (auto &&replacement) {
+ LOG_PREFIX(OMapInnerNode::merge_entry);
+ DEBUGT("to update parent: {}", oc.t, *this);
+ journal_inner_update(
+ liter,
+ replacement->get_laddr(),
+ maybe_get_delta_buffer());
+ journal_inner_remove(riter, maybe_get_delta_buffer());
+ //retire extent
+ std::vector<laddr_t> dec_laddrs {l->get_laddr(), r->get_laddr()};
+ return dec_ref(oc, dec_laddrs
+ ).si_then([this, oc] {
+ --(oc.t.get_omap_tree_stats().extents_num_delta);
+ if (extent_is_below_min()) {
+ return merge_entry_ret(
+ interruptible::ready_future_marker{},
+ mutation_result_t(mutation_status_t::NEED_MERGE,
+ std::nullopt, this));
+ } else {
+ return merge_entry_ret(
+ interruptible::ready_future_marker{},
+ mutation_result_t(mutation_status_t::SUCCESS,
+ std::nullopt, std::nullopt));
+ }
+ });
+ });
+ } else {
+ DEBUGT("balanced l {} r {}", oc.t, *l, *r);
+ return l->make_balanced(oc, r
+ ).si_then([liter=liter, riter=riter, l=l, r=r, oc, this](auto tuple) {
+ LOG_PREFIX(OMapInnerNode::merge_entry);
+ DEBUGT("to update parent: {}", oc.t, *this);
+ auto [replacement_l, replacement_r, replacement_pivot] = tuple;
+ //update operation will not cuase node overflow, so we can do it first
+ journal_inner_update(
+ liter,
+ replacement_l->get_laddr(),
+ maybe_get_delta_buffer());
+ bool overflow = extent_will_overflow(replacement_pivot.size(),
+ std::nullopt);
+ if (!overflow) {
+ journal_inner_remove(riter, maybe_get_delta_buffer());
+ journal_inner_insert(
+ riter,
+ replacement_r->get_laddr(),
+ replacement_pivot,
+ maybe_get_delta_buffer());
+ std::vector<laddr_t> dec_laddrs{l->get_laddr(), r->get_laddr()};
+ return dec_ref(oc, dec_laddrs
+ ).si_then([] {
+ return merge_entry_ret(
+ interruptible::ready_future_marker{},
+ mutation_result_t(mutation_status_t::SUCCESS,
+ std::nullopt, std::nullopt));
+ });
+ } else {
+ DEBUGT("balanced and split {} r {}", oc.t, *l, *r);
+ //use remove and insert to instead of replace,
+ //remove operation will not cause node split, so we can do it first
+ journal_inner_remove(riter, maybe_get_delta_buffer());
+ return make_split_insert(oc, riter, replacement_pivot,
+ replacement_r->get_laddr()
+ ).si_then([this, oc, l = l, r = r](auto mresult) {
+ std::vector<laddr_t> dec_laddrs{
+ l->get_laddr(),
+ r->get_laddr(),
+ get_laddr()};
+ return dec_ref(oc, dec_laddrs
+ ).si_then([mresult = std::move(mresult)] {
+ return merge_entry_ret(
+ interruptible::ready_future_marker{}, mresult);
+ });
+ });
+ }
+ });
+ }
+ });
+
+}
+
+OMapInnerNode::internal_iterator_t
+OMapInnerNode::get_containing_child(const std::string &key)
+{
+ auto iter = std::find_if(iter_begin(), iter_end(),
+ [&key](auto it) { return it.contains(key); });
+ return iter;
+}
+
+std::ostream &OMapLeafNode::print_detail_l(std::ostream &out) const
+{
+ return out << ", size=" << get_size()
+ << ", depth=" << get_meta().depth;
+}
+
+OMapLeafNode::get_value_ret
+OMapLeafNode::get_value(omap_context_t oc, const std::string &key)
+{
+ LOG_PREFIX(OMapLeafNode::get_value);
+ DEBUGT("key = {}, this: {}", oc.t, *this, key);
+ auto ite = find_string_key(key);
+ if (ite != iter_end()) {
+ auto value = ite->get_val();
+ return get_value_ret(
+ interruptible::ready_future_marker{},
+ value);
+ } else {
+ return get_value_ret(
+ interruptible::ready_future_marker{},
+ std::nullopt);
+ }
+}
+
+OMapLeafNode::insert_ret
+OMapLeafNode::insert(
+ omap_context_t oc,
+ const std::string &key,
+ const ceph::bufferlist &value)
+{
+ LOG_PREFIX(OMapLeafNode::insert);
+ DEBUGT("{} -> {}, this: {}", oc.t, key, value, *this);
+ bool overflow = extent_will_overflow(key.size(), value.length());
+ if (!overflow) {
+ if (!is_mutable()) {
+ auto mut = oc.tm.get_mutable_extent(oc.t, this)->cast<OMapLeafNode>();
+ return mut->insert(oc, key, value);
+ }
+ auto replace_pt = find_string_key(key);
+ if (replace_pt != iter_end()) {
+ ++(oc.t.get_omap_tree_stats().num_updates);
+ journal_leaf_update(replace_pt, key, value, maybe_get_delta_buffer());
+ } else {
+ ++(oc.t.get_omap_tree_stats().num_inserts);
+ auto insert_pt = string_lower_bound(key);
+ journal_leaf_insert(insert_pt, key, value, maybe_get_delta_buffer());
+
+ DEBUGT("inserted {}, this: {}", oc.t, insert_pt.get_key(), *this);
+ }
+ return insert_ret(
+ interruptible::ready_future_marker{},
+ mutation_result_t(mutation_status_t::SUCCESS, std::nullopt, std::nullopt));
+ } else {
+ return make_split_children(oc).si_then([this, oc, &key, &value] (auto tuple) {
+ auto [left, right, pivot] = tuple;
+ auto replace_pt = find_string_key(key);
+ if (replace_pt != iter_end()) {
+ ++(oc.t.get_omap_tree_stats().num_updates);
+ if (key < pivot) { //left
+ auto mut_iter = left->iter_idx(replace_pt->get_index());
+ left->journal_leaf_update(mut_iter, key, value, left->maybe_get_delta_buffer());
+ } else if (key >= pivot) { //right
+ auto mut_iter = right->iter_idx(replace_pt->get_index() - left->get_node_size());
+ right->journal_leaf_update(mut_iter, key, value, right->maybe_get_delta_buffer());
+ }
+ } else {
+ ++(oc.t.get_omap_tree_stats().num_inserts);
+ auto insert_pt = string_lower_bound(key);
+ if (key < pivot) { //left
+ auto mut_iter = left->iter_idx(insert_pt->get_index());
+ left->journal_leaf_insert(mut_iter, key, value, left->maybe_get_delta_buffer());
+ } else {
+ auto mut_iter = right->iter_idx(insert_pt->get_index() - left->get_node_size());
+ right->journal_leaf_insert(mut_iter, key, value, right->maybe_get_delta_buffer());
+ }
+ }
+ ++(oc.t.get_omap_tree_stats().extents_num_delta);
+ return dec_ref(oc, get_laddr())
+ .si_then([tuple = std::move(tuple)] {
+ return insert_ret(
+ interruptible::ready_future_marker{},
+ mutation_result_t(mutation_status_t::WAS_SPLIT, tuple, std::nullopt));
+ });
+ });
+ }
+}
+
+OMapLeafNode::rm_key_ret
+OMapLeafNode::rm_key(omap_context_t oc, const std::string &key)
+{
+ LOG_PREFIX(OMapLeafNode::rm_key);
+ DEBUGT("{}, this: {}", oc.t, key, *this);
+ auto rm_pt = find_string_key(key);
+ if (!is_mutable() && rm_pt != iter_end()) {
+ auto mut = oc.tm.get_mutable_extent(oc.t, this)->cast<OMapLeafNode>();
+ return mut->rm_key(oc, key);
+ }
+
+ if (rm_pt != iter_end()) {
+ ++(oc.t.get_omap_tree_stats().num_erases);
+ journal_leaf_remove(rm_pt, maybe_get_delta_buffer());
+ if (extent_is_below_min()) {
+ return rm_key_ret(
+ interruptible::ready_future_marker{},
+ mutation_result_t(mutation_status_t::NEED_MERGE, std::nullopt,
+ this->cast<OMapNode>()));
+ } else {
+ return rm_key_ret(
+ interruptible::ready_future_marker{},
+ mutation_result_t(mutation_status_t::SUCCESS, std::nullopt, std::nullopt));
+ }
+ } else {
+ return rm_key_ret(
+ interruptible::ready_future_marker{},
+ mutation_result_t(mutation_status_t::FAIL, std::nullopt, std::nullopt));
+ }
+
+}
+
+OMapLeafNode::list_ret
+OMapLeafNode::list(
+ omap_context_t oc,
+ const std::optional<std::string> &first,
+ const std::optional<std::string> &last,
+ omap_list_config_t config)
+{
+ LOG_PREFIX(OMapLeafNode::list);
+ DEBUGT(
+ "first {} last {} max_result_size {} first_inclusive {} \
+ last_inclusive {}, this: {}",
+ oc.t,
+ first ? first->c_str() : "",
+ last ? last->c_str() : "",
+ config.max_result_size,
+ config.first_inclusive,
+ config.last_inclusive,
+ *this
+ );
+ auto ret = list_bare_ret(false, {});
+ auto &[complete, result] = ret;
+ auto iter = first ?
+ (config.first_inclusive ?
+ string_lower_bound(*first) :
+ string_upper_bound(*first)) :
+ iter_begin();
+ auto liter = last ?
+ (config.last_inclusive ?
+ string_upper_bound(*last) :
+ string_lower_bound(*last)) :
+ iter_end();
+
+ for (; iter != liter && result.size() < config.max_result_size; iter++) {
+ result.emplace(std::make_pair(iter->get_key(), iter->get_val()));
+ }
+
+ complete = (iter == liter);
+
+ return list_iertr::make_ready_future<list_bare_ret>(
+ std::move(ret));
+}
+
+OMapLeafNode::clear_ret
+OMapLeafNode::clear(omap_context_t oc)
+{
+ return clear_iertr::now();
+}
+
+OMapLeafNode::split_children_ret
+OMapLeafNode::make_split_children(omap_context_t oc)
+{
+ LOG_PREFIX(OMapLeafNode::make_split_children);
+ DEBUGT("this: {}", oc.t, *this);
+ return oc.tm.alloc_extents<OMapLeafNode>(oc.t, oc.hint, OMAP_LEAF_BLOCK_SIZE, 2)
+ .si_then([this] (auto &&ext_pair) {
+ auto left = ext_pair.front();
+ auto right = ext_pair.back();
+ return split_children_ret(
+ interruptible::ready_future_marker{},
+ std::make_tuple(left, right, split_into(*left, *right)));
+ });
+}
+
+OMapLeafNode::full_merge_ret
+OMapLeafNode::make_full_merge(omap_context_t oc, OMapNodeRef right)
+{
+ ceph_assert(right->get_type() == TYPE);
+ LOG_PREFIX(OMapLeafNode::make_full_merge);
+ DEBUGT("this: {}", oc.t, *this);
+ return oc.tm.alloc_extent<OMapLeafNode>(oc.t, oc.hint, OMAP_LEAF_BLOCK_SIZE)
+ .si_then([this, right] (auto &&replacement) {
+ replacement->merge_from(*this, *right->cast<OMapLeafNode>());
+ return full_merge_ret(
+ interruptible::ready_future_marker{},
+ std::move(replacement));
+ });
+}
+
+OMapLeafNode::make_balanced_ret
+OMapLeafNode::make_balanced(omap_context_t oc, OMapNodeRef _right)
+{
+ ceph_assert(_right->get_type() == TYPE);
+ LOG_PREFIX(OMapLeafNode::make_balanced);
+ DEBUGT("this: {}", oc.t, *this);
+ return oc.tm.alloc_extents<OMapLeafNode>(oc.t, oc.hint, OMAP_LEAF_BLOCK_SIZE, 2)
+ .si_then([this, _right] (auto &&replacement_pair) {
+ auto replacement_left = replacement_pair.front();
+ auto replacement_right = replacement_pair.back();
+ auto &right = *_right->cast<OMapLeafNode>();
+ return make_balanced_ret(
+ interruptible::ready_future_marker{},
+ std::make_tuple(
+ replacement_left, replacement_right,
+ balance_into_new_nodes(
+ *this, right,
+ *replacement_left, *replacement_right)));
+ });
+}
+
+
+omap_load_extent_iertr::future<OMapNodeRef>
+omap_load_extent(omap_context_t oc, laddr_t laddr, depth_t depth)
+{
+ ceph_assert(depth > 0);
+ if (depth > 1) {
+ return oc.tm.read_extent<OMapInnerNode>(oc.t, laddr,
+ OMAP_INNER_BLOCK_SIZE)
+ .handle_error_interruptible(
+ omap_load_extent_iertr::pass_further{},
+ crimson::ct_error::assert_all{ "Invalid error in omap_load_extent" }
+ ).si_then(
+ [](auto&& e) {
+ return seastar::make_ready_future<OMapNodeRef>(std::move(e));
+ });
+ } else {
+ return oc.tm.read_extent<OMapLeafNode>(oc.t, laddr, OMAP_LEAF_BLOCK_SIZE
+ ).handle_error_interruptible(
+ omap_load_extent_iertr::pass_further{},
+ crimson::ct_error::assert_all{ "Invalid error in omap_load_extent" }
+ ).si_then(
+ [](auto&& e) {
+ return seastar::make_ready_future<OMapNodeRef>(std::move(e));
+ });
+ }
+}
+}
diff --git a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h
new file mode 100644
index 000000000..a2b51bbb0
--- /dev/null
+++ b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h
@@ -0,0 +1,250 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <string.h>
+
+#include "include/buffer.h"
+
+#include "crimson/common/errorator.h"
+#include "crimson/os/seastore/omap_manager.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h"
+#include "crimson/os/seastore/omap_manager/btree/omap_types.h"
+#include "crimson/os/seastore/omap_manager/btree/omap_btree_node.h"
+
+namespace crimson::os::seastore::omap_manager {
+
+/**
+ * OMapInnerNode
+ *
+ * Abstracts operations on and layout of internal nodes for the
+ * omap Tree.
+ *
+ * Layout (4k):
+ * num_entries: meta : keys : values :
+ */
+
+struct OMapInnerNode
+ : OMapNode,
+ StringKVInnerNodeLayout {
+ using OMapInnerNodeRef = TCachedExtentRef<OMapInnerNode>;
+ using internal_iterator_t = const_iterator;
+ template <typename... T>
+ OMapInnerNode(T&&... t) :
+ OMapNode(std::forward<T>(t)...),
+ StringKVInnerNodeLayout(get_bptr().c_str()) {}
+
+ omap_node_meta_t get_node_meta() const final { return get_meta(); }
+ bool extent_will_overflow(size_t ksize, std::optional<size_t> vsize) const {
+ return is_overflow(ksize);
+ }
+ bool can_merge(OMapNodeRef right) const {
+ return !is_overflow(*right->cast<OMapInnerNode>());
+ }
+ bool extent_is_below_min() const { return below_min(); }
+ uint32_t get_node_size() { return get_size(); }
+
+ CachedExtentRef duplicate_for_write(Transaction&) final {
+ assert(delta_buffer.empty());
+ return CachedExtentRef(new OMapInnerNode(*this));
+ }
+
+ delta_inner_buffer_t delta_buffer;
+ delta_inner_buffer_t *maybe_get_delta_buffer() {
+ return is_mutation_pending() ? &delta_buffer : nullptr;
+ }
+
+ get_value_ret get_value(omap_context_t oc, const std::string &key) final;
+
+ insert_ret insert(
+ omap_context_t oc,
+ const std::string &key,
+ const ceph::bufferlist &value) final;
+
+ rm_key_ret rm_key(
+ omap_context_t oc,
+ const std::string &key) final;
+
+ list_ret list(
+ omap_context_t oc,
+ const std::optional<std::string> &first,
+ const std::optional<std::string> &last,
+ omap_list_config_t config) final;
+
+ clear_ret clear(omap_context_t oc) final;
+
+ using split_children_iertr = base_iertr;
+ using split_children_ret = split_children_iertr::future
+ <std::tuple<OMapInnerNodeRef, OMapInnerNodeRef, std::string>>;
+ split_children_ret make_split_children(omap_context_t oc);
+
+ full_merge_ret make_full_merge(
+ omap_context_t oc, OMapNodeRef right) final;
+
+ make_balanced_ret make_balanced(
+ omap_context_t oc, OMapNodeRef right) final;
+
+ using make_split_insert_iertr = base_iertr;
+ using make_split_insert_ret = make_split_insert_iertr::future<mutation_result_t>;
+ make_split_insert_ret make_split_insert(
+ omap_context_t oc, internal_iterator_t iter,
+ std::string key, laddr_t laddr);
+
+ using merge_entry_iertr = base_iertr;
+ using merge_entry_ret = merge_entry_iertr::future<mutation_result_t>;
+ merge_entry_ret merge_entry(
+ omap_context_t oc,
+ internal_iterator_t iter, OMapNodeRef entry);
+
+ using handle_split_iertr = base_iertr;
+ using handle_split_ret = handle_split_iertr::future<mutation_result_t>;
+ handle_split_ret handle_split(
+ omap_context_t oc, internal_iterator_t iter,
+ mutation_result_t mresult);
+
+ std::ostream &print_detail_l(std::ostream &out) const final;
+
+ static constexpr extent_types_t TYPE = extent_types_t::OMAP_INNER;
+ extent_types_t get_type() const final {
+ return TYPE;
+ }
+
+ ceph::bufferlist get_delta() final {
+ ceph::bufferlist bl;
+ if (!delta_buffer.empty()) {
+ encode(delta_buffer, bl);
+ delta_buffer.clear();
+ }
+ return bl;
+ }
+
+ void apply_delta(const ceph::bufferlist &bl) final {
+ assert(bl.length());
+ delta_inner_buffer_t buffer;
+ auto bptr = bl.cbegin();
+ decode(buffer, bptr);
+ buffer.replay(*this);
+ }
+
+ internal_iterator_t get_containing_child(const std::string &key);
+};
+using OMapInnerNodeRef = OMapInnerNode::OMapInnerNodeRef;
+
+/**
+ * OMapLeafNode
+ *
+ * Abstracts operations on and layout of leaf nodes for the
+ * OMap Tree.
+ *
+ * Layout (4k):
+ * num_entries: meta : keys : values :
+ */
+
+struct OMapLeafNode
+ : OMapNode,
+ StringKVLeafNodeLayout {
+
+ using OMapLeafNodeRef = TCachedExtentRef<OMapLeafNode>;
+ using internal_iterator_t = const_iterator;
+ template <typename... T>
+ OMapLeafNode(T&&... t) :
+ OMapNode(std::forward<T>(t)...),
+ StringKVLeafNodeLayout(get_bptr().c_str()) {}
+
+ omap_node_meta_t get_node_meta() const final { return get_meta(); }
+ bool extent_will_overflow(
+ size_t ksize, std::optional<size_t> vsize) const {
+ return is_overflow(ksize, *vsize);
+ }
+ bool can_merge(OMapNodeRef right) const {
+ return !is_overflow(*right->cast<OMapLeafNode>());
+ }
+ bool extent_is_below_min() const { return below_min(); }
+ uint32_t get_node_size() { return get_size(); }
+
+ CachedExtentRef duplicate_for_write(Transaction&) final {
+ assert(delta_buffer.empty());
+ return CachedExtentRef(new OMapLeafNode(*this));
+ }
+
+ delta_leaf_buffer_t delta_buffer;
+ delta_leaf_buffer_t *maybe_get_delta_buffer() {
+ return is_mutation_pending() ? &delta_buffer : nullptr;
+ }
+
+ get_value_ret get_value(
+ omap_context_t oc, const std::string &key) final;
+
+ insert_ret insert(
+ omap_context_t oc,
+ const std::string &key,
+ const ceph::bufferlist &value) final;
+
+ rm_key_ret rm_key(
+ omap_context_t oc, const std::string &key) final;
+
+ list_ret list(
+ omap_context_t oc,
+ const std::optional<std::string> &first,
+ const std::optional<std::string> &last,
+ omap_list_config_t config) final;
+
+ clear_ret clear(
+ omap_context_t oc) final;
+
+ using split_children_iertr = base_iertr;
+ using split_children_ret = split_children_iertr::future
+ <std::tuple<OMapLeafNodeRef, OMapLeafNodeRef, std::string>>;
+ split_children_ret make_split_children(
+ omap_context_t oc);
+
+ full_merge_ret make_full_merge(
+ omap_context_t oc,
+ OMapNodeRef right) final;
+
+ make_balanced_ret make_balanced(
+ omap_context_t oc,
+ OMapNodeRef _right) final;
+
+ static constexpr extent_types_t TYPE = extent_types_t::OMAP_LEAF;
+ extent_types_t get_type() const final {
+ return TYPE;
+ }
+
+ ceph::bufferlist get_delta() final {
+ ceph::bufferlist bl;
+ if (!delta_buffer.empty()) {
+ encode(delta_buffer, bl);
+ delta_buffer.clear();
+ }
+ return bl;
+ }
+
+ void apply_delta(const ceph::bufferlist &_bl) final {
+ assert(_bl.length());
+ ceph::bufferlist bl = _bl;
+ bl.rebuild();
+ delta_leaf_buffer_t buffer;
+ auto bptr = bl.cbegin();
+ decode(buffer, bptr);
+ buffer.replay(*this);
+ }
+
+ std::ostream &print_detail_l(std::ostream &out) const final;
+
+ std::pair<internal_iterator_t, internal_iterator_t>
+ get_leaf_entries(std::string &key);
+
+};
+using OMapLeafNodeRef = OMapLeafNode::OMapLeafNodeRef;
+
+std::ostream &operator<<(std::ostream &out, const omap_inner_key_t &rhs);
+std::ostream &operator<<(std::ostream &out, const omap_leaf_key_t &rhs);
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::os::seastore::omap_manager::OMapInnerNode> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::omap_manager::OMapLeafNode> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/os/seastore/omap_manager/btree/omap_types.h b/src/crimson/os/seastore/omap_manager/btree/omap_types.h
new file mode 100644
index 000000000..9e0d10e03
--- /dev/null
+++ b/src/crimson/os/seastore/omap_manager/btree/omap_types.h
@@ -0,0 +1,157 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+#include "crimson/os/seastore/seastore_types.h"
+
+namespace crimson::os::seastore::omap_manager {
+
+struct omap_node_meta_t {
+ depth_t depth = 0;
+
+ std::pair<omap_node_meta_t, omap_node_meta_t> split_into() const {
+ return std::make_pair(
+ omap_node_meta_t{depth},
+ omap_node_meta_t{depth});
+ }
+
+ static omap_node_meta_t merge_from(
+ const omap_node_meta_t &lhs, const omap_node_meta_t &rhs) {
+ assert(lhs.depth == rhs.depth);
+ return omap_node_meta_t{lhs.depth};
+ }
+
+ static std::pair<omap_node_meta_t, omap_node_meta_t>
+ rebalance(const omap_node_meta_t &lhs, const omap_node_meta_t &rhs) {
+ assert(lhs.depth == rhs.depth);
+ return std::make_pair(
+ omap_node_meta_t{lhs.depth},
+ omap_node_meta_t{lhs.depth});
+ }
+};
+
+struct omap_node_meta_le_t {
+ depth_le_t depth = init_depth_le(0);
+
+ omap_node_meta_le_t() = default;
+ omap_node_meta_le_t(const omap_node_meta_le_t &) = default;
+ explicit omap_node_meta_le_t(const omap_node_meta_t &val)
+ : depth(init_depth_le(val.depth)) {}
+
+ operator omap_node_meta_t() const {
+ return omap_node_meta_t{ depth };
+ }
+};
+
+struct omap_inner_key_t {
+ uint16_t key_off = 0;
+ uint16_t key_len = 0;
+ laddr_t laddr = 0;
+
+ omap_inner_key_t() = default;
+ omap_inner_key_t(uint16_t off, uint16_t len, laddr_t addr)
+ : key_off(off), key_len(len), laddr(addr) {}
+
+ inline bool operator==(const omap_inner_key_t b) const {
+ return key_off == b.key_off && key_len == b.key_len && laddr == b.laddr;
+ }
+ inline bool operator!=(const omap_inner_key_t b) const {
+ return key_off != b.key_off || key_len != b.key_len || laddr != b.laddr;
+ }
+ DENC(omap_inner_key_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.key_off, p);
+ denc(v.key_len, p);
+ denc(v.laddr, p);
+ DENC_FINISH(p);
+ }
+};
+
+struct omap_inner_key_le_t {
+ ceph_le16 key_off{0};
+ ceph_le16 key_len{0};
+ laddr_le_t laddr{0};
+
+ omap_inner_key_le_t() = default;
+ omap_inner_key_le_t(const omap_inner_key_le_t &) = default;
+ explicit omap_inner_key_le_t(const omap_inner_key_t &key)
+ : key_off(key.key_off),
+ key_len(key.key_len),
+ laddr(key.laddr) {}
+
+ operator omap_inner_key_t() const {
+ return omap_inner_key_t{uint16_t(key_off), uint16_t(key_len), laddr_t(laddr)};
+ }
+
+ omap_inner_key_le_t& operator=(omap_inner_key_t key) {
+ key_off = key.key_off;
+ key_len = key.key_len;
+ laddr = laddr_le_t(key.laddr);
+ return *this;
+ }
+
+ inline bool operator==(const omap_inner_key_le_t b) const {
+ return key_off == b.key_off && key_len == b.key_len && laddr == b.laddr;
+ }
+};
+
+struct omap_leaf_key_t {
+ uint16_t key_off = 0;
+ uint16_t key_len = 0;
+ uint16_t val_len = 0;
+
+ omap_leaf_key_t() = default;
+ omap_leaf_key_t(uint16_t k_off, uint16_t k_len, uint16_t v_len)
+ : key_off(k_off), key_len(k_len), val_len(v_len) {}
+
+ inline bool operator==(const omap_leaf_key_t b) const {
+ return key_off == b.key_off && key_len == b.key_len &&
+ val_len == b.val_len;
+ }
+ inline bool operator!=(const omap_leaf_key_t b) const {
+ return key_off != b.key_off || key_len != b.key_len ||
+ val_len != b.val_len;
+ }
+
+ DENC(omap_leaf_key_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.key_off, p);
+ denc(v.key_len, p);
+ denc(v.val_len, p);
+ DENC_FINISH(p);
+ }
+};
+
+struct omap_leaf_key_le_t {
+ ceph_le16 key_off{0};
+ ceph_le16 key_len{0};
+ ceph_le16 val_len{0};
+
+ omap_leaf_key_le_t() = default;
+ omap_leaf_key_le_t(const omap_leaf_key_le_t &) = default;
+ explicit omap_leaf_key_le_t(const omap_leaf_key_t &key)
+ : key_off(key.key_off),
+ key_len(key.key_len),
+ val_len(key.val_len) {}
+
+ operator omap_leaf_key_t() const {
+ return omap_leaf_key_t{uint16_t(key_off), uint16_t(key_len),
+ uint16_t(val_len)};
+ }
+
+ omap_leaf_key_le_t& operator=(omap_leaf_key_t key) {
+ key_off = key.key_off;
+ key_len = key.key_len;
+ val_len = key.val_len;
+ return *this;
+ }
+
+ inline bool operator==(const omap_leaf_key_le_t b) const {
+ return key_off == b.key_off && key_len == b.key_len &&
+ val_len == b.val_len;
+ }
+};
+
+}
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::omap_manager::omap_inner_key_t)
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::omap_manager::omap_leaf_key_t)
diff --git a/src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h b/src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h
new file mode 100644
index 000000000..72b13fedf
--- /dev/null
+++ b/src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h
@@ -0,0 +1,1550 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+#include <string>
+
+#include "include/byteorder.h"
+#include "include/denc.h"
+#include "include/encoding.h"
+
+#include "crimson/common/layout.h"
+#include "crimson/common/fixed_kv_node_layout.h"
+#include "crimson/os/seastore/omap_manager.h"
+#include "crimson/os/seastore/omap_manager/btree/omap_types.h"
+
+namespace crimson::os::seastore::omap_manager {
+class StringKVInnerNodeLayout;
+class StringKVLeafNodeLayout;
+
+/**
+ * copy_from_foreign
+ *
+ * Copy from another node entries to this node.
+ * [from_src, to_src) is another node entry range.
+ * tgt is this node entry to copy to.
+ * tgt and from_src must be from different nodes.
+ * from_src and to_src must be in the same node.
+ */
+template <typename iterator, typename const_iterator>
+static void copy_from_foreign(
+ iterator tgt,
+ const_iterator from_src,
+ const_iterator to_src) {
+ assert(tgt->node != from_src->node);
+ assert(to_src->node == from_src->node);
+ if (from_src == to_src)
+ return;
+
+ auto to_copy = from_src->get_right_ptr_end() - to_src->get_right_ptr_end();
+ assert(to_copy > 0);
+ memcpy(
+ tgt->get_right_ptr_end() - to_copy,
+ to_src->get_right_ptr_end(),
+ to_copy);
+ memcpy(
+ tgt->get_node_key_ptr(),
+ from_src->get_node_key_ptr(),
+ to_src->get_node_key_ptr() - from_src->get_node_key_ptr());
+
+ auto offset_diff = tgt->get_right_offset_end() - from_src->get_right_offset_end();
+ for (auto i = tgt; i != tgt + (to_src - from_src); ++i) {
+ i->update_offset(offset_diff);
+ }
+}
+
+/**
+ * copy_from_local
+ *
+ * Copies entries from [from_src, to_src) to tgt.
+ * tgt, from_src, and to_src must be from the same node.
+ */
+template <typename iterator>
+static void copy_from_local(
+ unsigned len,
+ iterator tgt,
+ iterator from_src,
+ iterator to_src) {
+ assert(tgt->node == from_src->node);
+ assert(to_src->node == from_src->node);
+
+ auto to_copy = from_src->get_right_ptr_end() - to_src->get_right_ptr_end();
+ assert(to_copy > 0);
+ int adjust_offset = tgt > from_src? -len : len;
+ memmove(to_src->get_right_ptr_end() + adjust_offset,
+ to_src->get_right_ptr_end(),
+ to_copy);
+
+ for ( auto ite = from_src; ite < to_src; ite++) {
+ ite->update_offset(-adjust_offset);
+ }
+ memmove(tgt->get_node_key_ptr(), from_src->get_node_key_ptr(),
+ to_src->get_node_key_ptr() - from_src->get_node_key_ptr());
+}
+
+struct delta_inner_t {
+ enum class op_t : uint_fast8_t {
+ INSERT,
+ UPDATE,
+ REMOVE,
+ } op;
+ std::string key;
+ laddr_t addr;
+
+ DENC(delta_inner_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.op, p);
+ denc(v.key, p);
+ denc(v.addr, p);
+ DENC_FINISH(p);
+ }
+
+ void replay(StringKVInnerNodeLayout &l);
+ bool operator==(const delta_inner_t &rhs) const {
+ return op == rhs.op &&
+ key == rhs.key &&
+ addr == rhs.addr;
+ }
+};
+}
+WRITE_CLASS_DENC(crimson::os::seastore::omap_manager::delta_inner_t)
+
+namespace crimson::os::seastore::omap_manager {
+struct delta_leaf_t {
+ enum class op_t : uint_fast8_t {
+ INSERT,
+ UPDATE,
+ REMOVE,
+ } op;
+ std::string key;
+ ceph::bufferlist val;
+
+ DENC(delta_leaf_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.op, p);
+ denc(v.key, p);
+ denc(v.val, p);
+ DENC_FINISH(p);
+ }
+
+ void replay(StringKVLeafNodeLayout &l);
+ bool operator==(const delta_leaf_t &rhs) const {
+ return op == rhs.op &&
+ key == rhs.key &&
+ val == rhs.val;
+ }
+};
+}
+WRITE_CLASS_DENC(crimson::os::seastore::omap_manager::delta_leaf_t)
+
+namespace crimson::os::seastore::omap_manager {
+class delta_inner_buffer_t {
+ std::vector<delta_inner_t> buffer;
+public:
+ bool empty() const {
+ return buffer.empty();
+ }
+ void insert(
+ const std::string &key,
+ laddr_t addr) {
+ buffer.push_back(
+ delta_inner_t{
+ delta_inner_t::op_t::INSERT,
+ key,
+ addr
+ });
+ }
+ void update(
+ const std::string &key,
+ laddr_t addr) {
+ buffer.push_back(
+ delta_inner_t{
+ delta_inner_t::op_t::UPDATE,
+ key,
+ addr
+ });
+ }
+ void remove(const std::string &key) {
+ buffer.push_back(
+ delta_inner_t{
+ delta_inner_t::op_t::REMOVE,
+ key,
+ L_ADDR_NULL
+ });
+ }
+
+ void replay(StringKVInnerNodeLayout &node) {
+ for (auto &i: buffer) {
+ i.replay(node);
+ }
+ }
+
+ void clear() {
+ buffer.clear();
+ }
+
+ DENC(delta_inner_buffer_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.buffer, p);
+ DENC_FINISH(p);
+ }
+
+ bool operator==(const delta_inner_buffer_t &rhs) const {
+ return buffer == rhs.buffer;
+ }
+};
+}
+WRITE_CLASS_DENC(crimson::os::seastore::omap_manager::delta_inner_buffer_t)
+
+namespace crimson::os::seastore::omap_manager {
+class delta_leaf_buffer_t {
+ std::vector<delta_leaf_t> buffer;
+public:
+ bool empty() const {
+ return buffer.empty();
+ }
+ void insert(
+ const std::string &key,
+ const ceph::bufferlist &val) {
+ buffer.push_back(
+ delta_leaf_t{
+ delta_leaf_t::op_t::INSERT,
+ key,
+ val
+ });
+ }
+ void update(
+ const std::string &key,
+ const ceph::bufferlist &val) {
+ buffer.push_back(
+ delta_leaf_t{
+ delta_leaf_t::op_t::UPDATE,
+ key,
+ val
+ });
+ }
+ void remove(const std::string &key) {
+ buffer.push_back(
+ delta_leaf_t{
+ delta_leaf_t::op_t::REMOVE,
+ key,
+ bufferlist()
+ });
+ }
+
+ void replay(StringKVLeafNodeLayout &node) {
+ for (auto &i: buffer) {
+ i.replay(node);
+ }
+ }
+
+ void clear() {
+ buffer.clear();
+ }
+
+ DENC(delta_leaf_buffer_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.buffer, p);
+ DENC_FINISH(p);
+ }
+
+ bool operator==(const delta_leaf_buffer_t &rhs) const {
+ return buffer == rhs.buffer;
+ }
+};
+}
+WRITE_CLASS_DENC(crimson::os::seastore::omap_manager::delta_leaf_buffer_t)
+
+namespace crimson::os::seastore::omap_manager {
+/**
+ * StringKVInnerNodeLayout
+ *
+ * Uses absl::container_internal::Layout for the actual key memory layout.
+ *
+ * The primary interface exposed is centered on the iterator
+ * and related methods.
+ *
+ * Also included are helpers for doing splits and merges as for a btree.
+ *
+ * layout diagram:
+ *
+ * # <----------------------------- node range --------------------------------------------> #
+ * # #<~># free space #
+ * # <------------- left part -----------------------------> # <~# <----- right keys -----> #
+ * # # <------------ left keys --------------> #~> # #
+ * # # keys [2, n) |<~># #<~>| right keys [2, n) #
+ * # # <--- key 0 ----> | <--- key 1 ----> | # # | <- k1 -> | <-- k0 --> #
+ * # # | | # # | | #
+ * # num_ | meta # key | key | val | key | key | val | # # | key | key #
+ * # keys | depth # off | len | laddr| off | len | laddr| # # | buff | buff #
+ * # | # 0 | 0 | 0 | 1 | 1 | 1 |...#...#...| key 1 | key 0 #
+ * # | | | <- off --+----------> #
+ * # | | ^ | <- off --> #
+ * | | | ^
+ * | +----------------------------------+ |
+ * +----------------------------------------------------------------+
+ */
+class StringKVInnerNodeLayout {
+ char *buf = nullptr;
+
+ using L = absl::container_internal::Layout<ceph_le32, omap_node_meta_le_t, omap_inner_key_le_t>;
+ static constexpr L layout{1, 1, 1}; // = L::Partial(1, 1, 1);
+ friend class delta_inner_t;
+public:
+ template <bool is_const>
+ class iter_t {
+ friend class StringKVInnerNodeLayout;
+
+ template <typename iterator, typename const_iterator>
+ friend void copy_from_foreign(iterator, const_iterator, const_iterator);
+ template <typename iterator>
+ friend void copy_from_local(unsigned, iterator, iterator, iterator);
+
+ using parent_t = typename crimson::common::maybe_const_t<StringKVInnerNodeLayout, is_const>::type;
+
+ mutable parent_t node;
+ uint16_t index;
+
+ iter_t(
+ parent_t parent,
+ uint16_t index) : node(parent), index(index) {}
+
+ public:
+ using iterator_category = std::input_iterator_tag;
+ using value_type = StringKVInnerNodeLayout;
+ using difference_type = std::ptrdiff_t;
+ using pointer = StringKVInnerNodeLayout*;
+ using reference = iter_t&;
+
+ iter_t(const iter_t &) = default;
+ iter_t(iter_t &&) = default;
+ iter_t &operator=(const iter_t &) = default;
+ iter_t &operator=(iter_t &&) = default;
+
+ operator iter_t<!is_const>() const {
+ static_assert(!is_const);
+ return iter_t<!is_const>(node, index);
+ }
+
+ iter_t &operator*() { return *this; }
+ iter_t *operator->() { return this; }
+
+ iter_t operator++(int) {
+ auto ret = *this;
+ ++index;
+ return ret;
+ }
+
+ iter_t &operator++() {
+ ++index;
+ return *this;
+ }
+
+ iter_t operator--(int) {
+ auto ret = *this;
+ assert(index > 0);
+ --index;
+ return ret;
+ }
+
+ iter_t &operator--() {
+ assert(index > 0);
+ --index;
+ return *this;
+ }
+
+ uint16_t operator-(const iter_t &rhs) const {
+ assert(rhs.node == node);
+ return index - rhs.index;
+ }
+
+ iter_t operator+(uint16_t off) const {
+ return iter_t(node, index + off);
+ }
+ iter_t operator-(uint16_t off) const {
+ return iter_t(node, index - off);
+ }
+
+ uint16_t operator<(const iter_t &rhs) const {
+ assert(rhs.node == node);
+ return index < rhs.index;
+ }
+
+ uint16_t operator>(const iter_t &rhs) const {
+ assert(rhs.node == node);
+ return index > rhs.index;
+ }
+
+ friend bool operator==(const iter_t &lhs, const iter_t &rhs) {
+ assert(lhs.node == rhs.node);
+ return lhs.index == rhs.index;
+ }
+
+ private:
+ omap_inner_key_t get_node_key() const {
+ omap_inner_key_le_t kint = node->get_node_key_ptr()[index];
+ return omap_inner_key_t(kint);
+ }
+ auto get_node_key_ptr() const {
+ return reinterpret_cast<
+ typename crimson::common::maybe_const_t<char, is_const>::type>(
+ node->get_node_key_ptr() + index);
+ }
+
+ uint32_t get_node_val_offset() const {
+ return get_node_key().key_off;
+ }
+ auto get_node_val_ptr() const {
+ auto tail = node->buf + OMAP_INNER_BLOCK_SIZE;
+ if (*this == node->iter_end())
+ return tail;
+ else {
+ return tail - get_node_val_offset();
+ }
+ }
+
+ int get_right_offset_end() const {
+ if (index == 0)
+ return 0;
+ else
+ return (*this - 1)->get_node_val_offset();
+ }
+ auto get_right_ptr_end() const {
+ return node->buf + OMAP_INNER_BLOCK_SIZE - get_right_offset_end();
+ }
+
+ void update_offset(int offset) {
+ static_assert(!is_const);
+ auto key = get_node_key();
+ assert(offset + key.key_off >= 0);
+ key.key_off += offset;
+ set_node_key(key);
+ }
+
+ void set_node_key(omap_inner_key_t _lb) {
+ static_assert(!is_const);
+ omap_inner_key_le_t lb;
+ lb = _lb;
+ node->get_node_key_ptr()[index] = lb;
+ }
+
+ void set_node_val(const std::string &str) {
+ static_assert(!is_const);
+ assert(str.size() == get_node_key().key_len);
+ assert(get_node_key().key_off >= str.size());
+ assert(get_node_key().key_off < OMAP_INNER_BLOCK_SIZE);
+ assert(str.size() < OMAP_INNER_BLOCK_SIZE);
+ ::memcpy(get_node_val_ptr(), str.data(), str.size());
+ }
+
+ public:
+ uint16_t get_index() const {
+ return index;
+ }
+
+ std::string get_key() const {
+ return std::string(
+ get_node_val_ptr(),
+ get_node_key().key_len);
+ }
+
+ laddr_t get_val() const {
+ return get_node_key().laddr;
+ }
+
+ bool contains(std::string_view key) const {
+ assert(*this != node->iter_end());
+ auto next = *this + 1;
+ if (next == node->iter_end()) {
+ return get_key() <= key;
+ } else {
+ return (get_key() <= key) && (next->get_key() > key);
+ }
+ }
+ };
+ using const_iterator = iter_t<true>;
+ using iterator = iter_t<false>;
+
+public:
+ void journal_inner_insert(
+ const_iterator _iter,
+ const laddr_t laddr,
+ const std::string &key,
+ delta_inner_buffer_t *recorder) {
+ auto iter = iterator(this, _iter.index);
+ if (recorder) {
+ recorder->insert(
+ key,
+ laddr);
+ }
+ inner_insert(iter, key, laddr);
+ }
+
+ void journal_inner_update(
+ const_iterator _iter,
+ const laddr_t laddr,
+ delta_inner_buffer_t *recorder) {
+ auto iter = iterator(this, _iter.index);
+ auto key = iter->get_key();
+ if (recorder) {
+ recorder->update(key, laddr);
+ }
+ inner_update(iter, laddr);
+ }
+
+ void journal_inner_remove(
+ const_iterator _iter,
+ delta_inner_buffer_t *recorder) {
+ auto iter = iterator(this, _iter.index);
+ if (recorder) {
+ recorder->remove(iter->get_key());
+ }
+ inner_remove(iter);
+ }
+
+ StringKVInnerNodeLayout(char *buf) :
+ buf(buf) {}
+
+ uint32_t get_size() const {
+ ceph_le32 &size = *layout.template Pointer<0>(buf);
+ return uint32_t(size);
+ }
+
+ /**
+ * set_size
+ *
+ * Set size representation to match size
+ */
+ void set_size(uint32_t size) {
+ ceph_le32 s;
+ s = size;
+ *layout.template Pointer<0>(buf) = s;
+ }
+
+ const_iterator iter_cbegin() const {
+ return const_iterator(
+ this,
+ 0);
+ }
+ const_iterator iter_begin() const {
+ return iter_cbegin();
+ }
+
+ const_iterator iter_cend() const {
+ return const_iterator(
+ this,
+ get_size());
+ }
+ const_iterator iter_end() const {
+ return iter_cend();
+ }
+
+ iterator iter_begin() {
+ return iterator(
+ this,
+ 0);
+ }
+
+ iterator iter_end() {
+ return iterator(
+ this,
+ get_size());
+ }
+
+ const_iterator iter_idx(uint16_t off) const {
+ return const_iterator(
+ this,
+ off);
+ }
+
+ const_iterator string_lower_bound(std::string_view str) const {
+ auto it = std::lower_bound(boost::make_counting_iterator<uint16_t>(0),
+ boost::make_counting_iterator<uint16_t>(get_size()),
+ str,
+ [this](uint16_t i, std::string_view str) {
+ const_iterator iter(this, i);
+ return iter->get_key() < str;
+ });
+ return const_iterator(this, *it);
+ }
+
+ iterator string_lower_bound(std::string_view str) {
+ const auto &tref = *this;
+ return iterator(this, tref.string_lower_bound(str).index);
+ }
+
+ const_iterator string_upper_bound(std::string_view str) const {
+ auto it = std::upper_bound(boost::make_counting_iterator<uint16_t>(0),
+ boost::make_counting_iterator<uint16_t>(get_size()),
+ str,
+ [this](std::string_view str, uint16_t i) {
+ const_iterator iter(this, i);
+ return str < iter->get_key();
+ });
+ return const_iterator(this, *it);
+ }
+
+ iterator string_upper_bound(std::string_view str) {
+ const auto &tref = *this;
+ return iterator(this, tref.string_upper_bound(str).index);
+ }
+
+ const_iterator find_string_key(std::string_view str) const {
+ auto ret = iter_begin();
+ for (; ret != iter_end(); ++ret) {
+ std::string s = ret->get_key();
+ if (s == str)
+ break;
+ }
+ return ret;
+ }
+
+ iterator find_string_key(std::string_view str) {
+ const auto &tref = *this;
+ return iterator(this, tref.find_string_key(str).index);
+ }
+
+ const_iterator get_split_pivot() const {
+ uint32_t total_size = omap_inner_key_t(
+ get_node_key_ptr()[get_size()-1]).key_off;
+ uint32_t pivot_size = total_size / 2;
+ uint32_t size = 0;
+ for (auto ite = iter_begin(); ite < iter_end(); ite++) {
+ auto node_key = ite->get_node_key();
+ size += node_key.key_len;
+ if (size >= pivot_size){
+ return ite;
+ }
+ }
+ return iter_end();
+ }
+
+
+ /**
+ * get_meta/set_meta
+ *
+ * Enables stashing a templated type within the layout.
+ * Cannot be modified after initial write as it is not represented
+ * in delta_t
+ */
+ omap_node_meta_t get_meta() const {
+ omap_node_meta_le_t &metaint = *layout.template Pointer<1>(buf);
+ return omap_node_meta_t(metaint);
+ }
+ void set_meta(const omap_node_meta_t &meta) {
+ *layout.template Pointer<1>(buf) = omap_node_meta_le_t(meta);
+ }
+
+ uint32_t used_space() const {
+ uint32_t count = get_size();
+ if (count) {
+ omap_inner_key_t last_key = omap_inner_key_t(get_node_key_ptr()[count-1]);
+ return last_key.key_off + count * sizeof(omap_inner_key_le_t);
+ } else {
+ return 0;
+ }
+ }
+
+ uint32_t free_space() const {
+ return capacity() - used_space();
+ }
+
+ uint16_t capacity() const {
+ return OMAP_INNER_BLOCK_SIZE
+ - (reinterpret_cast<char*>(layout.template Pointer<2>(buf))
+ - reinterpret_cast<char*>(layout.template Pointer<0>(buf)));
+ }
+
+ bool is_overflow(size_t ksize) const {
+ return free_space() < (sizeof(omap_inner_key_le_t) + ksize);
+ }
+
+ bool is_overflow(const StringKVInnerNodeLayout &rhs) const {
+ return free_space() < rhs.used_space();
+ }
+
+ bool below_min() const {
+ return free_space() > (capacity() / 2);
+ }
+
+ bool operator==(const StringKVInnerNodeLayout &rhs) const {
+ if (get_size() != rhs.get_size()) {
+ return false;
+ }
+
+ auto iter = iter_begin();
+ auto iter2 = rhs.iter_begin();
+ while (iter != iter_end()) {
+ if (iter->get_key() != iter2->get_key() ||
+ iter->get_val() != iter2->get_val()) {
+ return false;
+ }
+ iter++;
+ iter2++;
+ }
+ return true;
+ }
+
+ /**
+ * split_into
+ *
+ * Takes *this and splits its contents into left and right.
+ */
+ std::string split_into(
+ StringKVInnerNodeLayout &left,
+ StringKVInnerNodeLayout &right) const {
+ auto piviter = get_split_pivot();
+ assert(piviter != iter_end());
+
+ copy_from_foreign(left.iter_begin(), iter_begin(), piviter);
+ left.set_size(piviter - iter_begin());
+
+ copy_from_foreign(right.iter_begin(), piviter, iter_end());
+ right.set_size(iter_end() - piviter);
+
+ auto [lmeta, rmeta] = get_meta().split_into();
+ left.set_meta(lmeta);
+ right.set_meta(rmeta);
+
+ return piviter->get_key();
+ }
+
+ /**
+ * merge_from
+ *
+ * Takes two nodes and copies their contents into *this.
+ *
+ * precondition: left.size() + right.size() < CAPACITY
+ */
+ void merge_from(
+ const StringKVInnerNodeLayout &left,
+ const StringKVInnerNodeLayout &right) {
+ copy_from_foreign(
+ iter_end(),
+ left.iter_begin(),
+ left.iter_end());
+ set_size(left.get_size());
+
+ copy_from_foreign(
+ iter_end(),
+ right.iter_begin(),
+ right.iter_end());
+ set_size(left.get_size() + right.get_size());
+ set_meta(omap_node_meta_t::merge_from(left.get_meta(), right.get_meta()));
+ }
+
+ /**
+ * balance_into_new_nodes
+ *
+ * Takes the contents of left and right and copies them into
+ * replacement_left and replacement_right such that
+ * the size of replacement_left just >= 1/2 of (left + right)
+ */
+ static std::string balance_into_new_nodes(
+ const StringKVInnerNodeLayout &left,
+ const StringKVInnerNodeLayout &right,
+ StringKVInnerNodeLayout &replacement_left,
+ StringKVInnerNodeLayout &replacement_right)
+ {
+ uint32_t left_size = omap_inner_key_t(left.get_node_key_ptr()[left.get_size()-1]).key_off;
+ uint32_t right_size = omap_inner_key_t(right.get_node_key_ptr()[right.get_size()-1]).key_off;
+ uint32_t total = left_size + right_size;
+ uint32_t pivot_size = total / 2;
+ uint32_t pivot_idx = 0;
+ if (pivot_size < left_size) {
+ uint32_t size = 0;
+ for (auto ite = left.iter_begin(); ite < left.iter_end(); ite++) {
+ auto node_key = ite->get_node_key();
+ size += node_key.key_len;
+ if (size >= pivot_size){
+ pivot_idx = ite.get_index();
+ break;
+ }
+ }
+ } else {
+ uint32_t more_size = pivot_size - left_size;
+ uint32_t size = 0;
+ for (auto ite = right.iter_begin(); ite < right.iter_end(); ite++) {
+ auto node_key = ite->get_node_key();
+ size += node_key.key_len;
+ if (size >= more_size){
+ pivot_idx = ite.get_index() + left.get_size();
+ break;
+ }
+ }
+ }
+
+ auto replacement_pivot = pivot_idx >= left.get_size() ?
+ right.iter_idx(pivot_idx - left.get_size())->get_key() :
+ left.iter_idx(pivot_idx)->get_key();
+
+ if (pivot_size < left_size) {
+ copy_from_foreign(
+ replacement_left.iter_end(),
+ left.iter_begin(),
+ left.iter_idx(pivot_idx));
+ replacement_left.set_size(pivot_idx);
+
+ copy_from_foreign(
+ replacement_right.iter_end(),
+ left.iter_idx(pivot_idx),
+ left.iter_end());
+ replacement_right.set_size(left.get_size() - pivot_idx);
+
+ copy_from_foreign(
+ replacement_right.iter_end(),
+ right.iter_begin(),
+ right.iter_end());
+ replacement_right.set_size(right.get_size() + left.get_size()- pivot_idx);
+ } else {
+ copy_from_foreign(
+ replacement_left.iter_end(),
+ left.iter_begin(),
+ left.iter_end());
+ replacement_left.set_size(left.get_size());
+
+ copy_from_foreign(
+ replacement_left.iter_end(),
+ right.iter_begin(),
+ right.iter_idx(pivot_idx - left.get_size()));
+ replacement_left.set_size(pivot_idx);
+
+ copy_from_foreign(
+ replacement_right.iter_end(),
+ right.iter_idx(pivot_idx - left.get_size()),
+ right.iter_end());
+ replacement_right.set_size(right.get_size() + left.get_size() - pivot_idx);
+ }
+
+ auto [lmeta, rmeta] = omap_node_meta_t::rebalance(
+ left.get_meta(), right.get_meta());
+ replacement_left.set_meta(lmeta);
+ replacement_right.set_meta(rmeta);
+ return replacement_pivot;
+ }
+
+private:
+ void inner_insert(
+ iterator iter,
+ const std::string &key,
+ laddr_t val) {
+ if (iter != iter_begin()) {
+ assert((iter - 1)->get_key() < key);
+ }
+ if (iter != iter_end()) {
+ assert(iter->get_key() > key);
+ }
+ assert(!is_overflow(key.size()));
+
+ if (iter != iter_end()) {
+ copy_from_local(key.size(), iter + 1, iter, iter_end());
+ }
+
+ omap_inner_key_t nkey;
+ nkey.key_len = key.size();
+ nkey.laddr = val;
+ if (iter != iter_begin()) {
+ auto pkey = (iter - 1).get_node_key();
+ nkey.key_off = nkey.key_len + pkey.key_off;
+ } else {
+ nkey.key_off = nkey.key_len;
+ }
+
+ iter->set_node_key(nkey);
+ set_size(get_size() + 1);
+ iter->set_node_val(key);
+ }
+
+ void inner_update(
+ iterator iter,
+ laddr_t addr) {
+ assert(iter != iter_end());
+ auto node_key = iter->get_node_key();
+ node_key.laddr = addr;
+ iter->set_node_key(node_key);
+ }
+
+ void inner_remove(iterator iter) {
+ assert(iter != iter_end());
+ if ((iter + 1) != iter_end())
+ copy_from_local(iter->get_node_key().key_len, iter, iter + 1, iter_end());
+ set_size(get_size() - 1);
+ }
+
+ /**
+ * get_key_ptr
+ *
+ * Get pointer to start of key array
+ */
+ omap_inner_key_le_t *get_node_key_ptr() {
+ return L::Partial(1, 1, get_size()).template Pointer<2>(buf);
+ }
+ const omap_inner_key_le_t *get_node_key_ptr() const {
+ return L::Partial(1, 1, get_size()).template Pointer<2>(buf);
+ }
+
+};
+
+/**
+ * StringKVLeafNodeLayout
+ *
+ * layout diagram:
+ *
+ * # <----------------------------- node range -------------------------------------------------> #
+ * # #<~># free space #
+ * # <------------- left part ---------------------------> # <~# <----- right key-value pairs --> #
+ * # # <------------ left keys ------------> #~> # #
+ * # # keys [2, n) |<~># #<~>| right kvs [2, n) #
+ * # # <--- key 0 ---> | <--- key 1 ---> | # # | <-- kv 1 --> | <-- kv 0 --> #
+ * # # | | # # | | #
+ * # num_ | meta # key | key | val | key | key | val | # # | key | val | key | val #
+ * # keys | depth # off | len | len | off | len | len | # # | buff | buff | buff | buff #
+ * # # 0 | 0 | 0 | 1 | 1 | 1 |...#...#...| key 1 | val 1| key 0 | val 0 #
+ * # | | | <--- off ----+-------------> #
+ * # | | ^ | <--- off ---> #
+ * | | | ^
+ * | +-----------------------------------+ |
+ * +-------------------------------------------------------------------+
+ */
+class StringKVLeafNodeLayout {
+ char *buf = nullptr;
+
+ using L = absl::container_internal::Layout<ceph_le32, omap_node_meta_le_t, omap_leaf_key_le_t>;
+ static constexpr L layout{1, 1, 1}; // = L::Partial(1, 1, 1);
+ friend class delta_leaf_t;
+
+public:
+ template <bool is_const>
+ class iter_t {
+ friend class StringKVLeafNodeLayout;
+ using parent_t = typename crimson::common::maybe_const_t<StringKVLeafNodeLayout, is_const>::type;
+
+ template <typename iterator, typename const_iterator>
+ friend void copy_from_foreign(iterator, const_iterator, const_iterator);
+ template <typename iterator>
+ friend void copy_from_local(unsigned, iterator, iterator, iterator);
+
+ parent_t node;
+ uint16_t index;
+
+ iter_t(
+ parent_t parent,
+ uint16_t index) : node(parent), index(index) {}
+
+ public:
+ iter_t(const iter_t &) = default;
+ iter_t(iter_t &&) = default;
+ iter_t &operator=(const iter_t &) = default;
+ iter_t &operator=(iter_t &&) = default;
+
+ operator iter_t<!is_const>() const {
+ static_assert(!is_const);
+ return iter_t<!is_const>(node, index);
+ }
+
+ iter_t &operator*() { return *this; }
+ iter_t *operator->() { return this; }
+
+ iter_t operator++(int) {
+ auto ret = *this;
+ ++index;
+ return ret;
+ }
+
+ iter_t &operator++() {
+ ++index;
+ return *this;
+ }
+
+ uint16_t operator-(const iter_t &rhs) const {
+ assert(rhs.node == node);
+ return index - rhs.index;
+ }
+
+ iter_t operator+(uint16_t off) const {
+ return iter_t(
+ node,
+ index + off);
+ }
+ iter_t operator-(uint16_t off) const {
+ return iter_t(
+ node,
+ index - off);
+ }
+
+ uint16_t operator<(const iter_t &rhs) const {
+ assert(rhs.node == node);
+ return index < rhs.index;
+ }
+
+ uint16_t operator>(const iter_t &rhs) const {
+ assert(rhs.node == node);
+ return index > rhs.index;
+ }
+
+ bool operator==(const iter_t &rhs) const {
+ assert(node == rhs.node);
+ return rhs.index == index;
+ }
+
+ bool operator!=(const iter_t &rhs) const {
+ assert(node == rhs.node);
+ return index != rhs.index;
+ }
+
+ private:
+ omap_leaf_key_t get_node_key() const {
+ omap_leaf_key_le_t kint = node->get_node_key_ptr()[index];
+ return omap_leaf_key_t(kint);
+ }
+ auto get_node_key_ptr() const {
+ return reinterpret_cast<
+ typename crimson::common::maybe_const_t<char, is_const>::type>(
+ node->get_node_key_ptr() + index);
+ }
+
+ uint32_t get_node_val_offset() const {
+ return get_node_key().key_off;
+ }
+ auto get_node_val_ptr() const {
+ auto tail = node->buf + OMAP_LEAF_BLOCK_SIZE;
+ if (*this == node->iter_end())
+ return tail;
+ else {
+ return tail - get_node_val_offset();
+ }
+ }
+
+ int get_right_offset_end() const {
+ if (index == 0)
+ return 0;
+ else
+ return (*this - 1)->get_node_val_offset();
+ }
+ auto get_right_ptr_end() const {
+ return node->buf + OMAP_LEAF_BLOCK_SIZE - get_right_offset_end();
+ }
+
+ void update_offset(int offset) {
+ auto key = get_node_key();
+ assert(offset + key.key_off >= 0);
+ key.key_off += offset;
+ set_node_key(key);
+ }
+
+ void set_node_key(omap_leaf_key_t _lb) const {
+ static_assert(!is_const);
+ omap_leaf_key_le_t lb;
+ lb = _lb;
+ node->get_node_key_ptr()[index] = lb;
+ }
+
+ void set_node_val(const std::string &key, const ceph::bufferlist &val) {
+ static_assert(!is_const);
+ auto node_key = get_node_key();
+ assert(key.size() == node_key.key_len);
+ assert(val.length() == node_key.val_len);
+ ::memcpy(get_node_val_ptr(), key.data(), key.size());
+ auto bliter = val.begin();
+ bliter.copy(node_key.val_len, get_node_val_ptr() + node_key.key_len);
+ }
+
+ public:
+ uint16_t get_index() const {
+ return index;
+ }
+
+ std::string get_key() const {
+ return std::string(
+ get_node_val_ptr(),
+ get_node_key().key_len);
+ }
+
+ std::string get_str_val() const {
+ auto node_key = get_node_key();
+ return std::string(
+ get_node_val_ptr() + node_key.key_len,
+ get_node_key().val_len);
+ }
+
+ ceph::bufferlist get_val() const {
+ auto node_key = get_node_key();
+ ceph::bufferlist bl;
+ ceph::bufferptr bptr(
+ get_node_val_ptr() + node_key.key_len,
+ get_node_key().val_len);
+ bl.append(bptr);
+ return bl;
+ }
+ };
+ using const_iterator = iter_t<true>;
+ using iterator = iter_t<false>;
+
+public:
+ void journal_leaf_insert(
+ const_iterator _iter,
+ const std::string &key,
+ const ceph::bufferlist &val,
+ delta_leaf_buffer_t *recorder) {
+ auto iter = iterator(this, _iter.index);
+ if (recorder) {
+ recorder->insert(
+ key,
+ val);
+ }
+ leaf_insert(iter, key, val);
+ }
+
+ void journal_leaf_update(
+ const_iterator _iter,
+ const std::string &key,
+ const ceph::bufferlist &val,
+ delta_leaf_buffer_t *recorder) {
+ auto iter = iterator(this, _iter.index);
+ if (recorder) {
+ recorder->remove(iter->get_key());
+ recorder->insert(key, val);
+ }
+ leaf_update(iter, key, val);
+ }
+
+ void journal_leaf_remove(
+ const_iterator _iter,
+ delta_leaf_buffer_t *recorder) {
+ auto iter = iterator(this, _iter.index);
+ if (recorder) {
+ recorder->remove(iter->get_key());
+ }
+ leaf_remove(iter);
+ }
+
+ StringKVLeafNodeLayout(char *buf) :
+ buf(buf) {}
+
+ const_iterator iter_begin() const {
+ return const_iterator(
+ this,
+ 0);
+ }
+
+ const_iterator iter_end() const {
+ return const_iterator(
+ this,
+ get_size());
+ }
+
+ iterator iter_begin() {
+ return iterator(
+ this,
+ 0);
+ }
+
+ iterator iter_end() {
+ return iterator(
+ this,
+ get_size());
+ }
+
+ const_iterator iter_idx(uint16_t off) const {
+ return const_iterator(
+ this,
+ off);
+ }
+
+ const_iterator string_lower_bound(std::string_view str) const {
+ uint16_t start = 0, end = get_size();
+ while (start != end) {
+ unsigned mid = (start + end) / 2;
+ const_iterator iter(this, mid);
+ std::string s = iter->get_key();
+ if (s < str) {
+ start = ++mid;
+ } else if (s > str) {
+ end = mid;
+ } else {
+ return iter;
+ }
+ }
+ return const_iterator(this, start);
+ }
+
+ iterator string_lower_bound(std::string_view str) {
+ const auto &tref = *this;
+ return iterator(this, tref.string_lower_bound(str).index);
+ }
+
+ const_iterator string_upper_bound(std::string_view str) const {
+ auto ret = iter_begin();
+ for (; ret != iter_end(); ++ret) {
+ std::string s = ret->get_key();
+ if (s > str)
+ break;
+ }
+ return ret;
+ }
+
+ iterator string_upper_bound(std::string_view str) {
+ const auto &tref = *this;
+ return iterator(this, tref.string_upper_bound(str).index);
+ }
+
+ const_iterator find_string_key(std::string_view str) const {
+ auto ret = iter_begin();
+ for (; ret != iter_end(); ++ret) {
+ std::string s = ret->get_key();
+ if (s == str)
+ break;
+ }
+ return ret;
+ }
+ iterator find_string_key(std::string_view str) {
+ const auto &tref = *this;
+ return iterator(this, tref.find_string_key(str).index);
+ }
+
+ const_iterator get_split_pivot() const {
+ uint32_t total_size = omap_leaf_key_t(get_node_key_ptr()[get_size()-1]).key_off;
+ uint32_t pivot_size = total_size / 2;
+ uint32_t size = 0;
+ for (auto ite = iter_begin(); ite < iter_end(); ite++) {
+ auto node_key = ite->get_node_key();
+ size += node_key.key_len + node_key.val_len;
+ if (size >= pivot_size){
+ return ite;
+ }
+ }
+ return iter_end();
+ }
+
+ uint32_t get_size() const {
+ ceph_le32 &size = *layout.template Pointer<0>(buf);
+ return uint32_t(size);
+ }
+
+ /**
+ * set_size
+ *
+ * Set size representation to match size
+ */
+ void set_size(uint32_t size) {
+ ceph_le32 s;
+ s = size;
+ *layout.template Pointer<0>(buf) = s;
+ }
+
+ /**
+ * get_meta/set_meta
+ *
+ * Enables stashing a templated type within the layout.
+ * Cannot be modified after initial write as it is not represented
+ * in delta_t
+ */
+ omap_node_meta_t get_meta() const {
+ omap_node_meta_le_t &metaint = *layout.template Pointer<1>(buf);
+ return omap_node_meta_t(metaint);
+ }
+ void set_meta(const omap_node_meta_t &meta) {
+ *layout.template Pointer<1>(buf) = omap_node_meta_le_t(meta);
+ }
+
+ uint32_t used_space() const {
+ uint32_t count = get_size();
+ if (count) {
+ omap_leaf_key_t last_key = omap_leaf_key_t(get_node_key_ptr()[count-1]);
+ return last_key.key_off + count * sizeof(omap_leaf_key_le_t);
+ } else {
+ return 0;
+ }
+ }
+
+ uint32_t free_space() const {
+ return capacity() - used_space();
+ }
+
+ uint32_t capacity() const {
+ return OMAP_LEAF_BLOCK_SIZE
+ - (reinterpret_cast<char*>(layout.template Pointer<2>(buf))
+ - reinterpret_cast<char*>(layout.template Pointer<0>(buf)));
+ }
+
+ bool is_overflow(size_t ksize, size_t vsize) const {
+ return free_space() < (sizeof(omap_leaf_key_le_t) + ksize + vsize);
+ }
+
+ bool is_overflow(const StringKVLeafNodeLayout &rhs) const {
+ return free_space() < rhs.used_space();
+ }
+
+ bool below_min() const {
+ return free_space() > (capacity() / 2);
+ }
+
+ bool operator==(const StringKVLeafNodeLayout &rhs) const {
+ if (get_size() != rhs.get_size()) {
+ return false;
+ }
+
+ auto iter = iter_begin();
+ auto iter2 = rhs.iter_begin();
+ while (iter != iter_end()) {
+ if(iter->get_key() != iter2->get_key() ||
+ iter->get_val() != iter2->get_val()) {
+ return false;
+ }
+ iter++;
+ iter2++;
+ }
+ return true;
+ }
+
+ /**
+ * split_into
+ *
+ * Takes *this and splits its contents into left and right.
+ */
+ std::string split_into(
+ StringKVLeafNodeLayout &left,
+ StringKVLeafNodeLayout &right) const {
+ auto piviter = get_split_pivot();
+ assert (piviter != iter_end());
+
+ copy_from_foreign(left.iter_begin(), iter_begin(), piviter);
+ left.set_size(piviter - iter_begin());
+
+ copy_from_foreign(right.iter_begin(), piviter, iter_end());
+ right.set_size(iter_end() - piviter);
+
+ auto [lmeta, rmeta] = get_meta().split_into();
+ left.set_meta(lmeta);
+ right.set_meta(rmeta);
+
+ return piviter->get_key();
+ }
+
+ /**
+ * merge_from
+ *
+ * Takes two nodes and copies their contents into *this.
+ *
+ * precondition: left.size() + right.size() < CAPACITY
+ */
+ void merge_from(
+ const StringKVLeafNodeLayout &left,
+ const StringKVLeafNodeLayout &right)
+ {
+ copy_from_foreign(
+ iter_end(),
+ left.iter_begin(),
+ left.iter_end());
+ set_size(left.get_size());
+ copy_from_foreign(
+ iter_end(),
+ right.iter_begin(),
+ right.iter_end());
+ set_size(left.get_size() + right.get_size());
+ set_meta(omap_node_meta_t::merge_from(left.get_meta(), right.get_meta()));
+ }
+
+ /**
+ * balance_into_new_nodes
+ *
+ * Takes the contents of left and right and copies them into
+ * replacement_left and replacement_right such that
+ * the size of replacement_left side just >= 1/2 of the total size (left + right).
+ */
+ static std::string balance_into_new_nodes(
+ const StringKVLeafNodeLayout &left,
+ const StringKVLeafNodeLayout &right,
+ StringKVLeafNodeLayout &replacement_left,
+ StringKVLeafNodeLayout &replacement_right)
+ {
+ uint32_t left_size = omap_leaf_key_t(left.get_node_key_ptr()[left.get_size()-1]).key_off;
+ uint32_t right_size = omap_leaf_key_t(right.get_node_key_ptr()[right.get_size()-1]).key_off;
+ uint32_t total = left_size + right_size;
+ uint32_t pivot_size = total / 2;
+ uint32_t pivot_idx = 0;
+ if (pivot_size < left_size) {
+ uint32_t size = 0;
+ for (auto ite = left.iter_begin(); ite < left.iter_end(); ite++) {
+ auto node_key = ite->get_node_key();
+ size += node_key.key_len + node_key.val_len;
+ if (size >= pivot_size){
+ pivot_idx = ite.get_index();
+ break;
+ }
+ }
+ } else {
+ uint32_t more_size = pivot_size - left_size;
+ uint32_t size = 0;
+ for (auto ite = right.iter_begin(); ite < right.iter_end(); ite++) {
+ auto node_key = ite->get_node_key();
+ size += node_key.key_len + node_key.val_len;
+ if (size >= more_size){
+ pivot_idx = ite.get_index() + left.get_size();
+ break;
+ }
+ }
+ }
+
+ auto replacement_pivot = pivot_idx >= left.get_size() ?
+ right.iter_idx(pivot_idx - left.get_size())->get_key() :
+ left.iter_idx(pivot_idx)->get_key();
+
+ if (pivot_size < left_size) {
+ copy_from_foreign(
+ replacement_left.iter_end(),
+ left.iter_begin(),
+ left.iter_idx(pivot_idx));
+ replacement_left.set_size(pivot_idx);
+
+ copy_from_foreign(
+ replacement_right.iter_end(),
+ left.iter_idx(pivot_idx),
+ left.iter_end());
+ replacement_right.set_size(left.get_size() - pivot_idx);
+
+ copy_from_foreign(
+ replacement_right.iter_end(),
+ right.iter_begin(),
+ right.iter_end());
+ replacement_right.set_size(right.get_size() + left.get_size() - pivot_idx);
+ } else {
+ copy_from_foreign(
+ replacement_left.iter_end(),
+ left.iter_begin(),
+ left.iter_end());
+ replacement_left.set_size(left.get_size());
+
+ copy_from_foreign(
+ replacement_left.iter_end(),
+ right.iter_begin(),
+ right.iter_idx(pivot_idx - left.get_size()));
+ replacement_left.set_size(pivot_idx);
+
+ copy_from_foreign(
+ replacement_right.iter_end(),
+ right.iter_idx(pivot_idx - left.get_size()),
+ right.iter_end());
+ replacement_right.set_size(right.get_size() + left.get_size() - pivot_idx);
+ }
+
+ auto [lmeta, rmeta] = omap_node_meta_t::rebalance(
+ left.get_meta(), right.get_meta());
+ replacement_left.set_meta(lmeta);
+ replacement_right.set_meta(rmeta);
+ return replacement_pivot;
+ }
+
+private:
+ void leaf_insert(
+ iterator iter,
+ const std::string &key,
+ const bufferlist &val) {
+ if (iter != iter_begin()) {
+ assert((iter - 1)->get_key() < key);
+ }
+ if (iter != iter_end()) {
+ assert(iter->get_key() > key);
+ }
+ assert(!is_overflow(key.size(), val.length()));
+ omap_leaf_key_t node_key;
+ if (iter == iter_begin()) {
+ node_key.key_off = key.size() + val.length();
+ node_key.key_len = key.size();
+ node_key.val_len = val.length();
+ } else {
+ node_key.key_off = (iter - 1)->get_node_key().key_off +
+ (key.size() + val.length());
+ node_key.key_len = key.size();
+ node_key.val_len = val.length();
+ }
+ if (get_size() != 0 && iter != iter_end())
+ copy_from_local(node_key.key_len + node_key.val_len, iter + 1, iter, iter_end());
+
+ iter->set_node_key(node_key);
+ set_size(get_size() + 1);
+ iter->set_node_val(key, val);
+ }
+
+ void leaf_update(
+ iterator iter,
+ const std::string &key,
+ const ceph::bufferlist &val) {
+ assert(iter != iter_end());
+ leaf_remove(iter);
+ assert(!is_overflow(key.size(), val.length()));
+ leaf_insert(iter, key, val);
+ }
+
+ void leaf_remove(iterator iter) {
+ assert(iter != iter_end());
+ if ((iter + 1) != iter_end()) {
+ omap_leaf_key_t key = iter->get_node_key();
+ copy_from_local(key.key_len + key.val_len, iter, iter + 1, iter_end());
+ }
+ set_size(get_size() - 1);
+ }
+
+ /**
+ * get_key_ptr
+ *
+ * Get pointer to start of key array
+ */
+ omap_leaf_key_le_t *get_node_key_ptr() {
+ return L::Partial(1, 1, get_size()).template Pointer<2>(buf);
+ }
+ const omap_leaf_key_le_t *get_node_key_ptr() const {
+ return L::Partial(1, 1, get_size()).template Pointer<2>(buf);
+ }
+
+};
+
+inline void delta_inner_t::replay(StringKVInnerNodeLayout &l) {
+ switch (op) {
+ case op_t::INSERT: {
+ l.inner_insert(l.string_lower_bound(key), key, addr);
+ break;
+ }
+ case op_t::UPDATE: {
+ auto iter = l.find_string_key(key);
+ assert(iter != l.iter_end());
+ l.inner_update(iter, addr);
+ break;
+ }
+ case op_t::REMOVE: {
+ auto iter = l.find_string_key(key);
+ assert(iter != l.iter_end());
+ l.inner_remove(iter);
+ break;
+ }
+ default:
+ assert(0 == "Impossible");
+ }
+}
+
+inline void delta_leaf_t::replay(StringKVLeafNodeLayout &l) {
+ switch (op) {
+ case op_t::INSERT: {
+ l.leaf_insert(l.string_lower_bound(key), key, val);
+ break;
+ }
+ case op_t::UPDATE: {
+ auto iter = l.find_string_key(key);
+ assert(iter != l.iter_end());
+ l.leaf_update(iter, key, val);
+ break;
+ }
+ case op_t::REMOVE: {
+ auto iter = l.find_string_key(key);
+ assert(iter != l.iter_end());
+ l.leaf_remove(iter);
+ break;
+ }
+ default:
+ assert(0 == "Impossible");
+ }
+}
+
+}