Diffstat (limited to 'src/crimson/os/seastore/lba_manager/btree')
-rw-r--r--  src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc    | 580
-rw-r--r--  src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h     | 188
-rw-r--r--  src/crimson/os/seastore/lba_manager/btree/btree_range_pin.cc      | 153
-rw-r--r--  src/crimson/os/seastore/lba_manager/btree/btree_range_pin.h       | 274
-rw-r--r--  src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h        | 269
-rw-r--r--  src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.cc  | 701
-rw-r--r--  src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h   | 555
7 files changed, 2720 insertions(+), 0 deletions(-)
diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc
new file mode 100644
index 000000000..a837ae37e
--- /dev/null
+++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc
@@ -0,0 +1,580 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <sys/mman.h>
+#include <string.h>
+
+#include "crimson/common/log.h"
+
+#include "include/buffer.h"
+#include "crimson/os/seastore/lba_manager/btree/btree_lba_manager.h"
+#include "crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h"
+
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+}
+
+namespace crimson::os::seastore::lba_manager::btree {
+
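+// mkfs bootstraps an empty tree: a single leaf node at depth 1 covering
+// the entire laddr space [0, L_ADDR_MAX), recorded as the lba root in the
+// cached root block. All later inserts grow the tree from this leaf.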
+BtreeLBAManager::mkfs_ret BtreeLBAManager::mkfs(
+ Transaction &t)
+{
+ logger().debug("BtreeLBAManager::mkfs");
+ return cache.get_root(t).safe_then([this, &t](auto croot) {
+ auto root_leaf = cache.alloc_new_extent<LBALeafNode>(
+ t,
+ LBA_BLOCK_SIZE);
+ root_leaf->set_size(0);
+ lba_node_meta_t meta{0, L_ADDR_MAX, 1};
+ root_leaf->set_meta(meta);
+ root_leaf->pin.set_range(meta);
+ croot->get_root() =
+ root_t{
+ 1,
+ 0,
+ root_leaf->get_paddr(),
+ make_record_relative_paddr(0),
+ L_ADDR_NULL};
+ return mkfs_ertr::now();
+ });
+}
+
+BtreeLBAManager::get_root_ret
+BtreeLBAManager::get_root(Transaction &t)
+{
+ return cache.get_root(t).safe_then([this, &t](auto croot) {
+ logger().debug(
+ "BtreeLBAManager::get_root: reading root at {} depth {}",
+ paddr_t{croot->get_root().lba_root_addr},
+ unsigned(croot->get_root().lba_depth));
+ return get_lba_btree_extent(
+ get_context(t),
+ croot->get_root().lba_depth,
+ croot->get_root().lba_root_addr,
+ paddr_t());
+ });
+}
+
+BtreeLBAManager::get_mapping_ret
+BtreeLBAManager::get_mapping(
+ Transaction &t,
+ laddr_t offset, extent_len_t length)
+{
+ logger().debug("BtreeLBAManager::get_mapping: {}, {}", offset, length);
+ return get_root(
+ t).safe_then([this, &t, offset, length](auto extent) {
+ return extent->lookup_range(
+ get_context(t),
+ offset, length
+ ).safe_then([extent](auto ret) { return ret; });
+ }).safe_then([](auto &&e) {
+ logger().debug("BtreeLBAManager::get_mapping: got mapping {}", e);
+ return get_mapping_ret(
+ get_mapping_ertr::ready_future_marker{},
+ std::move(e));
+ });
+}
+
+
+BtreeLBAManager::get_mappings_ret
+BtreeLBAManager::get_mappings(
+ Transaction &t,
+ laddr_list_t &&list)
+{
+ logger().debug("BtreeLBAManager::get_mappings: {}", list);
+ auto l = std::make_unique<laddr_list_t>(std::move(list));
+ auto retptr = std::make_unique<lba_pin_list_t>();
+ auto &ret = *retptr;
+ return crimson::do_for_each(
+ l->begin(),
+ l->end(),
+ [this, &t, &ret](const auto &p) {
+ return get_mapping(t, p.first, p.second).safe_then(
+ [&ret](auto res) {
+ ret.splice(ret.end(), res, res.begin(), res.end());
+ });
+ }).safe_then([l=std::move(l), retptr=std::move(retptr)]() mutable {
+ return std::move(*retptr);
+ });
+}
+
+BtreeLBAManager::alloc_extent_ret
+BtreeLBAManager::alloc_extent(
+ Transaction &t,
+ laddr_t hint,
+ extent_len_t len,
+ paddr_t addr)
+{
+ // TODO: we can certainly combine the lookup and the insert.
+ return get_root(
+ t).safe_then([this, &t, hint, len](auto extent) {
+ logger().debug(
+ "BtreeLBAManager::alloc_extent: beginning search at {}",
+ *extent);
+ return extent->find_hole(
+ get_context(t),
+ hint,
+ L_ADDR_MAX,
+ len).safe_then([extent](auto ret) {
+ return std::make_pair(ret, extent);
+ });
+ }).safe_then([this, &t, len, addr](auto allocation_pair) {
+ auto &[laddr, extent] = allocation_pair;
+ ceph_assert(laddr != L_ADDR_MAX);
+ return insert_mapping(
+ t,
+ extent,
+ laddr,
+ { len, addr, 1, 0 }
+ ).safe_then([laddr=laddr, addr, len](auto pin) {
+ logger().debug(
+ "BtreeLBAManager::alloc_extent: alloc {}~{} for {}",
+ laddr,
+ len,
+ addr);
+ return alloc_extent_ret(
+ alloc_extent_ertr::ready_future_marker{},
+ LBAPinRef(pin.release()));
+ });
+ });
+}
+
+BtreeLBAManager::set_extent_ret
+BtreeLBAManager::set_extent(
+ Transaction &t,
+ laddr_t off, extent_len_t len, paddr_t addr)
+{
+ return get_root(
+ t).safe_then([this, &t, off, len, addr](auto root) {
+ return insert_mapping(
+ t,
+ root,
+ off,
+ { len, addr, 1, 0 });
+ }).safe_then([](auto ret) {
+ return set_extent_ret(
+ set_extent_ertr::ready_future_marker{},
+ LBAPinRef(ret.release()));
+ });
+}
+
+static bool is_lba_node(extent_types_t type)
+{
+ return type == extent_types_t::LADDR_INTERNAL ||
+ type == extent_types_t::LADDR_LEAF;
+}
+
+static bool is_lba_node(const CachedExtent &e)
+{
+ return is_lba_node(e.get_type());
+}
+
+btree_range_pin_t &BtreeLBAManager::get_pin(CachedExtent &e)
+{
+ if (is_lba_node(e)) {
+ return e.cast<LBANode>()->pin;
+ } else if (e.is_logical()) {
+ return static_cast<BtreeLBAPin &>(
+ e.cast<LogicalCachedExtent>()->get_pin()).pin;
+ } else {
+ ceph_abort_msg("impossible");
+ }
+}
+
+static depth_t get_depth(const CachedExtent &e)
+{
+ if (is_lba_node(e)) {
+ return e.cast<LBANode>()->get_node_meta().depth;
+ } else if (e.is_logical()) {
+ return 0;
+ } else {
+ ceph_assert(0 == "currently impossible");
+ return 0;
+ }
+}
+
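+// complete_transaction repairs the pin set after a transaction commits.
+// Ordering matters: retiring runs leaf-to-root (ascending depth) so a
+// child's pin is gone before its parent's, while linking fresh extents
+// runs root-to-leaf (descending depth) so each new pin can find its parent
+// already in the set; check_parent then releases parents left childless.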
+BtreeLBAManager::complete_transaction_ret
+BtreeLBAManager::complete_transaction(
+ Transaction &t)
+{
+ std::vector<CachedExtentRef> to_clear;
+ to_clear.reserve(t.get_retired_set().size());
+ for (auto &e: t.get_retired_set()) {
+ if (e->is_logical() || is_lba_node(*e))
+ to_clear.push_back(e);
+ }
+ // need to call check_parent from leaf->parent
+ std::sort(
+ to_clear.begin(), to_clear.end(),
+ [](auto &l, auto &r) { return get_depth(*l) < get_depth(*r); });
+
+ for (auto &e: to_clear) {
+ auto &pin = get_pin(*e);
+ logger().debug("{}: retiring {}, {}", __func__, *e, pin);
+ pin_set.retire(pin);
+ }
+
+ // ...but add_pin from parent->leaf
+ std::vector<CachedExtentRef> to_link;
+ to_link.reserve(t.get_fresh_block_list().size());
+ for (auto &e: t.get_fresh_block_list()) {
+ if (e->is_valid() && (is_lba_node(*e) || e->is_logical()))
+ to_link.push_back(e);
+ }
+ std::sort(
+ to_link.begin(), to_link.end(),
+ [](auto &l, auto &r) -> bool { return get_depth(*l) > get_depth(*r); });
+
+ for (auto &e : to_link) {
+ logger().debug("{}: linking {}", __func__, *e);
+ pin_set.add_pin(get_pin(*e));
+ }
+
+ for (auto &e: to_clear) {
+ auto &pin = get_pin(*e);
+ logger().debug("{}: checking {}, {}", __func__, *e, pin);
+ pin_set.check_parent(pin);
+ }
+ return complete_transaction_ertr::now();
+}
+
+BtreeLBAManager::init_cached_extent_ret BtreeLBAManager::init_cached_extent(
+ Transaction &t,
+ CachedExtentRef e)
+{
+ logger().debug("{}: {}", __func__, *e);
+ return get_root(t).safe_then(
+ [this, &t, e=std::move(e)](LBANodeRef root) mutable {
+ if (is_lba_node(*e)) {
+ auto lban = e->cast<LBANode>();
+ logger().debug("init_cached_extent: lba node, getting root");
+ return root->lookup(
+ op_context_t{cache, pin_set, t},
+ lban->get_node_meta().begin,
+ lban->get_node_meta().depth
+ ).safe_then([this, e=std::move(e)](LBANodeRef c) {
+ if (c->get_paddr() == e->get_paddr()) {
+ assert(&*c == &*e);
+ logger().debug("init_cached_extent: {} initialized", *e);
+ } else {
+ // e is obsolete
+ logger().debug("init_cached_extent: {} obsolete", *e);
+ cache.drop_from_cache(e);
+ }
+ return init_cached_extent_ertr::now();
+ });
+ } else if (e->is_logical()) {
+ auto logn = e->cast<LogicalCachedExtent>();
+ return root->lookup_range(
+ op_context_t{cache, pin_set, t},
+ logn->get_laddr(),
+ logn->get_length()).safe_then(
+ [this, logn=std::move(logn)](auto pins) {
+ if (pins.size() == 1) {
+ auto pin = std::move(pins.front());
+ pins.pop_front();
+ if (pin->get_paddr() == logn->get_paddr()) {
+ logn->set_pin(std::move(pin));
+ pin_set.add_pin(
+ static_cast<BtreeLBAPin&>(logn->get_pin()).pin);
+ logger().debug("init_cached_extent: {} initialized", *logn);
+ } else {
+ // paddr doesn't match, remapped, obsolete
+ logger().debug("init_cached_extent: {} obsolete", *logn);
+ cache.drop_from_cache(logn);
+ }
+ } else {
+ // set of extents changed, obsolete
+ logger().debug("init_cached_extent: {} obsolete", *logn);
+ cache.drop_from_cache(logn);
+ }
+ return init_cached_extent_ertr::now();
+ });
+ } else {
+ logger().debug("init_cached_extent: {} skipped", *e);
+ return init_cached_extent_ertr::now();
+ }
+ });
+}
+
+BtreeLBAManager::scan_mappings_ret BtreeLBAManager::scan_mappings(
+ Transaction &t,
+ laddr_t begin,
+ laddr_t end,
+ scan_mappings_func_t &&f)
+{
+ return seastar::do_with(
+ std::move(f),
+ LBANodeRef(),
+ [=, &t](auto &f, auto &lbarootref) {
+ return get_root(t).safe_then(
+ [=, &t, &f](LBANodeRef lbaroot) mutable {
+ lbarootref = lbaroot;
+ return lbaroot->scan_mappings(
+ get_context(t),
+ begin,
+ end,
+ f);
+ });
+ });
+}
+
+BtreeLBAManager::scan_mapped_space_ret BtreeLBAManager::scan_mapped_space(
+ Transaction &t,
+ scan_mapped_space_func_t &&f)
+{
+ return seastar::do_with(
+ std::move(f),
+ LBANodeRef(),
+ [=, &t](auto &f, auto &lbarootref) {
+ return get_root(t).safe_then(
+ [=, &t, &f](LBANodeRef lbaroot) mutable {
+ lbarootref = lbaroot;
+ return lbaroot->scan_mapped_space(
+ get_context(t),
+ f);
+ });
+ });
+}
+
+BtreeLBAManager::rewrite_extent_ret BtreeLBAManager::rewrite_extent(
+ Transaction &t,
+ CachedExtentRef extent)
+{
+ if (extent->is_logical()) {
+ auto lextent = extent->cast<LogicalCachedExtent>();
+ cache.retire_extent(t, extent);
+ auto nlextent = cache.alloc_new_extent_by_type(
+ t,
+ lextent->get_type(),
+ lextent->get_length())->cast<LogicalCachedExtent>();
+ lextent->get_bptr().copy_out(
+ 0,
+ lextent->get_length(),
+ nlextent->get_bptr().c_str());
+ nlextent->set_laddr(lextent->get_laddr());
+ nlextent->set_pin(lextent->get_pin().duplicate());
+
+ logger().debug(
+ "{}: rewriting {} into {}",
+ __func__,
+ *lextent,
+ *nlextent);
+
+ return update_mapping(
+ t,
+ lextent->get_laddr(),
+ [prev_addr = lextent->get_paddr(), addr = nlextent->get_paddr()](
+ const lba_map_val_t &in) {
+ lba_map_val_t ret = in;
+ ceph_assert(in.paddr == prev_addr);
+ ret.paddr = addr;
+ return ret;
+ }).safe_then([nlextent](auto e) {}).handle_error(
+ rewrite_extent_ertr::pass_further{},
+ /* ENOENT in particular should be impossible */
+ crimson::ct_error::assert_all{}
+ );
+ } else if (is_lba_node(*extent)) {
+ auto lba_extent = extent->cast<LBANode>();
+ cache.retire_extent(t, extent);
+ auto nlba_extent = cache.alloc_new_extent_by_type(
+ t,
+ lba_extent->get_type(),
+ lba_extent->get_length())->cast<LBANode>();
+ lba_extent->get_bptr().copy_out(
+ 0,
+ lba_extent->get_length(),
+ nlba_extent->get_bptr().c_str());
+ nlba_extent->pin.set_range(nlba_extent->get_node_meta());
+
+ /* This is a bit underhanded. Any relative addrs here must necessarily
+ * be record relative as we are rewriting a dirty extent. Thus, we
+ * are using resolve_relative_addrs with a (likely negative) block
+ * relative offset to correct them to block-relative offsets adjusted
+ * for our new transaction location.
+ *
+ * Upon commit, these now block relative addresses will be interpretted
+ * against the real final address.
+ */
+ nlba_extent->resolve_relative_addrs(
+ make_record_relative_paddr(0) - nlba_extent->get_paddr());
+
+ return update_internal_mapping(
+ t,
+ nlba_extent->get_node_meta().depth,
+ nlba_extent->get_node_meta().begin,
+ nlba_extent->get_paddr()).safe_then(
+ [](auto) {},
+ rewrite_extent_ertr::pass_further {},
+ crimson::ct_error::assert_all{});
+ } else {
+ return rewrite_extent_ertr::now();
+ }
+}
+
+BtreeLBAManager::get_physical_extent_if_live_ret
+BtreeLBAManager::get_physical_extent_if_live(
+ Transaction &t,
+ extent_types_t type,
+ paddr_t addr,
+ laddr_t laddr,
+ segment_off_t len)
+{
+ ceph_assert(is_lba_node(type));
+ return cache.get_extent_by_type(
+ t,
+ type,
+ addr,
+ laddr,
+ len
+ ).safe_then([=, &t](CachedExtentRef extent) {
+ return get_root(t).safe_then([=, &t](LBANodeRef root) {
+ auto lba_node = extent->cast<LBANode>();
+ return root->lookup(
+ op_context_t{cache, pin_set, t},
+ lba_node->get_node_meta().begin,
+ lba_node->get_node_meta().depth).safe_then([=](LBANodeRef c) {
+ if (c->get_paddr() == lba_node->get_paddr()) {
+ return get_physical_extent_if_live_ret(
+ get_physical_extent_if_live_ertr::ready_future_marker{},
+ lba_node);
+ } else {
+ cache.drop_from_cache(lba_node);
+ return get_physical_extent_if_live_ret(
+ get_physical_extent_if_live_ertr::ready_future_marker{},
+ CachedExtentRef());
+ }
+ });
+ });
+ });
+}
+
+BtreeLBAManager::BtreeLBAManager(
+ SegmentManager &segment_manager,
+ Cache &cache)
+ : segment_manager(segment_manager),
+ cache(cache) {}
+
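+// insert_mapping splits the root pre-emptively: if the current root is
+// full, a new internal root one level deeper is allocated, the old root is
+// linked under it at L_ADDR_MIN, and split_entry divides the old root
+// before the actual insert descends.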
+BtreeLBAManager::insert_mapping_ret BtreeLBAManager::insert_mapping(
+ Transaction &t,
+ LBANodeRef root,
+ laddr_t laddr,
+ lba_map_val_t val)
+{
+ auto split = insert_mapping_ertr::future<LBANodeRef>(
+ insert_mapping_ertr::ready_future_marker{},
+ root);
+ if (root->at_max_capacity()) {
+ split = cache.get_root(t).safe_then(
+ [this, root, laddr, &t](RootBlockRef croot) {
+ logger().debug(
+ "BtreeLBAManager::insert_mapping: splitting root {}",
+ *croot);
+ {
+ auto mut_croot = cache.duplicate_for_write(t, croot);
+ croot = mut_croot->cast<RootBlock>();
+ }
+ auto nroot = cache.alloc_new_extent<LBAInternalNode>(t, LBA_BLOCK_SIZE);
+ lba_node_meta_t meta{0, L_ADDR_MAX, root->get_node_meta().depth + 1};
+ nroot->set_meta(meta);
+ nroot->pin.set_range(meta);
+ nroot->journal_insert(
+ nroot->begin(),
+ L_ADDR_MIN,
+ root->get_paddr(),
+ nullptr);
+ croot->get_root().lba_root_addr = nroot->get_paddr();
+ croot->get_root().lba_depth = root->get_node_meta().depth + 1;
+ return nroot->split_entry(
+ get_context(t),
+ laddr, nroot->begin(), root);
+ });
+ }
+ return split.safe_then([this, &t, laddr, val](LBANodeRef node) {
+ return node->insert(
+ get_context(t),
+ laddr, val);
+ });
+}
+
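+// update_refcount folds incref/decref into one mutate_mapping call. A
+// rough trace (illustrative values): decref_extent on a mapping with
+// refcount 1 yields refcount 0, which LBALeafNode::mutate_mapping_internal
+// treats as removal (journal_remove); the returned ref_update_result_t
+// lets the caller decide whether to release the physical extent.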
+BtreeLBAManager::update_refcount_ret BtreeLBAManager::update_refcount(
+ Transaction &t,
+ laddr_t addr,
+ int delta)
+{
+ return update_mapping(
+ t,
+ addr,
+ [delta](const lba_map_val_t &in) {
+ lba_map_val_t out = in;
+ ceph_assert((int)out.refcount + delta >= 0);
+ out.refcount += delta;
+ return out;
+ }).safe_then([](auto result) {
+ return ref_update_result_t{result.refcount, result.paddr};
+ });
+}
+
+BtreeLBAManager::update_mapping_ret BtreeLBAManager::update_mapping(
+ Transaction &t,
+ laddr_t addr,
+ update_func_t &&f)
+{
+ return get_root(t
+ ).safe_then([this, f=std::move(f), &t, addr](LBANodeRef root) mutable {
+ return root->mutate_mapping(
+ get_context(t),
+ addr,
+ std::move(f));
+ });
+}
+
+BtreeLBAManager::update_internal_mapping_ret
+BtreeLBAManager::update_internal_mapping(
+ Transaction &t,
+ depth_t depth,
+ laddr_t laddr,
+ paddr_t paddr)
+{
+ return cache.get_root(t).safe_then([=, &t](RootBlockRef croot) {
+ if (depth == croot->get_root().lba_depth) {
+ logger().debug(
+ "update_internal_mapping: updating lba root to: {}->{}",
+ laddr,
+ paddr);
+ {
+ auto mut_croot = cache.duplicate_for_write(t, croot);
+ croot = mut_croot->cast<RootBlock>();
+ }
+ ceph_assert(laddr == 0);
+ auto old_paddr = croot->get_root().lba_root_addr;
+ croot->get_root().lba_root_addr = paddr;
+ return update_internal_mapping_ret(
+ update_internal_mapping_ertr::ready_future_marker{},
+ old_paddr);
+ } else {
+ logger().debug(
+ "update_internal_mapping: updating lba node at depth {} to: {}->{}",
+ depth,
+ laddr,
+ paddr);
+ return get_lba_btree_extent(
+ get_context(t),
+ croot->get_root().lba_depth,
+ croot->get_root().lba_root_addr,
+ paddr_t()).safe_then([=, &t](LBANodeRef broot) {
+ return broot->mutate_internal_address(
+ get_context(t),
+ depth,
+ laddr,
+ paddr);
+ });
+ }
+ });
+}
+
+}
diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h
new file mode 100644
index 000000000..640d56734
--- /dev/null
+++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h
@@ -0,0 +1,188 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <seastar/core/future.hh>
+
+#include "include/ceph_assert.h"
+#include "include/buffer_fwd.h"
+#include "include/interval_set.h"
+#include "common/interval_map.h"
+#include "crimson/osd/exceptions.h"
+
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/lba_manager.h"
+#include "crimson/os/seastore/cache.h"
+#include "crimson/os/seastore/segment_manager.h"
+
+#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h"
+
+namespace crimson::os::seastore::lba_manager::btree {
+
+/**
+ * BtreeLBAManager
+ *
+ * Uses a wandering btree to track two things:
+ * 1) lba state including laddr_t -> paddr_t mapping
+ * 2) reverse paddr_t -> laddr_t mapping for gc (TODO)
+ *
+ * Generally, any transaction will involve
+ * 1) deltas against lba tree nodes
+ * 2) new lba tree nodes
+ * - Note, there must necessarily be a delta linking
+ * these new nodes into the tree -- might be a
+ * bootstrap_state_t delta if new root
+ *
+ * get_mappings, alloc_extent_*, etc. populate a Transaction
+ * which then gets submitted
+ */
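+/* A rough usage sketch (hypothetical caller; transaction creation and
+ * submission live outside this class):
+ *
+ *   BtreeLBAManager lba(segment_manager, cache);
+ *   lba.mkfs(t).safe_then([&] {
+ *     return lba.alloc_extent(t, hint, len, paddr);
+ *   }).safe_then([&](LBAPinRef pin) {
+ *     // pin->get_laddr() is the allocated logical address
+ *   });
+ */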
+class BtreeLBAManager : public LBAManager {
+public:
+ BtreeLBAManager(
+ SegmentManager &segment_manager,
+ Cache &cache);
+
+ mkfs_ret mkfs(
+ Transaction &t) final;
+
+ get_mapping_ret get_mapping(
+ Transaction &t,
+ laddr_t offset, extent_len_t length) final;
+
+ get_mappings_ret get_mappings(
+ Transaction &t,
+ laddr_list_t &&list) final;
+
+ alloc_extent_ret alloc_extent(
+ Transaction &t,
+ laddr_t hint,
+ extent_len_t len,
+ paddr_t addr) final;
+
+ set_extent_ret set_extent(
+ Transaction &t,
+ laddr_t off, extent_len_t len, paddr_t addr) final;
+
+ ref_ret decref_extent(
+ Transaction &t,
+ laddr_t addr) final {
+ return update_refcount(t, addr, -1);
+ }
+
+ ref_ret incref_extent(
+ Transaction &t,
+ laddr_t addr) final {
+ return update_refcount(t, addr, 1);
+ }
+
+ complete_transaction_ret complete_transaction(
+ Transaction &t) final;
+
+ init_cached_extent_ret init_cached_extent(
+ Transaction &t,
+ CachedExtentRef e) final;
+
+ scan_mappings_ret scan_mappings(
+ Transaction &t,
+ laddr_t begin,
+ laddr_t end,
+ scan_mappings_func_t &&f) final;
+
+ scan_mapped_space_ret scan_mapped_space(
+ Transaction &t,
+ scan_mapped_space_func_t &&f) final;
+
+ rewrite_extent_ret rewrite_extent(
+ Transaction &t,
+ CachedExtentRef extent);
+
+ get_physical_extent_if_live_ret get_physical_extent_if_live(
+ Transaction &t,
+ extent_types_t type,
+ paddr_t addr,
+ laddr_t laddr,
+ segment_off_t len) final;
+
+ void add_pin(LBAPin &pin) final {
+ auto *bpin = reinterpret_cast<BtreeLBAPin*>(&pin);
+ pin_set.add_pin(bpin->pin);
+ bpin->parent = nullptr;
+ }
+
+private:
+ SegmentManager &segment_manager;
+ Cache &cache;
+
+ btree_pin_set_t pin_set;
+
+ op_context_t get_context(Transaction &t) {
+ return op_context_t{cache, pin_set, t};
+ }
+
+ static btree_range_pin_t &get_pin(CachedExtent &e);
+
+
+ /**
+ * get_root
+ *
+ * Get a reference to the root LBANode.
+ */
+ using get_root_ertr = Cache::get_extent_ertr;
+ using get_root_ret = get_root_ertr::future<LBANodeRef>;
+ get_root_ret get_root(Transaction &);
+
+ /**
+ * insert_mapping
+ *
+ * Insert a lba mapping into the tree
+ */
+ using insert_mapping_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using insert_mapping_ret = insert_mapping_ertr::future<LBAPinRef>;
+ insert_mapping_ret insert_mapping(
+ Transaction &t, ///< [in,out] transaction
+ LBANodeRef root, ///< [in] root node
+ laddr_t laddr, ///< [in] logical addr to insert
+ lba_map_val_t val ///< [in] mapping to insert
+ );
+
+ /**
+ * update_refcount
+ *
+ * Updates refcount, returns resulting refcount
+ */
+ using update_refcount_ret = ref_ret;
+ update_refcount_ret update_refcount(
+ Transaction &t,
+ laddr_t addr,
+ int delta);
+
+ /**
+ * update_mapping
+ *
+ * Updates mapping, removes if f returns nullopt
+ */
+ using update_mapping_ertr = ref_ertr;
+ using update_mapping_ret = ref_ertr::future<lba_map_val_t>;
+ using update_func_t = LBANode::mutate_func_t;
+ update_mapping_ret update_mapping(
+ Transaction &t,
+ laddr_t addr,
+ update_func_t &&f);
+
+ using update_internal_mapping_ertr = LBANode::mutate_internal_address_ertr;
+ using update_internal_mapping_ret = LBANode::mutate_internal_address_ret;
+ update_internal_mapping_ret update_internal_mapping(
+ Transaction &t,
+ depth_t depth,
+ laddr_t laddr,
+ paddr_t paddr);
+};
+using BtreeLBAManagerRef = std::unique_ptr<BtreeLBAManager>;
+
+}
diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.cc b/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.cc
new file mode 100644
index 000000000..a86c3cc57
--- /dev/null
+++ b/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.cc
@@ -0,0 +1,153 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/common/log.h"
+
+#include "crimson/os/seastore/lba_manager/btree/btree_range_pin.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+}
+
+namespace crimson::os::seastore::lba_manager::btree {
+
+void btree_range_pin_t::take_pin(btree_range_pin_t &other)
+{
+ assert(other.extent);
+ assert(other.pins);
+ other.pins->replace_pin(*this, other);
+ pins = other.pins;
+ other.pins = nullptr;
+
+ if (other.has_ref()) {
+ other.drop_ref();
+ acquire_ref();
+ }
+}
+
+btree_range_pin_t::~btree_range_pin_t()
+{
+ assert(!pins == !is_linked());
+ assert(!ref);
+ if (pins) {
+ logger().debug("{}: removing {}", __func__, *this);
+ pins->remove_pin(*this, true);
+ }
+ extent = nullptr;
+}
+
+void btree_pin_set_t::replace_pin(btree_range_pin_t &to, btree_range_pin_t &from)
+{
+ pins.replace_node(pins.iterator_to(from), to);
+}
+
+void btree_pin_set_t::remove_pin(btree_range_pin_t &pin, bool do_check_parent)
+{
+ logger().debug("{}: {}", __func__, pin);
+ assert(pin.is_linked());
+ assert(pin.pins);
+ assert(!pin.ref);
+
+ pins.erase(pin);
+ pin.pins = nullptr;
+
+ if (do_check_parent) {
+ check_parent(pin);
+ }
+}
+
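+// maybe_get_parent searches for the would-be parent by copying the child's
+// meta with depth+1 and taking the predecessor of upper_bound. E.g. for a
+// child {begin=0x1000, end=0x2000, depth=0} (illustrative values), the
+// probe key is depth 1 / begin 0x1000; the entry just before it is the
+// only candidate that can satisfy is_parent_of (depth+1 and a covering
+// [begin, end) range).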
+btree_range_pin_t *btree_pin_set_t::maybe_get_parent(
+ const lba_node_meta_t &meta)
+{
+ auto cmeta = meta;
+ cmeta.depth++;
+ auto iter = pins.upper_bound(cmeta, btree_range_pin_t::meta_cmp_t());
+ if (iter == pins.begin()) {
+ return nullptr;
+ } else {
+ --iter;
+ if (iter->range.is_parent_of(meta)) {
+ return &*iter;
+ } else {
+ return nullptr;
+ }
+ }
+}
+
+const btree_range_pin_t *btree_pin_set_t::maybe_get_first_child(
+ const lba_node_meta_t &meta) const
+{
+ if (meta.depth == 0) {
+ return nullptr;
+ }
+
+ auto cmeta = meta;
+ cmeta.depth--;
+
+ auto iter = pins.lower_bound(cmeta, btree_range_pin_t::meta_cmp_t());
+ if (iter == pins.end()) {
+ return nullptr;
+ } else if (meta.is_parent_of(iter->range)) {
+ return &*iter;
+ } else {
+ return nullptr;
+ }
+}
+
+void btree_pin_set_t::release_if_no_children(btree_range_pin_t &pin)
+{
+ assert(pin.is_linked());
+ if (maybe_get_first_child(pin.range) == nullptr) {
+ pin.drop_ref();
+ }
+}
+
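+// add_pin maintains two ref directions: inserting a non-root pin pins its
+// parent (the parent must already be present, hence the root-to-leaf link
+// order in complete_transaction), and a pin that already has children in
+// the set pins itself.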
+void btree_pin_set_t::add_pin(btree_range_pin_t &pin)
+{
+ assert(!pin.is_linked());
+ assert(!pin.pins);
+ assert(!pin.ref);
+
+ auto [prev, inserted] = pins.insert(pin);
+ if (!inserted) {
+ logger().error("{}: unable to add {}, found {}", __func__, pin, *prev);
+ assert(0 == "impossible");
+ return;
+ }
+ pin.pins = this;
+ if (!pin.is_root()) {
+ auto *parent = maybe_get_parent(pin.range);
+ assert(parent);
+ if (!parent->has_ref()) {
+ logger().debug("{}: acquiring parent {}", __func__,
+ static_cast<void*>(parent));
+ parent->acquire_ref();
+ } else {
+ logger().debug("{}: parent has ref {}", __func__,
+ static_cast<void*>(parent));
+ }
+ }
+ if (maybe_get_first_child(pin.range) != nullptr) {
+ logger().debug("{}: acquiring self {}", __func__, pin);
+ pin.acquire_ref();
+ }
+}
+
+void btree_pin_set_t::retire(btree_range_pin_t &pin)
+{
+ pin.drop_ref();
+ remove_pin(pin, false);
+}
+
+void btree_pin_set_t::check_parent(btree_range_pin_t &pin)
+{
+ auto parent = maybe_get_parent(pin.range);
+ if (parent) {
+ logger().debug("{}: releasing parent {}", __func__, *parent);
+ release_if_no_children(*parent);
+ }
+}
+
+}
diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.h b/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.h
new file mode 100644
index 000000000..3fa218fc8
--- /dev/null
+++ b/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.h
@@ -0,0 +1,274 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/intrusive/set.hpp>
+
+#include "crimson/os/seastore/cached_extent.h"
+#include "crimson/os/seastore/seastore_types.h"
+
+namespace crimson::os::seastore::lba_manager::btree {
+
+class LBANode;
+using LBANodeRef = TCachedExtentRef<LBANode>;
+
+struct lba_node_meta_t {
+ laddr_t begin = 0;
+ laddr_t end = 0;
+ depth_t depth = 0;
+
+ bool is_parent_of(const lba_node_meta_t &other) const {
+ return (depth == other.depth + 1) &&
+ (begin <= other.begin) &&
+ (end >= other.end);
+ }
+
+ std::pair<lba_node_meta_t, lba_node_meta_t> split_into(laddr_t pivot) const {
+ return std::make_pair(
+ lba_node_meta_t{begin, pivot, depth},
+ lba_node_meta_t{pivot, end, depth});
+ }
+
+ static lba_node_meta_t merge_from(
+ const lba_node_meta_t &lhs, const lba_node_meta_t &rhs) {
+ assert(lhs.depth == rhs.depth);
+ return lba_node_meta_t{lhs.begin, rhs.end, lhs.depth};
+ }
+
+ static std::pair<lba_node_meta_t, lba_node_meta_t>
+ rebalance(const lba_node_meta_t &lhs, const lba_node_meta_t &rhs, laddr_t pivot) {
+ assert(lhs.depth == rhs.depth);
+ return std::make_pair(
+ lba_node_meta_t{lhs.begin, pivot, lhs.depth},
+ lba_node_meta_t{pivot, rhs.end, lhs.depth});
+ }
+
+ bool is_root() const {
+ return begin == 0 && end == L_ADDR_MAX;
+ }
+};
+
+// Worked example (illustrative values): given m = {begin=0, end=100,
+// depth=1}, m.split_into(60) yields {0, 60, 1} and {60, 100, 1};
+// merge_from on those two halves restores {0, 100, 1}; and
+// rebalance({0, 60, 1}, {60, 100, 1}, 40) re-pivots them to {0, 40, 1}
+// and {40, 100, 1}.
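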
+
+inline std::ostream &operator<<(
+ std::ostream &lhs,
+ const lba_node_meta_t &rhs)
+{
+ return lhs << "btree_node_meta_t("
+ << "begin=" << rhs.begin
+ << ", end=" << rhs.end
+ << ", depth=" << rhs.depth
+ << ")";
+}
+
+/**
+ * btree_range_pin_t
+ *
+ * Element tracked by btree_pin_set_t below. Encapsulates the intrusive_set
+ * hook, the lba_node_meta_t representing the lba range covered by a node,
+ * and extent and ref members intended to hold a reference when the extent
+ * should be pinned.
+ */
+class btree_pin_set_t;
+class btree_range_pin_t : public boost::intrusive::set_base_hook<> {
+ friend class btree_pin_set_t;
+ lba_node_meta_t range;
+
+ btree_pin_set_t *pins = nullptr;
+
+ // We need to be able to remember extent without holding a reference,
+ // but we can do it more compactly -- TODO
+ CachedExtent *extent = nullptr;
+ CachedExtentRef ref;
+
+ using index_t = boost::intrusive::set<btree_range_pin_t>;
+
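+ // Sort key is (-depth, begin): larger depth (closer to the root) sorts
+ // first, ties broken by begin. E.g. a depth-1 leaf node pin at begin 0
+ // precedes the depth-0 logical-extent pins it covers.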
+ static auto get_tuple(const lba_node_meta_t &meta) {
+ return std::make_tuple(-meta.depth, meta.begin);
+ }
+
+ void acquire_ref() {
+ ref = CachedExtentRef(extent);
+ }
+
+ void drop_ref() {
+ ref.reset();
+ }
+
+public:
+ btree_range_pin_t() = default;
+ btree_range_pin_t(CachedExtent *extent)
+ : extent(extent) {}
+ btree_range_pin_t(const btree_range_pin_t &rhs, CachedExtent *extent)
+ : range(rhs.range), extent(extent) {}
+
+ bool has_ref() const {
+ return !!ref;
+ }
+
+ bool is_root() const {
+ return range.is_root();
+ }
+
+ void set_range(const lba_node_meta_t &nrange) {
+ range = nrange;
+ }
+ void set_extent(CachedExtent *nextent) {
+ assert(!extent);
+ extent = nextent;
+ }
+
+ void take_pin(btree_range_pin_t &other);
+
+ friend bool operator<(
+ const btree_range_pin_t &lhs, const btree_range_pin_t &rhs) {
+ return get_tuple(lhs.range) < get_tuple(rhs.range);
+ }
+ friend bool operator>(
+ const btree_range_pin_t &lhs, const btree_range_pin_t &rhs) {
+ return get_tuple(lhs.range) > get_tuple(rhs.range);
+ }
+ friend bool operator==(
+ const btree_range_pin_t &lhs, const btree_range_pin_t &rhs) {
+ return get_tuple(lhs.range) == get_tuple(rhs.range);
+ }
+
+ struct meta_cmp_t {
+ bool operator()(
+ const btree_range_pin_t &lhs, const lba_node_meta_t &rhs) const {
+ return get_tuple(lhs.range) < get_tuple(rhs);
+ }
+ bool operator()(
+ const lba_node_meta_t &lhs, const btree_range_pin_t &rhs) const {
+ return get_tuple(lhs) < get_tuple(rhs.range);
+ }
+ };
+
+ friend std::ostream &operator<<(
+ std::ostream &lhs,
+ const btree_range_pin_t &rhs) {
+ return lhs << "btree_range_pin_t("
+ << "begin=" << rhs.range.begin
+ << ", end=" << rhs.range.end
+ << ", depth=" << rhs.range.depth
+ << ", extent=" << rhs.extent
+ << ")";
+ }
+
+ friend class BtreeLBAPin;
+ ~btree_range_pin_t();
+};
+
+/**
+ * btree_pin_set_t
+ *
+ * Ensures that for every cached node, all parent LBANodes required
+ * to map it are present in cache. Relocating these nodes can
+ * therefore be done without further reads or cache space.
+ *
+ * Contains a btree_range_pin_t for every clean or dirty LBANode
+ * or LogicalCachedExtent instance in cache at any point in time.
+ * For any LBANode, the contained btree_range_pin_t will hold
+ * a reference to that node pinning it in cache as long as that
+ * node has children in the set. This invariant can be violated
+ * only by calling retire_extent and is repaired by calling
+ * check_parent synchronously after adding any new extents.
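+ *
+ * Example of the invariant (illustrative): caching a logical extent
+ * (depth 0) requires the pin of its depth-1 leaf node to be present and
+ * holding a reference; once the last child is retired, check_parent
+ * drops that reference so the leaf can be evicted.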
+ */
+class btree_pin_set_t {
+ friend class btree_range_pin_t;
+ using pins_t = btree_range_pin_t::index_t;
+ pins_t pins;
+
+ pins_t::iterator get_iter(btree_range_pin_t &pin) {
+ return pins_t::s_iterator_to(pin);
+ }
+
+ /// Removes pin from set optionally checking whether parent has other children
+ void remove_pin(btree_range_pin_t &pin, bool check_parent);
+
+ void replace_pin(btree_range_pin_t &to, btree_range_pin_t &from);
+
+ /// Returns parent pin if one exists
+ btree_range_pin_t *maybe_get_parent(const lba_node_meta_t &meta);
+
+ /// Returns earliest child pin if one exists
+ const btree_range_pin_t *maybe_get_first_child(const lba_node_meta_t &meta) const;
+
+ /// Releases pin if it has no children
+ void release_if_no_children(btree_range_pin_t &pin);
+
+public:
+ /// Adds pin to set, assumes set is consistent
+ void add_pin(btree_range_pin_t &pin);
+
+ /**
+ * retire/check_parent
+ *
+ * See BtreeLBAManager::complete_transaction.
+ * retire removes the specified pin from the set, but does not
+ * check parents. After any new extents are added to the set,
+ * the caller is required to call check_parent to restore the
+ * invariant.
+ */
+ void retire(btree_range_pin_t &pin);
+ void check_parent(btree_range_pin_t &pin);
+
+ ~btree_pin_set_t() {
+ assert(pins.empty());
+ }
+};
+
+class BtreeLBAPin : public LBAPin {
+ friend class BtreeLBAManager;
+
+ /**
+ * parent
+ *
+ * Holds a reference to the leaf node containing this mapping, keeping it
+ * cache-resident until BtreeLBAManager::add_pin is called (which clears
+ * parent once the pin is registered).
+ */
+ CachedExtentRef parent;
+
+ paddr_t paddr;
+ btree_range_pin_t pin;
+
+public:
+ BtreeLBAPin() = default;
+
+ BtreeLBAPin(
+ CachedExtentRef parent,
+ paddr_t paddr,
+ lba_node_meta_t &&meta)
+ : parent(parent), paddr(paddr) {
+ pin.set_range(std::move(meta));
+ }
+
+ void link_extent(LogicalCachedExtent *ref) final {
+ pin.set_extent(ref);
+ }
+
+ extent_len_t get_length() const final {
+ assert(pin.range.end > pin.range.begin);
+ return pin.range.end - pin.range.begin;
+ }
+
+ paddr_t get_paddr() const final {
+ return paddr;
+ }
+
+ laddr_t get_laddr() const final {
+ return pin.range.begin;
+ }
+
+ LBAPinRef duplicate() const final {
+ auto ret = std::unique_ptr<BtreeLBAPin>(new BtreeLBAPin);
+ ret->pin.set_range(pin.range);
+ ret->paddr = paddr;
+ return ret;
+ }
+
+ void take_pin(LBAPin &opin) final {
+ pin.take_pin(static_cast<BtreeLBAPin&>(opin).pin);
+ }
+};
+
+}
diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h
new file mode 100644
index 000000000..b6f33a1ae
--- /dev/null
+++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h
@@ -0,0 +1,269 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <sys/mman.h>
+#include <memory>
+#include <string.h>
+
+#include "crimson/common/log.h"
+#include "crimson/os/seastore/lba_manager/btree/btree_range_pin.h"
+#include "crimson/os/seastore/lba_manager.h"
+
+namespace crimson::os::seastore::lba_manager::btree {
+
+struct op_context_t {
+ Cache &cache;
+ btree_pin_set_t &pins;
+ Transaction &trans;
+};
+
+/**
+ * lba_map_val_t
+ *
+ * struct representing a single lba mapping
+ */
+struct lba_map_val_t {
+ extent_len_t len = 0; ///< length of mapping
+ paddr_t paddr; ///< physical addr of mapping
+ uint32_t refcount = 0; ///< refcount
+ uint32_t checksum = 0; ///< checksum of original block written at paddr (TODO)
+
+ lba_map_val_t(
+ extent_len_t len,
+ paddr_t paddr,
+ uint32_t refcount,
+ uint32_t checksum)
+ : len(len), paddr(paddr), refcount(refcount), checksum(checksum) {}
+};
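+
+// Construction example, as used by BtreeLBAManager::alloc_extent:
+// lba_map_val_t{len, addr, 1, 0} -- a fresh mapping starts with refcount 1
+// and a zero checksum (checksums are still TODO above).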
+
+class BtreeLBAPin;
+using BtreeLBAPinRef = std::unique_ptr<BtreeLBAPin>;
+
+/**
+ * LBANode
+ *
+ * Base class enabling recursive lookup between internal and leaf nodes.
+ */
+struct LBANode : CachedExtent {
+ using LBANodeRef = TCachedExtentRef<LBANode>;
+ using lookup_range_ertr = LBAManager::get_mapping_ertr;
+ using lookup_range_ret = LBAManager::get_mapping_ret;
+
+ btree_range_pin_t pin;
+
+ LBANode(ceph::bufferptr &&ptr) : CachedExtent(std::move(ptr)), pin(this) {}
+ LBANode(const LBANode &rhs)
+ : CachedExtent(rhs), pin(rhs.pin, this) {}
+
+ virtual lba_node_meta_t get_node_meta() const = 0;
+
+ /**
+ * lookup
+ *
+ * Returns the node at the specified depth responsible
+ * for laddr
+ */
+ using lookup_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using lookup_ret = lookup_ertr::future<LBANodeRef>;
+ virtual lookup_ret lookup(
+ op_context_t c,
+ laddr_t addr,
+ depth_t depth) = 0;
+
+ /**
+ * lookup_range
+ *
+ * Returns mappings within range [addr, addr+len)
+ */
+ virtual lookup_range_ret lookup_range(
+ op_context_t c,
+ laddr_t addr,
+ extent_len_t len) = 0;
+
+ /**
+ * insert
+ *
+ * Recursively inserts into subtree rooted at *this. Caller
+ * must already have handled splitting if at_max_capacity().
+ *
+ * Precondition: !at_max_capacity()
+ */
+ using insert_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error
+ >;
+ using insert_ret = insert_ertr::future<LBAPinRef>;
+ virtual insert_ret insert(
+ op_context_t c,
+ laddr_t laddr,
+ lba_map_val_t val) = 0;
+
+ /**
+ * find_hole
+ *
+ * Finds minimum hole of size len in [min, max)
+ *
+ * @return addr of hole, L_ADDR_NULL if no hole was found
+ */
+ using find_hole_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+ using find_hole_ret = find_hole_ertr::future<laddr_t>;
+ virtual find_hole_ret find_hole(
+ op_context_t c,
+ laddr_t min,
+ laddr_t max,
+ extent_len_t len) = 0;
+
+ /**
+ * scan_mappings
+ *
+ * Call f for all mappings in [begin, end)
+ */
+ using scan_mappings_ertr = LBAManager::scan_mappings_ertr;
+ using scan_mappings_ret = LBAManager::scan_mappings_ret;
+ using scan_mappings_func_t = LBAManager::scan_mappings_func_t;
+ virtual scan_mappings_ret scan_mappings(
+ op_context_t c,
+ laddr_t begin,
+ laddr_t end,
+ scan_mappings_func_t &f) = 0;
+
+ using scan_mapped_space_ertr = LBAManager::scan_mapped_space_ertr;
+ using scan_mapped_space_ret = LBAManager::scan_mapped_space_ret;
+ using scan_mapped_space_func_t = LBAManager::scan_mapped_space_func_t;
+ virtual scan_mapped_space_ret scan_mapped_space(
+ op_context_t c,
+ scan_mapped_space_func_t &f) = 0;
+
+ /**
+ * mutate_mapping
+ *
+ * Looks up laddr, calls f on value. If f returns a value, inserts it.
+ * If it returns nullopt, removes the value.
+ * Caller must already have merged if at_min_capacity().
+ *
+ * Recursive calls use mutate_mapping_internal.
+ *
+ * Precondition: !at_min_capacity()
+ */
+ using mutate_mapping_ertr = crimson::errorator<
+ crimson::ct_error::enoent, ///< mapping does not exist
+ crimson::ct_error::input_output_error
+ >;
+ using mutate_mapping_ret = mutate_mapping_ertr::future<
+ lba_map_val_t>;
+ using mutate_func_t = std::function<
+ lba_map_val_t(const lba_map_val_t &v)
+ >;
+ virtual mutate_mapping_ret mutate_mapping(
+ op_context_t c,
+ laddr_t laddr,
+ mutate_func_t &&f) = 0;
+ virtual mutate_mapping_ret mutate_mapping_internal(
+ op_context_t c,
+ laddr_t laddr,
+ bool is_root,
+ mutate_func_t &&f) = 0;
+
+ /**
+ * mutate_internal_address
+ *
+ * Looks up internal node mapping at laddr, depth and
+ * updates the mapping to paddr. Returns previous paddr
+ * (for debugging purposes).
+ */
+ using mutate_internal_address_ertr = crimson::errorator<
+ crimson::ct_error::enoent, ///< mapping does not exist
+ crimson::ct_error::input_output_error
+ >;
+ using mutate_internal_address_ret = mutate_internal_address_ertr::future<
+ paddr_t>;
+ virtual mutate_internal_address_ret mutate_internal_address(
+ op_context_t c,
+ depth_t depth,
+ laddr_t laddr,
+ paddr_t paddr) = 0;
+
+ /**
+ * make_split_children
+ *
+ * Generates appropriately typed left and right nodes formed from the
+ * contents of *this.
+ *
+ * Returns <left, right, pivot> where pivot is the first value of right.
+ */
+ virtual std::tuple<
+ LBANodeRef,
+ LBANodeRef,
+ laddr_t>
+ make_split_children(
+ op_context_t c) = 0;
+
+ /**
+ * make_full_merge
+ *
+ * Returns a single node formed from merging *this and right.
+ * Precondition: at_min_capacity() && right.at_min_capacity()
+ */
+ virtual LBANodeRef make_full_merge(
+ op_context_t c,
+ LBANodeRef &right) = 0;
+
+ /**
+ * make_balanced
+ *
+ * Returns nodes formed by balancing the contents of *this and right.
+ *
+ * Returns <left, right, pivot> where pivot is the first value of right.
+ */
+ virtual std::tuple<
+ LBANodeRef,
+ LBANodeRef,
+ laddr_t>
+ make_balanced(
+ op_context_t c,
+ LBANodeRef &right,
+ bool prefer_left) = 0;
+
+ virtual bool at_max_capacity() const = 0;
+ virtual bool at_min_capacity() const = 0;
+
+ virtual ~LBANode() = default;
+
+ void on_delta_write(paddr_t record_block_offset) final {
+ // All in-memory relative addrs are necessarily record-relative
+ assert(get_prior_instance());
+ pin.take_pin(get_prior_instance()->cast<LBANode>()->pin);
+ resolve_relative_addrs(record_block_offset);
+ }
+
+ void on_initial_write() final {
+ // All in-memory relative addrs are necessarily block-relative
+ resolve_relative_addrs(get_paddr());
+ }
+
+ void on_clean_read() final {
+ // From initial write of block, relative addrs are necessarily block-relative
+ resolve_relative_addrs(get_paddr());
+ }
+
+ virtual void resolve_relative_addrs(paddr_t base) = 0;
+};
+using LBANodeRef = LBANode::LBANodeRef;
+
+/**
+ * get_lba_btree_extent
+ *
+ * Fetches node at depth of the appropriate type.
+ */
+Cache::get_extent_ertr::future<LBANodeRef> get_lba_btree_extent(
+ op_context_t c, ///< [in] context structure
+ depth_t depth, ///< [in] depth of node to fetch
+ paddr_t offset, ///< [in] physical addr of node
+ paddr_t base ///< [in] depending on user, block addr or record addr
+ /// in case offset is relative
+);
+
+}
diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.cc b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.cc
new file mode 100644
index 000000000..5e400803b
--- /dev/null
+++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.cc
@@ -0,0 +1,701 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <sys/mman.h>
+#include <string.h>
+
+#include <memory>
+#include <string.h>
+
+#include "include/buffer.h"
+#include "include/byteorder.h"
+
+#include "crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h"
+
+namespace {
+ seastar::logger& logger() {
+ return crimson::get_logger(ceph_subsys_filestore);
+ }
+}
+
+namespace crimson::os::seastore::lba_manager::btree {
+
+std::ostream &LBAInternalNode::print_detail(std::ostream &out) const
+{
+ return out << ", size=" << get_size()
+ << ", meta=" << get_meta();
+}
+
+LBAInternalNode::lookup_ret LBAInternalNode::lookup(
+ op_context_t c,
+ laddr_t addr,
+ depth_t depth)
+{
+ auto meta = get_meta();
+ if (depth == get_meta().depth) {
+ return lookup_ret(
+ lookup_ertr::ready_future_marker{},
+ this);
+ }
+ assert(meta.begin <= addr);
+ assert(meta.end > addr);
+ auto iter = lower_bound(addr);
+ return get_lba_btree_extent(
+ c,
+ meta.depth - 1,
+ iter->get_val(),
+ get_paddr()).safe_then([c, addr, depth](auto child) {
+ return child->lookup(c, addr, depth);
+ }).finally([ref=LBANodeRef(this)] {});
+}
+
+LBAInternalNode::lookup_range_ret LBAInternalNode::lookup_range(
+ op_context_t c,
+ laddr_t addr,
+ extent_len_t len)
+{
+ auto [begin, end] = bound(addr, addr + len);
+ auto result_up = std::make_unique<lba_pin_list_t>();
+ auto &result = *result_up;
+ return crimson::do_for_each(
+ std::move(begin),
+ std::move(end),
+ [this, c, &result, addr, len](const auto &val) mutable {
+ return get_lba_btree_extent(
+ c,
+ get_meta().depth - 1,
+ val.get_val(),
+ get_paddr()).safe_then(
+ [c, &result, addr, len](auto extent) mutable {
+ return extent->lookup_range(
+ c,
+ addr,
+ len).safe_then(
+ [&result](auto pin_list) mutable {
+ result.splice(result.end(), pin_list,
+ pin_list.begin(), pin_list.end());
+ });
+ });
+ }).safe_then([result=std::move(result_up), ref=LBANodeRef(this)] {
+ return lookup_range_ertr::make_ready_future<lba_pin_list_t>(
+ std::move(*result));
+ });
+}
+
+LBAInternalNode::insert_ret LBAInternalNode::insert(
+ op_context_t c,
+ laddr_t laddr,
+ lba_map_val_t val)
+{
+ auto insertion_pt = get_containing_child(laddr);
+ return get_lba_btree_extent(
+ c,
+ get_meta().depth - 1,
+ insertion_pt->get_val(),
+ get_paddr()).safe_then(
+ [this, insertion_pt, c, laddr, val=std::move(val)](
+ auto extent) mutable {
+ return extent->at_max_capacity() ?
+ split_entry(c, laddr, insertion_pt, extent) :
+ insert_ertr::make_ready_future<LBANodeRef>(std::move(extent));
+ }).safe_then([c, laddr, val=std::move(val)](
+ LBANodeRef extent) mutable {
+ return extent->insert(c, laddr, val);
+ });
+}
+
+LBAInternalNode::mutate_mapping_ret LBAInternalNode::mutate_mapping(
+ op_context_t c,
+ laddr_t laddr,
+ mutate_func_t &&f)
+{
+ return mutate_mapping_internal(c, laddr, true, std::move(f));
+}
+
+LBAInternalNode::mutate_mapping_ret LBAInternalNode::mutate_mapping_internal(
+ op_context_t c,
+ laddr_t laddr,
+ bool is_root,
+ mutate_func_t &&f)
+{
+ auto mutation_pt = get_containing_child(laddr);
+ if (mutation_pt == end()) {
+ assert(0 == "impossible");
+ return crimson::ct_error::enoent::make();
+ }
+ return get_lba_btree_extent(
+ c,
+ get_meta().depth - 1,
+ mutation_pt->get_val(),
+ get_paddr()
+ ).safe_then([=](LBANodeRef extent) {
+ if (extent->at_min_capacity() && get_size() > 1) {
+ return merge_entry(
+ c,
+ laddr,
+ mutation_pt,
+ extent,
+ is_root);
+ } else {
+ return merge_ertr::make_ready_future<LBANodeRef>(
+ std::move(extent));
+ }
+ }).safe_then([c, laddr, f=std::move(f)](LBANodeRef extent) mutable {
+ return extent->mutate_mapping_internal(c, laddr, false, std::move(f));
+ });
+}
+
+LBAInternalNode::mutate_internal_address_ret LBAInternalNode::mutate_internal_address(
+ op_context_t c,
+ depth_t depth,
+ laddr_t laddr,
+ paddr_t paddr)
+{
+ if (get_meta().depth == (depth + 1)) {
+ if (!is_pending()) {
+ return c.cache.duplicate_for_write(c.trans, this)->cast<LBAInternalNode>(
+ )->mutate_internal_address(
+ c,
+ depth,
+ laddr,
+ paddr);
+ }
+ auto iter = get_containing_child(laddr);
+ if (iter->get_key() != laddr) {
+ return crimson::ct_error::enoent::make();
+ }
+
+ auto old_paddr = iter->get_val();
+
+ journal_update(
+ iter,
+ maybe_generate_relative(paddr),
+ maybe_get_delta_buffer());
+
+ return mutate_internal_address_ret(
+ mutate_internal_address_ertr::ready_future_marker{},
+ old_paddr
+ );
+ } else {
+ auto iter = get_containing_child(laddr);
+ return get_lba_btree_extent(
+ c,
+ get_meta().depth - 1,
+ iter->get_val(),
+ get_paddr()
+ ).safe_then([=](auto node) {
+ return node->mutate_internal_address(
+ c,
+ depth,
+ laddr,
+ paddr);
+ });
+ }
+}
+
+LBAInternalNode::find_hole_ret LBAInternalNode::find_hole(
+ op_context_t c,
+ laddr_t min_addr,
+ laddr_t max_addr,
+ extent_len_t len)
+{
+ logger().debug(
+ "LBAInternalNode::find_hole min={}, max={}, len={}, *this={}",
+ min_addr, max_addr, len, *this);
+ auto [begin, end] = bound(min_addr, max_addr);
+ return seastar::repeat_until_value(
+ [i=begin, e=end, c, min_addr, len, this]() mutable {
+ if (i == e) {
+ return seastar::make_ready_future<std::optional<laddr_t>>(
+ std::make_optional<laddr_t>(L_ADDR_NULL));
+ }
+ return get_lba_btree_extent(c,
+ get_meta().depth - 1,
+ i->get_val(),
+ get_paddr()).safe_then(
+ [c, min_addr, len, i](auto extent) mutable {
+ auto lb = std::max(min_addr, i->get_key());
+ auto ub = i->get_next_key_or_max();
+ logger().debug("LBAInternalNode::find_hole extent {} lb {} ub {}",
+ *extent, lb, ub);
+ return extent->find_hole(c, lb, ub, len);
+ }).safe_then([&i](auto addr) mutable -> std::optional<laddr_t> {
+ if (addr == L_ADDR_NULL) {
+ ++i;
+ return {};
+ } else {
+ return addr;
+ }
+ },
+ // TODO: GCC enters an infinite loop if crimson::do_until() is used
+ // or an erroratorized future is returned
+ crimson::ct_error::assert_all{ "fix me - APIv6" });
+ });
+}
+
+LBAInternalNode::scan_mappings_ret LBAInternalNode::scan_mappings(
+ op_context_t c,
+ laddr_t begin,
+ laddr_t end,
+ scan_mappings_func_t &f)
+{
+ auto [biter, eiter] = bound(begin, end);
+ return crimson::do_for_each(
+ std::move(biter),
+ std::move(eiter),
+ [=, &f](auto &viter) {
+ return get_lba_btree_extent(
+ c,
+ get_meta().depth - 1,
+ viter->get_val(),
+ get_paddr()).safe_then([=, &f](auto child) {
+ return child->scan_mappings(c, begin, end, f);
+ });
+ }).safe_then([ref=LBANodeRef(this)]{});
+}
+
+LBAInternalNode::scan_mapped_space_ret LBAInternalNode::scan_mapped_space(
+ op_context_t c,
+ scan_mapped_space_func_t &f)
+{
+ f(get_paddr(), get_length());
+ return crimson::do_for_each(
+ begin(), end(),
+ [=, &f](auto &viter) {
+ return get_lba_btree_extent(
+ c,
+ get_meta().depth - 1,
+ viter->get_val(),
+ get_paddr()).safe_then([=, &f](auto child) {
+ return child->scan_mapped_space(c, f);
+ });
+ }).safe_then([ref=LBANodeRef(this)]{});
+}
+
+
+void LBAInternalNode::resolve_relative_addrs(paddr_t base)
+{
+ for (auto i: *this) {
+ if (i->get_val().is_relative()) {
+ auto updated = base.add_relative(i->get_val());
+ logger().debug(
+ "LBAInternalNode::resolve_relative_addrs {} -> {}",
+ i->get_val(),
+ updated);
+ i->set_val(updated);
+ }
+ }
+}
+
+
+LBAInternalNode::split_ret
+LBAInternalNode::split_entry(
+ op_context_t c,
+ laddr_t addr,
+ internal_iterator_t iter, LBANodeRef entry)
+{
+ if (!is_pending()) {
+ auto mut = c.cache.duplicate_for_write(
+ c.trans, this)->cast<LBAInternalNode>();
+ auto mut_iter = mut->iter_idx(iter->get_offset());
+ return mut->split_entry(c, addr, mut_iter, entry);
+ }
+
+ ceph_assert(!at_max_capacity());
+ auto [left, right, pivot] = entry->make_split_children(c);
+
+ journal_update(
+ iter,
+ maybe_generate_relative(left->get_paddr()),
+ maybe_get_delta_buffer());
+ journal_insert(
+ iter + 1,
+ pivot,
+ maybe_generate_relative(right->get_paddr()),
+ maybe_get_delta_buffer());
+
+ c.cache.retire_extent(c.trans, entry);
+
+ logger().debug(
+ "LBAInternalNode::split_entry *this {} entry {} into left {} right {}",
+ *this,
+ *entry,
+ *left,
+ *right);
+
+ return split_ertr::make_ready_future<LBANodeRef>(
+ pivot > addr ? left : right
+ );
+}
+
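+// merge_entry picks a donor sibling for the undersized child: the right
+// neighbor unless the child is the last entry, in which case the left one.
+// If the donor is itself at minimum capacity the two are fully merged
+// (possibly collapsing a single-entry root); otherwise their contents are
+// rebalanced around a new pivot.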
+LBAInternalNode::merge_ret
+LBAInternalNode::merge_entry(
+ op_context_t c,
+ laddr_t addr,
+ internal_iterator_t iter,
+ LBANodeRef entry,
+ bool is_root)
+{
+ if (!is_pending()) {
+ auto mut = c.cache.duplicate_for_write(c.trans, this)->cast<LBAInternalNode>();
+ auto mut_iter = mut->iter_idx(iter->get_offset());
+ return mut->merge_entry(c, addr, mut_iter, entry, is_root);
+ }
+
+ logger().debug(
+ "LBAInternalNode: merge_entry: {}, {}",
+ *this,
+ *entry);
+ auto donor_is_left = (iter + 1) == end();
+ auto donor_iter = donor_is_left ? iter - 1 : iter + 1;
+ return get_lba_btree_extent(
+ c,
+ get_meta().depth - 1,
+ donor_iter->get_val(),
+ get_paddr()
+ ).safe_then([=](auto donor) mutable {
+ auto [l, r] = donor_is_left ?
+ std::make_pair(donor, entry) : std::make_pair(entry, donor);
+ auto [liter, riter] = donor_is_left ?
+ std::make_pair(donor_iter, iter) : std::make_pair(iter, donor_iter);
+ if (donor->at_min_capacity()) {
+ auto replacement = l->make_full_merge(
+ c,
+ r);
+
+ journal_update(
+ liter,
+ maybe_generate_relative(replacement->get_paddr()),
+ maybe_get_delta_buffer());
+ journal_remove(riter, maybe_get_delta_buffer());
+
+ c.cache.retire_extent(c.trans, l);
+ c.cache.retire_extent(c.trans, r);
+
+ if (is_root && get_size() == 1) {
+ return c.cache.get_root(c.trans).safe_then([=](RootBlockRef croot) {
+ {
+ auto mut_croot = c.cache.duplicate_for_write(c.trans, croot);
+ croot = mut_croot->cast<RootBlock>();
+ }
+ croot->root.lba_root_addr = begin()->get_val();
+ logger().debug(
+ "LBAInternalNode::merge_entry: collapsing root {} to addr {}",
+ *this,
+ begin()->get_val());
+ croot->root.lba_depth = get_meta().depth - 1;
+ c.cache.retire_extent(c.trans, this);
+ return merge_ertr::make_ready_future<LBANodeRef>(replacement);
+ });
+ } else {
+ return merge_ertr::make_ready_future<LBANodeRef>(replacement);
+ }
+ } else {
+ logger().debug(
+ "LBAInternalEntry::merge_entry balanced l {} r {}",
+ *l,
+ *r);
+ auto [replacement_l, replacement_r, pivot] =
+ l->make_balanced(
+ c,
+ r,
+ !donor_is_left);
+
+ journal_update(
+ liter,
+ maybe_generate_relative(replacement_l->get_paddr()),
+ maybe_get_delta_buffer());
+ journal_replace(
+ riter,
+ pivot,
+ maybe_generate_relative(replacement_r->get_paddr()),
+ maybe_get_delta_buffer());
+
+ c.cache.retire_extent(c.trans, l);
+ c.cache.retire_extent(c.trans, r);
+ return merge_ertr::make_ready_future<LBANodeRef>(
+ addr >= pivot ? replacement_r : replacement_l
+ );
+ }
+ });
+}
+
+
+LBAInternalNode::internal_iterator_t
+LBAInternalNode::get_containing_child(laddr_t laddr)
+{
+ // TODO: binary search
+ for (auto i = begin(); i != end(); ++i) {
+ if (i.contains(laddr))
+ return i;
+ }
+ ceph_assert(0 == "invalid");
+ return end();
+}
+
+std::ostream &LBALeafNode::print_detail(std::ostream &out) const
+{
+ return out << ", size=" << get_size()
+ << ", meta=" << get_meta();
+}
+
+LBALeafNode::lookup_range_ret LBALeafNode::lookup_range(
+ op_context_t c,
+ laddr_t addr,
+ extent_len_t len)
+{
+ logger().debug(
+ "LBALeafNode::lookup_range {}~{}",
+ addr,
+ len);
+ auto ret = lba_pin_list_t();
+ auto [i, end] = get_leaf_entries(addr, len);
+ for (; i != end; ++i) {
+ auto val = i->get_val();
+ auto begin = i->get_key();
+ ret.emplace_back(
+ std::make_unique<BtreeLBAPin>(
+ this,
+ val.paddr.maybe_relative_to(get_paddr()),
+ lba_node_meta_t{ begin, begin + val.len, 0}));
+ }
+ return lookup_range_ertr::make_ready_future<lba_pin_list_t>(
+ std::move(ret));
+}
+
+LBALeafNode::insert_ret LBALeafNode::insert(
+ op_context_t c,
+ laddr_t laddr,
+ lba_map_val_t val)
+{
+ ceph_assert(!at_max_capacity());
+
+ if (!is_pending()) {
+ return c.cache.duplicate_for_write(c.trans, this
+ )->cast<LBALeafNode>()->insert(c, laddr, val);
+ }
+
+ val.paddr = maybe_generate_relative(val.paddr);
+ logger().debug(
+ "LBALeafNode::insert: inserting {}~{} -> {}",
+ laddr,
+ val.len,
+ val.paddr);
+
+ auto insert_pt = lower_bound(laddr);
+ journal_insert(insert_pt, laddr, val, maybe_get_delta_buffer());
+
+ logger().debug(
+ "LBALeafNode::insert: inserted {}~{} -> {}",
+ insert_pt.get_key(),
+ insert_pt.get_val().len,
+ insert_pt.get_val().paddr);
+ auto begin = insert_pt.get_key();
+ return insert_ret(
+ insert_ertr::ready_future_marker{},
+ std::make_unique<BtreeLBAPin>(
+ this,
+ val.paddr.maybe_relative_to(get_paddr()),
+ lba_node_meta_t{ begin, begin + val.len, 0}));
+}
+
+LBALeafNode::mutate_mapping_ret LBALeafNode::mutate_mapping(
+ op_context_t c,
+ laddr_t laddr,
+ mutate_func_t &&f)
+{
+ return mutate_mapping_internal(c, laddr, true, std::move(f));
+}
+
+LBALeafNode::mutate_mapping_ret LBALeafNode::mutate_mapping_internal(
+ op_context_t c,
+ laddr_t laddr,
+ bool is_root,
+ mutate_func_t &&f)
+{
+ auto mutation_pt = find(laddr);
+ if (mutation_pt == end()) {
+ return crimson::ct_error::enoent::make();
+ }
+
+ if (!is_pending()) {
+ return c.cache.duplicate_for_write(c.trans, this)->cast<LBALeafNode>(
+ )->mutate_mapping_internal(
+ c,
+ laddr,
+ is_root,
+ std::move(f));
+ }
+
+ auto cur = mutation_pt.get_val();
+ auto mutated = f(cur);
+
+ mutated.paddr = maybe_generate_relative(mutated.paddr);
+
+ logger().debug(
+ "{}: mutate addr {}: {} -> {}",
+ __func__,
+ laddr,
+ cur.paddr,
+ mutated.paddr);
+
+ if (mutated.refcount > 0) {
+ journal_update(mutation_pt, mutated, maybe_get_delta_buffer());
+ return mutate_mapping_ret(
+ mutate_mapping_ertr::ready_future_marker{},
+ mutated);
+ } else {
+ journal_remove(mutation_pt, maybe_get_delta_buffer());
+ return mutate_mapping_ret(
+ mutate_mapping_ertr::ready_future_marker{},
+ mutated);
+ }
+}
+
+LBALeafNode::mutate_internal_address_ret LBALeafNode::mutate_internal_address(
+ op_context_t c,
+ depth_t depth,
+ laddr_t laddr,
+ paddr_t paddr)
+{
+ ceph_assert(0 == "Impossible");
+ return mutate_internal_address_ret(
+ mutate_internal_address_ertr::ready_future_marker{},
+ paddr);
+}
+
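+// Worked example (illustrative values): with mappings 0~100 and 200~50 in
+// [min=0, max=L_ADDR_MAX), find_hole(c, 0, L_ADDR_MAX, 100) skips past the
+// first mapping (min becomes 100), sees that the next key at 200 satisfies
+// min + len <= 200, and returns 100.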
+LBALeafNode::find_hole_ret LBALeafNode::find_hole(
+ op_context_t c,
+ laddr_t min,
+ laddr_t max,
+ extent_len_t len)
+{
+ logger().debug(
+ "LBALeafNode::find_hole min={} max={}, len={}, *this={}",
+ min, max, len, *this);
+ auto [liter, uiter] = bound(min, max);
+ for (auto i = liter; i != uiter; ++i) {
+ auto ub = i->get_key();
+ if (min + len <= ub) {
+ return find_hole_ret(
+ find_hole_ertr::ready_future_marker{},
+ min);
+ } else {
+ min = i->get_key() + i->get_val().len;
+ }
+ }
+ if (min + len <= max) {
+ return find_hole_ret(
+ find_hole_ertr::ready_future_marker{},
+ min);
+ } else {
+ return find_hole_ret(
+ find_hole_ertr::ready_future_marker{},
+ L_ADDR_MAX);
+ }
+}
+
+LBALeafNode::scan_mappings_ret LBALeafNode::scan_mappings(
+ op_context_t c,
+ laddr_t begin,
+ laddr_t end,
+ scan_mappings_func_t &f)
+{
+ auto [biter, eiter] = bound(begin, end);
+ for (auto i = biter; i != eiter; ++i) {
+ auto val = i->get_val();
+ f(i->get_key(), val.paddr, val.len);
+ }
+ return scan_mappings_ertr::now();
+}
+
+LBALeafNode::scan_mapped_space_ret LBALeafNode::scan_mapped_space(
+ op_context_t c,
+ scan_mapped_space_func_t &f)
+{
+ f(get_paddr(), get_length());
+ for (auto i = begin(); i != end(); ++i) {
+ auto val = i->get_val();
+ f(val.paddr, val.len);
+ }
+ return scan_mapped_space_ertr::now();
+}
+
+
+void LBALeafNode::resolve_relative_addrs(paddr_t base)
+{
+ for (auto i: *this) {
+ if (i->get_val().paddr.is_relative()) {
+ auto val = i->get_val();
+ val.paddr = base.add_relative(val.paddr);
+ logger().debug(
+ "LBALeafNode::resolve_relative_addrs {} -> {}",
+ i->get_val().paddr,
+ val.paddr);
+ i->set_val(val);
+ }
+ }
+}
+
+std::pair<LBALeafNode::internal_iterator_t, LBALeafNode::internal_iterator_t>
+LBALeafNode::get_leaf_entries(laddr_t addr, extent_len_t len)
+{
+ return bound(addr, addr + len);
+}
+
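+// get_lba_btree_extent resolves a possibly-relative child address against
+// its parent's paddr, reads the typed node (internal for depth > 1, leaf
+// for depth == 1), and registers the node's range pin on a clean read so
+// the pin-set invariant holds for extents faulted in from disk.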
+Cache::get_extent_ertr::future<LBANodeRef> get_lba_btree_extent(
+ op_context_t c,
+ depth_t depth,
+ paddr_t offset,
+ paddr_t base)
+{
+ offset = offset.maybe_relative_to(base);
+ ceph_assert(depth > 0);
+ if (depth > 1) {
+ logger().debug(
+ "get_lba_btree_extent: reading internal at offset {}, depth {}",
+ offset,
+ depth);
+ return c.cache.get_extent<LBAInternalNode>(
+ c.trans,
+ offset,
+ LBA_BLOCK_SIZE).safe_then([c](auto ret) {
+ auto meta = ret->get_meta();
+ if (ret->get_size()) {
+ ceph_assert(meta.begin <= ret->begin()->get_key());
+ ceph_assert(meta.end > (ret->end() - 1)->get_key());
+ }
+ if (!ret->is_pending() && !ret->pin.is_linked()) {
+ ret->pin.set_range(meta);
+ c.pins.add_pin(ret->pin);
+ }
+ return LBANodeRef(ret.detach(), /* add_ref = */ false);
+ });
+ } else {
+ logger().debug(
+ "get_lba_btree_extent: reading leaf at offset {}, depth {}",
+ offset,
+ depth);
+ return c.cache.get_extent<LBALeafNode>(
+ c.trans,
+ offset,
+ LBA_BLOCK_SIZE).safe_then([offset, c](auto ret) {
+ logger().debug(
+ "get_lba_btree_extent: read leaf at offset {} {}",
+ offset,
+ *ret);
+ auto meta = ret->get_meta();
+ if (ret->get_size()) {
+ ceph_assert(meta.begin <= ret->begin()->get_key());
+ ceph_assert(meta.end > (ret->end() - 1)->get_key());
+ }
+ if (!ret->is_pending() && !ret->pin.is_linked()) {
+ ret->pin.set_range(meta);
+ c.pins.add_pin(ret->pin);
+ }
+ return LBANodeRef(ret.detach(), /* add_ref = */ false);
+ });
+ }
+}
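+// Hedged usage sketch (the call shape is an assumption): an internal node
+// can resolve a child entry by dropping one level of depth and passing its
+// own paddr as the base against which relative child addresses resolve:
+//
+//   return get_lba_btree_extent(c, depth - 1, iter->get_val(), get_paddr()
+//   ).safe_then([](LBANodeRef child) { /* descend into child */ });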
+
+}
diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h
new file mode 100644
index 000000000..230eef682
--- /dev/null
+++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h
@@ -0,0 +1,555 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <sys/mman.h>
+#include <string.h>
+
+#include <memory>
+
+#include "include/buffer.h"
+
+#include "crimson/common/fixed_kv_node_layout.h"
+#include "crimson/common/errorator.h"
+#include "crimson/os/seastore/lba_manager.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/cache.h"
+#include "crimson/os/seastore/cached_extent.h"
+#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h"
+#include "crimson/os/seastore/lba_manager/btree/btree_range_pin.h"
+
+namespace crimson::os::seastore::lba_manager::btree {
+
+constexpr size_t LBA_BLOCK_SIZE = 4096;
+
+/**
+ * lba_node_meta_le_t
+ *
+ * On disk layout for lba_node_meta_t
+ */
+struct lba_node_meta_le_t {
+ laddr_le_t begin = laddr_le_t(0);
+ laddr_le_t end = laddr_le_t(0);
+ depth_le_t depth = init_les32(0);
+
+ lba_node_meta_le_t() = default;
+ lba_node_meta_le_t(const lba_node_meta_le_t &) = default;
+ explicit lba_node_meta_le_t(const lba_node_meta_t &val)
+ : begin(init_le64(val.begin)),
+ end(init_le64(val.end)),
+ depth(init_les32(val.depth)) {}
+
+ operator lba_node_meta_t() const {
+ return lba_node_meta_t{ begin, end, depth };
+ }
+};
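+// Round-trip sketch (illustrative, values assumed): the explicit constructor
+// and the conversion operator above are inverses, so encoding to the
+// little-endian on-disk form preserves the host-order value:
+//
+//   lba_node_meta_t m{0, L_ADDR_MAX, 1};
+//   lba_node_meta_le_t le{m};
+//   lba_node_meta_t back = le;  // back compares field-wise equal to m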
+
+
+/**
+ * LBAInternalNode
+ *
+ * Abstracts operations on and layout of internal nodes for the
+ * LBA Tree.
+ *
+ * Layout (4k):
+ * size : uint32_t[1] 4b
+ * (padding) : 4b
+ *   meta       : lba_node_meta_le_t[1]      (1*24)b
+ *   keys       : laddr_t[254]               (254*8)b
+ *   values     : paddr_t[254]               (254*8)b
+ *              = 4096
+ *
+ * TODO: make the above capacity calculation part of FixedKVNodeLayout
+ * TODO: the above alignment probably isn't portable without further work
+ */
+constexpr size_t INTERNAL_NODE_CAPACITY = 254;
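+// Illustrative check (added here, not load-bearing; restates the byte counts
+// in the layout comment above): a 4+4+24 = 32b header plus 254 8-byte keys
+// and 254 8-byte values fills the 4k block exactly.
+static_assert(
+  4 + 4 + 24 + INTERNAL_NODE_CAPACITY * (8 + 8) == LBA_BLOCK_SIZE,
+  "internal node layout should fill LBA_BLOCK_SIZE exactly");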
+struct LBAInternalNode
+ : LBANode,
+ common::FixedKVNodeLayout<
+ INTERNAL_NODE_CAPACITY,
+ lba_node_meta_t, lba_node_meta_le_t,
+ laddr_t, laddr_le_t,
+ paddr_t, paddr_le_t> {
+ using internal_iterator_t = const_iterator;
+ template <typename... T>
+ LBAInternalNode(T&&... t) :
+ LBANode(std::forward<T>(t)...),
+ FixedKVNodeLayout(get_bptr().c_str()) {}
+
+ static constexpr extent_types_t type = extent_types_t::LADDR_INTERNAL;
+
+ lba_node_meta_t get_node_meta() const final { return get_meta(); }
+
+ CachedExtentRef duplicate_for_write() final {
+ assert(delta_buffer.empty());
+ return CachedExtentRef(new LBAInternalNode(*this));
+ };
+
+ delta_buffer_t delta_buffer;
+ delta_buffer_t *maybe_get_delta_buffer() {
+ return is_mutation_pending() ? &delta_buffer : nullptr;
+ }
+
+ lookup_ret lookup(op_context_t c, laddr_t addr, depth_t depth) final;
+
+ lookup_range_ret lookup_range(
+ op_context_t c,
+ laddr_t addr,
+ extent_len_t len) final;
+
+ insert_ret insert(
+ op_context_t c,
+ laddr_t laddr,
+ lba_map_val_t val) final;
+
+ mutate_mapping_ret mutate_mapping(
+ op_context_t c,
+ laddr_t laddr,
+ mutate_func_t &&f) final;
+ mutate_mapping_ret mutate_mapping_internal(
+ op_context_t c,
+ laddr_t laddr,
+ bool is_root,
+ mutate_func_t &&f) final;
+
+ mutate_internal_address_ret mutate_internal_address(
+ op_context_t c,
+ depth_t depth,
+ laddr_t laddr,
+ paddr_t paddr) final;
+
+ find_hole_ret find_hole(
+ op_context_t c,
+ laddr_t min,
+ laddr_t max,
+ extent_len_t len) final;
+
+ scan_mappings_ret scan_mappings(
+ op_context_t c,
+ laddr_t begin,
+ laddr_t end,
+ scan_mappings_func_t &f) final;
+
+ scan_mapped_space_ret scan_mapped_space(
+ op_context_t c,
+ scan_mapped_space_func_t &f) final;
+
+ std::tuple<LBANodeRef, LBANodeRef, laddr_t>
+ make_split_children(op_context_t c) final {
+ auto left = c.cache.alloc_new_extent<LBAInternalNode>(
+ c.trans, LBA_BLOCK_SIZE);
+ auto right = c.cache.alloc_new_extent<LBAInternalNode>(
+ c.trans, LBA_BLOCK_SIZE);
+ auto pivot = split_into(*left, *right);
+ left->pin.set_range(left->get_meta());
+ right->pin.set_range(right->get_meta());
+ return std::make_tuple(
+ left,
+ right,
+ pivot);
+ }
+
+ LBANodeRef make_full_merge(
+ op_context_t c,
+ LBANodeRef &right) final {
+ auto replacement = c.cache.alloc_new_extent<LBAInternalNode>(
+ c.trans, LBA_BLOCK_SIZE);
+ replacement->merge_from(*this, *right->cast<LBAInternalNode>());
+ replacement->pin.set_range(replacement->get_meta());
+ return replacement;
+ }
+
+ std::tuple<LBANodeRef, LBANodeRef, laddr_t>
+ make_balanced(
+ op_context_t c,
+ LBANodeRef &_right,
+ bool prefer_left) final {
+ ceph_assert(_right->get_type() == type);
+ auto &right = *_right->cast<LBAInternalNode>();
+ auto replacement_left = c.cache.alloc_new_extent<LBAInternalNode>(
+ c.trans, LBA_BLOCK_SIZE);
+ auto replacement_right = c.cache.alloc_new_extent<LBAInternalNode>(
+ c.trans, LBA_BLOCK_SIZE);
+
+ auto pivot = balance_into_new_nodes(
+ *this,
+ right,
+ prefer_left,
+ *replacement_left,
+ *replacement_right);
+
+ replacement_left->pin.set_range(replacement_left->get_meta());
+ replacement_right->pin.set_range(replacement_right->get_meta());
+ return std::make_tuple(
+ replacement_left,
+ replacement_right,
+ pivot);
+ }
+
+ /**
+ * Internal relative addresses on read or in memory prior to commit
+ * are either record or block relative depending on whether this
+ * physical node is is_initial_pending() or just is_pending().
+ *
+ * User passes appropriate base depending on lifecycle and
+ * resolve_relative_addrs fixes up relative internal references
+ * based on base.
+ */
+ void resolve_relative_addrs(paddr_t base) final;
+ void node_resolve_vals(iterator from, iterator to) const final {
+ if (is_initial_pending()) {
+ for (auto i = from; i != to; ++i) {
+ if (i->get_val().is_relative()) {
+ assert(i->get_val().is_block_relative());
+ i->set_val(get_paddr().add_relative(i->get_val()));
+ }
+ }
+ }
+ }
+ void node_unresolve_vals(iterator from, iterator to) const final {
+ if (is_initial_pending()) {
+ for (auto i = from; i != to; ++i) {
+ if (i->get_val().is_relative()) {
+ assert(i->get_val().is_record_relative());
+ i->set_val(i->get_val() - get_paddr());
+ }
+ }
+ }
+ }
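+  // Illustrative example (addresses assumed): if this initial-pending node
+  // sits at block-relative +0x8000 and a child value is block-relative
+  // +0x1000, node_resolve_vals rewrites the value to +0x9000 via
+  // add_relative(); node_unresolve_vals subtracts get_paddr() back out on
+  // the commit path, restoring the record-relative form.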
+
+ extent_types_t get_type() const final {
+ return type;
+ }
+
+ std::ostream &print_detail(std::ostream &out) const final;
+
+ ceph::bufferlist get_delta() final {
+ assert(!delta_buffer.empty());
+ ceph::buffer::ptr bptr(delta_buffer.get_bytes());
+ delta_buffer.copy_out(bptr.c_str(), bptr.length());
+ ceph::bufferlist bl;
+ bl.push_back(bptr);
+ return bl;
+ }
+
+ void apply_delta_and_adjust_crc(
+ paddr_t base, const ceph::bufferlist &_bl) final {
+ assert(_bl.length());
+ ceph::bufferlist bl = _bl;
+ bl.rebuild();
+ delta_buffer_t buffer;
+ buffer.copy_in(bl.front().c_str(), bl.front().length());
+ buffer.replay(*this);
+ set_last_committed_crc(get_crc32c());
+ resolve_relative_addrs(base);
+ }
+
+ bool at_max_capacity() const final {
+ return get_size() == get_capacity();
+ }
+
+  bool at_min_capacity() const final {
+ return get_size() == (get_capacity() / 2);
+ }
+
+  /// returns iterators <lb, ub> containing addresses [l, r)
+ std::pair<internal_iterator_t, internal_iterator_t> bound(
+ laddr_t l, laddr_t r) {
+ // TODO: inefficient
+ auto retl = begin();
+ for (; retl != end(); ++retl) {
+ if (retl->get_next_key_or_max() > l)
+ break;
+ }
+ auto retr = retl;
+ for (; retr != end(); ++retr) {
+ if (retr->get_key() >= r)
+ break;
+ }
+ return std::make_pair(retl, retr);
+ }
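+  // Example (hypothetical keys {0, 100, 200} whose children cover [0,100),
+  // [100,200), [200,300)): bound(150, 250) yields iterators spanning the
+  // entries at 100 and 200, since get_next_key_or_max() first exceeds
+  // l=150 at key 100, and the upper scan stops at the first key >= r=250.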
+
+ using split_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error
+ >;
+ using split_ret = split_ertr::future<LBANodeRef>;
+ split_ret split_entry(
+ op_context_t c,
+ laddr_t addr,
+ internal_iterator_t,
+ LBANodeRef entry);
+
+ using merge_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error
+ >;
+ using merge_ret = merge_ertr::future<LBANodeRef>;
+ merge_ret merge_entry(
+ op_context_t c,
+ laddr_t addr,
+ internal_iterator_t,
+ LBANodeRef entry,
+ bool is_root);
+
+ /// returns iterator for subtree containing laddr
+ internal_iterator_t get_containing_child(laddr_t laddr);
+};
+
+/**
+ * LBALeafNode
+ *
+ * Abstracts operations on and layout of leaf nodes for the
+ * LBA Tree.
+ *
+ * Layout (4k):
+ * size : uint32_t[1] 4b
+ * (padding) : 4b
+ *   meta       : lba_node_meta_le_t[1]      (1*24)b
+ *   keys       : laddr_t[145]               (145*8)b
+ *   values     : lba_map_val_t[145]         (145*20)b
+ * = 4092
+ *
+ * TODO: update FixedKVNodeLayout to handle the above calculation
+ * TODO: the above alignment probably isn't portable without further work
+ */
+constexpr size_t LEAF_NODE_CAPACITY = 145;
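+// Illustrative check (added here, not load-bearing; restates the layout
+// comment above): a 32b header plus 145 8-byte keys and 145 20-byte values
+// comes to 4092b, which must fit in (but need not fill) the 4k block.
+static_assert(
+  4 + 4 + 24 + LEAF_NODE_CAPACITY * (8 + 20) <= LBA_BLOCK_SIZE,
+  "leaf node layout should fit within LBA_BLOCK_SIZE");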
+
+/**
+ * lba_map_val_le_t
+ *
+ * On disk layout for lba_map_val_t.
+ */
+struct lba_map_val_le_t {
+ extent_len_le_t len = init_extent_len_le_t(0);
+ paddr_le_t paddr;
+ ceph_le32 refcount = init_le32(0);
+ ceph_le32 checksum = init_le32(0);
+
+ lba_map_val_le_t() = default;
+ lba_map_val_le_t(const lba_map_val_le_t &) = default;
+ explicit lba_map_val_le_t(const lba_map_val_t &val)
+ : len(init_extent_len_le_t(val.len)),
+ paddr(paddr_le_t(val.paddr)),
+ refcount(init_le32(val.refcount)),
+ checksum(init_le32(val.checksum)) {}
+
+ operator lba_map_val_t() const {
+ return lba_map_val_t{ len, paddr, refcount, checksum };
+ }
+};
+
+struct LBALeafNode
+ : LBANode,
+ common::FixedKVNodeLayout<
+ LEAF_NODE_CAPACITY,
+ lba_node_meta_t, lba_node_meta_le_t,
+ laddr_t, laddr_le_t,
+ lba_map_val_t, lba_map_val_le_t> {
+ using internal_iterator_t = const_iterator;
+ template <typename... T>
+ LBALeafNode(T&&... t) :
+ LBANode(std::forward<T>(t)...),
+ FixedKVNodeLayout(get_bptr().c_str()) {}
+
+ static constexpr extent_types_t type = extent_types_t::LADDR_LEAF;
+
+ lba_node_meta_t get_node_meta() const final { return get_meta(); }
+
+ CachedExtentRef duplicate_for_write() final {
+ assert(delta_buffer.empty());
+ return CachedExtentRef(new LBALeafNode(*this));
+ };
+
+ delta_buffer_t delta_buffer;
+ delta_buffer_t *maybe_get_delta_buffer() {
+ return is_mutation_pending() ? &delta_buffer : nullptr;
+ }
+
+ lookup_ret lookup(op_context_t c, laddr_t addr, depth_t depth) final
+ {
+ return lookup_ret(
+ lookup_ertr::ready_future_marker{},
+ this);
+ }
+
+ lookup_range_ret lookup_range(
+ op_context_t c,
+ laddr_t addr,
+ extent_len_t len) final;
+
+ insert_ret insert(
+ op_context_t c,
+ laddr_t laddr,
+ lba_map_val_t val) final;
+
+ mutate_mapping_ret mutate_mapping(
+ op_context_t c,
+ laddr_t laddr,
+ mutate_func_t &&f) final;
+ mutate_mapping_ret mutate_mapping_internal(
+ op_context_t c,
+ laddr_t laddr,
+ bool is_root,
+ mutate_func_t &&f) final;
+
+ mutate_internal_address_ret mutate_internal_address(
+ op_context_t c,
+ depth_t depth,
+ laddr_t laddr,
+ paddr_t paddr) final;
+
+ find_hole_ret find_hole(
+ op_context_t c,
+ laddr_t min,
+ laddr_t max,
+ extent_len_t len) final;
+
+ scan_mappings_ret scan_mappings(
+ op_context_t c,
+ laddr_t begin,
+ laddr_t end,
+ scan_mappings_func_t &f) final;
+
+ scan_mapped_space_ret scan_mapped_space(
+ op_context_t c,
+ scan_mapped_space_func_t &f) final;
+
+ std::tuple<LBANodeRef, LBANodeRef, laddr_t>
+ make_split_children(op_context_t c) final {
+ auto left = c.cache.alloc_new_extent<LBALeafNode>(
+ c.trans, LBA_BLOCK_SIZE);
+ auto right = c.cache.alloc_new_extent<LBALeafNode>(
+ c.trans, LBA_BLOCK_SIZE);
+ auto pivot = split_into(*left, *right);
+ left->pin.set_range(left->get_meta());
+ right->pin.set_range(right->get_meta());
+ return std::make_tuple(
+ left,
+ right,
+ pivot);
+ }
+
+ LBANodeRef make_full_merge(
+ op_context_t c,
+ LBANodeRef &right) final {
+ auto replacement = c.cache.alloc_new_extent<LBALeafNode>(
+ c.trans, LBA_BLOCK_SIZE);
+ replacement->merge_from(*this, *right->cast<LBALeafNode>());
+ replacement->pin.set_range(replacement->get_meta());
+ return replacement;
+ }
+
+ std::tuple<LBANodeRef, LBANodeRef, laddr_t>
+ make_balanced(
+ op_context_t c,
+ LBANodeRef &_right,
+ bool prefer_left) final {
+ ceph_assert(_right->get_type() == type);
+ auto &right = *_right->cast<LBALeafNode>();
+ auto replacement_left = c.cache.alloc_new_extent<LBALeafNode>(
+ c.trans, LBA_BLOCK_SIZE);
+ auto replacement_right = c.cache.alloc_new_extent<LBALeafNode>(
+ c.trans, LBA_BLOCK_SIZE);
+
+ auto pivot = balance_into_new_nodes(
+ *this,
+ right,
+ prefer_left,
+ *replacement_left,
+ *replacement_right);
+
+ replacement_left->pin.set_range(replacement_left->get_meta());
+ replacement_right->pin.set_range(replacement_right->get_meta());
+ return std::make_tuple(
+ replacement_left,
+ replacement_right,
+ pivot);
+ }
+
+ // See LBAInternalNode, same concept
+ void resolve_relative_addrs(paddr_t base) final;
+ void node_resolve_vals(iterator from, iterator to) const final {
+ if (is_initial_pending()) {
+ for (auto i = from; i != to; ++i) {
+ auto val = i->get_val();
+ if (val.paddr.is_relative()) {
+ assert(val.paddr.is_block_relative());
+ val.paddr = get_paddr().add_relative(val.paddr);
+ i->set_val(val);
+ }
+ }
+ }
+ }
+ void node_unresolve_vals(iterator from, iterator to) const final {
+ if (is_initial_pending()) {
+ for (auto i = from; i != to; ++i) {
+ auto val = i->get_val();
+ if (val.paddr.is_relative()) {
+ assert(val.paddr.is_record_relative());
+ val.paddr = val.paddr - get_paddr();
+ i->set_val(val);
+ }
+ }
+ }
+ }
+
+ ceph::bufferlist get_delta() final {
+ assert(!delta_buffer.empty());
+ ceph::buffer::ptr bptr(delta_buffer.get_bytes());
+ delta_buffer.copy_out(bptr.c_str(), bptr.length());
+ ceph::bufferlist bl;
+ bl.push_back(bptr);
+ return bl;
+ }
+
+ void apply_delta_and_adjust_crc(
+ paddr_t base, const ceph::bufferlist &_bl) final {
+ assert(_bl.length());
+ ceph::bufferlist bl = _bl;
+ bl.rebuild();
+ delta_buffer_t buffer;
+ buffer.copy_in(bl.front().c_str(), bl.front().length());
+ buffer.replay(*this);
+ set_last_committed_crc(get_crc32c());
+ resolve_relative_addrs(base);
+ }
+
+ extent_types_t get_type() const final {
+ return type;
+ }
+
+ std::ostream &print_detail(std::ostream &out) const final;
+
+ bool at_max_capacity() const final {
+ return get_size() == get_capacity();
+ }
+
+ bool at_min_capacity() const final {
+ return get_size() == (get_capacity() / 2);
+ }
+
+ /// returns iterators <lb, ub> containing addresses [l, r)
+ std::pair<internal_iterator_t, internal_iterator_t> bound(
+ laddr_t l, laddr_t r) {
+ // TODO: inefficient
+ auto retl = begin();
+ for (; retl != end(); ++retl) {
+ if (retl->get_key() >= l || (retl->get_key() + retl->get_val().len) > l)
+ break;
+ }
+ auto retr = retl;
+ for (; retr != end(); ++retr) {
+ if (retr->get_key() >= r)
+ break;
+ }
+ return std::make_pair(retl, retr);
+ }
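+  // Example (hypothetical mappings {0x1000+0x1000, 0x3000+0x1000}):
+  // bound(0x1800, 0x4000) starts at the 0x1000 entry, whose range
+  // [0x1000,0x2000) extends past l, and includes every entry with a key
+  // below r, so both mappings are returned.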
+
+ std::pair<internal_iterator_t, internal_iterator_t>
+ get_leaf_entries(laddr_t addr, extent_len_t len);
+};
+using LBALeafNodeRef = TCachedExtentRef<LBALeafNode>;
+
+}