diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:45:59 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:45:59 +0000 |
commit | 19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch) | |
tree | 42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/crimson/os/seastore/lba_manager/btree | |
parent | Initial commit. (diff) | |
download | ceph-upstream.tar.xz ceph-upstream.zip |
Adding upstream version 16.2.11+ds (tag: upstream/16.2.11+ds, branch: upstream).
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/crimson/os/seastore/lba_manager/btree')
7 files changed, 2720 insertions, 0 deletions
diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc new file mode 100644 index 000000000..a837ae37e --- /dev/null +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc @@ -0,0 +1,580 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <sys/mman.h> +#include <string.h> + +#include "crimson/common/log.h" + +#include "include/buffer.h" +#include "crimson/os/seastore/lba_manager/btree/btree_lba_manager.h" +#include "crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h" + + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + +namespace crimson::os::seastore::lba_manager::btree { + +BtreeLBAManager::mkfs_ret BtreeLBAManager::mkfs( + Transaction &t) +{ + logger().debug("BtreeLBAManager::mkfs"); + return cache.get_root(t).safe_then([this, &t](auto croot) { + auto root_leaf = cache.alloc_new_extent<LBALeafNode>( + t, + LBA_BLOCK_SIZE); + root_leaf->set_size(0); + lba_node_meta_t meta{0, L_ADDR_MAX, 1}; + root_leaf->set_meta(meta); + root_leaf->pin.set_range(meta); + croot->get_root() = + root_t{ + 1, + 0, + root_leaf->get_paddr(), + make_record_relative_paddr(0), + L_ADDR_NULL}; + return mkfs_ertr::now(); + }); +} + +BtreeLBAManager::get_root_ret +BtreeLBAManager::get_root(Transaction &t) +{ + return cache.get_root(t).safe_then([this, &t](auto croot) { + logger().debug( + "BtreeLBAManager::get_root: reading root at {} depth {}", + paddr_t{croot->get_root().lba_root_addr}, + unsigned(croot->get_root().lba_depth)); + return get_lba_btree_extent( + get_context(t), + croot->get_root().lba_depth, + croot->get_root().lba_root_addr, + paddr_t()); + }); +} + +BtreeLBAManager::get_mapping_ret +BtreeLBAManager::get_mapping( + Transaction &t, + laddr_t offset, extent_len_t length) +{ + logger().debug("BtreeLBAManager::get_mapping: {}, {}", offset, length); + 
return get_root( + t).safe_then([this, &t, offset, length](auto extent) { + return extent->lookup_range( + get_context(t), + offset, length + ).safe_then([extent](auto ret) { return ret; }); + }).safe_then([](auto &&e) { + logger().debug("BtreeLBAManager::get_mapping: got mapping {}", e); + return get_mapping_ret( + get_mapping_ertr::ready_future_marker{}, + std::move(e)); + }); +} + + +BtreeLBAManager::get_mappings_ret +BtreeLBAManager::get_mappings( + Transaction &t, + laddr_list_t &&list) +{ + logger().debug("BtreeLBAManager::get_mappings: {}", list); + auto l = std::make_unique<laddr_list_t>(std::move(list)); + auto retptr = std::make_unique<lba_pin_list_t>(); + auto &ret = *retptr; + return crimson::do_for_each( + l->begin(), + l->end(), + [this, &t, &ret](const auto &p) { + return get_mapping(t, p.first, p.second).safe_then( + [&ret](auto res) { + ret.splice(ret.end(), res, res.begin(), res.end()); + }); + }).safe_then([l=std::move(l), retptr=std::move(retptr)]() mutable { + return std::move(*retptr); + }); +} + +BtreeLBAManager::alloc_extent_ret +BtreeLBAManager::alloc_extent( + Transaction &t, + laddr_t hint, + extent_len_t len, + paddr_t addr) +{ + // TODO: we can certainly combine the lookup and the insert. 
+ return get_root( + t).safe_then([this, &t, hint, len](auto extent) { + logger().debug( + "BtreeLBAManager::alloc_extent: beginning search at {}", + *extent); + return extent->find_hole( + get_context(t), + hint, + L_ADDR_MAX, + len).safe_then([extent](auto ret) { + return std::make_pair(ret, extent); + }); + }).safe_then([this, &t, len, addr](auto allocation_pair) { + auto &[laddr, extent] = allocation_pair; + ceph_assert(laddr != L_ADDR_MAX); + return insert_mapping( + t, + extent, + laddr, + { len, addr, 1, 0 } + ).safe_then([laddr=laddr, addr, len](auto pin) { + logger().debug( + "BtreeLBAManager::alloc_extent: alloc {}~{} for {}", + laddr, + len, + addr); + return alloc_extent_ret( + alloc_extent_ertr::ready_future_marker{}, + LBAPinRef(pin.release())); + }); + }); +} + +BtreeLBAManager::set_extent_ret +BtreeLBAManager::set_extent( + Transaction &t, + laddr_t off, extent_len_t len, paddr_t addr) +{ + return get_root( + t).safe_then([this, &t, off, len, addr](auto root) { + return insert_mapping( + t, + root, + off, + { len, addr, 1, 0 }); + }).safe_then([](auto ret) { + return set_extent_ret( + set_extent_ertr::ready_future_marker{}, + LBAPinRef(ret.release())); + }); +} + +static bool is_lba_node(extent_types_t type) +{ + return type == extent_types_t::LADDR_INTERNAL || + type == extent_types_t::LADDR_LEAF; +} + +static bool is_lba_node(const CachedExtent &e) +{ + return is_lba_node(e.get_type()); +} + +btree_range_pin_t &BtreeLBAManager::get_pin(CachedExtent &e) +{ + if (is_lba_node(e)) { + return e.cast<LBANode>()->pin; + } else if (e.is_logical()) { + return static_cast<BtreeLBAPin &>( + e.cast<LogicalCachedExtent>()->get_pin()).pin; + } else { + ceph_abort_msg("impossible"); + } +} + +static depth_t get_depth(const CachedExtent &e) +{ + if (is_lba_node(e)) { + return e.cast<LBANode>()->get_node_meta().depth; + } else if (e.is_logical()) { + return 0; + } else { + ceph_assert(0 == "currently impossible"); + return 0; + } +} + 
// On transaction completion, reconcile the pin set with the extents the
// transaction retired and created.  Retire leaf-first (ascending depth) so
// check_parent sees children gone before parents; link parent-first
// (descending depth) so a child can always find its parent pin.
BtreeLBAManager::complete_transaction_ret
BtreeLBAManager::complete_transaction(
  Transaction &t)
{
  std::vector<CachedExtentRef> to_clear;
  to_clear.reserve(t.get_retired_set().size());
  for (auto &e: t.get_retired_set()) {
    if (e->is_logical() || is_lba_node(*e))
      to_clear.push_back(e);
  }
  // need to call check_parent from leaf->parent
  std::sort(
    to_clear.begin(), to_clear.end(),
    [](auto &l, auto &r) { return get_depth(*l) < get_depth(*r); });

  for (auto &e: to_clear) {
    auto &pin = get_pin(*e);
    logger().debug("{}: retiring {}, {}", __func__, *e, pin);
    pin_set.retire(pin);
  }

  // ...but add_pin from parent->leaf
  std::vector<CachedExtentRef> to_link;
  to_link.reserve(t.get_fresh_block_list().size());
  for (auto &e: t.get_fresh_block_list()) {
    if (e->is_valid() && (is_lba_node(*e) || e->is_logical()))
      to_link.push_back(e);
  }
  std::sort(
    to_link.begin(), to_link.end(),
    [](auto &l, auto &r) -> bool { return get_depth(*l) > get_depth(*r); });

  for (auto &e : to_link) {
    logger().debug("{}: linking {}", __func__, *e);
    pin_set.add_pin(get_pin(*e));
  }

  // Now that retired pins are gone and fresh pins are in place, release
  // any parents left with no children.
  for (auto &e: to_clear) {
    auto &pin = get_pin(*e);
    logger().debug("{}: checking {}, {}", __func__, *e, pin);
    pin_set.check_parent(pin);
  }
  return complete_transaction_ertr::now();
}

// Validate a cached extent against the current tree: if the tree still
// points at its paddr, wire up its pin; otherwise the extent is stale and
// is dropped from the cache.
BtreeLBAManager::init_cached_extent_ret BtreeLBAManager::init_cached_extent(
  Transaction &t,
  CachedExtentRef e)
{
  logger().debug("{}: {}", __func__, *e);
  return get_root(t).safe_then(
    [this, &t, e=std::move(e)](LBANodeRef root) mutable {
      if (is_lba_node(*e)) {
	auto lban = e->cast<LBANode>();
	logger().debug("init_cached_extent: lba node, getting root");
	return root->lookup(
	  op_context_t{cache, pin_set, t},
	  lban->get_node_meta().begin,
	  lban->get_node_meta().depth
	).safe_then([this, e=std::move(e)](LBANodeRef c) {
	  if (c->get_paddr() == e->get_paddr()) {
	    // live node: lookup must have returned this very extent
	    assert(&*c == &*e);
	    logger().debug("init_cached_extent: {} initialized", *e);
	  } else {
	    // e is obsolete
	    logger().debug("init_cached_extent: {} obsolete", *e);
	    cache.drop_from_cache(e);
	  }
	  return init_cached_extent_ertr::now();
	});
      } else if (e->is_logical()) {
	auto logn = e->cast<LogicalCachedExtent>();
	return root->lookup_range(
	  op_context_t{cache, pin_set, t},
	  logn->get_laddr(),
	  logn->get_length()).safe_then(
	    [this, logn=std::move(logn)](auto pins) {
	      // exactly one pin covering exactly this extent is required
	      if (pins.size() == 1) {
		auto pin = std::move(pins.front());
		pins.pop_front();
		if (pin->get_paddr() == logn->get_paddr()) {
		  logn->set_pin(std::move(pin));
		  pin_set.add_pin(
		    static_cast<BtreeLBAPin&>(logn->get_pin()).pin);
		  logger().debug("init_cached_extent: {} initialized", *logn);
		} else {
		  // paddr doesn't match, remapped, obsolete
		  logger().debug("init_cached_extent: {} obsolete", *logn);
		  cache.drop_from_cache(logn);
		}
	      } else {
		// set of extents changed, obsolete
		logger().debug("init_cached_extent: {} obsolete", *logn);
		cache.drop_from_cache(logn);
	      }
	      return init_cached_extent_ertr::now();
	    });
      } else {
	// not an lba-tracked extent; nothing to do
	logger().debug("init_cached_extent: {} skipped", *e);
	return init_cached_extent_ertr::now();
      }
    });
}

// Invoke f on every mapping in [begin, end); do_with keeps the callback and
// a reference to the root node alive for the duration of the scan.
BtreeLBAManager::scan_mappings_ret BtreeLBAManager::scan_mappings(
  Transaction &t,
  laddr_t begin,
  laddr_t end,
  scan_mappings_func_t &&f)
{
  return seastar::do_with(
    std::move(f),
    LBANodeRef(),
    [=, &t](auto &f, auto &lbarootref) {
      return get_root(t).safe_then(
	[=, &t, &f](LBANodeRef lbaroot) mutable {
	  lbarootref = lbaroot;
	  return lbaroot->scan_mappings(
	    get_context(t),
	    begin,
	    end,
	    f);
	});
    });
}

// Invoke f on every mapped paddr range in the tree (tree nodes included);
// same lifetime handling as scan_mappings above.
BtreeLBAManager::scan_mapped_space_ret BtreeLBAManager::scan_mapped_space(
  Transaction &t,
  scan_mapped_space_func_t &&f)
{
  return seastar::do_with(
    std::move(f),
    LBANodeRef(),
    [=, &t](auto &f, auto &lbarootref) {
      return get_root(t).safe_then(
	[=, &t, &f](LBANodeRef lbaroot) mutable {
	  lbarootref = lbaroot;
	  return lbaroot->scan_mapped_space(
	    get_context(t),
	    f);
	});
    });
}
// Relocate an extent during gc/cleaning: retire the old copy, allocate a
// fresh one with identical contents, and repoint the tree at the new paddr.
BtreeLBAManager::rewrite_extent_ret BtreeLBAManager::rewrite_extent(
  Transaction &t,
  CachedExtentRef extent)
{
  if (extent->is_logical()) {
    auto lextent = extent->cast<LogicalCachedExtent>();
    cache.retire_extent(t, extent);
    auto nlextent = cache.alloc_new_extent_by_type(
      t,
      lextent->get_type(),
      lextent->get_length())->cast<LogicalCachedExtent>();
    lextent->get_bptr().copy_out(
      0,
      lextent->get_length(),
      nlextent->get_bptr().c_str());
    nlextent->set_laddr(lextent->get_laddr());
    nlextent->set_pin(lextent->get_pin().duplicate());

    logger().debug(
      "{}: rewriting {} into {}",
      __func__,
      *lextent,
      *nlextent);

    return update_mapping(
      t,
      lextent->get_laddr(),
      [prev_addr = lextent->get_paddr(), addr = nlextent->get_paddr()](
	const lba_map_val_t &in) {
	lba_map_val_t ret = in;
	// the mapping must still point at the copy we are replacing
	ceph_assert(in.paddr == prev_addr);
	ret.paddr = addr;
	return ret;
      // capture nlextent to keep it alive until the update resolves
      }).safe_then([nlextent](auto e) {}).handle_error(
	rewrite_extent_ertr::pass_further{},
	/* ENOENT in particular should be impossible */
	crimson::ct_error::assert_all{}
      );
  } else if (is_lba_node(*extent)) {
    auto lba_extent = extent->cast<LBANode>();
    cache.retire_extent(t, extent);
    auto nlba_extent = cache.alloc_new_extent_by_type(
      t,
      lba_extent->get_type(),
      lba_extent->get_length())->cast<LBANode>();
    lba_extent->get_bptr().copy_out(
      0,
      lba_extent->get_length(),
      nlba_extent->get_bptr().c_str());
    nlba_extent->pin.set_range(nlba_extent->get_node_meta());

    /* This is a bit underhanded.  Any relative addrs here must necessarily
     * be record relative as we are rewriting a dirty extent.  Thus, we
     * are using resolve_relative_addrs with a (likely negative) block
     * relative offset to correct them to block-relative offsets adjusted
     * for our new transaction location.
     *
     * Upon commit, these now block relative addresses will be interpretted
     * against the real final address.
     */
    nlba_extent->resolve_relative_addrs(
      make_record_relative_paddr(0) - nlba_extent->get_paddr());

    return update_internal_mapping(
      t,
      nlba_extent->get_node_meta().depth,
      nlba_extent->get_node_meta().begin,
      nlba_extent->get_paddr()).safe_then(
	[](auto) {},
	rewrite_extent_ertr::pass_further {},
	crimson::ct_error::assert_all{});
  } else {
    // not an lba-managed extent; nothing to rewrite here
    return rewrite_extent_ertr::now();
  }
}

// For gc: fetch an lba-node extent and check whether the tree still
// references its paddr; returns a null ref if the node is dead.
BtreeLBAManager::get_physical_extent_if_live_ret
BtreeLBAManager::get_physical_extent_if_live(
  Transaction &t,
  extent_types_t type,
  paddr_t addr,
  laddr_t laddr,
  segment_off_t len)
{
  ceph_assert(is_lba_node(type));
  return cache.get_extent_by_type(
    t,
    type,
    addr,
    laddr,
    len
  ).safe_then([=, &t](CachedExtentRef extent) {
    return get_root(t).safe_then([=, &t](LBANodeRef root) {
      auto lba_node = extent->cast<LBANode>();
      return root->lookup(
	op_context_t{cache, pin_set, t},
	lba_node->get_node_meta().begin,
	lba_node->get_node_meta().depth).safe_then([=](LBANodeRef c) {
	  if (c->get_paddr() == lba_node->get_paddr()) {
	    return get_physical_extent_if_live_ret(
	      get_physical_extent_if_live_ertr::ready_future_marker{},
	      lba_node);
	  } else {
	    // tree no longer references this paddr: dead node
	    cache.drop_from_cache(lba_node);
	    return get_physical_extent_if_live_ret(
	      get_physical_extent_if_live_ertr::ready_future_marker{},
	      CachedExtentRef());
	  }
	});
    });
  });
}

BtreeLBAManager::BtreeLBAManager(
  SegmentManager &segment_manager,
  Cache &cache)
  : segment_manager(segment_manager),
    cache(cache) {}

// Insert (laddr -> val) into the tree rooted at root, splitting the root
// (and bumping the recorded depth) first if it is full.
BtreeLBAManager::insert_mapping_ret BtreeLBAManager::insert_mapping(
  Transaction &t,
  LBANodeRef root,
  laddr_t laddr,
  lba_map_val_t val)
{
  // default: insert directly into the current root
  auto split = insert_mapping_ertr::future<LBANodeRef>(
    insert_mapping_ertr::ready_future_marker{},
    root);
  if (root->at_max_capacity()) {
    split = cache.get_root(t).safe_then(
      [this, root, laddr, &t](RootBlockRef croot) {
	logger().debug(
	  "BtreeLBAManager::insert_mapping: splitting root {}",
	  *croot);
	{
	  // must mutate the root block through a writeable duplicate
	  auto mut_croot = cache.duplicate_for_write(t, croot);
	  croot = mut_croot->cast<RootBlock>();
	}
	auto nroot = cache.alloc_new_extent<LBAInternalNode>(t, LBA_BLOCK_SIZE);
	lba_node_meta_t meta{0, L_ADDR_MAX, root->get_node_meta().depth + 1};
	nroot->set_meta(meta);
	nroot->pin.set_range(meta);
	nroot->journal_insert(
	  nroot->begin(),
	  L_ADDR_MIN,
	  root->get_paddr(),
	  nullptr);
	croot->get_root().lba_root_addr = nroot->get_paddr();
	croot->get_root().lba_depth = root->get_node_meta().depth + 1;
	// split the old root under the new one; returns the child to insert into
	return nroot->split_entry(
	  get_context(t),
	  laddr, nroot->begin(), root);
      });
  }
  return split.safe_then([this, &t, laddr, val](LBANodeRef node) {
    return node->insert(
      get_context(t),
      laddr, val);
  });
}

// Adjust a mapping's refcount by delta (must not go negative) and report
// the resulting count and paddr.
BtreeLBAManager::update_refcount_ret BtreeLBAManager::update_refcount(
  Transaction &t,
  laddr_t addr,
  int delta)
{
  return update_mapping(
    t,
    addr,
    [delta](const lba_map_val_t &in) {
      lba_map_val_t out = in;
      ceph_assert((int)out.refcount + delta >= 0);
      out.refcount += delta;
      return out;
    }).safe_then([](auto result) {
      return ref_update_result_t{result.refcount, result.paddr};
    });
}

// Apply f to the mapping at addr via the tree's mutate_mapping.
BtreeLBAManager::update_mapping_ret BtreeLBAManager::update_mapping(
  Transaction &t,
  laddr_t addr,
  update_func_t &&f)
{
  return get_root(t
  ).safe_then([this, f=std::move(f), &t, addr](LBANodeRef root) mutable {
    return root->mutate_mapping(
      get_context(t),
      addr,
      std::move(f));
  });
}

// Repoint the internal-node entry for (depth, laddr) at paddr; when depth
// is the root's, update the root block itself and return the old address.
BtreeLBAManager::update_internal_mapping_ret
BtreeLBAManager::update_internal_mapping(
  Transaction &t,
  depth_t depth,
  laddr_t laddr,
  paddr_t paddr)
{
  return cache.get_root(t).safe_then([=, &t](RootBlockRef croot) {
    if (depth == croot->get_root().lba_depth) {
      logger().debug(
	"update_internal_mapping: updating lba root to: {}->{}",
	laddr,
	paddr);
      {
	// root block mutation requires a writeable duplicate
	auto mut_croot = cache.duplicate_for_write(t, croot);
	croot = mut_croot->cast<RootBlock>();
      }
      // the root node always begins at laddr 0
      ceph_assert(laddr == 0);
      auto old_paddr = croot->get_root().lba_root_addr;
      croot->get_root().lba_root_addr = paddr;
      return update_internal_mapping_ret(
	update_internal_mapping_ertr::ready_future_marker{},
	old_paddr);
    } else {
      logger().debug(
	"update_internal_mapping: updating lba node at depth {} to: {}->{}",
	depth,
	laddr,
	paddr);
      return get_lba_btree_extent(
	get_context(t),
	croot->get_root().lba_depth,
	croot->get_root().lba_root_addr,
	paddr_t()).safe_then([=, &t](LBANodeRef broot) {
	  return broot->mutate_internal_address(
	    get_context(t),
	    depth,
	    laddr,
	    paddr);
	});
    }
  });
}

}
diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h
new file mode 100644
index 000000000..640d56734
--- /dev/null
+++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h
@@ -0,0 +1,188 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#pragma once

#include <iostream>

#include <boost/intrusive_ptr.hpp>
#include <boost/smart_ptr/intrusive_ref_counter.hpp>
#include <seastar/core/future.hh>

#include "include/ceph_assert.h"
#include "include/buffer_fwd.h"
#include "include/interval_set.h"
#include "common/interval_map.h"
#include "crimson/osd/exceptions.h"

#include "crimson/os/seastore/seastore_types.h"
#include "crimson/os/seastore/lba_manager.h"
#include "crimson/os/seastore/cache.h"
#include "crimson/os/seastore/segment_manager.h"

#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h"

namespace crimson::os::seastore::lba_manager::btree {

/**
 * BtreeLBAManager
 *
 * Uses a wandering btree to track two things:
 * 1) lba state including laddr_t -> paddr_t mapping
 * 2) reverse paddr_t -> laddr_t mapping for gc (TODO)
 *
 * Generally, any transaction will involve
 * 1) deltas against lba tree nodes
 * 2) new lba tree nodes
 *    - Note, there must necessarily be a delta linking
 *      these new
nodes into the tree -- might be a + * bootstrap_state_t delta if new root + * + * get_mappings, alloc_extent_*, etc populate a Transaction + * which then gets submitted + */ +class BtreeLBAManager : public LBAManager { +public: + BtreeLBAManager( + SegmentManager &segment_manager, + Cache &cache); + + mkfs_ret mkfs( + Transaction &t) final; + + get_mapping_ret get_mapping( + Transaction &t, + laddr_t offset, extent_len_t length) final; + + get_mappings_ret get_mappings( + Transaction &t, + laddr_list_t &&list) final; + + alloc_extent_ret alloc_extent( + Transaction &t, + laddr_t hint, + extent_len_t len, + paddr_t addr) final; + + set_extent_ret set_extent( + Transaction &t, + laddr_t off, extent_len_t len, paddr_t addr) final; + + ref_ret decref_extent( + Transaction &t, + laddr_t addr) final { + return update_refcount(t, addr, -1); + } + + ref_ret incref_extent( + Transaction &t, + laddr_t addr) final { + return update_refcount(t, addr, 1); + } + + complete_transaction_ret complete_transaction( + Transaction &t) final; + + init_cached_extent_ret init_cached_extent( + Transaction &t, + CachedExtentRef e) final; + + scan_mappings_ret scan_mappings( + Transaction &t, + laddr_t begin, + laddr_t end, + scan_mappings_func_t &&f) final; + + scan_mapped_space_ret scan_mapped_space( + Transaction &t, + scan_mapped_space_func_t &&f) final; + + rewrite_extent_ret rewrite_extent( + Transaction &t, + CachedExtentRef extent); + + get_physical_extent_if_live_ret get_physical_extent_if_live( + Transaction &t, + extent_types_t type, + paddr_t addr, + laddr_t laddr, + segment_off_t len) final; + + void add_pin(LBAPin &pin) final { + auto *bpin = reinterpret_cast<BtreeLBAPin*>(&pin); + pin_set.add_pin(bpin->pin); + bpin->parent = nullptr; + } + +private: + SegmentManager &segment_manager; + Cache &cache; + + btree_pin_set_t pin_set; + + op_context_t get_context(Transaction &t) { + return op_context_t{cache, pin_set, t}; + } + + static btree_range_pin_t &get_pin(CachedExtent &e); + 
+ + /** + * get_root + * + * Get a reference to the root LBANode. + */ + using get_root_ertr = Cache::get_extent_ertr; + using get_root_ret = get_root_ertr::future<LBANodeRef>; + get_root_ret get_root(Transaction &); + + /** + * insert_mapping + * + * Insert a lba mapping into the tree + */ + using insert_mapping_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using insert_mapping_ret = insert_mapping_ertr::future<LBAPinRef>; + insert_mapping_ret insert_mapping( + Transaction &t, ///< [in,out] transaction + LBANodeRef root, ///< [in] root node + laddr_t laddr, ///< [in] logical addr to insert + lba_map_val_t val ///< [in] mapping to insert + ); + + /** + * update_refcount + * + * Updates refcount, returns resulting refcount + */ + using update_refcount_ret = ref_ret; + update_refcount_ret update_refcount( + Transaction &t, + laddr_t addr, + int delta); + + /** + * update_mapping + * + * Updates mapping, removes if f returns nullopt + */ + using update_mapping_ertr = ref_ertr; + using update_mapping_ret = ref_ertr::future<lba_map_val_t>; + using update_func_t = LBANode::mutate_func_t; + update_mapping_ret update_mapping( + Transaction &t, + laddr_t addr, + update_func_t &&f); + + using update_internal_mapping_ertr = LBANode::mutate_internal_address_ertr; + using update_internal_mapping_ret = LBANode::mutate_internal_address_ret; + update_internal_mapping_ret update_internal_mapping( + Transaction &t, + depth_t depth, + laddr_t laddr, + paddr_t paddr); +}; +using BtreeLBAManagerRef = std::unique_ptr<BtreeLBAManager>; + +} diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.cc b/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.cc new file mode 100644 index 000000000..a86c3cc57 --- /dev/null +++ b/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.cc @@ -0,0 +1,153 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/common/log.h" + +#include 
"crimson/os/seastore/lba_manager/btree/btree_range_pin.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + +namespace crimson::os::seastore::lba_manager::btree { + +void btree_range_pin_t::take_pin(btree_range_pin_t &other) +{ + assert(other.extent); + assert(other.pins); + other.pins->replace_pin(*this, other); + pins = other.pins; + other.pins = nullptr; + + if (other.has_ref()) { + other.drop_ref(); + acquire_ref(); + } +} + +btree_range_pin_t::~btree_range_pin_t() +{ + assert(!pins == !is_linked()); + assert(!ref); + if (pins) { + logger().debug("{}: removing {}", __func__, *this); + pins->remove_pin(*this, true); + } + extent = nullptr; +} + +void btree_pin_set_t::replace_pin(btree_range_pin_t &to, btree_range_pin_t &from) +{ + pins.replace_node(pins.iterator_to(from), to); +} + +void btree_pin_set_t::remove_pin(btree_range_pin_t &pin, bool do_check_parent) +{ + logger().debug("{}: {}", __func__, pin); + assert(pin.is_linked()); + assert(pin.pins); + assert(!pin.ref); + + pins.erase(pin); + pin.pins = nullptr; + + if (do_check_parent) { + check_parent(pin); + } +} + +btree_range_pin_t *btree_pin_set_t::maybe_get_parent( + const lba_node_meta_t &meta) +{ + auto cmeta = meta; + cmeta.depth++; + auto iter = pins.upper_bound(cmeta, btree_range_pin_t::meta_cmp_t()); + if (iter == pins.begin()) { + return nullptr; + } else { + --iter; + if (iter->range.is_parent_of(meta)) { + return &*iter; + } else { + return nullptr; + } + } +} + +const btree_range_pin_t *btree_pin_set_t::maybe_get_first_child( + const lba_node_meta_t &meta) const +{ + if (meta.depth == 0) { + return nullptr; + } + + auto cmeta = meta; + cmeta.depth--; + + auto iter = pins.lower_bound(cmeta, btree_range_pin_t::meta_cmp_t()); + if (iter == pins.end()) { + return nullptr; + } else if (meta.is_parent_of(iter->range)) { + return &*iter; + } else { + return nullptr; + } +} + +void btree_pin_set_t::release_if_no_children(btree_range_pin_t &pin) 
+{ + assert(pin.is_linked()); + if (maybe_get_first_child(pin.range) == nullptr) { + pin.drop_ref(); + } +} + +void btree_pin_set_t::add_pin(btree_range_pin_t &pin) +{ + assert(!pin.is_linked()); + assert(!pin.pins); + assert(!pin.ref); + + auto [prev, inserted] = pins.insert(pin); + if (!inserted) { + logger().error("{}: unable to add {}, found {}", __func__, pin, *prev); + assert(0 == "impossible"); + return; + } + pin.pins = this; + if (!pin.is_root()) { + auto *parent = maybe_get_parent(pin.range); + assert(parent); + if (!parent->has_ref()) { + logger().debug("{}: acquiring parent {}", __func__, + static_cast<void*>(parent)); + parent->acquire_ref(); + } else { + logger().debug("{}: parent has ref {}", __func__, + static_cast<void*>(parent)); + } + } + if (maybe_get_first_child(pin.range) != nullptr) { + logger().debug("{}: acquiring self {}", __func__, pin); + pin.acquire_ref(); + } +} + +void btree_pin_set_t::retire(btree_range_pin_t &pin) +{ + pin.drop_ref(); + remove_pin(pin, false); +} + +void btree_pin_set_t::check_parent(btree_range_pin_t &pin) +{ + auto parent = maybe_get_parent(pin.range); + if (parent) { + logger().debug("{}: releasing parent {}", __func__, *parent); + release_if_no_children(*parent); + } +} + +} diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.h b/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.h new file mode 100644 index 000000000..3fa218fc8 --- /dev/null +++ b/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.h @@ -0,0 +1,274 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/intrusive/set.hpp> + +#include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/seastore_types.h" + +namespace crimson::os::seastore::lba_manager::btree { + +class LBANode; +using LBANodeRef = TCachedExtentRef<LBANode>; + +struct lba_node_meta_t { + laddr_t begin = 0; + laddr_t end = 0; + depth_t depth = 0; + + 
bool is_parent_of(const lba_node_meta_t &other) const { + return (depth == other.depth + 1) && + (begin <= other.begin) && + (end >= other.end); + } + + std::pair<lba_node_meta_t, lba_node_meta_t> split_into(laddr_t pivot) const { + return std::make_pair( + lba_node_meta_t{begin, pivot, depth}, + lba_node_meta_t{pivot, end, depth}); + } + + static lba_node_meta_t merge_from( + const lba_node_meta_t &lhs, const lba_node_meta_t &rhs) { + assert(lhs.depth == rhs.depth); + return lba_node_meta_t{lhs.begin, rhs.end, lhs.depth}; + } + + static std::pair<lba_node_meta_t, lba_node_meta_t> + rebalance(const lba_node_meta_t &lhs, const lba_node_meta_t &rhs, laddr_t pivot) { + assert(lhs.depth == rhs.depth); + return std::make_pair( + lba_node_meta_t{lhs.begin, pivot, lhs.depth}, + lba_node_meta_t{pivot, rhs.end, lhs.depth}); + } + + bool is_root() const { + return begin == 0 && end == L_ADDR_MAX; + } +}; + +inline std::ostream &operator<<( + std::ostream &lhs, + const lba_node_meta_t &rhs) +{ + return lhs << "btree_node_meta_t(" + << "begin=" << rhs.begin + << ", end=" << rhs.end + << ", depth=" << rhs.depth + << ")"; +} + +/** + * btree_range_pin_t + * + * Element tracked by btree_pin_set_t below. Encapsulates the intrusive_set + * hook, the lba_node_meta_t representing the lba range covered by a node, + * and extent and ref members intended to hold a reference when the extent + * should be pinned. 
+ */ +class btree_pin_set_t; +class btree_range_pin_t : public boost::intrusive::set_base_hook<> { + friend class btree_pin_set_t; + lba_node_meta_t range; + + btree_pin_set_t *pins = nullptr; + + // We need to be able to remember extent without holding a reference, + // but we can do it more compactly -- TODO + CachedExtent *extent = nullptr; + CachedExtentRef ref; + + using index_t = boost::intrusive::set<btree_range_pin_t>; + + static auto get_tuple(const lba_node_meta_t &meta) { + return std::make_tuple(-meta.depth, meta.begin); + } + + void acquire_ref() { + ref = CachedExtentRef(extent); + } + + void drop_ref() { + ref.reset(); + } + +public: + btree_range_pin_t() = default; + btree_range_pin_t(CachedExtent *extent) + : extent(extent) {} + btree_range_pin_t(const btree_range_pin_t &rhs, CachedExtent *extent) + : range(rhs.range), extent(extent) {} + + bool has_ref() const { + return !!ref; + } + + bool is_root() const { + return range.is_root(); + } + + void set_range(const lba_node_meta_t &nrange) { + range = nrange; + } + void set_extent(CachedExtent *nextent) { + assert(!extent); + extent = nextent; + } + + void take_pin(btree_range_pin_t &other); + + friend bool operator<( + const btree_range_pin_t &lhs, const btree_range_pin_t &rhs) { + return get_tuple(lhs.range) < get_tuple(rhs.range); + } + friend bool operator>( + const btree_range_pin_t &lhs, const btree_range_pin_t &rhs) { + return get_tuple(lhs.range) > get_tuple(rhs.range); + } + friend bool operator==( + const btree_range_pin_t &lhs, const btree_range_pin_t &rhs) { + return get_tuple(lhs.range) == rhs.get_tuple(rhs.range); + } + + struct meta_cmp_t { + bool operator()( + const btree_range_pin_t &lhs, const lba_node_meta_t &rhs) const { + return get_tuple(lhs.range) < get_tuple(rhs); + } + bool operator()( + const lba_node_meta_t &lhs, const btree_range_pin_t &rhs) const { + return get_tuple(lhs) < get_tuple(rhs.range); + } + }; + + friend std::ostream &operator<<( + std::ostream &lhs, + const 
btree_range_pin_t &rhs) { + return lhs << "btree_range_pin_t(" + << "begin=" << rhs.range.begin + << ", end=" << rhs.range.end + << ", depth=" << rhs.range.depth + << ", extent=" << rhs.extent + << ")"; + } + + friend class BtreeLBAPin; + ~btree_range_pin_t(); +}; + +/** + * btree_pin_set_t + * + * Ensures that for every cached node, all parent LBANodes required + * to map it are present in cache. Relocating these nodes can + * therefore be done without further reads or cache space. + * + * Contains a btree_range_pin_t for every clean or dirty LBANode + * or LogicalCachedExtent instance in cache at any point in time. + * For any LBANode, the contained btree_range_pin_t will hold + * a reference to that node pinning it in cache as long as that + * node has children in the set. This invariant can be violated + * only by calling retire_extent and is repaired by calling + * check_parent synchronously after adding any new extents. + */ +class btree_pin_set_t { + friend class btree_range_pin_t; + using pins_t = btree_range_pin_t::index_t; + pins_t pins; + + pins_t::iterator get_iter(btree_range_pin_t &pin) { + return pins_t::s_iterator_to(pin); + } + + /// Removes pin from set optionally checking whether parent has other children + void remove_pin(btree_range_pin_t &pin, bool check_parent); + + void replace_pin(btree_range_pin_t &to, btree_range_pin_t &from); + + /// Returns parent pin if exists + btree_range_pin_t *maybe_get_parent(const lba_node_meta_t &pin); + + /// Returns earliest child pin if exist + const btree_range_pin_t *maybe_get_first_child(const lba_node_meta_t &pin) const; + + /// Releases pin if it has no children + void release_if_no_children(btree_range_pin_t &pin); + +public: + /// Adds pin to set, assumes set is consistent + void add_pin(btree_range_pin_t &pin); + + /** + * retire/check_parent + * + * See BtreeLBAManager::complete_transaction. + * retire removes the specified pin from the set, but does not + * check parents. 
After any new extents are added to the set, + * the caller is required to call check_parent to restore the + * invariant. + */ + void retire(btree_range_pin_t &pin); + void check_parent(btree_range_pin_t &pin); + + ~btree_pin_set_t() { + assert(pins.empty()); + } +}; + +class BtreeLBAPin : public LBAPin { + friend class BtreeLBAManager; + + /** + * parent + * + * populated until link_extent is called to ensure cache residence + * until add_pin is called. + */ + CachedExtentRef parent; + + paddr_t paddr; + btree_range_pin_t pin; + +public: + BtreeLBAPin() = default; + + BtreeLBAPin( + CachedExtentRef parent, + paddr_t paddr, + lba_node_meta_t &&meta) + : parent(parent), paddr(paddr) { + pin.set_range(std::move(meta)); + } + + void link_extent(LogicalCachedExtent *ref) final { + pin.set_extent(ref); + } + + extent_len_t get_length() const final { + assert(pin.range.end > pin.range.begin); + return pin.range.end - pin.range.begin; + } + + paddr_t get_paddr() const final { + return paddr; + } + + laddr_t get_laddr() const final { + return pin.range.begin; + } + + LBAPinRef duplicate() const final { + auto ret = std::unique_ptr<BtreeLBAPin>(new BtreeLBAPin); + ret->pin.set_range(pin.range); + ret->paddr = paddr; + return ret; + } + + void take_pin(LBAPin &opin) final { + pin.take_pin(static_cast<BtreeLBAPin&>(opin).pin); + } +}; + +} diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h new file mode 100644 index 000000000..b6f33a1ae --- /dev/null +++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h @@ -0,0 +1,269 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <sys/mman.h> +#include <memory> +#include <string.h> + +#include "crimson/common/log.h" +#include "crimson/os/seastore/lba_manager/btree/btree_range_pin.h" +#include "crimson/os/seastore/lba_manager.h" + +namespace 
crimson::os::seastore::lba_manager::btree { + +struct op_context_t { + Cache &cache; + btree_pin_set_t &pins; + Transaction &trans; +}; + +/** + * lba_map_val_t + * + * struct representing a single lba mapping + */ +struct lba_map_val_t { + extent_len_t len = 0; ///< length of mapping + paddr_t paddr; ///< physical addr of mapping + uint32_t refcount = 0; ///< refcount + uint32_t checksum = 0; ///< checksum of original block written at paddr (TODO) + + lba_map_val_t( + extent_len_t len, + paddr_t paddr, + uint32_t refcount, + uint32_t checksum) + : len(len), paddr(paddr), refcount(refcount), checksum(checksum) {} +}; + +class BtreeLBAPin; +using BtreeLBAPinRef = std::unique_ptr<BtreeLBAPin>; + +/** + * LBANode + * + * Base class enabling recursive lookup between internal and leaf nodes. + */ +struct LBANode : CachedExtent { + using LBANodeRef = TCachedExtentRef<LBANode>; + using lookup_range_ertr = LBAManager::get_mapping_ertr; + using lookup_range_ret = LBAManager::get_mapping_ret; + + btree_range_pin_t pin; + + LBANode(ceph::bufferptr &&ptr) : CachedExtent(std::move(ptr)), pin(this) {} + LBANode(const LBANode &rhs) + : CachedExtent(rhs), pin(rhs.pin, this) {} + + virtual lba_node_meta_t get_node_meta() const = 0; + + /** + * lookup + * + * Returns the node at the specified depth responsible + * for laddr + */ + using lookup_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using lookup_ret = lookup_ertr::future<LBANodeRef>; + virtual lookup_ret lookup( + op_context_t c, + laddr_t addr, + depth_t depth) = 0; + + /** + * lookup_range + * + * Returns mappings within range [addr, addr+len) + */ + virtual lookup_range_ret lookup_range( + op_context_t c, + laddr_t addr, + extent_len_t len) = 0; + + /** + * insert + * + * Recursively inserts into subtree rooted at *this. Caller + * must already have handled splitting if at_max_capacity(). 
+ * + * Precondition: !at_max_capacity() + */ + using insert_ertr = crimson::errorator< + crimson::ct_error::input_output_error + >; + using insert_ret = insert_ertr::future<LBAPinRef>; + virtual insert_ret insert( + op_context_t c, + laddr_t laddr, + lba_map_val_t val) = 0; + + /** + * find_hole + * + * Finds minimum hole of size len in [min, max) + * + * @return addr of hole, L_ADDR_NULL if unfound + */ + using find_hole_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using find_hole_ret = find_hole_ertr::future<laddr_t>; + virtual find_hole_ret find_hole( + op_context_t c, + laddr_t min, + laddr_t max, + extent_len_t len) = 0; + + /** + * scan_mappings + * + * Call f for all mappings in [begin, end) + */ + using scan_mappings_ertr = LBAManager::scan_mappings_ertr; + using scan_mappings_ret = LBAManager::scan_mappings_ret; + using scan_mappings_func_t = LBAManager::scan_mappings_func_t; + virtual scan_mappings_ret scan_mappings( + op_context_t c, + laddr_t begin, + laddr_t end, + scan_mappings_func_t &f) = 0; + + using scan_mapped_space_ertr = LBAManager::scan_mapped_space_ertr; + using scan_mapped_space_ret = LBAManager::scan_mapped_space_ret; + using scan_mapped_space_func_t = LBAManager::scan_mapped_space_func_t; + virtual scan_mapped_space_ret scan_mapped_space( + op_context_t c, + scan_mapped_space_func_t &f) = 0; + + /** + * mutate_mapping + * + * Lookups up laddr, calls f on value. If f returns a value, inserts it. + * If it returns nullopt, removes the value. + * Caller must already have merged if at_min_capacity(). + * + * Recursive calls use mutate_mapping_internal. 
+ * + * Precondition: !at_min_capacity() + */ + using mutate_mapping_ertr = crimson::errorator< + crimson::ct_error::enoent, ///< mapping does not exist + crimson::ct_error::input_output_error + >; + using mutate_mapping_ret = mutate_mapping_ertr::future< + lba_map_val_t>; + using mutate_func_t = std::function< + lba_map_val_t(const lba_map_val_t &v) + >; + virtual mutate_mapping_ret mutate_mapping( + op_context_t c, + laddr_t laddr, + mutate_func_t &&f) = 0; + virtual mutate_mapping_ret mutate_mapping_internal( + op_context_t c, + laddr_t laddr, + bool is_root, + mutate_func_t &&f) = 0; + + /** + * mutate_internal_address + * + * Looks up internal node mapping at laddr, depth and + * updates the mapping to paddr. Returns previous paddr + * (for debugging purposes). + */ + using mutate_internal_address_ertr = crimson::errorator< + crimson::ct_error::enoent, ///< mapping does not exist + crimson::ct_error::input_output_error + >; + using mutate_internal_address_ret = mutate_internal_address_ertr::future< + paddr_t>; + virtual mutate_internal_address_ret mutate_internal_address( + op_context_t c, + depth_t depth, + laddr_t laddr, + paddr_t paddr) = 0; + + /** + * make_split_children + * + * Generates appropriately typed left and right nodes formed from the + * contents of *this. + * + * Returns <left, right, pivot> where pivot is the first value of right. + */ + virtual std::tuple< + LBANodeRef, + LBANodeRef, + laddr_t> + make_split_children( + op_context_t c) = 0; + + /** + * make_full_merge + * + * Returns a single node formed from merging *this and right. + * Precondition: at_min_capacity() && right.at_min_capacity() + */ + virtual LBANodeRef make_full_merge( + op_context_t c, + LBANodeRef &right) = 0; + + /** + * make_balanced + * + * Returns nodes formed by balancing the contents of *this and right. + * + * Returns <left, right, pivot> where pivot is the first value of right. 
+ */ + virtual std::tuple< + LBANodeRef, + LBANodeRef, + laddr_t> + make_balanced( + op_context_t c, + LBANodeRef &right, + bool prefer_left) = 0; + + virtual bool at_max_capacity() const = 0; + virtual bool at_min_capacity() const = 0; + + virtual ~LBANode() = default; + + void on_delta_write(paddr_t record_block_offset) final { + // All in-memory relative addrs are necessarily record-relative + assert(get_prior_instance()); + pin.take_pin(get_prior_instance()->cast<LBANode>()->pin); + resolve_relative_addrs(record_block_offset); + } + + void on_initial_write() final { + // All in-memory relative addrs are necessarily block-relative + resolve_relative_addrs(get_paddr()); + } + + void on_clean_read() final { + // From initial write of block, relative addrs are necessarily block-relative + resolve_relative_addrs(get_paddr()); + } + + virtual void resolve_relative_addrs(paddr_t base) = 0; +}; +using LBANodeRef = LBANode::LBANodeRef; + +/** + * get_lba_btree_extent + * + * Fetches node at depth of the appropriate type. 
+ */ +Cache::get_extent_ertr::future<LBANodeRef> get_lba_btree_extent( + op_context_t c, ///< [in] context structure + depth_t depth, ///< [in] depth of node to fetch + paddr_t offset, ///< [in] physical addr of node + paddr_t base ///< [in] depending on user, block addr or record addr + /// in case offset is relative +); + +} diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.cc b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.cc new file mode 100644 index 000000000..5e400803b --- /dev/null +++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.cc @@ -0,0 +1,701 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <sys/mman.h> +#include <string.h> + +#include <memory> +#include <string.h> + +#include "include/buffer.h" +#include "include/byteorder.h" + +#include "crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + +namespace crimson::os::seastore::lba_manager::btree { + +std::ostream &LBAInternalNode::print_detail(std::ostream &out) const +{ + return out << ", size=" << get_size() + << ", meta=" << get_meta(); +} + +LBAInternalNode::lookup_ret LBAInternalNode::lookup( + op_context_t c, + laddr_t addr, + depth_t depth) +{ + auto meta = get_meta(); + if (depth == get_meta().depth) { + return lookup_ret( + lookup_ertr::ready_future_marker{}, + this); + } + assert(meta.begin <= addr); + assert(meta.end > addr); + auto iter = lower_bound(addr); + return get_lba_btree_extent( + c, + meta.depth - 1, + iter->get_val(), + get_paddr()).safe_then([c, addr, depth](auto child) { + return child->lookup(c, addr, depth); + }).finally([ref=LBANodeRef(this)] {}); +} + +LBAInternalNode::lookup_range_ret LBAInternalNode::lookup_range( + op_context_t c, + laddr_t addr, + extent_len_t len) +{ + auto [begin, end] = bound(addr, addr + len); + auto result_up = 
std::make_unique<lba_pin_list_t>(); + auto &result = *result_up; + return crimson::do_for_each( + std::move(begin), + std::move(end), + [this, c, &result, addr, len](const auto &val) mutable { + return get_lba_btree_extent( + c, + get_meta().depth - 1, + val.get_val(), + get_paddr()).safe_then( + [c, &result, addr, len](auto extent) mutable { + return extent->lookup_range( + c, + addr, + len).safe_then( + [&result](auto pin_list) mutable { + result.splice(result.end(), pin_list, + pin_list.begin(), pin_list.end()); + }); + }); + }).safe_then([result=std::move(result_up), ref=LBANodeRef(this)] { + return lookup_range_ertr::make_ready_future<lba_pin_list_t>( + std::move(*result)); + }); +} + +LBAInternalNode::insert_ret LBAInternalNode::insert( + op_context_t c, + laddr_t laddr, + lba_map_val_t val) +{ + auto insertion_pt = get_containing_child(laddr); + return get_lba_btree_extent( + c, + get_meta().depth - 1, + insertion_pt->get_val(), + get_paddr()).safe_then( + [this, insertion_pt, c, laddr, val=std::move(val)]( + auto extent) mutable { + return extent->at_max_capacity() ? 
+ split_entry(c, laddr, insertion_pt, extent) : + insert_ertr::make_ready_future<LBANodeRef>(std::move(extent)); + }).safe_then([c, laddr, val=std::move(val)]( + LBANodeRef extent) mutable { + return extent->insert(c, laddr, val); + }); +} + +LBAInternalNode::mutate_mapping_ret LBAInternalNode::mutate_mapping( + op_context_t c, + laddr_t laddr, + mutate_func_t &&f) +{ + return mutate_mapping_internal(c, laddr, true, std::move(f)); +} + +LBAInternalNode::mutate_mapping_ret LBAInternalNode::mutate_mapping_internal( + op_context_t c, + laddr_t laddr, + bool is_root, + mutate_func_t &&f) +{ + auto mutation_pt = get_containing_child(laddr); + if (mutation_pt == end()) { + assert(0 == "impossible"); + return crimson::ct_error::enoent::make(); + } + return get_lba_btree_extent( + c, + get_meta().depth - 1, + mutation_pt->get_val(), + get_paddr() + ).safe_then([=](LBANodeRef extent) { + if (extent->at_min_capacity() && get_size() > 1) { + return merge_entry( + c, + laddr, + mutation_pt, + extent, + is_root); + } else { + return merge_ertr::make_ready_future<LBANodeRef>( + std::move(extent)); + } + }).safe_then([c, laddr, f=std::move(f)](LBANodeRef extent) mutable { + return extent->mutate_mapping_internal(c, laddr, false, std::move(f)); + }); +} + +LBAInternalNode::mutate_internal_address_ret LBAInternalNode::mutate_internal_address( + op_context_t c, + depth_t depth, + laddr_t laddr, + paddr_t paddr) +{ + if (get_meta().depth == (depth + 1)) { + if (!is_pending()) { + return c.cache.duplicate_for_write(c.trans, this)->cast<LBAInternalNode>( + )->mutate_internal_address( + c, + depth, + laddr, + paddr); + } + auto iter = get_containing_child(laddr); + if (iter->get_key() != laddr) { + return crimson::ct_error::enoent::make(); + } + + auto old_paddr = iter->get_val(); + + journal_update( + iter, + maybe_generate_relative(paddr), + maybe_get_delta_buffer()); + + return mutate_internal_address_ret( + mutate_internal_address_ertr::ready_future_marker{}, + old_paddr + ); + } 
else { + auto iter = get_containing_child(laddr); + return get_lba_btree_extent( + c, + get_meta().depth - 1, + iter->get_val(), + get_paddr() + ).safe_then([=](auto node) { + return node->mutate_internal_address( + c, + depth, + laddr, + paddr); + }); + } +} + +LBAInternalNode::find_hole_ret LBAInternalNode::find_hole( + op_context_t c, + laddr_t min_addr, + laddr_t max_addr, + extent_len_t len) +{ + logger().debug( + "LBAInternalNode::find_hole min={}, max={}, len={}, *this={}", + min_addr, max_addr, len, *this); + auto [begin, end] = bound(min_addr, max_addr); + return seastar::repeat_until_value( + [i=begin, e=end, c, min_addr, len, this]() mutable { + if (i == e) { + return seastar::make_ready_future<std::optional<laddr_t>>( + std::make_optional<laddr_t>(L_ADDR_NULL)); + } + return get_lba_btree_extent(c, + get_meta().depth - 1, + i->get_val(), + get_paddr()).safe_then( + [c, min_addr, len, i](auto extent) mutable { + auto lb = std::max(min_addr, i->get_key()); + auto ub = i->get_next_key_or_max(); + logger().debug("LBAInternalNode::find_hole extent {} lb {} ub {}", + *extent, lb, ub); + return extent->find_hole(c, lb, ub, len); + }).safe_then([&i](auto addr) mutable -> std::optional<laddr_t> { + if (addr == L_ADDR_NULL) { + ++i; + return {}; + } else { + return addr; + } + }, + // TODO: GCC enters a dead loop if crimson::do_until() is used + // or erroratorized future is returned + crimson::ct_error::assert_all{ "fix me - APIv6" }); + }); +} + +LBAInternalNode::scan_mappings_ret LBAInternalNode::scan_mappings( + op_context_t c, + laddr_t begin, + laddr_t end, + scan_mappings_func_t &f) +{ + auto [biter, eiter] = bound(begin, end); + return crimson::do_for_each( + std::move(biter), + std::move(eiter), + [=, &f](auto &viter) { + return get_lba_btree_extent( + c, + get_meta().depth - 1, + viter->get_val(), + get_paddr()).safe_then([=, &f](auto child) { + return child->scan_mappings(c, begin, end, f); + }); + }).safe_then([ref=LBANodeRef(this)]{}); +} + 
+LBAInternalNode::scan_mapped_space_ret LBAInternalNode::scan_mapped_space( + op_context_t c, + scan_mapped_space_func_t &f) +{ + f(get_paddr(), get_length()); + return crimson::do_for_each( + begin(), end(), + [=, &f](auto &viter) { + return get_lba_btree_extent( + c, + get_meta().depth - 1, + viter->get_val(), + get_paddr()).safe_then([=, &f](auto child) { + return child->scan_mapped_space(c, f); + }); + }).safe_then([ref=LBANodeRef(this)]{}); +} + + +void LBAInternalNode::resolve_relative_addrs(paddr_t base) +{ + for (auto i: *this) { + if (i->get_val().is_relative()) { + auto updated = base.add_relative(i->get_val()); + logger().debug( + "LBAInternalNode::resolve_relative_addrs {} -> {}", + i->get_val(), + updated); + i->set_val(updated); + } + } +} + + +LBAInternalNode::split_ret +LBAInternalNode::split_entry( + op_context_t c, + laddr_t addr, + internal_iterator_t iter, LBANodeRef entry) +{ + if (!is_pending()) { + auto mut = c.cache.duplicate_for_write( + c.trans, this)->cast<LBAInternalNode>(); + auto mut_iter = mut->iter_idx(iter->get_offset()); + return mut->split_entry(c, addr, mut_iter, entry); + } + + ceph_assert(!at_max_capacity()); + auto [left, right, pivot] = entry->make_split_children(c); + + journal_update( + iter, + maybe_generate_relative(left->get_paddr()), + maybe_get_delta_buffer()); + journal_insert( + iter + 1, + pivot, + maybe_generate_relative(right->get_paddr()), + maybe_get_delta_buffer()); + + c.cache.retire_extent(c.trans, entry); + + logger().debug( + "LBAInternalNode::split_entry *this {} entry {} into left {} right {}", + *this, + *entry, + *left, + *right); + + return split_ertr::make_ready_future<LBANodeRef>( + pivot > addr ? 
left : right + ); +} + +LBAInternalNode::merge_ret +LBAInternalNode::merge_entry( + op_context_t c, + laddr_t addr, + internal_iterator_t iter, + LBANodeRef entry, + bool is_root) +{ + if (!is_pending()) { + auto mut = c.cache.duplicate_for_write(c.trans, this)->cast<LBAInternalNode>(); + auto mut_iter = mut->iter_idx(iter->get_offset()); + return mut->merge_entry(c, addr, mut_iter, entry, is_root); + } + + logger().debug( + "LBAInternalNode: merge_entry: {}, {}", + *this, + *entry); + auto donor_is_left = (iter + 1) == end(); + auto donor_iter = donor_is_left ? iter - 1 : iter + 1; + return get_lba_btree_extent( + c, + get_meta().depth - 1, + donor_iter->get_val(), + get_paddr() + ).safe_then([=](auto donor) mutable { + auto [l, r] = donor_is_left ? + std::make_pair(donor, entry) : std::make_pair(entry, donor); + auto [liter, riter] = donor_is_left ? + std::make_pair(donor_iter, iter) : std::make_pair(iter, donor_iter); + if (donor->at_min_capacity()) { + auto replacement = l->make_full_merge( + c, + r); + + journal_update( + liter, + maybe_generate_relative(replacement->get_paddr()), + maybe_get_delta_buffer()); + journal_remove(riter, maybe_get_delta_buffer()); + + c.cache.retire_extent(c.trans, l); + c.cache.retire_extent(c.trans, r); + + if (is_root && get_size() == 1) { + return c.cache.get_root(c.trans).safe_then([=](RootBlockRef croot) { + { + auto mut_croot = c.cache.duplicate_for_write(c.trans, croot); + croot = mut_croot->cast<RootBlock>(); + } + croot->root.lba_root_addr = begin()->get_val(); + logger().debug( + "LBAInternalNode::merge_entry: collapsing root {} to addr {}", + *this, + begin()->get_val()); + croot->root.lba_depth = get_meta().depth - 1; + c.cache.retire_extent(c.trans, this); + return merge_ertr::make_ready_future<LBANodeRef>(replacement); + }); + } else { + return merge_ertr::make_ready_future<LBANodeRef>(replacement); + } + } else { + logger().debug( + "LBAInternalEntry::merge_entry balanced l {} r {}", + *l, + *r); + auto 
[replacement_l, replacement_r, pivot] = + l->make_balanced( + c, + r, + !donor_is_left); + + journal_update( + liter, + maybe_generate_relative(replacement_l->get_paddr()), + maybe_get_delta_buffer()); + journal_replace( + riter, + pivot, + maybe_generate_relative(replacement_r->get_paddr()), + maybe_get_delta_buffer()); + + c.cache.retire_extent(c.trans, l); + c.cache.retire_extent(c.trans, r); + return merge_ertr::make_ready_future<LBANodeRef>( + addr >= pivot ? replacement_r : replacement_l + ); + } + }); +} + + +LBAInternalNode::internal_iterator_t +LBAInternalNode::get_containing_child(laddr_t laddr) +{ + // TODO: binary search + for (auto i = begin(); i != end(); ++i) { + if (i.contains(laddr)) + return i; + } + ceph_assert(0 == "invalid"); + return end(); +} + +std::ostream &LBALeafNode::print_detail(std::ostream &out) const +{ + return out << ", size=" << get_size() + << ", meta=" << get_meta(); +} + +LBALeafNode::lookup_range_ret LBALeafNode::lookup_range( + op_context_t c, + laddr_t addr, + extent_len_t len) +{ + logger().debug( + "LBALeafNode::lookup_range {}~{}", + addr, + len); + auto ret = lba_pin_list_t(); + auto [i, end] = get_leaf_entries(addr, len); + for (; i != end; ++i) { + auto val = i->get_val(); + auto begin = i->get_key(); + ret.emplace_back( + std::make_unique<BtreeLBAPin>( + this, + val.paddr.maybe_relative_to(get_paddr()), + lba_node_meta_t{ begin, begin + val.len, 0})); + } + return lookup_range_ertr::make_ready_future<lba_pin_list_t>( + std::move(ret)); +} + +LBALeafNode::insert_ret LBALeafNode::insert( + op_context_t c, + laddr_t laddr, + lba_map_val_t val) +{ + ceph_assert(!at_max_capacity()); + + if (!is_pending()) { + return c.cache.duplicate_for_write(c.trans, this + )->cast<LBALeafNode>()->insert(c, laddr, val); + } + + val.paddr = maybe_generate_relative(val.paddr); + logger().debug( + "LBALeafNode::insert: inserting {}~{} -> {}", + laddr, + val.len, + val.paddr); + + auto insert_pt = lower_bound(laddr); + 
journal_insert(insert_pt, laddr, val, maybe_get_delta_buffer()); + + logger().debug( + "LBALeafNode::insert: inserted {}~{} -> {}", + insert_pt.get_key(), + insert_pt.get_val().len, + insert_pt.get_val().paddr); + auto begin = insert_pt.get_key(); + return insert_ret( + insert_ertr::ready_future_marker{}, + std::make_unique<BtreeLBAPin>( + this, + val.paddr.maybe_relative_to(get_paddr()), + lba_node_meta_t{ begin, begin + val.len, 0})); +} + +LBALeafNode::mutate_mapping_ret LBALeafNode::mutate_mapping( + op_context_t c, + laddr_t laddr, + mutate_func_t &&f) +{ + return mutate_mapping_internal(c, laddr, true, std::move(f)); +} + +LBALeafNode::mutate_mapping_ret LBALeafNode::mutate_mapping_internal( + op_context_t c, + laddr_t laddr, + bool is_root, + mutate_func_t &&f) +{ + auto mutation_pt = find(laddr); + if (mutation_pt == end()) { + return crimson::ct_error::enoent::make(); + } + + if (!is_pending()) { + return c.cache.duplicate_for_write(c.trans, this)->cast<LBALeafNode>( + )->mutate_mapping_internal( + c, + laddr, + is_root, + std::move(f)); + } + + auto cur = mutation_pt.get_val(); + auto mutated = f(cur); + + mutated.paddr = maybe_generate_relative(mutated.paddr); + + logger().debug( + "{}: mutate addr {}: {} -> {}", + __func__, + laddr, + cur.paddr, + mutated.paddr); + + if (mutated.refcount > 0) { + journal_update(mutation_pt, mutated, maybe_get_delta_buffer()); + return mutate_mapping_ret( + mutate_mapping_ertr::ready_future_marker{}, + mutated); + } else { + journal_remove(mutation_pt, maybe_get_delta_buffer()); + return mutate_mapping_ret( + mutate_mapping_ertr::ready_future_marker{}, + mutated); + } +} + +LBALeafNode::mutate_internal_address_ret LBALeafNode::mutate_internal_address( + op_context_t c, + depth_t depth, + laddr_t laddr, + paddr_t paddr) +{ + ceph_assert(0 == "Impossible"); + return mutate_internal_address_ret( + mutate_internal_address_ertr::ready_future_marker{}, + paddr); +} + +LBALeafNode::find_hole_ret LBALeafNode::find_hole( + 
op_context_t c, + laddr_t min, + laddr_t max, + extent_len_t len) +{ + logger().debug( + "LBALeafNode::find_hole min={} max={}, len={}, *this={}", + min, max, len, *this); + auto [liter, uiter] = bound(min, max); + for (auto i = liter; i != uiter; ++i) { + auto ub = i->get_key(); + if (min + len <= ub) { + return find_hole_ret( + find_hole_ertr::ready_future_marker{}, + min); + } else { + min = i->get_key() + i->get_val().len; + } + } + if (min + len <= max) { + return find_hole_ret( + find_hole_ertr::ready_future_marker{}, + min); + } else { + return find_hole_ret( + find_hole_ertr::ready_future_marker{}, + L_ADDR_MAX); + } +} + +LBALeafNode::scan_mappings_ret LBALeafNode::scan_mappings( + op_context_t c, + laddr_t begin, + laddr_t end, + scan_mappings_func_t &f) +{ + auto [biter, eiter] = bound(begin, end); + for (auto i = biter; i != eiter; ++i) { + auto val = i->get_val(); + f(i->get_key(), val.paddr, val.len); + } + return scan_mappings_ertr::now(); +} + +LBALeafNode::scan_mapped_space_ret LBALeafNode::scan_mapped_space( + op_context_t c, + scan_mapped_space_func_t &f) +{ + f(get_paddr(), get_length()); + for (auto i = begin(); i != end(); ++i) { + auto val = i->get_val(); + f(val.paddr, val.len); + } + return scan_mappings_ertr::now(); +} + + +void LBALeafNode::resolve_relative_addrs(paddr_t base) +{ + for (auto i: *this) { + if (i->get_val().paddr.is_relative()) { + auto val = i->get_val(); + val.paddr = base.add_relative(val.paddr); + logger().debug( + "LBALeafNode::resolve_relative_addrs {} -> {}", + i->get_val().paddr, + val.paddr); + i->set_val(val); + } + } +} + +std::pair<LBALeafNode::internal_iterator_t, LBALeafNode::internal_iterator_t> +LBALeafNode::get_leaf_entries(laddr_t addr, extent_len_t len) +{ + return bound(addr, addr + len); +} + +Cache::get_extent_ertr::future<LBANodeRef> get_lba_btree_extent( + op_context_t c, + depth_t depth, + paddr_t offset, + paddr_t base) +{ + offset = offset.maybe_relative_to(base); + ceph_assert(depth > 0); + if 
(depth > 1) { + logger().debug( + "get_lba_btree_extent: reading internal at offset {}, depth {}", + offset, + depth); + return c.cache.get_extent<LBAInternalNode>( + c.trans, + offset, + LBA_BLOCK_SIZE).safe_then([c](auto ret) { + auto meta = ret->get_meta(); + if (ret->get_size()) { + ceph_assert(meta.begin <= ret->begin()->get_key()); + ceph_assert(meta.end > (ret->end() - 1)->get_key()); + } + if (!ret->is_pending() && !ret->pin.is_linked()) { + ret->pin.set_range(meta); + c.pins.add_pin(ret->pin); + } + return LBANodeRef(ret.detach(), /* add_ref = */ false); + }); + } else { + logger().debug( + "get_lba_btree_extent: reading leaf at offset {}, depth {}", + offset, + depth); + return c.cache.get_extent<LBALeafNode>( + c.trans, + offset, + LBA_BLOCK_SIZE).safe_then([offset, c](auto ret) { + logger().debug( + "get_lba_btree_extent: read leaf at offset {} {}", + offset, + *ret); + auto meta = ret->get_meta(); + if (ret->get_size()) { + ceph_assert(meta.begin <= ret->begin()->get_key()); + ceph_assert(meta.end > (ret->end() - 1)->get_key()); + } + if (!ret->is_pending() && !ret->pin.is_linked()) { + ret->pin.set_range(meta); + c.pins.add_pin(ret->pin); + } + return LBANodeRef(ret.detach(), /* add_ref = */ false); + }); + } +} + +} diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h new file mode 100644 index 000000000..230eef682 --- /dev/null +++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h @@ -0,0 +1,555 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <sys/mman.h> +#include <string.h> + +#include <memory> +#include <string.h> + +#include "include/buffer.h" + +#include "crimson/common/fixed_kv_node_layout.h" +#include "crimson/common/errorator.h" +#include "crimson/os/seastore/lba_manager.h" +#include "crimson/os/seastore/seastore_types.h" +#include 
"crimson/os/seastore/cache.h" +#include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h" +#include "crimson/os/seastore/lba_manager/btree/btree_range_pin.h" + +namespace crimson::os::seastore::lba_manager::btree { + +constexpr size_t LBA_BLOCK_SIZE = 4096; + +/** + * lba_node_meta_le_t + * + * On disk layout for lba_node_meta_t + */ +struct lba_node_meta_le_t { + laddr_le_t begin = laddr_le_t(0); + laddr_le_t end = laddr_le_t(0); + depth_le_t depth = init_les32(0); + + lba_node_meta_le_t() = default; + lba_node_meta_le_t(const lba_node_meta_le_t &) = default; + explicit lba_node_meta_le_t(const lba_node_meta_t &val) + : begin(init_le64(val.begin)), + end(init_le64(val.end)), + depth(init_les32(val.depth)) {} + + operator lba_node_meta_t() const { + return lba_node_meta_t{ begin, end, depth }; + } +}; + + +/** + * LBAInternalNode + * + * Abstracts operations on and layout of internal nodes for the + * LBA Tree. + * + * Layout (4k): + * size : uint32_t[1] 4b + * (padding) : 4b + * meta : lba_node_meta_le_t[3] (1*24)b + * keys : laddr_t[255] (254*8)b + * values : paddr_t[255] (254*8)b + * = 4096 + + * TODO: make the above capacity calculation part of FixedKVNodeLayout + * TODO: the above alignment probably isn't portable without further work + */ +constexpr size_t INTERNAL_NODE_CAPACITY = 254; +struct LBAInternalNode + : LBANode, + common::FixedKVNodeLayout< + INTERNAL_NODE_CAPACITY, + lba_node_meta_t, lba_node_meta_le_t, + laddr_t, laddr_le_t, + paddr_t, paddr_le_t> { + using internal_iterator_t = const_iterator; + template <typename... T> + LBAInternalNode(T&&... 
t) : + LBANode(std::forward<T>(t)...), + FixedKVNodeLayout(get_bptr().c_str()) {} + + static constexpr extent_types_t type = extent_types_t::LADDR_INTERNAL; + + lba_node_meta_t get_node_meta() const final { return get_meta(); } + + CachedExtentRef duplicate_for_write() final { + assert(delta_buffer.empty()); + return CachedExtentRef(new LBAInternalNode(*this)); + }; + + delta_buffer_t delta_buffer; + delta_buffer_t *maybe_get_delta_buffer() { + return is_mutation_pending() ? &delta_buffer : nullptr; + } + + lookup_ret lookup(op_context_t c, laddr_t addr, depth_t depth) final; + + lookup_range_ret lookup_range( + op_context_t c, + laddr_t addr, + extent_len_t len) final; + + insert_ret insert( + op_context_t c, + laddr_t laddr, + lba_map_val_t val) final; + + mutate_mapping_ret mutate_mapping( + op_context_t c, + laddr_t laddr, + mutate_func_t &&f) final; + mutate_mapping_ret mutate_mapping_internal( + op_context_t c, + laddr_t laddr, + bool is_root, + mutate_func_t &&f) final; + + mutate_internal_address_ret mutate_internal_address( + op_context_t c, + depth_t depth, + laddr_t laddr, + paddr_t paddr) final; + + find_hole_ret find_hole( + op_context_t c, + laddr_t min, + laddr_t max, + extent_len_t len) final; + + scan_mappings_ret scan_mappings( + op_context_t c, + laddr_t begin, + laddr_t end, + scan_mappings_func_t &f) final; + + scan_mapped_space_ret scan_mapped_space( + op_context_t c, + scan_mapped_space_func_t &f) final; + + std::tuple<LBANodeRef, LBANodeRef, laddr_t> + make_split_children(op_context_t c) final { + auto left = c.cache.alloc_new_extent<LBAInternalNode>( + c.trans, LBA_BLOCK_SIZE); + auto right = c.cache.alloc_new_extent<LBAInternalNode>( + c.trans, LBA_BLOCK_SIZE); + auto pivot = split_into(*left, *right); + left->pin.set_range(left->get_meta()); + right->pin.set_range(right->get_meta()); + return std::make_tuple( + left, + right, + pivot); + } + + LBANodeRef make_full_merge( + op_context_t c, + LBANodeRef &right) final { + auto replacement = 
c.cache.alloc_new_extent<LBAInternalNode>( + c.trans, LBA_BLOCK_SIZE); + replacement->merge_from(*this, *right->cast<LBAInternalNode>()); + replacement->pin.set_range(replacement->get_meta()); + return replacement; + } + + std::tuple<LBANodeRef, LBANodeRef, laddr_t> + make_balanced( + op_context_t c, + LBANodeRef &_right, + bool prefer_left) final { + ceph_assert(_right->get_type() == type); + auto &right = *_right->cast<LBAInternalNode>(); + auto replacement_left = c.cache.alloc_new_extent<LBAInternalNode>( + c.trans, LBA_BLOCK_SIZE); + auto replacement_right = c.cache.alloc_new_extent<LBAInternalNode>( + c.trans, LBA_BLOCK_SIZE); + + auto pivot = balance_into_new_nodes( + *this, + right, + prefer_left, + *replacement_left, + *replacement_right); + + replacement_left->pin.set_range(replacement_left->get_meta()); + replacement_right->pin.set_range(replacement_right->get_meta()); + return std::make_tuple( + replacement_left, + replacement_right, + pivot); + } + + /** + * Internal relative addresses on read or in memory prior to commit + * are either record or block relative depending on whether this + * physical node is is_initial_pending() or just is_pending(). + * + * User passes appropriate base depending on lifecycle and + * resolve_relative_addrs fixes up relative internal references + * based on base. 
+ */ + void resolve_relative_addrs(paddr_t base) final; + void node_resolve_vals(iterator from, iterator to) const final { + if (is_initial_pending()) { + for (auto i = from; i != to; ++i) { + if (i->get_val().is_relative()) { + assert(i->get_val().is_block_relative()); + i->set_val(get_paddr().add_relative(i->get_val())); + } + } + } + } + void node_unresolve_vals(iterator from, iterator to) const final { + if (is_initial_pending()) { + for (auto i = from; i != to; ++i) { + if (i->get_val().is_relative()) { + assert(i->get_val().is_record_relative()); + i->set_val(i->get_val() - get_paddr()); + } + } + } + } + + extent_types_t get_type() const final { + return type; + } + + std::ostream &print_detail(std::ostream &out) const final; + + ceph::bufferlist get_delta() final { + assert(!delta_buffer.empty()); + ceph::buffer::ptr bptr(delta_buffer.get_bytes()); + delta_buffer.copy_out(bptr.c_str(), bptr.length()); + ceph::bufferlist bl; + bl.push_back(bptr); + return bl; + } + + void apply_delta_and_adjust_crc( + paddr_t base, const ceph::bufferlist &_bl) final { + assert(_bl.length()); + ceph::bufferlist bl = _bl; + bl.rebuild(); + delta_buffer_t buffer; + buffer.copy_in(bl.front().c_str(), bl.front().length()); + buffer.replay(*this); + set_last_committed_crc(get_crc32c()); + resolve_relative_addrs(base); + } + + bool at_max_capacity() const final { + return get_size() == get_capacity(); + } + + bool at_min_capacity() const { + return get_size() == (get_capacity() / 2); + } + + /// returns iterators containing [l, r) + std::pair<internal_iterator_t, internal_iterator_t> bound( + laddr_t l, laddr_t r) { + // TODO: inefficient + auto retl = begin(); + for (; retl != end(); ++retl) { + if (retl->get_next_key_or_max() > l) + break; + } + auto retr = retl; + for (; retr != end(); ++retr) { + if (retr->get_key() >= r) + break; + } + return std::make_pair(retl, retr); + } + + using split_ertr = crimson::errorator< + crimson::ct_error::input_output_error + >; + using 
split_ret = split_ertr::future<LBANodeRef>; + split_ret split_entry( + op_context_t c, + laddr_t addr, + internal_iterator_t, + LBANodeRef entry); + + using merge_ertr = crimson::errorator< + crimson::ct_error::input_output_error + >; + using merge_ret = merge_ertr::future<LBANodeRef>; + merge_ret merge_entry( + op_context_t c, + laddr_t addr, + internal_iterator_t, + LBANodeRef entry, + bool is_root); + + /// returns iterator for subtree containing laddr + internal_iterator_t get_containing_child(laddr_t laddr); +}; + +/** + * LBALeafNode + * + * Abstracts operations on and layout of leaf nodes for the + * LBA Tree. + * + * Layout (4k): + * size : uint32_t[1] 4b + * (padding) : 4b + * meta : lba_node_meta_le_t[3] (1*24)b + * keys : laddr_t[170] (145*8)b + * values : lba_map_val_t[170] (145*20)b + * = 4092 + * + * TODO: update FixedKVNodeLayout to handle the above calculation + * TODO: the above alignment probably isn't portable without further work + */ +constexpr size_t LEAF_NODE_CAPACITY = 145; + +/** + * lba_map_val_le_t + * + * On disk layout for lba_map_val_t. + */ +struct lba_map_val_le_t { + extent_len_le_t len = init_extent_len_le_t(0); + paddr_le_t paddr; + ceph_le32 refcount = init_le32(0); + ceph_le32 checksum = init_le32(0); + + lba_map_val_le_t() = default; + lba_map_val_le_t(const lba_map_val_le_t &) = default; + explicit lba_map_val_le_t(const lba_map_val_t &val) + : len(init_extent_len_le_t(val.len)), + paddr(paddr_le_t(val.paddr)), + refcount(init_le32(val.refcount)), + checksum(init_le32(val.checksum)) {} + + operator lba_map_val_t() const { + return lba_map_val_t{ len, paddr, refcount, checksum }; + } +}; + +struct LBALeafNode + : LBANode, + common::FixedKVNodeLayout< + LEAF_NODE_CAPACITY, + lba_node_meta_t, lba_node_meta_le_t, + laddr_t, laddr_le_t, + lba_map_val_t, lba_map_val_le_t> { + using internal_iterator_t = const_iterator; + template <typename... T> + LBALeafNode(T&&... 
t) : + LBANode(std::forward<T>(t)...), + FixedKVNodeLayout(get_bptr().c_str()) {} + + static constexpr extent_types_t type = extent_types_t::LADDR_LEAF; + + lba_node_meta_t get_node_meta() const final { return get_meta(); } + + CachedExtentRef duplicate_for_write() final { + assert(delta_buffer.empty()); + return CachedExtentRef(new LBALeafNode(*this)); + }; + + delta_buffer_t delta_buffer; + delta_buffer_t *maybe_get_delta_buffer() { + return is_mutation_pending() ? &delta_buffer : nullptr; + } + + lookup_ret lookup(op_context_t c, laddr_t addr, depth_t depth) final + { + return lookup_ret( + lookup_ertr::ready_future_marker{}, + this); + } + + lookup_range_ret lookup_range( + op_context_t c, + laddr_t addr, + extent_len_t len) final; + + insert_ret insert( + op_context_t c, + laddr_t laddr, + lba_map_val_t val) final; + + mutate_mapping_ret mutate_mapping( + op_context_t c, + laddr_t laddr, + mutate_func_t &&f) final; + mutate_mapping_ret mutate_mapping_internal( + op_context_t c, + laddr_t laddr, + bool is_root, + mutate_func_t &&f) final; + + mutate_internal_address_ret mutate_internal_address( + op_context_t c, + depth_t depth, + laddr_t laddr, + paddr_t paddr) final; + + find_hole_ret find_hole( + op_context_t c, + laddr_t min, + laddr_t max, + extent_len_t len) final; + + scan_mappings_ret scan_mappings( + op_context_t c, + laddr_t begin, + laddr_t end, + scan_mappings_func_t &f) final; + + scan_mapped_space_ret scan_mapped_space( + op_context_t c, + scan_mapped_space_func_t &f) final; + + std::tuple<LBANodeRef, LBANodeRef, laddr_t> + make_split_children(op_context_t c) final { + auto left = c.cache.alloc_new_extent<LBALeafNode>( + c.trans, LBA_BLOCK_SIZE); + auto right = c.cache.alloc_new_extent<LBALeafNode>( + c.trans, LBA_BLOCK_SIZE); + auto pivot = split_into(*left, *right); + left->pin.set_range(left->get_meta()); + right->pin.set_range(right->get_meta()); + return std::make_tuple( + left, + right, + pivot); + } + + LBANodeRef make_full_merge( + 
op_context_t c, + LBANodeRef &right) final { + auto replacement = c.cache.alloc_new_extent<LBALeafNode>( + c.trans, LBA_BLOCK_SIZE); + replacement->merge_from(*this, *right->cast<LBALeafNode>()); + replacement->pin.set_range(replacement->get_meta()); + return replacement; + } + + std::tuple<LBANodeRef, LBANodeRef, laddr_t> + make_balanced( + op_context_t c, + LBANodeRef &_right, + bool prefer_left) final { + ceph_assert(_right->get_type() == type); + auto &right = *_right->cast<LBALeafNode>(); + auto replacement_left = c.cache.alloc_new_extent<LBALeafNode>( + c.trans, LBA_BLOCK_SIZE); + auto replacement_right = c.cache.alloc_new_extent<LBALeafNode>( + c.trans, LBA_BLOCK_SIZE); + + auto pivot = balance_into_new_nodes( + *this, + right, + prefer_left, + *replacement_left, + *replacement_right); + + replacement_left->pin.set_range(replacement_left->get_meta()); + replacement_right->pin.set_range(replacement_right->get_meta()); + return std::make_tuple( + replacement_left, + replacement_right, + pivot); + } + + // See LBAInternalNode, same concept + void resolve_relative_addrs(paddr_t base) final; + void node_resolve_vals(iterator from, iterator to) const final { + if (is_initial_pending()) { + for (auto i = from; i != to; ++i) { + auto val = i->get_val(); + if (val.paddr.is_relative()) { + assert(val.paddr.is_block_relative()); + val.paddr = get_paddr().add_relative(val.paddr); + i->set_val(val); + } + } + } + } + void node_unresolve_vals(iterator from, iterator to) const final { + if (is_initial_pending()) { + for (auto i = from; i != to; ++i) { + auto val = i->get_val(); + if (val.paddr.is_relative()) { + auto val = i->get_val(); + assert(val.paddr.is_record_relative()); + val.paddr = val.paddr - get_paddr(); + i->set_val(val); + } + } + } + } + + ceph::bufferlist get_delta() final { + assert(!delta_buffer.empty()); + ceph::buffer::ptr bptr(delta_buffer.get_bytes()); + delta_buffer.copy_out(bptr.c_str(), bptr.length()); + ceph::bufferlist bl; + bl.push_back(bptr); + 
return bl; + } + + void apply_delta_and_adjust_crc( + paddr_t base, const ceph::bufferlist &_bl) final { + assert(_bl.length()); + ceph::bufferlist bl = _bl; + bl.rebuild(); + delta_buffer_t buffer; + buffer.copy_in(bl.front().c_str(), bl.front().length()); + buffer.replay(*this); + set_last_committed_crc(get_crc32c()); + resolve_relative_addrs(base); + } + + extent_types_t get_type() const final { + return type; + } + + std::ostream &print_detail(std::ostream &out) const final; + + bool at_max_capacity() const final { + return get_size() == get_capacity(); + } + + bool at_min_capacity() const final { + return get_size() == (get_capacity() / 2); + } + + /// returns iterators <lb, ub> containing addresses [l, r) + std::pair<internal_iterator_t, internal_iterator_t> bound( + laddr_t l, laddr_t r) { + // TODO: inefficient + auto retl = begin(); + for (; retl != end(); ++retl) { + if (retl->get_key() >= l || (retl->get_key() + retl->get_val().len) > l) + break; + } + auto retr = retl; + for (; retr != end(); ++retr) { + if (retr->get_key() >= r) + break; + } + return std::make_pair(retl, retr); + } + + std::pair<internal_iterator_t, internal_iterator_t> + get_leaf_entries(laddr_t addr, extent_len_t len); +}; +using LBALeafNodeRef = TCachedExtentRef<LBALeafNode>; + +} |