Diffstat (limited to 'src/crimson/os')
100 files changed, 25740 insertions, 0 deletions
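The series adds three FuturizedStore backends under src/crimson/os: cyanstore, a memstore-style in-memory store; alienstore, which runs the classic BlueStore on dedicated POSIX threads and bridges results back into the seastar reactor; and seastore. The pattern that recurs throughout alienstore below is to push a blocking ObjectStore call onto the ThreadPool and resolve a seastar future once it returns. A minimal sketch of that round trip, assuming the thread_pool.h introduced in this diff, a running reactor, and a ThreadPool that has already completed start(); blocking_mount() is a hypothetical stand-in for a real ObjectStore call, not part of the patch:

#include <cassert>
#include <unistd.h>
#include <seastar/core/future.hh>
#include "crimson/os/alienstore/thread_pool.h"

// Hypothetical blocking routine standing in for a classic ObjectStore call
// such as store->mount(); it may sleep or touch the disk freely because it
// never runs on a reactor thread.
static int blocking_mount()
{
  ::usleep(1000);
  return 0;
}

// submit() queues the lambda for one of the pool's threads and returns a
// seastar future that resolves on the submitting shard once the lambda has
// finished.
seastar::future<> mount_on_alien_thread(crimson::os::ThreadPool& tp)
{
  return tp.submit([] {
    return blocking_mount();
  }).then([] (int r) {
    assert(r == 0);
    return seastar::make_ready_future<>();
  });
}

The same shape appears in AlienStore::mount(), umount() and mkfs() in the patch below, each asserting a zero return code before resolving the future.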
diff --git a/src/crimson/os/CMakeLists.txt b/src/crimson/os/CMakeLists.txt new file mode 100644 index 000000000..f221dd7c1 --- /dev/null +++ b/src/crimson/os/CMakeLists.txt @@ -0,0 +1,15 @@ +add_library(crimson-os STATIC + futurized_store.cc + ${PROJECT_SOURCE_DIR}/src/os/Transaction.cc) +add_subdirectory(cyanstore) + +if(WITH_BLUESTORE) + add_subdirectory(alienstore) +endif() + +add_subdirectory(seastore) +target_link_libraries(crimson-os + crimson-cyanstore + crimson-alienstore + crimson-seastore + crimson) diff --git a/src/crimson/os/alienstore/CMakeLists.txt b/src/crimson/os/alienstore/CMakeLists.txt new file mode 100644 index 000000000..659a3c6ce --- /dev/null +++ b/src/crimson/os/alienstore/CMakeLists.txt @@ -0,0 +1,76 @@ +include_directories(SYSTEM "${CMAKE_SOURCE_DIR}/src/rocksdb/include") + +add_library(alien::cflags INTERFACE IMPORTED) +set_target_properties(alien::cflags PROPERTIES + INTERFACE_COMPILE_DEFINITIONS "WITH_SEASTAR;WITH_ALIEN" + INTERFACE_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:Seastar::seastar,INTERFACE_INCLUDE_DIRECTORIES>) + +add_library(crimson-alien-common STATIC + ${PROJECT_SOURCE_DIR}/src/common/admin_socket.cc + ${PROJECT_SOURCE_DIR}/src/common/blkdev.cc + ${PROJECT_SOURCE_DIR}/src/common/ceph_context.cc + ${PROJECT_SOURCE_DIR}/src/common/ceph_crypto.cc + ${PROJECT_SOURCE_DIR}/src/common/condition_variable_debug.cc + ${PROJECT_SOURCE_DIR}/src/common/cmdparse.cc + ${PROJECT_SOURCE_DIR}/src/common/Finisher.cc + ${PROJECT_SOURCE_DIR}/src/common/HeartbeatMap.cc + ${PROJECT_SOURCE_DIR}/src/common/PluginRegistry.cc + ${PROJECT_SOURCE_DIR}/src/common/lockdep.cc + ${PROJECT_SOURCE_DIR}/src/common/mutex_debug.cc + ${PROJECT_SOURCE_DIR}/src/common/perf_counters.cc + ${PROJECT_SOURCE_DIR}/src/common/perf_counters_collection.cc + ${PROJECT_SOURCE_DIR}/src/common/RefCountedObj.cc + ${PROJECT_SOURCE_DIR}/src/common/shared_mutex_debug.cc + ${PROJECT_SOURCE_DIR}/src/common/SubProcess.cc + ${PROJECT_SOURCE_DIR}/src/common/Throttle.cc + ${PROJECT_SOURCE_DIR}/src/common/Timer.cc + ${PROJECT_SOURCE_DIR}/src/common/TrackedOp.cc + ${PROJECT_SOURCE_DIR}/src/common/WorkQueue.cc + ${PROJECT_SOURCE_DIR}/src/common/util.cc + ${PROJECT_SOURCE_DIR}/src/crush/CrushLocation.cc + ${PROJECT_SOURCE_DIR}/src/global/global_context.cc + $<TARGET_OBJECTS:compressor_objs> + $<TARGET_OBJECTS:common_prioritycache_obj>) +target_link_libraries(crimson-alien-common + crimson-common + alien::cflags) + +set(alien_store_srcs + alien_store.cc + thread_pool.cc + ${PROJECT_SOURCE_DIR}/src/os/ObjectStore.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/Allocator.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/AvlAllocator.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/BitmapFreelistManager.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/BlueFS.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/bluefs_types.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/BlueRocksEnv.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/BlueStore.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/bluestore_types.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/fastbmap_allocator_impl.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/FreelistManager.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/HybridAllocator.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/StupidAllocator.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/BitmapAllocator.cc) +if(WITH_ZBD) + list(APPEND alien_store_srcs + ${PROJECT_SOURCE_DIR}/src/os/bluestore/zoned_types.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/ZonedFreelistManager.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/ZonedAllocator.cc) +endif() 
+add_library(crimson-alienstore STATIC + ${alien_store_srcs}) +if(WITH_LTTNG) + add_dependencies(crimson-alienstore bluestore-tp) +endif() +target_link_libraries(crimson-alienstore + PRIVATE + alien::cflags + fmt::fmt + kv + heap_profiler + crimson-alien-common + ${BLKID_LIBRARIES} + ${UDEV_LIBRARIES} + crimson + blk) diff --git a/src/crimson/os/alienstore/alien_collection.h b/src/crimson/os/alienstore/alien_collection.h new file mode 100644 index 000000000..98b8fdef4 --- /dev/null +++ b/src/crimson/os/alienstore/alien_collection.h @@ -0,0 +1,26 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "os/ObjectStore.h" + +#include "crimson/os/futurized_collection.h" +#include "crimson/os/futurized_store.h" +#include "alien_store.h" + +namespace crimson::os { + +class AlienCollection final : public FuturizedCollection { +public: + AlienCollection(ObjectStore::CollectionHandle ch) + : FuturizedCollection(ch->cid), + collection(ch) {} + + ~AlienCollection() {} + +private: + ObjectStore::CollectionHandle collection; + friend AlienStore; +}; +} diff --git a/src/crimson/os/alienstore/alien_store.cc b/src/crimson/os/alienstore/alien_store.cc new file mode 100644 index 000000000..cb5553254 --- /dev/null +++ b/src/crimson/os/alienstore/alien_store.cc @@ -0,0 +1,575 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "alien_collection.h" +#include "alien_store.h" + +#include <map> +#include <string_view> +#include <boost/algorithm/string/trim.hpp> +#include <fmt/format.h> +#include <fmt/ostream.h> + +#include <seastar/core/alien.hh> +#include <seastar/core/future-util.hh> +#include <seastar/core/reactor.hh> + +#include "common/ceph_context.h" +#include "global/global_context.h" +#include "include/Context.h" +#include "os/bluestore/BlueStore.h" +#include "os/ObjectStore.h" +#include "os/Transaction.h" + +#include "crimson/common/log.h" +#include "crimson/os/futurized_store.h" + +namespace { + seastar::logger& logger() + { + return crimson::get_logger(ceph_subsys_filestore); + } + +class OnCommit final: public Context +{ + int cpuid; + Context *oncommit; + seastar::promise<> &alien_done; +public: + OnCommit( + int id, + seastar::promise<> &done, + Context *oncommit, + ceph::os::Transaction& txn) + : cpuid(id), oncommit(oncommit), + alien_done(done) {} + + void finish(int) final { + return seastar::alien::submit_to(cpuid, [this] { + if (oncommit) oncommit->complete(0); + alien_done.set_value(); + return seastar::make_ready_future<>(); + }).wait(); + } +}; +} + +namespace crimson::os { + +AlienStore::AlienStore(const std::string& path, const ConfigValues& values) + : path{path} +{ + cct = std::make_unique<CephContext>(CEPH_ENTITY_TYPE_OSD); + g_ceph_context = cct.get(); + cct->_conf.set_config_values(values); + store = std::make_unique<BlueStore>(cct.get(), path); + + long cpu_id = 0; + if (long nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); nr_cpus != -1) { + cpu_id = nr_cpus - 1; + } else { + logger().error("{}: unable to get nproc: {}", __func__, errno); + cpu_id = -1; + } + tp = std::make_unique<crimson::os::ThreadPool>(1, 128, cpu_id); +} + +seastar::future<> AlienStore::start() +{ + return tp->start(); +} + +seastar::future<> AlienStore::stop() +{ + return tp->submit([this] { + for (auto [cid, ch]: coll_map) + static_cast<AlienCollection*>(ch.get())->collection.reset(); + store.reset(); + }).then([this] { + return tp->stop(); + }); +} + +AlienStore::~AlienStore() = 
default; + +seastar::future<> AlienStore::mount() +{ + logger().debug("{}", __func__); + return tp->submit([this] { + return store->mount(); + }).then([] (int r) { + assert(r == 0); + return seastar::now(); + }); +} + +seastar::future<> AlienStore::umount() +{ + logger().info("{}", __func__); + return transaction_gate.close().then([this] { + return tp->submit([this] { + return store->umount(); + }); + }).then([] (int r) { + assert(r == 0); + return seastar::now(); + }); +} + +seastar::future<> AlienStore::mkfs(uuid_d osd_fsid) +{ + logger().debug("{}", __func__); + store->set_fsid(osd_fsid); + return tp->submit([this] { + return store->mkfs(); + }).then([] (int r) { + assert(r == 0); + return seastar::now(); + }); +} + +seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> +AlienStore::list_objects(CollectionRef ch, + const ghobject_t& start, + const ghobject_t& end, + uint64_t limit) const +{ + logger().debug("{}", __func__); + return seastar::do_with(std::vector<ghobject_t>(), ghobject_t(), + [=] (auto &objects, auto &next) { + objects.reserve(limit); + return tp->submit([=, &objects, &next] { + auto c = static_cast<AlienCollection*>(ch.get()); + return store->collection_list(c->collection, start, end, + store->get_ideal_list_max(), + &objects, &next); + }).then([&objects, &next] (int r) { + assert(r == 0); + return seastar::make_ready_future<std::tuple<std::vector<ghobject_t>, ghobject_t>>( + std::make_tuple(std::move(objects), std::move(next))); + }); + }); +} + +seastar::future<CollectionRef> AlienStore::create_new_collection(const coll_t& cid) +{ + logger().debug("{}", __func__); + return tp->submit([this, cid] { + return store->create_new_collection(cid); + }).then([this, cid] (ObjectStore::CollectionHandle c) { + CollectionRef ch; + auto cp = coll_map.find(c->cid); + if (cp == coll_map.end()) { + ch = new AlienCollection(c); + coll_map[c->cid] = ch; + } else { + ch = cp->second; + auto ach = static_cast<AlienCollection*>(ch.get()); + if (ach->collection != c) { + ach->collection = c; + } + } + return seastar::make_ready_future<CollectionRef>(ch); + }); + +} + +seastar::future<CollectionRef> AlienStore::open_collection(const coll_t& cid) +{ + logger().debug("{}", __func__); + return tp->submit([this, cid] { + return store->open_collection(cid); + }).then([this] (ObjectStore::CollectionHandle c) { + CollectionRef ch; + auto cp = coll_map.find(c->cid); + if (cp == coll_map.end()){ + ch = new AlienCollection(c); + coll_map[c->cid] = ch; + } else { + ch = cp->second; + auto ach = static_cast<AlienCollection*>(ch.get()); + if (ach->collection != c){ + ach->collection = c; + } + } + return seastar::make_ready_future<CollectionRef>(ch); + }); +} + +seastar::future<std::vector<coll_t>> AlienStore::list_collections() +{ + logger().debug("{}", __func__); + + return seastar::do_with(std::vector<coll_t>{}, [=] (auto &ls) { + return tp->submit([this, &ls] { + return store->list_collections(ls); + }).then([&ls] (int r) { + assert(r == 0); + return seastar::make_ready_future<std::vector<coll_t>>(std::move(ls)); + }); + }); +} + +AlienStore::read_errorator::future<ceph::bufferlist> +AlienStore::read(CollectionRef ch, + const ghobject_t& oid, + uint64_t offset, + size_t len, + uint32_t op_flags) +{ + logger().debug("{}", __func__); + return seastar::do_with(ceph::bufferlist{}, [=] (auto &bl) { + return tp->submit([=, &bl] { + auto c = static_cast<AlienCollection*>(ch.get()); + return store->read(c->collection, oid, offset, len, bl, op_flags); + }).then([&bl] (int r) -> 
read_errorator::future<ceph::bufferlist> { + if (r == -ENOENT) { + return crimson::ct_error::enoent::make(); + } else if (r == -EIO) { + return crimson::ct_error::input_output_error::make(); + } else { + return read_errorator::make_ready_future<ceph::bufferlist>(std::move(bl)); + } + }); + }); +} + +AlienStore::read_errorator::future<ceph::bufferlist> +AlienStore::readv(CollectionRef ch, + const ghobject_t& oid, + interval_set<uint64_t>& m, + uint32_t op_flags) +{ + logger().debug("{}", __func__); + return seastar::do_with(ceph::bufferlist{}, + [this, ch, oid, &m, op_flags](auto& bl) { + return tp->submit([this, ch, oid, &m, op_flags, &bl] { + auto c = static_cast<AlienCollection*>(ch.get()); + return store->readv(c->collection, oid, m, bl, op_flags); + }).then([&bl](int r) -> read_errorator::future<ceph::bufferlist> { + if (r == -ENOENT) { + return crimson::ct_error::enoent::make(); + } else if (r == -EIO) { + return crimson::ct_error::input_output_error::make(); + } else { + return read_errorator::make_ready_future<ceph::bufferlist>(std::move(bl)); + } + }); + }); +} + +AlienStore::get_attr_errorator::future<ceph::bufferptr> +AlienStore::get_attr(CollectionRef ch, + const ghobject_t& oid, + std::string_view name) const +{ + logger().debug("{}", __func__); + return seastar::do_with(ceph::bufferptr{}, [=] (auto &value) { + return tp->submit([=, &value] { + auto c =static_cast<AlienCollection*>(ch.get()); + return store->getattr(c->collection, oid, + static_cast<std::string>(name).c_str(), value); + }).then([oid, &value] (int r) -> get_attr_errorator::future<ceph::bufferptr> { + if (r == -ENOENT) { + return crimson::ct_error::enoent::make(); + } else if (r == -ENODATA) { + return crimson::ct_error::enodata::make(); + } else { + return get_attr_errorator::make_ready_future<ceph::bufferptr>( + std::move(value)); + } + }); + }); +} + +AlienStore::get_attrs_ertr::future<AlienStore::attrs_t> +AlienStore::get_attrs(CollectionRef ch, + const ghobject_t& oid) +{ + logger().debug("{}", __func__); + return seastar::do_with(attrs_t{}, [=] (auto &aset) { + return tp->submit([=, &aset] { + auto c = static_cast<AlienCollection*>(ch.get()); + return store->getattrs(c->collection, oid, + reinterpret_cast<map<string,bufferptr>&>(aset)); + }).then([&aset] (int r) -> get_attrs_ertr::future<attrs_t> { + if (r == -ENOENT) { + return crimson::ct_error::enoent::make(); + } else { + return get_attrs_ertr::make_ready_future<attrs_t>(std::move(aset)); + } + }); + }); +} + +auto AlienStore::omap_get_values(CollectionRef ch, + const ghobject_t& oid, + const set<string>& keys) + -> read_errorator::future<omap_values_t> +{ + logger().debug("{}", __func__); + return seastar::do_with(omap_values_t{}, [=] (auto &values) { + return tp->submit([=, &values] { + auto c = static_cast<AlienCollection*>(ch.get()); + return store->omap_get_values(c->collection, oid, keys, + reinterpret_cast<map<string, bufferlist>*>(&values)); + }).then([&values] (int r) -> read_errorator::future<omap_values_t> { + if (r == -ENOENT) { + return crimson::ct_error::enoent::make(); + } else { + assert(r == 0); + return read_errorator::make_ready_future<omap_values_t>(std::move(values)); + } + }); + }); +} + +auto AlienStore::omap_get_values(CollectionRef ch, + const ghobject_t &oid, + const std::optional<string> &start) + -> read_errorator::future<std::tuple<bool, omap_values_t>> +{ + logger().debug("{} with_start", __func__); + return seastar::do_with(omap_values_t{}, [=] (auto &values) { + return tp->submit([=, &values] { + auto c = 
static_cast<AlienCollection*>(ch.get()); + return store->omap_get_values(c->collection, oid, start, + reinterpret_cast<map<string, bufferlist>*>(&values)); + }).then([&values] (int r) + -> read_errorator::future<std::tuple<bool, omap_values_t>> { + if (r == -ENOENT) { + return crimson::ct_error::enoent::make(); + } else if (r < 0){ + logger().error("omap_get_values(start): {}", r); + return crimson::ct_error::input_output_error::make(); + } else { + return read_errorator::make_ready_future<std::tuple<bool, omap_values_t>>( + std::make_tuple(true, std::move(values))); + } + }); + }); +} + +seastar::future<> AlienStore::do_transaction(CollectionRef ch, + ceph::os::Transaction&& txn) +{ + logger().debug("{}", __func__); + auto id = seastar::this_shard_id(); + auto done = seastar::promise<>(); + return seastar::do_with( + std::move(txn), + std::move(done), + [this, ch, id] (auto &txn, auto &done) { + return seastar::with_gate(transaction_gate, [this, ch, id, &txn, &done] { + return tp_mutex.lock().then ([this, ch, id, &txn, &done] { + Context *crimson_wrapper = + ceph::os::Transaction::collect_all_contexts(txn); + return tp->submit([this, ch, id, crimson_wrapper, &txn, &done] { + txn.register_on_commit(new OnCommit(id, done, crimson_wrapper, txn)); + auto c = static_cast<AlienCollection*>(ch.get()); + return store->queue_transaction(c->collection, std::move(txn)); + }); + }).then([this, &done] (int r) { + assert(r == 0); + tp_mutex.unlock(); + return done.get_future(); + }); + }); + }); +} + +seastar::future<> AlienStore::write_meta(const std::string& key, + const std::string& value) +{ + logger().debug("{}", __func__); + return tp->submit([=] { + return store->write_meta(key, value); + }).then([] (int r) { + assert(r == 0); + return seastar::make_ready_future<>(); + }); +} + +seastar::future<std::tuple<int, std::string>> +AlienStore::read_meta(const std::string& key) +{ + logger().debug("{}", __func__); + return tp->submit([this, key] { + std::string value; + int r = store->read_meta(key, &value); + if (r > 0) { + value.resize(r); + boost::algorithm::trim_right_if(value, + [] (unsigned char c) {return isspace(c);}); + } else { + value.clear(); + } + return std::make_pair(r, value); + }).then([] (auto entry) { + return seastar::make_ready_future<std::tuple<int, std::string>>( + std::move(entry)); + }); +} + +uuid_d AlienStore::get_fsid() const +{ + logger().debug("{}", __func__); + return store->get_fsid(); +} + +seastar::future<store_statfs_t> AlienStore::stat() const +{ + logger().info("{}", __func__); + return seastar::do_with(store_statfs_t{}, [this] (store_statfs_t &st) { + return tp->submit([this, &st] { + return store->statfs(&st, nullptr); + }).then([&st] (int r) { + assert(r == 0); + return seastar::make_ready_future<store_statfs_t>(std::move(st)); + }); + }); +} + +unsigned AlienStore::get_max_attr_name_length() const +{ + logger().info("{}", __func__); + return 256; +} + +seastar::future<struct stat> AlienStore::stat( + CollectionRef ch, + const ghobject_t& oid) +{ + return seastar::do_with((struct stat){}, [this, ch, oid](auto& st) { + return tp->submit([this, ch, oid, &st] { + auto c = static_cast<AlienCollection*>(ch.get()); + store->stat(c->collection, oid, &st); + return st; + }); + }); +} + +auto AlienStore::omap_get_header(CollectionRef ch, + const ghobject_t& oid) + -> read_errorator::future<ceph::bufferlist> +{ + return seastar::do_with(ceph::bufferlist(), [=](auto& bl) { + return tp->submit([=, &bl] { + auto c = static_cast<AlienCollection*>(ch.get()); + return 
store->omap_get_header(c->collection, oid, &bl); + }).then([&bl] (int r) -> read_errorator::future<ceph::bufferlist> { + if (r == -ENOENT) { + return crimson::ct_error::enoent::make(); + } else if (r < 0) { + logger().error("omap_get_header: {}", r); + return crimson::ct_error::input_output_error::make(); + } else { + return read_errorator::make_ready_future<ceph::bufferlist>(std::move(bl)); + } + }); + }); +} + +seastar::future<std::map<uint64_t, uint64_t>> AlienStore::fiemap( + CollectionRef ch, + const ghobject_t& oid, + uint64_t off, + uint64_t len) +{ + return seastar::do_with(std::map<uint64_t, uint64_t>(), [=](auto& destmap) { + return tp->submit([=, &destmap] { + auto c = static_cast<AlienCollection*>(ch.get()); + return store->fiemap(c->collection, oid, off, len, destmap); + }).then([&destmap] (int i) { + return seastar::make_ready_future + <std::map<uint64_t, uint64_t>> + (std::move(destmap)); + }); + }); +} + +seastar::future<FuturizedStore::OmapIteratorRef> AlienStore::get_omap_iterator( + CollectionRef ch, + const ghobject_t& oid) +{ + return tp->submit([=] { + auto c = static_cast<AlienCollection*>(ch.get()); + auto iter = store->get_omap_iterator(c->collection, oid); + return FuturizedStore::OmapIteratorRef( + new AlienStore::AlienOmapIterator(iter, + this)); + }); +} + +//TODO: each iterator op needs one submit, this is not efficient, +// needs further optimization. +seastar::future<> AlienStore::AlienOmapIterator::seek_to_first() +{ + return store->tp->submit([=] { + return iter->seek_to_first(); + }).then([] (int r) { + assert(r == 0); + return seastar::now(); + }); +} + +seastar::future<> AlienStore::AlienOmapIterator::upper_bound( + const std::string& after) +{ + return store->tp->submit([this, after] { + return iter->upper_bound(after); + }).then([] (int r) { + assert(r == 0); + return seastar::now(); + }); +} + +seastar::future<> AlienStore::AlienOmapIterator::lower_bound( + const std::string& to) +{ + return store->tp->submit([this, to] { + return iter->lower_bound(to); + }).then([] (int r) { + assert(r == 0); + return seastar::now(); + }); +} + +seastar::future<> AlienStore::AlienOmapIterator::next() +{ + return store->tp->submit([this] { + return iter->next(); + }).then([] (int r) { + assert(r == 0); + return seastar::now(); + }); +} + +bool AlienStore::AlienOmapIterator::valid() const +{ + return iter->valid(); +} + +std::string AlienStore::AlienOmapIterator::key() +{ + return iter->key(); +} + +seastar::future<std::string> AlienStore::AlienOmapIterator::tail_key() +{ + return store->tp->submit([this] { + return iter->tail_key(); + }); +} + +ceph::buffer::list AlienStore::AlienOmapIterator::value() +{ + return iter->value(); +} + +int AlienStore::AlienOmapIterator::status() const +{ + return iter->status(); +} + +} diff --git a/src/crimson/os/alienstore/alien_store.h b/src/crimson/os/alienstore/alien_store.h new file mode 100644 index 000000000..92739340e --- /dev/null +++ b/src/crimson/os/alienstore/alien_store.h @@ -0,0 +1,125 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/future.hh> +#include <seastar/core/shared_mutex.hh> + +#include "common/ceph_context.h" +#include "os/ObjectStore.h" +#include "osd/osd_types.h" + +#include "crimson/os/alienstore/thread_pool.h" +#include "crimson/os/futurized_collection.h" +#include "crimson/os/futurized_store.h" + +namespace ceph::os { +class Transaction; +} + +namespace crimson::os { +class AlienStore final : public 
FuturizedStore { +public: + class AlienOmapIterator final : public OmapIterator { + public: + AlienOmapIterator(ObjectMap::ObjectMapIterator& it, + AlienStore* store) : iter(it), store(store) {} + seastar::future<> seek_to_first(); + seastar::future<> upper_bound(const std::string& after); + seastar::future<> lower_bound(const std::string& to); + bool valid() const; + seastar::future<> next(); + std::string key(); + seastar::future<std::string> tail_key(); + ceph::buffer::list value(); + int status() const; + private: + ObjectMap::ObjectMapIterator iter; + AlienStore* store; + }; + AlienStore(const std::string& path, const ConfigValues& values); + ~AlienStore() final; + + seastar::future<> start() final; + seastar::future<> stop() final; + seastar::future<> mount() final; + seastar::future<> umount() final; + + seastar::future<> mkfs(uuid_d new_osd_fsid) final; + read_errorator::future<ceph::bufferlist> read(CollectionRef c, + const ghobject_t& oid, + uint64_t offset, + size_t len, + uint32_t op_flags = 0) final; + read_errorator::future<ceph::bufferlist> readv(CollectionRef c, + const ghobject_t& oid, + interval_set<uint64_t>& m, + uint32_t op_flags = 0) final; + + + get_attr_errorator::future<ceph::bufferptr> get_attr(CollectionRef c, + const ghobject_t& oid, + std::string_view name) const final; + get_attrs_ertr::future<attrs_t> get_attrs(CollectionRef c, + const ghobject_t& oid) final; + + read_errorator::future<omap_values_t> omap_get_values( + CollectionRef c, + const ghobject_t& oid, + const omap_keys_t& keys) final; + + /// Retrieves paged set of values > start (if present) + read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values( + CollectionRef c, ///< [in] collection + const ghobject_t &oid, ///< [in] oid + const std::optional<std::string> &start ///< [in] start, empty for begin + ) final; ///< @return <done, values> values.empty() iff done + + seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects( + CollectionRef c, + const ghobject_t& start, + const ghobject_t& end, + uint64_t limit) const final; + + seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final; + seastar::future<CollectionRef> open_collection(const coll_t& cid) final; + seastar::future<std::vector<coll_t>> list_collections() final; + + seastar::future<> do_transaction(CollectionRef c, + ceph::os::Transaction&& txn) final; + + seastar::future<> write_meta(const std::string& key, + const std::string& value) final; + seastar::future<std::tuple<int, std::string>> read_meta( + const std::string& key) final; + uuid_d get_fsid() const final; + seastar::future<store_statfs_t> stat() const final; + unsigned get_max_attr_name_length() const final; + seastar::future<struct stat> stat( + CollectionRef, + const ghobject_t&) final; + read_errorator::future<ceph::bufferlist> omap_get_header( + CollectionRef, + const ghobject_t&) final; + seastar::future<std::map<uint64_t, uint64_t>> fiemap( + CollectionRef, + const ghobject_t&, + uint64_t off, + uint64_t len) final; + seastar::future<FuturizedStore::OmapIteratorRef> get_omap_iterator( + CollectionRef ch, + const ghobject_t& oid) final; + +private: + constexpr static unsigned MAX_KEYS_PER_OMAP_GET_CALL = 32; + mutable std::unique_ptr<crimson::os::ThreadPool> tp; + const std::string path; + uint64_t used_bytes = 0; + std::unique_ptr<ObjectStore> store; + std::unique_ptr<CephContext> cct; + seastar::gate transaction_gate; + std::unordered_map<coll_t, CollectionRef> coll_map; + seastar::shared_mutex tp_mutex; +}; +} diff 
--git a/src/crimson/os/alienstore/thread_pool.cc b/src/crimson/os/alienstore/thread_pool.cc new file mode 100644 index 000000000..e127d87d5 --- /dev/null +++ b/src/crimson/os/alienstore/thread_pool.cc @@ -0,0 +1,80 @@ +#include "thread_pool.h" + +#include <chrono> +#include <pthread.h> + +#include "include/ceph_assert.h" +#include "crimson/common/config_proxy.h" + +using crimson::common::local_conf; + +namespace crimson::os { + +ThreadPool::ThreadPool(size_t n_threads, + size_t queue_sz, + long cpu_id) + : queue_size{round_up_to(queue_sz, seastar::smp::count)}, + pending{queue_size} +{ + auto queue_max_wait = std::chrono::seconds(local_conf()->threadpool_empty_queue_max_wait); + for (size_t i = 0; i < n_threads; i++) { + threads.emplace_back([this, cpu_id, queue_max_wait] { + if (cpu_id >= 0) { + pin(cpu_id); + } + loop(queue_max_wait); + }); + } +} + +ThreadPool::~ThreadPool() +{ + for (auto& thread : threads) { + thread.join(); + } +} + +void ThreadPool::pin(unsigned cpu_id) +{ + cpu_set_t cs; + CPU_ZERO(&cs); + CPU_SET(cpu_id, &cs); + [[maybe_unused]] auto r = pthread_setaffinity_np(pthread_self(), + sizeof(cs), &cs); + ceph_assert(r == 0); +} + +void ThreadPool::loop(std::chrono::milliseconds queue_max_wait) +{ + for (;;) { + WorkItem* work_item = nullptr; + { + std::unique_lock lock{mutex}; + cond.wait_for(lock, queue_max_wait, + [this, &work_item] { + return pending.pop(work_item) || is_stopping(); + }); + } + if (work_item) { + work_item->process(); + } else if (is_stopping()) { + break; + } + } +} + +seastar::future<> ThreadPool::start() +{ + auto slots_per_shard = queue_size / seastar::smp::count; + return submit_queue.start(slots_per_shard); +} + +seastar::future<> ThreadPool::stop() +{ + return submit_queue.stop().then([this] { + stopping = true; + cond.notify_all(); + }); +} + +} // namespace crimson::os diff --git a/src/crimson/os/alienstore/thread_pool.h b/src/crimson/os/alienstore/thread_pool.h new file mode 100644 index 000000000..27840da18 --- /dev/null +++ b/src/crimson/os/alienstore/thread_pool.h @@ -0,0 +1,132 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#pragma once + +#include <atomic> +#include <condition_variable> +#include <tuple> +#include <type_traits> +#include <boost/lockfree/queue.hpp> +#include <boost/optional.hpp> +#include <seastar/core/future.hh> +#include <seastar/core/gate.hh> +#include <seastar/core/reactor.hh> +#include <seastar/core/semaphore.hh> +#include <seastar/core/sharded.hh> + +namespace crimson::os { + +struct WorkItem { + virtual ~WorkItem() {} + virtual void process() = 0; +}; + +template<typename Func> +struct Task final : WorkItem { + using T = std::invoke_result_t<Func>; + using future_stored_type_t = + std::conditional_t<std::is_void_v<T>, + seastar::internal::future_stored_type_t<>, + seastar::internal::future_stored_type_t<T>>; + using futurator_t = seastar::futurize<T>; +public: + explicit Task(Func&& f) + : func(std::move(f)) + {} + void process() override { + try { + if constexpr (std::is_void_v<T>) { + func(); + state.set(); + } else { + state.set(func()); + } + } catch (...) 
{ + state.set_exception(std::current_exception()); + } + on_done.write_side().signal(1); + } + typename futurator_t::type get_future() { + return on_done.wait().then([this](size_t) { + if (state.failed()) { + return futurator_t::make_exception_future(state.get_exception()); + } else { + return futurator_t::from_tuple(state.get_value()); + } + }); + } +private: + Func func; + seastar::future_state<future_stored_type_t> state; + seastar::readable_eventfd on_done; +}; + +struct SubmitQueue { + seastar::semaphore free_slots; + seastar::gate pending_tasks; + explicit SubmitQueue(size_t num_free_slots) + : free_slots(num_free_slots) + {} + seastar::future<> stop() { + return pending_tasks.close(); + } +}; + +/// an engine for scheduling non-seastar tasks from seastar fibers +class ThreadPool { + std::atomic<bool> stopping = false; + std::mutex mutex; + std::condition_variable cond; + std::vector<std::thread> threads; + seastar::sharded<SubmitQueue> submit_queue; + const size_t queue_size; + boost::lockfree::queue<WorkItem*> pending; + + void loop(std::chrono::milliseconds queue_max_wait); + bool is_stopping() const { + return stopping.load(std::memory_order_relaxed); + } + static void pin(unsigned cpu_id); + seastar::semaphore& local_free_slots() { + return submit_queue.local().free_slots; + } + ThreadPool(const ThreadPool&) = delete; + ThreadPool& operator=(const ThreadPool&) = delete; +public: + /** + * @param queue_sz the depth of pending queue. before a task is scheduled, + * it waits in this queue. we will round this number to + * multiple of the number of cores. + * @param n_threads the number of threads in this thread pool. + * @param cpu the CPU core to which this thread pool is assigned + * @note each @c Task has its own crimson::thread::Condition, which possesses + * an fd, so we should keep the size of queue under a reasonable limit. + */ + ThreadPool(size_t n_threads, size_t queue_sz, long cpu); + ~ThreadPool(); + seastar::future<> start(); + seastar::future<> stop(); + template<typename Func, typename...Args> + auto submit(Func&& func, Args&&... 
args) { + auto packaged = [func=std::move(func), + args=std::forward_as_tuple(args...)] { + return std::apply(std::move(func), std::move(args)); + }; + return seastar::with_gate(submit_queue.local().pending_tasks, + [packaged=std::move(packaged), this] { + return local_free_slots().wait() + .then([packaged=std::move(packaged), this] { + auto task = new Task{std::move(packaged)}; + auto fut = task->get_future(); + pending.push(task); + cond.notify_one(); + return fut.finally([task, this] { + local_free_slots().signal(); + delete task; + }); + }); + }); + } +}; + +} // namespace crimson::os diff --git a/src/crimson/os/cyanstore/CMakeLists.txt b/src/crimson/os/cyanstore/CMakeLists.txt new file mode 100644 index 000000000..65f2b5498 --- /dev/null +++ b/src/crimson/os/cyanstore/CMakeLists.txt @@ -0,0 +1,7 @@ +add_library(crimson-cyanstore STATIC + cyan_store.cc + cyan_collection.cc + cyan_object.cc) +target_link_libraries(crimson-cyanstore + crimson + crimson-os) diff --git a/src/crimson/os/cyanstore/cyan_collection.cc b/src/crimson/os/cyanstore/cyan_collection.cc new file mode 100644 index 000000000..f44234e84 --- /dev/null +++ b/src/crimson/os/cyanstore/cyan_collection.cc @@ -0,0 +1,76 @@ +#include "cyan_collection.h" + +#include "cyan_object.h" + +namespace crimson::os +{ + +Collection::Collection(const coll_t& c) + : FuturizedCollection{c} +{} + +Collection::~Collection() = default; + +Collection::ObjectRef Collection::create_object() const +{ + return new crimson::os::Object; +} + +Collection::ObjectRef Collection::get_object(ghobject_t oid) +{ + auto o = object_hash.find(oid); + if (o == object_hash.end()) + return ObjectRef(); + return o->second; +} + +Collection::ObjectRef Collection::get_or_create_object(ghobject_t oid) +{ + auto result = object_hash.emplace(oid, ObjectRef{}); + if (result.second) + object_map[oid] = result.first->second = create_object(); + return result.first->second; +} + +uint64_t Collection::used_bytes() const +{ + uint64_t result = 0; + for (auto& obj : object_map) { + result += obj.second->get_size(); + } + return result; +} + +void Collection::encode(bufferlist& bl) const +{ + ENCODE_START(1, 1, bl); + encode(xattr, bl); + encode(use_page_set, bl); + uint32_t s = object_map.size(); + encode(s, bl); + for (auto& [oid, obj] : object_map) { + encode(oid, bl); + obj->encode(bl); + } + ENCODE_FINISH(bl); +} + +void Collection::decode(bufferlist::const_iterator& p) +{ + DECODE_START(1, p); + decode(xattr, p); + decode(use_page_set, p); + uint32_t s; + decode(s, p); + while (s--) { + ghobject_t k; + decode(k, p); + auto o = create_object(); + o->decode(p); + object_map.insert(make_pair(k, o)); + object_hash.insert(make_pair(k, o)); + } + DECODE_FINISH(p); +} + +} diff --git a/src/crimson/os/cyanstore/cyan_collection.h b/src/crimson/os/cyanstore/cyan_collection.h new file mode 100644 index 000000000..068e427d8 --- /dev/null +++ b/src/crimson/os/cyanstore/cyan_collection.h @@ -0,0 +1,51 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <string> +#include <unordered_map> +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> + +#include "include/buffer.h" +#include "osd/osd_types.h" + +#include "crimson/os/futurized_collection.h" + +namespace crimson::os { + +class Object; +/** + * a collection also orders transactions + * + * Any transactions queued under a given collection will be applied in + * sequence. 
Transactions queued under different collections may run + * in parallel. + * + * ObjectStore users may get collection handles with open_collection() (or, + * for bootstrapping a new collection, create_new_collection()). + */ +struct Collection final : public FuturizedCollection { + using ObjectRef = boost::intrusive_ptr<Object>; + int bits = 0; + // always use bufferlist object for testing + bool use_page_set = false; + std::unordered_map<ghobject_t, ObjectRef> object_hash; ///< for lookup + std::map<ghobject_t, ObjectRef> object_map; ///< for iteration + std::map<std::string,bufferptr> xattr; + bool exists = true; + + Collection(const coll_t& c); + ~Collection() final; + + ObjectRef create_object() const; + ObjectRef get_object(ghobject_t oid); + ObjectRef get_or_create_object(ghobject_t oid); + uint64_t used_bytes() const; + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& p); +}; + +} diff --git a/src/crimson/os/cyanstore/cyan_object.cc b/src/crimson/os/cyanstore/cyan_object.cc new file mode 100644 index 000000000..34bc13b7f --- /dev/null +++ b/src/crimson/os/cyanstore/cyan_object.cc @@ -0,0 +1,89 @@ +#include "cyan_object.h" +#include "include/encoding.h" + +namespace crimson::os { + +size_t Object::get_size() const { + return data.length(); +} + +ceph::bufferlist Object::read(uint64_t offset, uint64_t len) +{ + bufferlist ret; + ret.substr_of(data, offset, len); + return ret; +} + +int Object::write(uint64_t offset, const bufferlist &src) +{ + unsigned len = src.length(); + // before + bufferlist newdata; + if (get_size() >= offset) { + newdata.substr_of(data, 0, offset); + } else { + if (get_size()) { + newdata.substr_of(data, 0, get_size()); + } + newdata.append_zero(offset - get_size()); + } + + newdata.append(src); + + // after + if (get_size() > offset + len) { + bufferlist tail; + tail.substr_of(data, offset + len, get_size() - (offset + len)); + newdata.append(tail); + } + + data = std::move(newdata); + return 0; +} + +int Object::clone(Object *src, uint64_t srcoff, uint64_t len, + uint64_t dstoff) +{ + bufferlist bl; + if (srcoff == dstoff && len == src->get_size()) { + data = src->data; + return 0; + } + bl.substr_of(src->data, srcoff, len); + return write(dstoff, bl); + +} + +int Object::truncate(uint64_t size) +{ + if (get_size() > size) { + bufferlist bl; + bl.substr_of(data, 0, size); + data = std::move(bl); + } else if (get_size() == size) { + // do nothing + } else { + data.append_zero(size - get_size()); + } + return 0; +} + +void Object::encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(data, bl); + encode(xattr, bl); + encode(omap_header, bl); + encode(omap, bl); + ENCODE_FINISH(bl); +} + +void Object::decode(bufferlist::const_iterator& p) { + DECODE_START(1, p); + decode(data, p); + decode(xattr, p); + decode(omap_header, p); + decode(omap, p); + DECODE_FINISH(p); +} + +} diff --git a/src/crimson/os/cyanstore/cyan_object.h b/src/crimson/os/cyanstore/cyan_object.h new file mode 100644 index 000000000..f19b87212 --- /dev/null +++ b/src/crimson/os/cyanstore/cyan_object.h @@ -0,0 +1,45 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#pragma once + +#include <cstddef> +#include <map> +#include <string> +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> + +#include "include/buffer.h" + +namespace crimson::os { + +struct Object : public boost::intrusive_ref_counter< + Object, + boost::thread_unsafe_counter> +{ + using bufferlist = 
ceph::bufferlist; + + bufferlist data; + // use transparent comparator for better performance, see + // https://en.cppreference.com/w/cpp/utility/functional/less_void + std::map<std::string,bufferptr,std::less<>> xattr; + bufferlist omap_header; + std::map<std::string,bufferlist> omap; + + typedef boost::intrusive_ptr<Object> Ref; + + Object() = default; + + // interface for object data + size_t get_size() const; + ceph::bufferlist read(uint64_t offset, uint64_t len); + int write(uint64_t offset, const bufferlist &bl); + int clone(Object *src, uint64_t srcoff, uint64_t len, + uint64_t dstoff); + int truncate(uint64_t offset); + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& p); +}; +using ObjectRef = boost::intrusive_ptr<Object>; + +} diff --git a/src/crimson/os/cyanstore/cyan_store.cc b/src/crimson/os/cyanstore/cyan_store.cc new file mode 100644 index 000000000..eb93d72ec --- /dev/null +++ b/src/crimson/os/cyanstore/cyan_store.cc @@ -0,0 +1,835 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "cyan_store.h" + +#include <boost/algorithm/string/trim.hpp> +#include <fmt/format.h> +#include <fmt/ostream.h> + +#include "common/safe_io.h" +#include "os/Transaction.h" + +#include "crimson/common/buffer_io.h" +#include "crimson/common/config_proxy.h" +#include "cyan_collection.h" +#include "cyan_object.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + +using crimson::common::local_conf; + +namespace crimson::os { + +using ObjectRef = boost::intrusive_ptr<Object>; + +CyanStore::CyanStore(const std::string& path) + : path{path} +{} + +CyanStore::~CyanStore() = default; + +seastar::future<> CyanStore::mount() +{ + ceph::bufferlist bl; + std::string fn = path + "/collections"; + std::string err; + if (int r = bl.read_file(fn.c_str(), &err); r < 0) { + throw std::runtime_error("read_file"); + } + + std::set<coll_t> collections; + auto p = bl.cbegin(); + ceph::decode(collections, p); + + for (auto& coll : collections) { + std::string fn = fmt::format("{}/{}", path, coll); + ceph::bufferlist cbl; + if (int r = cbl.read_file(fn.c_str(), &err); r < 0) { + throw std::runtime_error("read_file"); + } + boost::intrusive_ptr<Collection> c{new Collection{coll}}; + auto p = cbl.cbegin(); + c->decode(p); + coll_map[coll] = c; + used_bytes += c->used_bytes(); + } + return seastar::now(); +} + +seastar::future<> CyanStore::umount() +{ + return seastar::do_with(std::set<coll_t>{}, [this](auto& collections) { + return seastar::do_for_each(coll_map, [&collections, this](auto& coll) { + auto& [col, ch] = coll; + collections.insert(col); + ceph::bufferlist bl; + ceph_assert(ch); + ch->encode(bl); + std::string fn = fmt::format("{}/{}", path, col); + return crimson::write_file(std::move(bl), fn); + }).then([&collections, this] { + ceph::bufferlist bl; + ceph::encode(collections, bl); + std::string fn = fmt::format("{}/collections", path); + return crimson::write_file(std::move(bl), fn); + }); + }); +} + +seastar::future<> CyanStore::mkfs(uuid_d new_osd_fsid) +{ + return read_meta("fsid").then([=](auto&& ret) { + auto& [r, fsid_str] = ret; + if (r == -ENOENT) { + if (new_osd_fsid.is_zero()) { + osd_fsid.generate_random(); + } else { + osd_fsid = new_osd_fsid; + } + return write_meta("fsid", fmt::format("{}", osd_fsid)); + } else if (r < 0) { + throw std::runtime_error("read_meta"); + } else { + logger().info("{} already has fsid {}", __func__, fsid_str); + if 
(!osd_fsid.parse(fsid_str.c_str())) { + throw std::runtime_error("failed to parse fsid"); + } else if (osd_fsid != new_osd_fsid) { + logger().error("on-disk fsid {} != provided {}", osd_fsid, new_osd_fsid); + throw std::runtime_error("unmatched osd_fsid"); + } else { + return seastar::now(); + } + } + }).then([this]{ + std::string fn = path + "/collections"; + ceph::bufferlist bl; + std::set<coll_t> collections; + ceph::encode(collections, bl); + return crimson::write_file(std::move(bl), fn); + }).then([this] { + return write_meta("type", "memstore"); + }); +} + +seastar::future<store_statfs_t> CyanStore::stat() const +{ + logger().debug("{}", __func__); + store_statfs_t st; + st.total = crimson::common::local_conf().get_val<Option::size_t>("memstore_device_bytes"); + st.available = st.total - used_bytes; + return seastar::make_ready_future<store_statfs_t>(std::move(st)); +} + +seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> +CyanStore::list_objects(CollectionRef ch, + const ghobject_t& start, + const ghobject_t& end, + uint64_t limit) const +{ + auto c = static_cast<Collection*>(ch.get()); + logger().debug("{} {} {} {} {}", + __func__, c->get_cid(), start, end, limit); + std::vector<ghobject_t> objects; + objects.reserve(limit); + ghobject_t next = ghobject_t::get_max(); + for (const auto& [oid, obj] : + boost::make_iterator_range(c->object_map.lower_bound(start), + c->object_map.end())) { + std::ignore = obj; + if (oid >= end || objects.size() >= limit) { + next = oid; + break; + } + objects.push_back(oid); + } + return seastar::make_ready_future<std::tuple<std::vector<ghobject_t>, ghobject_t>>( + std::make_tuple(std::move(objects), next)); +} + +seastar::future<CollectionRef> CyanStore::create_new_collection(const coll_t& cid) +{ + auto c = new Collection{cid}; + new_coll_map[cid] = c; + return seastar::make_ready_future<CollectionRef>(c); +} + +seastar::future<CollectionRef> CyanStore::open_collection(const coll_t& cid) +{ + return seastar::make_ready_future<CollectionRef>(_get_collection(cid)); +} + +seastar::future<std::vector<coll_t>> CyanStore::list_collections() +{ + std::vector<coll_t> collections; + for (auto& coll : coll_map) { + collections.push_back(coll.first); + } + return seastar::make_ready_future<std::vector<coll_t>>(std::move(collections)); +} + +CyanStore::read_errorator::future<ceph::bufferlist> CyanStore::read( + CollectionRef ch, + const ghobject_t& oid, + uint64_t offset, + size_t len, + uint32_t op_flags) +{ + auto c = static_cast<Collection*>(ch.get()); + logger().debug("{} {} {} {}~{}", + __func__, c->get_cid(), oid, offset, len); + if (!c->exists) { + return crimson::ct_error::enoent::make(); + } + ObjectRef o = c->get_object(oid); + if (!o) { + return crimson::ct_error::enoent::make(); + } + if (offset >= o->get_size()) + return read_errorator::make_ready_future<ceph::bufferlist>(); + size_t l = len; + if (l == 0 && offset == 0) // note: len == 0 means read the entire object + l = o->get_size(); + else if (offset + l > o->get_size()) + l = o->get_size() - offset; + return read_errorator::make_ready_future<ceph::bufferlist>(o->read(offset, l)); +} + +CyanStore::read_errorator::future<ceph::bufferlist> CyanStore::readv( + CollectionRef ch, + const ghobject_t& oid, + interval_set<uint64_t>& m, + uint32_t op_flags) +{ + return seastar::do_with(ceph::bufferlist{}, + [this, ch, oid, &m, op_flags](auto& bl) { + return crimson::do_for_each(m, + [this, ch, oid, op_flags, &bl](auto& p) { + return read(ch, oid, p.first, p.second, op_flags) + 
.safe_then([&bl](auto ret) { + bl.claim_append(ret); + }); + }).safe_then([&bl] { + return read_errorator::make_ready_future<ceph::bufferlist>(std::move(bl)); + }); + }); +} + + +CyanStore::get_attr_errorator::future<ceph::bufferptr> CyanStore::get_attr( + CollectionRef ch, + const ghobject_t& oid, + std::string_view name) const +{ + auto c = static_cast<Collection*>(ch.get()); + logger().debug("{} {} {}", + __func__, c->get_cid(), oid); + auto o = c->get_object(oid); + if (!o) { + return crimson::ct_error::enoent::make(); + } + if (auto found = o->xattr.find(name); found != o->xattr.end()) { + return get_attr_errorator::make_ready_future<ceph::bufferptr>(found->second); + } else { + return crimson::ct_error::enodata::make(); + } +} + +CyanStore::get_attrs_ertr::future<CyanStore::attrs_t> CyanStore::get_attrs( + CollectionRef ch, + const ghobject_t& oid) +{ + auto c = static_cast<Collection*>(ch.get()); + logger().debug("{} {} {}", + __func__, c->get_cid(), oid); + auto o = c->get_object(oid); + if (!o) { + return crimson::ct_error::enoent::make(); + } + return get_attrs_ertr::make_ready_future<attrs_t>(o->xattr); +} + +auto CyanStore::omap_get_values(CollectionRef ch, + const ghobject_t& oid, + const omap_keys_t& keys) + -> read_errorator::future<omap_values_t> +{ + auto c = static_cast<Collection*>(ch.get()); + logger().debug("{} {} {}", __func__, c->get_cid(), oid); + auto o = c->get_object(oid); + if (!o) { + return crimson::ct_error::enoent::make(); + } + omap_values_t values; + for (auto& key : keys) { + if (auto found = o->omap.find(key); found != o->omap.end()) { + values.insert(*found); + } + } + return seastar::make_ready_future<omap_values_t>(std::move(values)); +} + +auto +CyanStore::omap_get_values(CollectionRef ch, + const ghobject_t &oid, + const std::optional<string> &start) + -> read_errorator::future<std::tuple<bool, omap_values_t>> +{ + auto c = static_cast<Collection*>(ch.get()); + logger().debug("{} {} {}", __func__, c->get_cid(), oid); + auto o = c->get_object(oid); + if (!o) { + return crimson::ct_error::enoent::make(); + } + omap_values_t values; + for (auto i = start ? 
o->omap.upper_bound(*start) : o->omap.begin(); + values.size() < MAX_KEYS_PER_OMAP_GET_CALL && i != o->omap.end(); + ++i) { + values.insert(*i); + } + return seastar::make_ready_future<std::tuple<bool, omap_values_t>>( + std::make_tuple(true, std::move(values))); +} + +auto +CyanStore::omap_get_header(CollectionRef ch, + const ghobject_t& oid) + -> read_errorator::future<ceph::bufferlist> +{ + auto c = static_cast<Collection*>(ch.get()); + auto o = c->get_object(oid); + if (!o) { + return crimson::ct_error::enoent::make(); + } + + return read_errorator::make_ready_future<ceph::bufferlist>( + o->omap_header); +} + +seastar::future<> CyanStore::do_transaction(CollectionRef ch, + ceph::os::Transaction&& t) +{ + using ceph::os::Transaction; + int r = 0; + try { + auto i = t.begin(); + while (i.have_op()) { + r = 0; + switch (auto op = i.decode_op(); op->op) { + case Transaction::OP_NOP: + break; + case Transaction::OP_REMOVE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + r = _remove(cid, oid); + if (r == -ENOENT) { + r = 0; + } + } + break; + case Transaction::OP_TOUCH: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + r = _touch(cid, oid); + } + break; + case Transaction::OP_WRITE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + uint64_t off = op->off; + uint64_t len = op->len; + uint32_t fadvise_flags = i.get_fadvise_flags(); + ceph::bufferlist bl; + i.decode_bl(bl); + r = _write(cid, oid, off, len, bl, fadvise_flags); + } + break; + case Transaction::OP_ZERO: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + uint64_t off = op->off; + uint64_t len = op->len; + r = _zero(cid, oid, off, len); + } + break; + case Transaction::OP_TRUNCATE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + uint64_t off = op->off; + r = _truncate(cid, oid, off); + } + break; + case Transaction::OP_SETATTR: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + std::string name = i.decode_string(); + ceph::bufferlist bl; + i.decode_bl(bl); + std::map<std::string, bufferptr> to_set; + to_set[name] = bufferptr(bl.c_str(), bl.length()); + r = _setattrs(cid, oid, to_set); + } + break; + case Transaction::OP_RMATTR: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + std::string name = i.decode_string(); + r = _rm_attr(cid, oid, name); + } + break; + case Transaction::OP_MKCOLL: + { + coll_t cid = i.get_cid(op->cid); + r = _create_collection(cid, op->split_bits); + } + break; + case Transaction::OP_OMAP_CLEAR: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + r = _omap_clear(cid, oid); + } + break; + case Transaction::OP_OMAP_SETKEYS: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + std::map<std::string, ceph::bufferlist> aset; + i.decode_attrset(aset); + r = _omap_set_values(cid, oid, std::move(aset)); + } + break; + case Transaction::OP_OMAP_SETHEADER: + { + const coll_t &cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + ceph::bufferlist bl; + i.decode_bl(bl); + r = _omap_set_header(cid, oid, bl); + } + break; + case Transaction::OP_OMAP_RMKEYS: + { + const coll_t &cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + omap_keys_t keys; + i.decode_keyset(keys); + r = _omap_rmkeys(cid, oid, keys); + } + break; + case Transaction::OP_OMAP_RMKEYRANGE: + { + const coll_t &cid = i.get_cid(op->cid); + const 
ghobject_t &oid = i.get_oid(op->oid); + string first, last; + first = i.decode_string(); + last = i.decode_string(); + r = _omap_rmkeyrange(cid, oid, first, last); + } + break; + case Transaction::OP_COLL_HINT: + { + ceph::bufferlist hint; + i.decode_bl(hint); + // ignored + break; + } + default: + logger().error("bad op {}", static_cast<unsigned>(op->op)); + abort(); + } + if (r < 0) { + break; + } + } + } catch (std::exception &e) { + logger().error("{} got exception {}", __func__, e); + r = -EINVAL; + } + if (r < 0) { + logger().error(" transaction dump:\n"); + JSONFormatter f(true); + f.open_object_section("transaction"); + t.dump(&f); + f.close_section(); + std::stringstream str; + f.flush(str); + logger().error("{}", str.str()); + ceph_assert(r == 0); + } + for (auto i : { + t.get_on_applied(), + t.get_on_commit(), + t.get_on_applied_sync()}) { + if (i) { + i->complete(0); + } + } + return seastar::now(); +} + +int CyanStore::_remove(const coll_t& cid, const ghobject_t& oid) +{ + logger().debug("{} cid={} oid={}", + __func__, cid, oid); + auto c = _get_collection(cid); + if (!c) + return -ENOENT; + + auto i = c->object_hash.find(oid); + if (i == c->object_hash.end()) + return -ENOENT; + used_bytes -= i->second->get_size(); + c->object_hash.erase(i); + c->object_map.erase(oid); + return 0; +} + +int CyanStore::_touch(const coll_t& cid, const ghobject_t& oid) +{ + logger().debug("{} cid={} oid={}", + __func__, cid, oid); + auto c = _get_collection(cid); + if (!c) + return -ENOENT; + + c->get_or_create_object(oid); + return 0; +} + +int CyanStore::_write(const coll_t& cid, const ghobject_t& oid, + uint64_t offset, size_t len, const ceph::bufferlist& bl, + uint32_t fadvise_flags) +{ + logger().debug("{} {} {} {} ~ {}", + __func__, cid, oid, offset, len); + assert(len == bl.length()); + + auto c = _get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef o = c->get_or_create_object(oid); + if (len > 0 && !local_conf()->memstore_debug_omit_block_device_write) { + const ssize_t old_size = o->get_size(); + o->write(offset, bl); + used_bytes += (o->get_size() - old_size); + } + + return 0; +} + +int CyanStore::_zero(const coll_t& cid, const ghobject_t& oid, + uint64_t offset, size_t len) +{ + logger().debug("{} {} {} {} ~ {}", + __func__, cid, oid, offset, len); + + ceph::buffer::list bl; + bl.append_zero(len); + return _write(cid, oid, offset, len, bl, 0); +} + +int CyanStore::_omap_clear( + const coll_t& cid, + const ghobject_t& oid) +{ + logger().debug("{} {} {}", __func__, cid, oid); + + auto c = _get_collection(cid); + if (!c) { + return -ENOENT; + } + ObjectRef o = c->get_object(oid); + if (!o) { + return -ENOENT; + } + o->omap.clear(); + o->omap_header.clear(); + return 0; +} + +int CyanStore::_omap_set_values( + const coll_t& cid, + const ghobject_t& oid, + std::map<std::string, ceph::bufferlist> &&aset) +{ + logger().debug( + "{} {} {} {} keys", + __func__, cid, oid, aset.size()); + + auto c = _get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef o = c->get_or_create_object(oid); + for (auto &&i: aset) { + o->omap.insert(std::move(i)); + } + return 0; +} + +int CyanStore::_omap_set_header( + const coll_t& cid, + const ghobject_t& oid, + const ceph::bufferlist &header) +{ + logger().debug( + "{} {} {} {} bytes", + __func__, cid, oid, header.length()); + + auto c = _get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef o = c->get_or_create_object(oid); + o->omap_header = header; + return 0; +} + +int CyanStore::_omap_rmkeys( + const coll_t& cid, + const 
ghobject_t& oid, + const omap_keys_t& aset) +{ + logger().debug( + "{} {} {} {} keys", + __func__, cid, oid, aset.size()); + + auto c = _get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef o = c->get_or_create_object(oid); + for (auto &i: aset) { + o->omap.erase(i); + } + return 0; +} + +int CyanStore::_omap_rmkeyrange( + const coll_t& cid, + const ghobject_t& oid, + const std::string &first, + const std::string &last) +{ + logger().debug( + "{} {} {} first={} last={}", + __func__, cid, oid, first, last); + + auto c = _get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef o = c->get_or_create_object(oid); + for (auto i = o->omap.lower_bound(first); + i != o->omap.end() && i->first <= last; + o->omap.erase(i++)); + return 0; +} + +int CyanStore::_truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size) +{ + logger().debug("{} cid={} oid={} size={}", + __func__, cid, oid, size); + auto c = _get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef o = c->get_object(oid); + if (!o) + return -ENOENT; + if (local_conf()->memstore_debug_omit_block_device_write) + return 0; + const ssize_t old_size = o->get_size(); + int r = o->truncate(size); + used_bytes += (o->get_size() - old_size); + return r; +} + +int CyanStore::_setattrs(const coll_t& cid, const ghobject_t& oid, + std::map<std::string,bufferptr>& aset) +{ + logger().debug("{} cid={} oid={}", + __func__, cid, oid); + auto c = _get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef o = c->get_object(oid); + if (!o) + return -ENOENT; + for (std::map<std::string, bufferptr>::const_iterator p = aset.begin(); + p != aset.end(); ++p) + o->xattr[p->first] = p->second; + return 0; +} + +int CyanStore::_rm_attr(const coll_t& cid, const ghobject_t& oid, + std::string_view name) +{ + logger().debug("{} cid={} oid={} name={}", __func__, cid, oid, name); + auto c = _get_collection(cid); + if (!c) { + return -ENOENT; + } + ObjectRef o = c->get_object(oid); + if (!o) { + return -ENOENT; + } + auto i = o->xattr.find(name); + if (i == o->xattr.end()) { + return -ENODATA; + } + o->xattr.erase(i); + return 0; +} + +int CyanStore::_create_collection(const coll_t& cid, int bits) +{ + auto result = coll_map.try_emplace(cid); + if (!result.second) + return -EEXIST; + auto p = new_coll_map.find(cid); + assert(p != new_coll_map.end()); + result.first->second = p->second; + result.first->second->bits = bits; + new_coll_map.erase(p); + return 0; +} + +boost::intrusive_ptr<Collection> CyanStore::_get_collection(const coll_t& cid) +{ + auto cp = coll_map.find(cid); + if (cp == coll_map.end()) + return {}; + return cp->second; +} + +seastar::future<> CyanStore::write_meta(const std::string& key, + const std::string& value) +{ + std::string v = value; + v += "\n"; + if (int r = safe_write_file(path.c_str(), key.c_str(), + v.c_str(), v.length(), 0600); + r < 0) { + throw std::runtime_error{fmt::format("unable to write_meta({})", key)}; + } + return seastar::make_ready_future<>(); +} + +seastar::future<std::tuple<int, std::string>> +CyanStore::read_meta(const std::string& key) +{ + std::string fsid(4096, '\0'); + int r = safe_read_file(path.c_str(), key.c_str(), fsid.data(), fsid.size()); + if (r > 0) { + fsid.resize(r); + // drop trailing newlines + boost::algorithm::trim_right_if(fsid, + [](unsigned char c) {return isspace(c);}); + } else { + fsid.clear(); + } + return seastar::make_ready_future<std::tuple<int, std::string>>( + std::make_tuple(r, fsid)); +} + +uuid_d CyanStore::get_fsid() const +{ + return osd_fsid; +} + 
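CyanStore persists its metadata as small flat files under the store path: write_meta() appends a trailing newline before handing the value to safe_write_file(), and read_meta() trims it back off, which is what mkfs() depends on when it reads back a previously written fsid. A short usage sketch of that round trip, assuming a running seastar reactor and that the store directory already exists; the "/tmp/cyan" path and the "whoami" key are purely illustrative:

#include <cassert>
#include <seastar/core/future.hh>
#include <seastar/core/shared_ptr.hh>
#include "crimson/os/cyanstore/cyan_store.h"

// Write a key, then read it back. read_meta() yields the byte count from
// safe_read_file() plus the value with trailing whitespace stripped, so the
// newline appended by write_meta() never leaks back to the caller.
seastar::future<> meta_round_trip()
{
  auto store = seastar::make_shared<crimson::os::CyanStore>("/tmp/cyan");
  return store->write_meta("whoami", "0").then([store] {
    return store->read_meta("whoami");
  }).then([store] (auto ret) {
    auto [r, value] = ret;
    assert(r > 0 && value == "0");
    return seastar::make_ready_future<>();
  });
}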
+unsigned CyanStore::get_max_attr_name_length() const +{ + // arbitrary limitation exactly like in the case of MemStore. + return 256; +} + +seastar::future<FuturizedStore::OmapIteratorRef> CyanStore::get_omap_iterator( + CollectionRef ch, + const ghobject_t& oid) +{ + auto c = static_cast<Collection*>(ch.get()); + auto o = c->get_object(oid); + if (!o) { + throw std::runtime_error(fmt::format("object does not exist: {}", oid)); + } + return seastar::make_ready_future<FuturizedStore::OmapIteratorRef>( + new CyanStore::CyanOmapIterator(o)); +} + +seastar::future<std::map<uint64_t, uint64_t>> +CyanStore::fiemap( + CollectionRef ch, + const ghobject_t& oid, + uint64_t off, + uint64_t len) +{ + auto c = static_cast<Collection*>(ch.get()); + + ObjectRef o = c->get_object(oid); + if (!o) { + throw std::runtime_error(fmt::format("object does not exist: {}", oid)); + } + std::map<uint64_t, uint64_t> m{{0, o->get_size()}}; + return seastar::make_ready_future<std::map<uint64_t, uint64_t>>(std::move(m)); +} + +seastar::future<struct stat> +CyanStore::stat( + CollectionRef ch, + const ghobject_t& oid) +{ + auto c = static_cast<Collection*>(ch.get()); + auto o = c->get_object(oid); + if (!o) { + throw std::runtime_error(fmt::format("object does not exist: {}", oid)); + } + struct stat st; + st.st_size = o->get_size(); + return seastar::make_ready_future<struct stat>(std::move(st)); +} + +seastar::future<> CyanStore::CyanOmapIterator::seek_to_first() +{ + iter = obj->omap.begin(); + return seastar::make_ready_future<>(); +} + +seastar::future<> CyanStore::CyanOmapIterator::upper_bound(const std::string& after) +{ + iter = obj->omap.upper_bound(after); + return seastar::make_ready_future<>(); +} + +seastar::future<> CyanStore::CyanOmapIterator::lower_bound(const std::string &to) +{ + iter = obj->omap.lower_bound(to); + return seastar::make_ready_future<>(); +} + +bool CyanStore::CyanOmapIterator::valid() const +{ + return iter != obj->omap.end(); +} + +seastar::future<> CyanStore::CyanOmapIterator::next() +{ + ++iter; + return seastar::make_ready_future<>(); +} + +} diff --git a/src/crimson/os/cyanstore/cyan_store.h b/src/crimson/os/cyanstore/cyan_store.h new file mode 100644 index 000000000..07a8ff29e --- /dev/null +++ b/src/crimson/os/cyanstore/cyan_store.h @@ -0,0 +1,185 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <string> +#include <unordered_map> +#include <map> +#include <typeinfo> +#include <vector> + +#include <optional> +#include <seastar/core/future.hh> +#include <seastar/core/future-util.hh> + +#include "osd/osd_types.h" +#include "include/uuid.h" + +#include "crimson/os/cyanstore/cyan_object.h" +#include "crimson/os/futurized_store.h" + +namespace ceph::os { +class Transaction; +} + +namespace crimson::os { +class Collection; + +class CyanStore final : public FuturizedStore { + constexpr static unsigned MAX_KEYS_PER_OMAP_GET_CALL = 32; + + const std::string path; + std::unordered_map<coll_t, boost::intrusive_ptr<Collection>> coll_map; + std::map<coll_t, boost::intrusive_ptr<Collection>> new_coll_map; + uint64_t used_bytes = 0; + uuid_d osd_fsid; + +public: + class CyanOmapIterator final : public OmapIterator { + public: + CyanOmapIterator() {} + CyanOmapIterator(ObjectRef obj) : obj(obj) { + iter = obj->omap.begin(); + } + seastar::future<> seek_to_first() final; + seastar::future<> upper_bound(const std::string &after) final; + seastar::future<> lower_bound(const std::string &to) final; + bool valid() 
const final; + seastar::future<> next() final; + std::string key() final { + return iter->first; + } + virtual seastar::future<std::string> tail_key(){ + return seastar::make_ready_future<std::string>((++obj->omap.end())->first); + } + virtual ceph::buffer::list value() { + return iter->second; + } + virtual int status() const { + return iter != obj->omap.end() ? 0 : -1; + } + virtual ~CyanOmapIterator() {} + private: + std::map<std::string, bufferlist>::const_iterator iter; + ObjectRef obj; + }; + + CyanStore(const std::string& path); + ~CyanStore() final; + + seastar::future<> stop() final { + return seastar::now(); + } + seastar::future<> mount() final; + seastar::future<> umount() final; + + seastar::future<> mkfs(uuid_d new_osd_fsid) final; + seastar::future<store_statfs_t> stat() const final; + seastar::future<struct stat> stat( + CollectionRef c, + const ghobject_t& oid) final; + + read_errorator::future<ceph::bufferlist> read( + CollectionRef c, + const ghobject_t& oid, + uint64_t offset, + size_t len, + uint32_t op_flags = 0) final; + read_errorator::future<ceph::bufferlist> readv( + CollectionRef c, + const ghobject_t& oid, + interval_set<uint64_t>& m, + uint32_t op_flags = 0) final; + + get_attr_errorator::future<ceph::bufferptr> get_attr( + CollectionRef c, + const ghobject_t& oid, + std::string_view name) const final; + get_attrs_ertr::future<attrs_t> get_attrs( + CollectionRef c, + const ghobject_t& oid); + + read_errorator::future<omap_values_t> omap_get_values( + CollectionRef c, + const ghobject_t& oid, + const omap_keys_t& keys) final; + + /// Retrieves paged set of values > start (if present) + read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values( + CollectionRef c, ///< [in] collection + const ghobject_t &oid, ///< [in] oid + const std::optional<std::string> &start ///< [in] start, empty for begin + ) final; ///< @return <done, values> values.empty() iff done + + seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects( + CollectionRef c, + const ghobject_t& start, + const ghobject_t& end, + uint64_t limit) const final; + + read_errorator::future<ceph::bufferlist> omap_get_header( + CollectionRef c, + const ghobject_t& oid) final; + + seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final; + seastar::future<CollectionRef> open_collection(const coll_t& cid) final; + seastar::future<std::vector<coll_t>> list_collections() final; + + seastar::future<> do_transaction(CollectionRef ch, + ceph::os::Transaction&& txn) final; + + seastar::future<> write_meta(const std::string& key, + const std::string& value) final; + seastar::future<std::tuple<int, std::string>> + read_meta(const std::string& key) final; + uuid_d get_fsid() const final; + unsigned get_max_attr_name_length() const final; + + seastar::future<OmapIteratorRef> get_omap_iterator( + CollectionRef c, + const ghobject_t& oid); + + seastar::future<std::map<uint64_t, uint64_t>> fiemap(CollectionRef c, + const ghobject_t& oid, + uint64_t off, + uint64_t len); + +private: + int _remove(const coll_t& cid, const ghobject_t& oid); + int _touch(const coll_t& cid, const ghobject_t& oid); + int _write(const coll_t& cid, const ghobject_t& oid, + uint64_t offset, size_t len, const ceph::bufferlist& bl, + uint32_t fadvise_flags); + int _zero(const coll_t& cid, const ghobject_t& oid, + uint64_t offset, size_t len); + int _omap_clear( + const coll_t& cid, + const ghobject_t& oid); + int _omap_set_values( + const coll_t& cid, + const ghobject_t& oid, + 
std::map<std::string, ceph::bufferlist> &&aset); + int _omap_set_header( + const coll_t& cid, + const ghobject_t& oid, + const ceph::bufferlist &header); + int _omap_rmkeys( + const coll_t& cid, + const ghobject_t& oid, + const omap_keys_t& aset); + int _omap_rmkeyrange( + const coll_t& cid, + const ghobject_t& oid, + const std::string &first, + const std::string &last); + int _truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size); + int _setattrs(const coll_t& cid, const ghobject_t& oid, + std::map<std::string,bufferptr>& aset); + int _rm_attr(const coll_t& cid, const ghobject_t& oid, + string_view name); + int _create_collection(const coll_t& cid, int bits); + boost::intrusive_ptr<Collection> _get_collection(const coll_t& cid); +}; + +} diff --git a/src/crimson/os/futurized_collection.h b/src/crimson/os/futurized_collection.h new file mode 100644 index 000000000..06f7d2f47 --- /dev/null +++ b/src/crimson/os/futurized_collection.h @@ -0,0 +1,37 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> +#include <seastar/core/future.hh> + +#include "osd/osd_types.h" + +namespace crimson::os { +class FuturizedStore; + +class FuturizedCollection + : public boost::intrusive_ref_counter<FuturizedCollection, + boost::thread_unsafe_counter> +{ +public: + FuturizedCollection(const coll_t& cid) + : cid{cid} {} + virtual ~FuturizedCollection() {} + virtual seastar::future<> flush() { + return seastar::make_ready_future<>(); + } + virtual seastar::future<bool> flush_commit() { + return seastar::make_ready_future<bool>(true); + } + const coll_t& get_cid() const { + return cid; + } +private: + const coll_t cid; +}; + +using CollectionRef = boost::intrusive_ptr<FuturizedCollection>; +} diff --git a/src/crimson/os/futurized_store.cc b/src/crimson/os/futurized_store.cc new file mode 100644 index 000000000..bb73c3478 --- /dev/null +++ b/src/crimson/os/futurized_store.cc @@ -0,0 +1,22 @@ +#include "futurized_store.h" +#include "cyanstore/cyan_store.h" +#include "alienstore/alien_store.h" + +namespace crimson::os { + +std::unique_ptr<FuturizedStore> +FuturizedStore::create(const std::string& type, + const std::string& data, + const ConfigValues& values) +{ + if (type == "memstore") { + return std::make_unique<crimson::os::CyanStore>(data); + } else if (type == "bluestore") { + return std::make_unique<crimson::os::AlienStore>(data, values); + } else { + ceph_abort_msgf("unsupported objectstore type: %s", type.c_str()); + return {}; + } +} + +} diff --git a/src/crimson/os/futurized_store.h b/src/crimson/os/futurized_store.h new file mode 100644 index 000000000..bb173056b --- /dev/null +++ b/src/crimson/os/futurized_store.h @@ -0,0 +1,167 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <string> +#include <map> +#include <optional> +#include <vector> + +#include <seastar/core/future.hh> + +#include "crimson/osd/exceptions.h" +#include "include/buffer_fwd.h" +#include "include/uuid.h" +#include "osd/osd_types.h" + +namespace ceph::os { +class Transaction; +} + +namespace crimson::os { +class FuturizedCollection; + +class FuturizedStore { + +public: + class OmapIterator { + public: + virtual seastar::future<> seek_to_first() = 0; + virtual seastar::future<> upper_bound(const std::string &after) = 0; + virtual seastar::future<> lower_bound(const std::string &to) = 0; + 
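
For orientation (an editorial sketch, not part of the patch): the omap iterator contract declared here -- seek_to_first()/lower_bound()/upper_bound() position the cursor, while valid()/next()/key()/value() drain it -- is naturally consumed with a seastar loop. The helper name dump_omap and the use of fmt::print below are illustrative assumptions only.

    // Editorial sketch; assumes seastar's do_until() loop helper and libfmt.
    #include <seastar/core/loop.hh>
    #include <fmt/format.h>

    seastar::future<> dump_omap(crimson::os::FuturizedStore& store,
                                crimson::os::FuturizedStore::CollectionRef ch,
                                const ghobject_t& oid)
    {
      return store.get_omap_iterator(ch, oid).then([](auto iter) {
        return iter->seek_to_first().then([iter] {
          return seastar::do_until(
            [iter] { return !iter->valid(); },  // stop once the cursor passes the end
            [iter] {
              fmt::print("{} -> {} bytes\n", iter->key(), iter->value().length());
              return iter->next();
            });
        });
      });
    }

CyanStore::CyanOmapIterator above is the in-memory implementation of this interface; its methods return ready futures because the backing map is already resident.
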
virtual bool valid() const { + return false; + } + virtual seastar::future<> next() = 0; + virtual std::string key() { + return {}; + } + virtual seastar::future<std::string> tail_key() { + return seastar::make_ready_future<std::string>(); + } + virtual ceph::buffer::list value() { + return {}; + } + virtual int status() const { + return 0; + } + virtual ~OmapIterator() {} + private: + unsigned count = 0; + friend void intrusive_ptr_add_ref(FuturizedStore::OmapIterator* iter); + friend void intrusive_ptr_release(FuturizedStore::OmapIterator* iter); + }; + using OmapIteratorRef = boost::intrusive_ptr<OmapIterator>; + + static std::unique_ptr<FuturizedStore> create(const std::string& type, + const std::string& data, + const ConfigValues& values); + FuturizedStore() = default; + virtual ~FuturizedStore() = default; + + // no copying + explicit FuturizedStore(const FuturizedStore& o) = delete; + const FuturizedStore& operator=(const FuturizedStore& o) = delete; + + virtual seastar::future<> start() { + return seastar::now(); + } + virtual seastar::future<> stop() = 0; + virtual seastar::future<> mount() = 0; + virtual seastar::future<> umount() = 0; + + virtual seastar::future<> mkfs(uuid_d new_osd_fsid) = 0; + virtual seastar::future<store_statfs_t> stat() const = 0; + + using CollectionRef = boost::intrusive_ptr<FuturizedCollection>; + using read_errorator = crimson::errorator<crimson::ct_error::enoent, + crimson::ct_error::input_output_error>; + virtual read_errorator::future<ceph::bufferlist> read( + CollectionRef c, + const ghobject_t& oid, + uint64_t offset, + size_t len, + uint32_t op_flags = 0) = 0; + virtual read_errorator::future<ceph::bufferlist> readv( + CollectionRef c, + const ghobject_t& oid, + interval_set<uint64_t>& m, + uint32_t op_flags = 0) = 0; + + using get_attr_errorator = crimson::errorator< + crimson::ct_error::enoent, + crimson::ct_error::enodata>; + virtual get_attr_errorator::future<ceph::bufferptr> get_attr( + CollectionRef c, + const ghobject_t& oid, + std::string_view name) const = 0; + + using get_attrs_ertr = crimson::errorator< + crimson::ct_error::enoent>; + using attrs_t = std::map<std::string, ceph::bufferptr, std::less<>>; + virtual get_attrs_ertr::future<attrs_t> get_attrs( + CollectionRef c, + const ghobject_t& oid) = 0; + virtual seastar::future<struct stat> stat( + CollectionRef c, + const ghobject_t& oid) = 0; + + using omap_values_t = std::map<std::string, bufferlist, std::less<>>; + using omap_keys_t = std::set<std::string>; + virtual read_errorator::future<omap_values_t> omap_get_values( + CollectionRef c, + const ghobject_t& oid, + const omap_keys_t& keys) = 0; + virtual seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects( + CollectionRef c, + const ghobject_t& start, + const ghobject_t& end, + uint64_t limit) const = 0; + virtual read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values( + CollectionRef c, ///< [in] collection + const ghobject_t &oid, ///< [in] oid + const std::optional<std::string> &start ///< [in] start, empty for begin + ) = 0; ///< @return <done, values> values.empty() iff done + + virtual read_errorator::future<bufferlist> omap_get_header( + CollectionRef c, + const ghobject_t& oid) = 0; + + virtual seastar::future<CollectionRef> create_new_collection(const coll_t& cid) = 0; + virtual seastar::future<CollectionRef> open_collection(const coll_t& cid) = 0; + virtual seastar::future<std::vector<coll_t>> list_collections() = 0; + + virtual seastar::future<> do_transaction(CollectionRef ch, 
+ ceph::os::Transaction&& txn) = 0; + virtual seastar::future<OmapIteratorRef> get_omap_iterator( + CollectionRef ch, + const ghobject_t& oid) = 0; + virtual seastar::future<std::map<uint64_t, uint64_t>> fiemap( + CollectionRef ch, + const ghobject_t& oid, + uint64_t off, + uint64_t len) = 0; + + virtual seastar::future<> write_meta(const std::string& key, + const std::string& value) = 0; + virtual seastar::future<std::tuple<int, std::string>> read_meta( + const std::string& key) = 0; + virtual uuid_d get_fsid() const = 0; + virtual unsigned get_max_attr_name_length() const = 0; +}; + +inline void intrusive_ptr_add_ref(FuturizedStore::OmapIterator* iter) { + assert(iter); + iter->count++; +} + +inline void intrusive_ptr_release(FuturizedStore::OmapIterator* iter) { + assert(iter); + assert(iter->count > 0); + if ((--iter->count) == 0) { + delete iter; + } +} + +} diff --git a/src/crimson/os/seastore/CMakeLists.txt b/src/crimson/os/seastore/CMakeLists.txt new file mode 100644 index 000000000..77f8465cf --- /dev/null +++ b/src/crimson/os/seastore/CMakeLists.txt @@ -0,0 +1,37 @@ +add_library(crimson-seastore STATIC + cached_extent.cc + seastore_types.cc + segment_manager/ephemeral.cc + segment_manager/block.cc + transaction_manager.cc + journal.cc + cache.cc + lba_manager.cc + segment_cleaner.cc + lba_manager/btree/btree_lba_manager.cc + lba_manager/btree/lba_btree_node_impl.cc + lba_manager/btree/btree_range_pin.cc + onode.cc + onode_manager/simple-fltree/onode_block.cc + onode_manager/simple-fltree/onode_delta.cc + onode_manager/simple-fltree/onode_node.cc + onode_manager/staged-fltree/node.cc + onode_manager/staged-fltree/node_extent_manager.cc + onode_manager/staged-fltree/node_extent_manager/seastore.cc + onode_manager/staged-fltree/node_extent_mutable.cc + onode_manager/staged-fltree/node_impl.cc + onode_manager/staged-fltree/stages/item_iterator_stage.cc + onode_manager/staged-fltree/stages/key_layout.cc + onode_manager/staged-fltree/stages/node_stage_layout.cc + onode_manager/staged-fltree/stages/node_stage.cc + onode_manager/staged-fltree/stages/sub_items_stage.cc + onode_manager/staged-fltree/super.cc + onode_manager/staged-fltree/tree.cc + extentmap_manager.cc + extentmap_manager/btree/extentmap_btree_node_impl.cc + extentmap_manager/btree/btree_extentmap_manager.cc + seastore.cc + ../../../test/crimson/seastore/test_block.cc + ) +target_link_libraries(crimson-seastore + crimson) diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc new file mode 100644 index 000000000..6a406c1b8 --- /dev/null +++ b/src/crimson/os/seastore/cache.cc @@ -0,0 +1,541 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/os/seastore/cache.h" +#include "crimson/common/log.h" + +// included for get_extent_by_type +#include "crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.h" +#include "crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h" +#include "crimson/os/seastore/onode_manager/simple-fltree/onode_block.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h" +#include "test/crimson/seastore/test_block.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + +namespace crimson::os::seastore { + +Cache::Cache(SegmentManager &segment_manager) : + segment_manager(segment_manager) {} + +Cache::~Cache() +{ + for (auto &i: extents) { + logger().error("~Cache: extent {} still alive", i); + } + 
ceph_assert(extents.empty()); +} + +Cache::retire_extent_ret Cache::retire_extent_if_cached( + Transaction &t, paddr_t addr) +{ + if (auto ext = t.write_set.find_offset(addr); ext != t.write_set.end()) { + logger().debug("{}: found {} in t.write_set", __func__, addr); + t.add_to_retired_set(CachedExtentRef(&*ext)); + return retire_extent_ertr::now(); + } else if (auto iter = extents.find_offset(addr); + iter != extents.end()) { + auto ret = CachedExtentRef(&*iter); + return ret->wait_io().then([&t, ret=std::move(ret)]() mutable { + t.add_to_retired_set(ret); + return retire_extent_ertr::now(); + }); + } else { + return retire_extent_ertr::now(); + } +} + +void Cache::add_extent(CachedExtentRef ref) +{ + assert(ref->is_valid()); + extents.insert(*ref); + + if (ref->is_dirty()) { + add_to_dirty(ref); + } else { + ceph_assert(!ref->primary_ref_list_hook.is_linked()); + } + logger().debug("add_extent: {}", *ref); +} + +void Cache::mark_dirty(CachedExtentRef ref) +{ + if (ref->is_dirty()) { + assert(ref->primary_ref_list_hook.is_linked()); + return; + } + + add_to_dirty(ref); + ref->state = CachedExtent::extent_state_t::DIRTY; + + logger().debug("mark_dirty: {}", *ref); +} + +void Cache::add_to_dirty(CachedExtentRef ref) +{ + assert(ref->is_valid()); + assert(!ref->primary_ref_list_hook.is_linked()); + intrusive_ptr_add_ref(&*ref); + dirty.push_back(*ref); +} + +void Cache::remove_extent(CachedExtentRef ref) +{ + logger().debug("remove_extent: {}", *ref); + assert(ref->is_valid()); + extents.erase(*ref); + + if (ref->is_dirty()) { + ceph_assert(ref->primary_ref_list_hook.is_linked()); + dirty.erase(dirty.s_iterator_to(*ref)); + intrusive_ptr_release(&*ref); + } else { + ceph_assert(!ref->primary_ref_list_hook.is_linked()); + } +} + +void Cache::replace_extent(CachedExtentRef next, CachedExtentRef prev) +{ + assert(next->get_paddr() == prev->get_paddr()); + assert(next->version == prev->version + 1); + extents.replace(*next, *prev); + + if (prev->is_dirty()) { + ceph_assert(prev->primary_ref_list_hook.is_linked()); + auto prev_it = dirty.iterator_to(*prev); + dirty.insert(prev_it, *next); + dirty.erase(prev_it); + intrusive_ptr_release(&*prev); + intrusive_ptr_add_ref(&*next); + } else { + add_to_dirty(next); + } +} + +CachedExtentRef Cache::alloc_new_extent_by_type( + Transaction &t, ///< [in, out] current transaction + extent_types_t type, ///< [in] type tag + segment_off_t length ///< [in] length +) +{ + switch (type) { + case extent_types_t::ROOT: + assert(0 == "ROOT is never directly alloc'd"); + return CachedExtentRef(); + case extent_types_t::LADDR_INTERNAL: + return alloc_new_extent<lba_manager::btree::LBAInternalNode>(t, length); + case extent_types_t::LADDR_LEAF: + return alloc_new_extent<lba_manager::btree::LBALeafNode>(t, length); + case extent_types_t::ONODE_BLOCK: + return alloc_new_extent<OnodeBlock>(t, length); + case extent_types_t::EXTMAP_INNER: + return alloc_new_extent<extentmap_manager::ExtMapInnerNode>(t, length); + case extent_types_t::EXTMAP_LEAF: + return alloc_new_extent<extentmap_manager::ExtMapLeafNode>(t, length); + case extent_types_t::TEST_BLOCK: + return alloc_new_extent<TestBlock>(t, length); + case extent_types_t::TEST_BLOCK_PHYSICAL: + return alloc_new_extent<TestBlockPhysical>(t, length); + case extent_types_t::NONE: { + ceph_assert(0 == "NONE is an invalid extent type"); + return CachedExtentRef(); + } + default: + ceph_assert(0 == "impossible"); + return CachedExtentRef(); + } +} + +CachedExtentRef Cache::duplicate_for_write( + Transaction &t, + 
CachedExtentRef i) { + if (i->is_pending()) + return i; + + auto ret = i->duplicate_for_write(); + if (ret->get_type() == extent_types_t::ROOT) { + // root must be loaded before mutate + assert(t.root == i); + t.root = ret->cast<RootBlock>(); + } else { + ret->last_committed_crc = i->last_committed_crc; + ret->prior_instance = i; + t.add_mutated_extent(ret); + } + + ret->version++; + ret->state = CachedExtent::extent_state_t::MUTATION_PENDING; + logger().debug("Cache::duplicate_for_write: {} -> {}", *i, *ret); + return ret; +} + +std::optional<record_t> Cache::try_construct_record(Transaction &t) +{ + // First, validate read set + for (auto &i: t.read_set) { + if (i->state == CachedExtent::extent_state_t::INVALID) + return std::nullopt; + } + + record_t record; + + t.write_set.clear(); + + // Add new copy of mutated blocks, set_io_wait to block until written + record.deltas.reserve(t.mutated_block_list.size()); + for (auto &i: t.mutated_block_list) { + if (!i->is_valid()) { + logger().debug("try_construct_record: ignoring invalid {}", *i); + continue; + } + logger().debug("try_construct_record: mutating {}", *i); + + assert(i->prior_instance); + replace_extent(i, i->prior_instance); + + i->prepare_write(); + i->set_io_wait(); + + assert(i->get_version() > 0); + auto final_crc = i->get_crc32c(); + record.deltas.push_back( + delta_info_t{ + i->get_type(), + i->get_paddr(), + (i->is_logical() + ? i->cast<LogicalCachedExtent>()->get_laddr() + : L_ADDR_NULL), + i->last_committed_crc, + final_crc, + (segment_off_t)i->get_length(), + i->get_version() - 1, + i->get_delta() + }); + i->last_committed_crc = final_crc; + } + + if (t.root) { + logger().debug( + "{}: writing out root delta for {}", + __func__, + *t.root); + record.deltas.push_back( + delta_info_t{ + extent_types_t::ROOT, + paddr_t{}, + L_ADDR_NULL, + 0, + 0, + 0, + t.root->get_version() - 1, + t.root->get_delta() + }); + } + + // Transaction is now a go, set up in-memory cache state + // invalidate now invalid blocks + for (auto &i: t.retired_set) { + logger().debug("try_construct_record: retiring {}", *i); + ceph_assert(i->is_valid()); + remove_extent(i); + i->state = CachedExtent::extent_state_t::INVALID; + } + + record.extents.reserve(t.fresh_block_list.size()); + for (auto &i: t.fresh_block_list) { + logger().debug("try_construct_record: fresh block {}", *i); + bufferlist bl; + i->prepare_write(); + bl.append(i->get_bptr()); + if (i->get_type() == extent_types_t::ROOT) { + assert(0 == "ROOT never gets written as a fresh block"); + } + + assert(bl.length() == i->get_length()); + record.extents.push_back(extent_t{ + i->get_type(), + i->is_logical() + ? 
i->cast<LogicalCachedExtent>()->get_laddr() + : L_ADDR_NULL, + std::move(bl) + }); + } + + return std::make_optional<record_t>(std::move(record)); +} + +void Cache::complete_commit( + Transaction &t, + paddr_t final_block_start, + journal_seq_t seq, + SegmentCleaner *cleaner) +{ + if (t.root) { + remove_extent(root); + root = t.root; + root->state = CachedExtent::extent_state_t::DIRTY; + root->on_delta_write(final_block_start); + root->dirty_from = seq; + add_extent(root); + logger().debug("complete_commit: new root {}", *t.root); + } + + for (auto &i: t.fresh_block_list) { + i->set_paddr(final_block_start.add_relative(i->get_paddr())); + i->last_committed_crc = i->get_crc32c(); + i->on_initial_write(); + + if (!i->is_valid()) { + logger().debug("complete_commit: invalid {}", *i); + continue; + } + + i->state = CachedExtent::extent_state_t::CLEAN; + logger().debug("complete_commit: fresh {}", *i); + add_extent(i); + if (cleaner) { + cleaner->mark_space_used( + i->get_paddr(), + i->get_length()); + } + } + + // Add new copy of mutated blocks, set_io_wait to block until written + for (auto &i: t.mutated_block_list) { + logger().debug("complete_commit: mutated {}", *i); + assert(i->prior_instance); + i->on_delta_write(final_block_start); + i->prior_instance = CachedExtentRef(); + if (!i->is_valid()) { + logger().debug("complete_commit: not dirtying invalid {}", *i); + continue; + } + i->state = CachedExtent::extent_state_t::DIRTY; + if (i->version == 1) { + i->dirty_from = seq; + } + } + + if (cleaner) { + for (auto &i: t.retired_set) { + cleaner->mark_space_free( + i->get_paddr(), + i->get_length()); + } + } + + for (auto &i: t.mutated_block_list) { + i->complete_io(); + } +} + +void Cache::init() { + if (root) { + // initial creation will do mkfs followed by mount each of which calls init + remove_extent(root); + root = nullptr; + } + root = new RootBlock(); + root->state = CachedExtent::extent_state_t::DIRTY; + add_extent(root); +} + +Cache::mkfs_ertr::future<> Cache::mkfs(Transaction &t) +{ + return get_root(t).safe_then([this, &t](auto croot) { + duplicate_for_write(t, croot); + return mkfs_ertr::now(); + }); +} + +Cache::close_ertr::future<> Cache::close() +{ + root.reset(); + for (auto i = dirty.begin(); i != dirty.end(); ) { + auto ptr = &*i; + dirty.erase(i++); + intrusive_ptr_release(ptr); + } + return close_ertr::now(); +} + +Cache::replay_delta_ret +Cache::replay_delta( + journal_seq_t journal_seq, + paddr_t record_base, + const delta_info_t &delta) +{ + if (delta.type == extent_types_t::ROOT) { + logger().debug("replay_delta: found root delta"); + root->apply_delta_and_adjust_crc(record_base, delta.bl); + root->dirty_from = journal_seq; + return replay_delta_ertr::now(); + } else { + auto get_extent_if_cached = [this](paddr_t addr) + -> replay_delta_ertr::future<CachedExtentRef> { + auto retiter = extents.find_offset(addr); + if (retiter != extents.end()) { + return replay_delta_ertr::make_ready_future<CachedExtentRef>(&*retiter); + } else { + return replay_delta_ertr::make_ready_future<CachedExtentRef>(); + } + }; + auto extent_fut = delta.pversion == 0 ? 
+ get_extent_by_type( + delta.type, + delta.paddr, + delta.laddr, + delta.length) : + get_extent_if_cached( + delta.paddr); + return extent_fut.safe_then([=, &delta](auto extent) { + if (!extent) { + assert(delta.pversion > 0); + logger().debug( + "replay_delta: replaying {}, extent not present so delta is obsolete", + delta); + return; + } + + logger().debug( + "replay_delta: replaying {} on {}", + *extent, + delta); + + assert(extent->version == delta.pversion); + + assert(extent->last_committed_crc == delta.prev_crc); + extent->apply_delta_and_adjust_crc(record_base, delta.bl); + assert(extent->last_committed_crc == delta.final_crc); + + if (extent->version == 0) { + extent->dirty_from = journal_seq; + } + extent->version++; + mark_dirty(extent); + }); + } +} + +Cache::get_next_dirty_extents_ret Cache::get_next_dirty_extents( + journal_seq_t seq) +{ + std::vector<CachedExtentRef> ret; + for (auto i = dirty.begin(); i != dirty.end(); ++i) { + CachedExtentRef cand; + if (i->dirty_from < seq) { + assert(ret.empty() || ret.back()->dirty_from <= i->dirty_from); + ret.push_back(&*i); + } else { + break; + } + } + return seastar::do_with( + std::move(ret), + [](auto &ret) { + return seastar::do_for_each( + ret, + [](auto &ext) { + logger().debug( + "get_next_dirty_extents: waiting on {}", + *ext); + return ext->wait_io(); + }).then([&ret]() mutable { + return seastar::make_ready_future<std::vector<CachedExtentRef>>( + std::move(ret)); + }); + }); +} + +Cache::get_root_ret Cache::get_root(Transaction &t) +{ + if (t.root) { + return get_root_ret( + get_root_ertr::ready_future_marker{}, + t.root); + } else { + auto ret = root; + return ret->wait_io().then([ret, &t] { + t.root = ret; + return get_root_ret( + get_root_ertr::ready_future_marker{}, + ret); + }); + } +} + +using StagedOnodeBlock = crimson::os::seastore::onode::SeastoreNodeExtent; + +Cache::get_extent_ertr::future<CachedExtentRef> Cache::get_extent_by_type( + extent_types_t type, + paddr_t offset, + laddr_t laddr, + segment_off_t length) +{ + return [=] { + switch (type) { + case extent_types_t::ROOT: + assert(0 == "ROOT is never directly read"); + return get_extent_ertr::make_ready_future<CachedExtentRef>(); + case extent_types_t::LADDR_INTERNAL: + return get_extent<lba_manager::btree::LBAInternalNode>(offset, length + ).safe_then([](auto extent) { + return CachedExtentRef(extent.detach(), false /* add_ref */); + }); + case extent_types_t::LADDR_LEAF: + return get_extent<lba_manager::btree::LBALeafNode>(offset, length + ).safe_then([](auto extent) { + return CachedExtentRef(extent.detach(), false /* add_ref */); + }); + case extent_types_t::EXTMAP_INNER: + return get_extent<extentmap_manager::ExtMapInnerNode>(offset, length + ).safe_then([](auto extent) { + return CachedExtentRef(extent.detach(), false /* add_ref */); + }); + case extent_types_t::EXTMAP_LEAF: + return get_extent<extentmap_manager::ExtMapLeafNode>(offset, length + ).safe_then([](auto extent) { + return CachedExtentRef(extent.detach(), false /* add_ref */); + }); + case extent_types_t::ONODE_BLOCK: + return get_extent<OnodeBlock>(offset, length + ).safe_then([](auto extent) { + return CachedExtentRef(extent.detach(), false /* add_ref */); + }); + case extent_types_t::ONODE_BLOCK_STAGED: + return get_extent<StagedOnodeBlock>(offset, length + ).safe_then([](auto extent) { + return CachedExtentRef(extent.detach(), false /* add_ref */); + }); + case extent_types_t::TEST_BLOCK: + return get_extent<TestBlock>(offset, length + ).safe_then([](auto extent) { + return 
CachedExtentRef(extent.detach(), false /* add_ref */); + }); + case extent_types_t::TEST_BLOCK_PHYSICAL: + return get_extent<TestBlockPhysical>(offset, length + ).safe_then([](auto extent) { + return CachedExtentRef(extent.detach(), false /* add_ref */); + }); + case extent_types_t::NONE: { + ceph_assert(0 == "NONE is an invalid extent type"); + return get_extent_ertr::make_ready_future<CachedExtentRef>(); + } + default: + ceph_assert(0 == "impossible"); + return get_extent_ertr::make_ready_future<CachedExtentRef>(); + } + }().safe_then([laddr](CachedExtentRef e) { + assert(e->is_logical() == (laddr != L_ADDR_NULL)); + if (e->is_logical()) { + e->cast<LogicalCachedExtent>()->set_laddr(laddr); + } + return get_extent_ertr::make_ready_future<CachedExtentRef>(e); + }); +} + +} diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h new file mode 100644 index 000000000..624272162 --- /dev/null +++ b/src/crimson/os/seastore/cache.h @@ -0,0 +1,516 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> + +#include "seastar/core/shared_future.hh" + +#include "include/buffer.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/transaction.h" +#include "crimson/os/seastore/segment_manager.h" +#include "crimson/common/errorator.h" +#include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/root_block.h" +#include "crimson/os/seastore/segment_cleaner.h" + +namespace crimson::os::seastore { + +/** + * Cache + * + * This component is responsible for buffer management, including + * transaction lifecycle. + * + * Seastore transactions are expressed as an atomic combination of + * 1) newly written blocks + * 2) logical mutations to existing physical blocks + * + * See record_t + * + * As such, any transaction has 3 components: + * 1) read_set: references to extents read during the transaction + * See Transaction::read_set + * 2) write_set: references to extents to be written as: + * a) new physical blocks, see Transaction::fresh_block_list + * b) mutations to existing physical blocks, + * see Transaction::mutated_block_list + * 3) retired_set: extent refs to be retired either due to 2b or + * due to releasing the extent generally. + + * In the case of 2b, the CachedExtent will have been copied into + * a fresh CachedExtentRef such that the source extent ref is present + * in the read set and the newly allocated extent is present in the + * write_set. + * + * A transaction has 3 phases: + * 1) construction: user calls Cache::get_transaction() and populates + * the returned transaction by calling Cache methods + * 2) submission: user calls Cache::try_start_transaction(). If + * succcessful, the user may construct a record and submit the + * transaction to the journal. + * 3) completion: once the transaction is durable, the user must call + * Cache::complete_transaction() with the block offset to complete + * the transaction. + * + * Internally, in phase 1, the fields in Transaction are filled in. + * - reads may block if the referenced extent is being written + * - once a read obtains a particular CachedExtentRef for a paddr_t, + * it'll always get the same one until overwritten + * - once a paddr_t is overwritten or written, subsequent reads of + * that addr will get the new ref + * + * In phase 2, if all extents in the read set are valid (not expired), + * we can commit (otherwise, we fail and the user must retry). 
+ * - Expire all extents in the retired_set (they must all be valid) + * - Remove all extents in the retired_set from Cache::extents + * - Mark all extents in the write_set wait_io(), add promises to + * transaction + * - Merge Transaction::write_set into Cache::extents + * + * After phase 2, the user will submit the record to the journal. + * Once complete, we perform phase 3: + * - For each CachedExtent in block_list, call + * CachedExtent::complete_initial_write(paddr_t) with the block's + * final offset (inferred from the extent's position in the block_list + * and extent lengths). + * - For each block in mutation_list, call + * CachedExtent::delta_written(paddr_t) with the address of the start + * of the record + * - Complete all promises with the final record start paddr_t + */ +class Cache { +public: + Cache(SegmentManager &segment_manager); + ~Cache(); + + /** + * drop_from_cache + * + * Drop extent from cache. Intended for use when + * ref refers to a logically dead extent as during + * replay. + */ + void drop_from_cache(CachedExtentRef ref) { + remove_extent(ref); + } + + /// Declare ref retired in t + void retire_extent(Transaction &t, CachedExtentRef ref) { + t.add_to_retired_set(ref); + } + + /// Declare paddr retired in t, noop if not cached + using retire_extent_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using retire_extent_ret = retire_extent_ertr::future<>; + retire_extent_ret retire_extent_if_cached( + Transaction &t, paddr_t addr); + + /** + * get_root + * + * returns ref to current root or t.root if modified in t + */ + using get_root_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using get_root_ret = get_root_ertr::future<RootBlockRef>; + get_root_ret get_root(Transaction &t); + + /** + * get_root_fast + * + * returns t.root and assume it is already present/read in t + */ + RootBlockRef get_root_fast(Transaction &t) { + assert(t.root); + return t.root; + } + + /** + * get_extent + * + * returns ref to extent at offset~length of type T either from + * - extent_set if already in cache + * - disk + */ + using get_extent_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + template <typename T> + get_extent_ertr::future<TCachedExtentRef<T>> get_extent( + paddr_t offset, ///< [in] starting addr + segment_off_t length ///< [in] length + ) { + if (auto iter = extents.find_offset(offset); + iter != extents.end()) { + auto ret = TCachedExtentRef<T>(static_cast<T*>(&*iter)); + return ret->wait_io().then([ret=std::move(ret)]() mutable { + return get_extent_ertr::make_ready_future<TCachedExtentRef<T>>( + std::move(ret)); + }); + } else { + auto ref = CachedExtent::make_cached_extent_ref<T>( + alloc_cache_buf(length)); + ref->set_io_wait(); + ref->set_paddr(offset); + ref->state = CachedExtent::extent_state_t::CLEAN; + + return segment_manager.read( + offset, + length, + ref->get_bptr()).safe_then( + [this, ref=std::move(ref)]() mutable { + /* TODO: crc should be checked against LBA manager */ + ref->last_committed_crc = ref->get_crc32c(); + + ref->on_clean_read(); + ref->complete_io(); + add_extent(ref); + return get_extent_ertr::make_ready_future<TCachedExtentRef<T>>( + std::move(ref)); + }, + get_extent_ertr::pass_further{}, + crimson::ct_error::discard_all{}); + } + } + + /** + * get_extent_if_cached + * + * Returns extent at offset if in cache + */ + Transaction::get_extent_ret get_extent_if_cached( + Transaction &t, + paddr_t offset, + CachedExtentRef *out) { + auto result = t.get_extent(offset, out); + if 
(result != Transaction::get_extent_ret::ABSENT) { + return result; + } else if (auto iter = extents.find_offset(offset); + iter != extents.end()) { + if (out) + *out = &*iter; + return Transaction::get_extent_ret::PRESENT; + } else { + return Transaction::get_extent_ret::ABSENT; + } + } + + /** + * get_extent + * + * returns ref to extent at offset~length of type T either from + * - t if modified by t + * - extent_set if already in cache + * - disk + * + * t *must not* have retired offset + */ + template <typename T> + get_extent_ertr::future<TCachedExtentRef<T>> get_extent( + Transaction &t, ///< [in,out] current transaction + paddr_t offset, ///< [in] starting addr + segment_off_t length ///< [in] length + ) { + CachedExtentRef ret; + auto result = t.get_extent(offset, &ret); + if (result != Transaction::get_extent_ret::ABSENT) { + assert(result != Transaction::get_extent_ret::RETIRED); + return get_extent_ertr::make_ready_future<TCachedExtentRef<T>>( + ret->cast<T>()); + } else { + return get_extent<T>(offset, length).safe_then( + [&t](auto ref) mutable { + t.add_to_read_set(ref); + return get_extent_ertr::make_ready_future<TCachedExtentRef<T>>( + std::move(ref)); + }); + } + } + + /** + * get_extent_by_type + * + * Based on type, instantiate the correct concrete type + * and read in the extent at location offset~length. + */ + get_extent_ertr::future<CachedExtentRef> get_extent_by_type( + extent_types_t type, ///< [in] type tag + paddr_t offset, ///< [in] starting addr + laddr_t laddr, ///< [in] logical address if logical + segment_off_t length ///< [in] length + ); + + get_extent_ertr::future<CachedExtentRef> get_extent_by_type( + Transaction &t, + extent_types_t type, + paddr_t offset, + laddr_t laddr, + segment_off_t length) { + CachedExtentRef ret; + auto status = get_extent_if_cached(t, offset, &ret); + if (status == Transaction::get_extent_ret::RETIRED) { + return get_extent_ertr::make_ready_future<CachedExtentRef>(); + } else if (status == Transaction::get_extent_ret::PRESENT) { + return get_extent_ertr::make_ready_future<CachedExtentRef>(ret); + } else { + return get_extent_by_type(type, offset, laddr, length + ).safe_then([=, &t](CachedExtentRef ret) { + t.add_to_read_set(ret); + return get_extent_ertr::make_ready_future<CachedExtentRef>( + std::move(ret)); + }); + } + } + + /** + * get_extents + * + * returns refs to extents in extents from: + * - t if modified by t + * - extent_set if already in cache + * - disk + */ + template<typename T> + get_extent_ertr::future<t_pextent_list_t<T>> get_extents( + Transaction &t, ///< [in, out] current transaction + paddr_list_t &&extents ///< [in] extent list for lookup + ) { + auto retref = std::make_unique<t_pextent_list_t<T>>(); + auto &ret = *retref; + auto ext = std::make_unique<paddr_list_t>(std::move(extents)); + return crimson::do_for_each( + ext->begin(), + ext->end(), + [this, &t, &ret](auto &p) { + auto &[offset, len] = p; + return get_extent(t, offset, len).safe_then([&ret](auto cext) { + ret.push_back(std::move(cext)); + }); + }).safe_then([retref=std::move(retref), ext=std::move(ext)]() mutable { + return get_extent_ertr::make_ready_future<t_pextent_list_t<T>>( + std::move(*retref)); + }); + } + + /** + * alloc_new_extent + * + * Allocates a fresh extent. addr will be relative until commit. 
+ */ + template <typename T> + TCachedExtentRef<T> alloc_new_extent( + Transaction &t, ///< [in, out] current transaction + segment_off_t length ///< [in] length + ) { + auto ret = CachedExtent::make_cached_extent_ref<T>( + alloc_cache_buf(length)); + t.add_fresh_extent(ret); + ret->state = CachedExtent::extent_state_t::INITIAL_WRITE_PENDING; + return ret; + } + + /** + * alloc_new_extent + * + * Allocates a fresh extent. addr will be relative until commit. + */ + CachedExtentRef alloc_new_extent_by_type( + Transaction &t, ///< [in, out] current transaction + extent_types_t type, ///< [in] type tag + segment_off_t length ///< [in] length + ); + + /** + * Allocates mutable buffer from extent_set on offset~len + * + * TODO: Note, currently all implementations literally copy the + * buffer. This needn't be true, CachedExtent implementations could + * choose to refer to the same buffer unmodified until commit and just + * buffer the mutations in an ancillary data structure. + * + * @param current transaction + * @param extent to duplicate + * @return mutable extent + */ + CachedExtentRef duplicate_for_write( + Transaction &t, ///< [in, out] current transaction + CachedExtentRef i ///< [in] ref to existing extent + ); + + /** + * try_construct_record + * + * First checks for conflicts. If a racing write has mutated/retired + * an extent mutated by this transaction, nullopt will be returned. + * + * Otherwise, a record will be returned valid for use with Journal. + */ + std::optional<record_t> try_construct_record( + Transaction &t ///< [in, out] current transaction + ); + + /** + * complete_commit + * + * Must be called upon completion of write. Releases blocks on mutating + * extents, fills in addresses, and calls relevant callbacks on fresh + * and mutated exents. + */ + void complete_commit( + Transaction &t, ///< [in, out] current transaction + paddr_t final_block_start, ///< [in] offset of initial block + journal_seq_t seq, ///< [in] journal commit seq + SegmentCleaner *cleaner=nullptr ///< [out] optional segment stat listener + ); + + /** + * init + */ + void init(); + + /** + * mkfs + * + * Alloc initial root node and add to t. The intention is for other + * components to use t to adjust the resulting root ref prior to commit. + */ + using mkfs_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + mkfs_ertr::future<> mkfs(Transaction &t); + + /** + * close + * + * TODO: should flush dirty blocks + */ + using close_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + close_ertr::future<> close(); + + /** + * replay_delta + * + * Intended for use in Journal::delta. For each delta, should decode delta, + * read relevant block from disk or cache (using correct type), and call + * CachedExtent::apply_delta marking the extent dirty. + */ + using replay_delta_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using replay_delta_ret = replay_delta_ertr::future<>; + replay_delta_ret replay_delta( + journal_seq_t seq, + paddr_t record_block_base, + const delta_info_t &delta); + + /** + * init_cached_extents + * + * Calls passed lambda for each dirty cached block. Intended for use + * after replay to allow lba_manager (or w/e) to read in any ancestor + * blocks. 
+ */ + using init_cached_extents_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using init_cached_extents_ret = replay_delta_ertr::future<>; + template <typename F> + init_cached_extents_ret init_cached_extents( + Transaction &t, + F &&f) + { + std::vector<CachedExtentRef> dirty; + for (auto &e : extents) { + dirty.push_back(CachedExtentRef(&e)); + } + return seastar::do_with( + std::forward<F>(f), + std::move(dirty), + [&t](auto &f, auto &refs) mutable { + return crimson::do_for_each( + refs, + [&t, &f](auto &e) { return f(t, e); }); + }); + } + + /** + * update_extent_from_transaction + * + * Updates passed extent based on t. If extent has been retired, + * a null result will be returned. + */ + CachedExtentRef update_extent_from_transaction( + Transaction &t, + CachedExtentRef extent) { + if (extent->get_type() == extent_types_t::ROOT) { + if (t.root) { + return t.root; + } else { + return extent; + } + } else { + auto result = t.get_extent(extent->get_paddr(), &extent); + if (result == Transaction::get_extent_ret::RETIRED) { + return CachedExtentRef(); + } else { + return extent; + } + } + } + + /** + * print + * + * Dump summary of contents (TODO) + */ + std::ostream &print( + std::ostream &out) const { + return out; + } + + /// returns extents with dirty_from < seq + using get_next_dirty_extents_ertr = crimson::errorator<>; + using get_next_dirty_extents_ret = get_next_dirty_extents_ertr::future< + std::vector<CachedExtentRef>>; + get_next_dirty_extents_ret get_next_dirty_extents( + journal_seq_t seq); + +private: + SegmentManager &segment_manager; ///< ref to segment_manager + RootBlockRef root; ///< ref to current root + ExtentIndex extents; ///< set of live extents + + /** + * dirty + * + * holds refs to dirty extents. Ordered by CachedExtent::dirty_from. 
+ */ + CachedExtent::list dirty; + + /// alloc buffer for cached extent + bufferptr alloc_cache_buf(size_t size) { + // TODO: memory pooling etc + auto bp = ceph::bufferptr( + buffer::create_page_aligned(size)); + bp.zero(); + return bp; + } + + /// Add extent to extents handling dirty and refcounting + void add_extent(CachedExtentRef ref); + + /// Mark exising extent ref dirty -- mainly for replay + void mark_dirty(CachedExtentRef ref); + + /// Add dirty extent to dirty list + void add_to_dirty(CachedExtentRef ref); + + /// Remove extent from extents handling dirty and refcounting + void remove_extent(CachedExtentRef ref); + + /// Replace prev with next + void replace_extent(CachedExtentRef next, CachedExtentRef prev); +}; + +} diff --git a/src/crimson/os/seastore/cached_extent.cc b/src/crimson/os/seastore/cached_extent.cc new file mode 100644 index 000000000..7019b9fb8 --- /dev/null +++ b/src/crimson/os/seastore/cached_extent.cc @@ -0,0 +1,96 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/os/seastore/cached_extent.h" + +#include "crimson/common/log.h" + +namespace { + [[maybe_unused]] seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + +namespace crimson::os::seastore { + +#ifdef DEBUG_CACHED_EXTENT_REF + +void intrusive_ptr_add_ref(CachedExtent *ptr) +{ + intrusive_ptr_add_ref( + static_cast<boost::intrusive_ref_counter< + CachedExtent, + boost::thread_unsafe_counter>*>(ptr)); + logger().debug("intrusive_ptr_add_ref: {}", *ptr); +} + +void intrusive_ptr_release(CachedExtent *ptr) +{ + logger().debug("intrusive_ptr_release: {}", *ptr); + intrusive_ptr_release( + static_cast<boost::intrusive_ref_counter< + CachedExtent, + boost::thread_unsafe_counter>*>(ptr)); +} + +#endif + +std::ostream &operator<<(std::ostream &out, CachedExtent::extent_state_t state) +{ + switch (state) { + case CachedExtent::extent_state_t::INITIAL_WRITE_PENDING: + return out << "INITIAL_WRITE_PENDING"; + case CachedExtent::extent_state_t::MUTATION_PENDING: + return out << "MUTATION_PENDING"; + case CachedExtent::extent_state_t::CLEAN: + return out << "CLEAN"; + case CachedExtent::extent_state_t::DIRTY: + return out << "DIRTY"; + case CachedExtent::extent_state_t::INVALID: + return out << "INVALID"; + default: + return out << "UNKNOWN"; + } +} + +std::ostream &operator<<(std::ostream &out, const CachedExtent &ext) +{ + return ext.print(out); +} + +CachedExtent::~CachedExtent() +{ + if (parent_index) { + parent_index->erase(*this); + } +} + +std::ostream &LogicalCachedExtent::print_detail(std::ostream &out) const +{ + out << ", laddr=" << laddr; + if (pin) { + out << ", pin=" << *pin; + } else { + out << ", pin=empty"; + } + return print_detail_l(out); +} + +std::ostream &operator<<(std::ostream &out, const LBAPin &rhs) +{ + return out << "LBAPin(" << rhs.get_laddr() << "~" << rhs.get_length() + << "->" << rhs.get_paddr(); +} + +std::ostream &operator<<(std::ostream &out, const lba_pin_list_t &rhs) +{ + bool first = true; + out << '['; + for (auto &i: rhs) { + out << (first ? 
"" : ",") << *i; + first = false; + } + return out << ']'; +} + +} diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h new file mode 100644 index 000000000..974988489 --- /dev/null +++ b/src/crimson/os/seastore/cached_extent.h @@ -0,0 +1,659 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> + +#include <boost/intrusive/list.hpp> +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> + +#include "seastar/core/shared_future.hh" + +#include "include/buffer.h" +#include "crimson/common/errorator.h" +#include "crimson/os/seastore/seastore_types.h" + +namespace crimson::os::seastore { + +class CachedExtent; +using CachedExtentRef = boost::intrusive_ptr<CachedExtent>; + +// #define DEBUG_CACHED_EXTENT_REF +#ifdef DEBUG_CACHED_EXTENT_REF + +void intrusive_ptr_add_ref(CachedExtent *); +void intrusive_ptr_release(CachedExtent *); + +#endif + +template <typename T> +using TCachedExtentRef = boost::intrusive_ptr<T>; + +/** + * CachedExtent + */ +namespace onode { + class DummyNodeExtent; + class TestReplayExtent; +} +class ExtentIndex; +class CachedExtent : public boost::intrusive_ref_counter< + CachedExtent, boost::thread_unsafe_counter> { + enum class extent_state_t : uint8_t { + INITIAL_WRITE_PENDING, // In Transaction::write_set and fresh_block_list + MUTATION_PENDING, // In Transaction::write_set and mutated_block_list + CLEAN, // In Cache::extent_index, Transaction::read_set + // during write, contents match disk, version == 0 + DIRTY, // Same as CLEAN, but contents do not match disk, + // version > 0 + INVALID // Part of no ExtentIndex set + } state = extent_state_t::INVALID; + friend std::ostream &operator<<(std::ostream &, extent_state_t); + // allow a dummy extent to pretend it is at a specific state + friend class onode::DummyNodeExtent; + friend class onode::TestReplayExtent; + + uint32_t last_committed_crc = 0; + + // Points at current version while in state MUTATION_PENDING + CachedExtentRef prior_instance; + + /** + * dirty_from + * + * When dirty, indiciates the oldest journal entry which mutates + * this extent. + */ + journal_seq_t dirty_from; + +public: + /** + * duplicate_for_write + * + * Implementation should return a fresh CachedExtentRef + * which represents a copy of *this until on_delta_write() + * is complete, at which point the user may assume *this + * will be in state INVALID. As such, the implementation + * may involve a copy of get_bptr(), or an ancillary + * structure which defers updating the actual buffer until + * on_delta_write(). + */ + virtual CachedExtentRef duplicate_for_write() = 0; + + /** + * prepare_write + * + * Called prior to reading buffer. + * Implemenation may use this callback to fully write out + * updates to the buffer. + */ + virtual void prepare_write() {} + + /** + * on_initial_write + * + * Called after commit of extent. State will be CLEAN. + * Implentation may use this call to fixup the buffer + * with the newly available absolute get_paddr(). + */ + virtual void on_initial_write() {} + + /** + * on_clean_read + * + * Called after read of initially written extent. + * State will be CLEAN. Implentation may use this + * call to fixup the buffer with the newly available + * absolute get_paddr(). + */ + virtual void on_clean_read() {} + + /** + * on_delta_write + * + * Called after commit of delta. State will be DIRTY. 
+ * Implentation may use this call to fixup any relative + * references in the the buffer with the passed + * record_block_offset record location. + */ + virtual void on_delta_write(paddr_t record_block_offset) {} + + /** + * get_type + * + * Returns concrete type. + */ + virtual extent_types_t get_type() const = 0; + + virtual bool is_logical() const { + return false; + } + + friend std::ostream &operator<<(std::ostream &, extent_state_t); + virtual std::ostream &print_detail(std::ostream &out) const { return out; } + std::ostream &print(std::ostream &out) const { + out << "CachedExtent(addr=" << this + << ", type=" << get_type() + << ", version=" << version + << ", dirty_from=" << dirty_from + << ", paddr=" << get_paddr() + << ", state=" << state + << ", last_committed_crc=" << last_committed_crc + << ", refcount=" << use_count(); + print_detail(out); + return out << ")"; + } + + /** + * get_delta + * + * Must return a valid delta usable in apply_delta() in submit_transaction + * if state == MUTATION_PENDING. + */ + virtual ceph::bufferlist get_delta() = 0; + + /** + * apply_delta + * + * bl is a delta obtained previously from get_delta. The versions will + * match. Implementation should mutate buffer based on bl. base matches + * the address passed on_delta_write. + * + * Implementation *must* use set_last_committed_crc to update the crc to + * what the crc of the buffer would have been at submission. For physical + * extents that use base to adjust internal record-relative deltas, this + * means that the crc should be of the buffer after applying the delta, + * but before that adjustment. We do it this way because the crc in the + * commit path does not yet know the record base address. + * + * LogicalCachedExtent overrides this method and provides a simpler + * apply_delta override for LogicalCachedExtent implementers. + */ + virtual void apply_delta_and_adjust_crc( + paddr_t base, const ceph::bufferlist &bl) = 0; + + /** + * Called on dirty CachedExtent implementation after replay. + * Implementation should perform any reads/in-memory-setup + * necessary. (for instance, the lba implementation will use this + * to load in lba_manager blocks) + */ + using complete_load_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + virtual complete_load_ertr::future<> complete_load() { + return complete_load_ertr::now(); + } + + /** + * cast + * + * Returns a TCachedExtentRef of the specified type. + * TODO: add dynamic check that the requested type is actually correct. 
+ */ + template <typename T> + TCachedExtentRef<T> cast() { + return TCachedExtentRef<T>(static_cast<T*>(this)); + } + template <typename T> + TCachedExtentRef<const T> cast() const { + return TCachedExtentRef<const T>(static_cast<const T*>(this)); + } + + /// Returns true if extent is part of an open transaction + bool is_pending() const { + return state == extent_state_t::INITIAL_WRITE_PENDING || + state == extent_state_t::MUTATION_PENDING; + } + + /// Returns true if extent has a pending delta + bool is_mutation_pending() const { + return state == extent_state_t::MUTATION_PENDING; + } + + /// Returns true if extent is a fresh extent + bool is_initial_pending() const { + return state == extent_state_t::INITIAL_WRITE_PENDING; + } + + /// Returns true if extent is clean (does not have deltas on disk) + bool is_clean() const { + ceph_assert(is_valid()); + return state == extent_state_t::INITIAL_WRITE_PENDING || + state == extent_state_t::CLEAN; + } + + /// Returns true if extent is dirty (has deltas on disk) + bool is_dirty() const { + ceph_assert(is_valid()); + return !is_clean(); + } + + /// Returns true if extent has not been superceded or retired + bool is_valid() const { + return state != extent_state_t::INVALID; + } + + /** + * get_dirty_from + * + * Return journal location of oldest relevant delta. + */ + auto get_dirty_from() const { return dirty_from; } + + + /** + * get_paddr + * + * Returns current address of extent. If is_initial_pending(), address will + * be relative, otherwise address will be absolute. + */ + paddr_t get_paddr() const { return poffset; } + + /// Returns length of extent + extent_len_t get_length() const { return ptr.length(); } + + /// Returns version, get_version() == 0 iff is_clean() + extent_version_t get_version() const { + return version; + } + + /// Returns crc32c of buffer + uint32_t get_crc32c() { + return ceph_crc32c( + 1, + reinterpret_cast<const unsigned char *>(get_bptr().c_str()), + get_length()); + } + + /// Get ref to raw buffer + bufferptr &get_bptr() { return ptr; } + const bufferptr &get_bptr() const { return ptr; } + + /// Compare by paddr + friend bool operator< (const CachedExtent &a, const CachedExtent &b) { + return a.poffset < b.poffset; + } + friend bool operator> (const CachedExtent &a, const CachedExtent &b) { + return a.poffset > b.poffset; + } + friend bool operator== (const CachedExtent &a, const CachedExtent &b) { + return a.poffset == b.poffset; + } + + virtual ~CachedExtent(); + +private: + friend struct paddr_cmp; + friend struct ref_paddr_cmp; + friend class ExtentIndex; + + /// Pointer to containing index (or null) + ExtentIndex *parent_index = nullptr; + + /// hook for intrusive extent_index + boost::intrusive::set_member_hook<> extent_index_hook; + using index_member_options = boost::intrusive::member_hook< + CachedExtent, + boost::intrusive::set_member_hook<>, + &CachedExtent::extent_index_hook>; + using index = boost::intrusive::set<CachedExtent, index_member_options>; + friend class ExtentIndex; + friend class Transaction; + + /// hook for intrusive ref list (mainly dirty or lru list) + boost::intrusive::list_member_hook<> primary_ref_list_hook; + using primary_ref_list_member_options = boost::intrusive::member_hook< + CachedExtent, + boost::intrusive::list_member_hook<>, + &CachedExtent::primary_ref_list_hook>; + using list = boost::intrusive::list< + CachedExtent, + primary_ref_list_member_options>; + + /// Actual data contents + ceph::bufferptr ptr; + + /// number of deltas since initial write + extent_version_t 
version = EXTENT_VERSION_NULL; + + /// address of original block -- relative iff is_pending() and is_clean() + paddr_t poffset; + + /// used to wait while in-progress commit completes + std::optional<seastar::shared_promise<>> io_wait_promise; + void set_io_wait() { + ceph_assert(!io_wait_promise); + io_wait_promise = seastar::shared_promise<>(); + } + void complete_io() { + ceph_assert(io_wait_promise); + io_wait_promise->set_value(); + io_wait_promise = std::nullopt; + } + seastar::future<> wait_io() { + if (!io_wait_promise) { + return seastar::now(); + } else { + return io_wait_promise->get_shared_future(); + } + } + +protected: + CachedExtent(CachedExtent &&other) = delete; + CachedExtent(ceph::bufferptr &&ptr) : ptr(std::move(ptr)) {} + CachedExtent(const CachedExtent &other) + : state(other.state), + dirty_from(other.dirty_from), + ptr(other.ptr.c_str(), other.ptr.length()), + version(other.version), + poffset(other.poffset) {} + + struct share_buffer_t {}; + CachedExtent(const CachedExtent &other, share_buffer_t) : + state(other.state), + dirty_from(other.dirty_from), + ptr(other.ptr), + version(other.version), + poffset(other.poffset) {} + + + friend class Cache; + template <typename T> + static TCachedExtentRef<T> make_cached_extent_ref(bufferptr &&ptr) { + return new T(std::move(ptr)); + } + + CachedExtentRef get_prior_instance() { + return prior_instance; + } + + /// Sets last_committed_crc + void set_last_committed_crc(uint32_t crc) { + last_committed_crc = crc; + } + + void set_paddr(paddr_t offset) { poffset = offset; } + + /** + * maybe_generate_relative + * + * There are three kinds of addresses one might want to + * store within an extent: + * - addr for a block within the same transaction relative to the + * physical location of this extent in the + * event that we will read it in the initial read of the extent + * - addr relative to the physical location of the next record to a + * block within that record to contain a delta for this extent in + * the event that we'll read it from a delta and overlay it onto a + * dirty representation of the extent. + * - absolute addr to a block already written outside of the current + * transaction. + * + * This helper checks addr and the current state to create the correct + * reference. 
+ */ + paddr_t maybe_generate_relative(paddr_t addr) { + if (!addr.is_relative()) { + return addr; + } else if (is_mutation_pending()) { + return addr; + } else { + ceph_assert(is_initial_pending()); + ceph_assert(get_paddr().is_record_relative()); + return addr - get_paddr(); + } + } + +}; + +std::ostream &operator<<(std::ostream &, CachedExtent::extent_state_t); +std::ostream &operator<<(std::ostream &, const CachedExtent&); + +/// Compare extents by paddr +struct paddr_cmp { + bool operator()(paddr_t lhs, const CachedExtent &rhs) const { + return lhs < rhs.poffset; + } + bool operator()(const CachedExtent &lhs, paddr_t rhs) const { + return lhs.poffset < rhs; + } +}; + +/// Compare extent refs by paddr +struct ref_paddr_cmp { + using is_transparent = paddr_t; + bool operator()(const CachedExtentRef &lhs, const CachedExtentRef &rhs) const { + return lhs->poffset < rhs->poffset; + } + bool operator()(const paddr_t &lhs, const CachedExtentRef &rhs) const { + return lhs < rhs->poffset; + } + bool operator()(const CachedExtentRef &lhs, const paddr_t &rhs) const { + return lhs->poffset < rhs; + } +}; + +template <typename T, typename C> +class addr_extent_list_base_t + : public std::list<std::pair<T, C>> {}; + +using pextent_list_t = addr_extent_list_base_t<paddr_t, CachedExtentRef>; + +template <typename T, typename C, typename Cmp> +class addr_extent_set_base_t + : public std::set<C, Cmp> {}; + +using pextent_set_t = addr_extent_set_base_t< + paddr_t, + CachedExtentRef, + ref_paddr_cmp + >; + +template <typename T> +using t_pextent_list_t = addr_extent_list_base_t<paddr_t, TCachedExtentRef<T>>; + +/** + * ExtentIndex + * + * Index of CachedExtent & by poffset, does not hold a reference, + * user must ensure each extent is removed prior to deletion + */ +class ExtentIndex { + friend class Cache; + CachedExtent::index extent_index; +public: + auto get_overlap(paddr_t addr, segment_off_t len) { + auto bottom = extent_index.upper_bound(addr, paddr_cmp()); + if (bottom != extent_index.begin()) + --bottom; + if (bottom != extent_index.end() && + bottom->get_paddr().add_offset(bottom->get_length()) <= addr) + ++bottom; + + auto top = extent_index.lower_bound(addr.add_offset(len), paddr_cmp()); + return std::make_pair( + bottom, + top + ); + } + + void clear() { + extent_index.clear(); + } + + void insert(CachedExtent &extent) { + // sanity check + auto [a, b] = get_overlap( + extent.get_paddr(), + extent.get_length()); + ceph_assert(a == b); + + extent_index.insert(extent); + extent.parent_index = this; + } + + void erase(CachedExtent &extent) { + extent_index.erase(extent); + extent.parent_index = nullptr; + } + + void replace(CachedExtent &to, CachedExtent &from) { + extent_index.replace_node(extent_index.s_iterator_to(from), to); + from.parent_index = nullptr; + to.parent_index = this; + } + + bool empty() const { + return extent_index.empty(); + } + + auto find_offset(paddr_t offset) { + return extent_index.find(offset, paddr_cmp()); + } + + auto begin() { + return extent_index.begin(); + } + + auto end() { + return extent_index.end(); + } + + void merge(ExtentIndex &&other) { + for (auto it = other.extent_index.begin(); + it != other.extent_index.end(); + ) { + auto &ext = *it; + ++it; + other.extent_index.erase(ext); + extent_index.insert(ext); + } + } + + template <typename T> + void remove(T &l) { + for (auto &ext : l) { + extent_index.erase(ext); + } + } +}; + +class LogicalCachedExtent; +class LBAPin; +using LBAPinRef = std::unique_ptr<LBAPin>; +class LBAPin { +public: + virtual void
link_extent(LogicalCachedExtent *ref) = 0; + virtual void take_pin(LBAPin &pin) = 0; + virtual extent_len_t get_length() const = 0; + virtual paddr_t get_paddr() const = 0; + virtual laddr_t get_laddr() const = 0; + virtual LBAPinRef duplicate() const = 0; + + virtual ~LBAPin() {} +}; +std::ostream &operator<<(std::ostream &out, const LBAPin &rhs); + +using lba_pin_list_t = std::list<LBAPinRef>; + +std::ostream &operator<<(std::ostream &out, const lba_pin_list_t &rhs); + + +/** + * LogicalCachedExtent + * + * CachedExtent with associated lba mapping. + * + * Users of TransactionManager should be using extents derived from + * LogicalCachedExtent. + */ +class LogicalCachedExtent : public CachedExtent { +public: + template <typename... T> + LogicalCachedExtent(T&&... t) : CachedExtent(std::forward<T>(t)...) {} + + void set_pin(LBAPinRef &&npin) { + assert(!pin); + pin = std::move(npin); + laddr = pin->get_laddr(); + pin->link_extent(this); + } + + bool has_pin() const { + return !!pin; + } + + LBAPin &get_pin() { + assert(pin); + return *pin; + } + + laddr_t get_laddr() const { + assert(laddr != L_ADDR_NULL); + return laddr; + } + + void set_laddr(laddr_t nladdr) { + laddr = nladdr; + } + + void apply_delta_and_adjust_crc( + paddr_t base, const ceph::bufferlist &bl) final { + apply_delta(bl); + set_last_committed_crc(get_crc32c()); + } + + bool is_logical() const final { + return true; + } + + std::ostream &print_detail(std::ostream &out) const final; +protected: + virtual void apply_delta(const ceph::bufferlist &bl) = 0; + virtual std::ostream &print_detail_l(std::ostream &out) const { + return out; + } + + virtual void logical_on_delta_write() {} + + void on_delta_write(paddr_t record_block_offset) final { + assert(get_prior_instance()); + pin->take_pin(*(get_prior_instance()->cast<LogicalCachedExtent>()->pin)); + logical_on_delta_write(); + } + +private: + laddr_t laddr = L_ADDR_NULL; + LBAPinRef pin; +}; + +using LogicalCachedExtentRef = TCachedExtentRef<LogicalCachedExtent>; +struct ref_laddr_cmp { + using is_transparent = laddr_t; + bool operator()(const LogicalCachedExtentRef &lhs, + const LogicalCachedExtentRef &rhs) const { + return lhs->get_laddr() < rhs->get_laddr(); + } + bool operator()(const laddr_t &lhs, + const LogicalCachedExtentRef &rhs) const { + return lhs < rhs->get_laddr(); + } + bool operator()(const LogicalCachedExtentRef &lhs, + const laddr_t &rhs) const { + return lhs->get_laddr() < rhs; + } +}; + +using lextent_set_t = addr_extent_set_base_t< + laddr_t, + LogicalCachedExtentRef, + ref_laddr_cmp + >; + +template <typename T> +using lextent_list_t = addr_extent_list_base_t< + laddr_t, TCachedExtentRef<T>>; + +} diff --git a/src/crimson/os/seastore/extentmap_manager.cc b/src/crimson/os/seastore/extentmap_manager.cc new file mode 100644 index 000000000..32de3a6ed --- /dev/null +++ b/src/crimson/os/seastore/extentmap_manager.cc @@ -0,0 +1,32 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#include <experimental/iterator> +#include <iostream> + +#include "crimson/os/seastore/transaction_manager.h" +#include "crimson/os/seastore/extentmap_manager.h" +#include "crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.h" +namespace crimson::os::seastore::extentmap_manager { + +ExtentMapManagerRef create_extentmap_manager(TransactionManager &trans_manager) { + return ExtentMapManagerRef(new BtreeExtentMapManager(trans_manager)); +} + +} + +namespace crimson::os::seastore { + +std::ostream 
&operator<<(std::ostream &out, const extent_mapping_t &rhs) +{ + return out << "extent_mapping_t (" << rhs.logical_offset << "~" << rhs.length + << "->" << rhs.laddr << ")"; +} + +std::ostream &operator<<(std::ostream &out, const extent_map_list_t &rhs) +{ + out << '['; + std::copy(std::begin(rhs), std::end(rhs), std::experimental::make_ostream_joiner(out, ", ")); + return out << ']'; +} + +} diff --git a/src/crimson/os/seastore/extentmap_manager.h b/src/crimson/os/seastore/extentmap_manager.h new file mode 100644 index 000000000..7d5223b94 --- /dev/null +++ b/src/crimson/os/seastore/extentmap_manager.h @@ -0,0 +1,124 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iosfwd> +#include <list> + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> + +#include <seastar/core/future.hh> + +#include "crimson/osd/exceptions.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/transaction_manager.h" + +#define PAGE_SIZE 4096 +#define EXTMAP_BLOCK_SIZE 4096 + +namespace crimson::os::seastore { + +struct lext_map_val_t { + laddr_t laddr; + extent_len_t length = 0; + + lext_map_val_t( + laddr_t laddr, + extent_len_t length) + : laddr(laddr), length(length) {} + +}; + +class extent_mapping_t +{ +public: + objaddr_t logical_offset = 0; //offset in object + laddr_t laddr; // lextent start address aligned with block size. + extent_len_t length = 0; + explicit extent_mapping_t(objaddr_t lo) : logical_offset(lo) { } + + extent_mapping_t( + objaddr_t lo, + laddr_t laddr, + extent_len_t length) + : logical_offset(lo), laddr(laddr), length(length) {} + + ~extent_mapping_t() {} +}; + +enum class extmap_root_state_t : uint8_t { + INITIAL = 0, + MUTATED = 1, + NONE = 0xFF +}; + +using extent_map_list_t = std::list<extent_mapping_t>; +std::ostream &operator<<(std::ostream &out, const extent_mapping_t &rhs); +std::ostream &operator<<(std::ostream &out, const extent_map_list_t &rhs); + +struct extmap_root_t { + depth_t depth = 0; + extmap_root_state_t state; + laddr_t extmap_root_laddr; + extmap_root_t(depth_t dep, laddr_t laddr) + : depth(dep), + extmap_root_laddr(laddr) { state = extmap_root_state_t::INITIAL; } +}; + +/** + * Abstract interface for managing the object inner offset to logical addr mapping; + * each onode has its own extentmap tree. + */ +class ExtentMapManager { +public: + using initialize_extmap_ertr = TransactionManager::alloc_extent_ertr; + using initialize_extmap_ret = initialize_extmap_ertr::future<extmap_root_t>; + virtual initialize_extmap_ret initialize_extmap(Transaction &t) = 0; + + /* find_lextent + * + * Return a list of all extent_mapping_t overlapping any portion of lo~len, + * or, if no overlapping extent_mapping_t is found, the next extent after the range.
+ */ + using find_lextent_ertr = TransactionManager::read_extent_ertr; + using find_lextent_ret = find_lextent_ertr::future<extent_map_list_t>; + virtual find_lextent_ret + find_lextent(const extmap_root_t &extmap_root, Transaction &t, objaddr_t lo, extent_len_t len) = 0; + + /* add_lextent + * + * add a new mapping (object offset -> laddr, length) to the extent map; + * return the added extent_mapping_t + */ + using add_lextent_ertr = TransactionManager::read_extent_ertr; + using add_lextent_ret = add_lextent_ertr::future<extent_mapping_t>; + virtual add_lextent_ret + add_lextent(extmap_root_t &extmap_root, Transaction &t, objaddr_t lo, lext_map_val_t val) = 0; + + /* rm_lextent + * + * remove an existing extent mapping from the extent map; + * return true if the extent mapping was removed, otherwise return false + */ + using rm_lextent_ertr = TransactionManager::read_extent_ertr; + using rm_lextent_ret = rm_lextent_ertr::future<bool>; + virtual rm_lextent_ret rm_lextent(extmap_root_t &extmap_root, Transaction &t, objaddr_t lo, lext_map_val_t val) = 0; + + virtual ~ExtentMapManager() {} +}; +using ExtentMapManagerRef = std::unique_ptr<ExtentMapManager>; + +namespace extentmap_manager { +/* create an ExtentMapManager for an extentmap. + * If it is a new extmap, initialize_extmap must be called after create_extentmap_manager + * to initialize the extent map before using it. + * If it is an existing extmap, initialize_extmap is not needed. + */ +ExtentMapManagerRef create_extentmap_manager( + TransactionManager &trans_manager); + +} + +} diff --git a/src/crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.cc b/src/crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.cc new file mode 100644 index 000000000..f7609d3e8 --- /dev/null +++ b/src/crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.cc @@ -0,0 +1,118 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#include <sys/mman.h> +#include <string.h> + +#include "crimson/common/log.h" + +#include "include/buffer.h" +#include "crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.h" +#include "crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + +namespace crimson::os::seastore::extentmap_manager { + +BtreeExtentMapManager::BtreeExtentMapManager( + TransactionManager &tm) + : tm(tm) {} + +BtreeExtentMapManager::initialize_extmap_ret +BtreeExtentMapManager::initialize_extmap(Transaction &t) +{ + + logger().debug("{}", __func__); + return tm.alloc_extent<ExtMapLeafNode>(t, L_ADDR_MIN, EXTMAP_BLOCK_SIZE) + .safe_then([](auto&& root_extent) { + root_extent->set_size(0); + extmap_node_meta_t meta{1}; + root_extent->set_meta(meta); + extmap_root_t extmap_root = extmap_root_t(1, root_extent->get_laddr()); + return initialize_extmap_ertr::make_ready_future<extmap_root_t>(extmap_root); + }); +} + +BtreeExtentMapManager::get_root_ret +BtreeExtentMapManager::get_extmap_root(const extmap_root_t &extmap_root, Transaction &t) +{ + assert(extmap_root.extmap_root_laddr != L_ADDR_NULL); + laddr_t laddr = extmap_root.extmap_root_laddr; + return extmap_load_extent(get_ext_context(t), laddr, extmap_root.depth); +} + +BtreeExtentMapManager::find_lextent_ret +BtreeExtentMapManager::find_lextent(const extmap_root_t &extmap_root, Transaction &t, + objaddr_t lo, extent_len_t len) +{ + logger().debug("{}: {}, {}", __func__, lo, len); + return
get_extmap_root(extmap_root, t).safe_then([this, &t, lo, len](auto&& extent) { + return extent->find_lextent(get_ext_context(t), lo, len); + }).safe_then([](auto &&e) { + logger().debug("{}: found_lextent {}", __func__, e); + return find_lextent_ret( + find_lextent_ertr::ready_future_marker{}, + std::move(e)); + }); + +} + +BtreeExtentMapManager::add_lextent_ret +BtreeExtentMapManager::add_lextent(extmap_root_t &extmap_root, Transaction &t, + objaddr_t lo, lext_map_val_t val) +{ + logger().debug("{}: {}, {}, {}", __func__, lo, val.laddr, val.length); + return get_extmap_root(extmap_root, t).safe_then([this, &extmap_root, &t, lo, val](auto &&root) { + return insert_lextent(extmap_root, t, root, lo, val); + }).safe_then([](auto ret) { + logger().debug("{}: {}", __func__, ret); + return add_lextent_ret( + add_lextent_ertr::ready_future_marker{}, + std::move(ret)); + }); + +} + +BtreeExtentMapManager::insert_lextent_ret +BtreeExtentMapManager::insert_lextent(extmap_root_t &extmap_root, Transaction &t, + ExtMapNodeRef root, objaddr_t logical_offset, lext_map_val_t val) +{ + auto split = insert_lextent_ertr::make_ready_future<ExtMapNodeRef>(root); + if (root->at_max_capacity()) { + logger().debug("{}::splitting root {}", __func__, *root); + split = root->extmap_alloc_extent<ExtMapInnerNode>(get_ext_context(t), EXTMAP_BLOCK_SIZE) + .safe_then([this, &extmap_root, root, &t, logical_offset](auto&& nroot) { + extmap_node_meta_t meta{root->get_node_meta().depth + 1}; + nroot->set_meta(meta); + nroot->journal_insert(nroot->begin(), OBJ_ADDR_MIN, + root->get_laddr(), nullptr); + extmap_root.extmap_root_laddr = nroot->get_laddr(); + extmap_root.depth = root->get_node_meta().depth + 1; + extmap_root.state = extmap_root_state_t::MUTATED; + return nroot->split_entry(get_ext_context(t), logical_offset, nroot->begin(), root); + }); + } + return split.safe_then([this, &t, logical_offset, val](ExtMapNodeRef node) { + return node->insert(get_ext_context(t), logical_offset, val); + }); +} + +BtreeExtentMapManager::rm_lextent_ret +BtreeExtentMapManager::rm_lextent(extmap_root_t &extmap_root, Transaction &t, objaddr_t lo, lext_map_val_t val) +{ + logger().debug("{}: {}, {}, {}", __func__, lo, val.laddr, val.length); + return get_extmap_root(extmap_root, t).safe_then([this, &t, lo, val](auto extent) { + return extent->rm_lextent(get_ext_context(t), lo, val); + }).safe_then([](auto removed) { + logger().debug("{}: {}", __func__, removed); + return rm_lextent_ret( + rm_lextent_ertr::ready_future_marker{}, + removed); + }); +} + + +} diff --git a/src/crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.h b/src/crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.h new file mode 100644 index 000000000..db676f41d --- /dev/null +++ b/src/crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.h @@ -0,0 +1,64 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> +#include <seastar/core/future.hh> + +#include "include/ceph_assert.h" +#include "crimson/osd/exceptions.h" + +#include "crimson/os/seastore/extentmap_manager.h" +#include "crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/transaction_manager.h" + +namespace crimson::os::seastore::extentmap_manager { +/** + * BtreeExtentMapManager + * + * Uses a 
btree to track : + * objaddr_t -> laddr_t mapping for each onode extentmap + */ + +class BtreeExtentMapManager : public ExtentMapManager { + TransactionManager &tm; + + ext_context_t get_ext_context(Transaction &t) { + return ext_context_t{tm,t}; + } + + /* get_extmap_root + * + * load extent map tree root node + */ + using get_root_ertr = TransactionManager::read_extent_ertr; + using get_root_ret = get_root_ertr::future<ExtMapNodeRef>; + get_root_ret get_extmap_root(const extmap_root_t &extmap_root, Transaction &t); + + using insert_lextent_ertr = TransactionManager::read_extent_ertr; + using insert_lextent_ret = insert_lextent_ertr::future<extent_mapping_t >; + insert_lextent_ret insert_lextent(extmap_root_t &extmap_root, Transaction &t, + ExtMapNodeRef extent, objaddr_t lo, + lext_map_val_t val); + +public: + explicit BtreeExtentMapManager(TransactionManager &tm); + + initialize_extmap_ret initialize_extmap(Transaction &t) final; + + find_lextent_ret find_lextent(const extmap_root_t &extmap_root, Transaction &t, objaddr_t lo, extent_len_t len) final; + + add_lextent_ret add_lextent(extmap_root_t &extmap_root, Transaction &t, objaddr_t lo, lext_map_val_t val) final; + + rm_lextent_ret rm_lextent(extmap_root_t &extmap_root, Transaction &t, objaddr_t lo, lext_map_val_t val) final; + + +}; +using BtreeExtentMapManagerRef = std::unique_ptr<BtreeExtentMapManager>; + +} diff --git a/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node.h b/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node.h new file mode 100644 index 000000000..3937bd049 --- /dev/null +++ b/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node.h @@ -0,0 +1,145 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + + +#pragma once + +#include <boost/iterator/counting_iterator.hpp> + +#include "crimson/common/log.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/transaction_manager.h" +#include "crimson/os/seastore/extentmap_manager.h" + +namespace crimson::os::seastore::extentmap_manager{ + +struct ext_context_t { + TransactionManager &tm; + Transaction &t; +}; + +struct extmap_node_meta_t { + depth_t depth = 0; + + std::pair<extmap_node_meta_t, extmap_node_meta_t> split_into(objaddr_t pivot) const { + return std::make_pair( + extmap_node_meta_t{depth}, + extmap_node_meta_t{depth}); + } + + static extmap_node_meta_t merge_from( + const extmap_node_meta_t &lhs, const extmap_node_meta_t &rhs) { + assert(lhs.depth == rhs.depth); + return extmap_node_meta_t{lhs.depth}; + } + + static std::pair<extmap_node_meta_t, extmap_node_meta_t> + rebalance(const extmap_node_meta_t &lhs, const extmap_node_meta_t &rhs, laddr_t pivot) { + assert(lhs.depth == rhs.depth); + return std::make_pair( + extmap_node_meta_t{lhs.depth}, + extmap_node_meta_t{lhs.depth}); + } +}; + +struct ExtMapNode : LogicalCachedExtent { + using ExtMapNodeRef = TCachedExtentRef<ExtMapNode>; + + ExtMapNode(ceph::bufferptr &&ptr) : LogicalCachedExtent(std::move(ptr)) {} + ExtMapNode(const ExtMapNode &other) + : LogicalCachedExtent(other) {} + + using find_lextent_ertr = ExtentMapManager::find_lextent_ertr; + using find_lextent_ret = ExtentMapManager::find_lextent_ret; + virtual find_lextent_ret find_lextent(ext_context_t ec, + objaddr_t lo, extent_len_t len) = 0; + + using insert_ertr = TransactionManager::read_extent_ertr; + using insert_ret = insert_ertr::future<extent_mapping_t>; + virtual insert_ret insert(ext_context_t ec, objaddr_t lo, 
lext_map_val_t val) = 0; + + using rm_lextent_ertr = TransactionManager::read_extent_ertr; + using rm_lextent_ret = rm_lextent_ertr::future<bool>; + virtual rm_lextent_ret rm_lextent(ext_context_t ec, objaddr_t lo, lext_map_val_t val) = 0; + + using split_children_ertr = TransactionManager::alloc_extent_ertr; + using split_children_ret = split_children_ertr::future + <std::tuple<ExtMapNodeRef, ExtMapNodeRef, uint32_t>>; + virtual split_children_ret make_split_children(ext_context_t ec) = 0; + + using full_merge_ertr = TransactionManager::alloc_extent_ertr; + using full_merge_ret = full_merge_ertr::future<ExtMapNodeRef>; + virtual full_merge_ret make_full_merge(ext_context_t ec, ExtMapNodeRef right) = 0; + + using make_balanced_ertr = TransactionManager::alloc_extent_ertr; + using make_balanced_ret = make_balanced_ertr::future + <std::tuple<ExtMapNodeRef, ExtMapNodeRef, uint32_t>>; + virtual make_balanced_ret + make_balanced(ext_context_t ec, ExtMapNodeRef right, bool prefer_left) = 0; + + virtual extmap_node_meta_t get_node_meta() const = 0; + + virtual bool at_max_capacity() const = 0; + virtual bool at_min_capacity() const = 0; + virtual unsigned get_node_size() const = 0; + virtual ~ExtMapNode() = default; + + using alloc_ertr = TransactionManager::alloc_extent_ertr; + template<class T> + alloc_ertr::future<TCachedExtentRef<T>> + extmap_alloc_extent(ext_context_t ec, extent_len_t len) { + return ec.tm.alloc_extent<T>(ec.t, L_ADDR_MIN, len).safe_then( + [](auto&& extent) { + return alloc_ertr::make_ready_future<TCachedExtentRef<T>>(std::move(extent)); + }); + } + + template<class T> + alloc_ertr::future<std::pair<TCachedExtentRef<T>, TCachedExtentRef<T>>> + extmap_alloc_2extents(ext_context_t ec, extent_len_t len) { + return seastar::do_with(std::pair<TCachedExtentRef<T>, TCachedExtentRef<T>>(), + [ec, len] (auto &extents) { + return crimson::do_for_each(boost::make_counting_iterator(0), + boost::make_counting_iterator(2), + [ec, len, &extents] (auto i) { + return ec.tm.alloc_extent<T>(ec.t, L_ADDR_MIN, len).safe_then( + [i, &extents](auto &&node) { + if (i == 0) + extents.first = node; + if (i == 1) + extents.second = node; + }); + }).safe_then([&extents] { + return alloc_ertr::make_ready_future + <std::pair<TCachedExtentRef<T>, TCachedExtentRef<T>>>(std::move(extents)); + }); + }); + } + + using retire_ertr = crimson::errorator< + crimson::ct_error::enoent, + crimson::ct_error::input_output_error>; + using retire_ret = retire_ertr::future<std::list<unsigned>>; + retire_ret + extmap_retire_node(ext_context_t ec, std::list<laddr_t> dec_laddrs) { + return seastar::do_with(std::move(dec_laddrs), std::list<unsigned>(), + [ec] (auto &&dec_laddrs, auto &refcnt) { + return crimson::do_for_each(dec_laddrs.begin(), dec_laddrs.end(), + [ec, &refcnt] (auto &laddr) { + return ec.tm.dec_ref(ec.t, laddr).safe_then([&refcnt] (auto ref) { + refcnt.push_back(ref); + }); + }).safe_then([&refcnt] { + return retire_ertr::make_ready_future<std::list<unsigned>>(std::move(refcnt)); + }); + }); + } + +}; + +using ExtMapNodeRef = ExtMapNode::ExtMapNodeRef; + +TransactionManager::read_extent_ertr::future<ExtMapNodeRef> +extmap_load_extent(ext_context_t ec, laddr_t laddr, depth_t depth); + +} diff --git a/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.cc b/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.cc new file mode 100644 index 000000000..7bf8680a5 --- /dev/null +++ b/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.cc @@ -0,0 
+1,373 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <sys/mman.h> +#include <string.h> + +#include <memory> +#include <string.h> + +#include "include/buffer.h" +#include "include/byteorder.h" +#include "crimson/os/seastore/transaction_manager.h" +#include "crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node.h" +#include "crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + +namespace crimson::os::seastore::extentmap_manager { + +std::ostream &ExtMapInnerNode::print_detail_l(std::ostream &out) const +{ + return out << ", size=" << get_size() + << ", depth=" << get_meta().depth; +} + +ExtMapInnerNode::find_lextent_ret +ExtMapInnerNode::find_lextent(ext_context_t ec, objaddr_t lo, extent_len_t len) +{ + auto [begin, end] = bound(lo, lo + len); + auto result_up = std::make_unique<extent_map_list_t>(); + auto &result = *result_up; + return crimson::do_for_each( + std::move(begin), + std::move(end), + [this, ec, &result, lo, len](const auto &val) mutable { + return extmap_load_extent(ec, val.get_val(), get_meta().depth - 1).safe_then( + [ec, &result, lo, len](auto extent) mutable { + return extent->find_lextent(ec, lo, len).safe_then( + [&result](auto item_list) mutable { + result.splice(result.end(), item_list, + item_list.begin(), item_list.end()); + }); + }); + }).safe_then([result=std::move(result_up)] { + return find_lextent_ret( + find_lextent_ertr::ready_future_marker{}, + std::move(*result)); + }); +} + +ExtMapInnerNode::insert_ret +ExtMapInnerNode::insert(ext_context_t ec, objaddr_t lo, lext_map_val_t val) +{ + auto insertion_pt = get_containing_child(lo); + assert(insertion_pt != end()); + return extmap_load_extent(ec, insertion_pt->get_val(), get_meta().depth - 1).safe_then( + [this, ec, insertion_pt, lo, val=std::move(val)](auto extent) mutable { + return extent->at_max_capacity() ? 
+ split_entry(ec, lo, insertion_pt, extent) : + insert_ertr::make_ready_future<ExtMapNodeRef>(std::move(extent)); + }).safe_then([ec, lo, val=std::move(val)](ExtMapNodeRef extent) mutable { + return extent->insert(ec, lo, val); + }); +} + +ExtMapInnerNode::rm_lextent_ret +ExtMapInnerNode::rm_lextent(ext_context_t ec, objaddr_t lo, lext_map_val_t val) +{ + auto rm_pt = get_containing_child(lo); + return extmap_load_extent(ec, rm_pt->get_val(), get_meta().depth - 1).safe_then( + [this, ec, rm_pt, lo, val=std::move(val)](auto extent) mutable { + if (extent->at_min_capacity() && get_node_size() > 1) { + return merge_entry(ec, lo, rm_pt, extent); + } else { + return merge_entry_ertr::make_ready_future<ExtMapNodeRef>(std::move(extent)); + } + }).safe_then([ec, lo, val](ExtMapNodeRef extent) mutable { + return extent->rm_lextent(ec, lo, val); + }); +} + +ExtMapInnerNode::split_children_ret +ExtMapInnerNode::make_split_children(ext_context_t ec) +{ + logger().debug("{}: {}", "ExtMapInnerNode", __func__); + return extmap_alloc_2extents<ExtMapInnerNode>(ec, EXTMAP_BLOCK_SIZE) + .safe_then([this] (auto &&ext_pair) { + auto [left, right] = ext_pair; + return split_children_ret( + split_children_ertr::ready_future_marker{}, + std::make_tuple(left, right, split_into(*left, *right))); + }); +} + +ExtMapInnerNode::full_merge_ret +ExtMapInnerNode::make_full_merge(ext_context_t ec, ExtMapNodeRef right) +{ + logger().debug("{}: {}", "ExtMapInnerNode", __func__); + return extmap_alloc_extent<ExtMapInnerNode>(ec, EXTMAP_BLOCK_SIZE) + .safe_then([this, right] (auto &&replacement) { + replacement->merge_from(*this, *right->cast<ExtMapInnerNode>()); + return full_merge_ret( + full_merge_ertr::ready_future_marker{}, + std::move(replacement)); + }); +} + +ExtMapInnerNode::make_balanced_ret +ExtMapInnerNode::make_balanced(ext_context_t ec, ExtMapNodeRef _right, bool prefer_left) +{ + logger().debug("{}: {}", "ExtMapInnerNode", __func__); + ceph_assert(_right->get_type() == type); + return extmap_alloc_2extents<ExtMapInnerNode>(ec, EXTMAP_BLOCK_SIZE) + .safe_then([this, _right, prefer_left] (auto &&replacement_pair){ + auto [replacement_left, replacement_right] = replacement_pair; + auto &right = *_right->cast<ExtMapInnerNode>(); + return make_balanced_ret( + make_balanced_ertr::ready_future_marker{}, + std::make_tuple(replacement_left, replacement_right, + balance_into_new_nodes(*this, right, prefer_left, + *replacement_left, *replacement_right))); + }); +} + +ExtMapInnerNode::split_entry_ret +ExtMapInnerNode::split_entry(ext_context_t ec, objaddr_t lo, + internal_iterator_t iter, ExtMapNodeRef entry) +{ + logger().debug("{}: {}", "ExtMapInnerNode", __func__); + if (!is_pending()) { + auto mut = ec.tm.get_mutable_extent(ec.t, this)->cast<ExtMapInnerNode>(); + auto mut_iter = mut->iter_idx(iter->get_offset()); + return mut->split_entry(ec, lo, mut_iter, entry); + } + ceph_assert(!at_max_capacity()); + return entry->make_split_children(ec) + .safe_then([this, ec, lo, iter, entry] (auto tuple){ + auto [left, right, pivot] = tuple; + journal_update(iter, left->get_laddr(), maybe_get_delta_buffer()); + journal_insert(iter + 1, pivot, right->get_laddr(), maybe_get_delta_buffer()); + logger().debug( + "ExtMapInnerNode::split_entry *this {} entry {} into left {} right {}", + *this, *entry, *left, *right); + //retire extent + return ec.tm.dec_ref(ec.t, entry->get_laddr()) + .safe_then([lo, left = left, right = right, pivot = pivot] (auto ret) { + return split_entry_ertr::make_ready_future<ExtMapNodeRef>( + pivot > lo ? 
left : right); + }); + }); +} + +ExtMapInnerNode::merge_entry_ret +ExtMapInnerNode::merge_entry(ext_context_t ec, objaddr_t lo, + internal_iterator_t iter, ExtMapNodeRef entry) +{ + if (!is_pending()) { + auto mut = ec.tm.get_mutable_extent(ec.t, this)->cast<ExtMapInnerNode>(); + auto mut_iter = mut->iter_idx(iter->get_offset()); + return mut->merge_entry(ec, lo, mut_iter, entry); + } + logger().debug("ExtMapInnerNode: merge_entry: {}, {}", *this, *entry); + auto is_left = (iter + 1) == end(); + auto donor_iter = is_left ? iter - 1 : iter + 1; + return extmap_load_extent(ec, donor_iter->get_val(), get_meta().depth - 1) + .safe_then([this, ec, lo, iter, entry, donor_iter, is_left] + (auto &&donor) mutable { + auto [l, r] = is_left ? + std::make_pair(donor, entry) : std::make_pair(entry, donor); + auto [liter, riter] = is_left ? + std::make_pair(donor_iter, iter) : std::make_pair(iter, donor_iter); + if (donor->at_min_capacity()) { + return l->make_full_merge(ec, r) + .safe_then([this, ec, entry, l = l, r = r, liter = liter, riter = riter] + (auto &&replacement){ + journal_update(liter, replacement->get_laddr(), maybe_get_delta_buffer()); + journal_remove(riter, maybe_get_delta_buffer()); + //retire extent + std::list<laddr_t> dec_laddrs; + dec_laddrs.push_back(l->get_laddr()); + dec_laddrs.push_back(r->get_laddr()); + return extmap_retire_node(ec, dec_laddrs) + .safe_then([replacement] (auto &&ret) { + return merge_entry_ertr::make_ready_future<ExtMapNodeRef>(replacement); + }); + }); + } else { + logger().debug("ExtMapInnerNode::merge_entry balanced l {} r {}", + *l, *r); + return l->make_balanced(ec, r, !is_left) + .safe_then([this, ec, lo, entry, l = l, r = r, liter = liter, riter = riter] + (auto tuple) { + auto [replacement_l, replacement_r, pivot] = tuple; + journal_update(liter, replacement_l->get_laddr(), maybe_get_delta_buffer()); + journal_replace(riter, pivot, replacement_r->get_laddr(), + maybe_get_delta_buffer()); + // retire extent + std::list<laddr_t> dec_laddrs; + dec_laddrs.push_back(l->get_laddr()); + dec_laddrs.push_back(r->get_laddr()); + return extmap_retire_node(ec, dec_laddrs) + .safe_then([lo, pivot = pivot, replacement_l = replacement_l, replacement_r = replacement_r] + (auto &&ret) { + return merge_entry_ertr::make_ready_future<ExtMapNodeRef>( + lo >= pivot ? 
replacement_r : replacement_l); + }); + }); + } + }); +} + + +ExtMapInnerNode::internal_iterator_t +ExtMapInnerNode::get_containing_child(objaddr_t lo) +{ + // TODO: binary search + for (auto i = begin(); i != end(); ++i) { + if (i.contains(lo)) + return i; + } + ceph_assert(0 == "invalid"); + return end(); +} + +std::ostream &ExtMapLeafNode::print_detail_l(std::ostream &out) const +{ + return out << ", size=" << get_size() + << ", depth=" << get_meta().depth; +} + +ExtMapLeafNode::find_lextent_ret +ExtMapLeafNode::find_lextent(ext_context_t ec, objaddr_t lo, extent_len_t len) +{ + logger().debug( + "ExtMapLeafNode::find_lextent {}~{}", lo, len); + auto ret = extent_map_list_t(); + auto [from, to] = get_leaf_entries(lo, len); + if (from == to && to != end()) + ++to; + for (; from != to; ++from) { + auto val = (*from).get_val(); + ret.emplace_back( + extent_mapping_t( + (*from).get_key(), + val.laddr, + val.length)); + logger().debug("ExtMapLeafNode::find_lextent find {}~{}", lo, val.laddr); + } + return find_lextent_ertr::make_ready_future<extent_map_list_t>( + std::move(ret)); +} + +ExtMapLeafNode::insert_ret +ExtMapLeafNode::insert(ext_context_t ec, objaddr_t lo, lext_map_val_t val) +{ + ceph_assert(!at_max_capacity()); + if (!is_pending()) { + auto mut = ec.tm.get_mutable_extent(ec.t, this)->cast<ExtMapLeafNode>(); + return mut->insert(ec, lo, val); + } + auto insert_pt = lower_bound(lo); + journal_insert(insert_pt, lo, val, maybe_get_delta_buffer()); + + logger().debug( + "ExtMapLeafNode::insert: inserted {}->{} {}", + insert_pt.get_key(), + insert_pt.get_val().laddr, + insert_pt.get_val().length); + return insert_ertr::make_ready_future<extent_mapping_t>( + extent_mapping_t(lo, val.laddr, val.length)); +} + +ExtMapLeafNode::rm_lextent_ret +ExtMapLeafNode::rm_lextent(ext_context_t ec, objaddr_t lo, lext_map_val_t val) +{ + if (!is_pending()) { + auto mut = ec.tm.get_mutable_extent(ec.t, this)->cast<ExtMapLeafNode>(); + return mut->rm_lextent(ec, lo, val); + } + + auto [rm_pt, rm_end] = get_leaf_entries(lo, val.length); + if (lo == rm_pt->get_key() && val.laddr == rm_pt->get_val().laddr + && val.length == rm_pt->get_val().length) { + journal_remove(rm_pt, maybe_get_delta_buffer()); + logger().debug( + "ExtMapLeafNode::rm_lextent: removed {}->{} {}", + rm_pt.get_key(), + rm_pt.get_val().laddr, + rm_pt.get_val().length); + return rm_lextent_ertr::make_ready_future<bool>(true); + } else { + return rm_lextent_ertr::make_ready_future<bool>(false); + } +} + +ExtMapLeafNode::split_children_ret +ExtMapLeafNode::make_split_children(ext_context_t ec) +{ + logger().debug("{}: {}", "ExtMapLeafNode", __func__); + return extmap_alloc_2extents<ExtMapLeafNode>(ec, EXTMAP_BLOCK_SIZE) + .safe_then([this] (auto &&ext_pair) { + auto [left, right] = ext_pair; + return split_children_ret( + split_children_ertr::ready_future_marker{}, + std::make_tuple(left, right, split_into(*left, *right))); + }); +} + +ExtMapLeafNode::full_merge_ret +ExtMapLeafNode::make_full_merge(ext_context_t ec, ExtMapNodeRef right) +{ + logger().debug("{}: {}", "ExtMapLeafNode", __func__); + return extmap_alloc_extent<ExtMapLeafNode>(ec, EXTMAP_BLOCK_SIZE) + .safe_then([this, right] (auto &&replacement) { + replacement->merge_from(*this, *right->cast<ExtMapLeafNode>()); + return full_merge_ret( + full_merge_ertr::ready_future_marker{}, + std::move(replacement)); + }); +} +ExtMapLeafNode::make_balanced_ret +ExtMapLeafNode::make_balanced(ext_context_t ec, ExtMapNodeRef _right, bool prefer_left) +{ + logger().debug("{}: {}", 
"ExtMapLeafNode", __func__); + ceph_assert(_right->get_type() == type); + return extmap_alloc_2extents<ExtMapLeafNode>(ec, EXTMAP_BLOCK_SIZE) + .safe_then([this, _right, prefer_left] (auto &&replacement_pair) { + auto [replacement_left, replacement_right] = replacement_pair; + auto &right = *_right->cast<ExtMapLeafNode>(); + return make_balanced_ret( + make_balanced_ertr::ready_future_marker{}, + std::make_tuple( + replacement_left, replacement_right, + balance_into_new_nodes( + *this, right, prefer_left, + *replacement_left, *replacement_right))); + }); +} + + +std::pair<ExtMapLeafNode::internal_iterator_t, ExtMapLeafNode::internal_iterator_t> +ExtMapLeafNode::get_leaf_entries(objaddr_t addr, extent_len_t len) +{ + return bound(addr, addr + len); +} + + +TransactionManager::read_extent_ertr::future<ExtMapNodeRef> +extmap_load_extent(ext_context_t ec, laddr_t laddr, depth_t depth) +{ + ceph_assert(depth > 0); + if (depth > 1) { + return ec.tm.read_extents<ExtMapInnerNode>(ec.t, laddr, EXTMAP_BLOCK_SIZE).safe_then( + [](auto&& extents) { + assert(extents.size() == 1); + [[maybe_unused]] auto [laddr, e] = extents.front(); + return TransactionManager::read_extent_ertr::make_ready_future<ExtMapNodeRef>(std::move(e)); + }); + } else { + return ec.tm.read_extents<ExtMapLeafNode>(ec.t, laddr, EXTMAP_BLOCK_SIZE).safe_then( + [](auto&& extents) { + assert(extents.size() == 1); + [[maybe_unused]] auto [laddr, e] = extents.front(); + return TransactionManager::read_extent_ertr::make_ready_future<ExtMapNodeRef>(std::move(e)); + }); + } +} + +} diff --git a/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.h b/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.h new file mode 100644 index 000000000..f5da8cdc2 --- /dev/null +++ b/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.h @@ -0,0 +1,281 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once +#include "include/buffer.h" + +#include "crimson/common/fixed_kv_node_layout.h" +#include "crimson/common/errorator.h" +#include "crimson/os/seastore/extentmap_manager.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node.h" + +namespace crimson::os::seastore::extentmap_manager { + +struct extmap_node_meta_le_t { + depth_le_t depth = init_les32(0); + + extmap_node_meta_le_t() = default; + extmap_node_meta_le_t(const extmap_node_meta_le_t &) = default; + explicit extmap_node_meta_le_t(const extmap_node_meta_t &val) + : depth(init_les32(val.depth)) {} + + operator extmap_node_meta_t() const { + return extmap_node_meta_t{ depth }; + } +}; + +/** + * ExtMapInnerNode + * + * Abstracts operations on and layout of internal nodes for the + * Extentmap Tree. + * + * Layout (4k): + * num_entries: uint32_t 4b + * meta : depth 4b + * (padding) : 8b + * keys : objaddr_t[340] (340*4)b + * values : laddr_t[340] (340*8)b + * = 4096 + */ +constexpr size_t INNER_NODE_CAPACITY = + (EXTMAP_BLOCK_SIZE - sizeof(uint32_t) - sizeof(extmap_node_meta_t)) + / (sizeof (objaddr_t) + sizeof(laddr_t)); + +struct ExtMapInnerNode + : ExtMapNode, + common::FixedKVNodeLayout< + INNER_NODE_CAPACITY, + extmap_node_meta_t, extmap_node_meta_le_t, + objaddr_t, ceph_le32, + laddr_t, laddr_le_t> { + using internal_iterator_t = const_iterator; + template <typename... T> + ExtMapInnerNode(T&&... 
t) : + ExtMapNode(std::forward<T>(t)...), + FixedKVNodeLayout(get_bptr().c_str()) {} + + static constexpr extent_types_t type = extent_types_t::EXTMAP_INNER; + + extmap_node_meta_t get_node_meta() const final {return get_meta();} + + CachedExtentRef duplicate_for_write() final { + assert(delta_buffer.empty()); + return CachedExtentRef(new ExtMapInnerNode(*this)); + }; + + delta_buffer_t delta_buffer; + delta_buffer_t *maybe_get_delta_buffer() { + return is_mutation_pending() ? &delta_buffer : nullptr; + } + + find_lextent_ret find_lextent(ext_context_t ec, objaddr_t lo, extent_len_t len) final; + + insert_ret insert(ext_context_t ec, objaddr_t lo, lext_map_val_t val) final; + + rm_lextent_ret rm_lextent(ext_context_t ec, objaddr_t lo, lext_map_val_t val) final; + + split_children_ret make_split_children(ext_context_t ec) final; + + full_merge_ret make_full_merge(ext_context_t ec, ExtMapNodeRef right) final; + + make_balanced_ret make_balanced(ext_context_t ec, ExtMapNodeRef _right, bool prefer_left) final; + + std::ostream &print_detail_l(std::ostream &out) const final; + + extent_types_t get_type() const final { + return type; + } + + ceph::bufferlist get_delta() final { + assert(!delta_buffer.empty()); + ceph::buffer::ptr bptr(delta_buffer.get_bytes()); + delta_buffer.copy_out(bptr.c_str(), bptr.length()); + ceph::bufferlist bl; + bl.push_back(bptr); + return bl; + } + + void apply_delta(const ceph::bufferlist &_bl) final { + assert(_bl.length()); + ceph::bufferlist bl = _bl; + bl.rebuild(); + delta_buffer_t buffer; + buffer.copy_in(bl.front().c_str(), bl.front().length()); + buffer.replay(*this); + } + + bool at_max_capacity() const final { + return get_size() == get_capacity(); + } + + bool at_min_capacity() const { + return get_size() == get_capacity() / 2; + } + + unsigned get_node_size() const { + return get_size(); + } + + /* get the iterator containing [l, r] + */ + std::pair<internal_iterator_t, internal_iterator_t> bound( + objaddr_t l, objaddr_t r) { + auto retl = begin(); + for (; retl != end(); ++retl) { + if (retl->get_next_key_or_max() > l) + break; + } + auto retr = retl; + for (; retr != end(); ++retr) { + if (retr->get_key() >= r) + break; + } + return {retl, retr}; + } + + using split_entry_ertr = TransactionManager::read_extent_ertr; + using split_entry_ret = split_entry_ertr::future<ExtMapNodeRef>; + split_entry_ret split_entry(ext_context_t ec, objaddr_t lo, + internal_iterator_t, ExtMapNodeRef entry); + using merge_entry_ertr = TransactionManager::read_extent_ertr; + using merge_entry_ret = merge_entry_ertr::future<ExtMapNodeRef>; + merge_entry_ret merge_entry(ext_context_t ec, objaddr_t lo, + internal_iterator_t iter, ExtMapNodeRef entry); + internal_iterator_t get_containing_child(objaddr_t lo); + +}; + +/** + * ExtMapLeafNode + * + * Abstracts operations on and layout of leaf nodes for the + * ExtentMap Tree. 
+ * + * Layout (4k): + * num_entries: uint32_t 4b + * meta : depth 4b + * (padding) : 8b + * keys : objaddr_t[204] (204*4)b + * values : lext_map_val_t[204] (204*16)b + * = 4096 + */ +constexpr size_t LEAF_NODE_CAPACITY = + (EXTMAP_BLOCK_SIZE - sizeof(uint32_t) - sizeof(extmap_node_meta_t)) + / (sizeof(objaddr_t) + sizeof(lext_map_val_t)); + +struct lext_map_val_le_t { + laddr_le_t laddr; + extent_len_le_t length = init_extent_len_le_t(0); + + lext_map_val_le_t() = default; + lext_map_val_le_t(const lext_map_val_le_t &) = default; + explicit lext_map_val_le_t(const lext_map_val_t &val) + : laddr(laddr_le_t(val.laddr)), + length(init_extent_len_le_t(val.length)) {} + + operator lext_map_val_t() const { + return lext_map_val_t{laddr, length}; + } +}; + +struct ExtMapLeafNode + : ExtMapNode, + common::FixedKVNodeLayout< + LEAF_NODE_CAPACITY, + extmap_node_meta_t, extmap_node_meta_le_t, + objaddr_t, ceph_le32, + lext_map_val_t, lext_map_val_le_t> { + using internal_iterator_t = const_iterator; + template <typename... T> + ExtMapLeafNode(T&&... t) : + ExtMapNode(std::forward<T>(t)...), + FixedKVNodeLayout(get_bptr().c_str()) {} + + static constexpr extent_types_t type = extent_types_t::EXTMAP_LEAF; + + extmap_node_meta_t get_node_meta() const final { return get_meta(); } + + CachedExtentRef duplicate_for_write() final { + assert(delta_buffer.empty()); + return CachedExtentRef(new ExtMapLeafNode(*this)); + }; + + delta_buffer_t delta_buffer; + delta_buffer_t *maybe_get_delta_buffer() { + return is_mutation_pending() ? &delta_buffer : nullptr; + } + + find_lextent_ret find_lextent(ext_context_t ec, objaddr_t lo, extent_len_t len) final; + + insert_ret insert(ext_context_t ec, objaddr_t lo, lext_map_val_t val) final; + + rm_lextent_ret rm_lextent(ext_context_t ec, objaddr_t lo, lext_map_val_t val) final; + + split_children_ret make_split_children(ext_context_t ec) final; + + full_merge_ret make_full_merge(ext_context_t ec, ExtMapNodeRef right) final; + + make_balanced_ret make_balanced(ext_context_t ec, ExtMapNodeRef _right, bool prefer_left) final; + + extent_types_t get_type() const final { + return type; + } + + ceph::bufferlist get_delta() final { + assert(!delta_buffer.empty()); + ceph::buffer::ptr bptr(delta_buffer.get_bytes()); + delta_buffer.copy_out(bptr.c_str(), bptr.length()); + ceph::bufferlist bl; + bl.push_back(bptr); + return bl; + } + + void apply_delta(const ceph::bufferlist &_bl) final { + assert(_bl.length()); + ceph::bufferlist bl = _bl; + bl.rebuild(); + delta_buffer_t buffer; + buffer.copy_in(bl.front().c_str(), bl.front().length()); + buffer.replay(*this); + } + + std::ostream &print_detail_l(std::ostream &out) const final; + + bool at_max_capacity() const final { + return get_size() == get_capacity(); + } + + bool at_min_capacity() const final { + return get_size() == get_capacity() / 2; + } + + unsigned get_node_size() const { + return get_size(); + } + + /* get the iterator containing [l, r] + */ + std::pair<internal_iterator_t, internal_iterator_t> bound( + objaddr_t l, objaddr_t r) { + auto retl = begin(); + for (; retl != end(); ++retl) { + if (retl->get_key() >= l || (retl->get_key() + retl->get_val().length) > l) + break; + } + auto retr = retl; + for (; retr != end(); ++retr) { + if (retr->get_key() >= r) + break; + } + return {retl, retr}; + } + + std::pair<internal_iterator_t, internal_iterator_t> + get_leaf_entries(objaddr_t lo, extent_len_t len); + +}; +using ExtentMapLeafNodeRef = TCachedExtentRef<ExtMapLeafNode>; + +} diff --git 
a/src/crimson/os/seastore/journal.cc b/src/crimson/os/seastore/journal.cc new file mode 100644 index 000000000..39875fb56 --- /dev/null +++ b/src/crimson/os/seastore/journal.cc @@ -0,0 +1,756 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <iostream> + +#include <boost/iterator/counting_iterator.hpp> + +#include "crimson/os/seastore/journal.h" + +#include "include/intarith.h" +#include "crimson/os/seastore/segment_manager.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + +namespace crimson::os::seastore { + +std::ostream &operator<<(std::ostream &out, const segment_header_t &header) +{ + return out << "segment_header_t(" + << "segment_seq=" << header.journal_segment_seq + << ", physical_segment_id=" << header.physical_segment_id + << ", journal_tail=" << header.journal_tail + << ", segment_nonce=" << header.segment_nonce + << ")"; +} + +segment_nonce_t generate_nonce( + segment_seq_t seq, + const seastore_meta_t &meta) +{ + return ceph_crc32c( + seq, + reinterpret_cast<const unsigned char *>(meta.seastore_id.bytes()), + sizeof(meta.seastore_id.uuid)); +} + +Journal::Journal(SegmentManager &segment_manager) + : block_size(segment_manager.get_block_size()), + max_record_length( + segment_manager.get_segment_size() - + p2align(ceph::encoded_sizeof_bounded<segment_header_t>(), + size_t(block_size))), + segment_manager(segment_manager) {} + + +Journal::initialize_segment_ertr::future<segment_seq_t> +Journal::initialize_segment(Segment &segment) +{ + auto new_tail = segment_provider->get_journal_tail_target(); + logger().debug( + "initialize_segment {} journal_tail_target {}", + segment.get_segment_id(), + new_tail); + // write out header + ceph_assert(segment.get_write_ptr() == 0); + bufferlist bl; + + segment_seq_t seq = next_journal_segment_seq++; + current_segment_nonce = generate_nonce( + seq, segment_manager.get_meta()); + auto header = segment_header_t{ + seq, + segment.get_segment_id(), + segment_provider->get_journal_tail_target(), + current_segment_nonce}; + encode(header, bl); + + bufferptr bp( + ceph::buffer::create_page_aligned( + segment_manager.get_block_size())); + bp.zero(); + auto iter = bl.cbegin(); + iter.copy(bl.length(), bp.c_str()); + bl.clear(); + bl.append(bp); + + written_to = segment_manager.get_block_size(); + committed_to = 0; + return segment.write(0, bl).safe_then( + [=] { + segment_provider->update_journal_tail_committed(new_tail); + return seq; + }, + initialize_segment_ertr::pass_further{}, + crimson::ct_error::assert_all{ "TODO" }); +} + +ceph::bufferlist Journal::encode_record( + record_size_t rsize, + record_t &&record) +{ + bufferlist data_bl; + for (auto &i: record.extents) { + data_bl.append(i.bl); + } + + bufferlist bl; + record_header_t header{ + rsize.mdlength, + rsize.dlength, + (uint32_t)record.deltas.size(), + (uint32_t)record.extents.size(), + current_segment_nonce, + committed_to, + data_bl.crc32c(-1) + }; + encode(header, bl); + + auto metadata_crc_filler = bl.append_hole(sizeof(uint32_t)); + + for (const auto &i: record.extents) { + encode(extent_info_t(i), bl); + } + for (const auto &i: record.deltas) { + encode(i, bl); + } + if (bl.length() % block_size != 0) { + bl.append_zero( + block_size - (bl.length() % block_size)); + } + ceph_assert(bl.length() == rsize.mdlength); + + + auto bliter = bl.cbegin(); + auto metadata_crc = bliter.crc32c( + ceph::encoded_sizeof_bounded<record_header_t>(), + -1); + bliter += 
sizeof(checksum_t); /* crc hole again */ + metadata_crc = bliter.crc32c( + bliter.get_remaining(), + metadata_crc); + ceph_le32 metadata_crc_le; + metadata_crc_le = metadata_crc; + metadata_crc_filler.copy_in( + sizeof(checksum_t), + reinterpret_cast<const char *>(&metadata_crc_le)); + + bl.claim_append(data_bl); + ceph_assert(bl.length() == (rsize.dlength + rsize.mdlength)); + + return bl; +} + +bool Journal::validate_metadata(const bufferlist &bl) +{ + auto bliter = bl.cbegin(); + auto test_crc = bliter.crc32c( + ceph::encoded_sizeof_bounded<record_header_t>(), + -1); + ceph_le32 recorded_crc_le; + ::decode(recorded_crc_le, bliter); + uint32_t recorded_crc = recorded_crc_le; + test_crc = bliter.crc32c( + bliter.get_remaining(), + test_crc); + return test_crc == recorded_crc; +} + +Journal::read_validate_data_ret Journal::read_validate_data( + paddr_t record_base, + const record_header_t &header) +{ + return segment_manager.read( + record_base.add_offset(header.mdlength), + header.dlength + ).safe_then([=, &header](auto bptr) { + bufferlist bl; + bl.append(bptr); + return bl.crc32c(-1) == header.data_crc; + }); +} + +Journal::write_record_ret Journal::write_record( + record_size_t rsize, + record_t &&record) +{ + ceph::bufferlist to_write = encode_record( + rsize, std::move(record)); + auto target = written_to; + assert((to_write.length() % block_size) == 0); + written_to += to_write.length(); + logger().debug( + "write_record, mdlength {}, dlength {}, target {}", + rsize.mdlength, + rsize.dlength, + target); + return current_journal_segment->write(target, to_write).handle_error( + write_record_ertr::pass_further{}, + crimson::ct_error::assert_all{ "TODO" }).safe_then([this, target] { + committed_to = target; + return write_record_ret( + write_record_ertr::ready_future_marker{}, + paddr_t{ + current_journal_segment->get_segment_id(), + target}); + }); +} + +Journal::record_size_t Journal::get_encoded_record_length( + const record_t &record) const { + extent_len_t metadata = + (extent_len_t)ceph::encoded_sizeof_bounded<record_header_t>(); + metadata += sizeof(checksum_t) /* crc */; + metadata += record.extents.size() * + ceph::encoded_sizeof_bounded<extent_info_t>(); + extent_len_t data = 0; + for (const auto &i: record.deltas) { + metadata += ceph::encoded_sizeof(i); + } + for (const auto &i: record.extents) { + data += i.bl.length(); + } + metadata = p2roundup(metadata, block_size); + return record_size_t{metadata, data}; +} + +bool Journal::needs_roll(segment_off_t length) const +{ + return length + written_to > + current_journal_segment->get_write_capacity(); +} + +Journal::roll_journal_segment_ertr::future<segment_seq_t> +Journal::roll_journal_segment() +{ + auto old_segment_id = current_journal_segment ? + current_journal_segment->get_segment_id() : + NULL_SEG_ID; + + return (current_journal_segment ? 
+ current_journal_segment->close() : + Segment::close_ertr::now()).safe_then([this] { + return segment_provider->get_segment(); + }).safe_then([this](auto segment) { + return segment_manager.open(segment); + }).safe_then([this](auto sref) { + current_journal_segment = sref; + written_to = 0; + return initialize_segment(*current_journal_segment); + }).safe_then([=](auto seq) { + if (old_segment_id != NULL_SEG_ID) { + segment_provider->close_segment(old_segment_id); + } + segment_provider->set_journal_segment( + current_journal_segment->get_segment_id(), + seq); + return seq; + }).handle_error( + roll_journal_segment_ertr::pass_further{}, + crimson::ct_error::all_same_way([] { ceph_assert(0 == "TODO"); }) + ); +} + +Journal::read_segment_header_ret +Journal::read_segment_header(segment_id_t segment) +{ + return segment_manager.read(paddr_t{segment, 0}, block_size + ).handle_error( + read_segment_header_ertr::pass_further{}, + crimson::ct_error::assert_all{} + ).safe_then([=](bufferptr bptr) -> read_segment_header_ret { + logger().debug("segment {} bptr size {}", segment, bptr.length()); + + segment_header_t header; + bufferlist bl; + bl.push_back(bptr); + + logger().debug( + "Journal::read_segment_header: segment {} block crc {}", + segment, + bl.begin().crc32c(block_size, 0)); + + auto bp = bl.cbegin(); + try { + decode(header, bp); + } catch (ceph::buffer::error &e) { + logger().debug( + "Journal::read_segment_header: segment {} unable to decode " + "header, skipping", + segment); + return crimson::ct_error::enodata::make(); + } + logger().debug( + "Journal::read_segment_header: segment {} header {}", + segment, + header); + return read_segment_header_ret( + read_segment_header_ertr::ready_future_marker{}, + header); + }); +} + +Journal::open_for_write_ret Journal::open_for_write() +{ + return roll_journal_segment().safe_then([this](auto seq) { + return open_for_write_ret( + open_for_write_ertr::ready_future_marker{}, + journal_seq_t{ + seq, + paddr_t{ + current_journal_segment->get_segment_id(), + static_cast<segment_off_t>(block_size)} + }); + }); +} + +Journal::find_replay_segments_fut Journal::find_replay_segments() +{ + return seastar::do_with( + std::vector<std::pair<segment_id_t, segment_header_t>>(), + [this](auto &&segments) mutable { + return crimson::do_for_each( + boost::make_counting_iterator(segment_id_t{0}), + boost::make_counting_iterator(segment_manager.get_num_segments()), + [this, &segments](auto i) { + return read_segment_header(i + ).safe_then([this, &segments, i](auto header) mutable { + if (generate_nonce( + header.journal_segment_seq, + segment_manager.get_meta()) != header.segment_nonce) { + logger().debug( + "find_replay_segments: nonce mismatch segment {} header {}", + i, + header); + assert(0 == "impossible"); + return find_replay_segments_ertr::now(); + } + + segments.emplace_back(i, std::move(header)); + return find_replay_segments_ertr::now(); + }).handle_error( + crimson::ct_error::enoent::handle([i](auto) { + logger().debug( + "find_replay_segments: segment {} not available for read", + i); + return find_replay_segments_ertr::now(); + }), + crimson::ct_error::enodata::handle([i](auto) { + logger().debug( + "find_replay_segments: segment {} header undecodable", + i); + return find_replay_segments_ertr::now(); + }), + find_replay_segments_ertr::pass_further{}, + crimson::ct_error::assert_all{} + ); + }).safe_then([this, &segments]() mutable -> find_replay_segments_fut { + logger().debug( + "find_replay_segments: have {} segments", + segments.size()); + if 
(segments.empty()) { + return crimson::ct_error::input_output_error::make(); + } + std::sort( + segments.begin(), + segments.end(), + [](const auto <, const auto &rt) { + return lt.second.journal_segment_seq < + rt.second.journal_segment_seq; + }); + + next_journal_segment_seq = + segments.rbegin()->second.journal_segment_seq + 1; + std::for_each( + segments.begin(), + segments.end(), + [this](auto &seg) { + segment_provider->init_mark_segment_closed( + seg.first, + seg.second.journal_segment_seq); + }); + + auto journal_tail = segments.rbegin()->second.journal_tail; + segment_provider->update_journal_tail_committed(journal_tail); + auto replay_from = journal_tail.offset; + logger().debug( + "Journal::find_replay_segments: journal_tail={}", + journal_tail); + auto from = segments.begin(); + if (replay_from != P_ADDR_NULL) { + from = std::find_if( + segments.begin(), + segments.end(), + [&replay_from](const auto &seg) -> bool { + return seg.first == replay_from.segment; + }); + if (from->second.journal_segment_seq != journal_tail.segment_seq) { + logger().error( + "find_replay_segments: journal_tail {} does not match {}", + journal_tail, + from->second); + assert(0 == "invalid"); + } + } else { + replay_from = paddr_t{from->first, (segment_off_t)block_size}; + } + auto ret = replay_segments_t(segments.end() - from); + std::transform( + from, segments.end(), ret.begin(), + [this](const auto &p) { + auto ret = journal_seq_t{ + p.second.journal_segment_seq, + paddr_t{p.first, (segment_off_t)block_size}}; + logger().debug( + "Journal::find_replay_segments: replaying from {}", + ret); + return std::make_pair(ret, p.second); + }); + ret[0].first.offset = replay_from; + return find_replay_segments_fut( + find_replay_segments_ertr::ready_future_marker{}, + std::move(ret)); + }); + }); +} + +Journal::read_validate_record_metadata_ret Journal::read_validate_record_metadata( + paddr_t start, + segment_nonce_t nonce) +{ + if (start.offset + block_size > (int64_t)segment_manager.get_segment_size()) { + return read_validate_record_metadata_ret( + read_validate_record_metadata_ertr::ready_future_marker{}, + std::nullopt); + } + return segment_manager.read(start, block_size + ).safe_then( + [=](bufferptr bptr) mutable + -> read_validate_record_metadata_ret { + logger().debug("read_validate_record_metadata: reading {}", start); + bufferlist bl; + bl.append(bptr); + auto bp = bl.cbegin(); + record_header_t header; + try { + decode(header, bp); + } catch (ceph::buffer::error &e) { + return read_validate_record_metadata_ret( + read_validate_record_metadata_ertr::ready_future_marker{}, + std::nullopt); + } + if (header.segment_nonce != nonce) { + return read_validate_record_metadata_ret( + read_validate_record_metadata_ertr::ready_future_marker{}, + std::nullopt); + } + if (header.mdlength > block_size) { + if (start.offset + header.mdlength > + (int64_t)segment_manager.get_segment_size()) { + return crimson::ct_error::input_output_error::make(); + } + return segment_manager.read( + {start.segment, start.offset + (segment_off_t)block_size}, + header.mdlength - block_size).safe_then( + [header=std::move(header), bl=std::move(bl)]( + auto &&bptail) mutable { + bl.push_back(bptail); + return read_validate_record_metadata_ret( + read_validate_record_metadata_ertr::ready_future_marker{}, + std::make_pair(std::move(header), std::move(bl))); + }); + } else { + return read_validate_record_metadata_ret( + read_validate_record_metadata_ertr::ready_future_marker{}, + std::make_pair(std::move(header), std::move(bl)) + ); + 
} + }).safe_then([=](auto p) { + if (p && validate_metadata(p->second)) { + return read_validate_record_metadata_ret( + read_validate_record_metadata_ertr::ready_future_marker{}, + std::move(*p) + ); + } else { + return read_validate_record_metadata_ret( + read_validate_record_metadata_ertr::ready_future_marker{}, + std::nullopt); + } + }); +} + +std::optional<std::vector<delta_info_t>> Journal::try_decode_deltas( + record_header_t header, + const bufferlist &bl) +{ + auto bliter = bl.cbegin(); + bliter += ceph::encoded_sizeof_bounded<record_header_t>(); + bliter += sizeof(checksum_t) /* crc */; + bliter += header.extents * ceph::encoded_sizeof_bounded<extent_info_t>(); + logger().debug("{}: decoding {} deltas", __func__, header.deltas); + std::vector<delta_info_t> deltas(header.deltas); + for (auto &&i : deltas) { + try { + decode(i, bliter); + } catch (ceph::buffer::error &e) { + return std::nullopt; + } + } + return deltas; +} + +std::optional<std::vector<extent_info_t>> Journal::try_decode_extent_infos( + record_header_t header, + const bufferlist &bl) +{ + auto bliter = bl.cbegin(); + bliter += ceph::encoded_sizeof_bounded<record_header_t>(); + bliter += sizeof(checksum_t) /* crc */; + logger().debug("{}: decoding {} extents", __func__, header.extents); + std::vector<extent_info_t> extent_infos(header.extents); + for (auto &&i : extent_infos) { + try { + decode(i, bliter); + } catch (ceph::buffer::error &e) { + return std::nullopt; + } + } + return extent_infos; +} + +Journal::replay_ertr::future<> +Journal::replay_segment( + journal_seq_t seq, + segment_header_t header, + delta_handler_t &handler) +{ + logger().debug("replay_segment: starting at {}", seq); + return seastar::do_with( + scan_valid_records_cursor(seq.offset), + found_record_handler_t( + [=, &handler](paddr_t base, + const record_header_t &header, + const bufferlist &mdbuf) { + auto deltas = try_decode_deltas( + header, + mdbuf); + if (!deltas) { + // This should be impossible, we did check the crc on the mdbuf + logger().error( + "Journal::replay_segment unable to decode deltas for record {}", + base); + assert(deltas); + } + + return seastar::do_with( + std::move(*deltas), + [=](auto &deltas) { + return crimson::do_for_each( + deltas, + [=](auto &delta) { + /* The journal may validly contain deltas for extents in + * since released segments. We can detect those cases by + * checking whether the segment in question currently has a + * sequence number > the current journal segment seq. We can + * safetly skip these deltas because the extent must already + * have been rewritten. + * + * Note, this comparison exploits the fact that + * SEGMENT_SEQ_NULL is a large number. 
+ */ + if (delta.paddr != P_ADDR_NULL && + (segment_provider->get_seq(delta.paddr.segment) > + seq.segment_seq)) { + return replay_ertr::now(); + } else { + return handler( + journal_seq_t{seq.segment_seq, base}, + base.add_offset(header.mdlength), + delta); + } + }); + }); + }), + [=](auto &cursor, auto &dhandler) { + return scan_valid_records( + cursor, + header.segment_nonce, + std::numeric_limits<size_t>::max(), + dhandler).safe_then([](auto){}); + }); +} + +Journal::replay_ret Journal::replay(delta_handler_t &&delta_handler) +{ + return seastar::do_with( + std::move(delta_handler), replay_segments_t(), + [this](auto &handler, auto &segments) mutable -> replay_ret { + return find_replay_segments().safe_then( + [this, &handler, &segments](auto replay_segs) mutable { + logger().debug("replay: found {} segments", replay_segs.size()); + segments = std::move(replay_segs); + return crimson::do_for_each(segments, [this, &handler](auto i) mutable { + return replay_segment(i.first, i.second, handler); + }); + }); + }); +} + +Journal::scan_extents_ret Journal::scan_extents( + scan_extents_cursor &cursor, + extent_len_t bytes_to_read) +{ + auto ret = std::make_unique<scan_extents_ret_bare>(); + auto &retref = *ret; + return read_segment_header(cursor.get_offset().segment + ).handle_error( + scan_extents_ertr::pass_further{}, + crimson::ct_error::assert_all{} + ).safe_then([&](auto segment_header) { + auto segment_nonce = segment_header.segment_nonce; + return seastar::do_with( + found_record_handler_t( + [&]( + paddr_t base, + const record_header_t &header, + const bufferlist &mdbuf) mutable { + + auto infos = try_decode_extent_infos( + header, + mdbuf); + if (!infos) { + // This should be impossible, we did check the crc on the mdbuf + logger().error( + "Journal::scan_extents unable to decode extents for record {}", + base); + assert(infos); + } + + paddr_t extent_offset = base.add_offset(header.mdlength); + for (const auto &i : *infos) { + retref.emplace_back(extent_offset, i); + extent_offset.offset += i.len; + } + return scan_extents_ertr::now(); + }), + [=, &cursor](auto &dhandler) { + return scan_valid_records( + cursor, + segment_nonce, + std::numeric_limits<size_t>::max(), + dhandler).safe_then([](auto){}); + }); + }).safe_then([ret=std::move(ret)] { + return std::move(*ret); + }); +} + +Journal::scan_valid_records_ret Journal::scan_valid_records( + scan_valid_records_cursor &cursor, + segment_nonce_t nonce, + size_t budget, + found_record_handler_t &handler) +{ + if (cursor.offset.offset == 0) { + cursor.offset.offset = block_size; + } + auto retref = std::make_unique<size_t>(0); + auto budget_used = *retref; + return crimson::do_until( + [=, &cursor, &budget_used, &handler]() mutable + -> scan_valid_records_ertr::future<bool> { + return [=, &handler, &cursor, &budget_used] { + if (!cursor.last_valid_header_found) { + return read_validate_record_metadata(cursor.offset, nonce + ).safe_then([=, &cursor](auto md) { + logger().debug( + "Journal::scan_valid_records: read complete {}", + cursor.offset); + if (!md) { + logger().debug( + "Journal::scan_valid_records: found invalid header at {}, presumably at end", + cursor.offset); + cursor.last_valid_header_found = true; + return scan_valid_records_ertr::now(); + } else { + logger().debug( + "Journal::scan_valid_records: valid record read at {}", + cursor.offset); + cursor.last_committed = paddr_t{ + cursor.offset.segment, + md->first.committed_to}; + cursor.pending_records.emplace_back( + cursor.offset, + md->first, + md->second); + 
cursor.offset.offset += + md->first.dlength + md->first.mdlength; + return scan_valid_records_ertr::now(); + } + }).safe_then([=, &cursor, &budget_used, &handler] { + return crimson::do_until( + [=, &budget_used, &cursor, &handler] { + logger().debug( + "Journal::scan_valid_records: valid record read, processing queue"); + if (cursor.pending_records.empty()) { + /* This is only possible if the segment is empty. + * A record's last_commited must be prior to its own + * location since it itself cannot yet have been committed + * at its own time of submission. Thus, the most recently + * read record must always fall after cursor.last_committed */ + return scan_valid_records_ertr::make_ready_future<bool>(true); + } + auto &next = cursor.pending_records.front(); + if (next.offset > cursor.last_committed) { + return scan_valid_records_ertr::make_ready_future<bool>(true); + } + budget_used += + next.header.dlength + next.header.mdlength; + return handler( + next.offset, + next.header, + next.mdbuffer + ).safe_then([&cursor] { + cursor.pending_records.pop_front(); + return scan_valid_records_ertr::make_ready_future<bool>(false); + }); + }); + }); + } else { + assert(!cursor.pending_records.empty()); + auto &next = cursor.pending_records.front(); + return read_validate_data(next.offset, next.header + ).safe_then([=, &budget_used, &next, &cursor, &handler](auto valid) { + if (!valid) { + cursor.pending_records.clear(); + return scan_valid_records_ertr::now(); + } + budget_used += + next.header.dlength + next.header.mdlength; + return handler( + next.offset, + next.header, + next.mdbuffer + ).safe_then([&cursor] { + cursor.pending_records.pop_front(); + return scan_valid_records_ertr::now(); + }); + }); + } + }().safe_then([=, &budget_used, &cursor] { + return scan_valid_records_ertr::make_ready_future<bool>( + cursor.is_complete() || budget_used >= budget); + }); + }).safe_then([retref=std::move(retref)]() mutable -> scan_valid_records_ret { + return scan_valid_records_ret( + scan_valid_records_ertr::ready_future_marker{}, + std::move(*retref)); + }); +} + + +} diff --git a/src/crimson/os/seastore/journal.h b/src/crimson/os/seastore/journal.h new file mode 100644 index 000000000..7424d78b3 --- /dev/null +++ b/src/crimson/os/seastore/journal.h @@ -0,0 +1,405 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/common/log.h" + +#include <boost/intrusive_ptr.hpp> + +#include <seastar/core/future.hh> + +#include "include/ceph_assert.h" +#include "include/buffer.h" +#include "include/denc.h" + +#include "crimson/os/seastore/segment_manager.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/osd/exceptions.h" + +namespace crimson::os::seastore { + +using segment_nonce_t = uint32_t; + + +/** + * Segment header + * + * Every segment contains and encode segment_header_t in the first block. 
+ * Our strategy for finding the journal replay point is: + * 1) Find the segment with the highest journal_segment_seq + * 2) Replay starting at record located at that segment's journal_tail + */ +struct segment_header_t { + segment_seq_t journal_segment_seq; + segment_id_t physical_segment_id; // debugging + + journal_seq_t journal_tail; + segment_nonce_t segment_nonce; + + DENC(segment_header_t, v, p) { + DENC_START(1, 1, p); + denc(v.journal_segment_seq, p); + denc(v.physical_segment_id, p); + denc(v.journal_tail, p); + denc(v.segment_nonce, p); + DENC_FINISH(p); + } +}; +std::ostream &operator<<(std::ostream &out, const segment_header_t &header); + +struct record_header_t { + // Fixed portion + extent_len_t mdlength; // block aligned, length of metadata + extent_len_t dlength; // block aligned, length of data + uint32_t deltas; // number of deltas + uint32_t extents; // number of extents + segment_nonce_t segment_nonce;// nonce of containing segment + segment_off_t committed_to; // records in this segment prior to committed_to + // have been fully written + checksum_t data_crc; // crc of data payload + + + DENC(record_header_t, v, p) { + DENC_START(1, 1, p); + denc(v.mdlength, p); + denc(v.dlength, p); + denc(v.deltas, p); + denc(v.extents, p); + denc(v.segment_nonce, p); + denc(v.committed_to, p); + denc(v.data_crc, p); + DENC_FINISH(p); + } +}; + +struct extent_info_t { + extent_types_t type = extent_types_t::NONE; + laddr_t addr = L_ADDR_NULL; + extent_len_t len = 0; + + extent_info_t() = default; + extent_info_t(const extent_t &et) + : type(et.type), addr(et.addr), len(et.bl.length()) {} + + DENC(extent_info_t, v, p) { + DENC_START(1, 1, p); + denc(v.type, p); + denc(v.addr, p); + denc(v.len, p); + DENC_FINISH(p); + } +}; + +/** + * Callback interface for managing available segments + */ +class JournalSegmentProvider { +public: + using get_segment_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using get_segment_ret = get_segment_ertr::future<segment_id_t>; + virtual get_segment_ret get_segment() = 0; + + virtual void close_segment(segment_id_t) {} + + virtual void set_journal_segment( + segment_id_t segment, + segment_seq_t seq) {} + + virtual journal_seq_t get_journal_tail_target() const = 0; + virtual void update_journal_tail_committed(journal_seq_t tail_committed) = 0; + + virtual void init_mark_segment_closed( + segment_id_t segment, segment_seq_t seq) {} + + virtual segment_seq_t get_seq(segment_id_t id) { return 0; } + + virtual ~JournalSegmentProvider() {} +}; + +/** + * Manages stream of atomically written records to a SegmentManager. + */ +class Journal { +public: + Journal(SegmentManager &segment_manager); + + /** + * Sets the JournalSegmentProvider. + * + * Not provided in constructor to allow the provider to not own + * or construct the Journal (TransactionManager). + * + * Note, Journal does not own this ptr, user must ensure that + * *provider outlives Journal. + */ + void set_segment_provider(JournalSegmentProvider *provider) { + segment_provider = provider; + } + + /** + * initializes journal for new writes -- must run prior to calls + * to submit_record. Should be called after replay if not a new + * Journal. 
+ */ + using open_for_write_ertr = crimson::errorator< + crimson::ct_error::input_output_error + >; + using open_for_write_ret = open_for_write_ertr::future<journal_seq_t>; + open_for_write_ret open_for_write(); + + /** + * close journal + * + * TODO: should probably flush and disallow further writes + */ + using close_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + close_ertr::future<> close() { return close_ertr::now(); } + + /** + * submit_record + * + * @param write record and returns offset of first block and seq + */ + using submit_record_ertr = crimson::errorator< + crimson::ct_error::erange, + crimson::ct_error::input_output_error + >; + using submit_record_ret = submit_record_ertr::future< + std::pair<paddr_t, journal_seq_t> + >; + submit_record_ret submit_record(record_t &&record) { + auto rsize = get_encoded_record_length(record); + auto total = rsize.mdlength + rsize.dlength; + if (total > max_record_length) { + return crimson::ct_error::erange::make(); + } + auto roll = needs_roll(total) + ? roll_journal_segment().safe_then([](auto){}) + : roll_journal_segment_ertr::now(); + return roll.safe_then( + [this, rsize, record=std::move(record)]() mutable { + return write_record(rsize, std::move(record) + ).safe_then([this, rsize](auto addr) { + return std::make_pair( + addr.add_offset(rsize.mdlength), + get_journal_seq(addr)); + }); + }); + } + + /** + * Read deltas and pass to delta_handler + * + * record_block_start (argument to delta_handler) is the start of the + * of the first block in the record + */ + using replay_ertr = SegmentManager::read_ertr; + using replay_ret = replay_ertr::future<>; + using delta_handler_t = std::function< + replay_ret(journal_seq_t seq, + paddr_t record_block_base, + const delta_info_t&)>; + replay_ret replay(delta_handler_t &&delta_handler); + + /** + * scan_extents + * + * Scans records beginning at addr until the first record boundary after + * addr + bytes_to_read. + * + * Returns list<extent, extent_info> + * cursor.is_complete() will be true when no further extents exist in segment. + */ + class scan_valid_records_cursor; + using scan_extents_cursor = scan_valid_records_cursor; + using scan_extents_ertr = SegmentManager::read_ertr; + using scan_extents_ret_bare = std::list<std::pair<paddr_t, extent_info_t>>; + using scan_extents_ret = scan_extents_ertr::future<scan_extents_ret_bare>; + scan_extents_ret scan_extents( + scan_extents_cursor &cursor, + extent_len_t bytes_to_read + ); + + +private: + const extent_len_t block_size; + const extent_len_t max_record_length; + + JournalSegmentProvider *segment_provider = nullptr; + SegmentManager &segment_manager; + + segment_seq_t next_journal_segment_seq = 0; + segment_nonce_t current_segment_nonce = 0; + + SegmentRef current_journal_segment; + segment_off_t written_to = 0; + segment_off_t committed_to = 0; + + journal_seq_t get_journal_seq(paddr_t addr) { + return journal_seq_t{next_journal_segment_seq-1, addr}; + } + + /// prepare segment for writes, writes out segment header + using initialize_segment_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + initialize_segment_ertr::future<segment_seq_t> initialize_segment( + Segment &segment); + + struct record_size_t { + extent_len_t mdlength = 0; + extent_len_t dlength = 0; + + record_size_t( + extent_len_t mdlength, + extent_len_t dlength) + : mdlength(mdlength), dlength(dlength) {} + }; + + /** + * Return <mdlength, dlength> pair denoting length of + * metadata and blocks respectively. 
+ */ + record_size_t get_encoded_record_length( + const record_t &record) const; + + /// create encoded record bl + ceph::bufferlist encode_record( + record_size_t rsize, + record_t &&record); + + /// validate embedded metadata checksum + static bool validate_metadata(const bufferlist &bl); + + /// read and validate data + using read_validate_data_ertr = SegmentManager::read_ertr; + using read_validate_data_ret = read_validate_data_ertr::future<bool>; + read_validate_data_ret read_validate_data( + paddr_t record_base, + const record_header_t &header ///< caller must ensure lifetime through + /// future resolution + ); + + + /// do record write + using write_record_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using write_record_ret = write_record_ertr::future<paddr_t>; + write_record_ret write_record( + record_size_t rsize, + record_t &&record); + + /// close current segment and initialize next one + using roll_journal_segment_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + roll_journal_segment_ertr::future<segment_seq_t> roll_journal_segment(); + + /// returns true iff current segment has insufficient space + bool needs_roll(segment_off_t length) const; + + using read_segment_header_ertr = crimson::errorator< + crimson::ct_error::enoent, + crimson::ct_error::enodata, + crimson::ct_error::input_output_error + >; + using read_segment_header_ret = read_segment_header_ertr::future< + segment_header_t>; + read_segment_header_ret read_segment_header(segment_id_t segment); + + /// return ordered vector of segments to replay + using replay_segments_t = std::vector< + std::pair<journal_seq_t, segment_header_t>>; + using find_replay_segments_ertr = crimson::errorator< + crimson::ct_error::input_output_error + >; + using find_replay_segments_fut = find_replay_segments_ertr::future< + replay_segments_t>; + find_replay_segments_fut find_replay_segments(); + + /// attempts to decode deltas from bl, return nullopt if unsuccessful + std::optional<std::vector<delta_info_t>> try_decode_deltas( + record_header_t header, + const bufferlist &bl); + + /// attempts to decode extent infos from bl, return nullopt if unsuccessful + std::optional<std::vector<extent_info_t>> try_decode_extent_infos( + record_header_t header, + const bufferlist &bl); + + /// read record metadata for record starting at start + using read_validate_record_metadata_ertr = replay_ertr; + using read_validate_record_metadata_ret = + read_validate_record_metadata_ertr::future< + std::optional<std::pair<record_header_t, bufferlist>> + >; + read_validate_record_metadata_ret read_validate_record_metadata( + paddr_t start, + segment_nonce_t nonce); + +public: + /// scan segment for end incrementally + struct scan_valid_records_cursor { + bool last_valid_header_found = false; + paddr_t offset; + paddr_t last_committed; + + struct found_record_t { + paddr_t offset; + record_header_t header; + bufferlist mdbuffer; + + found_record_t( + paddr_t offset, + const record_header_t &header, + const bufferlist &mdbuffer) + : offset(offset), header(header), mdbuffer(mdbuffer) {} + }; + std::deque<found_record_t> pending_records; + + bool is_complete() const { + return last_valid_header_found && pending_records.empty(); + } + + paddr_t get_offset() const { + return offset; + } + + scan_valid_records_cursor( + paddr_t offset) + : offset(offset) {} + }; +private: + + using scan_valid_records_ertr = SegmentManager::read_ertr; + using scan_valid_records_ret = scan_valid_records_ertr::future< + size_t>; + using 
found_record_handler_t = std::function< + scan_valid_records_ertr::future<>( + paddr_t record_block_base, + // callee may assume header and bl will remain valid until + // returned future resolves + const record_header_t &header, + const bufferlist &bl)>; + scan_valid_records_ret scan_valid_records( + scan_valid_records_cursor &cursor, ///< [in, out] cursor, updated during call + segment_nonce_t nonce, ///< [in] nonce for segment + size_t budget, ///< [in] max budget to use + found_record_handler_t &handler ///< [in] handler for records + ); ///< @return used budget + + /// replays records starting at start through end of segment + replay_ertr::future<> + replay_segment( + journal_seq_t start, ///< [in] starting addr, seq + segment_header_t header, ///< [in] segment header + delta_handler_t &delta_handler ///< [in] processes deltas in order + ); + +}; + +} +WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::segment_header_t) +WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::record_header_t) +WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::extent_info_t) diff --git a/src/crimson/os/seastore/lba_manager.cc b/src/crimson/os/seastore/lba_manager.cc new file mode 100644 index 000000000..73411dcf7 --- /dev/null +++ b/src/crimson/os/seastore/lba_manager.cc @@ -0,0 +1,17 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/os/seastore/segment_manager.h" +#include "crimson/os/seastore/cache.h" +#include "crimson/os/seastore/lba_manager.h" +#include "crimson/os/seastore/lba_manager/btree/btree_lba_manager.h" + +namespace crimson::os::seastore::lba_manager { + +LBAManagerRef create_lba_manager( + SegmentManager &segment_manager, + Cache &cache) { + return LBAManagerRef(new btree::BtreeLBAManager(segment_manager, cache)); +} + +} diff --git a/src/crimson/os/seastore/lba_manager.h b/src/crimson/os/seastore/lba_manager.h new file mode 100644 index 000000000..ad90f4c4f --- /dev/null +++ b/src/crimson/os/seastore/lba_manager.h @@ -0,0 +1,207 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> + +#include <seastar/core/future.hh> + +#include "include/ceph_assert.h" +#include "include/buffer_fwd.h" +#include "include/interval_set.h" +#include "common/interval_map.h" + +#include "crimson/osd/exceptions.h" + +#include "crimson/os/seastore/cache.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/segment_manager.h" + +namespace crimson::os::seastore { + +/** + * Abstract interface for managing the logical to physical mapping + */ +class LBAManager { +public: + using mkfs_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using mkfs_ret = mkfs_ertr::future<>; + virtual mkfs_ret mkfs( + Transaction &t + ) = 0; + + /** + * Fetches mappings for laddr_t in range [offset, offset + len) + * + * Future will not resolve until all pins have resolved (set_paddr called) + */ + using get_mapping_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using get_mapping_ret = get_mapping_ertr::future<lba_pin_list_t>; + virtual get_mapping_ret get_mapping( + Transaction &t, + laddr_t offset, extent_len_t length) = 0; + + /** + * Fetches mappings for laddr_t in range [offset, offset + len) + * + * Future will not result until all pins have resolved (set_paddr called) + */ + using get_mappings_ertr = 
crimson::errorator< + crimson::ct_error::input_output_error>; + using get_mappings_ret = get_mapping_ertr::future<lba_pin_list_t>; + virtual get_mappings_ret get_mappings( + Transaction &t, + laddr_list_t &&extent_lisk) = 0; + + /** + * Allocates a new mapping referenced by LBARef + * + * Offset will be relative to the block offset of the record + * This mapping will block from transaction submission until set_paddr + * is called on the LBAPin. + */ + using alloc_extent_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using alloc_extent_ret = alloc_extent_ertr::future<LBAPinRef>; + virtual alloc_extent_ret alloc_extent( + Transaction &t, + laddr_t hint, + extent_len_t len, + paddr_t addr) = 0; + + /** + * Creates a new absolute mapping. + * + * off~len must be unreferenced + */ + using set_extent_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg>; + using set_extent_ret = set_extent_ertr::future<LBAPinRef>; + virtual set_extent_ret set_extent( + Transaction &t, + laddr_t off, extent_len_t len, paddr_t addr) = 0; + + + struct ref_update_result_t { + unsigned refcount = 0; + paddr_t addr; + }; + using ref_ertr = crimson::errorator< + crimson::ct_error::enoent, + crimson::ct_error::input_output_error>; + using ref_ret = ref_ertr::future<ref_update_result_t>; + + /** + * Decrements ref count on extent + * + * @return returns resulting refcount + */ + virtual ref_ret decref_extent( + Transaction &t, + laddr_t addr) = 0; + + /** + * Increments ref count on extent + * + * @return returns resulting refcount + */ + virtual ref_ret incref_extent( + Transaction &t, + laddr_t addr) = 0; + + using complete_transaction_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using complete_transaction_ret = complete_transaction_ertr::future<>; + virtual complete_transaction_ret complete_transaction( + Transaction &t) = 0; + + /** + * Should be called after replay on each cached extent. + * Implementation must initialize the LBAPin on any + * LogicalCachedExtent's and may also read in any dependent + * structures, etc. 
+ */ + using init_cached_extent_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using init_cached_extent_ret = init_cached_extent_ertr::future<>; + virtual init_cached_extent_ret init_cached_extent( + Transaction &t, + CachedExtentRef e) = 0; + + /** + * Calls f for each mapping in [begin, end) + */ + using scan_mappings_ertr = SegmentManager::read_ertr; + using scan_mappings_ret = scan_mappings_ertr::future<>; + using scan_mappings_func_t = std::function< + void(laddr_t, paddr_t, extent_len_t)>; + virtual scan_mappings_ret scan_mappings( + Transaction &t, + laddr_t begin, + laddr_t end, + scan_mappings_func_t &&f) = 0; + + /** + * Calls f for each mapped space usage in [begin, end) + */ + using scan_mapped_space_ertr = SegmentManager::read_ertr; + using scan_mapped_space_ret = scan_mapped_space_ertr::future<>; + using scan_mapped_space_func_t = std::function< + void(paddr_t, extent_len_t)>; + virtual scan_mapped_space_ret scan_mapped_space( + Transaction &t, + scan_mapped_space_func_t &&f) = 0; + + /** + * rewrite_extent + * + * rewrite extent into passed transaction + */ + using rewrite_extent_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using rewrite_extent_ret = rewrite_extent_ertr::future<>; + virtual rewrite_extent_ret rewrite_extent( + Transaction &t, + CachedExtentRef extent) = 0; + + /** + * get_physical_extent_if_live + * + * Returns extent at addr/laddr if still live (if laddr + * still points at addr). Extent must be an internal, physical + * extent. + * + * Returns a null CachedExtentRef if extent is not live. + */ + using get_physical_extent_if_live_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using get_physical_extent_if_live_ret = + get_physical_extent_if_live_ertr::future<CachedExtentRef>; + virtual get_physical_extent_if_live_ret get_physical_extent_if_live( + Transaction &t, + extent_types_t type, + paddr_t addr, + laddr_t laddr, + segment_off_t len) = 0; + + virtual void add_pin(LBAPin &pin) = 0; + + virtual ~LBAManager() {} +}; +using LBAManagerRef = std::unique_ptr<LBAManager>; + +class Cache; +namespace lba_manager { +LBAManagerRef create_lba_manager( + SegmentManager &segment_manager, + Cache &cache); +} + +} diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc new file mode 100644 index 000000000..a837ae37e --- /dev/null +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc @@ -0,0 +1,580 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <sys/mman.h> +#include <string.h> + +#include "crimson/common/log.h" + +#include "include/buffer.h" +#include "crimson/os/seastore/lba_manager/btree/btree_lba_manager.h" +#include "crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h" + + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + +namespace crimson::os::seastore::lba_manager::btree { + +BtreeLBAManager::mkfs_ret BtreeLBAManager::mkfs( + Transaction &t) +{ + logger().debug("BtreeLBAManager::mkfs"); + return cache.get_root(t).safe_then([this, &t](auto croot) { + auto root_leaf = cache.alloc_new_extent<LBALeafNode>( + t, + LBA_BLOCK_SIZE); + root_leaf->set_size(0); + lba_node_meta_t meta{0, L_ADDR_MAX, 1}; + root_leaf->set_meta(meta); + root_leaf->pin.set_range(meta); + croot->get_root() = + root_t{ + 1, + 0, + root_leaf->get_paddr(), + make_record_relative_paddr(0), + 
L_ADDR_NULL}; + return mkfs_ertr::now(); + }); +} + +BtreeLBAManager::get_root_ret +BtreeLBAManager::get_root(Transaction &t) +{ + return cache.get_root(t).safe_then([this, &t](auto croot) { + logger().debug( + "BtreeLBAManager::get_root: reading root at {} depth {}", + paddr_t{croot->get_root().lba_root_addr}, + unsigned(croot->get_root().lba_depth)); + return get_lba_btree_extent( + get_context(t), + croot->get_root().lba_depth, + croot->get_root().lba_root_addr, + paddr_t()); + }); +} + +BtreeLBAManager::get_mapping_ret +BtreeLBAManager::get_mapping( + Transaction &t, + laddr_t offset, extent_len_t length) +{ + logger().debug("BtreeLBAManager::get_mapping: {}, {}", offset, length); + return get_root( + t).safe_then([this, &t, offset, length](auto extent) { + return extent->lookup_range( + get_context(t), + offset, length + ).safe_then([extent](auto ret) { return ret; }); + }).safe_then([](auto &&e) { + logger().debug("BtreeLBAManager::get_mapping: got mapping {}", e); + return get_mapping_ret( + get_mapping_ertr::ready_future_marker{}, + std::move(e)); + }); +} + + +BtreeLBAManager::get_mappings_ret +BtreeLBAManager::get_mappings( + Transaction &t, + laddr_list_t &&list) +{ + logger().debug("BtreeLBAManager::get_mappings: {}", list); + auto l = std::make_unique<laddr_list_t>(std::move(list)); + auto retptr = std::make_unique<lba_pin_list_t>(); + auto &ret = *retptr; + return crimson::do_for_each( + l->begin(), + l->end(), + [this, &t, &ret](const auto &p) { + return get_mapping(t, p.first, p.second).safe_then( + [&ret](auto res) { + ret.splice(ret.end(), res, res.begin(), res.end()); + }); + }).safe_then([l=std::move(l), retptr=std::move(retptr)]() mutable { + return std::move(*retptr); + }); +} + +BtreeLBAManager::alloc_extent_ret +BtreeLBAManager::alloc_extent( + Transaction &t, + laddr_t hint, + extent_len_t len, + paddr_t addr) +{ + // TODO: we can certainly combine the lookup and the insert. 
+ return get_root( + t).safe_then([this, &t, hint, len](auto extent) { + logger().debug( + "BtreeLBAManager::alloc_extent: beginning search at {}", + *extent); + return extent->find_hole( + get_context(t), + hint, + L_ADDR_MAX, + len).safe_then([extent](auto ret) { + return std::make_pair(ret, extent); + }); + }).safe_then([this, &t, len, addr](auto allocation_pair) { + auto &[laddr, extent] = allocation_pair; + ceph_assert(laddr != L_ADDR_MAX); + return insert_mapping( + t, + extent, + laddr, + { len, addr, 1, 0 } + ).safe_then([laddr=laddr, addr, len](auto pin) { + logger().debug( + "BtreeLBAManager::alloc_extent: alloc {}~{} for {}", + laddr, + len, + addr); + return alloc_extent_ret( + alloc_extent_ertr::ready_future_marker{}, + LBAPinRef(pin.release())); + }); + }); +} + +BtreeLBAManager::set_extent_ret +BtreeLBAManager::set_extent( + Transaction &t, + laddr_t off, extent_len_t len, paddr_t addr) +{ + return get_root( + t).safe_then([this, &t, off, len, addr](auto root) { + return insert_mapping( + t, + root, + off, + { len, addr, 1, 0 }); + }).safe_then([](auto ret) { + return set_extent_ret( + set_extent_ertr::ready_future_marker{}, + LBAPinRef(ret.release())); + }); +} + +static bool is_lba_node(extent_types_t type) +{ + return type == extent_types_t::LADDR_INTERNAL || + type == extent_types_t::LADDR_LEAF; +} + +static bool is_lba_node(const CachedExtent &e) +{ + return is_lba_node(e.get_type()); +} + +btree_range_pin_t &BtreeLBAManager::get_pin(CachedExtent &e) +{ + if (is_lba_node(e)) { + return e.cast<LBANode>()->pin; + } else if (e.is_logical()) { + return static_cast<BtreeLBAPin &>( + e.cast<LogicalCachedExtent>()->get_pin()).pin; + } else { + ceph_abort_msg("impossible"); + } +} + +static depth_t get_depth(const CachedExtent &e) +{ + if (is_lba_node(e)) { + return e.cast<LBANode>()->get_node_meta().depth; + } else if (e.is_logical()) { + return 0; + } else { + ceph_assert(0 == "currently impossible"); + return 0; + } +} + +BtreeLBAManager::complete_transaction_ret +BtreeLBAManager::complete_transaction( + Transaction &t) +{ + std::vector<CachedExtentRef> to_clear; + to_clear.reserve(t.get_retired_set().size()); + for (auto &e: t.get_retired_set()) { + if (e->is_logical() || is_lba_node(*e)) + to_clear.push_back(e); + } + // need to call check_parent from leaf->parent + std::sort( + to_clear.begin(), to_clear.end(), + [](auto &l, auto &r) { return get_depth(*l) < get_depth(*r); }); + + for (auto &e: to_clear) { + auto &pin = get_pin(*e); + logger().debug("{}: retiring {}, {}", __func__, *e, pin); + pin_set.retire(pin); + } + + // ...but add_pin from parent->leaf + std::vector<CachedExtentRef> to_link; + to_link.reserve(t.get_fresh_block_list().size()); + for (auto &e: t.get_fresh_block_list()) { + if (e->is_valid() && (is_lba_node(*e) || e->is_logical())) + to_link.push_back(e); + } + std::sort( + to_link.begin(), to_link.end(), + [](auto &l, auto &r) -> bool { return get_depth(*l) > get_depth(*r); }); + + for (auto &e : to_link) { + logger().debug("{}: linking {}", __func__, *e); + pin_set.add_pin(get_pin(*e)); + } + + for (auto &e: to_clear) { + auto &pin = get_pin(*e); + logger().debug("{}: checking {}, {}", __func__, *e, pin); + pin_set.check_parent(pin); + } + return complete_transaction_ertr::now(); +} + +BtreeLBAManager::init_cached_extent_ret BtreeLBAManager::init_cached_extent( + Transaction &t, + CachedExtentRef e) +{ + logger().debug("{}: {}", __func__, *e); + return get_root(t).safe_then( + [this, &t, e=std::move(e)](LBANodeRef root) mutable { + if 
(is_lba_node(*e)) { + auto lban = e->cast<LBANode>(); + logger().debug("init_cached_extent: lba node, getting root"); + return root->lookup( + op_context_t{cache, pin_set, t}, + lban->get_node_meta().begin, + lban->get_node_meta().depth + ).safe_then([this, e=std::move(e)](LBANodeRef c) { + if (c->get_paddr() == e->get_paddr()) { + assert(&*c == &*e); + logger().debug("init_cached_extent: {} initialized", *e); + } else { + // e is obsolete + logger().debug("init_cached_extent: {} obsolete", *e); + cache.drop_from_cache(e); + } + return init_cached_extent_ertr::now(); + }); + } else if (e->is_logical()) { + auto logn = e->cast<LogicalCachedExtent>(); + return root->lookup_range( + op_context_t{cache, pin_set, t}, + logn->get_laddr(), + logn->get_length()).safe_then( + [this, logn=std::move(logn)](auto pins) { + if (pins.size() == 1) { + auto pin = std::move(pins.front()); + pins.pop_front(); + if (pin->get_paddr() == logn->get_paddr()) { + logn->set_pin(std::move(pin)); + pin_set.add_pin( + static_cast<BtreeLBAPin&>(logn->get_pin()).pin); + logger().debug("init_cached_extent: {} initialized", *logn); + } else { + // paddr doesn't match, remapped, obsolete + logger().debug("init_cached_extent: {} obsolete", *logn); + cache.drop_from_cache(logn); + } + } else { + // set of extents changed, obsolete + logger().debug("init_cached_extent: {} obsolete", *logn); + cache.drop_from_cache(logn); + } + return init_cached_extent_ertr::now(); + }); + } else { + logger().debug("init_cached_extent: {} skipped", *e); + return init_cached_extent_ertr::now(); + } + }); +} + +BtreeLBAManager::scan_mappings_ret BtreeLBAManager::scan_mappings( + Transaction &t, + laddr_t begin, + laddr_t end, + scan_mappings_func_t &&f) +{ + return seastar::do_with( + std::move(f), + LBANodeRef(), + [=, &t](auto &f, auto &lbarootref) { + return get_root(t).safe_then( + [=, &t, &f](LBANodeRef lbaroot) mutable { + lbarootref = lbaroot; + return lbaroot->scan_mappings( + get_context(t), + begin, + end, + f); + }); + }); +} + +BtreeLBAManager::scan_mapped_space_ret BtreeLBAManager::scan_mapped_space( + Transaction &t, + scan_mapped_space_func_t &&f) +{ + return seastar::do_with( + std::move(f), + LBANodeRef(), + [=, &t](auto &f, auto &lbarootref) { + return get_root(t).safe_then( + [=, &t, &f](LBANodeRef lbaroot) mutable { + lbarootref = lbaroot; + return lbaroot->scan_mapped_space( + get_context(t), + f); + }); + }); +} + +BtreeLBAManager::rewrite_extent_ret BtreeLBAManager::rewrite_extent( + Transaction &t, + CachedExtentRef extent) +{ + if (extent->is_logical()) { + auto lextent = extent->cast<LogicalCachedExtent>(); + cache.retire_extent(t, extent); + auto nlextent = cache.alloc_new_extent_by_type( + t, + lextent->get_type(), + lextent->get_length())->cast<LogicalCachedExtent>(); + lextent->get_bptr().copy_out( + 0, + lextent->get_length(), + nlextent->get_bptr().c_str()); + nlextent->set_laddr(lextent->get_laddr()); + nlextent->set_pin(lextent->get_pin().duplicate()); + + logger().debug( + "{}: rewriting {} into {}", + __func__, + *lextent, + *nlextent); + + return update_mapping( + t, + lextent->get_laddr(), + [prev_addr = lextent->get_paddr(), addr = nlextent->get_paddr()]( + const lba_map_val_t &in) { + lba_map_val_t ret = in; + ceph_assert(in.paddr == prev_addr); + ret.paddr = addr; + return ret; + }).safe_then([nlextent](auto e) {}).handle_error( + rewrite_extent_ertr::pass_further{}, + /* ENOENT in particular should be impossible */ + crimson::ct_error::assert_all{} + ); + } else if (is_lba_node(*extent)) { + auto 
lba_extent = extent->cast<LBANode>(); + cache.retire_extent(t, extent); + auto nlba_extent = cache.alloc_new_extent_by_type( + t, + lba_extent->get_type(), + lba_extent->get_length())->cast<LBANode>(); + lba_extent->get_bptr().copy_out( + 0, + lba_extent->get_length(), + nlba_extent->get_bptr().c_str()); + nlba_extent->pin.set_range(nlba_extent->get_node_meta()); + + /* This is a bit underhanded. Any relative addrs here must necessarily + * be record relative as we are rewriting a dirty extent. Thus, we + * are using resolve_relative_addrs with a (likely negative) block + * relative offset to correct them to block-relative offsets adjusted + * for our new transaction location. + * + * Upon commit, these now block relative addresses will be interpretted + * against the real final address. + */ + nlba_extent->resolve_relative_addrs( + make_record_relative_paddr(0) - nlba_extent->get_paddr()); + + return update_internal_mapping( + t, + nlba_extent->get_node_meta().depth, + nlba_extent->get_node_meta().begin, + nlba_extent->get_paddr()).safe_then( + [](auto) {}, + rewrite_extent_ertr::pass_further {}, + crimson::ct_error::assert_all{}); + } else { + return rewrite_extent_ertr::now(); + } +} + +BtreeLBAManager::get_physical_extent_if_live_ret +BtreeLBAManager::get_physical_extent_if_live( + Transaction &t, + extent_types_t type, + paddr_t addr, + laddr_t laddr, + segment_off_t len) +{ + ceph_assert(is_lba_node(type)); + return cache.get_extent_by_type( + t, + type, + addr, + laddr, + len + ).safe_then([=, &t](CachedExtentRef extent) { + return get_root(t).safe_then([=, &t](LBANodeRef root) { + auto lba_node = extent->cast<LBANode>(); + return root->lookup( + op_context_t{cache, pin_set, t}, + lba_node->get_node_meta().begin, + lba_node->get_node_meta().depth).safe_then([=](LBANodeRef c) { + if (c->get_paddr() == lba_node->get_paddr()) { + return get_physical_extent_if_live_ret( + get_physical_extent_if_live_ertr::ready_future_marker{}, + lba_node); + } else { + cache.drop_from_cache(lba_node); + return get_physical_extent_if_live_ret( + get_physical_extent_if_live_ertr::ready_future_marker{}, + CachedExtentRef()); + } + }); + }); + }); +} + +BtreeLBAManager::BtreeLBAManager( + SegmentManager &segment_manager, + Cache &cache) + : segment_manager(segment_manager), + cache(cache) {} + +BtreeLBAManager::insert_mapping_ret BtreeLBAManager::insert_mapping( + Transaction &t, + LBANodeRef root, + laddr_t laddr, + lba_map_val_t val) +{ + auto split = insert_mapping_ertr::future<LBANodeRef>( + insert_mapping_ertr::ready_future_marker{}, + root); + if (root->at_max_capacity()) { + split = cache.get_root(t).safe_then( + [this, root, laddr, &t](RootBlockRef croot) { + logger().debug( + "BtreeLBAManager::insert_mapping: splitting root {}", + *croot); + { + auto mut_croot = cache.duplicate_for_write(t, croot); + croot = mut_croot->cast<RootBlock>(); + } + auto nroot = cache.alloc_new_extent<LBAInternalNode>(t, LBA_BLOCK_SIZE); + lba_node_meta_t meta{0, L_ADDR_MAX, root->get_node_meta().depth + 1}; + nroot->set_meta(meta); + nroot->pin.set_range(meta); + nroot->journal_insert( + nroot->begin(), + L_ADDR_MIN, + root->get_paddr(), + nullptr); + croot->get_root().lba_root_addr = nroot->get_paddr(); + croot->get_root().lba_depth = root->get_node_meta().depth + 1; + return nroot->split_entry( + get_context(t), + laddr, nroot->begin(), root); + }); + } + return split.safe_then([this, &t, laddr, val](LBANodeRef node) { + return node->insert( + get_context(t), + laddr, val); + }); +} + 
+BtreeLBAManager::update_refcount_ret BtreeLBAManager::update_refcount( + Transaction &t, + laddr_t addr, + int delta) +{ + return update_mapping( + t, + addr, + [delta](const lba_map_val_t &in) { + lba_map_val_t out = in; + ceph_assert((int)out.refcount + delta >= 0); + out.refcount += delta; + return out; + }).safe_then([](auto result) { + return ref_update_result_t{result.refcount, result.paddr}; + }); +} + +BtreeLBAManager::update_mapping_ret BtreeLBAManager::update_mapping( + Transaction &t, + laddr_t addr, + update_func_t &&f) +{ + return get_root(t + ).safe_then([this, f=std::move(f), &t, addr](LBANodeRef root) mutable { + return root->mutate_mapping( + get_context(t), + addr, + std::move(f)); + }); +} + +BtreeLBAManager::update_internal_mapping_ret +BtreeLBAManager::update_internal_mapping( + Transaction &t, + depth_t depth, + laddr_t laddr, + paddr_t paddr) +{ + return cache.get_root(t).safe_then([=, &t](RootBlockRef croot) { + if (depth == croot->get_root().lba_depth) { + logger().debug( + "update_internal_mapping: updating lba root to: {}->{}", + laddr, + paddr); + { + auto mut_croot = cache.duplicate_for_write(t, croot); + croot = mut_croot->cast<RootBlock>(); + } + ceph_assert(laddr == 0); + auto old_paddr = croot->get_root().lba_root_addr; + croot->get_root().lba_root_addr = paddr; + return update_internal_mapping_ret( + update_internal_mapping_ertr::ready_future_marker{}, + old_paddr); + } else { + logger().debug( + "update_internal_mapping: updating lba node at depth {} to: {}->{}", + depth, + laddr, + paddr); + return get_lba_btree_extent( + get_context(t), + croot->get_root().lba_depth, + croot->get_root().lba_root_addr, + paddr_t()).safe_then([=, &t](LBANodeRef broot) { + return broot->mutate_internal_address( + get_context(t), + depth, + laddr, + paddr); + }); + } + }); +} + +} diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h new file mode 100644 index 000000000..640d56734 --- /dev/null +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h @@ -0,0 +1,188 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> +#include <seastar/core/future.hh> + +#include "include/ceph_assert.h" +#include "include/buffer_fwd.h" +#include "include/interval_set.h" +#include "common/interval_map.h" +#include "crimson/osd/exceptions.h" + +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/lba_manager.h" +#include "crimson/os/seastore/cache.h" +#include "crimson/os/seastore/segment_manager.h" + +#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h" + +namespace crimson::os::seastore::lba_manager::btree { + +/** + * BtreeLBAManager + * + * Uses a wandering btree to track two things: + * 1) lba state including laddr_t -> paddr_t mapping + * 2) reverse paddr_t -> laddr_t mapping for gc (TODO) + * + * Generally, any transaction will involve + * 1) deltas against lba tree nodes + * 2) new lba tree nodes + * - Note, there must necessarily be a delta linking + * these new nodes into the tree -- might be a + * bootstrap_state_t delta if new root + * + * get_mappings, alloc_extent_*, etc populate a Transaction + * which then gets submitted + */ +class BtreeLBAManager : public LBAManager { +public: + BtreeLBAManager( + SegmentManager &segment_manager, + Cache &cache); + + mkfs_ret 
mkfs( + Transaction &t) final; + + get_mapping_ret get_mapping( + Transaction &t, + laddr_t offset, extent_len_t length) final; + + get_mappings_ret get_mappings( + Transaction &t, + laddr_list_t &&list) final; + + alloc_extent_ret alloc_extent( + Transaction &t, + laddr_t hint, + extent_len_t len, + paddr_t addr) final; + + set_extent_ret set_extent( + Transaction &t, + laddr_t off, extent_len_t len, paddr_t addr) final; + + ref_ret decref_extent( + Transaction &t, + laddr_t addr) final { + return update_refcount(t, addr, -1); + } + + ref_ret incref_extent( + Transaction &t, + laddr_t addr) final { + return update_refcount(t, addr, 1); + } + + complete_transaction_ret complete_transaction( + Transaction &t) final; + + init_cached_extent_ret init_cached_extent( + Transaction &t, + CachedExtentRef e) final; + + scan_mappings_ret scan_mappings( + Transaction &t, + laddr_t begin, + laddr_t end, + scan_mappings_func_t &&f) final; + + scan_mapped_space_ret scan_mapped_space( + Transaction &t, + scan_mapped_space_func_t &&f) final; + + rewrite_extent_ret rewrite_extent( + Transaction &t, + CachedExtentRef extent); + + get_physical_extent_if_live_ret get_physical_extent_if_live( + Transaction &t, + extent_types_t type, + paddr_t addr, + laddr_t laddr, + segment_off_t len) final; + + void add_pin(LBAPin &pin) final { + auto *bpin = reinterpret_cast<BtreeLBAPin*>(&pin); + pin_set.add_pin(bpin->pin); + bpin->parent = nullptr; + } + +private: + SegmentManager &segment_manager; + Cache &cache; + + btree_pin_set_t pin_set; + + op_context_t get_context(Transaction &t) { + return op_context_t{cache, pin_set, t}; + } + + static btree_range_pin_t &get_pin(CachedExtent &e); + + + /** + * get_root + * + * Get a reference to the root LBANode. + */ + using get_root_ertr = Cache::get_extent_ertr; + using get_root_ret = get_root_ertr::future<LBANodeRef>; + get_root_ret get_root(Transaction &); + + /** + * insert_mapping + * + * Insert a lba mapping into the tree + */ + using insert_mapping_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using insert_mapping_ret = insert_mapping_ertr::future<LBAPinRef>; + insert_mapping_ret insert_mapping( + Transaction &t, ///< [in,out] transaction + LBANodeRef root, ///< [in] root node + laddr_t laddr, ///< [in] logical addr to insert + lba_map_val_t val ///< [in] mapping to insert + ); + + /** + * update_refcount + * + * Updates refcount, returns resulting refcount + */ + using update_refcount_ret = ref_ret; + update_refcount_ret update_refcount( + Transaction &t, + laddr_t addr, + int delta); + + /** + * update_mapping + * + * Updates mapping, removes if f returns nullopt + */ + using update_mapping_ertr = ref_ertr; + using update_mapping_ret = ref_ertr::future<lba_map_val_t>; + using update_func_t = LBANode::mutate_func_t; + update_mapping_ret update_mapping( + Transaction &t, + laddr_t addr, + update_func_t &&f); + + using update_internal_mapping_ertr = LBANode::mutate_internal_address_ertr; + using update_internal_mapping_ret = LBANode::mutate_internal_address_ret; + update_internal_mapping_ret update_internal_mapping( + Transaction &t, + depth_t depth, + laddr_t laddr, + paddr_t paddr); +}; +using BtreeLBAManagerRef = std::unique_ptr<BtreeLBAManager>; + +} diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.cc b/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.cc new file mode 100644 index 000000000..a86c3cc57 --- /dev/null +++ b/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.cc @@ -0,0 +1,153 @@ +// -*- 
mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/common/log.h" + +#include "crimson/os/seastore/lba_manager/btree/btree_range_pin.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + +namespace crimson::os::seastore::lba_manager::btree { + +void btree_range_pin_t::take_pin(btree_range_pin_t &other) +{ + assert(other.extent); + assert(other.pins); + other.pins->replace_pin(*this, other); + pins = other.pins; + other.pins = nullptr; + + if (other.has_ref()) { + other.drop_ref(); + acquire_ref(); + } +} + +btree_range_pin_t::~btree_range_pin_t() +{ + assert(!pins == !is_linked()); + assert(!ref); + if (pins) { + logger().debug("{}: removing {}", __func__, *this); + pins->remove_pin(*this, true); + } + extent = nullptr; +} + +void btree_pin_set_t::replace_pin(btree_range_pin_t &to, btree_range_pin_t &from) +{ + pins.replace_node(pins.iterator_to(from), to); +} + +void btree_pin_set_t::remove_pin(btree_range_pin_t &pin, bool do_check_parent) +{ + logger().debug("{}: {}", __func__, pin); + assert(pin.is_linked()); + assert(pin.pins); + assert(!pin.ref); + + pins.erase(pin); + pin.pins = nullptr; + + if (do_check_parent) { + check_parent(pin); + } +} + +btree_range_pin_t *btree_pin_set_t::maybe_get_parent( + const lba_node_meta_t &meta) +{ + auto cmeta = meta; + cmeta.depth++; + auto iter = pins.upper_bound(cmeta, btree_range_pin_t::meta_cmp_t()); + if (iter == pins.begin()) { + return nullptr; + } else { + --iter; + if (iter->range.is_parent_of(meta)) { + return &*iter; + } else { + return nullptr; + } + } +} + +const btree_range_pin_t *btree_pin_set_t::maybe_get_first_child( + const lba_node_meta_t &meta) const +{ + if (meta.depth == 0) { + return nullptr; + } + + auto cmeta = meta; + cmeta.depth--; + + auto iter = pins.lower_bound(cmeta, btree_range_pin_t::meta_cmp_t()); + if (iter == pins.end()) { + return nullptr; + } else if (meta.is_parent_of(iter->range)) { + return &*iter; + } else { + return nullptr; + } +} + +void btree_pin_set_t::release_if_no_children(btree_range_pin_t &pin) +{ + assert(pin.is_linked()); + if (maybe_get_first_child(pin.range) == nullptr) { + pin.drop_ref(); + } +} + +void btree_pin_set_t::add_pin(btree_range_pin_t &pin) +{ + assert(!pin.is_linked()); + assert(!pin.pins); + assert(!pin.ref); + + auto [prev, inserted] = pins.insert(pin); + if (!inserted) { + logger().error("{}: unable to add {}, found {}", __func__, pin, *prev); + assert(0 == "impossible"); + return; + } + pin.pins = this; + if (!pin.is_root()) { + auto *parent = maybe_get_parent(pin.range); + assert(parent); + if (!parent->has_ref()) { + logger().debug("{}: acquiring parent {}", __func__, + static_cast<void*>(parent)); + parent->acquire_ref(); + } else { + logger().debug("{}: parent has ref {}", __func__, + static_cast<void*>(parent)); + } + } + if (maybe_get_first_child(pin.range) != nullptr) { + logger().debug("{}: acquiring self {}", __func__, pin); + pin.acquire_ref(); + } +} + +void btree_pin_set_t::retire(btree_range_pin_t &pin) +{ + pin.drop_ref(); + remove_pin(pin, false); +} + +void btree_pin_set_t::check_parent(btree_range_pin_t &pin) +{ + auto parent = maybe_get_parent(pin.range); + if (parent) { + logger().debug("{}: releasing parent {}", __func__, *parent); + release_if_no_children(*parent); + } +} + +} diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.h b/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.h new file mode 100644 index 
000000000..3fa218fc8 --- /dev/null +++ b/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.h @@ -0,0 +1,274 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/intrusive/set.hpp> + +#include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/seastore_types.h" + +namespace crimson::os::seastore::lba_manager::btree { + +class LBANode; +using LBANodeRef = TCachedExtentRef<LBANode>; + +struct lba_node_meta_t { + laddr_t begin = 0; + laddr_t end = 0; + depth_t depth = 0; + + bool is_parent_of(const lba_node_meta_t &other) const { + return (depth == other.depth + 1) && + (begin <= other.begin) && + (end >= other.end); + } + + std::pair<lba_node_meta_t, lba_node_meta_t> split_into(laddr_t pivot) const { + return std::make_pair( + lba_node_meta_t{begin, pivot, depth}, + lba_node_meta_t{pivot, end, depth}); + } + + static lba_node_meta_t merge_from( + const lba_node_meta_t &lhs, const lba_node_meta_t &rhs) { + assert(lhs.depth == rhs.depth); + return lba_node_meta_t{lhs.begin, rhs.end, lhs.depth}; + } + + static std::pair<lba_node_meta_t, lba_node_meta_t> + rebalance(const lba_node_meta_t &lhs, const lba_node_meta_t &rhs, laddr_t pivot) { + assert(lhs.depth == rhs.depth); + return std::make_pair( + lba_node_meta_t{lhs.begin, pivot, lhs.depth}, + lba_node_meta_t{pivot, rhs.end, lhs.depth}); + } + + bool is_root() const { + return begin == 0 && end == L_ADDR_MAX; + } +}; + +inline std::ostream &operator<<( + std::ostream &lhs, + const lba_node_meta_t &rhs) +{ + return lhs << "btree_node_meta_t(" + << "begin=" << rhs.begin + << ", end=" << rhs.end + << ", depth=" << rhs.depth + << ")"; +} + +/** + * btree_range_pin_t + * + * Element tracked by btree_pin_set_t below. Encapsulates the intrusive_set + * hook, the lba_node_meta_t representing the lba range covered by a node, + * and extent and ref members intended to hold a reference when the extent + * should be pinned. 
+ */ +class btree_pin_set_t; +class btree_range_pin_t : public boost::intrusive::set_base_hook<> { + friend class btree_pin_set_t; + lba_node_meta_t range; + + btree_pin_set_t *pins = nullptr; + + // We need to be able to remember extent without holding a reference, + // but we can do it more compactly -- TODO + CachedExtent *extent = nullptr; + CachedExtentRef ref; + + using index_t = boost::intrusive::set<btree_range_pin_t>; + + static auto get_tuple(const lba_node_meta_t &meta) { + return std::make_tuple(-meta.depth, meta.begin); + } + + void acquire_ref() { + ref = CachedExtentRef(extent); + } + + void drop_ref() { + ref.reset(); + } + +public: + btree_range_pin_t() = default; + btree_range_pin_t(CachedExtent *extent) + : extent(extent) {} + btree_range_pin_t(const btree_range_pin_t &rhs, CachedExtent *extent) + : range(rhs.range), extent(extent) {} + + bool has_ref() const { + return !!ref; + } + + bool is_root() const { + return range.is_root(); + } + + void set_range(const lba_node_meta_t &nrange) { + range = nrange; + } + void set_extent(CachedExtent *nextent) { + assert(!extent); + extent = nextent; + } + + void take_pin(btree_range_pin_t &other); + + friend bool operator<( + const btree_range_pin_t &lhs, const btree_range_pin_t &rhs) { + return get_tuple(lhs.range) < get_tuple(rhs.range); + } + friend bool operator>( + const btree_range_pin_t &lhs, const btree_range_pin_t &rhs) { + return get_tuple(lhs.range) > get_tuple(rhs.range); + } + friend bool operator==( + const btree_range_pin_t &lhs, const btree_range_pin_t &rhs) { + return get_tuple(lhs.range) == rhs.get_tuple(rhs.range); + } + + struct meta_cmp_t { + bool operator()( + const btree_range_pin_t &lhs, const lba_node_meta_t &rhs) const { + return get_tuple(lhs.range) < get_tuple(rhs); + } + bool operator()( + const lba_node_meta_t &lhs, const btree_range_pin_t &rhs) const { + return get_tuple(lhs) < get_tuple(rhs.range); + } + }; + + friend std::ostream &operator<<( + std::ostream &lhs, + const btree_range_pin_t &rhs) { + return lhs << "btree_range_pin_t(" + << "begin=" << rhs.range.begin + << ", end=" << rhs.range.end + << ", depth=" << rhs.range.depth + << ", extent=" << rhs.extent + << ")"; + } + + friend class BtreeLBAPin; + ~btree_range_pin_t(); +}; + +/** + * btree_pin_set_t + * + * Ensures that for every cached node, all parent LBANodes required + * to map it are present in cache. Relocating these nodes can + * therefore be done without further reads or cache space. + * + * Contains a btree_range_pin_t for every clean or dirty LBANode + * or LogicalCachedExtent instance in cache at any point in time. + * For any LBANode, the contained btree_range_pin_t will hold + * a reference to that node pinning it in cache as long as that + * node has children in the set. This invariant can be violated + * only by calling retire_extent and is repaired by calling + * check_parent synchronously after adding any new extents. 
+ */ +class btree_pin_set_t { + friend class btree_range_pin_t; + using pins_t = btree_range_pin_t::index_t; + pins_t pins; + + pins_t::iterator get_iter(btree_range_pin_t &pin) { + return pins_t::s_iterator_to(pin); + } + + /// Removes pin from set optionally checking whether parent has other children + void remove_pin(btree_range_pin_t &pin, bool check_parent); + + void replace_pin(btree_range_pin_t &to, btree_range_pin_t &from); + + /// Returns parent pin if exists + btree_range_pin_t *maybe_get_parent(const lba_node_meta_t &pin); + + /// Returns earliest child pin if exist + const btree_range_pin_t *maybe_get_first_child(const lba_node_meta_t &pin) const; + + /// Releases pin if it has no children + void release_if_no_children(btree_range_pin_t &pin); + +public: + /// Adds pin to set, assumes set is consistent + void add_pin(btree_range_pin_t &pin); + + /** + * retire/check_parent + * + * See BtreeLBAManager::complete_transaction. + * retire removes the specified pin from the set, but does not + * check parents. After any new extents are added to the set, + * the caller is required to call check_parent to restore the + * invariant. + */ + void retire(btree_range_pin_t &pin); + void check_parent(btree_range_pin_t &pin); + + ~btree_pin_set_t() { + assert(pins.empty()); + } +}; + +class BtreeLBAPin : public LBAPin { + friend class BtreeLBAManager; + + /** + * parent + * + * populated until link_extent is called to ensure cache residence + * until add_pin is called. + */ + CachedExtentRef parent; + + paddr_t paddr; + btree_range_pin_t pin; + +public: + BtreeLBAPin() = default; + + BtreeLBAPin( + CachedExtentRef parent, + paddr_t paddr, + lba_node_meta_t &&meta) + : parent(parent), paddr(paddr) { + pin.set_range(std::move(meta)); + } + + void link_extent(LogicalCachedExtent *ref) final { + pin.set_extent(ref); + } + + extent_len_t get_length() const final { + assert(pin.range.end > pin.range.begin); + return pin.range.end - pin.range.begin; + } + + paddr_t get_paddr() const final { + return paddr; + } + + laddr_t get_laddr() const final { + return pin.range.begin; + } + + LBAPinRef duplicate() const final { + auto ret = std::unique_ptr<BtreeLBAPin>(new BtreeLBAPin); + ret->pin.set_range(pin.range); + ret->paddr = paddr; + return ret; + } + + void take_pin(LBAPin &opin) final { + pin.take_pin(static_cast<BtreeLBAPin&>(opin).pin); + } +}; + +} diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h new file mode 100644 index 000000000..b6f33a1ae --- /dev/null +++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h @@ -0,0 +1,269 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <sys/mman.h> +#include <memory> +#include <string.h> + +#include "crimson/common/log.h" +#include "crimson/os/seastore/lba_manager/btree/btree_range_pin.h" +#include "crimson/os/seastore/lba_manager.h" + +namespace crimson::os::seastore::lba_manager::btree { + +struct op_context_t { + Cache &cache; + btree_pin_set_t &pins; + Transaction &trans; +}; + +/** + * lba_map_val_t + * + * struct representing a single lba mapping + */ +struct lba_map_val_t { + extent_len_t len = 0; ///< length of mapping + paddr_t paddr; ///< physical addr of mapping + uint32_t refcount = 0; ///< refcount + uint32_t checksum = 0; ///< checksum of original block written at paddr (TODO) + + lba_map_val_t( + extent_len_t len, + paddr_t paddr, + uint32_t refcount, + uint32_t checksum) 
+ : len(len), paddr(paddr), refcount(refcount), checksum(checksum) {} +}; + +class BtreeLBAPin; +using BtreeLBAPinRef = std::unique_ptr<BtreeLBAPin>; + +/** + * LBANode + * + * Base class enabling recursive lookup between internal and leaf nodes. + */ +struct LBANode : CachedExtent { + using LBANodeRef = TCachedExtentRef<LBANode>; + using lookup_range_ertr = LBAManager::get_mapping_ertr; + using lookup_range_ret = LBAManager::get_mapping_ret; + + btree_range_pin_t pin; + + LBANode(ceph::bufferptr &&ptr) : CachedExtent(std::move(ptr)), pin(this) {} + LBANode(const LBANode &rhs) + : CachedExtent(rhs), pin(rhs.pin, this) {} + + virtual lba_node_meta_t get_node_meta() const = 0; + + /** + * lookup + * + * Returns the node at the specified depth responsible + * for laddr + */ + using lookup_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using lookup_ret = lookup_ertr::future<LBANodeRef>; + virtual lookup_ret lookup( + op_context_t c, + laddr_t addr, + depth_t depth) = 0; + + /** + * lookup_range + * + * Returns mappings within range [addr, addr+len) + */ + virtual lookup_range_ret lookup_range( + op_context_t c, + laddr_t addr, + extent_len_t len) = 0; + + /** + * insert + * + * Recursively inserts into subtree rooted at *this. Caller + * must already have handled splitting if at_max_capacity(). + * + * Precondition: !at_max_capacity() + */ + using insert_ertr = crimson::errorator< + crimson::ct_error::input_output_error + >; + using insert_ret = insert_ertr::future<LBAPinRef>; + virtual insert_ret insert( + op_context_t c, + laddr_t laddr, + lba_map_val_t val) = 0; + + /** + * find_hole + * + * Finds minimum hole of size len in [min, max) + * + * @return addr of hole, L_ADDR_NULL if unfound + */ + using find_hole_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using find_hole_ret = find_hole_ertr::future<laddr_t>; + virtual find_hole_ret find_hole( + op_context_t c, + laddr_t min, + laddr_t max, + extent_len_t len) = 0; + + /** + * scan_mappings + * + * Call f for all mappings in [begin, end) + */ + using scan_mappings_ertr = LBAManager::scan_mappings_ertr; + using scan_mappings_ret = LBAManager::scan_mappings_ret; + using scan_mappings_func_t = LBAManager::scan_mappings_func_t; + virtual scan_mappings_ret scan_mappings( + op_context_t c, + laddr_t begin, + laddr_t end, + scan_mappings_func_t &f) = 0; + + using scan_mapped_space_ertr = LBAManager::scan_mapped_space_ertr; + using scan_mapped_space_ret = LBAManager::scan_mapped_space_ret; + using scan_mapped_space_func_t = LBAManager::scan_mapped_space_func_t; + virtual scan_mapped_space_ret scan_mapped_space( + op_context_t c, + scan_mapped_space_func_t &f) = 0; + + /** + * mutate_mapping + * + * Lookups up laddr, calls f on value. If f returns a value, inserts it. + * If it returns nullopt, removes the value. + * Caller must already have merged if at_min_capacity(). + * + * Recursive calls use mutate_mapping_internal. 
+ * + * Precondition: !at_min_capacity() + */ + using mutate_mapping_ertr = crimson::errorator< + crimson::ct_error::enoent, ///< mapping does not exist + crimson::ct_error::input_output_error + >; + using mutate_mapping_ret = mutate_mapping_ertr::future< + lba_map_val_t>; + using mutate_func_t = std::function< + lba_map_val_t(const lba_map_val_t &v) + >; + virtual mutate_mapping_ret mutate_mapping( + op_context_t c, + laddr_t laddr, + mutate_func_t &&f) = 0; + virtual mutate_mapping_ret mutate_mapping_internal( + op_context_t c, + laddr_t laddr, + bool is_root, + mutate_func_t &&f) = 0; + + /** + * mutate_internal_address + * + * Looks up internal node mapping at laddr, depth and + * updates the mapping to paddr. Returns previous paddr + * (for debugging purposes). + */ + using mutate_internal_address_ertr = crimson::errorator< + crimson::ct_error::enoent, ///< mapping does not exist + crimson::ct_error::input_output_error + >; + using mutate_internal_address_ret = mutate_internal_address_ertr::future< + paddr_t>; + virtual mutate_internal_address_ret mutate_internal_address( + op_context_t c, + depth_t depth, + laddr_t laddr, + paddr_t paddr) = 0; + + /** + * make_split_children + * + * Generates appropriately typed left and right nodes formed from the + * contents of *this. + * + * Returns <left, right, pivot> where pivot is the first value of right. + */ + virtual std::tuple< + LBANodeRef, + LBANodeRef, + laddr_t> + make_split_children( + op_context_t c) = 0; + + /** + * make_full_merge + * + * Returns a single node formed from merging *this and right. + * Precondition: at_min_capacity() && right.at_min_capacity() + */ + virtual LBANodeRef make_full_merge( + op_context_t c, + LBANodeRef &right) = 0; + + /** + * make_balanced + * + * Returns nodes formed by balancing the contents of *this and right. + * + * Returns <left, right, pivot> where pivot is the first value of right. + */ + virtual std::tuple< + LBANodeRef, + LBANodeRef, + laddr_t> + make_balanced( + op_context_t c, + LBANodeRef &right, + bool prefer_left) = 0; + + virtual bool at_max_capacity() const = 0; + virtual bool at_min_capacity() const = 0; + + virtual ~LBANode() = default; + + void on_delta_write(paddr_t record_block_offset) final { + // All in-memory relative addrs are necessarily record-relative + assert(get_prior_instance()); + pin.take_pin(get_prior_instance()->cast<LBANode>()->pin); + resolve_relative_addrs(record_block_offset); + } + + void on_initial_write() final { + // All in-memory relative addrs are necessarily block-relative + resolve_relative_addrs(get_paddr()); + } + + void on_clean_read() final { + // From initial write of block, relative addrs are necessarily block-relative + resolve_relative_addrs(get_paddr()); + } + + virtual void resolve_relative_addrs(paddr_t base) = 0; +}; +using LBANodeRef = LBANode::LBANodeRef; + +/** + * get_lba_btree_extent + * + * Fetches node at depth of the appropriate type. 
+ */ +Cache::get_extent_ertr::future<LBANodeRef> get_lba_btree_extent( + op_context_t c, ///< [in] context structure + depth_t depth, ///< [in] depth of node to fetch + paddr_t offset, ///< [in] physical addr of node + paddr_t base ///< [in] depending on user, block addr or record addr + /// in case offset is relative +); + +} diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.cc b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.cc new file mode 100644 index 000000000..5e400803b --- /dev/null +++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.cc @@ -0,0 +1,701 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <sys/mman.h> +#include <string.h> + +#include <memory> +#include <string.h> + +#include "include/buffer.h" +#include "include/byteorder.h" + +#include "crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + +namespace crimson::os::seastore::lba_manager::btree { + +std::ostream &LBAInternalNode::print_detail(std::ostream &out) const +{ + return out << ", size=" << get_size() + << ", meta=" << get_meta(); +} + +LBAInternalNode::lookup_ret LBAInternalNode::lookup( + op_context_t c, + laddr_t addr, + depth_t depth) +{ + auto meta = get_meta(); + if (depth == get_meta().depth) { + return lookup_ret( + lookup_ertr::ready_future_marker{}, + this); + } + assert(meta.begin <= addr); + assert(meta.end > addr); + auto iter = lower_bound(addr); + return get_lba_btree_extent( + c, + meta.depth - 1, + iter->get_val(), + get_paddr()).safe_then([c, addr, depth](auto child) { + return child->lookup(c, addr, depth); + }).finally([ref=LBANodeRef(this)] {}); +} + +LBAInternalNode::lookup_range_ret LBAInternalNode::lookup_range( + op_context_t c, + laddr_t addr, + extent_len_t len) +{ + auto [begin, end] = bound(addr, addr + len); + auto result_up = std::make_unique<lba_pin_list_t>(); + auto &result = *result_up; + return crimson::do_for_each( + std::move(begin), + std::move(end), + [this, c, &result, addr, len](const auto &val) mutable { + return get_lba_btree_extent( + c, + get_meta().depth - 1, + val.get_val(), + get_paddr()).safe_then( + [c, &result, addr, len](auto extent) mutable { + return extent->lookup_range( + c, + addr, + len).safe_then( + [&result](auto pin_list) mutable { + result.splice(result.end(), pin_list, + pin_list.begin(), pin_list.end()); + }); + }); + }).safe_then([result=std::move(result_up), ref=LBANodeRef(this)] { + return lookup_range_ertr::make_ready_future<lba_pin_list_t>( + std::move(*result)); + }); +} + +LBAInternalNode::insert_ret LBAInternalNode::insert( + op_context_t c, + laddr_t laddr, + lba_map_val_t val) +{ + auto insertion_pt = get_containing_child(laddr); + return get_lba_btree_extent( + c, + get_meta().depth - 1, + insertion_pt->get_val(), + get_paddr()).safe_then( + [this, insertion_pt, c, laddr, val=std::move(val)]( + auto extent) mutable { + return extent->at_max_capacity() ? 
+ split_entry(c, laddr, insertion_pt, extent) : + insert_ertr::make_ready_future<LBANodeRef>(std::move(extent)); + }).safe_then([c, laddr, val=std::move(val)]( + LBANodeRef extent) mutable { + return extent->insert(c, laddr, val); + }); +} + +LBAInternalNode::mutate_mapping_ret LBAInternalNode::mutate_mapping( + op_context_t c, + laddr_t laddr, + mutate_func_t &&f) +{ + return mutate_mapping_internal(c, laddr, true, std::move(f)); +} + +LBAInternalNode::mutate_mapping_ret LBAInternalNode::mutate_mapping_internal( + op_context_t c, + laddr_t laddr, + bool is_root, + mutate_func_t &&f) +{ + auto mutation_pt = get_containing_child(laddr); + if (mutation_pt == end()) { + assert(0 == "impossible"); + return crimson::ct_error::enoent::make(); + } + return get_lba_btree_extent( + c, + get_meta().depth - 1, + mutation_pt->get_val(), + get_paddr() + ).safe_then([=](LBANodeRef extent) { + if (extent->at_min_capacity() && get_size() > 1) { + return merge_entry( + c, + laddr, + mutation_pt, + extent, + is_root); + } else { + return merge_ertr::make_ready_future<LBANodeRef>( + std::move(extent)); + } + }).safe_then([c, laddr, f=std::move(f)](LBANodeRef extent) mutable { + return extent->mutate_mapping_internal(c, laddr, false, std::move(f)); + }); +} + +LBAInternalNode::mutate_internal_address_ret LBAInternalNode::mutate_internal_address( + op_context_t c, + depth_t depth, + laddr_t laddr, + paddr_t paddr) +{ + if (get_meta().depth == (depth + 1)) { + if (!is_pending()) { + return c.cache.duplicate_for_write(c.trans, this)->cast<LBAInternalNode>( + )->mutate_internal_address( + c, + depth, + laddr, + paddr); + } + auto iter = get_containing_child(laddr); + if (iter->get_key() != laddr) { + return crimson::ct_error::enoent::make(); + } + + auto old_paddr = iter->get_val(); + + journal_update( + iter, + maybe_generate_relative(paddr), + maybe_get_delta_buffer()); + + return mutate_internal_address_ret( + mutate_internal_address_ertr::ready_future_marker{}, + old_paddr + ); + } else { + auto iter = get_containing_child(laddr); + return get_lba_btree_extent( + c, + get_meta().depth - 1, + iter->get_val(), + get_paddr() + ).safe_then([=](auto node) { + return node->mutate_internal_address( + c, + depth, + laddr, + paddr); + }); + } +} + +LBAInternalNode::find_hole_ret LBAInternalNode::find_hole( + op_context_t c, + laddr_t min_addr, + laddr_t max_addr, + extent_len_t len) +{ + logger().debug( + "LBAInternalNode::find_hole min={}, max={}, len={}, *this={}", + min_addr, max_addr, len, *this); + auto [begin, end] = bound(min_addr, max_addr); + return seastar::repeat_until_value( + [i=begin, e=end, c, min_addr, len, this]() mutable { + if (i == e) { + return seastar::make_ready_future<std::optional<laddr_t>>( + std::make_optional<laddr_t>(L_ADDR_NULL)); + } + return get_lba_btree_extent(c, + get_meta().depth - 1, + i->get_val(), + get_paddr()).safe_then( + [c, min_addr, len, i](auto extent) mutable { + auto lb = std::max(min_addr, i->get_key()); + auto ub = i->get_next_key_or_max(); + logger().debug("LBAInternalNode::find_hole extent {} lb {} ub {}", + *extent, lb, ub); + return extent->find_hole(c, lb, ub, len); + }).safe_then([&i](auto addr) mutable -> std::optional<laddr_t> { + if (addr == L_ADDR_NULL) { + ++i; + return {}; + } else { + return addr; + } + }, + // TODO: GCC enters a dead loop if crimson::do_until() is used + // or erroratorized future is returned + crimson::ct_error::assert_all{ "fix me - APIv6" }); + }); +} + +LBAInternalNode::scan_mappings_ret LBAInternalNode::scan_mappings( + op_context_t 
c, + laddr_t begin, + laddr_t end, + scan_mappings_func_t &f) +{ + auto [biter, eiter] = bound(begin, end); + return crimson::do_for_each( + std::move(biter), + std::move(eiter), + [=, &f](auto &viter) { + return get_lba_btree_extent( + c, + get_meta().depth - 1, + viter->get_val(), + get_paddr()).safe_then([=, &f](auto child) { + return child->scan_mappings(c, begin, end, f); + }); + }).safe_then([ref=LBANodeRef(this)]{}); +} + +LBAInternalNode::scan_mapped_space_ret LBAInternalNode::scan_mapped_space( + op_context_t c, + scan_mapped_space_func_t &f) +{ + f(get_paddr(), get_length()); + return crimson::do_for_each( + begin(), end(), + [=, &f](auto &viter) { + return get_lba_btree_extent( + c, + get_meta().depth - 1, + viter->get_val(), + get_paddr()).safe_then([=, &f](auto child) { + return child->scan_mapped_space(c, f); + }); + }).safe_then([ref=LBANodeRef(this)]{}); +} + + +void LBAInternalNode::resolve_relative_addrs(paddr_t base) +{ + for (auto i: *this) { + if (i->get_val().is_relative()) { + auto updated = base.add_relative(i->get_val()); + logger().debug( + "LBAInternalNode::resolve_relative_addrs {} -> {}", + i->get_val(), + updated); + i->set_val(updated); + } + } +} + + +LBAInternalNode::split_ret +LBAInternalNode::split_entry( + op_context_t c, + laddr_t addr, + internal_iterator_t iter, LBANodeRef entry) +{ + if (!is_pending()) { + auto mut = c.cache.duplicate_for_write( + c.trans, this)->cast<LBAInternalNode>(); + auto mut_iter = mut->iter_idx(iter->get_offset()); + return mut->split_entry(c, addr, mut_iter, entry); + } + + ceph_assert(!at_max_capacity()); + auto [left, right, pivot] = entry->make_split_children(c); + + journal_update( + iter, + maybe_generate_relative(left->get_paddr()), + maybe_get_delta_buffer()); + journal_insert( + iter + 1, + pivot, + maybe_generate_relative(right->get_paddr()), + maybe_get_delta_buffer()); + + c.cache.retire_extent(c.trans, entry); + + logger().debug( + "LBAInternalNode::split_entry *this {} entry {} into left {} right {}", + *this, + *entry, + *left, + *right); + + return split_ertr::make_ready_future<LBANodeRef>( + pivot > addr ? left : right + ); +} + +LBAInternalNode::merge_ret +LBAInternalNode::merge_entry( + op_context_t c, + laddr_t addr, + internal_iterator_t iter, + LBANodeRef entry, + bool is_root) +{ + if (!is_pending()) { + auto mut = c.cache.duplicate_for_write(c.trans, this)->cast<LBAInternalNode>(); + auto mut_iter = mut->iter_idx(iter->get_offset()); + return mut->merge_entry(c, addr, mut_iter, entry, is_root); + } + + logger().debug( + "LBAInternalNode: merge_entry: {}, {}", + *this, + *entry); + auto donor_is_left = (iter + 1) == end(); + auto donor_iter = donor_is_left ? iter - 1 : iter + 1; + return get_lba_btree_extent( + c, + get_meta().depth - 1, + donor_iter->get_val(), + get_paddr() + ).safe_then([=](auto donor) mutable { + auto [l, r] = donor_is_left ? + std::make_pair(donor, entry) : std::make_pair(entry, donor); + auto [liter, riter] = donor_is_left ? 
+ std::make_pair(donor_iter, iter) : std::make_pair(iter, donor_iter); + if (donor->at_min_capacity()) { + auto replacement = l->make_full_merge( + c, + r); + + journal_update( + liter, + maybe_generate_relative(replacement->get_paddr()), + maybe_get_delta_buffer()); + journal_remove(riter, maybe_get_delta_buffer()); + + c.cache.retire_extent(c.trans, l); + c.cache.retire_extent(c.trans, r); + + if (is_root && get_size() == 1) { + return c.cache.get_root(c.trans).safe_then([=](RootBlockRef croot) { + { + auto mut_croot = c.cache.duplicate_for_write(c.trans, croot); + croot = mut_croot->cast<RootBlock>(); + } + croot->root.lba_root_addr = begin()->get_val(); + logger().debug( + "LBAInternalNode::merge_entry: collapsing root {} to addr {}", + *this, + begin()->get_val()); + croot->root.lba_depth = get_meta().depth - 1; + c.cache.retire_extent(c.trans, this); + return merge_ertr::make_ready_future<LBANodeRef>(replacement); + }); + } else { + return merge_ertr::make_ready_future<LBANodeRef>(replacement); + } + } else { + logger().debug( + "LBAInternalEntry::merge_entry balanced l {} r {}", + *l, + *r); + auto [replacement_l, replacement_r, pivot] = + l->make_balanced( + c, + r, + !donor_is_left); + + journal_update( + liter, + maybe_generate_relative(replacement_l->get_paddr()), + maybe_get_delta_buffer()); + journal_replace( + riter, + pivot, + maybe_generate_relative(replacement_r->get_paddr()), + maybe_get_delta_buffer()); + + c.cache.retire_extent(c.trans, l); + c.cache.retire_extent(c.trans, r); + return merge_ertr::make_ready_future<LBANodeRef>( + addr >= pivot ? replacement_r : replacement_l + ); + } + }); +} + + +LBAInternalNode::internal_iterator_t +LBAInternalNode::get_containing_child(laddr_t laddr) +{ + // TODO: binary search + for (auto i = begin(); i != end(); ++i) { + if (i.contains(laddr)) + return i; + } + ceph_assert(0 == "invalid"); + return end(); +} + +std::ostream &LBALeafNode::print_detail(std::ostream &out) const +{ + return out << ", size=" << get_size() + << ", meta=" << get_meta(); +} + +LBALeafNode::lookup_range_ret LBALeafNode::lookup_range( + op_context_t c, + laddr_t addr, + extent_len_t len) +{ + logger().debug( + "LBALeafNode::lookup_range {}~{}", + addr, + len); + auto ret = lba_pin_list_t(); + auto [i, end] = get_leaf_entries(addr, len); + for (; i != end; ++i) { + auto val = i->get_val(); + auto begin = i->get_key(); + ret.emplace_back( + std::make_unique<BtreeLBAPin>( + this, + val.paddr.maybe_relative_to(get_paddr()), + lba_node_meta_t{ begin, begin + val.len, 0})); + } + return lookup_range_ertr::make_ready_future<lba_pin_list_t>( + std::move(ret)); +} + +LBALeafNode::insert_ret LBALeafNode::insert( + op_context_t c, + laddr_t laddr, + lba_map_val_t val) +{ + ceph_assert(!at_max_capacity()); + + if (!is_pending()) { + return c.cache.duplicate_for_write(c.trans, this + )->cast<LBALeafNode>()->insert(c, laddr, val); + } + + val.paddr = maybe_generate_relative(val.paddr); + logger().debug( + "LBALeafNode::insert: inserting {}~{} -> {}", + laddr, + val.len, + val.paddr); + + auto insert_pt = lower_bound(laddr); + journal_insert(insert_pt, laddr, val, maybe_get_delta_buffer()); + + logger().debug( + "LBALeafNode::insert: inserted {}~{} -> {}", + insert_pt.get_key(), + insert_pt.get_val().len, + insert_pt.get_val().paddr); + auto begin = insert_pt.get_key(); + return insert_ret( + insert_ertr::ready_future_marker{}, + std::make_unique<BtreeLBAPin>( + this, + val.paddr.maybe_relative_to(get_paddr()), + lba_node_meta_t{ begin, begin + val.len, 0})); +} + 
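+// Editor's illustrative sketch (not part of this change): how a caller that
+// already holds an op_context_t might drive the leaf-level insert above and
+// consume the returned pin. The names `ctx`, `leaf`, `laddr` and `paddr`,
+// and the 4096-byte length, are assumptions made only for this example.
+//
+//   lba_map_val_t val{4096, paddr, 1 /* refcount */, 0 /* checksum (TODO) */};
+//   leaf->insert(ctx, laddr, val).safe_then([laddr](LBAPinRef pin) {
+//     // the pin maps [laddr, laddr + 4096) onto the physical address
+//     ceph_assert(pin->get_laddr() == laddr);
+//     ceph_assert(pin->get_length() == 4096);
+//   });
+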
+LBALeafNode::mutate_mapping_ret LBALeafNode::mutate_mapping( + op_context_t c, + laddr_t laddr, + mutate_func_t &&f) +{ + return mutate_mapping_internal(c, laddr, true, std::move(f)); +} + +LBALeafNode::mutate_mapping_ret LBALeafNode::mutate_mapping_internal( + op_context_t c, + laddr_t laddr, + bool is_root, + mutate_func_t &&f) +{ + auto mutation_pt = find(laddr); + if (mutation_pt == end()) { + return crimson::ct_error::enoent::make(); + } + + if (!is_pending()) { + return c.cache.duplicate_for_write(c.trans, this)->cast<LBALeafNode>( + )->mutate_mapping_internal( + c, + laddr, + is_root, + std::move(f)); + } + + auto cur = mutation_pt.get_val(); + auto mutated = f(cur); + + mutated.paddr = maybe_generate_relative(mutated.paddr); + + logger().debug( + "{}: mutate addr {}: {} -> {}", + __func__, + laddr, + cur.paddr, + mutated.paddr); + + if (mutated.refcount > 0) { + journal_update(mutation_pt, mutated, maybe_get_delta_buffer()); + return mutate_mapping_ret( + mutate_mapping_ertr::ready_future_marker{}, + mutated); + } else { + journal_remove(mutation_pt, maybe_get_delta_buffer()); + return mutate_mapping_ret( + mutate_mapping_ertr::ready_future_marker{}, + mutated); + } +} + +LBALeafNode::mutate_internal_address_ret LBALeafNode::mutate_internal_address( + op_context_t c, + depth_t depth, + laddr_t laddr, + paddr_t paddr) +{ + ceph_assert(0 == "Impossible"); + return mutate_internal_address_ret( + mutate_internal_address_ertr::ready_future_marker{}, + paddr); +} + +LBALeafNode::find_hole_ret LBALeafNode::find_hole( + op_context_t c, + laddr_t min, + laddr_t max, + extent_len_t len) +{ + logger().debug( + "LBALeafNode::find_hole min={} max={}, len={}, *this={}", + min, max, len, *this); + auto [liter, uiter] = bound(min, max); + for (auto i = liter; i != uiter; ++i) { + auto ub = i->get_key(); + if (min + len <= ub) { + return find_hole_ret( + find_hole_ertr::ready_future_marker{}, + min); + } else { + min = i->get_key() + i->get_val().len; + } + } + if (min + len <= max) { + return find_hole_ret( + find_hole_ertr::ready_future_marker{}, + min); + } else { + return find_hole_ret( + find_hole_ertr::ready_future_marker{}, + L_ADDR_MAX); + } +} + +LBALeafNode::scan_mappings_ret LBALeafNode::scan_mappings( + op_context_t c, + laddr_t begin, + laddr_t end, + scan_mappings_func_t &f) +{ + auto [biter, eiter] = bound(begin, end); + for (auto i = biter; i != eiter; ++i) { + auto val = i->get_val(); + f(i->get_key(), val.paddr, val.len); + } + return scan_mappings_ertr::now(); +} + +LBALeafNode::scan_mapped_space_ret LBALeafNode::scan_mapped_space( + op_context_t c, + scan_mapped_space_func_t &f) +{ + f(get_paddr(), get_length()); + for (auto i = begin(); i != end(); ++i) { + auto val = i->get_val(); + f(val.paddr, val.len); + } + return scan_mappings_ertr::now(); +} + + +void LBALeafNode::resolve_relative_addrs(paddr_t base) +{ + for (auto i: *this) { + if (i->get_val().paddr.is_relative()) { + auto val = i->get_val(); + val.paddr = base.add_relative(val.paddr); + logger().debug( + "LBALeafNode::resolve_relative_addrs {} -> {}", + i->get_val().paddr, + val.paddr); + i->set_val(val); + } + } +} + +std::pair<LBALeafNode::internal_iterator_t, LBALeafNode::internal_iterator_t> +LBALeafNode::get_leaf_entries(laddr_t addr, extent_len_t len) +{ + return bound(addr, addr + len); +} + +Cache::get_extent_ertr::future<LBANodeRef> get_lba_btree_extent( + op_context_t c, + depth_t depth, + paddr_t offset, + paddr_t base) +{ + offset = offset.maybe_relative_to(base); + ceph_assert(depth > 0); + if (depth > 
1) { + logger().debug( + "get_lba_btree_extent: reading internal at offset {}, depth {}", + offset, + depth); + return c.cache.get_extent<LBAInternalNode>( + c.trans, + offset, + LBA_BLOCK_SIZE).safe_then([c](auto ret) { + auto meta = ret->get_meta(); + if (ret->get_size()) { + ceph_assert(meta.begin <= ret->begin()->get_key()); + ceph_assert(meta.end > (ret->end() - 1)->get_key()); + } + if (!ret->is_pending() && !ret->pin.is_linked()) { + ret->pin.set_range(meta); + c.pins.add_pin(ret->pin); + } + return LBANodeRef(ret.detach(), /* add_ref = */ false); + }); + } else { + logger().debug( + "get_lba_btree_extent: reading leaf at offset {}, depth {}", + offset, + depth); + return c.cache.get_extent<LBALeafNode>( + c.trans, + offset, + LBA_BLOCK_SIZE).safe_then([offset, c](auto ret) { + logger().debug( + "get_lba_btree_extent: read leaf at offset {} {}", + offset, + *ret); + auto meta = ret->get_meta(); + if (ret->get_size()) { + ceph_assert(meta.begin <= ret->begin()->get_key()); + ceph_assert(meta.end > (ret->end() - 1)->get_key()); + } + if (!ret->is_pending() && !ret->pin.is_linked()) { + ret->pin.set_range(meta); + c.pins.add_pin(ret->pin); + } + return LBANodeRef(ret.detach(), /* add_ref = */ false); + }); + } +} + +} diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h new file mode 100644 index 000000000..230eef682 --- /dev/null +++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h @@ -0,0 +1,555 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <sys/mman.h> +#include <string.h> + +#include <memory> +#include <string.h> + +#include "include/buffer.h" + +#include "crimson/common/fixed_kv_node_layout.h" +#include "crimson/common/errorator.h" +#include "crimson/os/seastore/lba_manager.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/cache.h" +#include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h" +#include "crimson/os/seastore/lba_manager/btree/btree_range_pin.h" + +namespace crimson::os::seastore::lba_manager::btree { + +constexpr size_t LBA_BLOCK_SIZE = 4096; + +/** + * lba_node_meta_le_t + * + * On disk layout for lba_node_meta_t + */ +struct lba_node_meta_le_t { + laddr_le_t begin = laddr_le_t(0); + laddr_le_t end = laddr_le_t(0); + depth_le_t depth = init_les32(0); + + lba_node_meta_le_t() = default; + lba_node_meta_le_t(const lba_node_meta_le_t &) = default; + explicit lba_node_meta_le_t(const lba_node_meta_t &val) + : begin(init_le64(val.begin)), + end(init_le64(val.end)), + depth(init_les32(val.depth)) {} + + operator lba_node_meta_t() const { + return lba_node_meta_t{ begin, end, depth }; + } +}; + + +/** + * LBAInternalNode + * + * Abstracts operations on and layout of internal nodes for the + * LBA Tree. 
+ * + * Layout (4k): + * size : uint32_t[1] 4b + * (padding) : 4b + * meta : lba_node_meta_le_t[3] (1*24)b + * keys : laddr_t[255] (254*8)b + * values : paddr_t[255] (254*8)b + * = 4096 + + * TODO: make the above capacity calculation part of FixedKVNodeLayout + * TODO: the above alignment probably isn't portable without further work + */ +constexpr size_t INTERNAL_NODE_CAPACITY = 254; +struct LBAInternalNode + : LBANode, + common::FixedKVNodeLayout< + INTERNAL_NODE_CAPACITY, + lba_node_meta_t, lba_node_meta_le_t, + laddr_t, laddr_le_t, + paddr_t, paddr_le_t> { + using internal_iterator_t = const_iterator; + template <typename... T> + LBAInternalNode(T&&... t) : + LBANode(std::forward<T>(t)...), + FixedKVNodeLayout(get_bptr().c_str()) {} + + static constexpr extent_types_t type = extent_types_t::LADDR_INTERNAL; + + lba_node_meta_t get_node_meta() const final { return get_meta(); } + + CachedExtentRef duplicate_for_write() final { + assert(delta_buffer.empty()); + return CachedExtentRef(new LBAInternalNode(*this)); + }; + + delta_buffer_t delta_buffer; + delta_buffer_t *maybe_get_delta_buffer() { + return is_mutation_pending() ? &delta_buffer : nullptr; + } + + lookup_ret lookup(op_context_t c, laddr_t addr, depth_t depth) final; + + lookup_range_ret lookup_range( + op_context_t c, + laddr_t addr, + extent_len_t len) final; + + insert_ret insert( + op_context_t c, + laddr_t laddr, + lba_map_val_t val) final; + + mutate_mapping_ret mutate_mapping( + op_context_t c, + laddr_t laddr, + mutate_func_t &&f) final; + mutate_mapping_ret mutate_mapping_internal( + op_context_t c, + laddr_t laddr, + bool is_root, + mutate_func_t &&f) final; + + mutate_internal_address_ret mutate_internal_address( + op_context_t c, + depth_t depth, + laddr_t laddr, + paddr_t paddr) final; + + find_hole_ret find_hole( + op_context_t c, + laddr_t min, + laddr_t max, + extent_len_t len) final; + + scan_mappings_ret scan_mappings( + op_context_t c, + laddr_t begin, + laddr_t end, + scan_mappings_func_t &f) final; + + scan_mapped_space_ret scan_mapped_space( + op_context_t c, + scan_mapped_space_func_t &f) final; + + std::tuple<LBANodeRef, LBANodeRef, laddr_t> + make_split_children(op_context_t c) final { + auto left = c.cache.alloc_new_extent<LBAInternalNode>( + c.trans, LBA_BLOCK_SIZE); + auto right = c.cache.alloc_new_extent<LBAInternalNode>( + c.trans, LBA_BLOCK_SIZE); + auto pivot = split_into(*left, *right); + left->pin.set_range(left->get_meta()); + right->pin.set_range(right->get_meta()); + return std::make_tuple( + left, + right, + pivot); + } + + LBANodeRef make_full_merge( + op_context_t c, + LBANodeRef &right) final { + auto replacement = c.cache.alloc_new_extent<LBAInternalNode>( + c.trans, LBA_BLOCK_SIZE); + replacement->merge_from(*this, *right->cast<LBAInternalNode>()); + replacement->pin.set_range(replacement->get_meta()); + return replacement; + } + + std::tuple<LBANodeRef, LBANodeRef, laddr_t> + make_balanced( + op_context_t c, + LBANodeRef &_right, + bool prefer_left) final { + ceph_assert(_right->get_type() == type); + auto &right = *_right->cast<LBAInternalNode>(); + auto replacement_left = c.cache.alloc_new_extent<LBAInternalNode>( + c.trans, LBA_BLOCK_SIZE); + auto replacement_right = c.cache.alloc_new_extent<LBAInternalNode>( + c.trans, LBA_BLOCK_SIZE); + + auto pivot = balance_into_new_nodes( + *this, + right, + prefer_left, + *replacement_left, + *replacement_right); + + replacement_left->pin.set_range(replacement_left->get_meta()); + 
replacement_right->pin.set_range(replacement_right->get_meta()); + return std::make_tuple( + replacement_left, + replacement_right, + pivot); + } + + /** + * Internal relative addresses on read or in memory prior to commit + * are either record or block relative depending on whether this + * physical node is is_initial_pending() or just is_pending(). + * + * User passes appropriate base depending on lifecycle and + * resolve_relative_addrs fixes up relative internal references + * based on base. + */ + void resolve_relative_addrs(paddr_t base) final; + void node_resolve_vals(iterator from, iterator to) const final { + if (is_initial_pending()) { + for (auto i = from; i != to; ++i) { + if (i->get_val().is_relative()) { + assert(i->get_val().is_block_relative()); + i->set_val(get_paddr().add_relative(i->get_val())); + } + } + } + } + void node_unresolve_vals(iterator from, iterator to) const final { + if (is_initial_pending()) { + for (auto i = from; i != to; ++i) { + if (i->get_val().is_relative()) { + assert(i->get_val().is_record_relative()); + i->set_val(i->get_val() - get_paddr()); + } + } + } + } + + extent_types_t get_type() const final { + return type; + } + + std::ostream &print_detail(std::ostream &out) const final; + + ceph::bufferlist get_delta() final { + assert(!delta_buffer.empty()); + ceph::buffer::ptr bptr(delta_buffer.get_bytes()); + delta_buffer.copy_out(bptr.c_str(), bptr.length()); + ceph::bufferlist bl; + bl.push_back(bptr); + return bl; + } + + void apply_delta_and_adjust_crc( + paddr_t base, const ceph::bufferlist &_bl) final { + assert(_bl.length()); + ceph::bufferlist bl = _bl; + bl.rebuild(); + delta_buffer_t buffer; + buffer.copy_in(bl.front().c_str(), bl.front().length()); + buffer.replay(*this); + set_last_committed_crc(get_crc32c()); + resolve_relative_addrs(base); + } + + bool at_max_capacity() const final { + return get_size() == get_capacity(); + } + + bool at_min_capacity() const { + return get_size() == (get_capacity() / 2); + } + + /// returns iterators containing [l, r) + std::pair<internal_iterator_t, internal_iterator_t> bound( + laddr_t l, laddr_t r) { + // TODO: inefficient + auto retl = begin(); + for (; retl != end(); ++retl) { + if (retl->get_next_key_or_max() > l) + break; + } + auto retr = retl; + for (; retr != end(); ++retr) { + if (retr->get_key() >= r) + break; + } + return std::make_pair(retl, retr); + } + + using split_ertr = crimson::errorator< + crimson::ct_error::input_output_error + >; + using split_ret = split_ertr::future<LBANodeRef>; + split_ret split_entry( + op_context_t c, + laddr_t addr, + internal_iterator_t, + LBANodeRef entry); + + using merge_ertr = crimson::errorator< + crimson::ct_error::input_output_error + >; + using merge_ret = merge_ertr::future<LBANodeRef>; + merge_ret merge_entry( + op_context_t c, + laddr_t addr, + internal_iterator_t, + LBANodeRef entry, + bool is_root); + + /// returns iterator for subtree containing laddr + internal_iterator_t get_containing_child(laddr_t laddr); +}; + +/** + * LBALeafNode + * + * Abstracts operations on and layout of leaf nodes for the + * LBA Tree. 
+ * + * Layout (4k): + * size : uint32_t[1] 4b + * (padding) : 4b + * meta : lba_node_meta_le_t[3] (1*24)b + * keys : laddr_t[170] (145*8)b + * values : lba_map_val_t[170] (145*20)b + * = 4092 + * + * TODO: update FixedKVNodeLayout to handle the above calculation + * TODO: the above alignment probably isn't portable without further work + */ +constexpr size_t LEAF_NODE_CAPACITY = 145; + +/** + * lba_map_val_le_t + * + * On disk layout for lba_map_val_t. + */ +struct lba_map_val_le_t { + extent_len_le_t len = init_extent_len_le_t(0); + paddr_le_t paddr; + ceph_le32 refcount = init_le32(0); + ceph_le32 checksum = init_le32(0); + + lba_map_val_le_t() = default; + lba_map_val_le_t(const lba_map_val_le_t &) = default; + explicit lba_map_val_le_t(const lba_map_val_t &val) + : len(init_extent_len_le_t(val.len)), + paddr(paddr_le_t(val.paddr)), + refcount(init_le32(val.refcount)), + checksum(init_le32(val.checksum)) {} + + operator lba_map_val_t() const { + return lba_map_val_t{ len, paddr, refcount, checksum }; + } +}; + +struct LBALeafNode + : LBANode, + common::FixedKVNodeLayout< + LEAF_NODE_CAPACITY, + lba_node_meta_t, lba_node_meta_le_t, + laddr_t, laddr_le_t, + lba_map_val_t, lba_map_val_le_t> { + using internal_iterator_t = const_iterator; + template <typename... T> + LBALeafNode(T&&... t) : + LBANode(std::forward<T>(t)...), + FixedKVNodeLayout(get_bptr().c_str()) {} + + static constexpr extent_types_t type = extent_types_t::LADDR_LEAF; + + lba_node_meta_t get_node_meta() const final { return get_meta(); } + + CachedExtentRef duplicate_for_write() final { + assert(delta_buffer.empty()); + return CachedExtentRef(new LBALeafNode(*this)); + }; + + delta_buffer_t delta_buffer; + delta_buffer_t *maybe_get_delta_buffer() { + return is_mutation_pending() ? 
&delta_buffer : nullptr; + } + + lookup_ret lookup(op_context_t c, laddr_t addr, depth_t depth) final + { + return lookup_ret( + lookup_ertr::ready_future_marker{}, + this); + } + + lookup_range_ret lookup_range( + op_context_t c, + laddr_t addr, + extent_len_t len) final; + + insert_ret insert( + op_context_t c, + laddr_t laddr, + lba_map_val_t val) final; + + mutate_mapping_ret mutate_mapping( + op_context_t c, + laddr_t laddr, + mutate_func_t &&f) final; + mutate_mapping_ret mutate_mapping_internal( + op_context_t c, + laddr_t laddr, + bool is_root, + mutate_func_t &&f) final; + + mutate_internal_address_ret mutate_internal_address( + op_context_t c, + depth_t depth, + laddr_t laddr, + paddr_t paddr) final; + + find_hole_ret find_hole( + op_context_t c, + laddr_t min, + laddr_t max, + extent_len_t len) final; + + scan_mappings_ret scan_mappings( + op_context_t c, + laddr_t begin, + laddr_t end, + scan_mappings_func_t &f) final; + + scan_mapped_space_ret scan_mapped_space( + op_context_t c, + scan_mapped_space_func_t &f) final; + + std::tuple<LBANodeRef, LBANodeRef, laddr_t> + make_split_children(op_context_t c) final { + auto left = c.cache.alloc_new_extent<LBALeafNode>( + c.trans, LBA_BLOCK_SIZE); + auto right = c.cache.alloc_new_extent<LBALeafNode>( + c.trans, LBA_BLOCK_SIZE); + auto pivot = split_into(*left, *right); + left->pin.set_range(left->get_meta()); + right->pin.set_range(right->get_meta()); + return std::make_tuple( + left, + right, + pivot); + } + + LBANodeRef make_full_merge( + op_context_t c, + LBANodeRef &right) final { + auto replacement = c.cache.alloc_new_extent<LBALeafNode>( + c.trans, LBA_BLOCK_SIZE); + replacement->merge_from(*this, *right->cast<LBALeafNode>()); + replacement->pin.set_range(replacement->get_meta()); + return replacement; + } + + std::tuple<LBANodeRef, LBANodeRef, laddr_t> + make_balanced( + op_context_t c, + LBANodeRef &_right, + bool prefer_left) final { + ceph_assert(_right->get_type() == type); + auto &right = *_right->cast<LBALeafNode>(); + auto replacement_left = c.cache.alloc_new_extent<LBALeafNode>( + c.trans, LBA_BLOCK_SIZE); + auto replacement_right = c.cache.alloc_new_extent<LBALeafNode>( + c.trans, LBA_BLOCK_SIZE); + + auto pivot = balance_into_new_nodes( + *this, + right, + prefer_left, + *replacement_left, + *replacement_right); + + replacement_left->pin.set_range(replacement_left->get_meta()); + replacement_right->pin.set_range(replacement_right->get_meta()); + return std::make_tuple( + replacement_left, + replacement_right, + pivot); + } + + // See LBAInternalNode, same concept + void resolve_relative_addrs(paddr_t base) final; + void node_resolve_vals(iterator from, iterator to) const final { + if (is_initial_pending()) { + for (auto i = from; i != to; ++i) { + auto val = i->get_val(); + if (val.paddr.is_relative()) { + assert(val.paddr.is_block_relative()); + val.paddr = get_paddr().add_relative(val.paddr); + i->set_val(val); + } + } + } + } + void node_unresolve_vals(iterator from, iterator to) const final { + if (is_initial_pending()) { + for (auto i = from; i != to; ++i) { + auto val = i->get_val(); + if (val.paddr.is_relative()) { + auto val = i->get_val(); + assert(val.paddr.is_record_relative()); + val.paddr = val.paddr - get_paddr(); + i->set_val(val); + } + } + } + } + + ceph::bufferlist get_delta() final { + assert(!delta_buffer.empty()); + ceph::buffer::ptr bptr(delta_buffer.get_bytes()); + delta_buffer.copy_out(bptr.c_str(), bptr.length()); + ceph::bufferlist bl; + bl.push_back(bptr); + return bl; + } + + void 
apply_delta_and_adjust_crc( + paddr_t base, const ceph::bufferlist &_bl) final { + assert(_bl.length()); + ceph::bufferlist bl = _bl; + bl.rebuild(); + delta_buffer_t buffer; + buffer.copy_in(bl.front().c_str(), bl.front().length()); + buffer.replay(*this); + set_last_committed_crc(get_crc32c()); + resolve_relative_addrs(base); + } + + extent_types_t get_type() const final { + return type; + } + + std::ostream &print_detail(std::ostream &out) const final; + + bool at_max_capacity() const final { + return get_size() == get_capacity(); + } + + bool at_min_capacity() const final { + return get_size() == (get_capacity() / 2); + } + + /// returns iterators <lb, ub> containing addresses [l, r) + std::pair<internal_iterator_t, internal_iterator_t> bound( + laddr_t l, laddr_t r) { + // TODO: inefficient + auto retl = begin(); + for (; retl != end(); ++retl) { + if (retl->get_key() >= l || (retl->get_key() + retl->get_val().len) > l) + break; + } + auto retr = retl; + for (; retr != end(); ++retr) { + if (retr->get_key() >= r) + break; + } + return std::make_pair(retl, retr); + } + + std::pair<internal_iterator_t, internal_iterator_t> + get_leaf_entries(laddr_t addr, extent_len_t len); +}; +using LBALeafNodeRef = TCachedExtentRef<LBALeafNode>; + +} diff --git a/src/crimson/os/seastore/onode.cc b/src/crimson/os/seastore/onode.cc new file mode 100644 index 000000000..a8b925b70 --- /dev/null +++ b/src/crimson/os/seastore/onode.cc @@ -0,0 +1,44 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "onode.h" +#include "include/encoding.h" + +namespace crimson::os::seastore { + +size_t Onode::size() const +{ + return ceph::encoded_sizeof(*this); +} + +void Onode::encode(void* buffer, size_t len) +{ + struct [[gnu::packed]] encoded_t { + uint8_t struct_v; + uint8_t struct_compat; + uint32_t struct_len; + uint32_t len; + char data[]; + }; + auto p = reinterpret_cast<encoded_t*>(buffer); + assert(std::numeric_limits<uint16_t>::max() >= size()); + assert(len >= size()); + p->struct_v = 1; + p->struct_compat = 1; + p->struct_len = sizeof(encoded_t) + payload.size(); + p->len = payload.size(); + std::memcpy(p->data, payload.data(), payload.size()); +} + +bool operator==(const Onode& lhs, const Onode& rhs) +{ + return lhs.get() == rhs.get(); +} + +std::ostream& operator<<(std::ostream &out, const Onode &rhs) +{ + return out << rhs.get(); +} + +} + diff --git a/src/crimson/os/seastore/onode.h b/src/crimson/os/seastore/onode.h new file mode 100644 index 000000000..4d7783028 --- /dev/null +++ b/src/crimson/os/seastore/onode.h @@ -0,0 +1,48 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> +#include <limits> + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> + +#include "include/buffer.h" +#include "include/denc.h" + +namespace crimson::os::seastore { + +// in-memory onode, in addition to the stuff that should be persisted to disk, +// it may contain intrusive hooks for LRU, rw locks etc +class Onode : public boost::intrusive_ref_counter< + Onode, + boost::thread_unsafe_counter> +{ +public: + Onode(std::string_view s) + : payload{s} + {} + size_t size() const; + const std::string& get() const { + return payload; + } + void encode(void* buffer, size_t len); + DENC(Onode, v, p) { + DENC_START(1, 1, p); + denc(v.payload, p); + DENC_FINISH(p); + } + +private: + // dummy payload + std::string payload; +}; + +bool operator==(const 
Onode& lhs, const Onode& rhs); +std::ostream& operator<<(std::ostream &out, const Onode &rhs); +using OnodeRef = boost::intrusive_ptr<Onode>; +} + +WRITE_CLASS_DENC(crimson::os::seastore::Onode) diff --git a/src/crimson/os/seastore/onode_manager.h b/src/crimson/os/seastore/onode_manager.h new file mode 100644 index 000000000..0a03b7fdf --- /dev/null +++ b/src/crimson/os/seastore/onode_manager.h @@ -0,0 +1,57 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> +#include <seastar/core/future.hh> + +#include "include/buffer_fwd.h" +#include "include/ceph_assert.h" +#include "common/hobject.h" + +#include "crimson/os/seastore/onode.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/transaction_manager.h" +#include "crimson/osd/exceptions.h" + +namespace crimson::os::seastore { + +class OnodeManager { +public: + using open_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + virtual open_ertr::future<OnodeRef> get_or_create_onode( + Transaction &trans, + const ghobject_t &hoid) { + return open_ertr::make_ready_future<OnodeRef>(); + } + virtual open_ertr::future<std::vector<OnodeRef>> get_or_create_onodes( + Transaction &trans, + const std::vector<ghobject_t> &hoids) { + return open_ertr::make_ready_future<std::vector<OnodeRef>>(); + } + + using write_ertr= crimson::errorator< + crimson::ct_error::input_output_error>; + virtual write_ertr::future<> write_dirty( + Transaction &trans, + const std::vector<OnodeRef> &onodes) { + return write_ertr::now(); + } + virtual ~OnodeManager() {} +}; +using OnodeManagerRef = std::unique_ptr<OnodeManager>; + +namespace onode_manager { + +OnodeManagerRef create_ephemeral() { + return OnodeManagerRef(); +} + +} + +} diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.cc b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.cc new file mode 100644 index 000000000..b05ea76a3 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.cc @@ -0,0 +1,71 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "onode_block.h" + +namespace crimson::os::seastore { + +ceph::bufferlist OnodeBlock::get_delta() +{ + bufferlist bl; + assert(deltas.size() <= std::numeric_limits<uint8_t>::max()); + uint8_t n_deltas = deltas.size(); + ceph::encode(n_deltas, bl); + for (auto& delta : deltas) { + delta->encode(bl); + } + return bl; +} + +void OnodeBlock::logical_on_delta_write() +{ + // journal submitted to disk, now update the memory + apply_pending_changes(true); +} + +void OnodeBlock::apply_delta(const ceph::bufferlist &bl) +{ + assert(deltas.empty()); + + auto p = bl.cbegin(); + uint8_t n_deltas = 0; + ceph::decode(n_deltas, p); + for (uint8_t i = 0; i < n_deltas; i++) { + delta_t delta; + delta.decode(p); + mutate(std::move(delta)); + } + apply_pending_changes(true); +} + +void OnodeBlock::mutate(delta_t&& d) +{ + if (is_initial_pending()) { + char* const p = get_bptr().c_str(); + mutate_func(p, d); + } + deltas.push_back(std::make_unique<delta_t>(std::move(d))); +} + +void OnodeBlock::apply_pending_changes(bool do_cleanup) +{ + if (!is_mutation_pending()) { + return; + } + if (share_buffer) { + // do a deep copy so i can change my own copy + get_bptr() = ceph::bufferptr{get_bptr().c_str(), + get_bptr().length()}; + share_buffer = 
false; + } + assert(mutate_func); + char* const p = get_bptr().c_str(); + for (auto& delta : deltas) { + mutate_func(p, *delta); + if (do_cleanup) { + delta.reset(); + } + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.h b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.h new file mode 100644 index 000000000..0025d9847 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.h @@ -0,0 +1,65 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <cstdint> +#include <boost/container/small_vector.hpp> + +#include "crimson/os/seastore/transaction_manager.h" +#include "onode_delta.h" + +namespace crimson::os::seastore { + +// TODO s/CachedExtent/LogicalCachedExtent/ +struct OnodeBlock final : LogicalCachedExtent { + using Ref = TCachedExtentRef<OnodeBlock>; + + template <typename... T> + OnodeBlock(T&&... t) : LogicalCachedExtent(std::forward<T>(t)...) {} + OnodeBlock(OnodeBlock&& block) = delete; + OnodeBlock(const OnodeBlock& block, CachedExtent::share_buffer_t tag) noexcept + : LogicalCachedExtent{block, tag}, + share_buffer{true} + {} + + CachedExtentRef duplicate_for_write() final { + return new OnodeBlock{*this, CachedExtent::share_buffer_t{}}; + } + + // could materialize the pending changes to the underlying buffer here, + // but since we write the change to the buffer immediately, let skip + // this for now. + void prepare_write() final {} + + // queries + static constexpr extent_types_t TYPE = extent_types_t::ONODE_BLOCK; + extent_types_t get_type() const final { + return TYPE; + } + + // have to stash all the changes before on_delta_write() is called, + // otherwise we could pollute the extent with pending mutations + // before the transaction carrying these mutations is committed to + // disk + ceph::bufferlist get_delta() final; + void logical_on_delta_write() final; + void apply_delta(const ceph::bufferlist &bl) final; + + void sync() { + apply_pending_changes(false); + } + void mutate(delta_t&& d); + using mutate_func_t = std::function<void (char*, const delta_t&)>; + void set_delta_applier(mutate_func_t&& func) { + mutate_func = std::move(func); + } +private: + // before looking at the extent, we need to make sure the content is up to date + void apply_pending_changes(bool do_cleanup); + // assuming we don't stash too many deltas to a single block + // otherwise a fullwrite op is necessary + boost::container::small_vector<std::unique_ptr<delta_t>, 2> deltas; + mutate_func_t mutate_func; + bool share_buffer = false; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.cc b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.cc new file mode 100644 index 000000000..869685d45 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.cc @@ -0,0 +1,188 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "onode_delta.h" + +delta_t::delta_t(delta_t&& delta) +{ + assert(op == op_t::nop); + op = delta.op; + n = delta.n; + oid = std::move(delta.oid); + onode = std::move(delta.onode); + keys = std::move(delta.keys); + cells = std::move(delta.cells); + delta.op = op_t::nop; +} + +delta_t& delta_t::operator=(delta_t&& delta) +{ + assert(op == op_t::nop); + op = delta.op; + n = delta.n; + oid = std::move(delta.oid); + onode = std::move(delta.onode); + keys = std::move(delta.keys); + cells = std::move(delta.cells); + 
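// leave the moved-from delta in the inert nop state so it can be safely destroyed or reused +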
delta.op = op_t::nop; + return *this; +} + +delta_t delta_t::nop() +{ + return delta_t{op_t::nop}; +} + +delta_t delta_t::insert_onode(unsigned slot, const ghobject_t& oid, OnodeRef onode) +{ + delta_t delta{op_t::insert_onode}; + delta.n = slot; + delta.oid = oid; + delta.onode = onode; + return delta; +} + +delta_t delta_t::update_onode(unsigned slot, const ghobject_t& oid, OnodeRef onode) +{ + delta_t delta{op_t::update_onode}; + delta.n = slot; + delta.oid = oid; + delta.onode = onode; + return delta; +} + +delta_t delta_t::insert_child(unsigned slot, + const ghobject_t& oid, + crimson::os::seastore::laddr_t addr) +{ + delta_t delta{op_t::insert_child}; + delta.n = slot; + delta.oid = oid; + delta.addr = addr; + return delta; +} + +delta_t delta_t::update_key(unsigned slot, const ghobject_t& oid) +{ + delta_t delta{op_t::update_key}; + delta.n = slot; + delta.oid = oid; + return delta; +} + +delta_t delta_t::shift_left(unsigned n) +{ + delta_t delta{op_t::shift_left}; + delta.n = n; + return delta; +} + +delta_t delta_t::trim_right(unsigned n) +{ + delta_t delta{op_t::trim_right}; + delta.n = n; + return delta; +} + +delta_t delta_t::insert_front(ceph::buffer::ptr keys, + ceph::buffer::ptr cells) +{ + delta_t delta{op_t::insert_front}; + delta.keys = std::move(keys); + delta.cells = std::move(cells); + return delta; +} + +delta_t delta_t::insert_back(ceph::buffer::ptr keys, + ceph::buffer::ptr cells) +{ + delta_t delta{op_t::insert_back}; + delta.keys = std::move(keys); + delta.cells = std::move(cells); + return delta; +} + +delta_t delta_t::remove_from(unsigned slot) +{ + delta_t delta{op_t::remove_from}; + delta.n = slot; + return delta; +} + +void delta_t::encode(ceph::bufferlist& bl) +{ + using ceph::encode; + switch (op) { + case op_t::insert_onode: + [[fallthrough]]; + case op_t::update_onode: + // the slot # is not encoded, because we can alway figure it out + // when we have to replay the delta by looking the oid up in the + // node block + encode(oid, bl); + encode(*onode, bl); + break; + case op_t::insert_child: + encode(oid, bl); + encode(addr, bl); + case op_t::update_key: + encode(n, bl); + encode(oid, bl); + break; + case op_t::shift_left: + encode(n, bl); + break; + case op_t::trim_right: + encode(n, bl); + break; + case op_t::insert_front: + [[fallthrough]]; + case op_t::insert_back: + encode(n, bl); + encode(keys, bl); + encode(cells, bl); + break; + case op_t::remove_from: + encode(n, bl); + break; + default: + assert(0 == "unknown onode op"); + } +} + +void delta_t::decode(ceph::bufferlist::const_iterator& p) { + using ceph::decode; + decode(op, p); + switch (op) { + case op_t::insert_onode: + [[fallthrough]]; + case op_t::update_onode: + decode(oid, p); + decode(*onode, p); + break; + case op_t::insert_child: + [[fallthrough]]; + case op_t::update_key: + decode(n, p); + decode(oid, p); + break; + case op_t::shift_left: + decode(n, p); + break; + case op_t::trim_right: + decode(n, p); + break; + case op_t::insert_front: + [[fallthrough]]; + case op_t::insert_back: + decode(n, p); + decode(keys, p); + decode(cells, p); + break; + case op_t::remove_from: + decode(n, p); + break; + default: + assert(0 == "unknown onode op"); + } +} diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.h b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.h new file mode 100644 index 000000000..3e7e7315e --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.h @@ -0,0 +1,70 @@ +// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <cstdint> + +#include "common/hobject.h" +#include "include/buffer_fwd.h" + +#include "crimson/os/seastore/onode.h" +#include "crimson/os/seastore/seastore_types.h" + +using crimson::os::seastore::OnodeRef; + +struct delta_t { + enum class op_t : uint8_t { + nop, + insert_onode, + update_onode, + insert_child, + update_key, + shift_left, + trim_right, + insert_front, + insert_back, + remove_from, + // finer grained op? + // - changing the embedded extent map of given oid + // - mutating the embedded xattrs of given oid + } op = op_t::nop; + + unsigned n = 0; + ghobject_t oid; + crimson::os::seastore::laddr_t addr = 0; + OnodeRef onode; + ceph::bufferptr keys; + ceph::bufferptr cells; + + delta_t() = default; + delta_t(op_t op) + : op{op} + {} + delta_t(delta_t&& delta); + delta_t& operator=(delta_t&& delta); + + static delta_t nop(); + static delta_t insert_onode(unsigned slot, const ghobject_t& oid, OnodeRef onode); + static delta_t update_onode(unsigned slot, const ghobject_t& oid, OnodeRef onode); + static delta_t insert_child(unsigned slot, const ghobject_t& oid, crimson::os::seastore::laddr_t addr); + static delta_t update_key(unsigned slot, const ghobject_t& oid); + static delta_t shift_left(unsigned n); + static delta_t trim_right(unsigned n); + static delta_t insert_front(ceph::buffer::ptr keys, + ceph::buffer::ptr cells); + static delta_t insert_back(ceph::buffer::ptr keys, + ceph::buffer::ptr cells); + static delta_t remove_from(unsigned slot); + + // shortcuts + static delta_t insert_item(unsigned slot, const ghobject_t& oid, OnodeRef onode) { + return insert_onode(slot, oid, onode); + } + static delta_t insert_item(unsigned slot, const ghobject_t& oid, crimson::os::seastore::laddr_t addr) { + return insert_child(slot, oid, addr); + } + + void encode(ceph::bufferlist& bl); + void decode(ceph::bufferlist::const_iterator& p); +}; diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.cc b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.cc new file mode 100644 index 000000000..fdcaa2fcb --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.cc @@ -0,0 +1,567 @@ +#include "onode_node.h" + +template<size_t BlockSize, int N, ntype_t NodeType> +auto node_t<BlockSize, N, NodeType>::key_at(unsigned slot) const + -> std::pair<const key_prefix_t&, const key_suffix_t&> +{ + auto& key = keys[slot]; + if constexpr (item_in_key) { + return {key, key_suffix_t{}}; + } else { + auto p = from_end(key.offset); + return {key, *reinterpret_cast<const key_suffix_t*>(p)}; + } +} + +// update an existing oid with the specified item +template<size_t BlockSize, int N, ntype_t NodeType> +ghobject_t +node_t<BlockSize, N, NodeType>::get_oid_at(unsigned slot, + const ghobject_t& oid) const +{ + auto [prefix, suffix] = key_at(slot); + ghobject_t updated = oid; + prefix.update_oid(updated); + suffix.update_oid(updated); + return updated; +} + +template<size_t BlockSize, int N, ntype_t NodeType> +auto node_t<BlockSize, N, NodeType>::item_at(const key_prefix_t& key) const + -> const_item_t +{ + if constexpr (item_in_key) { + return key.child_addr; + } else { + assert(key.offset < BlockSize); + auto p = from_end(key.offset); + auto partial_key = reinterpret_cast<const key_suffix_t*>(p); + p += size_of(*partial_key); + return *reinterpret_cast<const item_t*>(p); + } +} + +template<size_t BlockSize, int N, ntype_t NodeType> +void node_t<BlockSize, N, 
NodeType>::dump(std::ostream& os) const +{ + for (uint16_t i = 0; i < count; i++) { + const auto& [prefix, suffix] = key_at(i); + os << " [" << i << '/' << count - 1 << "]\n" + << " key1 = (" << prefix << ")\n" + << " key2 = (" << suffix << ")\n"; + const auto& item = item_at(prefix); + if (_is_leaf()) { + os << " item = " << item << "\n"; + } else { + os << " child = " << std::hex << item << std::dec << "\n"; + } + } +} + +template<size_t BlockSize, int N, ntype_t NodeType> +char* node_t<BlockSize, N, NodeType>::from_end(uint16_t offset) +{ + auto end = reinterpret_cast<char*>(this) + BlockSize; + return end - static_cast<int>(offset); +} + +template<size_t BlockSize, int N, ntype_t NodeType> +const char* node_t<BlockSize, N, NodeType>::from_end(uint16_t offset) const +{ + auto end = reinterpret_cast<const char*>(this) + BlockSize; + return end - static_cast<int>(offset); +} + +template<size_t BlockSize, int N, ntype_t NodeType> +uint16_t node_t<BlockSize, N, NodeType>::used_space() const +{ + if constexpr (item_in_key) { + return count * sizeof(key_prefix_t); + } else { + if (count) { + return keys[count - 1].offset + count * sizeof(key_prefix_t); + } else { + return 0; + } + } +} + +template<size_t BlockSize, int N, ntype_t NodeType> +uint16_t node_t<BlockSize, N, NodeType>::capacity() +{ + auto p = reinterpret_cast<node_t*>(0); + return BlockSize - (reinterpret_cast<char*>(p->keys) - + reinterpret_cast<char*>(p)); +} + +// TODO: if it's allowed to update 2 siblings at the same time, we can have +// B* tree +template<size_t BlockSize, int N, ntype_t NodeType> +constexpr uint16_t node_t<BlockSize, N, NodeType>::min_size() +{ + return capacity() / 2; +} + +template<size_t BlockSize, int N, ntype_t NodeType> +constexpr std::pair<int16_t, int16_t> +node_t<BlockSize, N, NodeType>::bytes_to_add(uint16_t size) +{ + assert(size < min_size()); + return {min_size() - size, capacity() - size}; +} + +template<size_t BlockSize, int N, ntype_t NodeType> +constexpr std::pair<int16_t, int16_t> +node_t<BlockSize, N, NodeType>::bytes_to_remove(uint16_t size) +{ + assert(size > capacity()); + return {size - capacity(), size - min_size()}; +} + +template<size_t BlockSize, int N, ntype_t NodeType> +size_state_t node_t<BlockSize, N, NodeType>::size_state(uint16_t size) const +{ + if (size > capacity()) { + return size_state_t::overflow; + } else if (size < capacity() / 2) { + return size_state_t::underflow; + } else { + return size_state_t::okay; + } +} + +template<size_t BlockSize, int N, ntype_t NodeType> +bool node_t<BlockSize, N, NodeType>::is_underflow(uint16_t size) const +{ + switch (size_state(size)) { + case size_state_t::underflow: + return true; + case size_state_t::okay: + return false; + default: + assert(0); + return false; + } +} + +template<size_t BlockSize, int N, ntype_t NodeType> +int16_t node_t<BlockSize, N, NodeType>::size_with_key(unsigned slot, + const ghobject_t& oid) const +{ + if constexpr (item_in_key) { + return capacity(); + } else { + // the size of fixed key does not change + [[maybe_unused]] const auto& [prefix, suffix] = key_at(slot); + return capacity() + key_suffix_t::size_from(oid) - suffix.size(); + } +} + +template<size_t BlockSize, int N, ntype_t NodeType> +ordering_t node_t<BlockSize, N, NodeType>::compare_with_slot(unsigned slot, + const ghobject_t& oid) const +{ + const auto& [prefix, suffix] = key_at(slot); + if (auto result = prefix.compare(oid); result != ordering_t::equivalent) { + return result; + } else { + return suffix.compare(oid); + } +} + +/// return the 
slot number of the first slot that is greater or equal to +/// key +template<size_t BlockSize, int N, ntype_t NodeType> +std::pair<unsigned, bool> node_t<BlockSize, N, NodeType>::lower_bound(const ghobject_t& oid) const +{ + unsigned s = 0, e = count; + while (s != e) { + unsigned mid = (s + e) / 2; + switch (compare_with_slot(mid, oid)) { + case ordering_t::less: + s = ++mid; + break; + case ordering_t::greater: + e = mid; + break; + case ordering_t::equivalent: + assert(mid == 0 || mid < count); + return {mid, true}; + } + } + return {s, false}; +} + +template<size_t BlockSize, int N, ntype_t NodeType> +uint16_t node_t<BlockSize, N, NodeType>::size_of_item(const ghobject_t& oid, + const item_t& item) +{ + if constexpr (item_in_key) { + return sizeof(key_prefix_t); + } else { + return (sizeof(key_prefix_t) + + key_suffix_t::size_from(oid) + size_of(item)); + } +} + +template<size_t BlockSize, int N, ntype_t NodeType> +bool node_t<BlockSize, N, NodeType>::is_overflow(const ghobject_t& oid, + const item_t& item) const +{ + return free_space() < size_of_item(oid, item); +} + +template<size_t BlockSize, int N, ntype_t NodeType> +bool node_t<BlockSize, N, NodeType>::is_overflow(const ghobject_t& oid, + const OnodeRef& item) const +{ + return free_space() < (sizeof(key_prefix_t) + key_suffix_t::size_from(oid) + item->size()); +} + +// inserts an item into the given slot, pushing all subsequent keys forward +// @note if the item is not embedded in key, shift the right half as well +template<size_t BlockSize, int N, ntype_t NodeType> +void node_t<BlockSize, N, NodeType>::insert_at(unsigned slot, + const ghobject_t& oid, + const item_t& item) +{ + assert(!is_overflow(oid, item)); + assert(slot <= count); + if constexpr (item_in_key) { + // shift the keys right + key_prefix_t* key = keys + slot; + key_prefix_t* last_key = keys + count; + std::copy_backward(key, last_key, last_key + 1); + key->set(oid, item); + } else { + const uint16_t size = key_suffix_t::size_from(oid) + size_of(item); + uint16_t offset = size; + if (slot > 0) { + offset += keys[slot - 1].offset; + } + if (slot < count) { + // V + // | |... // ...|//////|| | + // | |... 
// ...|//////| | | + // shift the partial keys and items left + auto first = keys[slot - 1].offset; + auto last = keys[count - 1].offset; + std::memmove(from_end(last + size), from_end(last), last - first); + // shift the keys right and update the pointers + for (key_prefix_t* dst = keys + count; dst > keys + slot; dst--) { + key_prefix_t* src = dst - 1; + *dst = *src; + dst->offset += size; + } + } + keys[slot].set(oid, offset); + auto p = from_end(offset); + auto partial_key = reinterpret_cast<key_suffix_t*>(p); + partial_key->set(oid); + p += size_of(*partial_key); + auto item_ptr = reinterpret_cast<item_t*>(p); + *item_ptr = item; + } + count++; + assert(used_space() <= capacity()); +} + +// used by InnerNode for updating the keys indexing its children when their lower boundaries +// is updated +template<size_t BlockSize, int N, ntype_t NodeType> +void node_t<BlockSize, N, NodeType>::update_key_at(unsigned slot, const ghobject_t& oid) +{ + if constexpr (is_leaf()) { + assert(0); + } else if constexpr (item_in_key) { + keys[slot].update(oid); + } else { + const auto& [prefix, suffix] = key_at(slot); + int16_t delta = key_suffix_t::size_from(oid) - suffix.size(); + if (delta > 0) { + // shift the cells sitting at its left side + auto first = keys[slot].offset; + auto last = keys[count - 1].offset; + std::memmove(from_end(last + delta), from_end(last), last - first); + // update the pointers + for (key_prefix_t* key = keys + slot; key < keys + count; key++) { + key->offset += delta; + } + } + keys[slot].update(oid); + auto p = from_end(keys[slot].offset); + auto partial_key = reinterpret_cast<key_suffix_t*>(p); + partial_key->set(oid); + // we don't update item here + } +} + +template<size_t BlockSize, int N, ntype_t NodeType> +std::pair<unsigned, uint16_t> +node_t<BlockSize, N, NodeType>::calc_grab_front(uint16_t min_grab, + uint16_t max_grab) const +{ + // TODO: split by likeness + uint16_t grabbed = 0; + uint16_t used = used_space(); + int n = 0; + for (; n < count; n++) { + const auto& [prefix, suffix] = key_at(n); + uint16_t to_grab = sizeof(prefix) + size_of(suffix); + if constexpr (!item_in_key) { + const auto& item = item_at(prefix); + to_grab += size_of(item); + } + if (grabbed + to_grab > max_grab) { + break; + } + grabbed += to_grab; + } + if (grabbed >= min_grab) { + if (n == count) { + return {n, grabbed}; + } else if (!is_underflow(used - grabbed)) { + return {n, grabbed}; + } + } + return {0, 0}; +} + +template<size_t BlockSize, int N, ntype_t NodeType> +std::pair<unsigned, uint16_t> +node_t<BlockSize, N, NodeType>::calc_grab_back(uint16_t min_grab, + uint16_t max_grab) const +{ + // TODO: split by likeness + uint16_t grabbed = 0; + uint16_t used = used_space(); + for (int i = count - 1; i >= 0; i--) { + const auto& [prefix, suffix] = key_at(i); + uint16_t to_grab = sizeof(prefix) + size_of(suffix); + if constexpr (!item_in_key) { + const auto& item = item_at(prefix); + to_grab += size_of(item); + } + grabbed += to_grab; + if (is_underflow(used - grabbed)) { + return {0, 0}; + } else if (grabbed > max_grab) { + return {0, 0}; + } else if (grabbed >= min_grab) { + return {i + 1, grabbed}; + } + } + return {0, 0}; +} + +template<size_t BlockSize, int N, ntype_t NodeType> +template<int LeftN, class Mover> +void node_t<BlockSize, N, NodeType>::grab_from_left(node_t<BlockSize, LeftN, NodeType>& left, + unsigned n, uint16_t bytes, + Mover& mover) +{ + // TODO: rebuild keys if moving across different layouts + // group by likeness + shift_right(n, bytes); + 
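// the front n slots are now free; copy the last n elements of the left sibling into them +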
mover.move_from(left.count - n, 0, n); +} + +template<size_t BlockSize, int N, ntype_t NodeType> +template<int RightN, class Mover> +delta_t node_t<BlockSize, N, NodeType>::acquire_right(node_t<BlockSize, RightN, NodeType>& right, + unsigned whoami, Mover& mover) +{ + mover.move_from(0, count, right.count); + return mover.to_delta(); +} + +template<size_t BlockSize, int N, ntype_t NodeType> +template<int RightN, class Mover> +void node_t<BlockSize, N, NodeType>::grab_from_right(node_t<BlockSize, RightN, NodeType>& right, + unsigned n, uint16_t bytes, + Mover& mover) +{ + mover.move_from(0, count, n); + right.shift_left(n, 0); +} + +template<size_t BlockSize, int N, ntype_t NodeType> +template<int LeftN, class Mover> +void node_t<BlockSize, N, NodeType>::push_to_left(node_t<BlockSize, LeftN, NodeType>& left, + unsigned n, uint16_t bytes, + Mover& mover) +{ + left.grab_from_right(*this, n, bytes, mover); +} + +template<size_t BlockSize, int N, ntype_t NodeType> +template<int RightN, class Mover> +void node_t<BlockSize, N, NodeType>::push_to_right(node_t<BlockSize, RightN, NodeType>& right, + unsigned n, uint16_t bytes, + Mover& mover) +{ + right.grab_from_left(*this, n, bytes, mover); +} + +// [to, from) are removed, so we need to shift left +// actually there are only two use cases: +// - to = 0: for giving elements in bulk +// - to = from - 1: for removing a single element +// old: |////|.....| |.....|/|........| +// new: |.....| |.....||........| +template<size_t BlockSize, int N, ntype_t NodeType> +void node_t<BlockSize, N, NodeType>::shift_left(unsigned from, unsigned to) +{ + assert(from < count); + assert(to < from); + if constexpr (item_in_key) { + std::copy(keys + from, keys + count, keys + to); + } else { + const uint16_t cell_hi = keys[count - 1].offset; + const uint16_t cell_lo = keys[from - 1].offset; + const uint16_t offset_delta = keys[from].offset - keys[to].offset; + for (auto src_key = keys + from, dst_key = keys + to; + src_key != keys + count; + ++src_key, ++dst_key) { + // shift the keys left + *dst_key = *src_key; + // update the pointers + dst_key->offset -= offset_delta; + } + // and cells + auto dst = from_end(cell_hi); + std::memmove(dst + offset_delta, dst, cell_hi - cell_lo); + } + count -= (from - to); +} + +template<size_t BlockSize, int N, ntype_t NodeType> +void node_t<BlockSize, N, NodeType>::insert_front(const ceph::bufferptr& keys_buf, + const ceph::bufferptr& cells_buf) +{ + unsigned n = keys_buf.length() / sizeof(key_prefix_t); + shift_right(n, cells_buf.length()); + keys_buf.copy_out(0, keys_buf.length(), reinterpret_cast<char*>(keys)); + if constexpr (item_in_key) { + assert(cells_buf.length() == 0); + } else { + cells_buf.copy_out(0, cells_buf.length(), from_end(keys[n - 1].offset)); + } +} + +template<size_t BlockSize, int N, ntype_t NodeType> +void node_t<BlockSize, N, NodeType>::insert_back(const ceph::bufferptr& keys_buf, + const ceph::bufferptr& cells_buf) +{ + keys_buf.copy_out(0, keys_buf.length(), reinterpret_cast<char*>(keys + count)); + count += keys_buf.length() / sizeof(key_prefix_t); + if constexpr (item_in_key) { + assert(cells_buf.length() == 0); + } else { + cells_buf.copy_out(0, cells_buf.length(), from_end(keys[count - 1].offset)); + } +} + +// one or more elements are inserted, so we need to shift the elements right +// actually there are only two use cases: +// - bytes != 0: for inserting bytes before from +// - bytes = 0: for inserting a single element before from +// old: ||.....| +// new: |/////|.....| +template<size_t BlockSize, 
int N, ntype_t NodeType> +void node_t<BlockSize, N, NodeType>::shift_right(unsigned n, unsigned bytes) +{ + assert(bytes + used_space() < capacity()); + // shift the keys left + std::copy_backward(keys, keys + count, keys + count + n); + count += n; + if constexpr (!item_in_key) { + uint16_t cells = keys[count - 1].offset; + // copy the partial keys and items + std::memmove(from_end(cells + bytes), from_end(cells), cells); + // update the pointers + for (auto key = keys + n; key < keys + count; ++key) { + key->offset += bytes; + } + } +} + +// shift all keys after slot is removed. +// @note if the item is not embdedded in key, all items sitting at the left +// side of it will be shifted right +template<size_t BlockSize, int N, ntype_t NodeType> +void node_t<BlockSize, N, NodeType>::remove_from(unsigned slot) +{ + assert(slot < count); + if (unsigned next = slot + 1; next < count) { + shift_left(next, slot); + } else { + // slot is the last one + count--; + } +} + +template<size_t BlockSize, int N, ntype_t NodeType> +void node_t<BlockSize, N, NodeType>::trim_right(unsigned n) +{ + count = n; +} + +template<size_t BlockSize, int N, ntype_t NodeType> +void node_t<BlockSize, N, NodeType>::play_delta(const delta_t& delta) +{ + switch (delta.op) { + case delta_t::op_t::insert_onode: + if constexpr (is_leaf()) { + auto [slot, found] = lower_bound(delta.oid); + assert(!found); + assert(delta.onode->size() <= std::numeric_limits<unsigned>::max()); + ceph::bufferptr buf{static_cast<unsigned>(delta.onode->size())}; + delta.onode->encode(buf.c_str(), buf.length()); + auto onode = reinterpret_cast<const onode_t*>(buf.c_str()); + return insert_at(slot, delta.oid, *onode); + } else { + throw std::invalid_argument("wrong node type"); + } + case delta_t::op_t::update_onode: + // TODO + assert(0 == "not implemented"); + break; + case delta_t::op_t::insert_child: + if constexpr (is_leaf()) { + throw std::invalid_argument("wrong node type"); + } else { + auto [slot, found] = lower_bound(delta.oid); + assert(!found); + insert_at(slot, delta.oid, delta.addr); + } + case delta_t::op_t::update_key: + if constexpr (is_leaf()) { + throw std::invalid_argument("wrong node type"); + } else { + return update_key_at(delta.n, delta.oid); + } + case delta_t::op_t::shift_left: + return shift_left(delta.n, 0); + case delta_t::op_t::trim_right: + return trim_right(delta.n); + case delta_t::op_t::insert_front: + return insert_front(delta.keys, delta.cells); + case delta_t::op_t::insert_back: + return insert_back(delta.keys, delta.cells); + case delta_t::op_t::remove_from: + return remove_from(delta.n); + default: + assert(0 == "unknown onode delta"); + } +} + +// explicit instantiate the node_t classes used by test_node.cc +template class node_t<512, 0, ntype_t::inner>; +template class node_t<512, 0, ntype_t::leaf>; +template class node_t<512, 1, ntype_t::inner>; +template class node_t<512, 1, ntype_t::leaf>; +template class node_t<512, 2, ntype_t::inner>; +template class node_t<512, 2, ntype_t::leaf>; +template class node_t<512, 3, ntype_t::inner>; +template class node_t<512, 3, ntype_t::leaf>; diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.h b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.h new file mode 100644 index 000000000..d833a6682 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.h @@ -0,0 +1,942 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include <algorithm> +#include 
<cstdint> +#include <type_traits> +#include <variant> + +#include "common/hobject.h" +#include "crimson/common/layout.h" +#include "crimson/os/seastore/onode.h" +#include "crimson/os/seastore/seastore_types.h" +#include "onode_delta.h" + +namespace asci = absl::container_internal; + +namespace boost::beast { + template<class T> + bool operator==(const span<T>& lhs, const span<T>& rhs) { + return std::equal( + lhs.begin(), lhs.end(), + rhs.begin(), rhs.end()); + } +} + +// on-disk onode +// it only keeps the bits necessary to rebuild an in-memory onode +struct [[gnu::packed]] onode_t { + onode_t& operator=(const onode_t& onode) { + len = onode.len; + std::memcpy(data, onode.data, len); + return *this; + } + size_t size() const { + return sizeof(*this) + len; + } + OnodeRef decode() const { + return new crimson::os::seastore::Onode(std::string_view{data, len}); + } + uint8_t struct_v = 1; + uint8_t struct_compat = 1; + // TODO: + // - use uint16_t for length, as the size of an onode should be less + // than a block (16K for now) + // - drop struct_len + uint32_t struct_len = 0; + uint32_t len; + char data[]; +}; + +static inline std::ostream& operator<<(std::ostream& os, const onode_t& onode) { + return os << *onode.decode(); +} + +using crimson::os::seastore::laddr_t; + +struct [[gnu::packed]] child_addr_t { + laddr_t data; + child_addr_t(laddr_t data) + : data{data} + {} + child_addr_t& operator=(laddr_t addr) { + data = addr; + return *this; + } + laddr_t get() const { + return data; + } + operator laddr_t() const { + return data; + } + size_t size() const { + return sizeof(laddr_t); + } +}; + +// poor man's operator<=> +enum class ordering_t { + less, + equivalent, + greater, +}; + +template<class L, class R> +ordering_t compare_element(const L& x, const R& y) +{ + if constexpr (std::is_arithmetic_v<L>) { + static_assert(std::is_arithmetic_v<R>); + if (x < y) { + return ordering_t::less; + } else if (x > y) { + return ordering_t::greater; + } else { + return ordering_t::equivalent; + } + } else { + // string_view::compare(), string::compare(), ... + auto result = x.compare(y); + if (result < 0) { + return ordering_t::less; + } else if (result > 0) { + return ordering_t::greater; + } else { + return ordering_t::equivalent; + } + } +} + +template<typename L, typename R> +constexpr ordering_t tuple_cmp(const L&, const R&, std::index_sequence<>) +{ + return ordering_t::equivalent; +} + +template<typename L, typename R, + size_t Head, size_t... Tail> +constexpr ordering_t tuple_cmp(const L& x, const R& y, + std::index_sequence<Head, Tail...>) +{ + auto ordering = compare_element(std::get<Head>(x), std::get<Head>(y)); + if (ordering != ordering_t::equivalent) { + return ordering; + } else { + return tuple_cmp(x, y, std::index_sequence<Tail...>()); + } +} + +template<typename... Ls, typename... 
Rs> +constexpr ordering_t cmp(const std::tuple<Ls...>& x, + const std::tuple<Rs...>& y) +{ + static_assert(sizeof...(Ls) == sizeof...(Rs)); + return tuple_cmp(x, y, std::index_sequence_for<Ls...>()); +} + +enum class likes_t { + yes, + no, + maybe, +}; + +struct [[gnu::packed]] variable_key_suffix { + uint64_t snap; + uint64_t gen; + uint8_t nspace_len; + uint8_t name_len; + char data[]; + struct index_t { + enum { + nspace_data = 0, + name_data = 1, + }; + }; + using layout_type = asci::Layout<char, char>; + layout_type cell_layout() const { + return layout_type{nspace_len, name_len}; + } + void set(const ghobject_t& oid) { + snap = oid.hobj.snap; + gen = oid.generation; + nspace_len = oid.hobj.nspace.size(); + name_len = oid.hobj.oid.name.size(); + auto layout = cell_layout(); + std::memcpy(layout.Pointer<index_t::nspace_data>(data), + oid.hobj.nspace.data(), oid.hobj.nspace.size()); + std::memcpy(layout.Pointer<index_t::name_data>(data), + oid.hobj.oid.name.data(), oid.hobj.oid.name.size()); + } + + void update_oid(ghobject_t& oid) const { + oid.hobj.snap = snap; + oid.generation = gen; + oid.hobj.nspace = nspace(); + oid.hobj.oid.name = name(); + } + + variable_key_suffix& operator=(const variable_key_suffix& key) { + snap = key.snap; + gen = key.gen; + auto layout = cell_layout(); + auto nspace = key.nspace(); + std::copy_n(nspace.data(), nspace.size(), + layout.Pointer<index_t::nspace_data>(data)); + auto name = key.name(); + std::copy_n(name.data(), name.size(), + layout.Pointer<index_t::name_data>(data)); + return *this; + } + const std::string_view nspace() const { + auto layout = cell_layout(); + auto nspace = layout.Slice<index_t::nspace_data>(data); + return {nspace.data(), nspace.size()}; + } + const std::string_view name() const { + auto layout = cell_layout(); + auto name = layout.Slice<index_t::name_data>(data); + return {name.data(), name.size()}; + } + size_t size() const { + return sizeof(*this) + nspace_len + name_len; + } + static size_t size_from(const ghobject_t& oid) { + return (sizeof(variable_key_suffix) + + oid.hobj.nspace.size() + + oid.hobj.oid.name.size()); + } + ordering_t compare(const ghobject_t& oid) const { + return cmp(std::tie(nspace(), name(), snap, gen), + std::tie(oid.hobj.nspace, oid.hobj.oid.name, oid.hobj.snap.val, + oid.generation)); + } + bool likes(const variable_key_suffix& key) const { + return nspace() == key.nspace() && name() == key.name(); + } +}; + +static inline std::ostream& operator<<(std::ostream& os, const variable_key_suffix& k) { + if (k.snap != CEPH_NOSNAP) { + os << "s" << k.snap << ","; + } + if (k.gen != ghobject_t::NO_GEN) { + os << "g" << k.gen << ","; + } + return os << k.nspace() << "/" << k.name(); +} + +// should use [[no_unique_address]] in C++20 +struct empty_key_suffix { + static constexpr ordering_t compare(const ghobject_t&) { + return ordering_t::equivalent; + } + static void set(const ghobject_t&) {} + static constexpr size_t size() { + return 0; + } + static size_t size_from(const ghobject_t&) { + return 0; + } + static void update_oid(ghobject_t&) {} +}; + +static inline std::ostream& operator<<(std::ostream& os, const empty_key_suffix&) +{ + return os; +} + +enum class ntype_t : uint8_t { + leaf = 0u, + inner, +}; + +constexpr ntype_t flip_ntype(ntype_t ntype) noexcept +{ + if (ntype == ntype_t::leaf) { + return ntype_t::inner; + } else { + return ntype_t::leaf; + } +} + +template<int N, ntype_t NodeType> +struct FixedKeyPrefix {}; + +template<ntype_t NodeType> +struct FixedKeyPrefix<0, NodeType> +{ + static 
constexpr bool item_in_key = false; + int8_t shard = -1; + int64_t pool = -1; + uint32_t hash = 0; + uint16_t offset = 0; + + FixedKeyPrefix() = default; + FixedKeyPrefix(const ghobject_t& oid, uint16_t offset) + : shard{oid.shard_id}, + pool{oid.hobj.pool}, + hash{oid.hobj.get_hash()}, + offset{offset} + {} + + void set(const ghobject_t& oid, uint16_t new_offset) { + shard = oid.shard_id; + pool = oid.hobj.pool; + hash = oid.hobj.get_hash(); + offset = new_offset; + } + + void set(const FixedKeyPrefix& k, uint16_t new_offset) { + shard = k.shard; + pool = k.pool; + hash = k.hash; + offset = new_offset; + } + + void update(const ghobject_t& oid) { + shard = oid.shard_id; + pool = oid.hobj.pool; + hash = oid.hobj.get_hash(); + } + + void update_oid(ghobject_t& oid) const { + oid.set_shard(shard_id_t{shard}); + oid.hobj.pool = pool; + oid.hobj.set_hash(hash); + } + + ordering_t compare(const ghobject_t& oid) const { + // so std::tie() can bind them by reference + int8_t rhs_shard = oid.shard_id; + uint32_t rhs_hash = oid.hobj.get_hash(); + return cmp(std::tie(shard, pool, hash), + std::tie(rhs_shard, oid.hobj.pool, rhs_hash)); + } + // @return true if i likes @c k, we will can be pushed down to next level + // in the same node + likes_t likes(const FixedKeyPrefix& k) const { + if (shard == k.shard && pool == k.pool) { + return likes_t::yes; + } else { + return likes_t::no; + } + } +}; + +template<ntype_t NodeType> +std::ostream& operator<<(std::ostream& os, const FixedKeyPrefix<0, NodeType>& k) { + if (k.shard != shard_id_t::NO_SHARD) { + os << "s" << k.shard; + } + return os << "p=" << k.pool << "," + << "h=" << std::hex << k.hash << std::dec << "," + << ">" << k.offset; +} + +// all elements in this node share the same <shard, pool> +template<ntype_t NodeType> +struct FixedKeyPrefix<1, NodeType> { + static constexpr bool item_in_key = false; + uint32_t hash = 0; + uint16_t offset = 0; + + FixedKeyPrefix() = default; + FixedKeyPrefix(uint32_t hash, uint16_t offset) + : hash{hash}, + offset{offset} + {} + FixedKeyPrefix(const ghobject_t& oid, uint16_t offset) + : FixedKeyPrefix(oid.hobj.get_hash(), offset) + {} + void set(const ghobject_t& oid, uint16_t new_offset) { + hash = oid.hobj.get_hash(); + offset = new_offset; + } + template<int N> + void set(const FixedKeyPrefix<N, NodeType>& k, uint16_t new_offset) { + static_assert(N < 2, "only N0, N1 have hash"); + hash = k.hash; + offset = new_offset; + } + void update_oid(ghobject_t& oid) const { + oid.hobj.set_hash(hash); + } + void update(const ghobject_t& oid) { + hash = oid.hobj.get_hash(); + } + ordering_t compare(const ghobject_t& oid) const { + return compare_element(hash, oid.hobj.get_hash()); + } + likes_t likes(const FixedKeyPrefix& k) const { + return hash == k.hash ? 
likes_t::yes : likes_t::no; + } +}; + +template<ntype_t NodeType> +std::ostream& operator<<(std::ostream& os, const FixedKeyPrefix<1, NodeType>& k) { + return os << "0x" << std::hex << k.hash << std::dec << "," + << ">" << k.offset; +} + +// all elements in this node must share the same <shard, pool, hash> +template<ntype_t NodeType> +struct FixedKeyPrefix<2, NodeType> { + static constexpr bool item_in_key = false; + uint16_t offset = 0; + + FixedKeyPrefix() = default; + + static constexpr ordering_t compare(const ghobject_t& oid) { + // need to compare the cell + return ordering_t::equivalent; + } + // always defer to my cell for likeness + constexpr likes_t likes(const FixedKeyPrefix&) const { + return likes_t::maybe; + } + void set(const ghobject_t&, uint16_t new_offset) { + offset = new_offset; + } + template<int N> + void set(const FixedKeyPrefix<N, NodeType>&, uint16_t new_offset) { + offset = new_offset; + } + void update(const ghobject_t&) {} + void update_oid(ghobject_t&) const {} +}; + +template<ntype_t NodeType> +std::ostream& operator<<(std::ostream& os, const FixedKeyPrefix<2, NodeType>& k) { + return os << ">" << k.offset; +} + +struct fixed_key_3 { + uint64_t snap = 0; + uint64_t gen = 0; + + fixed_key_3() = default; + fixed_key_3(const ghobject_t& oid) + : snap{oid.hobj.snap}, gen{oid.generation} + {} + ordering_t compare(const ghobject_t& oid) const { + return cmp(std::tie(snap, gen), + std::tie(oid.hobj.snap.val, oid.generation)); + } + // no object likes each other at this level + constexpr likes_t likes(const fixed_key_3&) const { + return likes_t::no; + } + void update_with_oid(const ghobject_t& oid) { + snap = oid.hobj.snap; + gen = oid.generation; + } + void update_oid(ghobject_t& oid) const { + oid.hobj.snap = snap; + oid.generation = gen; + } +}; + +static inline std::ostream& operator<<(std::ostream& os, const fixed_key_3& k) { + if (k.snap != CEPH_NOSNAP) { + os << "s" << k.snap << ","; + } + if (k.gen != ghobject_t::NO_GEN) { + os << "g" << k.gen << ","; + } + return os; +} + +// all elements in this node must share the same <shard, pool, hash, namespace, oid> +// but the unlike other FixedKeyPrefix<>, a node with FixedKeyPrefix<3> does not have +// variable_sized_key, so if it is an inner node, we can just embed the child +// addr right in the key. +template<> +struct FixedKeyPrefix<3, ntype_t::inner> : public fixed_key_3 { + // the item is embedded in the key + static constexpr bool item_in_key = true; + laddr_t child_addr = 0; + + FixedKeyPrefix() = default; + void set(const ghobject_t& oid, laddr_t new_child_addr) { + update_with_oid(oid); + child_addr = new_child_addr; + } + // unlikely get called, though.. 
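+ // update() is a no-op in this specialization: snap and gen are only filled in via the set() overloads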
+ void update(const ghobject_t& oid) {} + template<int N> + std::enable_if_t<N < 3> set(const FixedKeyPrefix<N, ntype_t::inner>&, + laddr_t new_child_addr) { + child_addr = new_child_addr; + } + void set(const FixedKeyPrefix& k, laddr_t new_child_addr) { + snap = k.snap; + gen = k.gen; + child_addr = new_child_addr; + } + void set(const variable_key_suffix& k, laddr_t new_child_addr) { + snap = k.snap; + gen = k.gen; + child_addr = new_child_addr; + } +}; + +template<> +struct FixedKeyPrefix<3, ntype_t::leaf> : public fixed_key_3 { + static constexpr bool item_in_key = false; + uint16_t offset = 0; + + FixedKeyPrefix() = default; + void set(const ghobject_t& oid, uint16_t new_offset) { + update_with_oid(oid); + offset = new_offset; + } + void set(const FixedKeyPrefix& k, uint16_t new_offset) { + snap = k.snap; + gen = k.gen; + offset = new_offset; + } + template<int N> + std::enable_if_t<N < 3> set(const FixedKeyPrefix<N, ntype_t::leaf>&, + uint16_t new_offset) { + offset = new_offset; + } +}; + +struct tag_t { + template<int N, ntype_t node_type> + static constexpr tag_t create() { + static_assert(std::clamp(N, 0, 3) == N); + return tag_t{N, static_cast<uint8_t>(node_type)}; + } + bool is_leaf() const { + return type() == ntype_t::leaf; + } + int layout() const { + return layout_type; + } + ntype_t type() const { + return ntype_t{node_type}; + } + int layout_type : 4; + uint8_t node_type : 4; +}; + +static inline std::ostream& operator<<(std::ostream& os, const tag_t& tag) { + return os << "n=" << tag.layout() << ", leaf=" << tag.is_leaf(); +} + +// for calculating size of variable-sized item/key +template<class T> +size_t size_of(const T& t) { + using decayed_t = std::decay_t<T>; + if constexpr (std::is_scalar_v<decayed_t>) { + return sizeof(decayed_t); + } else { + return t.size(); + } +} + +enum class size_state_t { + okay, + underflow, + overflow, +}; + +// layout of a node of B+ tree +// +// it is different from a typical B+ tree in following ways +// - the size of keys is not necessarily fixed, neither is the size of value. +// - the max number of elements in a node is determined by the total size of +// the keys and values in the node +// - in internal nodes, each key maps to the logical address of the child +// node whose minimum key is greater or equal to that key. +template<size_t BlockSize, + int N, + ntype_t NodeType> +struct node_t { + static_assert(std::clamp(N, 0, 3) == N); + constexpr static ntype_t node_type = NodeType; + constexpr static int node_n = N; + + using key_prefix_t = FixedKeyPrefix<N, NodeType>; + using item_t = std::conditional_t<NodeType == ntype_t::leaf, + onode_t, + child_addr_t>; + using const_item_t = std::conditional_t<NodeType == ntype_t::leaf, + const onode_t&, + child_addr_t>; + static constexpr bool item_in_key = key_prefix_t::item_in_key; + using key_suffix_t = std::conditional_t<N < 3, + variable_key_suffix, + empty_key_suffix>; + + std::pair<const key_prefix_t&, const key_suffix_t&> + key_at(unsigned slot) const; + + // update an existing oid with the specified item + ghobject_t get_oid_at(unsigned slot, const ghobject_t& oid) const; + const_item_t item_at(const key_prefix_t& key) const; + void dump(std::ostream& os) const; + + // for debugging only. 
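+ // is_leaf() is resolved at compile time from the NodeType parameter; _is_leaf() below reads the on-disk tag instead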
+ static constexpr bool is_leaf() { + return node_type == ntype_t::leaf; + } + + bool _is_leaf() const { + return tag.is_leaf(); + } + + char* from_end(uint16_t offset); + const char* from_end(uint16_t offset) const; + uint16_t used_space() const; + uint16_t free_space() const { + return capacity() - used_space(); + } + static uint16_t capacity(); + // TODO: if it's allowed to update 2 siblings at the same time, we can have + // B* tree + static constexpr uint16_t min_size(); + + + // calculate the allowable bounds on bytes to remove from an overflow node + // with specified size + // @param size the overflowed size + // @return <minimum bytes to grab, maximum bytes to grab> + static constexpr std::pair<int16_t, int16_t> bytes_to_remove(uint16_t size); + + // calculate the allowable bounds on bytes to add to an underflow node + // with specified size + // @param size the underflowed size + // @return <minimum bytes to push, maximum bytes to push> + static constexpr std::pair<int16_t, int16_t> bytes_to_add(uint16_t size); + + size_state_t size_state(uint16_t size) const; + bool is_underflow(uint16_t size) const; + int16_t size_with_key(unsigned slot, const ghobject_t& oid) const; + ordering_t compare_with_slot(unsigned slot, const ghobject_t& oid) const; + /// return the slot number of the first slot that is greater or equal to + /// key + std::pair<unsigned, bool> lower_bound(const ghobject_t& oid) const; + static uint16_t size_of_item(const ghobject_t& oid, const item_t& item); + bool is_overflow(const ghobject_t& oid, const item_t& item) const; + bool is_overflow(const ghobject_t& oid, const OnodeRef& item) const; + + // inserts an item into the given slot, pushing all subsequent keys forward + // @note if the item is not embedded in key, shift the right half as well + void insert_at(unsigned slot, const ghobject_t& oid, const item_t& item); + // used by InnerNode for updating the keys indexing its children when their lower boundaries + // is updated + void update_key_at(unsigned slot, const ghobject_t& oid); + // try to figure out the number of elements and total size when trying to + // rebalance by moving the elements from the front of this node when its + // left sibling node is underflow + // + // @param min_grab lower bound of the number of bytes to move + // @param max_grab upper bound of the number of bytes to move + // @return the number of element to grab + // @note return {0, 0} if current node would be underflow if + // @c min_grab bytes of elements are taken from it + std::pair<unsigned, uint16_t> calc_grab_front(uint16_t min_grab, uint16_t max_grab) const; + // try to figure out the number of elements and their total size when trying to + // rebalance by moving the elements from the end of this node when its right + // sibling node is underflow + // + // @param min_grab lower bound of the number of bytes to move + // @param max_grab upper bound of the number of bytes to move + // @return the number of element to grab + // @note return {0, 0} if current node would be underflow if + // @c min_grab bytes of elements are taken from it + std::pair<unsigned, uint16_t> calc_grab_back(uint16_t min_grab, uint16_t max_grab) const; + template<int LeftN, class Mover> void grab_from_left( + node_t<BlockSize, LeftN, NodeType>& left, + unsigned n, uint16_t bytes, + Mover& mover); + template<int RightN, class Mover> + delta_t acquire_right(node_t<BlockSize, RightN, NodeType>& right, + unsigned whoami, Mover& mover); + // transfer n elements at the front of given node to me + template<int 
RightN, class Mover> + void grab_from_right(node_t<BlockSize, RightN, NodeType>& right, + unsigned n, uint16_t bytes, + Mover& mover); + template<int LeftN, class Mover> + void push_to_left(node_t<BlockSize, LeftN, NodeType>& left, + unsigned n, uint16_t bytes, + Mover& mover); + template<int RightN, class Mover> + void push_to_right(node_t<BlockSize, RightN, NodeType>& right, + unsigned n, uint16_t bytes, + Mover& mover); + // [to, from) are removed, so we need to shift left + // actually there are only two use cases: + // - to = 0: for giving elements in bulk + // - to = from - 1: for removing a single element + // old: |////|.....| |.....|/|........| + // new: |.....| |.....||........| + void shift_left(unsigned from, unsigned to); + void insert_front(const ceph::bufferptr& keys_buf, const ceph::bufferptr& cells_buf); + void insert_back(const ceph::bufferptr& keys_buf, const ceph::bufferptr& cells_buf); + // one or more elements are inserted, so we need to shift the elements right + // actually there are only two use cases: + // - bytes != 0: for inserting bytes before from + // - bytes = 0: for inserting a single element before from + // old: ||.....| + // new: |/////|.....| + void shift_right(unsigned n, unsigned bytes); + // shift all keys after slot is removed. + // @note if the item is not embdedded in key, all items sitting at the left + // side of it will be shifted right + void remove_from(unsigned slot); + void trim_right(unsigned n); + void play_delta(const delta_t& delta); + // /-------------------------------| + // | V + // |header|k0|k1|k2|... | / / |k2'v2|k1'v1|k0'.v0| v_m | + // |<-- count -->| + tag_t tag = tag_t::create<N, NodeType>(); + // the count of values in the node + uint16_t count = 0; + key_prefix_t keys[]; +}; + +template<class parent_t, + class from_t, + class to_t, + typename=void> +class EntryMover { +public: + // a "trap" mover + EntryMover(const parent_t&, from_t&, to_t& dst, unsigned) { + assert(0); + } + void move_from(unsigned, unsigned, unsigned) { + assert(0); + } + delta_t get_delta() { + return delta_t::nop(); + } +}; + +// lower the layout, for instance, from L0 to L1, no reference oid is used +template<class parent_t, + class from_t, + class to_t> +class EntryMover<parent_t, + from_t, + to_t, + std::enable_if_t<from_t::node_n < to_t::node_n>> +{ +public: + EntryMover(const parent_t&, from_t& src, to_t& dst, unsigned) + : src{src}, dst{dst} + {} + void move_from(unsigned src_first, unsigned dst_first, unsigned n) + { + ceph::bufferptr keys_buf{n * sizeof(to_t::key_prefix_t)}; + ceph::bufferptr cells_buf; + auto dst_keys = reinterpret_cast<typename to_t::key_prefix_t*>(keys_buf.c_str()); + if constexpr (to_t::item_in_key) { + for (unsigned i = 0; i < n; i++) { + const auto& [prefix, suffix] = src.key_at(src_first + i); + dst_keys[i].set(suffix, src.item_at(prefix)); + } + } else { + // copy keys + uint16_t src_offset = src_first > 0 ? src.keys[src_first - 1].offset : 0; + uint16_t dst_offset = dst_first > 0 ? dst.keys[dst_first - 1].offset : 0; + for (unsigned i = 0; i < n; i++) { + auto& src_key = src.keys[src_first + i]; + uint16_t offset = src_key.offset - src_offset + dst_offset; + dst_keys[i].set(src_key, offset); + } + // copy cells in bulk, yay! 
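+ // the cells of these slots sit contiguously at the block tail, so a single copy of (src_end - src_offset) bytes moves them all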
+ auto src_end = src.keys[src_first + n - 1].offset; + uint16_t total_cell_size = src_end - src_offset; + cells_buf = ceph::bufferptr{total_cell_size}; + cells_buf.copy_in(0, total_cell_size, src.from_end(src_end)); + } + if (dst_first == dst.count) { + dst_delta = delta_t::insert_back(keys_buf, cells_buf); + } else { + dst_delta = delta_t::insert_front(keys_buf, cells_buf); + } + if (src_first > 0 && src_first + n == src.count) { + src_delta = delta_t::trim_right(src_first); + } else if (src_first == 0 && n < src.count) { + src_delta = delta_t::shift_left(n); + } else if (src_first == 0 && n == src.count) { + // the caller will retire the src extent + } else { + // grab in the middle? + assert(0); + } + } + + delta_t from_delta() { + return std::move(src_delta); + } + delta_t to_delta() { + return std::move(dst_delta); + } +private: + const from_t& src; + const to_t& dst; + delta_t dst_delta; + delta_t src_delta; +}; + +// lift the layout, for instance, from L2 to L0, need a reference oid +template<class parent_t, + class from_t, + class to_t> +class EntryMover<parent_t, from_t, to_t, + std::enable_if_t<(from_t::node_n > to_t::node_n)>> +{ +public: + EntryMover(const parent_t& parent, from_t& src, to_t& dst, unsigned from_slot) + : src{src}, dst{dst}, ref_oid{parent->get_oid_at(from_slot, {})} + {} + void move_from(unsigned src_first, unsigned dst_first, unsigned n) + { + ceph::bufferptr keys_buf{n * sizeof(to_t::key_prefix_t)}; + ceph::bufferptr cells_buf; + auto dst_keys = reinterpret_cast<typename to_t::key_prefix_t*>(keys_buf.c_str()); + uint16_t in_node_offset = dst_first > 0 ? dst.keys[dst_first - 1].offset : 0; + static_assert(!std::is_same_v<typename to_t::key_suffix_t, empty_key_suffix>); + // copy keys + uint16_t buf_offset = 0; + for (unsigned i = 0; i < n; i++) { + auto& src_key = src.keys[src_first + i]; + if constexpr (std::is_same_v<typename from_t::key_suffix_t, empty_key_suffix>) { + // heterogeneous partial key, have to rebuild dst partial key from oid + src_key.update_oid(ref_oid); + const auto& src_item = src.item_at(src_key); + size_t key2_size = to_t::key_suffix_t::size_from(ref_oid); + buf_offset += key2_size + size_of(src_item); + dst_keys[i].set(ref_oid, in_node_offset + buf_offset); + auto p = from_end(cells_buf, buf_offset); + auto partial_key = reinterpret_cast<typename to_t::key_suffix_t*>(p); + partial_key->set(ref_oid); + p += key2_size; + auto dst_item = reinterpret_cast<typename to_t::item_t*>(p); + *dst_item = src_item; + } else { + // homogeneous partial key, just update the pointers + uint16_t src_offset = src_first > 0 ? src.keys[src_first - 1].offset : 0; + uint16_t dst_offset = dst_first > 0 ? dst.keys[dst_first - 1].offset : 0; + uint16_t offset = src_key.offset - src_offset + dst_offset; + dst_keys[i].set(ref_oid, in_node_offset + offset); + } + } + if constexpr (std::is_same_v<typename to_t::key_suffix_t, + typename from_t::key_suffix_t>) { + // copy cells in bulk, yay! + uint16_t src_offset = src_first > 0 ? 
src.keys[src_first - 1].offset : 0; + uint16_t src_end = src.keys[src_first + n - 1].offset; + uint16_t total_cell_size = src_end - src_offset; + cells_buf.copy_in(0, total_cell_size, src.from_end(src_end)); + } + if (dst_first == dst.count) { + dst_delta = delta_t::insert_back(keys_buf, cells_buf); + } else { + dst_delta = delta_t::insert_front(keys_buf, cells_buf); + } + if (src_first + n == src.count && src_first > 0) { + src_delta = delta_t::trim_right(src_first); + } else { + // the caller will retire the src extent + assert(src_first == 0); + } + } + + delta_t from_delta() { + return std::move(src_delta); + } + delta_t to_delta() { + return std::move(dst_delta); + } +private: + char* from_end(ceph::bufferptr& ptr, uint16_t offset) { + return ptr.end_c_str() - static_cast<int>(offset); + } +private: + const from_t& src; + const to_t& dst; + delta_t dst_delta; + delta_t src_delta; + ghobject_t ref_oid; +}; + +// identical layout, yay! +template<class parent_t, + class child_t> +class EntryMover<parent_t, child_t, child_t> +{ +public: + EntryMover(const parent_t&, child_t& src, child_t& dst, unsigned) + : src{src}, dst{dst} + {} + + void move_from(unsigned src_first, unsigned dst_first, unsigned n) + { + ceph::bufferptr keys_buf{static_cast<unsigned>(n * sizeof(typename child_t::key_prefix_t))}; + ceph::bufferptr cells_buf; + auto dst_keys = reinterpret_cast<typename child_t::key_prefix_t*>(keys_buf.c_str()); + + // copy keys + std::copy(src.keys + src_first, src.keys + src_first + n, + dst_keys); + if constexpr (!child_t::item_in_key) { + uint16_t src_offset = src_first > 0 ? src.keys[src_first - 1].offset : 0; + uint16_t dst_offset = dst_first > 0 ? dst.keys[dst_first - 1].offset : 0; + const int offset_delta = dst_offset - src_offset; + // update the pointers + for (unsigned i = 0; i < n; i++) { + dst_keys[i].offset += offset_delta; + } + // copy cells in bulk, yay! + auto src_end = src.keys[src_first + n - 1].offset; + uint16_t total_cell_size = src_end - src_offset; + cells_buf = ceph::bufferptr{total_cell_size}; + cells_buf.copy_in(0, total_cell_size, src.from_end(src_end)); + } + if (dst_first == dst.count) { + dst_delta = delta_t::insert_back(std::move(keys_buf), std::move(cells_buf)); + } else { + dst_delta = delta_t::insert_front(std::move(keys_buf), std::move(cells_buf)); + } + if (src_first + n == src.count && src_first > 0) { + src_delta = delta_t::trim_right(n); + } else if (src_first == 0 && n < src.count) { + src_delta = delta_t::shift_left(n); + } else if (src_first == 0 && n == src.count) { + // the caller will retire the src extent + } else { + // grab in the middle? 
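+ // a run taken from the middle of the node is not supported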
+ assert(0); + } + } + + delta_t from_delta() { + return std::move(src_delta); + } + + delta_t to_delta() { + return std::move(dst_delta); + } +private: + char* from_end(ceph::bufferptr& ptr, uint16_t offset) { + return ptr.end_c_str() - static_cast<int>(offset); + } +private: + const child_t& src; + const child_t& dst; + delta_t src_delta; + delta_t dst_delta; +}; + +template<class parent_t, class from_t, class to_t> +EntryMover<parent_t, from_t, to_t> +make_mover(const parent_t& parent, from_t& src, to_t& dst, unsigned from_slot) { + return EntryMover<parent_t, from_t, to_t>(parent, src, dst, from_slot); +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/fwd.h b/src/crimson/os/seastore/onode_manager/staged-fltree/fwd.h new file mode 100644 index 000000000..4908c691f --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/fwd.h @@ -0,0 +1,93 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <algorithm> +#include <cstring> +#include <limits> +#include <memory> +#include <string> + +#include "crimson/common/errorator.h" +#include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/transaction.h" + +namespace crimson::os::seastore::onode { + +using crimson::os::seastore::Transaction; +using crimson::os::seastore::TransactionRef; +using crimson::os::seastore::make_transaction; +using crimson::os::seastore::laddr_t; +using crimson::os::seastore::L_ADDR_MIN; +using crimson::os::seastore::L_ADDR_NULL; +using crimson::os::seastore::extent_len_t; + +class DeltaRecorder; +class NodeExtent; +class NodeExtentManager; +class RootNodeTracker; +using DeltaRecorderURef = std::unique_ptr<DeltaRecorder>; +using NodeExtentRef = crimson::os::seastore::TCachedExtentRef<NodeExtent>; +using NodeExtentManagerURef = std::unique_ptr<NodeExtentManager>; +using RootNodeTrackerURef = std::unique_ptr<RootNodeTracker>; +struct context_t { + NodeExtentManager& nm; + Transaction& t; +}; + +class LeafNodeImpl; +class InternalNodeImpl; +class NodeImpl; +using LeafNodeImplURef = std::unique_ptr<LeafNodeImpl>; +using InternalNodeImplURef = std::unique_ptr<InternalNodeImpl>; +using NodeImplURef = std::unique_ptr<NodeImpl>; + +using level_t = uint8_t; +// a type only to index within a node, 32 bits should be enough +using index_t = uint32_t; +constexpr auto INDEX_END = std::numeric_limits<index_t>::max(); +constexpr auto INDEX_LAST = INDEX_END - 0x4; +constexpr auto INDEX_UPPER_BOUND = INDEX_END - 0x8; +inline bool is_valid_index(index_t index) { return index < INDEX_UPPER_BOUND; } + +// TODO: decide by NODE_BLOCK_SIZE +using node_offset_t = uint16_t; +constexpr node_offset_t DISK_BLOCK_SIZE = 1u << 12; +constexpr node_offset_t NODE_BLOCK_SIZE = DISK_BLOCK_SIZE * 1u; + +enum class MatchKindBS : int8_t { NE = -1, EQ = 0 }; + +enum class MatchKindCMP : int8_t { LT = -1, EQ = 0, GT }; +inline MatchKindCMP toMatchKindCMP(int value) { + if (value > 0) { + return MatchKindCMP::GT; + } else if (value < 0) { + return MatchKindCMP::LT; + } else { + return MatchKindCMP::EQ; + } +} +template <typename Type> +MatchKindCMP toMatchKindCMP(const Type& l, const Type& r) { + int match = l - r; + return toMatchKindCMP(match); +} + +inline MatchKindCMP toMatchKindCMP( + std::string_view l, std::string_view r) { + return toMatchKindCMP(l.compare(r)); +} + +inline MatchKindCMP reverse(MatchKindCMP cmp) { + if (cmp == MatchKindCMP::LT) { + return MatchKindCMP::GT; + } 
else if (cmp == MatchKindCMP::GT) { + return MatchKindCMP::LT; + } else { + return cmp; + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc new file mode 100644 index 000000000..3df458f08 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc @@ -0,0 +1,809 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "node.h" + +#include <cassert> +#include <exception> +#include <sstream> + +#include "common/likely.h" +#include "crimson/common/log.h" +#include "node_extent_manager.h" +#include "node_impl.h" +#include "stages/node_stage_layout.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + +namespace crimson::os::seastore::onode { + +using node_ertr = Node::node_ertr; +template <class ValueT=void> +using node_future = Node::node_future<ValueT>; + +/* + * tree_cursor_t + */ + +tree_cursor_t::tree_cursor_t(Ref<LeafNode> node, const search_position_t& pos) + : leaf_node{node}, position{pos} { + assert(!is_end()); + leaf_node->do_track_cursor<true>(*this); +} + +tree_cursor_t::tree_cursor_t( + Ref<LeafNode> node, const search_position_t& pos, + const key_view_t& key, const onode_t* _p_value, layout_version_t v) + : leaf_node{node}, position{pos} { + assert(!is_end()); + update_kv(key, _p_value, v); + leaf_node->do_track_cursor<true>(*this); +} + +tree_cursor_t::tree_cursor_t(Ref<LeafNode> node) + : leaf_node{node}, position{search_position_t::end()} { + assert(is_end()); + assert(leaf_node->is_level_tail()); +} + +tree_cursor_t::~tree_cursor_t() { + if (!is_end()) { + leaf_node->do_untrack_cursor(*this); + } +} + +const key_view_t& tree_cursor_t::get_key_view() const { + ensure_kv(); + return *key_view; +} + +const onode_t* tree_cursor_t::get_p_value() const { + ensure_kv(); + return p_value; +} + +template <bool VALIDATE> +void tree_cursor_t::update_track( + Ref<LeafNode> node, const search_position_t& pos) { + // the cursor must be already untracked + // track the new node and new pos + assert(!pos.is_end()); + assert(!is_end()); + leaf_node = node; + position = pos; + key_view.reset(); + p_value = nullptr; + leaf_node->do_track_cursor<VALIDATE>(*this); +} +template void tree_cursor_t::update_track<true>(Ref<LeafNode>, const search_position_t&); +template void tree_cursor_t::update_track<false>(Ref<LeafNode>, const search_position_t&); + +void tree_cursor_t::update_kv( + const key_view_t& key, const onode_t* _p_value, layout_version_t v) const { + assert(!is_end()); + assert(_p_value); + assert(std::make_tuple(key, _p_value, v) == leaf_node->get_kv(position)); + key_view = key; + p_value = _p_value; + node_version = v; +} + +void tree_cursor_t::ensure_kv() const { + assert(!is_end()); + if (!p_value || node_version != leaf_node->get_layout_version()) { + // NOTE: the leaf node is always present when we hold its reference. 
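+ // re-read the cached key/value and remember the layout version they were read at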
+ std::tie(key_view, p_value, node_version) = leaf_node->get_kv(position); + } + assert(p_value); +} + +/* + * Node + */ + +Node::Node(NodeImplURef&& impl) : impl{std::move(impl)} {} + +Node::~Node() { + // XXX: tolerate failure between allocate() and as_child() + if (is_root()) { + super->do_untrack_root(*this); + } else { + _parent_info->ptr->do_untrack_child(*this); + } +} + +level_t Node::level() const { + return impl->level(); +} + +node_future<Node::search_result_t> Node::lower_bound( + context_t c, const key_hobj_t& key) { + return seastar::do_with( + MatchHistory(), [this, c, &key](auto& history) { + return lower_bound_tracked(c, key, history); + } + ); +} + +node_future<std::pair<Ref<tree_cursor_t>, bool>> Node::insert( + context_t c, const key_hobj_t& key, const onode_t& value) { + return seastar::do_with( + MatchHistory(), [this, c, &key, &value](auto& history) { + return lower_bound_tracked(c, key, history + ).safe_then([c, &key, &value, &history](auto result) { + if (result.match() == MatchKindBS::EQ) { + return node_ertr::make_ready_future<std::pair<Ref<tree_cursor_t>, bool>>( + std::make_pair(result.p_cursor, false)); + } else { + auto leaf_node = result.p_cursor->get_leaf_node(); + return leaf_node->insert_value( + c, key, value, result.p_cursor->get_position(), history, result.mstat + ).safe_then([](auto p_cursor) { + return node_ertr::make_ready_future<std::pair<Ref<tree_cursor_t>, bool>>( + std::make_pair(p_cursor, true)); + }); + } + }); + } + ); +} + +node_future<tree_stats_t> Node::get_tree_stats(context_t c) { + return seastar::do_with( + tree_stats_t(), [this, c](auto& stats) { + return do_get_tree_stats(c, stats).safe_then([&stats] { + return stats; + }); + } + ); +} + +std::ostream& Node::dump(std::ostream& os) const { + return impl->dump(os); +} + +std::ostream& Node::dump_brief(std::ostream& os) const { + return impl->dump_brief(os); +} + +void Node::test_make_destructable( + context_t c, NodeExtentMutable& mut, Super::URef&& _super) { + impl->test_set_tail(mut); + make_root(c, std::move(_super)); +} + +node_future<> Node::mkfs(context_t c, RootNodeTracker& root_tracker) { + return LeafNode::allocate_root(c, root_tracker + ).safe_then([](auto ret) { /* FIXME: discard_result(); */ }); +} + +node_future<Ref<Node>> Node::load_root(context_t c, RootNodeTracker& root_tracker) { + return c.nm.get_super(c.t, root_tracker + ).safe_then([c, &root_tracker](auto&& _super) { + auto root_addr = _super->get_root_laddr(); + assert(root_addr != L_ADDR_NULL); + return Node::load(c, root_addr, true + ).safe_then([c, _super = std::move(_super), + &root_tracker](auto root) mutable { + assert(root->impl->field_type() == field_type_t::N0); + root->as_root(std::move(_super)); + std::ignore = c; // as only used in an assert + std::ignore = root_tracker; + assert(root == root_tracker.get_root(c.t)); + return node_ertr::make_ready_future<Ref<Node>>(root); + }); + }); +} + +void Node::make_root(context_t c, Super::URef&& _super) { + _super->write_root_laddr(c, impl->laddr()); + as_root(std::move(_super)); +} + +void Node::as_root(Super::URef&& _super) { + assert(!super && !_parent_info); + assert(_super->get_root_laddr() == impl->laddr()); + assert(impl->is_level_tail()); + super = std::move(_super); + super->do_track_root(*this); +} + +node_future<> Node::upgrade_root(context_t c) { + assert(is_root()); + assert(impl->is_level_tail()); + assert(impl->field_type() == field_type_t::N0); + super->do_untrack_root(*this); + return InternalNode::allocate_root(c, impl->level(), impl->laddr(), 
std::move(super) + ).safe_then([this](auto new_root) { + as_child(search_position_t::end(), new_root); + }); +} + +template <bool VALIDATE> +void Node::as_child(const search_position_t& pos, Ref<InternalNode> parent_node) { + assert(!super); + _parent_info = parent_info_t{pos, parent_node}; + parent_info().ptr->do_track_child<VALIDATE>(*this); +} +template void Node::as_child<true>(const search_position_t&, Ref<InternalNode>); +template void Node::as_child<false>(const search_position_t&, Ref<InternalNode>); + +node_future<> Node::insert_parent(context_t c, Ref<Node> right_node) { + assert(!is_root()); + // TODO(cross-node string dedup) + return parent_info().ptr->apply_child_split( + c, parent_info().position, this, right_node); +} + +node_future<Ref<Node>> Node::load( + context_t c, laddr_t addr, bool expect_is_level_tail) { + // NOTE: + // *option1: all types of node have the same length; + // option2: length is defined by node/field types; + // option3: length is totally flexible; + return c.nm.read_extent(c.t, addr, NODE_BLOCK_SIZE + ).safe_then([expect_is_level_tail](auto extent) { + auto [node_type, field_type] = extent->get_types(); + if (node_type == node_type_t::LEAF) { + auto impl = LeafNodeImpl::load(extent, field_type, expect_is_level_tail); + return Ref<Node>(new LeafNode(impl.get(), std::move(impl))); + } else if (node_type == node_type_t::INTERNAL) { + auto impl = InternalNodeImpl::load(extent, field_type, expect_is_level_tail); + return Ref<Node>(new InternalNode(impl.get(), std::move(impl))); + } else { + ceph_abort("impossible path"); + } + }); +} + +/* + * InternalNode + */ + +InternalNode::InternalNode(InternalNodeImpl* impl, NodeImplURef&& impl_ref) + : Node(std::move(impl_ref)), impl{impl} {} + +node_future<> InternalNode::apply_child_split( + context_t c, const search_position_t& pos, + Ref<Node> left_child, Ref<Node> right_child) { +#ifndef NDEBUG + if (pos.is_end()) { + assert(impl->is_level_tail()); + } +#endif + impl->prepare_mutate(c); + + auto left_key = left_child->impl->get_largest_key_view(); + auto left_child_addr = left_child->impl->laddr(); + auto left_child_addr_packed = laddr_packed_t{left_child_addr}; + auto right_key = right_child->impl->get_largest_key_view(); + auto right_child_addr = right_child->impl->laddr(); + logger().debug("OTree::Internal::Insert: " + "pos({}), left_child({}, {:#x}), right_child({}, {:#x}) ...", + pos, left_key, left_child_addr, right_key, right_child_addr); + // update pos => left_child to pos => right_child + impl->replace_child_addr(pos, right_child_addr, left_child_addr); + replace_track(pos, right_child, left_child); + + search_position_t insert_pos = pos; + auto [insert_stage, insert_size] = impl->evaluate_insert( + left_key, left_child_addr, insert_pos); + auto free_size = impl->free_size(); + if (free_size >= insert_size) { + // insert + [[maybe_unused]] auto p_value = impl->insert( + left_key, left_child_addr_packed, insert_pos, insert_stage, insert_size); + assert(impl->free_size() == free_size - insert_size); + assert(insert_pos <= pos); + assert(p_value->value == left_child_addr); + track_insert(insert_pos, insert_stage, left_child, right_child); + validate_tracked_children(); + return node_ertr::now(); + } + // split and insert + Ref<InternalNode> this_ref = this; + return (is_root() ? 
upgrade_root(c) : node_ertr::now() + ).safe_then([this, c] { + return InternalNode::allocate( + c, impl->field_type(), impl->is_level_tail(), impl->level()); + }).safe_then([this_ref, this, c, left_key, left_child, right_child, + insert_pos, insert_stage=insert_stage, insert_size=insert_size](auto fresh_right) mutable { + auto right_node = fresh_right.node; + auto left_child_addr = left_child->impl->laddr(); + auto left_child_addr_packed = laddr_packed_t{left_child_addr}; + auto [split_pos, is_insert_left, p_value] = impl->split_insert( + fresh_right.mut, *right_node->impl, left_key, left_child_addr_packed, + insert_pos, insert_stage, insert_size); + assert(p_value->value == left_child_addr); + track_split(split_pos, right_node); + if (is_insert_left) { + track_insert(insert_pos, insert_stage, left_child); + } else { + right_node->track_insert(insert_pos, insert_stage, left_child); + } + validate_tracked_children(); + right_node->validate_tracked_children(); + + // propagate index to parent + return insert_parent(c, right_node); + // TODO (optimize) + // try to acquire space from siblings before split... see btrfs + }); +} + +node_future<Ref<InternalNode>> InternalNode::allocate_root( + context_t c, level_t old_root_level, + laddr_t old_root_addr, Super::URef&& super) { + return InternalNode::allocate(c, field_type_t::N0, true, old_root_level + 1 + ).safe_then([c, old_root_addr, + super = std::move(super)](auto fresh_node) mutable { + auto root = fresh_node.node; + auto p_value = root->impl->get_p_value(search_position_t::end()); + fresh_node.mut.copy_in_absolute( + const_cast<laddr_packed_t*>(p_value), old_root_addr); + root->make_root_from(c, std::move(super), old_root_addr); + return root; + }); +} + +node_future<Ref<tree_cursor_t>> +InternalNode::lookup_smallest(context_t c) { + auto position = search_position_t::begin(); + laddr_t child_addr = impl->get_p_value(position)->value; + return get_or_track_child(c, position, child_addr + ).safe_then([c](auto child) { + return child->lookup_smallest(c); + }); +} + +node_future<Ref<tree_cursor_t>> +InternalNode::lookup_largest(context_t c) { + // NOTE: unlike LeafNode::lookup_largest(), this only works for the tail + // internal node to return the tail child address. 
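+ // The end position maps to the rightmost (tail) child; recursing into it
+ // eventually reaches the largest key of this sub-tree.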
+ auto position = search_position_t::end(); + laddr_t child_addr = impl->get_p_value(position)->value; + return get_or_track_child(c, position, child_addr).safe_then([c](auto child) { + return child->lookup_largest(c); + }); +} + +node_future<Node::search_result_t> +InternalNode::lower_bound_tracked( + context_t c, const key_hobj_t& key, MatchHistory& history) { + auto result = impl->lower_bound(key, history); + return get_or_track_child(c, result.position, result.p_value->value + ).safe_then([c, &key, &history](auto child) { + // XXX(multi-type): pass result.mstat to child + return child->lower_bound_tracked(c, key, history); + }); +} + +node_future<> InternalNode::do_get_tree_stats( + context_t c, tree_stats_t& stats) { + auto nstats = impl->get_stats(); + stats.size_persistent_internal += nstats.size_persistent; + stats.size_filled_internal += nstats.size_filled; + stats.size_logical_internal += nstats.size_logical; + stats.size_overhead_internal += nstats.size_overhead; + stats.size_value_internal += nstats.size_value; + stats.num_kvs_internal += nstats.num_kvs; + stats.num_nodes_internal += 1; + + Ref<const InternalNode> this_ref = this; + return seastar::do_with( + search_position_t(), [this, this_ref, c, &stats](auto& pos) { + pos = search_position_t::begin(); + return crimson::do_until( + [this, this_ref, c, &stats, &pos]() -> node_future<bool> { + auto child_addr = impl->get_p_value(pos)->value; + return get_or_track_child(c, pos, child_addr + ).safe_then([c, &stats](auto child) { + return child->do_get_tree_stats(c, stats); + }).safe_then([this, this_ref, &pos] { + if (pos.is_end()) { + return node_ertr::make_ready_future<bool>(true); + } else { + impl->next_position(pos); + if (pos.is_end()) { + if (impl->is_level_tail()) { + return node_ertr::make_ready_future<bool>(false); + } else { + return node_ertr::make_ready_future<bool>(true); + } + } else { + return node_ertr::make_ready_future<bool>(false); + } + } + }); + }); + } + ); +} + +node_future<> InternalNode::test_clone_root( + context_t c_other, RootNodeTracker& tracker_other) const { + assert(is_root()); + assert(impl->is_level_tail()); + assert(impl->field_type() == field_type_t::N0); + Ref<const InternalNode> this_ref = this; + return InternalNode::allocate(c_other, field_type_t::N0, true, impl->level() + ).safe_then([this, c_other, &tracker_other](auto fresh_other) { + impl->test_copy_to(fresh_other.mut); + auto cloned_root = fresh_other.node; + return c_other.nm.get_super(c_other.t, tracker_other + ).safe_then([c_other, cloned_root](auto&& super_other) { + cloned_root->make_root_new(c_other, std::move(super_other)); + return cloned_root; + }); + }).safe_then([this_ref, this, c_other](auto cloned_root) { + // clone tracked children + // In some unit tests, the children are stubbed out that they + // don't exist in NodeExtentManager, and are only tracked in memory. + return crimson::do_for_each( + tracked_child_nodes.begin(), + tracked_child_nodes.end(), + [this_ref, c_other, cloned_root](auto& kv) { + assert(kv.first == kv.second->parent_info().position); + return kv.second->test_clone_non_root(c_other, cloned_root); + } + ); + }); +} + +node_future<Ref<Node>> InternalNode::get_or_track_child( + context_t c, const search_position_t& position, laddr_t child_addr) { + bool level_tail = position.is_end(); + Ref<Node> child; + auto found = tracked_child_nodes.find(position); + Ref<InternalNode> this_ref = this; + return (found == tracked_child_nodes.end() + ? 
(logger().trace("OTree::Internal: load child untracked at {:#x}, pos({}), level={}", + child_addr, position, level() - 1), + Node::load(c, child_addr, level_tail + ).safe_then([this, position] (auto child) { + child->as_child(position, this); + return child; + })) + : (logger().trace("OTree::Internal: load child tracked at {:#x}, pos({}), level={}", + child_addr, position, level() - 1), + node_ertr::make_ready_future<Ref<Node>>(found->second)) + ).safe_then([this_ref, this, position, child_addr] (auto child) { + assert(child_addr == child->impl->laddr()); + assert(position == child->parent_info().position); + std::ignore = position; + std::ignore = child_addr; + validate_child(*child); + return child; + }); +} + +void InternalNode::track_insert( + const search_position_t& insert_pos, match_stage_t insert_stage, + Ref<Node> insert_child, Ref<Node> nxt_child) { + // update tracks + auto pos_upper_bound = insert_pos; + pos_upper_bound.index_by_stage(insert_stage) = INDEX_UPPER_BOUND; + auto first = tracked_child_nodes.lower_bound(insert_pos); + auto last = tracked_child_nodes.lower_bound(pos_upper_bound); + std::vector<Node*> nodes; + std::for_each(first, last, [&nodes](auto& kv) { + nodes.push_back(kv.second); + }); + tracked_child_nodes.erase(first, last); + for (auto& node : nodes) { + auto _pos = node->parent_info().position; + assert(!_pos.is_end()); + ++_pos.index_by_stage(insert_stage); + node->as_child(_pos, this); + } + // track insert + insert_child->as_child(insert_pos, this); + +#ifndef NDEBUG + // validate left_child is before right_child + if (nxt_child) { + auto iter = tracked_child_nodes.find(insert_pos); + ++iter; + assert(iter->second == nxt_child); + } +#endif +} + +void InternalNode::replace_track( + const search_position_t& position, Ref<Node> new_child, Ref<Node> old_child) { + assert(tracked_child_nodes[position] == old_child); + tracked_child_nodes.erase(position); + new_child->as_child(position, this); + assert(tracked_child_nodes[position] == new_child); +} + +void InternalNode::track_split( + const search_position_t& split_pos, Ref<InternalNode> right_node) { + auto first = tracked_child_nodes.lower_bound(split_pos); + auto iter = first; + while (iter != tracked_child_nodes.end()) { + search_position_t new_pos = iter->first; + new_pos -= split_pos; + iter->second->as_child<false>(new_pos, right_node); + ++iter; + } + tracked_child_nodes.erase(first, tracked_child_nodes.end()); +} + +void InternalNode::validate_child(const Node& child) const { +#ifndef NDEBUG + assert(impl->level() - 1 == child.impl->level()); + assert(this == child.parent_info().ptr); + auto& child_pos = child.parent_info().position; + assert(impl->get_p_value(child_pos)->value == child.impl->laddr()); + if (child_pos.is_end()) { + assert(impl->is_level_tail()); + assert(child.impl->is_level_tail()); + } else { + assert(!child.impl->is_level_tail()); + assert(impl->get_key_view(child_pos) == child.impl->get_largest_key_view()); + } + // XXX(multi-type) + assert(impl->field_type() <= child.impl->field_type()); +#endif +} + +node_future<InternalNode::fresh_node_t> InternalNode::allocate( + context_t c, field_type_t field_type, bool is_level_tail, level_t level) { + return InternalNodeImpl::allocate(c, field_type, is_level_tail, level + ).safe_then([](auto&& fresh_impl) { + auto node = Ref<InternalNode>(new InternalNode( + fresh_impl.impl.get(), std::move(fresh_impl.impl))); + return fresh_node_t{node, fresh_impl.mut}; + }); +} + +/* + * LeafNode + */ + +LeafNode::LeafNode(LeafNodeImpl* impl, 
NodeImplURef&& impl_ref) + : Node(std::move(impl_ref)), impl{impl} {} + +bool LeafNode::is_level_tail() const { + return impl->is_level_tail(); +} + +std::tuple<key_view_t, const onode_t*, layout_version_t> LeafNode::get_kv( + const search_position_t& pos) const { + key_view_t key_view; + auto p_value = impl->get_p_value(pos, &key_view); + return {key_view, p_value, layout_version}; +} + +node_future<Ref<tree_cursor_t>> +LeafNode::lookup_smallest(context_t) { + if (unlikely(impl->is_empty())) { + assert(is_root()); + return node_ertr::make_ready_future<Ref<tree_cursor_t>>( + new tree_cursor_t(this)); + } + auto pos = search_position_t::begin(); + key_view_t index_key; + auto p_value = impl->get_p_value(pos, &index_key); + return node_ertr::make_ready_future<Ref<tree_cursor_t>>( + get_or_track_cursor(pos, index_key, p_value)); +} + +node_future<Ref<tree_cursor_t>> +LeafNode::lookup_largest(context_t) { + if (unlikely(impl->is_empty())) { + assert(is_root()); + return node_ertr::make_ready_future<Ref<tree_cursor_t>>( + new tree_cursor_t(this)); + } + search_position_t pos; + const onode_t* p_value = nullptr; + key_view_t index_key; + impl->get_largest_slot(pos, index_key, &p_value); + return node_ertr::make_ready_future<Ref<tree_cursor_t>>( + get_or_track_cursor(pos, index_key, p_value)); +} + +node_future<Node::search_result_t> +LeafNode::lower_bound_tracked( + context_t c, const key_hobj_t& key, MatchHistory& history) { + key_view_t index_key; + auto result = impl->lower_bound(key, history, &index_key); + Ref<tree_cursor_t> cursor; + if (result.position.is_end()) { + assert(!result.p_value); + cursor = new tree_cursor_t(this); + } else { + cursor = get_or_track_cursor(result.position, index_key, result.p_value); + } + return node_ertr::make_ready_future<search_result_t>( + search_result_t{cursor, result.mstat}); +} + +node_future<> LeafNode::do_get_tree_stats(context_t, tree_stats_t& stats) { + auto nstats = impl->get_stats(); + stats.size_persistent_leaf += nstats.size_persistent; + stats.size_filled_leaf += nstats.size_filled; + stats.size_logical_leaf += nstats.size_logical; + stats.size_overhead_leaf += nstats.size_overhead; + stats.size_value_leaf += nstats.size_value; + stats.num_kvs_leaf += nstats.num_kvs; + stats.num_nodes_leaf += 1; + return node_ertr::now(); +} + +node_future<> LeafNode::test_clone_root( + context_t c_other, RootNodeTracker& tracker_other) const { + assert(is_root()); + assert(impl->is_level_tail()); + assert(impl->field_type() == field_type_t::N0); + Ref<const LeafNode> this_ref = this; + return LeafNode::allocate(c_other, field_type_t::N0, true + ).safe_then([this, c_other, &tracker_other](auto fresh_other) { + impl->test_copy_to(fresh_other.mut); + auto cloned_root = fresh_other.node; + return c_other.nm.get_super(c_other.t, tracker_other + ).safe_then([c_other, cloned_root](auto&& super_other) { + cloned_root->make_root_new(c_other, std::move(super_other)); + }); + }).safe_then([this_ref]{}); +} + +node_future<Ref<tree_cursor_t>> LeafNode::insert_value( + context_t c, const key_hobj_t& key, const onode_t& value, + const search_position_t& pos, const MatchHistory& history, + match_stat_t mstat) { +#ifndef NDEBUG + if (pos.is_end()) { + assert(impl->is_level_tail()); + } +#endif + logger().debug("OTree::Leaf::Insert: " + "pos({}), {}, {}, {}, mstat({}) ...", + pos, key, value, history, mstat); + search_position_t insert_pos = pos; + auto [insert_stage, insert_size] = impl->evaluate_insert( + key, value, history, mstat, insert_pos); + auto free_size = 
impl->free_size(); + if (free_size >= insert_size) { + // insert + on_layout_change(); + impl->prepare_mutate(c); + auto p_value = impl->insert(key, value, insert_pos, insert_stage, insert_size); + assert(impl->free_size() == free_size - insert_size); + assert(insert_pos <= pos); + assert(p_value->size == value.size); + auto ret = track_insert(insert_pos, insert_stage, p_value); + validate_tracked_cursors(); + return node_ertr::make_ready_future<Ref<tree_cursor_t>>(ret); + } + // split and insert + Ref<LeafNode> this_ref = this; + return (is_root() ? upgrade_root(c) : node_ertr::now() + ).safe_then([this, c] { + return LeafNode::allocate(c, impl->field_type(), impl->is_level_tail()); + }).safe_then([this_ref, this, c, &key, &value, + insert_pos, insert_stage=insert_stage, insert_size=insert_size](auto fresh_right) mutable { + auto right_node = fresh_right.node; + // no need to bump version for right node, as it is fresh + on_layout_change(); + impl->prepare_mutate(c); + auto [split_pos, is_insert_left, p_value] = impl->split_insert( + fresh_right.mut, *right_node->impl, key, value, + insert_pos, insert_stage, insert_size); + assert(p_value->size == value.size); + track_split(split_pos, right_node); + Ref<tree_cursor_t> ret; + if (is_insert_left) { + ret = track_insert(insert_pos, insert_stage, p_value); + } else { + ret = right_node->track_insert(insert_pos, insert_stage, p_value); + } + validate_tracked_cursors(); + right_node->validate_tracked_cursors(); + + // propagate insert to parent + return insert_parent(c, right_node).safe_then([ret] { + return ret; + }); + // TODO (optimize) + // try to acquire space from siblings before split... see btrfs + }); +} + +node_future<Ref<LeafNode>> LeafNode::allocate_root( + context_t c, RootNodeTracker& root_tracker) { + return LeafNode::allocate(c, field_type_t::N0, true + ).safe_then([c, &root_tracker](auto fresh_node) { + auto root = fresh_node.node; + return c.nm.get_super(c.t, root_tracker + ).safe_then([c, root](auto&& super) { + root->make_root_new(c, std::move(super)); + return root; + }); + }); +} + +Ref<tree_cursor_t> LeafNode::get_or_track_cursor( + const search_position_t& position, + const key_view_t& key, const onode_t* p_value) { + assert(!position.is_end()); + assert(p_value); + Ref<tree_cursor_t> p_cursor; + auto found = tracked_cursors.find(position); + if (found == tracked_cursors.end()) { + p_cursor = new tree_cursor_t(this, position, key, p_value, layout_version); + } else { + p_cursor = found->second; + assert(p_cursor->get_leaf_node() == this); + assert(p_cursor->get_position() == position); + p_cursor->update_kv(key, p_value, layout_version); + } + return p_cursor; +} + +void LeafNode::validate_cursor(tree_cursor_t& cursor) const { +#ifndef NDEBUG + assert(this == cursor.get_leaf_node().get()); + assert(!cursor.is_end()); + auto [key, val, ver] = get_kv(cursor.get_position()); + assert(key == cursor.get_key_view()); + assert(val == cursor.get_p_value()); +#endif +} + +Ref<tree_cursor_t> LeafNode::track_insert( + const search_position_t& insert_pos, match_stage_t insert_stage, + const onode_t* p_onode) { + // update cursor position + auto pos_upper_bound = insert_pos; + pos_upper_bound.index_by_stage(insert_stage) = INDEX_UPPER_BOUND; + auto first = tracked_cursors.lower_bound(insert_pos); + auto last = tracked_cursors.lower_bound(pos_upper_bound); + std::vector<tree_cursor_t*> p_cursors; + std::for_each(first, last, [&p_cursors](auto& kv) { + p_cursors.push_back(kv.second); + }); + tracked_cursors.erase(first, last); + for 
(auto& p_cursor : p_cursors) { + search_position_t new_pos = p_cursor->get_position(); + ++new_pos.index_by_stage(insert_stage); + p_cursor->update_track<true>(this, new_pos); + } + + // track insert + // TODO: getting key_view_t from stage::proceed_insert() and + // stage::append_insert() has not supported yet + return new tree_cursor_t(this, insert_pos); +} + +void LeafNode::track_split( + const search_position_t& split_pos, Ref<LeafNode> right_node) { + // update cursor ownership and position + auto first = tracked_cursors.lower_bound(split_pos); + auto iter = first; + while (iter != tracked_cursors.end()) { + search_position_t new_pos = iter->first; + new_pos -= split_pos; + iter->second->update_track<false>(right_node, new_pos); + ++iter; + } + tracked_cursors.erase(first, tracked_cursors.end()); +} + +node_future<LeafNode::fresh_node_t> LeafNode::allocate( + context_t c, field_type_t field_type, bool is_level_tail) { + return LeafNodeImpl::allocate(c, field_type, is_level_tail + ).safe_then([](auto&& fresh_impl) { + auto node = Ref<LeafNode>(new LeafNode( + fresh_impl.impl.get(), std::move(fresh_impl.impl))); + return fresh_node_t{node, fresh_impl.mut}; + }); +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node.h new file mode 100644 index 000000000..d6af489e7 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node.h @@ -0,0 +1,476 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <map> +#include <memory> +#include <ostream> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> + +#include "crimson/common/type_helpers.h" + +#include "node_extent_mutable.h" +#include "stages/key_layout.h" +#include "stages/stage_types.h" +#include "super.h" +#include "tree_types.h" + +/** + * Tree example (2 levels): + * + * Root node keys: [ 3 7 ] + * values: [p1 p2 p3] + * / | \ + * ------- | ------- + * | | | + * V V V + * Leaf node keys: [ 1 2 3] [ 4 5 7] [ 9 11 12] + * values: [v1 v2 v3] [v4 v5 v6] [v7 v8 v9] + * + * Tree structure properties: + * - As illustrated above, the parent key is strictly equal to its left child's + * largest key; + * - If a tree is indexing multiple seastore transactions, each transaction + * will be mapped to a Super which points to a distinct root node. So the + * transactions are isolated at tree level. However, tree nodes from + * different transactions can reference the same seastore CachedExtent before + * modification; + * - The resources of the transactional tree are tracked by tree_cursor_ts held + * by users. As long as any cursor is alive, the according tree hierarchy is + * alive and keeps tracked. See the reversed resource management sections + * below; + */ + +namespace crimson::os::seastore::onode { + +class LeafNode; +class InternalNode; + +/** + * tree_cursor_t + * + * A cursor points to a position (LeafNode and search_position_t) of the tree + * where it can find the according key and value pair. The position is updated + * by LeafNode insert/split/delete/merge internally and is kept valid. It also + * caches the key-value information for a specific node layout version. + * + * Exposes public interfaces for Btree::Cursor. 
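+ *
+ * A minimal usage sketch (hypothetical caller; Btree::Cursor normally wraps
+ * this class):
+ *
+ *   Ref<tree_cursor_t> cursor = ...;  // e.g. from Node::lower_bound()
+ *   if (!cursor->is_end()) {
+ *     const key_view_t& key = cursor->get_key_view();
+ *     const onode_t* p_value = cursor->get_p_value();
+ *     // both remain valid while the cursor (and thus its LeafNode) is alive
+ *   }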
+ */ +using layout_version_t = uint32_t; +class tree_cursor_t final + : public boost::intrusive_ref_counter< + tree_cursor_t, boost::thread_unsafe_counter> { + public: + // public to Btree + ~tree_cursor_t(); + tree_cursor_t(const tree_cursor_t&) = delete; + tree_cursor_t(tree_cursor_t&&) = delete; + tree_cursor_t& operator=(const tree_cursor_t&) = delete; + tree_cursor_t& operator=(tree_cursor_t&&) = delete; + + /** + * is_end + * + * Represents one-past-the-last of all the sorted key-value + * pairs in the tree. An end cursor won't contain valid key-value + * information. + */ + bool is_end() const { return position.is_end(); } + + /// Returns the key view in tree if it is not an end cursor. + const key_view_t& get_key_view() const; + + /// Returns the value pointer in tree if it is not an end cursor. + const onode_t* get_p_value() const; + + private: + tree_cursor_t(Ref<LeafNode>, const search_position_t&); + tree_cursor_t(Ref<LeafNode>, const search_position_t&, + const key_view_t& key, const onode_t*, layout_version_t); + // lookup reaches the end, contain leaf node for further insert + tree_cursor_t(Ref<LeafNode>); + const search_position_t& get_position() const { return position; } + Ref<LeafNode> get_leaf_node() { return leaf_node; } + template <bool VALIDATE> + void update_track(Ref<LeafNode>, const search_position_t&); + void update_kv(const key_view_t&, const onode_t*, layout_version_t) const; + void ensure_kv() const; + + private: + /** + * Reversed resource management (tree_cursor_t) + * + * tree_cursor_t holds a reference to the LeafNode, so the LeafNode will be + * alive as long as any of it's cursors is still referenced by user. + */ + Ref<LeafNode> leaf_node; + search_position_t position; + + // cached information + mutable std::optional<key_view_t> key_view; + mutable const onode_t* p_value; + mutable layout_version_t node_version; + + friend class LeafNode; + friend class Node; // get_position(), get_leaf_node() +}; + +/** + * Node + * + * An abstracted class for both InternalNode and LeafNode. + * + * Exposes public interfaces for Btree. + */ +class Node + : public boost::intrusive_ref_counter< + Node, boost::thread_unsafe_counter> { + public: + // public to Btree + using node_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::enoent, + crimson::ct_error::erange>; + template <class ValueT=void> + using node_future = node_ertr::future<ValueT>; + + struct search_result_t { + bool is_end() const { return p_cursor->is_end(); } + Ref<tree_cursor_t> p_cursor; + match_stat_t mstat; + + MatchKindBS match() const { + assert(mstat >= MSTAT_MIN && mstat <= MSTAT_MAX); + return (mstat == MSTAT_EQ ? MatchKindBS::EQ : MatchKindBS::NE); + } + }; + + virtual ~Node(); + Node(const Node&) = delete; + Node(Node&&) = delete; + Node& operator=(const Node&) = delete; + Node& operator=(Node&&) = delete; + + /** + * level + * + * A positive value denotes the level (or height) of this node in tree. + * 0 means LeafNode, positive means InternalNode. + */ + level_t level() const; + + /** + * lookup_smallest + * + * Returns a cursor pointing to the smallest key in the sub-tree formed by + * this node. + * + * Returns an end cursor if it is an empty root node. + */ + virtual node_future<Ref<tree_cursor_t>> lookup_smallest(context_t) = 0; + + /** + * lookup_largest + * + * Returns a cursor pointing to the largest key in the sub-tree formed by + * this node. + * + * Returns an end cursor if it is an empty root node. 
+ */ + virtual node_future<Ref<tree_cursor_t>> lookup_largest(context_t) = 0; + + /** + * lower_bound + * + * Returns a cursor pointing to the first element in the range [first, last) + * of the sub-tree which does not compare less than the input key. The + * result also denotes whether the pointed key is equal to the input key. + * + * Returns an end cursor with MatchKindBS::NE if: + * - It is an empty root node; + * - Or the input key is larger than all the keys in the sub-tree; + */ + node_future<search_result_t> lower_bound(context_t c, const key_hobj_t& key); + + /** + * insert + * + * Try to insert a key-value pair into the sub-tree formed by this node. + * + * Returns a boolean denoting whether the insertion is successful: + * - If true, the returned cursor points to the inserted element in tree; + * - If false, the returned cursor points to the conflicting element in tree; + */ + node_future<std::pair<Ref<tree_cursor_t>, bool>> insert( + context_t, const key_hobj_t&, const onode_t&); + + /// Recursively collects the statistics of the sub-tree formed by this node + node_future<tree_stats_t> get_tree_stats(context_t); + + /// Returns an ostream containing a dump of all the elements in the node. + std::ostream& dump(std::ostream&) const; + + /// Returns an ostream containing an one-line summary of this node. + std::ostream& dump_brief(std::ostream&) const; + + /// Initializes the tree by allocating an empty root node. + static node_future<> mkfs(context_t, RootNodeTracker&); + + /// Loads the tree root. The tree must be initialized. + static node_future<Ref<Node>> load_root(context_t, RootNodeTracker&); + + // Only for unit test purposes. + void test_make_destructable(context_t, NodeExtentMutable&, Super::URef&&); + virtual node_future<> test_clone_root(context_t, RootNodeTracker&) const = 0; + + protected: + virtual node_future<> test_clone_non_root(context_t, Ref<InternalNode>) const { + ceph_abort("impossible path"); + } + virtual node_future<search_result_t> lower_bound_tracked( + context_t, const key_hobj_t&, MatchHistory&) = 0; + virtual node_future<> do_get_tree_stats(context_t, tree_stats_t&) = 0; + + protected: + Node(NodeImplURef&&); + bool is_root() const { + assert((super && !_parent_info.has_value()) || + (!super && _parent_info.has_value())); + return !_parent_info.has_value(); + } + + // as root + void make_root(context_t c, Super::URef&& _super); + void make_root_new(context_t c, Super::URef&& _super) { + assert(_super->get_root_laddr() == L_ADDR_NULL); + make_root(c, std::move(_super)); + } + void make_root_from(context_t c, Super::URef&& _super, laddr_t from_addr) { + assert(_super->get_root_laddr() == from_addr); + make_root(c, std::move(_super)); + } + void as_root(Super::URef&& _super); + node_future<> upgrade_root(context_t); + + // as child/non-root + template <bool VALIDATE = true> + void as_child(const search_position_t&, Ref<InternalNode>); + struct parent_info_t { + search_position_t position; + Ref<InternalNode> ptr; + }; + const parent_info_t& parent_info() const { return *_parent_info; } + node_future<> insert_parent(context_t, Ref<Node> right_node); + + private: + /** + * Reversed resource management (Node) + * + * Root Node holds a reference to its parent Super class, so its parent + * will be alive as long as this root node is alive. + * + * None-root Node holds a reference to its parent Node, so its parent will + * be alive as long as any of it's children is alive. 
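+ *
+ * As a consequence, keeping any node referenced transitively keeps its whole
+ * chain of ancestors, up to the root and its Super, alive as well.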
+ */ + // as root + Super::URef super; + // as child/non-root + std::optional<parent_info_t> _parent_info; + + private: + static node_future<Ref<Node>> load(context_t, laddr_t, bool expect_is_level_tail); + + NodeImplURef impl; + friend class InternalNode; +}; +inline std::ostream& operator<<(std::ostream& os, const Node& node) { + return node.dump_brief(os); +} + +/** + * InternalNode + * + * A concrete implementation of Node class that represents an internal tree + * node. Its level is always positive and its values are logical block + * addresses to its child nodes. An internal node cannot be empty. + */ +class InternalNode final : public Node { + public: + // public to Node + InternalNode(InternalNodeImpl*, NodeImplURef&&); + ~InternalNode() override { assert(tracked_child_nodes.empty()); } + InternalNode(const InternalNode&) = delete; + InternalNode(InternalNode&&) = delete; + InternalNode& operator=(const InternalNode&) = delete; + InternalNode& operator=(InternalNode&&) = delete; + + node_future<> apply_child_split( + context_t, const search_position_t&, Ref<Node> left, Ref<Node> right); + template <bool VALIDATE> + void do_track_child(Node& child) { + if constexpr (VALIDATE) { + validate_child(child); + } + auto& child_pos = child.parent_info().position; + assert(tracked_child_nodes.find(child_pos) == tracked_child_nodes.end()); + tracked_child_nodes[child_pos] = &child; + } + void do_untrack_child(const Node& child) { + auto& child_pos = child.parent_info().position; + assert(tracked_child_nodes.find(child_pos)->second == &child); + [[maybe_unused]] auto removed = tracked_child_nodes.erase(child_pos); + assert(removed); + } + + static node_future<Ref<InternalNode>> allocate_root( + context_t, level_t, laddr_t, Super::URef&&); + + protected: + node_future<Ref<tree_cursor_t>> lookup_smallest(context_t) override; + node_future<Ref<tree_cursor_t>> lookup_largest(context_t) override; + node_future<search_result_t> lower_bound_tracked( + context_t, const key_hobj_t&, MatchHistory&) override; + node_future<> do_get_tree_stats(context_t, tree_stats_t&) override; + + node_future<> test_clone_root(context_t, RootNodeTracker&) const override; + + private: + // XXX: extract a common tracker for InternalNode to track Node, + // and LeafNode to track tree_cursor_t. + node_future<Ref<Node>> get_or_track_child(context_t, const search_position_t&, laddr_t); + void track_insert( + const search_position_t&, match_stage_t, Ref<Node>, Ref<Node> nxt_child = nullptr); + void replace_track(const search_position_t&, Ref<Node> new_child, Ref<Node> old_child); + void track_split(const search_position_t&, Ref<InternalNode>); + void validate_tracked_children() const { +#ifndef NDEBUG + for (auto& kv : tracked_child_nodes) { + assert(kv.first == kv.second->parent_info().position); + validate_child(*kv.second); + } +#endif + } + void validate_child(const Node& child) const; + + struct fresh_node_t { + Ref<InternalNode> node; + NodeExtentMutable mut; + std::pair<Ref<Node>, NodeExtentMutable> make_pair() { + return std::make_pair(Ref<Node>(node), mut); + } + }; + static node_future<fresh_node_t> allocate(context_t, field_type_t, bool, level_t); + + private: + /** + * Reversed resource management (InternalNode) + * + * InteralNode keeps track of its child nodes which are still alive in + * memory, and their positions will be updated throughout + * insert/split/delete/merge operations of this node. 
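+ *
+ * Note that only raw Node pointers are stored in the tracking map below; the
+ * owning reference goes the other way (each child holds a Ref to this parent
+ * via as_child()), so tracking cannot introduce reference cycles.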
+ */ + // XXX: leverage intrusive data structure to control memory overhead + std::map<search_position_t, Node*> tracked_child_nodes; + InternalNodeImpl* impl; +}; + +/** + * LeafNode + * + * A concrete implementation of Node class that represents a leaf tree node. + * Its level is always 0. A leaf node can only be empty if it is root. + */ +class LeafNode final : public Node { + public: + // public to tree_cursor_t + ~LeafNode() override { assert(tracked_cursors.empty()); } + LeafNode(const LeafNode&) = delete; + LeafNode(LeafNode&&) = delete; + LeafNode& operator=(const LeafNode&) = delete; + LeafNode& operator=(LeafNode&&) = delete; + + bool is_level_tail() const; + layout_version_t get_layout_version() const { return layout_version; } + std::tuple<key_view_t, const onode_t*, layout_version_t> get_kv( + const search_position_t&) const; + template <bool VALIDATE> + void do_track_cursor(tree_cursor_t& cursor) { + if constexpr (VALIDATE) { + validate_cursor(cursor); + } + auto& cursor_pos = cursor.get_position(); + assert(tracked_cursors.find(cursor_pos) == tracked_cursors.end()); + tracked_cursors[cursor_pos] = &cursor; + } + void do_untrack_cursor(tree_cursor_t& cursor) { + validate_cursor(cursor); + auto& cursor_pos = cursor.get_position(); + assert(tracked_cursors.find(cursor_pos)->second == &cursor); + [[maybe_unused]] auto removed = tracked_cursors.erase(cursor_pos); + assert(removed); + } + + protected: + node_future<Ref<tree_cursor_t>> lookup_smallest(context_t) override; + node_future<Ref<tree_cursor_t>> lookup_largest(context_t) override; + node_future<search_result_t> lower_bound_tracked( + context_t, const key_hobj_t&, MatchHistory&) override; + node_future<> do_get_tree_stats(context_t, tree_stats_t&) override; + + node_future<> test_clone_root(context_t, RootNodeTracker&) const override; + + private: + LeafNode(LeafNodeImpl*, NodeImplURef&&); + node_future<Ref<tree_cursor_t>> insert_value( + context_t, const key_hobj_t&, const onode_t&, + const search_position_t&, const MatchHistory&, + match_stat_t mstat); + static node_future<Ref<LeafNode>> allocate_root(context_t, RootNodeTracker&); + friend class Node; + + private: + // XXX: extract a common tracker for InternalNode to track Node, + // and LeafNode to track tree_cursor_t. + Ref<tree_cursor_t> get_or_track_cursor( + const search_position_t&, const key_view_t&, const onode_t*); + Ref<tree_cursor_t> track_insert( + const search_position_t&, match_stage_t, const onode_t*); + void track_split(const search_position_t&, Ref<LeafNode>); + void validate_tracked_cursors() const { +#ifndef NDEBUG + for (auto& kv : tracked_cursors) { + assert(kv.first == kv.second->get_position()); + validate_cursor(*kv.second); + } +#endif + } + void validate_cursor(tree_cursor_t& cursor) const; + // invalidate p_value pointers in tree_cursor_t + void on_layout_change() { ++layout_version; } + + struct fresh_node_t { + Ref<LeafNode> node; + NodeExtentMutable mut; + std::pair<Ref<Node>, NodeExtentMutable> make_pair() { + return std::make_pair(Ref<Node>(node), mut); + } + }; + static node_future<fresh_node_t> allocate(context_t, field_type_t, bool); + + private: + /** + * Reversed resource management (LeafNode) + * + * LeafNode keeps track of the referencing cursors which are still alive in + * memory, and their positions will be updated throughout + * insert/split/delete/merge operations of this node. 
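+ *
+ * As with the child tracking in InternalNode, only raw tree_cursor_t pointers
+ * are stored here; each cursor holds the owning Ref<LeafNode> instead.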
+ */ + // XXX: leverage intrusive data structure to control memory overhead + std::map<search_position_t, tree_cursor_t*> tracked_cursors; + LeafNodeImpl* impl; + layout_version_t layout_version = 0; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h new file mode 100644 index 000000000..d08a99015 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h @@ -0,0 +1,42 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "include/buffer.h" +#include "node_types.h" + +namespace crimson::os::seastore::onode { + +/** + * DeltaRecorder + * + * An abstracted class to encapsulate different implementations to apply delta + * to a specific node layout. + */ +class DeltaRecorder { + public: + virtual ~DeltaRecorder() { + assert(is_empty()); + } + + bool is_empty() const { + return encoded.length() == 0; + } + + ceph::bufferlist get_delta() { + assert(!is_empty()); + return std::move(encoded); + } + + virtual node_type_t node_type() const = 0; + virtual field_type_t field_type() const = 0; + virtual void apply_delta(ceph::bufferlist::const_iterator&, + NodeExtentMutable&) = 0; + + protected: + DeltaRecorder() = default; + ceph::bufferlist encoded; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h new file mode 100644 index 000000000..94782f50d --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h @@ -0,0 +1,413 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/common/log.h" +#include "node_extent_manager.h" +#include "node_delta_recorder.h" +#include "node_layout_replayable.h" + +#ifndef NDEBUG +#include "node_extent_manager/test_replay.h" +#endif + +namespace crimson::os::seastore::onode { + +/** + * DeltaRecorderT + * + * Responsible to encode and decode delta, and apply delta for a specific node + * layout. 
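+ *
+ * A rough lifecycle, simplified from the *_replayable() call sites in
+ * NodeExtentAccessorT and the extent replay path:
+ * - while mutating:  encode_insert()/encode_split()/... append one op to the
+ *   encoded bufferlist;
+ * - at commit time:  get_delta() hands the accumulated bufferlist over as the
+ *   extent's delta;
+ * - during replay:   apply_delta() decodes each op and re-applies it through
+ *   the corresponding NodeLayoutReplayableT calls.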
+ */ +template <typename FieldType, node_type_t NODE_TYPE> +class DeltaRecorderT final: public DeltaRecorder { + enum class op_t : uint8_t { + INSERT, + SPLIT, + SPLIT_INSERT, + UPDATE_CHILD_ADDR, + }; + + public: + using layout_t = NodeLayoutReplayableT<FieldType, NODE_TYPE>; + using node_stage_t = typename layout_t::node_stage_t; + using position_t = typename layout_t::position_t; + using StagedIterator = typename layout_t::StagedIterator; + using value_t = typename layout_t::value_t; + static constexpr auto FIELD_TYPE = layout_t::FIELD_TYPE; + + ~DeltaRecorderT() override = default; + + template <KeyT KT> + void encode_insert( + const full_key_t<KT>& key, + const value_t& value, + const position_t& insert_pos, + const match_stage_t& insert_stage, + const node_offset_t& insert_size) { + ceph::encode(op_t::INSERT, encoded); + encode_key<KT>(key, encoded); + encode_value(value, encoded); + insert_pos.encode(encoded); + ceph::encode(insert_stage, encoded); + ceph::encode(insert_size, encoded); + } + + void encode_split( + const StagedIterator& split_at, + const char* p_node_start) { + ceph::encode(op_t::SPLIT, encoded); + split_at.encode(p_node_start, encoded); + } + + template <KeyT KT> + void encode_split_insert( + const StagedIterator& split_at, + const full_key_t<KT>& key, + const value_t& value, + const position_t& insert_pos, + const match_stage_t& insert_stage, + const node_offset_t& insert_size, + const char* p_node_start) { + ceph::encode(op_t::SPLIT_INSERT, encoded); + split_at.encode(p_node_start, encoded); + encode_key<KT>(key, encoded); + encode_value(value, encoded); + insert_pos.encode(encoded); + ceph::encode(insert_stage, encoded); + ceph::encode(insert_size, encoded); + } + + void encode_update_child_addr( + const laddr_t new_addr, + const laddr_packed_t* p_addr, + const char* p_node_start) { + ceph::encode(op_t::UPDATE_CHILD_ADDR, encoded); + ceph::encode(new_addr, encoded); + int node_offset = reinterpret_cast<const char*>(p_addr) - p_node_start; + assert(node_offset > 0 && node_offset <= NODE_BLOCK_SIZE); + ceph::encode(static_cast<node_offset_t>(node_offset), encoded); + } + + static DeltaRecorderURef create() { + return std::unique_ptr<DeltaRecorder>(new DeltaRecorderT()); + } + + protected: + DeltaRecorderT() = default; + node_type_t node_type() const override { return NODE_TYPE; } + field_type_t field_type() const override { return FIELD_TYPE; } + void apply_delta(ceph::bufferlist::const_iterator& delta, + NodeExtentMutable& node) override { + assert(is_empty()); + node_stage_t stage(reinterpret_cast<const FieldType*>(node.get_read())); + op_t op; + try { + ceph::decode(op, delta); + switch (op) { + case op_t::INSERT: { + logger().debug("OTree::Extent::Replay: decoding INSERT ..."); + auto key = key_hobj_t::decode(delta); + + std::unique_ptr<char[]> value_storage_heap; + value_t value_storage_stack; + auto p_value = decode_value(delta, value_storage_heap, value_storage_stack); + + auto insert_pos = position_t::decode(delta); + match_stage_t insert_stage; + ceph::decode(insert_stage, delta); + node_offset_t insert_size; + ceph::decode(insert_size, delta); + logger().debug("OTree::Extent::Replay: apply {}, {}, " + "insert_pos({}), insert_stage={}, insert_size={}B ...", + key, *p_value, insert_pos, insert_stage, insert_size); + layout_t::template insert<KeyT::HOBJ>( + node, stage, key, *p_value, insert_pos, insert_stage, insert_size); + break; + } + case op_t::SPLIT: { + logger().debug("OTree::Extent::Replay: decoding SPLIT ..."); + auto split_at = 
StagedIterator::decode(stage.p_start(), delta); + logger().debug("OTree::Extent::Replay: apply split_at={} ...", split_at); + layout_t::split(node, stage, split_at); + break; + } + case op_t::SPLIT_INSERT: { + logger().debug("OTree::Extent::Replay: decoding SPLIT_INSERT ..."); + auto split_at = StagedIterator::decode(stage.p_start(), delta); + auto key = key_hobj_t::decode(delta); + + std::unique_ptr<char[]> value_storage_heap; + value_t value_storage_stack; + auto p_value = decode_value(delta, value_storage_heap, value_storage_stack); + + auto insert_pos = position_t::decode(delta); + match_stage_t insert_stage; + ceph::decode(insert_stage, delta); + node_offset_t insert_size; + ceph::decode(insert_size, delta); + logger().debug("OTree::Extent::Replay: apply split_at={}, {}, {}, " + "insert_pos({}), insert_stage={}, insert_size={}B ...", + split_at, key, *p_value, insert_pos, insert_stage, insert_size); + layout_t::template split_insert<KeyT::HOBJ>( + node, stage, split_at, key, *p_value, insert_pos, insert_stage, insert_size); + break; + } + case op_t::UPDATE_CHILD_ADDR: { + logger().debug("OTree::Extent::Replay: decoding UPDATE_CHILD_ADDR ..."); + laddr_t new_addr; + ceph::decode(new_addr, delta); + node_offset_t update_offset; + ceph::decode(update_offset, delta); + auto p_addr = reinterpret_cast<laddr_packed_t*>( + node.get_write() + update_offset); + logger().debug("OTree::Extent::Replay: apply {:#x} to offset {:#x} ...", + new_addr, update_offset); + layout_t::update_child_addr(node, new_addr, p_addr); + break; + } + default: + logger().error("OTree::Extent::Replay: got unknown op {} when replay {:#x}", + op, node.get_laddr()); + ceph_abort(); + } + } catch (buffer::error& e) { + logger().error("OTree::Extent::Replay: got decode error {} when replay {:#x}", + e, node.get_laddr()); + ceph_abort(); + } + } + + private: + static void encode_value(const value_t& value, ceph::bufferlist& encoded) { + if constexpr (std::is_same_v<value_t, laddr_packed_t>) { + // NODE_TYPE == node_type_t::INTERNAL + ceph::encode(value.value, encoded); + } else if constexpr (std::is_same_v<value_t, onode_t>) { + // NODE_TYPE == node_type_t::LEAF + value.encode(encoded); + } else { + ceph_abort("impossible path"); + } + } + + static value_t* decode_value(ceph::bufferlist::const_iterator& delta, + std::unique_ptr<char[]>& value_storage_heap, + value_t& value_storage_stack) { + if constexpr (std::is_same_v<value_t, laddr_packed_t>) { + // NODE_TYPE == node_type_t::INTERNAL + laddr_t value; + ceph::decode(value, delta); + value_storage_stack.value = value; + return &value_storage_stack; + } else if constexpr (std::is_same_v<value_t, onode_t>) { + // NODE_TYPE == node_type_t::LEAF + auto value_config = onode_t::decode(delta); + value_storage_heap = onode_t::allocate(value_config); + return reinterpret_cast<onode_t*>(value_storage_heap.get()); + } else { + ceph_abort("impossible path"); + } + } + + static seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +}; + +/** + * NodeExtentAccessorT + * + * This component is responsible to reference and mutate the underlying + * NodeExtent, record mutation parameters when needed, and apply the recorded + * modifications for a specific node layout. 
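+ *
+ * A simplified usage sketch (assuming `extent` was obtained from a
+ * NodeExtentManager and uses the leaf N0 layout; the real callers are not
+ * shown in this file):
+ *
+ *   NodeExtentAccessorT<node_fields_0_t, node_type_t::LEAF> accessor(extent);
+ *   accessor.prepare_mutate(c);  // duplicates CLEAN/DIRTY extents for write
+ *   accessor.insert_replayable<KeyT::HOBJ>(
+ *       key, value, insert_pos, insert_stage, insert_size);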
+ */ +template <typename FieldType, node_type_t NODE_TYPE> +class NodeExtentAccessorT { + public: + using layout_t = NodeLayoutReplayableT<FieldType, NODE_TYPE>; + using node_stage_t = typename layout_t::node_stage_t; + using position_t = typename layout_t::position_t; + using recorder_t = DeltaRecorderT<FieldType, NODE_TYPE>; + using StagedIterator = typename layout_t::StagedIterator; + using value_t = typename layout_t::value_t; + static constexpr auto FIELD_TYPE = layout_t::FIELD_TYPE; + + NodeExtentAccessorT(NodeExtentRef extent) + : extent{extent}, + node_stage{reinterpret_cast<const FieldType*>(extent->get_read())} { + if (no_recording()) { + mut.emplace(extent->get_mutable()); + assert(extent->get_recorder() == nullptr); + recorder = nullptr; + } else if (needs_recording()) { + mut.emplace(extent->get_mutable()); + auto p_recorder = extent->get_recorder(); + assert(p_recorder != nullptr); + assert(p_recorder->node_type() == NODE_TYPE); + assert(p_recorder->field_type() == FIELD_TYPE); + recorder = static_cast<recorder_t*>(p_recorder); + } else if (needs_mutate()) { + // mut is empty + assert(extent->get_recorder() == nullptr || + extent->get_recorder()->is_empty()); + recorder = nullptr; + } else { + ceph_abort("impossible path"); + } +#ifndef NDEBUG + auto ref_recorder = recorder_t::create(); + test_recorder = static_cast<recorder_t*>(ref_recorder.get()); + test_extent = TestReplayExtent::create( + extent->get_length(), std::move(ref_recorder)); +#endif + } + ~NodeExtentAccessorT() = default; + NodeExtentAccessorT(const NodeExtentAccessorT&) = delete; + NodeExtentAccessorT(NodeExtentAccessorT&&) = delete; + NodeExtentAccessorT& operator=(const NodeExtentAccessorT&) = delete; + NodeExtentAccessorT& operator=(NodeExtentAccessorT&&) = delete; + + const node_stage_t& read() const { return node_stage; } + laddr_t get_laddr() const { return extent->get_laddr(); } + + // must be called before any mutate attempes. + // for the safety of mixed read and mutate, call before read. 
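+ // (No-op when the extent is already INITIAL_WRITE_PENDING or
+ // MUTATION_PENDING; only CLEAN/DIRTY extents are duplicated for write.)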
+ void prepare_mutate(context_t c) { + if (needs_mutate()) { + auto ref_recorder = recorder_t::create(); + recorder = static_cast<recorder_t*>(ref_recorder.get()); + extent = extent->mutate(c, std::move(ref_recorder)); + assert(needs_recording()); + node_stage = node_stage_t( + reinterpret_cast<const FieldType*>(extent->get_read())); + assert(recorder == static_cast<recorder_t*>(extent->get_recorder())); + mut.emplace(extent->get_mutable()); + } + } + + template <KeyT KT> + const value_t* insert_replayable( + const full_key_t<KT>& key, + const value_t& value, + position_t& insert_pos, + match_stage_t& insert_stage, + node_offset_t& insert_size) { + assert(!needs_mutate()); + if (needs_recording()) { + recorder->template encode_insert<KT>( + key, value, insert_pos, insert_stage, insert_size); + } +#ifndef NDEBUG + test_extent->prepare_replay(extent); + test_recorder->template encode_insert<KT>( + key, value, insert_pos, insert_stage, insert_size); +#endif + auto ret = layout_t::template insert<KT>( + *mut, read(), key, value, + insert_pos, insert_stage, insert_size); +#ifndef NDEBUG + test_extent->replay_and_verify(extent); +#endif + return ret; + } + + void split_replayable(StagedIterator& split_at) { + assert(!needs_mutate()); + if (needs_recording()) { + recorder->encode_split(split_at, read().p_start()); + } +#ifndef NDEBUG + test_extent->prepare_replay(extent); + test_recorder->template encode_split(split_at, read().p_start()); +#endif + layout_t::split(*mut, read(), split_at); +#ifndef NDEBUG + test_extent->replay_and_verify(extent); +#endif + } + + template <KeyT KT> + const value_t* split_insert_replayable( + StagedIterator& split_at, + const full_key_t<KT>& key, + const value_t& value, + position_t& insert_pos, + match_stage_t& insert_stage, + node_offset_t& insert_size) { + assert(!needs_mutate()); + if (needs_recording()) { + recorder->template encode_split_insert<KT>( + split_at, key, value, insert_pos, insert_stage, insert_size, + read().p_start()); + } +#ifndef NDEBUG + test_extent->prepare_replay(extent); + test_recorder->template encode_split_insert<KT>( + split_at, key, value, insert_pos, insert_stage, insert_size, + read().p_start()); +#endif + auto ret = layout_t::template split_insert<KT>( + *mut, read(), split_at, key, value, + insert_pos, insert_stage, insert_size); +#ifndef NDEBUG + test_extent->replay_and_verify(extent); +#endif + return ret; + } + + void update_child_addr_replayable( + const laddr_t new_addr, laddr_packed_t* p_addr) { + assert(!needs_mutate()); + if (needs_recording()) { + recorder->encode_update_child_addr(new_addr, p_addr, read().p_start()); + } +#ifndef NDEBUG + test_extent->prepare_replay(extent); + test_recorder->encode_update_child_addr(new_addr, p_addr, read().p_start()); +#endif + layout_t::update_child_addr(*mut, new_addr, p_addr); +#ifndef NDEBUG + test_extent->replay_and_verify(extent); +#endif + } + + void test_copy_to(NodeExtentMutable& to) const { + assert(extent->get_length() == to.get_length()); + std::memcpy(to.get_write(), extent->get_read(), extent->get_length()); + } + + private: + /** + * Possible states with CachedExtent::extent_state_t: + * INITIAL_WRITE_PENDING -- can mutate, no recording + * MUTATION_PENDING -- can mutate, needs recording + * CLEAN/DIRTY -- pending mutate + * INVALID -- impossible + */ + bool no_recording() const { + return extent->is_initial_pending(); + } + bool needs_recording() const { + return extent->is_mutation_pending(); + } + bool needs_mutate() const { + assert(extent->is_valid()); + return 
!extent->is_pending(); + } + + NodeExtentRef extent; + node_stage_t node_stage; + std::optional<NodeExtentMutable> mut; + // owned by extent + recorder_t* recorder; + +#ifndef NDEBUG + // verify record replay using a different memory block + TestReplayExtent::Ref test_extent; + recorder_t* test_recorder; +#endif +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.cc new file mode 100644 index 000000000..bd22d4b67 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.cc @@ -0,0 +1,35 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "node_extent_manager.h" + +#include "node_extent_manager/dummy.h" +#include "node_extent_manager/seastore.h" +#include "stages/node_stage_layout.h" + +namespace crimson::os::seastore::onode { + +std::pair<node_type_t, field_type_t> NodeExtent::get_types() const { + const auto header = reinterpret_cast<const node_header_t*>(get_read()); + auto node_type = header->get_node_type(); + auto field_type = header->get_field_type(); + if (!field_type.has_value()) { + throw std::runtime_error("load failed: bad field type"); + } + return {node_type, *field_type}; +} + +NodeExtentManagerURef NodeExtentManager::create_dummy(bool is_sync) { + if (is_sync) { + return NodeExtentManagerURef(new DummyNodeExtentManager<true>()); + } else { + return NodeExtentManagerURef(new DummyNodeExtentManager<false>()); + } +} + +NodeExtentManagerURef NodeExtentManager::create_seastore( + TransactionManager& tm, laddr_t min_laddr) { + return NodeExtentManagerURef(new SeastoreNodeExtentManager(tm, min_laddr)); +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h new file mode 100644 index 000000000..77b230e03 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h @@ -0,0 +1,86 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/common/type_helpers.h" +#include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/transaction_manager.h" + +#include "fwd.h" +#include "super.h" +#include "node_extent_mutable.h" +#include "node_types.h" + +/** + * node_extent_manager.h + * + * Contains general interfaces for different backends (Dummy and Seastore). + */ + +namespace crimson::os::seastore::onode { + +using crimson::os::seastore::LogicalCachedExtent; +class NodeExtent : public LogicalCachedExtent { + public: + virtual ~NodeExtent() = default; + std::pair<node_type_t, field_type_t> get_types() const; + const char* get_read() const { + return get_bptr().c_str(); + } + NodeExtentMutable get_mutable() { + assert(is_pending()); + return do_get_mutable(); + } + + virtual DeltaRecorder* get_recorder() const = 0; + virtual NodeExtentRef mutate(context_t, DeltaRecorderURef&&) = 0; + + protected: + template <typename... T> + NodeExtent(T&&... t) : LogicalCachedExtent(std::forward<T>(t)...) 
{} + + NodeExtentMutable do_get_mutable() { + return NodeExtentMutable(*this); + } + + /** + * Abstracted interfaces to implement: + * - CacheExtent::duplicate_for_write() -> CachedExtentRef + * - CacheExtent::get_type() -> extent_types_t + * - CacheExtent::get_delta() -> ceph::bufferlist + * - LogicalCachedExtent::apply_delta(const ceph::bufferlist) -> void + */ + + private: + friend class NodeExtentMutable; +}; + +using crimson::os::seastore::TransactionManager; +class NodeExtentManager { + public: + virtual ~NodeExtentManager() = default; + using tm_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::enoent, + crimson::ct_error::erange>; + template <class ValueT=void> + using tm_future = tm_ertr::future<ValueT>; + + virtual bool is_read_isolated() const = 0; + virtual tm_future<NodeExtentRef> read_extent( + Transaction&, laddr_t, extent_len_t) = 0; + virtual tm_future<NodeExtentRef> alloc_extent(Transaction&, extent_len_t) = 0; + virtual tm_future<Super::URef> get_super(Transaction&, RootNodeTracker&) = 0; + virtual std::ostream& print(std::ostream& os) const = 0; + + static NodeExtentManagerURef create_dummy(bool is_sync); + static NodeExtentManagerURef create_seastore( + TransactionManager& tm, laddr_t min_laddr = L_ADDR_MIN); +}; +inline std::ostream& operator<<(std::ostream& os, const NodeExtentManager& nm) { + return nm.print(os); +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h new file mode 100644 index 000000000..830ea4a7d --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h @@ -0,0 +1,156 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <chrono> +#include <seastar/core/sleep.hh> + +#include "include/buffer_raw.h" + +#include "crimson/common/log.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h" + +/** + * dummy.h + * + * Dummy backend implementations for test purposes. 
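+ *
+ * A sketch of how a test would typically obtain one (is_sync=false adds a
+ * 1us sleep per operation to exercise the asynchronous paths):
+ *
+ *   auto sync_nm  = NodeExtentManager::create_dummy(true);
+ *   auto async_nm = NodeExtentManager::create_dummy(false);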
+ */ + +namespace crimson::os::seastore::onode { + +class DummySuper final: public Super { + public: + DummySuper(Transaction& t, RootNodeTracker& tracker, laddr_t* p_root_laddr) + : Super(t, tracker), p_root_laddr{p_root_laddr} {} + ~DummySuper() override = default; + protected: + laddr_t get_root_laddr() const override { return *p_root_laddr; } + void write_root_laddr(context_t, laddr_t addr) override { + logger().info("OTree::Dummy: update root {:#x} ...", addr); + *p_root_laddr = addr; + } + private: + static seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } + laddr_t* p_root_laddr; +}; + +class DummyNodeExtent final: public NodeExtent { + public: + DummyNodeExtent(ceph::bufferptr &&ptr) : NodeExtent(std::move(ptr)) { + state = extent_state_t::INITIAL_WRITE_PENDING; + } + ~DummyNodeExtent() override = default; + protected: + NodeExtentRef mutate(context_t, DeltaRecorderURef&&) override { + ceph_abort("impossible path"); } + DeltaRecorder* get_recorder() const override { + return nullptr; } + CachedExtentRef duplicate_for_write() override { + ceph_abort("impossible path"); } + extent_types_t get_type() const override { + return extent_types_t::TEST_BLOCK; } + ceph::bufferlist get_delta() override { + ceph_abort("impossible path"); } + void apply_delta(const ceph::bufferlist&) override { + ceph_abort("impossible path"); } +}; + +template <bool SYNC> +class DummyNodeExtentManager final: public NodeExtentManager { + static constexpr size_t ALIGNMENT = 4096; + public: + ~DummyNodeExtentManager() override = default; + protected: + bool is_read_isolated() const override { return false; } + + tm_future<NodeExtentRef> read_extent( + Transaction& t, laddr_t addr, extent_len_t len) override { + logger().trace("OTree::Dummy: reading {}B at {:#x} ...", len, addr); + if constexpr (SYNC) { + return read_extent_sync(t, addr, len); + } else { + using namespace std::chrono_literals; + return seastar::sleep(1us).then([this, &t, addr, len] { + return read_extent_sync(t, addr, len); + }); + } + } + + tm_future<NodeExtentRef> alloc_extent( + Transaction& t, extent_len_t len) override { + logger().trace("OTree::Dummy: allocating {}B ...", len); + if constexpr (SYNC) { + return alloc_extent_sync(t, len); + } else { + using namespace std::chrono_literals; + return seastar::sleep(1us).then([this, &t, len] { + return alloc_extent_sync(t, len); + }); + } + } + + tm_future<Super::URef> get_super( + Transaction& t, RootNodeTracker& tracker) override { + logger().trace("OTree::Dummy: get root ..."); + if constexpr (SYNC) { + return get_super_sync(t, tracker); + } else { + using namespace std::chrono_literals; + return seastar::sleep(1us).then([this, &t, &tracker] { + return get_super_sync(t, tracker); + }); + } + } + + std::ostream& print(std::ostream& os) const override { + return os << "DummyNodeExtentManager(sync=" << SYNC << ")"; + } + + private: + tm_future<NodeExtentRef> read_extent_sync( + Transaction& t, laddr_t addr, extent_len_t len) { + auto iter = allocate_map.find(addr); + assert(iter != allocate_map.end()); + auto extent = iter->second; + logger().trace("OTree::Dummy: read {}B at {:#x}", + extent->get_length(), extent->get_laddr()); + assert(extent->get_laddr() == addr); + assert(extent->get_length() == len); + return tm_ertr::make_ready_future<NodeExtentRef>(extent); + } + + tm_future<NodeExtentRef> alloc_extent_sync( + Transaction& t, extent_len_t len) { + assert(len % ALIGNMENT == 0); + auto r = ceph::buffer::create_aligned(len, ALIGNMENT); + auto addr = 
reinterpret_cast<laddr_t>(r->get_data()); + auto bp = ceph::bufferptr(std::move(r)); + auto extent = Ref<DummyNodeExtent>(new DummyNodeExtent(std::move(bp))); + extent->set_laddr(addr); + assert(allocate_map.find(extent->get_laddr()) == allocate_map.end()); + allocate_map.insert({extent->get_laddr(), extent}); + logger().debug("OTree::Dummy: allocated {}B at {:#x}", + extent->get_length(), extent->get_laddr()); + assert(extent->get_length() == len); + return tm_ertr::make_ready_future<NodeExtentRef>(extent); + } + + tm_future<Super::URef> get_super_sync( + Transaction& t, RootNodeTracker& tracker) { + logger().debug("OTree::Dummy: got root {:#x}", root_laddr); + return tm_ertr::make_ready_future<Super::URef>( + Super::URef(new DummySuper(t, tracker, &root_laddr))); + } + + static seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } + + std::map<laddr_t, Ref<DummyNodeExtent>> allocate_map; + laddr_t root_laddr = L_ADDR_NULL; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.cc new file mode 100644 index 000000000..8d88485bf --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.cc @@ -0,0 +1,88 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "seastore.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h" + +namespace { + +seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); +} + +} + +namespace crimson::os::seastore::onode { + +static DeltaRecorderURef create_recorder( + node_type_t node_type, field_type_t field_type) { + if (node_type == node_type_t::LEAF) { + if (field_type == field_type_t::N0) { + return DeltaRecorderT<node_fields_0_t, node_type_t::LEAF>::create(); + } else if (field_type == field_type_t::N1) { + return DeltaRecorderT<node_fields_1_t, node_type_t::LEAF>::create(); + } else if (field_type == field_type_t::N2) { + return DeltaRecorderT<node_fields_2_t, node_type_t::LEAF>::create(); + } else if (field_type == field_type_t::N3) { + return DeltaRecorderT<leaf_fields_3_t, node_type_t::LEAF>::create(); + } else { + ceph_abort("impossible path"); + } + } else if (node_type == node_type_t::INTERNAL) { + if (field_type == field_type_t::N0) { + return DeltaRecorderT<node_fields_0_t, node_type_t::INTERNAL>::create(); + } else if (field_type == field_type_t::N1) { + return DeltaRecorderT<node_fields_1_t, node_type_t::INTERNAL>::create(); + } else if (field_type == field_type_t::N2) { + return DeltaRecorderT<node_fields_2_t, node_type_t::INTERNAL>::create(); + } else if (field_type == field_type_t::N3) { + return DeltaRecorderT<internal_fields_3_t, node_type_t::INTERNAL>::create(); + } else { + ceph_abort("impossible path"); + } + } else { + ceph_abort("impossible path"); + } +} + +void SeastoreSuper::write_root_laddr(context_t c, laddr_t addr) { + logger().info("OTree::Seastore: update root {:#x} ...", addr); + root_addr = addr; + auto nm = static_cast<SeastoreNodeExtentManager*>(&c.nm); + nm->get_tm().write_onode_root(c.t, addr); +} + +NodeExtentRef SeastoreNodeExtent::mutate( + context_t c, DeltaRecorderURef&& _recorder) { + logger().debug("OTree::Seastore: mutate {:#x} ...", get_laddr()); + auto nm = static_cast<SeastoreNodeExtentManager*>(&c.nm); + auto 
extent = nm->get_tm().get_mutable_extent(c.t, this); + auto ret = extent->cast<SeastoreNodeExtent>(); + assert(!ret->recorder || ret->recorder->is_empty()); + ret->recorder = std::move(_recorder); + return ret; +} + +void SeastoreNodeExtent::apply_delta(const ceph::bufferlist& bl) { + logger().debug("OTree::Seastore: replay {:#x} ...", get_laddr()); + if (!recorder) { + auto [node_type, field_type] = get_types(); + recorder = create_recorder(node_type, field_type); + } else { +#ifndef NDEBUG + auto [node_type, field_type] = get_types(); + assert(recorder->node_type() == node_type); + assert(recorder->field_type() == field_type); +#endif + } + assert(is_clean()); + auto node = do_get_mutable(); + auto p = bl.cbegin(); + while (p != bl.end()) { + recorder->apply_delta(p, node); + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h new file mode 100644 index 000000000..f80b99fab --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h @@ -0,0 +1,126 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/common/log.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h" + +/** + * seastore.h + * + * Seastore backend implementations. + */ + +namespace crimson::os::seastore::onode { + +class SeastoreSuper final: public Super { + public: + SeastoreSuper(Transaction& t, RootNodeTracker& tracker, + laddr_t root_addr, TransactionManager& tm) + : Super(t, tracker), root_addr{root_addr}, tm{tm} {} + ~SeastoreSuper() override = default; + protected: + laddr_t get_root_laddr() const override { + return root_addr; + } + void write_root_laddr(context_t c, laddr_t addr) override; + private: + static seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } + laddr_t root_addr; + TransactionManager& tm; +}; + +class SeastoreNodeExtent final: public NodeExtent { + public: + SeastoreNodeExtent(ceph::bufferptr &&ptr) + : NodeExtent(std::move(ptr)) {} + SeastoreNodeExtent(const SeastoreNodeExtent& other) + : NodeExtent(other) {} + ~SeastoreNodeExtent() override = default; + protected: + NodeExtentRef mutate(context_t, DeltaRecorderURef&&) override; + + DeltaRecorder* get_recorder() const override { + return recorder.get(); + } + + CachedExtentRef duplicate_for_write() override { + return CachedExtentRef(new SeastoreNodeExtent(*this)); + } + extent_types_t get_type() const override { + return extent_types_t::ONODE_BLOCK_STAGED; + } + ceph::bufferlist get_delta() override { + assert(recorder); + return recorder->get_delta(); + } + void apply_delta(const ceph::bufferlist&) override; + private: + DeltaRecorderURef recorder; +}; + +class SeastoreNodeExtentManager final: public NodeExtentManager { + public: + SeastoreNodeExtentManager(TransactionManager& tm, laddr_t min) + : tm{tm}, addr_min{min} {}; + ~SeastoreNodeExtentManager() override = default; + TransactionManager& get_tm() { return tm; } + protected: + bool is_read_isolated() const override { return true; } + + tm_future<NodeExtentRef> read_extent( + Transaction& t, laddr_t addr, extent_len_t len) override { + logger().debug("OTree::Seastore: reading {}B at {:#x} ...", len, addr); + return tm.read_extents<SeastoreNodeExtent>(t, addr, len + ).safe_then([addr, 
len](auto&& extents) { + assert(extents.size() == 1); + [[maybe_unused]] auto [laddr, e] = extents.front(); + logger().trace("OTree::Seastore: read {}B at {:#x}", + e->get_length(), e->get_laddr()); + assert(e->get_laddr() == addr); + assert(e->get_length() == len); + std::ignore = addr; + std::ignore = len; + return NodeExtentRef(e); + }); + } + + tm_future<NodeExtentRef> alloc_extent( + Transaction& t, extent_len_t len) override { + logger().debug("OTree::Seastore: allocating {}B ...", len); + return tm.alloc_extent<SeastoreNodeExtent>(t, addr_min, len + ).safe_then([len](auto extent) { + logger().debug("OTree::Seastore: allocated {}B at {:#x}", + extent->get_length(), extent->get_laddr()); + assert(extent->get_length() == len); + std::ignore = len; + return NodeExtentRef(extent); + }); + } + + tm_future<Super::URef> get_super( + Transaction& t, RootNodeTracker& tracker) override { + logger().trace("OTree::Seastore: get root ..."); + return tm.read_onode_root(t).safe_then([this, &t, &tracker](auto root_addr) { + logger().debug("OTree::Seastore: got root {:#x}", root_addr); + return Super::URef(new SeastoreSuper(t, tracker, root_addr, tm)); + }); + } + + std::ostream& print(std::ostream& os) const override { + return os << "SeastoreNodeExtentManager"; + } + + private: + static seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } + TransactionManager& tm; + const laddr_t addr_min; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h new file mode 100644 index 000000000..240c88932 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h @@ -0,0 +1,67 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h" + +/** test_replay.h + * + * A special version of NodeExtent to help verify delta encode, decode and + * replay in recorder_t under debug build. 
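+ *
+ * The intended flow, inferred from the methods below: create() an extent of
+ * the same length together with the DeltaRecorder that captured the
+ * mutations, prepare_replay() to copy in the pre-mutation content, then
+ * replay_and_verify() to apply the recorded deltas and assert that the
+ * result is byte-identical to the normally mutated extent.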
+ */ + +namespace crimson::os::seastore::onode { + +class TestReplayExtent final: public NodeExtent { + public: + using Ref = crimson::os::seastore::TCachedExtentRef<TestReplayExtent>; + + void prepare_replay(NodeExtentRef from_extent) { + assert(get_length() == from_extent->get_length()); + auto mut = do_get_mutable(); + std::memcpy(mut.get_write(), from_extent->get_read(), get_length()); + } + + void replay_and_verify(NodeExtentRef replayed_extent) { + assert(get_length() == replayed_extent->get_length()); + auto mut = do_get_mutable(); + auto bl = recorder->get_delta(); + assert(bl.length()); + auto p = bl.cbegin(); + recorder->apply_delta(p, mut); + assert(p == bl.end()); + auto cmp = std::memcmp(get_read(), replayed_extent->get_read(), get_length()); + ceph_assert(cmp == 0 && "replay mismatch!"); + } + + static Ref create(extent_len_t length, DeltaRecorderURef&& recorder) { + auto r = ceph::buffer::create_aligned(length, 4096); + auto bp = ceph::bufferptr(std::move(r)); + return new TestReplayExtent(std::move(bp), std::move(recorder)); + } + + protected: + NodeExtentRef mutate(context_t, DeltaRecorderURef&&) override { + ceph_abort("impossible path"); } + DeltaRecorder* get_recorder() const override { + ceph_abort("impossible path"); } + CachedExtentRef duplicate_for_write() override { + ceph_abort("impossible path"); } + extent_types_t get_type() const override { + return extent_types_t::TEST_BLOCK; } + ceph::bufferlist get_delta() override { + ceph_abort("impossible path"); } + void apply_delta(const ceph::bufferlist&) override { + ceph_abort("impossible path"); } + + private: + TestReplayExtent(ceph::bufferptr&& ptr, DeltaRecorderURef&& recorder) + : NodeExtent(std::move(ptr)), recorder(std::move(recorder)) { + state = extent_state_t::MUTATION_PENDING; + } + DeltaRecorderURef recorder; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.cc new file mode 100644 index 000000000..048c4000d --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.cc @@ -0,0 +1,39 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "node_extent_mutable.h" +#include "node_extent_manager.h" + +namespace crimson::os::seastore::onode { + +NodeExtentMutable::NodeExtentMutable(NodeExtent& extent) + : extent{extent} { + assert(extent.is_pending() || // during mutation + extent.is_clean()); // during replay +} + +const char* NodeExtentMutable::get_read() const { + assert(extent.is_pending() || // during mutation + extent.is_clean()); // during replay + return extent.get_bptr().c_str(); +} + +char* NodeExtentMutable::get_write() { + assert(extent.is_pending() || // during mutation + extent.is_clean()); // during replay + return extent.get_bptr().c_str(); +} + +extent_len_t NodeExtentMutable::get_length() const { + return extent.get_length(); +} + +laddr_t NodeExtentMutable::get_laddr() const { + return extent.get_laddr(); +} + +const char* NodeExtentMutable::buf_upper_bound() const { + return get_read() + get_length(); +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h new file mode 100644 index 000000000..52f10a013 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h @@ -0,0 +1,80 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; 
indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include <cstring> + +#include "fwd.h" + +#pragma once + +namespace crimson::os::seastore::onode { + +class NodeExtent; + +/** + * NodeExtentMutable + * + * A thin wrapper of NodeExtent to make sure that only the newly allocated + * or the duplicated NodeExtent is mutable, and the memory modifications are + * safe within the extent range. + */ +class NodeExtentMutable { + public: + void copy_in_absolute(void* dst, const void* src, extent_len_t len) { + assert((char*)dst >= get_write()); + assert((char*)dst + len <= buf_upper_bound()); + std::memcpy(dst, src, len); + } + template <typename T> + void copy_in_absolute(void* dst, const T& src) { + copy_in_absolute(dst, &src, sizeof(T)); + } + + const void* copy_in_relative( + extent_len_t dst_offset, const void* src, extent_len_t len) { + auto dst = get_write() + dst_offset; + copy_in_absolute(dst, src, len); + return dst; + } + template <typename T> + const T* copy_in_relative( + extent_len_t dst_offset, const T& src) { + auto dst = copy_in_relative(dst_offset, &src, sizeof(T)); + return static_cast<const T*>(dst); + } + + void shift_absolute(const void* src, extent_len_t len, int offset) { + assert((const char*)src >= get_write()); + assert((const char*)src + len <= buf_upper_bound()); + char* to = (char*)src + offset; + assert(to >= get_write()); + assert(to + len <= buf_upper_bound()); + if (len != 0) { + std::memmove(to, src, len); + } + } + void shift_relative(extent_len_t src_offset, extent_len_t len, int offset) { + shift_absolute(get_write() + src_offset, len, offset); + } + + template <typename T> + void validate_inplace_update(const T& updated) { + assert((const char*)&updated >= get_write()); + assert((const char*)&updated + sizeof(T) <= buf_upper_bound()); + } + + const char* get_read() const; + char* get_write(); + extent_len_t get_length() const; + laddr_t get_laddr() const; + + private: + explicit NodeExtentMutable(NodeExtent&); + const char* buf_upper_bound() const; + + NodeExtent& extent; + + friend class NodeExtent; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.cc new file mode 100644 index 000000000..59d792b1a --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.cc @@ -0,0 +1,76 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "node_impl.h" +#include "node_layout.h" + +namespace crimson::os::seastore::onode { + +#ifdef UNIT_TESTS_BUILT +last_split_info_t last_split = {}; +#endif + +// XXX: branchless allocation +InternalNodeImpl::alloc_ertr::future<InternalNodeImpl::fresh_impl_t> +InternalNodeImpl::allocate( + context_t c, field_type_t type, bool is_level_tail, level_t level) { + if (type == field_type_t::N0) { + return InternalNode0::allocate(c, is_level_tail, level); + } else if (type == field_type_t::N1) { + return InternalNode1::allocate(c, is_level_tail, level); + } else if (type == field_type_t::N2) { + return InternalNode2::allocate(c, is_level_tail, level); + } else if (type == field_type_t::N3) { + return InternalNode3::allocate(c, is_level_tail, level); + } else { + ceph_abort("impossible path"); + } +} + +LeafNodeImpl::alloc_ertr::future<LeafNodeImpl::fresh_impl_t> +LeafNodeImpl::allocate( + context_t c, field_type_t type, bool is_level_tail) { + if (type == field_type_t::N0) { + return LeafNode0::allocate(c, is_level_tail, 0); + } else if (type == 
field_type_t::N1) { + return LeafNode1::allocate(c, is_level_tail, 0); + } else if (type == field_type_t::N2) { + return LeafNode2::allocate(c, is_level_tail, 0); + } else if (type == field_type_t::N3) { + return LeafNode3::allocate(c, is_level_tail, 0); + } else { + ceph_abort("impossible path"); + } +} + +InternalNodeImplURef InternalNodeImpl::load( + NodeExtentRef extent, field_type_t type, bool expect_is_level_tail) { + if (type == field_type_t::N0) { + return InternalNode0::load(extent, expect_is_level_tail); + } else if (type == field_type_t::N1) { + return InternalNode1::load(extent, expect_is_level_tail); + } else if (type == field_type_t::N2) { + return InternalNode2::load(extent, expect_is_level_tail); + } else if (type == field_type_t::N3) { + return InternalNode3::load(extent, expect_is_level_tail); + } else { + ceph_abort("impossible path"); + } +} + +LeafNodeImplURef LeafNodeImpl::load( + NodeExtentRef extent, field_type_t type, bool expect_is_level_tail) { + if (type == field_type_t::N0) { + return LeafNode0::load(extent, expect_is_level_tail); + } else if (type == field_type_t::N1) { + return LeafNode1::load(extent, expect_is_level_tail); + } else if (type == field_type_t::N2) { + return LeafNode2::load(extent, expect_is_level_tail); + } else if (type == field_type_t::N3) { + return LeafNode3::load(extent, expect_is_level_tail); + } else { + ceph_abort("impossible path"); + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h new file mode 100644 index 000000000..3267cda2b --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h @@ -0,0 +1,197 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <ostream> + +#include "node_extent_mutable.h" +#include "node_types.h" +#include "stages/stage_types.h" + +namespace crimson::os::seastore::onode { + +#ifdef UNIT_TESTS_BUILT +enum class InsertType { BEGIN, LAST, MID }; +struct split_expectation_t { + match_stage_t split_stage; + match_stage_t insert_stage; + bool is_insert_left; + InsertType insert_type; +}; +struct last_split_info_t { + search_position_t split_pos; + match_stage_t insert_stage; + bool is_insert_left; + InsertType insert_type; + bool match(const split_expectation_t& e) const { + match_stage_t split_stage; + if (split_pos.nxt.nxt.index == 0) { + if (split_pos.nxt.index == 0) { + split_stage = 2; + } else { + split_stage = 1; + } + } else { + split_stage = 0; + } + return split_stage == e.split_stage && + insert_stage == e.insert_stage && + is_insert_left == e.is_insert_left && + insert_type == e.insert_type; + } + bool match_split_pos(const search_position_t& pos) const { + return split_pos == pos; + } +}; +extern last_split_info_t last_split; +#endif + +struct key_hobj_t; +struct key_view_t; +class NodeExtentMutable; + +/** + * NodeImpl + * + * Hides type specific node layout implementations for Node. 
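+ *
+ * The concrete implementations for every (field_type_t, node_type_t)
+ * combination are provided by NodeLayoutT in node_layout.h.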
+ */ +class NodeImpl { + public: + using alloc_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::enoent, + crimson::ct_error::erange>; + virtual ~NodeImpl() = default; + + virtual field_type_t field_type() const = 0; + virtual laddr_t laddr() const = 0; + virtual void prepare_mutate(context_t) = 0; + virtual bool is_level_tail() const = 0; + virtual bool is_empty() const = 0; + virtual level_t level() const = 0; + virtual node_offset_t free_size() const = 0; + virtual key_view_t get_key_view(const search_position_t&) const = 0; + virtual key_view_t get_largest_key_view() const = 0; + virtual void next_position(search_position_t&) const = 0; + + virtual node_stats_t get_stats() const = 0; + virtual std::ostream& dump(std::ostream&) const = 0; + virtual std::ostream& dump_brief(std::ostream&) const = 0; + virtual void validate_layout() const = 0; + + virtual void test_copy_to(NodeExtentMutable&) const = 0; + virtual void test_set_tail(NodeExtentMutable&) = 0; + + protected: + NodeImpl() = default; +}; + +/** + * InternalNodeImpl + * + * Hides type specific node layout implementations for InternalNode. + */ +class InternalNodeImpl : public NodeImpl { + public: + struct internal_marker_t {}; + virtual ~InternalNodeImpl() = default; + + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual const laddr_packed_t* get_p_value( + const search_position_t&, + key_view_t* = nullptr, internal_marker_t = {}) const { + ceph_abort("impossible path"); + } + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual lookup_result_t<node_type_t::INTERNAL> lower_bound( + const key_hobj_t&, MatchHistory&, + key_view_t* = nullptr, internal_marker_t = {}) const { + ceph_abort("impossible path"); + } + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual const laddr_packed_t* insert( + const key_view_t&, const laddr_packed_t&, search_position_t&, match_stage_t&, node_offset_t&) { + ceph_abort("impossible path"); + } + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual std::tuple<search_position_t, bool, const laddr_packed_t*> split_insert( + NodeExtentMutable&, NodeImpl&, const key_view_t&, const laddr_packed_t&, + search_position_t&, match_stage_t&, node_offset_t&) { + ceph_abort("impossible path"); + } + + virtual void replace_child_addr(const search_position_t&, laddr_t dst, laddr_t src) = 0; + virtual std::tuple<match_stage_t, node_offset_t> evaluate_insert( + const key_view_t&, const laddr_t&, search_position_t&) const = 0; + + struct fresh_impl_t { + InternalNodeImplURef impl; + NodeExtentMutable mut; + std::pair<NodeImplURef, NodeExtentMutable> make_pair() { + return {std::move(impl), mut}; + } + }; + static alloc_ertr::future<fresh_impl_t> allocate(context_t, field_type_t, bool, level_t); + static InternalNodeImplURef load(NodeExtentRef, field_type_t, bool); + + protected: + InternalNodeImpl() = default; +}; + +/** + * LeafNodeImpl + * + * Hides type specific node layout implementations for LeafNode. 
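+ *
+ * Unlike InternalNodeImpl, whose values are child addresses (laddr_packed_t),
+ * a leaf's values are onode_t payloads, as the get_p_value()/insert()/
+ * split_insert() signatures below reflect.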
+ */ +class LeafNodeImpl : public NodeImpl { + public: + struct leaf_marker_t {}; + virtual ~LeafNodeImpl() = default; + + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual const onode_t* get_p_value( + const search_position_t&, + key_view_t* = nullptr, leaf_marker_t={}) const { + ceph_abort("impossible path"); + } + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual lookup_result_t<node_type_t::LEAF> lower_bound( + const key_hobj_t&, MatchHistory&, + key_view_t* = nullptr, leaf_marker_t = {}) const { + ceph_abort("impossible path"); + } + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual const onode_t* insert( + const key_hobj_t&, const onode_t&, search_position_t&, match_stage_t&, node_offset_t&) { + ceph_abort("impossible path"); + } + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual std::tuple<search_position_t, bool, const onode_t*> split_insert( + NodeExtentMutable&, NodeImpl&, const key_hobj_t&, const onode_t&, + search_position_t&, match_stage_t&, node_offset_t&) { + ceph_abort("impossible path"); + } + + virtual void get_largest_slot( + search_position_t&, key_view_t&, const onode_t**) const = 0; + virtual std::tuple<match_stage_t, node_offset_t> evaluate_insert( + const key_hobj_t&, const onode_t&, + const MatchHistory&, match_stat_t, search_position_t&) const = 0; + + struct fresh_impl_t { + LeafNodeImplURef impl; + NodeExtentMutable mut; + std::pair<NodeImplURef, NodeExtentMutable> make_pair() { + return {std::move(impl), mut}; + } + }; + static alloc_ertr::future<fresh_impl_t> allocate(context_t, field_type_t, bool); + static LeafNodeImplURef load(NodeExtentRef, field_type_t, bool); + + protected: + LeafNodeImpl() = default; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h new file mode 100644 index 000000000..916d17424 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h @@ -0,0 +1,613 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <ostream> +#include <sstream> + +#include "common/likely.h" +#include "crimson/common/log.h" +#include "node_extent_accessor.h" +#include "node_impl.h" +#include "stages/node_stage_layout.h" + +namespace crimson::os::seastore::onode { + +template <node_type_t NODE_TYPE> struct insert_key_type; +template <> struct insert_key_type<node_type_t::INTERNAL> { + static constexpr auto type = KeyT::VIEW; }; +template <> struct insert_key_type<node_type_t::LEAF> { + static constexpr auto type = KeyT::HOBJ; }; + +template <node_type_t NODE_TYPE> struct node_impl_type; +template <> struct node_impl_type<node_type_t::INTERNAL> { + using type = InternalNodeImpl; }; +template <> struct node_impl_type<node_type_t::LEAF> { + using type = LeafNodeImpl; }; + +template <node_type_t NODE_TYPE> struct node_marker_type; +template <> struct node_marker_type<node_type_t::INTERNAL> { + using type = InternalNodeImpl::internal_marker_t; }; +template <> struct node_marker_type<node_type_t::LEAF> { + using type = LeafNodeImpl::leaf_marker_t; }; + +/** + * NodeLayoutT + * + * Contains templated and concrete implementations for both InternalNodeImpl + * and LeafNodeImpl under a specific node layout. 
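+ *
+ * The eight concrete layouts are the aliases at the bottom of this header,
+ * e.g. InternalNode0 = NodeLayoutT<node_fields_0_t, node_type_t::INTERNAL>
+ * and LeafNode3 = NodeLayoutT<leaf_fields_3_t, node_type_t::LEAF>.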
+ */ +template <typename FieldType, node_type_t NODE_TYPE> +class NodeLayoutT final : public InternalNodeImpl, public LeafNodeImpl { + public: + using URef = std::unique_ptr<NodeLayoutT>; + using extent_t = NodeExtentAccessorT<FieldType, NODE_TYPE>; + using parent_t = typename node_impl_type<NODE_TYPE>::type; + using marker_t = typename node_marker_type<NODE_TYPE>::type; + using node_stage_t = typename extent_t::node_stage_t; + using position_t = typename extent_t::position_t; + using value_t = typename extent_t::value_t; + static constexpr auto FIELD_TYPE = extent_t::FIELD_TYPE; + static constexpr auto KEY_TYPE = insert_key_type<NODE_TYPE>::type; + static constexpr auto STAGE = STAGE_T::STAGE; + + NodeLayoutT(const NodeLayoutT&) = delete; + NodeLayoutT(NodeLayoutT&&) = delete; + NodeLayoutT& operator=(const NodeLayoutT&) = delete; + NodeLayoutT& operator=(NodeLayoutT&&) = delete; + ~NodeLayoutT() override = default; + + static URef load(NodeExtentRef extent, bool expect_is_level_tail) { + std::unique_ptr<NodeLayoutT> ret(new NodeLayoutT(extent)); + assert(ret->is_level_tail() == expect_is_level_tail); + return ret; + } + + using alloc_ertr = NodeExtentManager::tm_ertr; + static alloc_ertr::future<typename parent_t::fresh_impl_t> allocate( + context_t c, bool is_level_tail, level_t level) { + // NOTE: Currently, all the node types have the same size for simplicity. + // But depending on the requirement, we may need to make node size + // configurable by field_type_t and node_type_t, or totally flexible. + return c.nm.alloc_extent(c.t, node_stage_t::EXTENT_SIZE + ).safe_then([is_level_tail, level](auto extent) { + assert(extent->is_initial_pending()); + auto mut = extent->get_mutable(); + node_stage_t::bootstrap_extent( + mut, FIELD_TYPE, NODE_TYPE, is_level_tail, level); + return typename parent_t::fresh_impl_t{ + std::unique_ptr<parent_t>(new NodeLayoutT(extent)), mut}; + }); + } + + protected: + /* + * NodeImpl + */ + field_type_t field_type() const override { return FIELD_TYPE; } + laddr_t laddr() const override { return extent.get_laddr(); } + void prepare_mutate(context_t c) override { return extent.prepare_mutate(c); } + bool is_level_tail() const override { return extent.read().is_level_tail(); } + bool is_empty() const override { return extent.read().keys() == 0; } + level_t level() const override { return extent.read().level(); } + node_offset_t free_size() const override { return extent.read().free_size(); } + + key_view_t get_key_view(const search_position_t& position) const override { + key_view_t ret; + STAGE_T::get_key_view(extent.read(), cast_down<STAGE>(position), ret); + return ret; + } + + key_view_t get_largest_key_view() const override { + key_view_t index_key; + STAGE_T::template lookup_largest_slot<false, true, false>( + extent.read(), nullptr, &index_key, nullptr); + return index_key; + } + + void next_position(search_position_t& pos) const override { + assert(!pos.is_end()); + bool find_next = STAGE_T::next_position(extent.read(), cast_down<STAGE>(pos)); + if (find_next) { + pos = search_position_t::end(); + } + } + + node_stats_t get_stats() const override { + node_stats_t stats; + auto& node_stage = extent.read(); + key_view_t index_key; + if (node_stage.keys()) { + STAGE_T::get_stats(node_stage, stats, index_key); + } + stats.size_persistent = node_stage_t::EXTENT_SIZE; + stats.size_filled = filled_size(); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + if (is_level_tail()) { + stats.size_logical += sizeof(value_t); + stats.size_value += sizeof(value_t); 
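+        // the level-tail internal node also owns one tail laddr without a key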
+ stats.num_kvs += 1; + } + } + return stats; + } + + std::ostream& dump(std::ostream& os) const override { + auto& node_stage = extent.read(); + auto p_start = node_stage.p_start(); + dump_brief(os); + auto stats = get_stats(); + os << " num_kvs=" << stats.num_kvs + << ", logical=" << stats.size_logical + << "B, overhead=" << stats.size_overhead + << "B, value=" << stats.size_value << "B"; + os << ":\n header: " << node_stage_t::header_size() << "B"; + size_t size = 0u; + if (node_stage.keys()) { + STAGE_T::dump(node_stage, os, " ", size, p_start); + } else { + size += node_stage_t::header_size(); + if (NODE_TYPE == node_type_t::LEAF || !node_stage.is_level_tail()) { + os << " empty!"; + } + } + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + if (node_stage.is_level_tail()) { + size += sizeof(laddr_t); + auto value_ptr = node_stage.get_end_p_laddr(); + int offset = reinterpret_cast<const char*>(value_ptr) - p_start; + os << "\n tail value: 0x" + << std::hex << value_ptr->value << std::dec + << " " << size << "B" + << " @" << offset << "B"; + } + } + assert(size == filled_size()); + return os; + } + + std::ostream& dump_brief(std::ostream& os) const override { + auto& node_stage = extent.read(); + os << "Node" << NODE_TYPE << FIELD_TYPE + << "@0x" << std::hex << extent.get_laddr() + << "+" << node_stage_t::EXTENT_SIZE << std::dec + << (node_stage.is_level_tail() ? "$" : "") + << "(level=" << (unsigned)node_stage.level() + << ", filled=" << filled_size() << "B" + << ", free=" << node_stage.free_size() << "B" + << ")"; + return os; + } + + void validate_layout() const override { +#ifndef NDEBUG + STAGE_T::validate(extent.read()); +#endif + } + + void test_copy_to(NodeExtentMutable& to) const override { + extent.test_copy_to(to); + } + + void test_set_tail(NodeExtentMutable& mut) override { + node_stage_t::update_is_level_tail(mut, extent.read(), true); + } + + /* + * Common + */ + const value_t* get_p_value(const search_position_t& position, + key_view_t* index_key=nullptr, marker_t={}) const override { + auto& node_stage = extent.read(); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + assert(!index_key); + if (position.is_end()) { + assert(is_level_tail()); + return node_stage.get_end_p_laddr(); + } + } else { + assert(!position.is_end()); + } + if (index_key) { + return STAGE_T::template get_p_value<true>( + node_stage, cast_down<STAGE>(position), index_key); + } else { + return STAGE_T::get_p_value(node_stage, cast_down<STAGE>(position)); + } + } + + lookup_result_t<NODE_TYPE> lower_bound( + const key_hobj_t& key, MatchHistory& history, + key_view_t* index_key=nullptr, marker_t={}) const override { + auto& node_stage = extent.read(); + if constexpr (NODE_TYPE == node_type_t::LEAF) { + if (unlikely(node_stage.keys() == 0)) { + history.set<STAGE_LEFT>(MatchKindCMP::LT); + return lookup_result_t<NODE_TYPE>::end(); + } + } + + typename STAGE_T::result_t result_raw; + if (index_key) { + result_raw = STAGE_T::template lower_bound<true>( + node_stage, key, history, index_key); +#ifndef NDEBUG + if (!result_raw.is_end()) { + full_key_t<KeyT::VIEW> index; + STAGE_T::get_key_view(node_stage, result_raw.position, index); + assert(index == *index_key); + } +#endif + } else { + result_raw = STAGE_T::lower_bound(node_stage, key, history); + } +#ifndef NDEBUG + if (result_raw.is_end()) { + assert(result_raw.mstat == MSTAT_END); + } else { + full_key_t<KeyT::VIEW> index; + STAGE_T::get_key_view(node_stage, result_raw.position, index); + assert_mstat(key, index, result_raw.mstat); + } 
+#endif + + // calculate MSTAT_LT3 + if constexpr (FIELD_TYPE == field_type_t::N0) { + // currently only internal node checks mstat + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + if (result_raw.mstat == MSTAT_LT2) { + auto cmp = compare_to<KeyT::HOBJ>( + key, node_stage[result_raw.position.index].shard_pool); + assert(cmp != MatchKindCMP::GT); + if (cmp != MatchKindCMP::EQ) { + result_raw.mstat = MSTAT_LT3; + } + } + } + } + + auto result = normalize(std::move(result_raw)); + if (result.is_end()) { + assert(node_stage.is_level_tail()); + assert(result.p_value == nullptr); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + result.p_value = node_stage.get_end_p_laddr(); + } + } else { + assert(result.p_value != nullptr); + } + return result; + } + + const value_t* insert( + const full_key_t<KEY_TYPE>& key, const value_t& value, + search_position_t& insert_pos, match_stage_t& insert_stage, + node_offset_t& insert_size) override { + logger().debug("OTree::Layout::Insert: begin at " + "insert_pos({}), insert_stage={}, insert_size={}B ...", + insert_pos, insert_stage, insert_size); + if (unlikely(logger().is_enabled(seastar::log_level::trace))) { + std::ostringstream sos; + dump(sos); + logger().trace("OTree::Layout::Insert: -- dump\n{}", sos.str()); + } + auto ret = extent.template insert_replayable<KEY_TYPE>( + key, value, cast_down<STAGE>(insert_pos), insert_stage, insert_size); + logger().debug("OTree::Layout::Insert: done at " + "insert_pos({}), insert_stage={}, insert_size={}B", + insert_pos, insert_stage, insert_size); + if (unlikely(logger().is_enabled(seastar::log_level::trace))) { + std::ostringstream sos; + dump(sos); + logger().trace("OTree::Layout::Insert: -- dump\n{}", sos.str()); + } + validate_layout(); + assert(get_key_view(insert_pos) == key); + return ret; + } + + std::tuple<search_position_t, bool, const value_t*> split_insert( + NodeExtentMutable& right_mut, NodeImpl& right_impl, + const full_key_t<KEY_TYPE>& key, const value_t& value, + search_position_t& _insert_pos, match_stage_t& insert_stage, + node_offset_t& insert_size) override { + logger().info("OTree::Layout::Split: begin at " + "insert_pos({}), insert_stage={}, insert_size={}B, " + "{:#x}=>{:#x} ...", + _insert_pos, insert_stage, insert_size, + laddr(), right_impl.laddr()); + if (unlikely(logger().is_enabled(seastar::log_level::debug))) { + std::ostringstream sos; + dump(sos); + logger().debug("OTree::Layout::Split: -- dump\n{}", sos.str()); + } +#ifdef UNIT_TESTS_BUILT + auto insert_stage_pre = insert_stage; +#endif + + auto& insert_pos = cast_down<STAGE>(_insert_pos); + auto& node_stage = extent.read(); + typename STAGE_T::StagedIterator split_at; + bool is_insert_left; + size_t split_size; + size_t target_split_size; + { + size_t empty_size = node_stage.size_before(0); + size_t filled_kv_size = filled_size() - empty_size; + /** NODE_BLOCK_SIZE considerations + * + * Generally, + * target_split_size = (filled_size + insert_size) / 2 + * We can have two locate_split() strategies: + * A. the simpler one is to locate the largest split position where + * the estimated left_node_size <= target_split_size; + * B. the fair one takes a further step to calculate the next slot of + * P KiB, and if left_node_size + P/2 < target_split_size, compensate + * the split position to include the next slot; (TODO) + * + * Say that the node_block_size = N KiB, the largest allowed + * insert_size = 1/I * N KiB (I > 1). 
We want to identify the minimal 'I' + * that won't lead to the "double split" effect, meaning that after a split, + * the right node size is still larger than N KiB and needs to split + * again. "Double split" makes splitting much more complicated and + * we can no longer identify whether the node is safe under concurrent + * operations. + * + * We need to evaluate the worst case in order to identify 'I'. This means: + * - filled_size ~= N KiB + * - insert_size == N/I KiB + * - target_split_size ~= (I+1)/2I * N KiB + * To simplify the calculations below, node_block_size is normalized to 1. + * + * With strategy A, the worst case is when left_node_size cannot include + * the next slot that would just overflow the target_split_size: + * - left_node_size + 1/I ~= (I+1)/2I + * - left_node_size ~= (I-1)/2I + * - right_node_size ~= 1 + 1/I - left_node_size ~= (I+3)/2I + * The right_node_size cannot be larger than the node_block_size in the + * worst case, which means (I+3)/2I < 1, so I > 3, meaning the largest + * possible insert_size must be smaller than 1/3 of the node_block_size. + * + * With strategy B, the worst case is when left_node_size cannot include + * the next slot that would just overflow the threshold + * target_split_size - 1/2I, thus: + * - left_node_size ~= (I+1)/2I - 1/2I ~= 1/2 + * - right_node_size ~= 1 + 1/I - 1/2 ~= (I+2)/2I < node_block_size(1) + * - I > 2 + * This means the largest possible insert_size must be smaller than 1/2 of + * the node_block_size, which is better than strategy A. + * + * In order to avoid "double split", there is another side-effect we need + * to take into consideration: if a split happens within the snap-gen indexes, the + * corresponding ns-oid string needs to be copied to the right node. That is + * to say: right_node_size + string_size < node_block_size. + * + * Say that the largest allowed string size is 1/S of the largest allowed + * insert_size N/I KiB. If we go with strategy B, the equation should be + * changed to: + * - right_node_size ~= (I+2)/2I + 1/(I*S) < 1 + * - I > 2 + 2/S (S > 1) + * + * Now back to the NODE_BLOCK_SIZE calculation: if we have limits of at most + * X KiB of ns-oid string and Y KiB of onode_t to store in this BTree, then: + * - largest_insert_size ~= X+Y KiB + * - 1/S == X/(X+Y) + * - I > (4X+2Y)/(X+Y) + * - node_block_size(N) == I * insert_size > 4X+2Y KiB + * + * In conclusion, + * (TODO) the current node block size (4 KiB) is too small to + * store an entire 2 KiB ns-oid string. We need to consider a larger + * node_block_size. + * + * We are setting X = Y = 640 B in order not to break the current + * implementation with a 4 KiB node. + * + * (TODO) Implement smarter logic to check when "double split" happens. 
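+ *
+ * As a rough worked example with the chosen X = Y = 640 B:
+ * 1/S == 640/1280 == 1/2, so I > 2 + 2/S == 3, and
+ * node_block_size == I * insert_size > 4X+2Y = 3840 B, which the current
+ * 4 KiB (4096 B) node just satisfies as long as a single insert stays
+ * below roughly 4096/3 ~= 1365 B (largest_insert_size ~= X+Y = 1280 B).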
+ */ + target_split_size = empty_size + (filled_kv_size + insert_size) / 2; + assert(insert_size < (node_stage.total_size() - empty_size) / 2); + + std::optional<bool> _is_insert_left; + split_at.set(node_stage); + split_size = 0; + bool locate_nxt = STAGE_T::recursively_locate_split_inserted( + split_size, 0, target_split_size, insert_pos, + insert_stage, insert_size, _is_insert_left, split_at); + is_insert_left = *_is_insert_left; + logger().debug("OTree::Layout::Split: -- located " + "split_at({}), insert_pos({}), is_insert_left={}, " + "split_size={}B(target={}B, current={}B)", + split_at, insert_pos, is_insert_left, + split_size, target_split_size, filled_size()); + // split_size can be larger than target_split_size in strategy B + // assert(split_size <= target_split_size); + if (locate_nxt) { + assert(insert_stage == STAGE); + assert(split_at.get().is_last()); + split_at.set_end(); + assert(insert_pos.index == split_at.index()); + } + } + + auto append_at = split_at; + // TODO(cross-node string dedup) + typename STAGE_T::template StagedAppender<KEY_TYPE> right_appender; + right_appender.init(&right_mut, right_mut.get_write()); + const value_t* p_value = nullptr; + if (!is_insert_left) { + // right node: append [start(append_at), insert_pos) + STAGE_T::template append_until<KEY_TYPE>( + append_at, right_appender, insert_pos, insert_stage); + logger().debug("OTree::Layout::Split: -- right appended until " + "insert_pos({}), insert_stage={}, insert/append the rest ...", + insert_pos, insert_stage); + // right node: append [insert_pos(key, value)] + bool is_front_insert = (insert_pos == position_t::begin()); + [[maybe_unused]] bool is_end = STAGE_T::template append_insert<KEY_TYPE>( + key, value, append_at, right_appender, + is_front_insert, insert_stage, p_value); + assert(append_at.is_end() == is_end); + } else { + logger().debug("OTree::Layout::Split: -- right appending ..."); + } + + // right node: append (insert_pos, end) + auto pos_end = position_t::end(); + STAGE_T::template append_until<KEY_TYPE>( + append_at, right_appender, pos_end, STAGE); + assert(append_at.is_end()); + right_appender.wrap(); + if (unlikely(logger().is_enabled(seastar::log_level::debug))) { + std::ostringstream sos; + right_impl.dump(sos); + logger().debug("OTree::Layout::Split: -- right node dump\n{}", sos.str()); + } + right_impl.validate_layout(); + + // mutate left node + if (is_insert_left) { + logger().debug("OTree::Layout::Split: -- left trim/insert at " + "insert_pos({}), insert_stage={} ...", + insert_pos, insert_stage); + p_value = extent.template split_insert_replayable<KEY_TYPE>( + split_at, key, value, insert_pos, insert_stage, insert_size); + assert(get_key_view(_insert_pos) == key); + } else { + logger().debug("OTree::Layout::Split: -- left trim ..."); + assert(right_impl.get_key_view(_insert_pos) == key); + extent.split_replayable(split_at); + } + if (unlikely(logger().is_enabled(seastar::log_level::debug))) { + std::ostringstream sos; + dump(sos); + logger().debug("OTree::Layout::Split: -- left node dump\n{}", sos.str()); + } + validate_layout(); + assert(p_value); + + auto split_pos = normalize(split_at.get_pos()); + logger().info("OTree::Layout::Split: done at " + "insert_pos({}), insert_stage={}, insert_size={}B, split_at({}), " + "is_insert_left={}, split_size={}B(target={}B)", + _insert_pos, insert_stage, insert_size, split_pos, + is_insert_left, split_size, target_split_size); + assert(split_size == filled_size()); + +#ifdef UNIT_TESTS_BUILT + InsertType insert_type; + 
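+      // record the split outcome into last_split so unit tests can compare
+      // it against a split_expectation_t (see node_impl.h)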
search_position_t last_pos; + if (is_insert_left) { + STAGE_T::template lookup_largest_slot<true, false, false>( + extent.read(), &cast_down_fill_0<STAGE>(last_pos), nullptr, nullptr); + } else { + node_stage_t right_stage{reinterpret_cast<FieldType*>(right_mut.get_write())}; + STAGE_T::template lookup_largest_slot<true, false, false>( + right_stage, &cast_down_fill_0<STAGE>(last_pos), nullptr, nullptr); + } + if (_insert_pos == search_position_t::begin()) { + insert_type = InsertType::BEGIN; + } else if (_insert_pos == last_pos) { + insert_type = InsertType::LAST; + } else { + insert_type = InsertType::MID; + } + last_split = {split_pos, insert_stage_pre, is_insert_left, insert_type}; +#endif + return {split_pos, is_insert_left, p_value}; + } + + /* + * InternalNodeImpl + */ + void replace_child_addr( + const search_position_t& pos, laddr_t dst, laddr_t src) override { + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + const laddr_packed_t* p_value = get_p_value(pos); + assert(p_value->value == src); + extent.update_child_addr_replayable(dst, const_cast<laddr_packed_t*>(p_value)); + } else { + ceph_abort("impossible path"); + } + } + + std::tuple<match_stage_t, node_offset_t> evaluate_insert( + const key_view_t& key, const laddr_t& value, + search_position_t& insert_pos) const override { + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + auto packed_value = laddr_packed_t{value}; + auto& node_stage = extent.read(); + match_stage_t insert_stage; + node_offset_t insert_size; + if (unlikely(!node_stage.keys())) { + assert(insert_pos.is_end()); + insert_stage = STAGE; + insert_size = STAGE_T::template insert_size<KeyT::VIEW>(key, packed_value); + } else { + std::tie(insert_stage, insert_size) = STAGE_T::evaluate_insert( + node_stage, key, packed_value, cast_down<STAGE>(insert_pos), false); + } + return {insert_stage, insert_size}; + } else { + ceph_abort("impossible path"); + } + } + + /* + * LeafNodeImpl + */ + void get_largest_slot(search_position_t& pos, + key_view_t& index_key, const onode_t** pp_value) const override { + if constexpr (NODE_TYPE == node_type_t::LEAF) { + STAGE_T::template lookup_largest_slot<true, true, true>( + extent.read(), &cast_down_fill_0<STAGE>(pos), &index_key, pp_value); + } else { + ceph_abort("impossible path"); + } + } + + std::tuple<match_stage_t, node_offset_t> evaluate_insert( + const key_hobj_t& key, const onode_t& value, + const MatchHistory& history, match_stat_t mstat, + search_position_t& insert_pos) const override { + if constexpr (NODE_TYPE == node_type_t::LEAF) { + if (unlikely(is_empty())) { + assert(insert_pos.is_end()); + return {STAGE, STAGE_T::template insert_size<KeyT::HOBJ>(key, value)}; + } else { + return STAGE_T::evaluate_insert( + key, value, history, mstat, cast_down<STAGE>(insert_pos)); + } + } else { + ceph_abort("impossible path"); + } + } + + private: + NodeLayoutT(NodeExtentRef extent) : extent{extent} {} + + node_offset_t filled_size() const { + auto& node_stage = extent.read(); + auto ret = node_stage.size_before(node_stage.keys()); + assert(ret == node_stage.total_size() - node_stage.free_size()); + return ret; + } + + static seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } + + extent_t extent; +}; + +using InternalNode0 = NodeLayoutT<node_fields_0_t, node_type_t::INTERNAL>; +using InternalNode1 = NodeLayoutT<node_fields_1_t, node_type_t::INTERNAL>; +using InternalNode2 = NodeLayoutT<node_fields_2_t, node_type_t::INTERNAL>; +using InternalNode3 = NodeLayoutT<internal_fields_3_t, 
node_type_t::INTERNAL>; +using LeafNode0 = NodeLayoutT<node_fields_0_t, node_type_t::LEAF>; +using LeafNode1 = NodeLayoutT<node_fields_1_t, node_type_t::LEAF>; +using LeafNode2 = NodeLayoutT<node_fields_2_t, node_type_t::LEAF>; +using LeafNode3 = NodeLayoutT<leaf_fields_3_t, node_type_t::LEAF>; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout_replayable.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout_replayable.h new file mode 100644 index 000000000..c1499d609 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout_replayable.h @@ -0,0 +1,75 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "node_extent_mutable.h" +#include "stages/node_stage.h" +#include "stages/stage.h" + +#define STAGE_T node_to_stage_t<node_stage_t> + +namespace crimson::os::seastore::onode { + +/** + * NodeLayoutReplayableT + * + * Contains templated logics to modify the layout of a NodeExtend which are + * also replayable. Used by NodeExtentAccessorT at runtime and by + * DeltaRecorderT during replay. + */ +template <typename FieldType, node_type_t NODE_TYPE> +struct NodeLayoutReplayableT { + using node_stage_t = node_extent_t<FieldType, NODE_TYPE>; + using position_t = typename STAGE_T::position_t; + using StagedIterator = typename STAGE_T::StagedIterator; + using value_t = value_type_t<NODE_TYPE>; + static constexpr auto FIELD_TYPE = FieldType::FIELD_TYPE; + + template <KeyT KT> + static const value_t* insert( + NodeExtentMutable& mut, + const node_stage_t& node_stage, + const full_key_t<KT>& key, + const value_t& value, + position_t& insert_pos, + match_stage_t& insert_stage, + node_offset_t& insert_size) { + auto p_value = STAGE_T::template proceed_insert<KT, false>( + mut, node_stage, key, value, insert_pos, insert_stage, insert_size); + return p_value; + } + + static void split( + NodeExtentMutable& mut, + const node_stage_t& node_stage, + StagedIterator& split_at) { + node_stage_t::update_is_level_tail(mut, node_stage, false); + STAGE_T::trim(mut, split_at); + } + + template <KeyT KT> + static const value_t* split_insert( + NodeExtentMutable& mut, + const node_stage_t& node_stage, + StagedIterator& split_at, + const full_key_t<KT>& key, + const value_t& value, + position_t& insert_pos, + match_stage_t& insert_stage, + node_offset_t& insert_size) { + node_stage_t::update_is_level_tail(mut, node_stage, false); + STAGE_T::trim(mut, split_at); + auto p_value = STAGE_T::template proceed_insert<KT, true>( + mut, node_stage, key, value, insert_pos, insert_stage, insert_size); + return p_value; + } + + static void update_child_addr( + NodeExtentMutable& mut, const laddr_t new_addr, laddr_packed_t* p_addr) { + assert(NODE_TYPE == node_type_t::INTERNAL); + mut.copy_in_absolute(p_addr, new_addr); + } +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h new file mode 100644 index 000000000..6774544c7 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h @@ -0,0 +1,64 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <cassert> +#include <ostream> + +#include "fwd.h" + +namespace crimson::os::seastore::onode { + +constexpr uint8_t FIELD_TYPE_MAGIC = 0x25; +enum class field_type_t : uint8_t { + N0 = FIELD_TYPE_MAGIC, + N1, + N2, + N3, + _MAX +}; +inline 
uint8_t to_unsigned(field_type_t type) { + auto value = static_cast<uint8_t>(type); + assert(value >= FIELD_TYPE_MAGIC); + assert(value < static_cast<uint8_t>(field_type_t::_MAX)); + return value - FIELD_TYPE_MAGIC; +} +inline std::ostream& operator<<(std::ostream &os, field_type_t type) { + const char* const names[] = {"0", "1", "2", "3"}; + auto index = to_unsigned(type); + os << names[index]; + return os; +} + +enum class node_type_t : uint8_t { + LEAF = 0, + INTERNAL +}; +inline std::ostream& operator<<(std::ostream &os, const node_type_t& type) { + const char* const names[] = {"L", "I"}; + auto index = static_cast<uint8_t>(type); + assert(index <= 1u); + os << names[index]; + return os; +} + +struct laddr_packed_t { + laddr_t value; +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const laddr_packed_t& laddr) { + return os << "laddr_packed(0x" << std::hex << laddr.value << std::dec << ")"; +} + +using match_stat_t = int8_t; +constexpr match_stat_t MSTAT_END = -2; // index is search_position_t::end() +constexpr match_stat_t MSTAT_EQ = -1; // key == index +constexpr match_stat_t MSTAT_LT0 = 0; // key == index [pool/shard crush ns/oid]; key < index [snap/gen] +constexpr match_stat_t MSTAT_LT1 = 1; // key == index [pool/shard crush]; key < index [ns/oid] +constexpr match_stat_t MSTAT_LT2 = 2; // key < index [pool/shard crush ns/oid] || + // key == index [pool/shard]; key < index [crush] +constexpr match_stat_t MSTAT_LT3 = 3; // key < index [pool/shard] +constexpr match_stat_t MSTAT_MIN = MSTAT_END; +constexpr match_stat_t MSTAT_MAX = MSTAT_LT3; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc new file mode 100644 index 000000000..443c6cabd --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc @@ -0,0 +1,165 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "item_iterator_stage.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h" + +namespace crimson::os::seastore::onode { + +#define ITER_T item_iterator_t<NODE_TYPE> +#define ITER_INST(NT) item_iterator_t<NT> + +template <node_type_t NODE_TYPE> +template <KeyT KT> +memory_range_t ITER_T::insert_prefix( + NodeExtentMutable& mut, const ITER_T& iter, const full_key_t<KT>& key, + bool is_end, node_offset_t size, const char* p_left_bound) { + // 1. insert range + char* p_insert; + if (is_end) { + assert(!iter.has_next()); + p_insert = const_cast<char*>(iter.p_start()); + } else { + p_insert = const_cast<char*>(iter.p_end()); + } + char* p_insert_front = p_insert - size; + + // 2. shift memory + const char* p_shift_start = p_left_bound; + const char* p_shift_end = p_insert; + mut.shift_absolute(p_shift_start, + p_shift_end - p_shift_start, + -(int)size); + + // 3. 
append header + p_insert -= sizeof(node_offset_t); + node_offset_t back_offset = (p_insert - p_insert_front); + mut.copy_in_absolute(p_insert, back_offset); + ns_oid_view_t::append<KT>(mut, key, p_insert); + + return {p_insert_front, p_insert}; +} +#define IP_TEMPLATE(NT, KT) \ + template memory_range_t ITER_INST(NT)::insert_prefix<KT>( \ + NodeExtentMutable&, const ITER_INST(NT)&, const full_key_t<KT>&, \ + bool, node_offset_t, const char*) +IP_TEMPLATE(node_type_t::LEAF, KeyT::VIEW); +IP_TEMPLATE(node_type_t::INTERNAL, KeyT::VIEW); +IP_TEMPLATE(node_type_t::LEAF, KeyT::HOBJ); +IP_TEMPLATE(node_type_t::INTERNAL, KeyT::HOBJ); + +template <node_type_t NODE_TYPE> +void ITER_T::update_size( + NodeExtentMutable& mut, const ITER_T& iter, int change) { + node_offset_t offset = iter.get_back_offset(); + int new_size = change + offset; + assert(new_size > 0 && new_size < NODE_BLOCK_SIZE); + mut.copy_in_absolute( + (void*)iter.get_item_range().p_end, node_offset_t(new_size)); +} + +template <node_type_t NODE_TYPE> +node_offset_t ITER_T::trim_until(NodeExtentMutable&, const ITER_T& iter) { + assert(iter.index() != 0); + size_t ret = iter.p_end() - iter.p_items_start; + assert(ret < NODE_BLOCK_SIZE); + return ret; +} + +template <node_type_t NODE_TYPE> +node_offset_t ITER_T::trim_at( + NodeExtentMutable& mut, const ITER_T& iter, node_offset_t trimmed) { + size_t trim_size = iter.p_start() - iter.p_items_start + trimmed; + assert(trim_size < NODE_BLOCK_SIZE); + assert(iter.get_back_offset() > trimmed); + node_offset_t new_offset = iter.get_back_offset() - trimmed; + mut.copy_in_absolute((void*)iter.item_range.p_end, new_offset); + return trim_size; +} + +#define ITER_TEMPLATE(NT) template class ITER_INST(NT) +ITER_TEMPLATE(node_type_t::LEAF); +ITER_TEMPLATE(node_type_t::INTERNAL); + +#define APPEND_T ITER_T::Appender<KT> + +template <node_type_t NODE_TYPE> +template <KeyT KT> +bool APPEND_T::append(const ITER_T& src, index_t& items) { + auto p_end = src.p_end(); + bool append_till_end = false; + if (is_valid_index(items)) { + for (auto i = 1u; i <= items; ++i) { + if (!src.has_next()) { + assert(i == items); + append_till_end = true; + break; + } + ++src; + } + } else { + if (items == INDEX_END) { + append_till_end = true; + } else { + assert(items == INDEX_LAST); + } + items = 0; + while (src.has_next()) { + ++src; + ++items; + } + if (append_till_end) { + ++items; + } + } + + const char* p_start; + if (append_till_end) { + p_start = src.p_start(); + } else { + p_start = src.p_end(); + } + assert(p_end >= p_start); + size_t append_size = p_end - p_start; + p_append -= append_size; + p_mut->copy_in_absolute(p_append, p_start, append_size); + return append_till_end; +} + +template <node_type_t NODE_TYPE> +template <KeyT KT> +std::tuple<NodeExtentMutable*, char*> +APPEND_T::open_nxt(const key_get_type& partial_key) { + p_append -= sizeof(node_offset_t); + p_offset_while_open = p_append; + ns_oid_view_t::append(*p_mut, partial_key, p_append); + return {p_mut, p_append}; +} + +template <node_type_t NODE_TYPE> +template <KeyT KT> +std::tuple<NodeExtentMutable*, char*> +APPEND_T::open_nxt(const full_key_t<KT>& key) { + p_append -= sizeof(node_offset_t); + p_offset_while_open = p_append; + ns_oid_view_t::append<KT>(*p_mut, key, p_append); + return {p_mut, p_append}; +} + +template <node_type_t NODE_TYPE> +template <KeyT KT> +void APPEND_T::wrap_nxt(char* _p_append) { + assert(_p_append < p_append); + p_mut->copy_in_absolute( + p_offset_while_open, node_offset_t(p_offset_while_open - _p_append)); + p_append = 
_p_append; +} + +#define APPEND_TEMPLATE(NT, KT) template class ITER_INST(NT)::Appender<KT> +APPEND_TEMPLATE(node_type_t::LEAF, KeyT::VIEW); +APPEND_TEMPLATE(node_type_t::INTERNAL, KeyT::VIEW); +APPEND_TEMPLATE(node_type_t::LEAF, KeyT::HOBJ); +APPEND_TEMPLATE(node_type_t::INTERNAL, KeyT::HOBJ); + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h new file mode 100644 index 000000000..bb68eec8f --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h @@ -0,0 +1,180 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h" +#include "key_layout.h" +#include "stage_types.h" + +namespace crimson::os::seastore::onode { + +class NodeExtentMutable; + +/** + * item_iterator_t + * + * The STAGE_STRING implementation for node N0/N1, implements staged contract + * as an iterative container to resolve crush hash conflicts. + * + * The layout of the contaner to index ns, oid strings storing n items: + * + * # <--------- container range ---------> # + * #<~># items [i+1, n) # + * # # items [0, i) #<~># + * # # <------ item i -------------> # # + * # # <--- item_range ---> | # # + * # # | # # + * # # next-stage | ns-oid | back_ # # + * # # contaner | strings | offset # # + * #...# range | | #...# + * ^ ^ | ^ + * | | | | + * | +---------------------------+ | + * + p_items_start p_items_end + + */ +template <node_type_t NODE_TYPE> +class item_iterator_t { + using value_t = value_type_t<NODE_TYPE>; + public: + item_iterator_t(const memory_range_t& range) + : p_items_start(range.p_start), p_items_end(range.p_end) { + assert(p_items_start < p_items_end); + next_item_range(p_items_end); + } + + const char* p_start() const { return item_range.p_start; } + const char* p_end() const { return item_range.p_end + sizeof(node_offset_t); } + const memory_range_t& get_item_range() const { return item_range; } + node_offset_t get_back_offset() const { return back_offset; } + + // container type system + using key_get_type = const ns_oid_view_t&; + static constexpr auto CONTAINER_TYPE = ContainerType::ITERATIVE; + index_t index() const { return _index; } + key_get_type get_key() const { + if (!key.has_value()) { + key = ns_oid_view_t(item_range.p_end); + assert(item_range.p_start < (*key).p_start()); + } + return *key; + } + node_offset_t size() const { + size_t ret = item_range.p_end - item_range.p_start + sizeof(node_offset_t); + assert(ret < NODE_BLOCK_SIZE); + return ret; + }; + node_offset_t size_to_nxt() const { + size_t ret = get_key().size() + sizeof(node_offset_t); + assert(ret < NODE_BLOCK_SIZE); + return ret; + } + node_offset_t size_overhead() const { + return sizeof(node_offset_t) + get_key().size_overhead(); + } + memory_range_t get_nxt_container() const { + return {item_range.p_start, get_key().p_start()}; + } + bool has_next() const { + assert(p_items_start <= item_range.p_start); + return p_items_start < item_range.p_start; + } + const item_iterator_t<NODE_TYPE>& operator++() const { + assert(has_next()); + next_item_range(item_range.p_start); + key.reset(); + ++_index; + return *this; + } + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + int start_offset = p_items_start - p_node_start; + int end_offset = p_items_end - p_node_start; + assert(start_offset > 0 && 
start_offset < NODE_BLOCK_SIZE); + assert(end_offset > 0 && end_offset <= NODE_BLOCK_SIZE); + ceph::encode(static_cast<node_offset_t>(start_offset), encoded); + ceph::encode(static_cast<node_offset_t>(end_offset), encoded); + ceph::encode(_index, encoded); + } + + static item_iterator_t decode(const char* p_node_start, + ceph::bufferlist::const_iterator& delta) { + node_offset_t start_offset; + ceph::decode(start_offset, delta); + node_offset_t end_offset; + ceph::decode(end_offset, delta); + assert(start_offset < end_offset); + assert(end_offset <= NODE_BLOCK_SIZE); + index_t index; + ceph::decode(index, delta); + + item_iterator_t ret({p_node_start + start_offset, + p_node_start + end_offset}); + while (index > 0) { + ++ret; + --index; + } + return ret; + } + + static node_offset_t header_size() { return 0u; } + + template <KeyT KT> + static node_offset_t estimate_insert( + const full_key_t<KT>& key, const value_t&) { + return ns_oid_view_t::estimate_size<KT>(key) + sizeof(node_offset_t); + } + + template <KeyT KT> + static memory_range_t insert_prefix( + NodeExtentMutable& mut, const item_iterator_t<NODE_TYPE>& iter, + const full_key_t<KT>& key, bool is_end, + node_offset_t size, const char* p_left_bound); + + static void update_size( + NodeExtentMutable& mut, const item_iterator_t<NODE_TYPE>& iter, int change); + + static node_offset_t trim_until(NodeExtentMutable&, const item_iterator_t<NODE_TYPE>&); + static node_offset_t trim_at( + NodeExtentMutable&, const item_iterator_t<NODE_TYPE>&, node_offset_t trimmed); + + template <KeyT KT> + class Appender; + + private: + void next_item_range(const char* p_end) const { + auto p_item_end = p_end - sizeof(node_offset_t); + assert(p_items_start < p_item_end); + back_offset = reinterpret_cast<const node_offset_packed_t*>(p_item_end)->value; + assert(back_offset); + const char* p_item_start = p_item_end - back_offset; + assert(p_items_start <= p_item_start); + item_range = {p_item_start, p_item_end}; + } + + const char* p_items_start; + const char* p_items_end; + mutable memory_range_t item_range; + mutable node_offset_t back_offset; + mutable std::optional<ns_oid_view_t> key; + mutable index_t _index = 0u; +}; + +template <node_type_t NODE_TYPE> +template <KeyT KT> +class item_iterator_t<NODE_TYPE>::Appender { + public: + Appender(NodeExtentMutable* p_mut, char* p_append) + : p_mut{p_mut}, p_append{p_append} {} + bool append(const item_iterator_t<NODE_TYPE>& src, index_t& items); + char* wrap() { return p_append; } + std::tuple<NodeExtentMutable*, char*> open_nxt(const key_get_type&); + std::tuple<NodeExtentMutable*, char*> open_nxt(const full_key_t<KT>&); + void wrap_nxt(char* _p_append); + + private: + NodeExtentMutable* p_mut; + char* p_append; + char* p_offset_while_open; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc new file mode 100644 index 000000000..d60bb8d09 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc @@ -0,0 +1,32 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "key_layout.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h" + +namespace crimson::os::seastore::onode { + +void string_key_view_t::append_str( + NodeExtentMutable& mut, std::string_view str, char*& p_append) { + assert(is_valid_size(str.length())); + p_append -= sizeof(string_size_t); + string_size_t 
len = str.length(); + mut.copy_in_absolute(p_append, len); + p_append -= len; + mut.copy_in_absolute(p_append, str.data(), len); +} + +void string_key_view_t::append_dedup( + NodeExtentMutable& mut, const Type& dedup_type, char*& p_append) { + p_append -= sizeof(string_size_t); + if (dedup_type == Type::MIN) { + mut.copy_in_absolute(p_append, MIN); + } else if (dedup_type == Type::MAX) { + mut.copy_in_absolute(p_append, MAX); + } else { + ceph_abort("impossible path"); + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h new file mode 100644 index 000000000..cc1f546c1 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h @@ -0,0 +1,846 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <cassert> +#include <limits> +#include <optional> +#include <ostream> + +#include "common/hobject.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/fwd.h" + +namespace crimson::os::seastore::onode { + +using shard_t = int8_t; +using pool_t = int64_t; +using crush_hash_t = uint32_t; +using snap_t = uint64_t; +using gen_t = uint64_t; +static_assert(sizeof(shard_t) == sizeof(ghobject_t().shard_id.id)); +static_assert(sizeof(pool_t) == sizeof(ghobject_t().hobj.pool)); +static_assert(sizeof(crush_hash_t) == sizeof(ghobject_t().hobj.get_hash())); +static_assert(sizeof(snap_t) == sizeof(ghobject_t().hobj.snap.val)); +static_assert(sizeof(gen_t) == sizeof(ghobject_t().generation)); + +class NodeExtentMutable; +class key_view_t; +class key_hobj_t; +enum class KeyT { VIEW, HOBJ }; +template <KeyT> struct _full_key_type; +template<> struct _full_key_type<KeyT::VIEW> { using type = key_view_t; }; +template<> struct _full_key_type<KeyT::HOBJ> { using type = key_hobj_t; }; +template <KeyT type> +using full_key_t = typename _full_key_type<type>::type; + +struct node_offset_packed_t { + node_offset_t value; +} __attribute__((packed)); + +// TODO: consider alignments +struct shard_pool_t { + bool operator==(const shard_pool_t& x) const { + return (shard == x.shard && pool == x.pool); + } + bool operator!=(const shard_pool_t& x) const { return !(*this == x); } + + template <KeyT KT> + static shard_pool_t from_key(const full_key_t<KT>& key); + + shard_t shard; + pool_t pool; +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const shard_pool_t& sp) { + return os << (unsigned)sp.shard << "," << sp.pool; +} +inline MatchKindCMP compare_to(const shard_pool_t& l, const shard_pool_t& r) { + auto ret = toMatchKindCMP(l.shard, r.shard); + if (ret != MatchKindCMP::EQ) + return ret; + return toMatchKindCMP(l.pool, r.pool); +} + +struct crush_t { + bool operator==(const crush_t& x) const { return crush == x.crush; } + bool operator!=(const crush_t& x) const { return !(*this == x); } + + template <KeyT KT> + static crush_t from_key(const full_key_t<KT>& key); + + crush_hash_t crush; +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const crush_t& c) { + return os << c.crush; +} +inline MatchKindCMP compare_to(const crush_t& l, const crush_t& r) { + return toMatchKindCMP(l.crush, r.crush); +} + +struct shard_pool_crush_t { + bool operator==(const shard_pool_crush_t& x) const { + return (shard_pool == x.shard_pool && crush == x.crush); + } + bool operator!=(const shard_pool_crush_t& x) const { return !(*this == x); } + + template <KeyT 
KT> + static shard_pool_crush_t from_key(const full_key_t<KT>& key); + + shard_pool_t shard_pool; + crush_t crush; +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const shard_pool_crush_t& spc) { + return os << spc.shard_pool << "," << spc.crush; +} +inline MatchKindCMP compare_to( + const shard_pool_crush_t& l, const shard_pool_crush_t& r) { + auto ret = compare_to(l.shard_pool, r.shard_pool); + if (ret != MatchKindCMP::EQ) + return ret; + return compare_to(l.crush, r.crush); +} + +struct snap_gen_t { + bool operator==(const snap_gen_t& x) const { + return (snap == x.snap && gen == x.gen); + } + bool operator!=(const snap_gen_t& x) const { return !(*this == x); } + + template <KeyT KT> + static snap_gen_t from_key(const full_key_t<KT>& key); + + snap_t snap; + gen_t gen; +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const snap_gen_t& sg) { + return os << sg.snap << "," << sg.gen; +} +inline MatchKindCMP compare_to(const snap_gen_t& l, const snap_gen_t& r) { + auto ret = toMatchKindCMP(l.snap, r.snap); + if (ret != MatchKindCMP::EQ) + return ret; + return toMatchKindCMP(l.gen, r.gen); +} + +/** + * string_key_view_t + * + * The layout to store char array as an oid or an ns string which may be + * compressed. + * + * If compressed, the physical block only stores an unsigned int of + * string_size_t, with value 0 denoting Type::MIN, and value max() denoting + * Type::MAX. + * + * If not compressed (Type::STR), the physical block stores the char array and + * a valid string_size_t value. + */ +struct string_key_view_t { + enum class Type {MIN, STR, MAX}; + // presumably the maximum string length is 2KiB + using string_size_t = uint16_t; + static constexpr auto MAX = std::numeric_limits<string_size_t>::max(); + static constexpr auto MIN = string_size_t(0u); + static auto is_valid_size(size_t size) { + return (size > MIN && size < MAX); + } + + string_key_view_t(const char* p_end) { + p_length = p_end - sizeof(string_size_t); + std::memcpy(&length, p_length, sizeof(string_size_t)); + if (is_valid_size(length)) { + auto _p_key = p_length - length; + p_key = static_cast<const char*>(_p_key); + } else { + assert(length == MAX || length == MIN); + p_key = nullptr; + } + } + Type type() const { + if (length == MIN) { + return Type::MIN; + } else if (length == MAX) { + return Type::MAX; + } else { + assert(is_valid_size(length)); + return Type::STR; + } + } + const char* p_start() const { + if (p_key) { + return p_key; + } else { + return p_length; + } + } + const char* p_next_end() const { + if (p_key) { + return p_start(); + } else { + return p_length + sizeof(string_size_t); + } + } + node_offset_t size() const { + size_t ret = length + sizeof(string_size_t); + assert(ret < NODE_BLOCK_SIZE); + return ret; + } + node_offset_t size_logical() const { + assert(type() == Type::STR); + assert(is_valid_size(length)); + return length; + } + node_offset_t size_overhead() const { + assert(type() == Type::STR); + return sizeof(string_size_t); + } + + std::string_view to_string_view() const { + assert(type() == Type::STR); + assert(is_valid_size(length)); + return {p_key, length}; + } + bool operator==(const string_key_view_t& x) const { + if (type() == x.type() && type() != Type::STR) + return true; + if (type() != x.type()) + return false; + if (length != x.length) + return false; + return (memcmp(p_key, x.p_key, length) == 0); + } + bool operator!=(const string_key_view_t& x) const { return !(*this == x); } + + static void append_str( + 
NodeExtentMutable&, std::string_view, char*& p_append); + + static void test_append_str(std::string_view str, char*& p_append) { + assert(is_valid_size(str.length())); + p_append -= sizeof(string_size_t); + string_size_t len = str.length(); + std::memcpy(p_append, &len, sizeof(string_size_t)); + p_append -= len; + std::memcpy(p_append, str.data(), len); + } + + static void append_dedup( + NodeExtentMutable&, const Type& dedup_type, char*& p_append); + + static void test_append_dedup(const Type& dedup_type, char*& p_append) { + p_append -= sizeof(string_size_t); + string_size_t len; + if (dedup_type == Type::MIN) { + len = MIN; + } else if (dedup_type == Type::MAX) { + len = MAX; + } else { + ceph_abort("impossible path"); + } + std::memcpy(p_append, &len, sizeof(string_size_t)); + } + + const char* p_key; + const char* p_length; + // TODO: remove if p_length is aligned + string_size_t length; +}; + +/** + * string_view_masked_t + * + * A common class to hide the underlying string implementation regardless of a + * string_key_view_t (maybe compressed), a string/string_view, or a compressed + * string. And leverage this consistant class to do compare, print, convert and + * append operations. + */ +class string_view_masked_t { + public: + using string_size_t = string_key_view_t::string_size_t; + using Type = string_key_view_t::Type; + explicit string_view_masked_t(const string_key_view_t& index) + : type{index.type()} { + if (type == Type::STR) { + view = index.to_string_view(); + } + } + explicit string_view_masked_t(std::string_view str) + : type{Type::STR}, view{str} { + assert(string_key_view_t::is_valid_size(view.size())); + } + + Type get_type() const { return type; } + std::string_view to_string_view() const { + assert(get_type() == Type::STR); + return view; + } + string_size_t size() const { + assert(get_type() == Type::STR); + assert(string_key_view_t::is_valid_size(view.size())); + return view.size(); + } + bool operator==(const string_view_masked_t& x) const { + if (get_type() == x.get_type() && get_type() != Type::STR) + return true; + if (get_type() != x.get_type()) + return false; + if (size() != x.size()) + return false; + return (memcmp(view.data(), x.view.data(), size()) == 0); + } + bool operator!=(const string_view_masked_t& x) const { return !(*this == x); } + void encode(ceph::bufferlist& bl) const { + if (get_type() == Type::MIN) { + ceph::encode(string_key_view_t::MIN, bl); + } else if (get_type() == Type::MAX) { + ceph::encode(string_key_view_t::MAX, bl); + } else { + ceph::encode(size(), bl); + ceph::encode_nohead(view, bl); + } + } + static auto min() { return string_view_masked_t{Type::MIN}; } + static auto max() { return string_view_masked_t{Type::MAX}; } + static string_view_masked_t decode( + std::string& str_storage, ceph::bufferlist::const_iterator& delta) { + string_size_t size; + ceph::decode(size, delta); + if (size == string_key_view_t::MIN) { + return min(); + } else if (size == string_key_view_t::MAX) { + return max(); + } else { + ceph::decode_nohead(size, str_storage, delta); + return string_view_masked_t(str_storage); + } + } + + private: + explicit string_view_masked_t(Type type) + : type{type} {} + + Type type; + std::string_view view; +}; +inline MatchKindCMP compare_to(const string_view_masked_t& l, const string_view_masked_t& r) { + using Type = string_view_masked_t::Type; + auto l_type = l.get_type(); + auto r_type = r.get_type(); + if (l_type == Type::STR && r_type == Type::STR) { + assert(l.size() && r.size()); + return 
toMatchKindCMP(l.to_string_view(), r.to_string_view()); + } else if (l_type == r_type) { + return MatchKindCMP::EQ; + } else if (l_type == Type::MIN || r_type == Type::MAX) { + return MatchKindCMP::LT; + } else { // l_type == Type::MAX || r_type == Type::MIN + return MatchKindCMP::GT; + } +} +inline MatchKindCMP compare_to(std::string_view l, const string_view_masked_t& r) { + using Type = string_view_masked_t::Type; + assert(l.length()); + auto r_type = r.get_type(); + if (r_type == Type::MIN) { + return MatchKindCMP::GT; + } else if (r_type == Type::MAX) { + return MatchKindCMP::LT; + } else { // r_type == Type::STR + assert(r.size()); + return toMatchKindCMP(l, r.to_string_view()); + } +} +inline MatchKindCMP compare_to(const string_view_masked_t& l, std::string_view r) { + return reverse(compare_to(r, l)); +} +inline std::ostream& operator<<(std::ostream& os, const string_view_masked_t& masked) { + using Type = string_view_masked_t::Type; + auto type = masked.get_type(); + if (type == Type::MIN) { + return os << "MIN"; + } else if (type == Type::MAX) { + return os << "MAX"; + } else { // type == Type::STR + auto view = masked.to_string_view(); + if (view.length() <= 12) { + os << "\"" << view << "\""; + } else { + os << "\"" << std::string_view(view.data(), 4) << ".." + << std::string_view(view.data() + view.length() - 2, 2) + << "/" << view.length() << "B\""; + } + return os; + } +} + +struct ns_oid_view_t { + using string_size_t = string_key_view_t::string_size_t; + using Type = string_key_view_t::Type; + + ns_oid_view_t(const char* p_end) : nspace(p_end), oid(nspace.p_next_end()) {} + Type type() const { return oid.type(); } + const char* p_start() const { return oid.p_start(); } + node_offset_t size() const { + if (type() == Type::STR) { + size_t ret = nspace.size() + oid.size(); + assert(ret < NODE_BLOCK_SIZE); + return ret; + } else { + return sizeof(string_size_t); + } + } + node_offset_t size_logical() const { + assert(type() == Type::STR); + return nspace.size_logical() + oid.size_logical(); + } + node_offset_t size_overhead() const { + assert(type() == Type::STR); + return nspace.size_overhead() + oid.size_overhead(); + } + bool operator==(const ns_oid_view_t& x) const { + return (string_view_masked_t{nspace} == string_view_masked_t{x.nspace} && + string_view_masked_t{oid} == string_view_masked_t{x.oid}); + } + bool operator!=(const ns_oid_view_t& x) const { return !(*this == x); } + + template <KeyT KT> + static node_offset_t estimate_size(const full_key_t<KT>& key); + + template <KeyT KT> + static void append(NodeExtentMutable&, + const full_key_t<KT>& key, + char*& p_append); + + static void append(NodeExtentMutable& mut, + const ns_oid_view_t& view, + char*& p_append) { + if (view.type() == Type::STR) { + string_key_view_t::append_str(mut, view.nspace.to_string_view(), p_append); + string_key_view_t::append_str(mut, view.oid.to_string_view(), p_append); + } else { + string_key_view_t::append_dedup(mut, view.type(), p_append); + } + } + + template <KeyT KT> + static void test_append(const full_key_t<KT>& key, char*& p_append); + + string_key_view_t nspace; + string_key_view_t oid; +}; +inline std::ostream& operator<<(std::ostream& os, const ns_oid_view_t& ns_oid) { + return os << string_view_masked_t{ns_oid.nspace} << "," + << string_view_masked_t{ns_oid.oid}; +} +inline MatchKindCMP compare_to(const ns_oid_view_t& l, const ns_oid_view_t& r) { + auto ret = compare_to(string_view_masked_t{l.nspace}, + string_view_masked_t{r.nspace}); + if (ret != MatchKindCMP::EQ) + return 
ret; + return compare_to(string_view_masked_t{l.oid}, + string_view_masked_t{r.oid}); +} + +/** + * key_hobj_t + * + * A specialized implementation of a full_key_t storing a ghobject_t passed + * from user. + */ +class key_hobj_t { + public: + explicit key_hobj_t(const ghobject_t& ghobj) : ghobj{ghobj} {} + /* + * common interfaces as a full_key_t + */ + shard_t shard() const { + return ghobj.shard_id; + } + pool_t pool() const { + return ghobj.hobj.pool; + } + crush_hash_t crush() const { + return ghobj.hobj.get_hash(); + } + std::string_view nspace() const { + // TODO(cross-node string dedup) + return ghobj.hobj.nspace; + } + string_view_masked_t nspace_masked() const { + // TODO(cross-node string dedup) + return string_view_masked_t{nspace()}; + } + std::string_view oid() const { + // TODO(cross-node string dedup) + return ghobj.hobj.oid.name; + } + string_view_masked_t oid_masked() const { + // TODO(cross-node string dedup) + return string_view_masked_t{oid()}; + } + ns_oid_view_t::Type dedup_type() const { + return _dedup_type; + } + snap_t snap() const { + return ghobj.hobj.snap; + } + gen_t gen() const { + return ghobj.generation; + } + + bool operator==(const full_key_t<KeyT::VIEW>& o) const; + bool operator==(const full_key_t<KeyT::HOBJ>& o) const; + bool operator!=(const full_key_t<KeyT::VIEW>& o) const { + return !operator==(o); + } + bool operator!=(const full_key_t<KeyT::HOBJ>& o) const { + return !operator==(o); + } + + std::ostream& dump(std::ostream& os) const { + os << "key_hobj(" << (unsigned)shard() << "," + << pool() << "," << crush() << "; " + << string_view_masked_t{nspace()} << "," + << string_view_masked_t{oid()} << "; " + << snap() << "," << gen() << ")"; + return os; + } + + static key_hobj_t decode(ceph::bufferlist::const_iterator& delta) { + shard_t shard; + ceph::decode(shard, delta); + pool_t pool; + ceph::decode(pool, delta); + crush_hash_t crush; + ceph::decode(crush, delta); + std::string nspace; + auto nspace_masked = string_view_masked_t::decode(nspace, delta); + // TODO(cross-node string dedup) + assert(nspace_masked.get_type() == string_view_masked_t::Type::STR); + std::string oid; + auto oid_masked = string_view_masked_t::decode(oid, delta); + // TODO(cross-node string dedup) + assert(oid_masked.get_type() == string_view_masked_t::Type::STR); + snap_t snap; + ceph::decode(snap, delta); + gen_t gen; + ceph::decode(gen, delta); + return key_hobj_t(ghobject_t( + shard_id_t(shard), pool, crush, nspace, oid, snap, gen)); + } + + private: + ns_oid_view_t::Type _dedup_type = ns_oid_view_t::Type::STR; + ghobject_t ghobj; +}; +inline std::ostream& operator<<(std::ostream& os, const key_hobj_t& key) { + return key.dump(os); +} + +/** + * key_view_t + * + * A specialized implementation of a full_key_t pointing to the locations + * storing the full key in a tree node. 
+ */ +class key_view_t { + public: + /** + * common interfaces as a full_key_t + */ + shard_t shard() const { + return shard_pool_packed().shard; + } + pool_t pool() const { + return shard_pool_packed().pool; + } + crush_hash_t crush() const { + return crush_packed().crush; + } + std::string_view nspace() const { + // TODO(cross-node string dedup) + return ns_oid_view().nspace.to_string_view(); + } + string_view_masked_t nspace_masked() const { + // TODO(cross-node string dedup) + return string_view_masked_t{ns_oid_view().nspace}; + } + std::string_view oid() const { + // TODO(cross-node string dedup) + return ns_oid_view().oid.to_string_view(); + } + string_view_masked_t oid_masked() const { + // TODO(cross-node string dedup) + return string_view_masked_t{ns_oid_view().oid}; + } + ns_oid_view_t::Type dedup_type() const { + return ns_oid_view().type(); + } + snap_t snap() const { + return snap_gen_packed().snap; + } + gen_t gen() const { + return snap_gen_packed().gen; + } + + bool operator==(const full_key_t<KeyT::VIEW>& o) const; + bool operator==(const full_key_t<KeyT::HOBJ>& o) const; + bool operator!=(const full_key_t<KeyT::VIEW>& o) const { + return !operator==(o); + } + bool operator!=(const full_key_t<KeyT::HOBJ>& o) const { + return !operator==(o); + } + + /** + * key_view_t specific interfaces + */ + bool has_shard_pool() const { + return p_shard_pool != nullptr; + } + bool has_crush() const { + return p_crush != nullptr; + } + bool has_ns_oid() const { + return p_ns_oid.has_value(); + } + bool has_snap_gen() const { + return p_snap_gen != nullptr; + } + + const shard_pool_t& shard_pool_packed() const { + assert(has_shard_pool()); + return *p_shard_pool; + } + const crush_t& crush_packed() const { + assert(has_crush()); + return *p_crush; + } + const ns_oid_view_t& ns_oid_view() const { + assert(has_ns_oid()); + return *p_ns_oid; + } + const snap_gen_t& snap_gen_packed() const { + assert(has_snap_gen()); + return *p_snap_gen; + } + + size_t size_logical() const { + return sizeof(shard_t) + sizeof(pool_t) + sizeof(crush_hash_t) + + sizeof(snap_t) + sizeof(gen_t) + ns_oid_view().size_logical(); + } + + ghobject_t to_ghobj() const { + return ghobject_t( + shard_id_t(shard()), pool(), crush(), + std::string(nspace()), std::string(oid()), snap(), gen()); + } + + void replace(const crush_t& key) { p_crush = &key; } + void set(const crush_t& key) { + assert(!has_crush()); + replace(key); + } + void replace(const shard_pool_crush_t& key) { p_shard_pool = &key.shard_pool; } + void set(const shard_pool_crush_t& key) { + set(key.crush); + assert(!has_shard_pool()); + replace(key); + } + void replace(const ns_oid_view_t& key) { p_ns_oid = key; } + void set(const ns_oid_view_t& key) { + assert(!has_ns_oid()); + replace(key); + } + void replace(const snap_gen_t& key) { p_snap_gen = &key; } + void set(const snap_gen_t& key) { + assert(!has_snap_gen()); + replace(key); + } + + std::ostream& dump(std::ostream& os) const { + os << "key_view("; + if (has_shard_pool()) { + os << (unsigned)shard() << "," << pool() << ","; + } else { + os << "X,X,"; + } + if (has_crush()) { + os << crush() << "; "; + } else { + os << "X; "; + } + if (has_ns_oid()) { + os << ns_oid_view() << "; "; + } else { + os << "X,X; "; + } + if (has_snap_gen()) { + os << snap() << "," << gen() << ")"; + } else { + os << "X,X)"; + } + return os; + } + + private: + const shard_pool_t* p_shard_pool = nullptr; + const crush_t* p_crush = nullptr; + std::optional<ns_oid_view_t> p_ns_oid; + const snap_gen_t* p_snap_gen = nullptr; +}; + 
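+// A minimal illustrative sketch (hypothetical helper, not used by the tree
+// code): a lookup key is a full_key_t<KeyT::HOBJ> wrapping a ghobject_t, and
+// the fixed-size key segments defined above are derived from it stage by
+// stage.  The ns/oid strings are handled separately through ns_oid_view_t /
+// string_view_masked_t, so the MIN/MAX markers sort below/above any concrete
+// string.
+inline void example_decompose_key(const ghobject_t& ghobj) {
+  key_hobj_t key{ghobj};
+  [[maybe_unused]] auto sp = shard_pool_t::from_key<KeyT::HOBJ>(key); // shard, pool
+  [[maybe_unused]] auto c  = crush_t::from_key<KeyT::HOBJ>(key);      // crush hash
+  [[maybe_unused]] auto sg = snap_gen_t::from_key<KeyT::HOBJ>(key);   // snap, gen
+  // estimated on-disk size of the variable-size ns/oid part of this key
+  [[maybe_unused]] auto sz = ns_oid_view_t::estimate_size<KeyT::HOBJ>(key);
+}
+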
+template <KeyT KT> +void encode_key(const full_key_t<KT>& key, ceph::bufferlist& bl) { + ceph::encode(key.shard(), bl); + ceph::encode(key.pool(), bl); + ceph::encode(key.crush(), bl); + key.nspace_masked().encode(bl); + key.oid_masked().encode(bl); + ceph::encode(key.snap(), bl); + ceph::encode(key.gen(), bl); +} + +inline MatchKindCMP compare_to(std::string_view l, std::string_view r) { + return toMatchKindCMP(l, r); +} +template <KeyT TypeL, KeyT TypeR> +bool compare_full_key(const full_key_t<TypeL>& l, const full_key_t<TypeR>& r) { + if (l.shard() != r.shard()) + return false; + if (l.pool() != r.pool()) + return false; + if (l.crush() != r.crush()) + return false; + if (compare_to(l.nspace(), r.nspace()) != MatchKindCMP::EQ) + return false; + if (compare_to(l.oid(), r.oid()) != MatchKindCMP::EQ) + return false; + if (l.snap() != r.snap()) + return false; + if (l.gen() != r.gen()) + return false; + return true; +} + +inline bool key_hobj_t::operator==(const full_key_t<KeyT::VIEW>& o) const { + return compare_full_key<KeyT::HOBJ, KeyT::VIEW>(*this, o); +} +inline bool key_hobj_t::operator==(const full_key_t<KeyT::HOBJ>& o) const { + return compare_full_key<KeyT::HOBJ, KeyT::HOBJ>(*this, o); +} +inline bool key_view_t::operator==(const full_key_t<KeyT::VIEW>& o) const { + return compare_full_key<KeyT::VIEW, KeyT::VIEW>(*this, o); +} +inline bool key_view_t::operator==(const full_key_t<KeyT::HOBJ>& o) const { + return compare_full_key<KeyT::VIEW, KeyT::HOBJ>(*this, o); +} + +inline std::ostream& operator<<(std::ostream& os, const key_view_t& key) { + return key.dump(os); +} + +template <KeyT Type> +MatchKindCMP compare_to(const full_key_t<Type>& key, const shard_pool_t& target) { + auto ret = toMatchKindCMP(key.shard(), target.shard); + if (ret != MatchKindCMP::EQ) + return ret; + return toMatchKindCMP(key.pool(), target.pool); +} + +template <KeyT Type> +MatchKindCMP compare_to(const full_key_t<Type>& key, const crush_t& target) { + return toMatchKindCMP(key.crush(), target.crush); +} + +template <KeyT Type> +MatchKindCMP compare_to(const full_key_t<Type>& key, const shard_pool_crush_t& target) { + auto ret = compare_to<Type>(key, target.shard_pool); + if (ret != MatchKindCMP::EQ) + return ret; + return compare_to<Type>(key, target.crush); +} + +template <KeyT Type> +MatchKindCMP compare_to(const full_key_t<Type>& key, const ns_oid_view_t& target) { + auto ret = compare_to(key.nspace(), string_view_masked_t{target.nspace}); + if (ret != MatchKindCMP::EQ) + return ret; + return compare_to(key.oid(), string_view_masked_t{target.oid}); +} + +template <KeyT Type> +MatchKindCMP compare_to(const full_key_t<Type>& key, const snap_gen_t& target) { + auto ret = toMatchKindCMP(key.snap(), target.snap); + if (ret != MatchKindCMP::EQ) + return ret; + return toMatchKindCMP(key.gen(), target.gen); +} + +template <KeyT KT> +shard_pool_t shard_pool_t::from_key(const full_key_t<KT>& key) { + if constexpr (KT == KeyT::VIEW) { + return key.shard_pool_packed(); + } else { + return {key.shard(), key.pool()}; + } +} + +template <KeyT KT> +crush_t crush_t::from_key(const full_key_t<KT>& key) { + if constexpr (KT == KeyT::VIEW) { + return key.crush_packed(); + } else { + return {key.crush()}; + } +} + +template <KeyT KT> +shard_pool_crush_t shard_pool_crush_t::from_key(const full_key_t<KT>& key) { + return {shard_pool_t::from_key<KT>(key), crush_t::from_key<KT>(key)}; +} + +template <KeyT KT> +snap_gen_t snap_gen_t::from_key(const full_key_t<KT>& key) { + if constexpr (KT == KeyT::VIEW) { + return 
key.snap_gen_packed(); + } else { + return {key.snap(), key.gen()}; + } +} + +template <KeyT KT> +node_offset_t ns_oid_view_t::estimate_size(const full_key_t<KT>& key) { + if constexpr (KT == KeyT::VIEW) { + return key.ns_oid_view().size(); + } else { + if (key.dedup_type() != Type::STR) { + // size after deduplication + return sizeof(string_size_t); + } else { + return 2 * sizeof(string_size_t) + key.nspace().size() + key.oid().size(); + } + } +} + +template <KeyT KT> +void ns_oid_view_t::append( + NodeExtentMutable& mut, const full_key_t<KT>& key, char*& p_append) { + if (key.dedup_type() == Type::STR) { + string_key_view_t::append_str(mut, key.nspace(), p_append); + string_key_view_t::append_str(mut, key.oid(), p_append); + } else { + string_key_view_t::append_dedup(mut, key.dedup_type(), p_append); + } +} + +template <KeyT KT> +void ns_oid_view_t::test_append(const full_key_t<KT>& key, char*& p_append) { + if (key.dedup_type() == Type::STR) { + string_key_view_t::test_append_str(key.nspace(), p_append); + string_key_view_t::test_append_str(key.oid(), p_append); + } else { + string_key_view_t::test_append_dedup(key.dedup_type(), p_append); + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc new file mode 100644 index 000000000..4a5988185 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc @@ -0,0 +1,318 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "node_stage.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h" +#include "node_stage_layout.h" + +namespace crimson::os::seastore::onode { + +#define NODE_T node_extent_t<FieldType, NODE_TYPE> +#define NODE_INST(FT, NT) node_extent_t<FT, NT> + +template <typename FieldType, node_type_t NODE_TYPE> +const char* NODE_T::p_left_bound() const { + if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) { + // N3 internal node doesn't have the right part + return nullptr; + } else { + auto ret = p_start() + fields().get_item_end_offset(keys()); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + if (is_level_tail()) { + ret -= sizeof(laddr_t); + } + } + return ret; + } +} + +template <typename FieldType, node_type_t NODE_TYPE> +node_offset_t NODE_T::size_to_nxt_at(index_t index) const { + assert(index < keys()); + if constexpr (FIELD_TYPE == field_type_t::N0 || + FIELD_TYPE == field_type_t::N1) { + return FieldType::estimate_insert_one(); + } else if constexpr (FIELD_TYPE == field_type_t::N2) { + auto p_end = p_start() + p_fields->get_item_end_offset(index); + return FieldType::estimate_insert_one() + ns_oid_view_t(p_end).size(); + } else { + ceph_abort("N3 node is not nested"); + } +} + +template <typename FieldType, node_type_t NODE_TYPE> +memory_range_t NODE_T::get_nxt_container(index_t index) const { + if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) { + ceph_abort("N3 internal node doesn't have the right part"); + } else { + node_offset_t item_start_offset = p_fields->get_item_start_offset(index); + node_offset_t item_end_offset = p_fields->get_item_end_offset(index); + assert(item_start_offset < item_end_offset); + auto item_p_start = p_start() + item_start_offset; + auto item_p_end = p_start() + item_end_offset; + if constexpr (FIELD_TYPE == field_type_t::N2) { + // range for sub_items_t<NODE_TYPE> + item_p_end = ns_oid_view_t(item_p_end).p_start(); + 
assert(item_p_start < item_p_end); + } else { + // range for item_iterator_t<NODE_TYPE> + } + return {item_p_start, item_p_end}; + } +} + +template <typename FieldType, node_type_t NODE_TYPE> +void NODE_T::bootstrap_extent( + NodeExtentMutable& mut, + field_type_t field_type, node_type_t node_type, + bool is_level_tail, level_t level) { + node_header_t::bootstrap_extent( + mut, field_type, node_type, is_level_tail, level); + mut.copy_in_relative( + sizeof(node_header_t), typename FieldType::num_keys_t(0u)); +} + +template <typename FieldType, node_type_t NODE_TYPE> +void NODE_T::update_is_level_tail( + NodeExtentMutable& mut, const node_extent_t& extent, bool value) { + node_header_t::update_is_level_tail(mut, extent.p_fields->header, value); +} + +template <typename FieldType, node_type_t NODE_TYPE> +template <KeyT KT> +memory_range_t NODE_T::insert_prefix_at( + NodeExtentMutable& mut, const node_extent_t& node, const full_key_t<KT>& key, + index_t index, node_offset_t size, const char* p_left_bound) { + if constexpr (FIELD_TYPE == field_type_t::N0 || + FIELD_TYPE == field_type_t::N1) { + assert(index <= node.keys()); + assert(p_left_bound == node.p_left_bound()); + assert(size > FieldType::estimate_insert_one()); + auto size_right = size - FieldType::estimate_insert_one(); + const char* p_insert = node.p_start() + node.fields().get_item_end_offset(index); + const char* p_insert_front = p_insert - size_right; + FieldType::template insert_at<KT>(mut, key, node.fields(), index, size_right); + mut.shift_absolute(p_left_bound, + p_insert - p_left_bound, + -(int)size_right); + return {p_insert_front, p_insert}; + } else if constexpr (FIELD_TYPE == field_type_t::N2) { + ceph_abort("not implemented"); + } else { + ceph_abort("impossible"); + } +} +#define IPA_TEMPLATE(FT, NT, KT) \ + template memory_range_t NODE_INST(FT, NT)::insert_prefix_at<KT>( \ + NodeExtentMutable&, const node_extent_t&, const full_key_t<KT>&, \ + index_t, node_offset_t, const char*) +IPA_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::VIEW); +IPA_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::VIEW); +IPA_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::VIEW); +IPA_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::VIEW); +IPA_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::VIEW); +IPA_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::VIEW); +IPA_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::HOBJ); +IPA_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::HOBJ); +IPA_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::HOBJ); +IPA_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::HOBJ); +IPA_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::HOBJ); +IPA_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::HOBJ); + +template <typename FieldType, node_type_t NODE_TYPE> +void NODE_T::update_size_at( + NodeExtentMutable& mut, const node_extent_t& node, index_t index, int change) { + assert(index < node.keys()); + FieldType::update_size_at(mut, node.fields(), index, change); +} + +template <typename FieldType, node_type_t NODE_TYPE> +node_offset_t NODE_T::trim_until( + NodeExtentMutable& mut, const node_extent_t& node, index_t index) { + assert(!node.is_level_tail()); + auto keys = node.keys(); + assert(index <= keys); + if (index == keys) { + return 0; + } + if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) { + ceph_abort("not implemented"); + } else { + mut.copy_in_absolute( + (void*)&node.p_fields->num_keys, num_keys_t(index)); + } + // no need to calculate trim size for node + 
return 0; +} + +template <typename FieldType, node_type_t NODE_TYPE> +node_offset_t NODE_T::trim_at( + NodeExtentMutable& mut, const node_extent_t& node, + index_t index, node_offset_t trimmed) { + assert(!node.is_level_tail()); + assert(index < node.keys()); + if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) { + ceph_abort("not implemented"); + } else { + node_offset_t offset = node.p_fields->get_item_start_offset(index); + size_t new_offset = offset + trimmed; + assert(new_offset < node.p_fields->get_item_end_offset(index)); + mut.copy_in_absolute(const_cast<void*>(node.p_fields->p_offset(index)), + node_offset_t(new_offset)); + mut.copy_in_absolute( + (void*)&node.p_fields->num_keys, num_keys_t(index + 1)); + } + // no need to calculate trim size for node + return 0; +} + +#define NODE_TEMPLATE(FT, NT) template class NODE_INST(FT, NT) +NODE_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL); +NODE_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL); +NODE_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL); +NODE_TEMPLATE(internal_fields_3_t, node_type_t::INTERNAL); +NODE_TEMPLATE(node_fields_0_t, node_type_t::LEAF); +NODE_TEMPLATE(node_fields_1_t, node_type_t::LEAF); +NODE_TEMPLATE(node_fields_2_t, node_type_t::LEAF); +NODE_TEMPLATE(leaf_fields_3_t, node_type_t::LEAF); + +#define APPEND_T node_extent_t<FieldType, NODE_TYPE>::Appender<KT> + +template <typename FieldType, node_type_t NODE_TYPE> +template <KeyT KT> +void APPEND_T::append(const node_extent_t& src, index_t from, index_t items) { + assert(from <= src.keys()); + if (p_src == nullptr) { + p_src = &src; + } else { + assert(p_src == &src); + } + if (items == 0) { + return; + } + assert(from < src.keys()); + assert(from + items <= src.keys()); + num_keys += items; + if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) { + ceph_abort("impossible path"); + } else { + // append left part forwards + node_offset_t offset_left_start = src.fields().get_key_start_offset(from); + node_offset_t offset_left_end = src.fields().get_key_start_offset(from + items); + node_offset_t left_size = offset_left_end - offset_left_start; + if (num_keys == 0) { + // no need to adjust offset + assert(from == 0); + assert(p_start + offset_left_start == p_append_left); + p_mut->copy_in_absolute(p_append_left, + src.p_start() + offset_left_start, left_size); + } else { + node_offset_t step_size = FieldType::estimate_insert_one(); + node_offset_t offset_base = src.fields().get_item_end_offset(from); + int offset_change = p_append_right - p_start - offset_base; + auto p_offset_dst = p_append_left; + if constexpr (FIELD_TYPE != field_type_t::N2) { + // copy keys + p_mut->copy_in_absolute(p_append_left, + src.p_start() + offset_left_start, left_size); + // point to offset for update + p_offset_dst += sizeof(typename FieldType::key_t); + } + for (auto i = from; i < from + items; ++i) { + p_mut->copy_in_absolute(p_offset_dst, + node_offset_t(src.fields().get_item_start_offset(i) + offset_change)); + p_offset_dst += step_size; + } + assert(p_append_left + left_size + sizeof(typename FieldType::key_t) == + p_offset_dst); + } + p_append_left += left_size; + + // append right part backwards + node_offset_t offset_right_start = src.fields().get_item_end_offset(from + items); + node_offset_t offset_right_end = src.fields().get_item_end_offset(from); + node_offset_t right_size = offset_right_end - offset_right_start; + p_append_right -= right_size; + p_mut->copy_in_absolute(p_append_right, + src.p_start() + offset_right_start, right_size); + } +} + 
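+// Note on the append above: the fixed-size slot area (keys and/or right
+// offsets) is copied forward at p_append_left, while the variable-size items
+// it references are copied as one block growing downward from
+// p_append_right.  Since that block lands at a different absolute position
+// in the destination extent, every copied item offset is rebased by
+//   offset_change = (p_append_right - p_start) - src.get_item_end_offset(from)
+// e.g. (illustrative numbers) an item starting at source offset 3900, with a
+// source boundary of 4000 and a destination boundary of 3800, is re-pointed
+// to 3900 + (3800 - 4000) = 3700.
+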
+template <typename FieldType, node_type_t NODE_TYPE> +template <KeyT KT> +void APPEND_T::append( + const full_key_t<KT>& key, const value_t& value, const value_t*& p_value) { + if constexpr (FIELD_TYPE == field_type_t::N3) { + ceph_abort("not implemented"); + } else { + ceph_abort("should not happen"); + } +} + +template <typename FieldType, node_type_t NODE_TYPE> +template <KeyT KT> +std::tuple<NodeExtentMutable*, char*> +APPEND_T::open_nxt(const key_get_type& partial_key) { + if constexpr (FIELD_TYPE == field_type_t::N0 || + FIELD_TYPE == field_type_t::N1) { + FieldType::append_key(*p_mut, partial_key, p_append_left); + } else if constexpr (FIELD_TYPE == field_type_t::N2) { + FieldType::append_key(*p_mut, partial_key, p_append_right); + } else { + ceph_abort("impossible path"); + } + return {p_mut, p_append_right}; +} + +template <typename FieldType, node_type_t NODE_TYPE> +template <KeyT KT> +std::tuple<NodeExtentMutable*, char*> +APPEND_T::open_nxt(const full_key_t<KT>& key) { + if constexpr (FIELD_TYPE == field_type_t::N0 || + FIELD_TYPE == field_type_t::N1) { + FieldType::template append_key<KT>(*p_mut, key, p_append_left); + } else if constexpr (FIELD_TYPE == field_type_t::N2) { + FieldType::template append_key<KT>(*p_mut, key, p_append_right); + } else { + ceph_abort("impossible path"); + } + return {p_mut, p_append_right}; +} + +template <typename FieldType, node_type_t NODE_TYPE> +template <KeyT KT> +char* APPEND_T::wrap() { + assert(p_append_left <= p_append_right); + assert(p_src); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + if (p_src->is_level_tail()) { + laddr_t tail_value = p_src->get_end_p_laddr()->value; + p_append_right -= sizeof(laddr_t); + assert(p_append_left <= p_append_right); + p_mut->copy_in_absolute(p_append_right, tail_value); + } + } + p_mut->copy_in_absolute(p_start + offsetof(FieldType, num_keys), num_keys); + return p_append_left; +} + +#define APPEND_TEMPLATE(FT, NT, KT) template class node_extent_t<FT, NT>::Appender<KT> +APPEND_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::VIEW); +APPEND_TEMPLATE(internal_fields_3_t, node_type_t::INTERNAL, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::VIEW); +APPEND_TEMPLATE(leaf_fields_3_t, node_type_t::LEAF, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::HOBJ); +APPEND_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::HOBJ); +APPEND_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::HOBJ); +APPEND_TEMPLATE(internal_fields_3_t, node_type_t::INTERNAL, KeyT::HOBJ); +APPEND_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::HOBJ); +APPEND_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::HOBJ); +APPEND_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::HOBJ); +APPEND_TEMPLATE(leaf_fields_3_t, node_type_t::LEAF, KeyT::HOBJ); + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h new file mode 100644 index 000000000..cf0ca463c --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h @@ -0,0 +1,226 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include 
"crimson/os/seastore/onode_manager/staged-fltree/node_types.h" +#include "key_layout.h" +#include "stage_types.h" + +namespace crimson::os::seastore::onode { + +class NodeExtentMutable; + +/** + * node_extent_t + * + * The top indexing stage implementation for node N0/N1/N2/N3, implements + * staged contract as an indexable container, and provides access to node + * header. + * + * The specific field layout are defined by FieldType which are + * node_fields_0_t, node_fields_1_t, node_fields_2_t, internal_fields_3_t and + * leaf_fields_3_t. Diagrams see node_stage_layout.h. + */ +template <typename FieldType, node_type_t _NODE_TYPE> +class node_extent_t { + public: + using value_t = value_type_t<_NODE_TYPE>; + using num_keys_t = typename FieldType::num_keys_t; + static constexpr node_type_t NODE_TYPE = _NODE_TYPE; + static constexpr field_type_t FIELD_TYPE = FieldType::FIELD_TYPE; + static constexpr node_offset_t EXTENT_SIZE = + (FieldType::SIZE + DISK_BLOCK_SIZE - 1u) / DISK_BLOCK_SIZE * DISK_BLOCK_SIZE; + + // TODO: remove + node_extent_t() = default; + + node_extent_t(const FieldType* p_fields) : p_fields{p_fields} { + validate(*p_fields); + } + + const char* p_start() const { return fields_start(*p_fields); } + + const char* off_to_ptr(node_offset_t off) const { + assert(off <= FieldType::SIZE); + return p_start() + off; + } + + node_offset_t ptr_to_off(const void* ptr) const { + auto _ptr = static_cast<const char*>(ptr); + assert(_ptr >= p_start()); + auto off = _ptr - p_start(); + assert(off <= FieldType::SIZE); + return off; + } + + bool is_level_tail() const { return p_fields->is_level_tail(); } + level_t level() const { return p_fields->header.level; } + node_offset_t free_size() const { + return p_fields->template free_size_before<NODE_TYPE>(keys()); + } + node_offset_t total_size() const { return p_fields->total_size(); } + const char* p_left_bound() const; + template <node_type_t T = NODE_TYPE> + std::enable_if_t<T == node_type_t::INTERNAL, const laddr_packed_t*> + get_end_p_laddr() const { + assert(is_level_tail()); + if constexpr (FIELD_TYPE == field_type_t::N3) { + return &p_fields->child_addrs[keys()]; + } else { + auto offset_start = p_fields->get_item_end_offset(keys()); + assert(offset_start <= FieldType::SIZE); + offset_start -= sizeof(laddr_packed_t); + auto p_addr = p_start() + offset_start; + return reinterpret_cast<const laddr_packed_t*>(p_addr); + } + } + + // container type system + using key_get_type = typename FieldType::key_get_type; + static constexpr auto CONTAINER_TYPE = ContainerType::INDEXABLE; + index_t keys() const { return p_fields->num_keys; } + key_get_type operator[] (index_t index) const { return p_fields->get_key(index); } + node_offset_t size_before(index_t index) const { + auto free_size = p_fields->template free_size_before<NODE_TYPE>(index); + assert(total_size() >= free_size); + return total_size() - free_size; + } + node_offset_t size_to_nxt_at(index_t index) const; + node_offset_t size_overhead_at(index_t index) const { + return FieldType::ITEM_OVERHEAD; } + memory_range_t get_nxt_container(index_t index) const; + + template <typename T = FieldType> + std::enable_if_t<T::FIELD_TYPE == field_type_t::N3, const value_t*> + get_p_value(index_t index) const { + assert(index < keys()); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + return &p_fields->child_addrs[index]; + } else { + auto range = get_nxt_container(index); + auto ret = reinterpret_cast<const onode_t*>(range.p_start); + assert(range.p_start + ret->size == range.p_end); + 
return ret; + } + } + + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + assert(p_node_start == p_start()); + // nothing to encode as the container range is the entire extent + } + + static node_extent_t decode(const char* p_node_start, + ceph::bufferlist::const_iterator& delta) { + // nothing to decode + return node_extent_t(reinterpret_cast<const FieldType*>(p_node_start)); + } + + static void validate(const FieldType& fields) { +#ifndef NDEBUG + assert(fields.header.get_node_type() == NODE_TYPE); + assert(fields.header.get_field_type() == FieldType::FIELD_TYPE); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + assert(fields.header.level > 0u); + } else { + assert(fields.header.level == 0u); + } +#endif + } + + static void bootstrap_extent( + NodeExtentMutable&, field_type_t, node_type_t, bool, level_t); + + static void update_is_level_tail(NodeExtentMutable&, const node_extent_t&, bool); + + static node_offset_t header_size() { return FieldType::HEADER_SIZE; } + + template <KeyT KT> + static node_offset_t estimate_insert( + const full_key_t<KT>& key, const value_t& value) { + auto size = FieldType::estimate_insert_one(); + if constexpr (FIELD_TYPE == field_type_t::N2) { + size += ns_oid_view_t::estimate_size<KT>(key); + } else if constexpr (FIELD_TYPE == field_type_t::N3 && + NODE_TYPE == node_type_t::LEAF) { + size += value.size; + } + return size; + } + + template <KeyT KT> + static const value_t* insert_at( + NodeExtentMutable& mut, const node_extent_t&, + const full_key_t<KT>& key, const value_t& value, + index_t index, node_offset_t size, const char* p_left_bound) { + if constexpr (FIELD_TYPE == field_type_t::N3) { + ceph_abort("not implemented"); + } else { + ceph_abort("impossible"); + } + } + + template <KeyT KT> + static memory_range_t insert_prefix_at( + NodeExtentMutable&, const node_extent_t&, + const full_key_t<KT>& key, + index_t index, node_offset_t size, const char* p_left_bound); + + static void update_size_at( + NodeExtentMutable&, const node_extent_t&, index_t index, int change); + + static node_offset_t trim_until( + NodeExtentMutable&, const node_extent_t&, index_t index); + static node_offset_t trim_at(NodeExtentMutable&, const node_extent_t&, + index_t index, node_offset_t trimmed); + + template <KeyT KT> + class Appender; + + private: + const FieldType& fields() const { return *p_fields; } + const FieldType* p_fields; +}; + +template <typename FieldType, node_type_t NODE_TYPE> +template <KeyT KT> +class node_extent_t<FieldType, NODE_TYPE>::Appender { + public: + Appender(NodeExtentMutable* p_mut, char* p_append) + : p_mut{p_mut}, p_start{p_append} { +#ifndef NDEBUG + auto p_fields = reinterpret_cast<const FieldType*>(p_append); + assert(*(p_fields->header.get_field_type()) == FIELD_TYPE); + assert(p_fields->header.get_node_type() == NODE_TYPE); + assert(p_fields->num_keys == 0); +#endif + p_append_left = p_start + FieldType::HEADER_SIZE; + p_append_right = p_start + FieldType::SIZE; + } + void append(const node_extent_t& src, index_t from, index_t items); + void append(const full_key_t<KT>&, const value_t&, const value_t*&); + char* wrap(); + std::tuple<NodeExtentMutable*, char*> open_nxt(const key_get_type&); + std::tuple<NodeExtentMutable*, char*> open_nxt(const full_key_t<KT>&); + void wrap_nxt(char* p_append) { + if constexpr (FIELD_TYPE != field_type_t::N3) { + assert(p_append < p_append_right); + assert(p_append_left < p_append); + p_append_right = p_append; + FieldType::append_offset(*p_mut, p_append - p_start, 
p_append_left); + ++num_keys; + } else { + ceph_abort("not implemented"); + } + } + + private: + const node_extent_t* p_src = nullptr; + NodeExtentMutable* p_mut; + char* p_start; + char* p_append_left; + char* p_append_right; + num_keys_t num_keys = 0; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc new file mode 100644 index 000000000..81bfac72a --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc @@ -0,0 +1,96 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "node_stage_layout.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h" + +namespace crimson::os::seastore::onode { + +void node_header_t::bootstrap_extent( + NodeExtentMutable& mut, + field_type_t field_type, node_type_t node_type, + bool is_level_tail, level_t level) { + node_header_t header; + header.set_field_type(field_type); + header.set_node_type(node_type); + header.set_is_level_tail(is_level_tail); + header.level = level; + mut.copy_in_relative(0, header); +} + +void node_header_t::update_is_level_tail( + NodeExtentMutable& mut, const node_header_t& header, bool value) { + auto& _header = const_cast<node_header_t&>(header); + _header.set_is_level_tail(value); + mut.validate_inplace_update(_header); +} + +#define F013_T _node_fields_013_t<SlotType> +#define F013_INST(ST) _node_fields_013_t<ST> + +template <typename SlotType> +void F013_T::update_size_at( + NodeExtentMutable& mut, const me_t& node, index_t index, int change) { + assert(index <= node.num_keys); + for (const auto* p_slot = &node.slots[index]; + p_slot < &node.slots[node.num_keys]; + ++p_slot) { + node_offset_t offset = p_slot->right_offset; + mut.copy_in_absolute( + (void*)&(p_slot->right_offset), + node_offset_t(offset - change)); + } +} + +template <typename SlotType> +void F013_T::append_key( + NodeExtentMutable& mut, const key_t& key, char*& p_append) { + mut.copy_in_absolute(p_append, key); + p_append += sizeof(key_t); +} + +template <typename SlotType> +void F013_T::append_offset( + NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append) { + mut.copy_in_absolute(p_append, offset_to_right); + p_append += sizeof(node_offset_t); +} + +template <typename SlotType> +template <KeyT KT> +void F013_T::insert_at( + NodeExtentMutable& mut, const full_key_t<KT>& key, + const me_t& node, index_t index, node_offset_t size_right) { + assert(index <= node.num_keys); + update_size_at(mut, node, index, size_right); + auto p_insert = const_cast<char*>(fields_start(node)) + + node.get_key_start_offset(index); + auto p_shift_end = fields_start(node) + node.get_key_start_offset(node.num_keys); + mut.shift_absolute(p_insert, p_shift_end - p_insert, estimate_insert_one()); + mut.copy_in_absolute((void*)&node.num_keys, num_keys_t(node.num_keys + 1)); + append_key(mut, key_t::template from_key<KT>(key), p_insert); + append_offset(mut, node.get_item_end_offset(index) - size_right, p_insert); +} +#define IA_TEMPLATE(ST, KT) template void F013_INST(ST):: \ + insert_at<KT>(NodeExtentMutable&, const full_key_t<KT>&, \ + const F013_INST(ST)&, index_t, node_offset_t) +IA_TEMPLATE(slot_0_t, KeyT::VIEW); +IA_TEMPLATE(slot_1_t, KeyT::VIEW); +IA_TEMPLATE(slot_3_t, KeyT::VIEW); +IA_TEMPLATE(slot_0_t, KeyT::HOBJ); +IA_TEMPLATE(slot_1_t, KeyT::HOBJ); +IA_TEMPLATE(slot_3_t, KeyT::HOBJ); + +#define 
F013_TEMPLATE(ST) template struct F013_INST(ST) +F013_TEMPLATE(slot_0_t); +F013_TEMPLATE(slot_1_t); +F013_TEMPLATE(slot_3_t); + +void node_fields_2_t::append_offset( + NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append) { + mut.copy_in_absolute(p_append, offset_to_right); + p_append += sizeof(node_offset_t); +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h new file mode 100644 index 000000000..14ba95bf4 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h @@ -0,0 +1,366 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "key_layout.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h" + +namespace crimson::os::seastore::onode { + +class NodeExtentMutable; + +struct node_header_t { + static constexpr unsigned FIELD_TYPE_BITS = 6u; + static_assert(static_cast<uint8_t>(field_type_t::_MAX) <= 1u << FIELD_TYPE_BITS); + static constexpr unsigned NODE_TYPE_BITS = 1u; + static constexpr unsigned B_LEVEL_TAIL_BITS = 1u; + using bits_t = uint8_t; + + node_header_t() {} + std::optional<field_type_t> get_field_type() const { + if (field_type >= FIELD_TYPE_MAGIC && + field_type < static_cast<uint8_t>(field_type_t::_MAX)) { + return static_cast<field_type_t>(field_type); + } else { + return std::nullopt; + } + } + node_type_t get_node_type() const { + return static_cast<node_type_t>(node_type); + } + bool get_is_level_tail() const { + return is_level_tail; + } + + static void bootstrap_extent( + NodeExtentMutable&, field_type_t, node_type_t, bool, level_t); + + static void update_is_level_tail(NodeExtentMutable&, const node_header_t&, bool); + + bits_t field_type : FIELD_TYPE_BITS; + bits_t node_type : NODE_TYPE_BITS; + bits_t is_level_tail : B_LEVEL_TAIL_BITS; + static_assert(sizeof(bits_t) * 8 == + FIELD_TYPE_BITS + NODE_TYPE_BITS + B_LEVEL_TAIL_BITS); + level_t level; + + private: + void set_field_type(field_type_t type) { + field_type = static_cast<uint8_t>(type); + } + void set_node_type(node_type_t type) { + node_type = static_cast<uint8_t>(type); + } + void set_is_level_tail(bool value) { + is_level_tail = static_cast<uint8_t>(value); + } +} __attribute__((packed)); + +template <typename FixedKeyType, field_type_t _FIELD_TYPE> +struct _slot_t { + using key_t = FixedKeyType; + static constexpr field_type_t FIELD_TYPE = _FIELD_TYPE; + static constexpr node_offset_t OVERHEAD = sizeof(_slot_t) - sizeof(key_t); + + key_t key; + node_offset_t right_offset; +} __attribute__((packed)); +using slot_0_t = _slot_t<shard_pool_crush_t, field_type_t::N0>; +using slot_1_t = _slot_t<crush_t, field_type_t::N1>; +using slot_3_t = _slot_t<snap_gen_t, field_type_t::N3>; + +struct node_range_t { + node_offset_t start; + node_offset_t end; +}; + +template <typename FieldType> +const char* fields_start(const FieldType& node) { + return reinterpret_cast<const char*>(&node); +} + +template <node_type_t NODE_TYPE, typename FieldType> +node_range_t fields_free_range_before( + const FieldType& node, index_t index) { + assert(index <= node.num_keys); + node_offset_t offset_start = node.get_key_start_offset(index); + node_offset_t offset_end = + (index == 0 ? 
FieldType::SIZE + : node.get_item_start_offset(index - 1)); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + if (node.is_level_tail() && index == node.num_keys) { + offset_end -= sizeof(laddr_t); + } + } + assert(offset_start <= offset_end); + assert(offset_end - offset_start < FieldType::SIZE); + return {offset_start, offset_end}; +} + +/** + * _node_fields_013_t (node_fields_0_t, node_fields_1_t, leaf_fields_3_t + * + * The STAGE_LEFT layout implementation for node N0/N1, or the STAGE_RIGHT + * layout implementation for leaf node N3. + * + * The node layout storing n slots: + * + * # <----------------------------- node range --------------------------------------> # + * # #<~># free space # + * # <----- left part -----------------------------> # <~# <----- right slots -------> # + * # # <---- left slots -------------> #~> # # + * # # slots [2, n) |<~># #<~>| right slots [2, n) # + * # # <- slot 0 -> | <- slot 1 -> | # # | <-- s1 --> | <-- s0 --> # + * # # | | # # | | # + * # | num_ # | right | | right | # # | next-stage | next-stage # + * # header | keys # key | offset | key | offset | # # | container | container # + * # | # 0 | 0 | 1 | 1 |...#...#...| or onode 1 | or onode 0 # + * | | ^ ^ + * | | | | + * | +----------------+ | + * +--------------------------------------------+ + */ +template <typename SlotType> +struct _node_fields_013_t { + // TODO: decide by NODE_BLOCK_SIZE, sizeof(SlotType), sizeof(laddr_t) + // and the minimal size of variable_key. + using num_keys_t = uint8_t; + using key_t = typename SlotType::key_t; + using key_get_type = const key_t&; + using me_t = _node_fields_013_t<SlotType>; + static constexpr field_type_t FIELD_TYPE = SlotType::FIELD_TYPE; + static constexpr node_offset_t SIZE = NODE_BLOCK_SIZE; + static constexpr node_offset_t HEADER_SIZE = + sizeof(node_header_t) + sizeof(num_keys_t); + static constexpr node_offset_t ITEM_OVERHEAD = SlotType::OVERHEAD; + + bool is_level_tail() const { return header.get_is_level_tail(); } + node_offset_t total_size() const { return SIZE; } + key_get_type get_key(index_t index) const { + assert(index < num_keys); + return slots[index].key; + } + node_offset_t get_key_start_offset(index_t index) const { + assert(index <= num_keys); + auto offset = HEADER_SIZE + sizeof(SlotType) * index; + assert(offset < SIZE); + return offset; + } + node_offset_t get_item_start_offset(index_t index) const { + assert(index < num_keys); + auto offset = slots[index].right_offset; + assert(offset <= SIZE); + return offset; + } + const void* p_offset(index_t index) const { + assert(index < num_keys); + return &slots[index].right_offset; + } + node_offset_t get_item_end_offset(index_t index) const { + return index == 0 ? 
SIZE : get_item_start_offset(index - 1); + } + template <node_type_t NODE_TYPE> + node_offset_t free_size_before(index_t index) const { + auto range = fields_free_range_before<NODE_TYPE>(*this, index); + return range.end - range.start; + } + + static node_offset_t estimate_insert_one() { return sizeof(SlotType); } + template <KeyT KT> + static void insert_at( + NodeExtentMutable&, const full_key_t<KT>& key, + const me_t& node, index_t index, node_offset_t size_right); + static void update_size_at( + NodeExtentMutable&, const me_t& node, index_t index, int change); + static void append_key( + NodeExtentMutable&, const key_t& key, char*& p_append); + template <KeyT KT> + static void append_key( + NodeExtentMutable& mut, const full_key_t<KT>& key, char*& p_append) { + append_key(mut, key_t::template from_key<KT>(key), p_append); + } + static void append_offset( + NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append); + + node_header_t header; + num_keys_t num_keys = 0u; + SlotType slots[]; +} __attribute__((packed)); +using node_fields_0_t = _node_fields_013_t<slot_0_t>; +using node_fields_1_t = _node_fields_013_t<slot_1_t>; + +/** + * node_fields_2_t + * + * The STAGE_STRING layout implementation for node N2. + * + * The node layout storing n slots: + * + * # <--------------------------------- node range ----------------------------------------> # + * # #<~># free space # + * # <------- left part ---------------> # <~# <--------- right slots ---------------------> # + * # # <---- offsets ----> #~> #<~>| slots [2, n) # + * # # offsets [2, n) |<~># # | <----- slot 1 ----> | <----- slot 0 ----> # + * # # | # # | | # + * # | num_ # offset | offset | # # | next-stage | ns-oid | next-stage | ns-oid # + * # header | keys # 0 | 1 |...#...#...| container1 | 1 | container0 | 0 # + * | | ^ ^ + * | | | | + * | +----------------+ | + * +-----------------------------------------------+ + */ +struct node_fields_2_t { + // TODO: decide by NODE_BLOCK_SIZE, sizeof(node_off_t), sizeof(laddr_t) + // and the minimal size of variable_key. + using num_keys_t = uint8_t; + using key_t = ns_oid_view_t; + using key_get_type = key_t; + static constexpr field_type_t FIELD_TYPE = field_type_t::N2; + static constexpr node_offset_t SIZE = NODE_BLOCK_SIZE; + static constexpr node_offset_t HEADER_SIZE = + sizeof(node_header_t) + sizeof(num_keys_t); + static constexpr node_offset_t ITEM_OVERHEAD = sizeof(node_offset_t); + + bool is_level_tail() const { return header.get_is_level_tail(); } + node_offset_t total_size() const { return SIZE; } + key_get_type get_key(index_t index) const { + assert(index < num_keys); + node_offset_t item_end_offset = + (index == 0 ? SIZE : offsets[index - 1]); + assert(item_end_offset <= SIZE); + const char* p_start = fields_start(*this); + return key_t(p_start + item_end_offset); + } + node_offset_t get_key_start_offset(index_t index) const { + assert(index <= num_keys); + auto offset = HEADER_SIZE + sizeof(node_offset_t) * num_keys; + assert(offset <= SIZE); + return offset; + } + node_offset_t get_item_start_offset(index_t index) const { + assert(index < num_keys); + auto offset = offsets[index]; + assert(offset <= SIZE); + return offset; + } + const void* p_offset(index_t index) const { + assert(index < num_keys); + return &offsets[index]; + } + node_offset_t get_item_end_offset(index_t index) const { + return index == 0 ? 
SIZE : get_item_start_offset(index - 1); + } + template <node_type_t NODE_TYPE> + node_offset_t free_size_before(index_t index) const { + auto range = fields_free_range_before<NODE_TYPE>(*this, index); + return range.end - range.start; + } + + static node_offset_t estimate_insert_one() { return sizeof(node_offset_t); } + template <KeyT KT> + static void insert_at( + NodeExtentMutable& mut, const full_key_t<KT>& key, + const node_fields_2_t& node, index_t index, node_offset_t size_right) { + ceph_abort("not implemented"); + } + static void update_size_at( + NodeExtentMutable& mut, const node_fields_2_t& node, index_t index, int change) { + ceph_abort("not implemented"); + } + static void append_key( + NodeExtentMutable& mut, const key_t& key, char*& p_append) { + ns_oid_view_t::append(mut, key, p_append); + } + template <KeyT KT> + static void append_key( + NodeExtentMutable& mut, const full_key_t<KT>& key, char*& p_append) { + ns_oid_view_t::append<KT>(mut, key, p_append); + } + static void append_offset( + NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append); + + node_header_t header; + num_keys_t num_keys = 0u; + node_offset_t offsets[]; +} __attribute__((packed)); + +/** + * internal_fields_3_t + * + * The STAGE_RIGHT layout implementation for N2. + * + * The node layout storing 3 children: + * + * # <---------------- node range ---------------------------> # + * # # <-- keys ---> # <---- laddrs -----------> # + * # free space: # |<~># |<~># + * # # | # | # + * # | num_ # key | key | # laddr | laddr | laddr | # + * # header | keys # 0 | 1 |...# 0 | 1 | 2 |...# + */ +// TODO: decide by NODE_BLOCK_SIZE, sizeof(snap_gen_t), sizeof(laddr_t) +static constexpr unsigned MAX_NUM_KEYS_I3 = 170u; +template <unsigned MAX_NUM_KEYS> +struct _internal_fields_3_t { + using key_get_type = const snap_gen_t&; + using me_t = _internal_fields_3_t<MAX_NUM_KEYS>; + // TODO: decide by NODE_BLOCK_SIZE, sizeof(snap_gen_t), sizeof(laddr_t) + using num_keys_t = uint8_t; + static constexpr field_type_t FIELD_TYPE = field_type_t::N3; + static constexpr node_offset_t SIZE = sizeof(me_t); + static constexpr node_offset_t HEADER_SIZE = + sizeof(node_header_t) + sizeof(num_keys_t); + static constexpr node_offset_t ITEM_OVERHEAD = 0u; + + bool is_level_tail() const { return header.get_is_level_tail(); } + node_offset_t total_size() const { + if (is_level_tail()) { + return SIZE - sizeof(snap_gen_t); + } else { + return SIZE; + } + } + key_get_type get_key(index_t index) const { + assert(index < num_keys); + return keys[index]; + } + template <node_type_t NODE_TYPE> + std::enable_if_t<NODE_TYPE == node_type_t::INTERNAL, node_offset_t> + free_size_before(index_t index) const { + assert(index <= num_keys); + assert(num_keys <= (is_level_tail() ? 
MAX_NUM_KEYS - 1 : MAX_NUM_KEYS)); + auto free = (MAX_NUM_KEYS - index) * (sizeof(snap_gen_t) + sizeof(laddr_t)); + if (is_level_tail() && index == num_keys) { + free -= (sizeof(snap_gen_t) + sizeof(laddr_t)); + } + assert(free < SIZE); + return free; + } + + static node_offset_t estimate_insert_one() { + return sizeof(snap_gen_t) + sizeof(laddr_t); + } + template <KeyT KT> + static void insert_at( + NodeExtentMutable& mut, const full_key_t<KT>& key, + const me_t& node, index_t index, node_offset_t size_right) { + ceph_abort("not implemented"); + } + static void update_size_at( + NodeExtentMutable& mut, const me_t& node, index_t index, int change) { + ceph_abort("not implemented"); + } + + node_header_t header; + num_keys_t num_keys = 0u; + snap_gen_t keys[MAX_NUM_KEYS]; + laddr_packed_t child_addrs[MAX_NUM_KEYS]; +} __attribute__((packed)); +static_assert(_internal_fields_3_t<MAX_NUM_KEYS_I3>::SIZE <= NODE_BLOCK_SIZE && + _internal_fields_3_t<MAX_NUM_KEYS_I3 + 1>::SIZE > NODE_BLOCK_SIZE); +using internal_fields_3_t = _internal_fields_3_t<MAX_NUM_KEYS_I3>; + +using leaf_fields_3_t = _node_fields_013_t<slot_3_t>; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h new file mode 100644 index 000000000..cac167a98 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h @@ -0,0 +1,2186 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <cassert> +#include <optional> +#include <ostream> +#include <sstream> +#include <type_traits> + +#include "common/likely.h" + +#include "sub_items_stage.h" +#include "item_iterator_stage.h" + +namespace crimson::os::seastore::onode { + +struct search_result_bs_t { + index_t index; + MatchKindBS match; +}; +template <typename FGetKey> +search_result_bs_t binary_search( + const full_key_t<KeyT::HOBJ>& key, + index_t begin, index_t end, FGetKey&& f_get_key) { + assert(begin <= end); + while (begin < end) { + auto total = begin + end; + auto mid = total >> 1; + // do not copy if return value is reference + decltype(f_get_key(mid)) target = f_get_key(mid); + auto match = compare_to<KeyT::HOBJ>(key, target); + if (match == MatchKindCMP::LT) { + end = mid; + } else if (match == MatchKindCMP::GT) { + begin = mid + 1; + } else { + return {mid, MatchKindBS::EQ}; + } + } + return {begin , MatchKindBS::NE}; +} + +template <typename PivotType, typename FGet> +search_result_bs_t binary_search_r( + index_t rend, index_t rbegin, FGet&& f_get, const PivotType& key) { + assert(rend <= rbegin); + while (rend < rbegin) { + auto total = rend + rbegin + 1; + auto mid = total >> 1; + // do not copy if return value is reference + decltype(f_get(mid)) target = f_get(mid); + int match = target - key; + if (match < 0) { + rend = mid; + } else if (match > 0) { + rbegin = mid - 1; + } else { + return {mid, MatchKindBS::EQ}; + } + } + return {rbegin, MatchKindBS::NE}; +} + +inline bool matchable(field_type_t type, match_stat_t mstat) { + assert(mstat >= MSTAT_MIN && mstat <= MSTAT_MAX); + /* + * compressed prefix by field type: + * N0: NONE + * N1: pool/shard + * N2: pool/shard crush + * N3: pool/shard crush ns/oid + * + * if key matches the node's compressed prefix, return true + * else, return false + */ +#ifndef NDEBUG + if (mstat == MSTAT_END) { + assert(type == field_type_t::N0); + } +#endif + return mstat + to_unsigned(type) < 4; +} + +inline void assert_mstat( + const 
full_key_t<KeyT::HOBJ>& key, + const full_key_t<KeyT::VIEW>& index, + match_stat_t mstat) { + assert(mstat >= MSTAT_MIN && mstat <= MSTAT_LT2); + // key < index ... + switch (mstat) { + case MSTAT_EQ: + break; + case MSTAT_LT0: + assert(compare_to<KeyT::HOBJ>(key, index.snap_gen_packed()) == MatchKindCMP::LT); + break; + case MSTAT_LT1: + assert(compare_to<KeyT::HOBJ>(key, index.ns_oid_view()) == MatchKindCMP::LT); + break; + case MSTAT_LT2: + if (index.has_shard_pool()) { + assert(compare_to<KeyT::HOBJ>(key, shard_pool_crush_t{ + index.shard_pool_packed(), index.crush_packed()}) == MatchKindCMP::LT); + } else { + assert(compare_to<KeyT::HOBJ>(key, index.crush_packed()) == MatchKindCMP::LT); + } + break; + default: + ceph_abort("impossible path"); + } + // key == index ... + switch (mstat) { + case MSTAT_EQ: + assert(compare_to<KeyT::HOBJ>(key, index.snap_gen_packed()) == MatchKindCMP::EQ); + case MSTAT_LT0: + if (!index.has_ns_oid()) + break; + assert(index.ns_oid_view().type() == ns_oid_view_t::Type::MAX || + compare_to<KeyT::HOBJ>(key, index.ns_oid_view()) == MatchKindCMP::EQ); + case MSTAT_LT1: + if (!index.has_crush()) + break; + assert(compare_to<KeyT::HOBJ>(key, index.crush_packed()) == MatchKindCMP::EQ); + if (!index.has_shard_pool()) + break; + assert(compare_to<KeyT::HOBJ>(key, index.shard_pool_packed()) == MatchKindCMP::EQ); + default: + break; + } +} + +#define NXT_STAGE_T staged<next_param_t> + +enum class TrimType { BEFORE, AFTER, AT }; + +/** + * staged + * + * Implements recursive logic that modifies or reads the node layout + * (N0/N1/N2/N3 * LEAF/INTERNAL) with the multi-stage design. The specific + * stage implementation is flexible. So the implementations for different + * stages can be assembled independently, as long as they follow the + * definitions of container interfaces. + * + * Multi-stage is designed to index different portions of onode keys + * stage-by-stage. There are at most 3 stages for a node: + * - STAGE_LEFT: index shard-pool-crush for N0, or index crush for N1 node; + * - STAGE_STRING: index ns-oid for N0/N1/N2 nodes; + * - STAGE_RIGHT: index snap-gen for N0/N1/N2/N3 nodes; + * + * The intention is to consolidate the high-level indexing implementations at + * the level of stage, so we don't need to write them repeatedly for every + * stage and for every node type. + */ +template <typename Params> +struct staged { + static_assert(Params::STAGE >= STAGE_BOTTOM); + static_assert(Params::STAGE <= STAGE_TOP); + using container_t = typename Params::container_t; + using key_get_type = typename container_t::key_get_type; + using next_param_t = typename Params::next_param_t; + using position_t = staged_position_t<Params::STAGE>; + using result_t = staged_result_t<Params::NODE_TYPE, Params::STAGE>; + using value_t = value_type_t<Params::NODE_TYPE>; + static constexpr auto CONTAINER_TYPE = container_t::CONTAINER_TYPE; + static constexpr bool IS_BOTTOM = (Params::STAGE == STAGE_BOTTOM); + static constexpr auto NODE_TYPE = Params::NODE_TYPE; + static constexpr auto STAGE = Params::STAGE; + + template <bool is_exclusive> + static void _left_or_right(index_t& split_index, index_t insert_index, + std::optional<bool>& is_insert_left) { + assert(!is_insert_left.has_value()); + assert(is_valid_index(split_index)); + if constexpr (is_exclusive) { + if (split_index <= insert_index) { + // ...[s_index-1] |!| (i_index) [s_index]... + // offset i_position to right + is_insert_left = false; + } else { + // ...[s_index-1] (i_index)) |?[s_index]| ... 
+ // ...(i_index)...[s_index-1] |?[s_index]| ... + is_insert_left = true; + --split_index; + } + } else { + if (split_index < insert_index) { + // ...[s_index-1] |?[s_index]| ...[(i_index)[s_index_k]... + is_insert_left = false; + } else if (split_index > insert_index) { + // ...[(i_index)s_index-1] |?[s_index]| ... + // ...[(i_index)s_index_k]...[s_index-1] |?[s_index]| ... + is_insert_left = true; + } else { + // ...[s_index-1] |?[(i_index)s_index]| ... + // i_to_left = std::nullopt; + } + } + } + + template <ContainerType CTYPE, typename Enable = void> class _iterator_t; + template <ContainerType CTYPE> + class _iterator_t<CTYPE, std::enable_if_t<CTYPE == ContainerType::INDEXABLE>> { + /* + * indexable container type system: + * CONTAINER_TYPE = ContainerType::INDEXABLE + * keys() const -> index_t + * operator[](index_t) const -> key_get_type + * size_before(index_t) const -> node_offset_t + * size_overhead_at(index_t) const -> node_offset_t + * (IS_BOTTOM) get_p_value(index_t) const -> const value_t* + * (!IS_BOTTOM) size_to_nxt_at(index_t) const -> node_offset_t + * (!IS_BOTTOM) get_nxt_container(index_t) const + * encode(p_node_start, encoded) + * decode(p_node_start, delta) -> container_t + * static: + * header_size() -> node_offset_t + * estimate_insert(key, value) -> node_offset_t + * (IS_BOTTOM) insert_at(mut, src, key, value, + * index, size, p_left_bound) -> const value_t* + * (!IS_BOTTOM) insert_prefix_at(mut, src, key, + * index, size, p_left_bound) -> memory_range_t + * (!IS_BOTTOM) update_size_at(mut, src, index, size) + * trim_until(mut, container, index) -> trim_size + * (!IS_BOTTOM) trim_at(mut, container, index, trimmed) -> trim_size + * + * Appender::append(const container_t& src, from, items) + */ + public: + using me_t = _iterator_t<CTYPE>; + + _iterator_t(const container_t& container) : container{container} { + assert(container.keys()); + } + + index_t index() const { + return _index; + } + key_get_type get_key() const { + assert(!is_end()); + return container[_index]; + } + node_offset_t size_to_nxt() const { + assert(!is_end()); + return container.size_to_nxt_at(_index); + } + template <typename T = typename NXT_STAGE_T::container_t> + std::enable_if_t<!IS_BOTTOM, T> get_nxt_container() const { + assert(!is_end()); + return container.get_nxt_container(_index); + } + template <typename T = value_t> + std::enable_if_t<IS_BOTTOM, const T*> get_p_value() const { + assert(!is_end()); + return container.get_p_value(_index); + } + bool is_last() const { + return _index + 1 == container.keys(); + } + bool is_end() const { return _index == container.keys(); } + node_offset_t size() const { + assert(!is_end()); + assert(header_size() == container.size_before(0)); + assert(container.size_before(_index + 1) > container.size_before(_index)); + return container.size_before(_index + 1) - + container.size_before(_index); + } + node_offset_t size_overhead() const { + assert(!is_end()); + return container.size_overhead_at(_index); + } + + me_t& operator++() { + assert(!is_end()); + assert(!is_last()); + ++_index; + return *this; + } + void seek_at(index_t index) { + assert(index < container.keys()); + seek_till_end(index); + } + void seek_till_end(index_t index) { + assert(!is_end()); + assert(this->index() == 0); + assert(index <= container.keys()); + _index = index; + } + void seek_last() { + assert(!is_end()); + assert(index() == 0); + _index = container.keys() - 1; + } + void set_end() { + assert(!is_end()); + assert(is_last()); + ++_index; + } + // Note: possible to return an 
end iterator + MatchKindBS seek(const full_key_t<KeyT::HOBJ>& key, bool exclude_last) { + assert(!is_end()); + assert(index() == 0); + index_t end_index = container.keys(); + if (exclude_last) { + assert(end_index); + --end_index; + assert(compare_to<KeyT::HOBJ>(key, container[end_index]) == MatchKindCMP::LT); + } + auto ret = binary_search(key, _index, end_index, + [this] (index_t index) { return container[index]; }); + _index = ret.index; + return ret.match; + } + + template <KeyT KT, typename T = value_t> + std::enable_if_t<IS_BOTTOM, const T*> insert( + NodeExtentMutable& mut, const full_key_t<KT>& key, + const value_t& value, node_offset_t insert_size, const char* p_left_bound) { + return container_t::template insert_at<KT>( + mut, container, key, value, _index, insert_size, p_left_bound); + } + + template <KeyT KT, typename T = memory_range_t> + std::enable_if_t<!IS_BOTTOM, T> insert_prefix( + NodeExtentMutable& mut, const full_key_t<KT>& key, + node_offset_t size, const char* p_left_bound) { + return container_t::template insert_prefix_at<KT>( + mut, container, key, _index, size, p_left_bound); + } + + template <typename T = void> + std::enable_if_t<!IS_BOTTOM, T> + update_size(NodeExtentMutable& mut, node_offset_t insert_size) { + assert(!is_end()); + container_t::update_size_at(mut, container, _index, insert_size); + } + + // Note: possible to return an end iterator when is_exclusive is true + template <bool is_exclusive> + size_t seek_split_inserted( + size_t start_size, size_t extra_size, size_t target_size, + index_t& insert_index, size_t insert_size, + std::optional<bool>& is_insert_left) { + assert(!is_end()); + assert(index() == 0); + // replace insert_index placeholder + if constexpr (!is_exclusive) { + if (insert_index == INDEX_LAST) { + insert_index = container.keys() - 1; + } + } else { + if (insert_index == INDEX_END) { + insert_index = container.keys(); + } + } + assert(insert_index <= container.keys()); + + auto start_size_1 = start_size + extra_size; + auto f_get_used_size = [this, start_size, start_size_1, + insert_index, insert_size] (index_t index) { + size_t current_size; + if (unlikely(index == 0)) { + current_size = start_size; + } else { + current_size = start_size_1; + if (index > insert_index) { + current_size += insert_size; + if constexpr (is_exclusive) { + --index; + } + } + // already includes header size + current_size += container.size_before(index); + } + return current_size; + }; + index_t s_end; + if constexpr (is_exclusive) { + s_end = container.keys(); + } else { + s_end = container.keys() - 1; + } + _index = binary_search_r(0, s_end, f_get_used_size, target_size).index; + size_t current_size = f_get_used_size(_index); + assert(current_size <= target_size); + + _left_or_right<is_exclusive>(_index, insert_index, is_insert_left); + return current_size; + } + + size_t seek_split(size_t start_size, size_t extra_size, size_t target_size) { + assert(!is_end()); + assert(index() == 0); + auto start_size_1 = start_size + extra_size; + auto f_get_used_size = [this, start_size, start_size_1] (index_t index) { + size_t current_size; + if (unlikely(index == 0)) { + current_size = start_size; + } else { + // already includes header size + current_size = start_size_1 + container.size_before(index); + } + return current_size; + }; + _index = binary_search_r( + 0, container.keys() - 1, f_get_used_size, target_size).index; + size_t current_size = f_get_used_size(_index); + assert(current_size <= target_size); + return current_size; + } + + // Note: possible to 
return an end iterater if to_index == INDEX_END + template <KeyT KT> + void copy_out_until( + typename container_t::template Appender<KT>& appender, index_t& to_index) { + auto num_keys = container.keys(); + index_t items; + if (to_index == INDEX_END) { + items = num_keys - _index; + appender.append(container, _index, items); + _index = num_keys; + to_index = _index; + } else if (to_index == INDEX_LAST) { + assert(!is_end()); + items = num_keys - 1 - _index; + appender.append(container, _index, items); + _index = num_keys - 1; + to_index = _index; + } else { + assert(_index <= to_index); + assert(to_index <= num_keys); + items = to_index - _index; + appender.append(container, _index, items); + _index = to_index; + } + } + + node_offset_t trim_until(NodeExtentMutable& mut) { + return container_t::trim_until(mut, container, _index); + } + + template <typename T = node_offset_t> + std::enable_if_t<!IS_BOTTOM, T> + trim_at(NodeExtentMutable& mut, node_offset_t trimmed) { + return container_t::trim_at(mut, container, _index, trimmed); + } + + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + container.encode(p_node_start, encoded); + ceph::encode(_index, encoded); + } + + static me_t decode(const char* p_node_start, + ceph::bufferlist::const_iterator& delta) { + auto container = container_t::decode(p_node_start, delta); + auto ret = me_t(container); + index_t index; + ceph::decode(index, delta); + ret.seek_till_end(index); + return ret; + } + + static node_offset_t header_size() { + return container_t::header_size(); + } + + template <KeyT KT> + static node_offset_t estimate_insert( + const full_key_t<KT>& key, const value_t& value) { + return container_t::template estimate_insert<KT>(key, value); + } + + private: + container_t container; + index_t _index = 0; + }; + + template <ContainerType CTYPE> + class _iterator_t<CTYPE, std::enable_if_t<CTYPE == ContainerType::ITERATIVE>> { + /* + * iterative container type system (!IS_BOTTOM): + * CONTAINER_TYPE = ContainerType::ITERATIVE + * index() const -> index_t + * get_key() const -> key_get_type + * size() const -> node_offset_t + * size_to_nxt() const -> node_offset_t + * size_overhead() const -> node_offset_t + * get_nxt_container() const + * has_next() const -> bool + * encode(p_node_start, encoded) + * decode(p_node_start, delta) -> container_t + * operator++() + * static: + * header_size() -> node_offset_t + * estimate_insert(key, value) -> node_offset_t + * insert_prefix(mut, src, key, is_end, size, p_left_bound) -> memory_range_t + * update_size(mut, src, size) + * trim_until(mut, container) -> trim_size + * trim_at(mut, container, trimmed) -> trim_size + */ + // currently the iterative iterator is only implemented with STAGE_STRING + // for in-node space efficiency + static_assert(STAGE == STAGE_STRING); + public: + using me_t = _iterator_t<CTYPE>; + + _iterator_t(const container_t& container) : container{container} {} + + index_t index() const { + if (is_end()) { + return container.index() + 1; + } else { + return container.index(); + } + } + key_get_type get_key() const { + assert(!is_end()); + return container.get_key(); + } + node_offset_t size_to_nxt() const { + assert(!is_end()); + return container.size_to_nxt(); + } + const typename NXT_STAGE_T::container_t get_nxt_container() const { + assert(!is_end()); + return container.get_nxt_container(); + } + bool is_last() const { + assert(!is_end()); + return !container.has_next(); + } + bool is_end() const { +#ifndef NDEBUG + if (_is_end) { + 
assert(!container.has_next()); + } +#endif + return _is_end; + } + node_offset_t size() const { + assert(!is_end()); + return container.size(); + } + node_offset_t size_overhead() const { + assert(!is_end()); + return container.size_overhead(); + } + + me_t& operator++() { + assert(!is_end()); + assert(!is_last()); + ++container; + return *this; + } + void seek_at(index_t index) { + assert(!is_end()); + assert(this->index() == 0); + while (index > 0) { + assert(container.has_next()); + ++container; + --index; + } + } + void seek_till_end(index_t index) { + assert(!is_end()); + assert(this->index() == 0); + while (index > 0) { + if (!container.has_next()) { + assert(index == 1); + set_end(); + break; + } + ++container; + --index; + } + } + void seek_last() { + assert(!is_end()); + assert(index() == 0); + while (container.has_next()) { + ++container; + } + } + void set_end() { + assert(!is_end()); + assert(is_last()); + _is_end = true; + } + // Note: possible to return an end iterator + MatchKindBS seek(const full_key_t<KeyT::HOBJ>& key, bool exclude_last) { + assert(!is_end()); + assert(index() == 0); + do { + if (exclude_last && is_last()) { + assert(compare_to<KeyT::HOBJ>(key, get_key()) == MatchKindCMP::LT); + return MatchKindBS::NE; + } + auto match = compare_to<KeyT::HOBJ>(key, get_key()); + if (match == MatchKindCMP::LT) { + return MatchKindBS::NE; + } else if (match == MatchKindCMP::EQ) { + return MatchKindBS::EQ; + } else { + if (container.has_next()) { + ++container; + } else { + // end + break; + } + } + } while (true); + assert(!exclude_last); + set_end(); + return MatchKindBS::NE; + } + + template <KeyT KT> + memory_range_t insert_prefix( + NodeExtentMutable& mut, const full_key_t<KT>& key, + node_offset_t size, const char* p_left_bound) { + return container_t::template insert_prefix<KT>( + mut, container, key, is_end(), size, p_left_bound); + } + + void update_size(NodeExtentMutable& mut, node_offset_t insert_size) { + assert(!is_end()); + container_t::update_size(mut, container, insert_size); + } + + // Note: possible to return an end iterator when is_exclusive is true + // insert_index can still be INDEX_LAST or INDEX_END + template <bool is_exclusive> + size_t seek_split_inserted( + size_t start_size, size_t extra_size, size_t target_size, + index_t& insert_index, size_t insert_size, + std::optional<bool>& is_insert_left) { + assert(!is_end()); + assert(index() == 0); + size_t current_size = start_size; + index_t split_index = 0; + extra_size += header_size(); + do { + if constexpr (!is_exclusive) { + if (is_last()) { + assert(split_index == index()); + if (insert_index == INDEX_LAST) { + insert_index = index(); + } + assert(insert_index <= index()); + break; + } + } + + size_t nxt_size = current_size; + if (split_index == 0) { + nxt_size += extra_size; + } + if (split_index == insert_index) { + nxt_size += insert_size; + if constexpr (is_exclusive) { + if (nxt_size > target_size) { + break; + } + current_size = nxt_size; + ++split_index; + } + } + nxt_size += size(); + if (nxt_size > target_size) { + break; + } + current_size = nxt_size; + + if constexpr (is_exclusive) { + if (is_last()) { + assert(split_index == index()); + set_end(); + split_index = index(); + if (insert_index == INDEX_END) { + insert_index = index(); + } + assert(insert_index == index()); + break; + } else { + ++(*this); + ++split_index; + } + } else { + ++(*this); + ++split_index; + } + } while (true); + assert(current_size <= target_size); + + _left_or_right<is_exclusive>(split_index, insert_index, 
is_insert_left); + assert(split_index == index()); + return current_size; + } + + size_t seek_split(size_t start_size, size_t extra_size, size_t target_size) { + assert(!is_end()); + assert(index() == 0); + size_t current_size = start_size; + do { + if (is_last()) { + break; + } + + size_t nxt_size = current_size; + if (index() == 0) { + nxt_size += extra_size; + } + nxt_size += size(); + if (nxt_size > target_size) { + break; + } + current_size = nxt_size; + ++(*this); + } while (true); + assert(current_size <= target_size); + return current_size; + } + + // Note: possible to return an end iterater if to_index == INDEX_END + template <KeyT KT> + void copy_out_until( + typename container_t::template Appender<KT>& appender, index_t& to_index) { + if (is_end()) { + assert(!container.has_next()); + if (to_index == INDEX_END) { + to_index = index(); + } + assert(to_index == index()); + return; + } + index_t items; + if (to_index == INDEX_END || to_index == INDEX_LAST) { + items = to_index; + } else { + assert(is_valid_index(to_index)); + assert(index() <= to_index); + items = to_index - index(); + } + if (appender.append(container, items)) { + set_end(); + } + to_index = index(); + } + + node_offset_t trim_until(NodeExtentMutable& mut) { + if (is_end()) { + return 0; + } + return container_t::trim_until(mut, container); + } + + node_offset_t trim_at(NodeExtentMutable& mut, node_offset_t trimmed) { + assert(!is_end()); + return container_t::trim_at(mut, container, trimmed); + } + + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + container.encode(p_node_start, encoded); + uint8_t is_end = _is_end; + ceph::encode(is_end, encoded); + } + + static me_t decode(const char* p_node_start, + ceph::bufferlist::const_iterator& delta) { + auto container = container_t::decode(p_node_start, delta); + auto ret = me_t(container); + uint8_t is_end; + ceph::decode(is_end, delta); + if (is_end) { + ret.set_end(); + } + return ret; + } + + static node_offset_t header_size() { + return container_t::header_size(); + } + + template <KeyT KT> + static node_offset_t estimate_insert(const full_key_t<KT>& key, const value_t& value) { + return container_t::template estimate_insert<KT>(key, value); + } + + private: + container_t container; + bool _is_end = false; + }; + + /* + * iterator_t encapsulates both indexable and iterative implementations + * from a *non-empty* container. 
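+ * An indexable container supports random access by index (keys(),
+ * operator[]), while an iterative container only supports forward
+ * traversal (has_next(), operator++); both are wrapped behind the
+ * same iterator_t interface summarized below: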
+ * cstr(const container_t&) + * access: + * index() -> index_t + * get_key() -> key_get_type (const reference or value type) + * is_last() -> bool + * is_end() -> bool + * size() -> node_offset_t + * size_overhead() -> node_offset_t + * (IS_BOTTOM) get_p_value() -> const value_t* + * (!IS_BOTTOM) get_nxt_container() -> nxt_stage::container_t + * (!IS_BOTTOM) size_to_nxt() -> node_offset_t + * seek: + * operator++() -> iterator_t& + * seek_at(index) + * seek_till_end(index) + * seek_last() + * set_end() + * seek(key, exclude_last) -> MatchKindBS + * insert: + * (IS_BOTTOM) insert(mut, key, value, size, p_left_bound) -> p_value + * (!IS_BOTTOM) insert_prefix(mut, key, size, p_left_bound) -> memory_range_t + * (!IS_BOTTOM) update_size(mut, size) + * split: + * seek_split_inserted<bool is_exclusive>( + * start_size, extra_size, target_size, insert_index, insert_size, + * std::optional<bool>& is_insert_left) + * -> insert to left/right/unknown (!exclusive) + * -> insert to left/right (exclusive, can be end) + * -> split_size + * seek_split(start_size, extra_size, target_size) -> split_size + * copy_out_until(appender, to_index) (can be end) + * trim_until(mut) -> trim_size + * (!IS_BOTTOM) trim_at(mut, trimmed) -> trim_size + * denc: + * encode(p_node_start, encoded) + * decode(p_node_start, delta) -> iterator_t + * static: + * header_size() -> node_offset_t + * estimate_insert(key, value) -> node_offset_t + */ + using iterator_t = _iterator_t<CONTAINER_TYPE>; + /* TODO: detailed comments + * - trim_until(mut) -> trim_size + * * keep 0 to i - 1, and remove the rest, return the size trimmed. + * * if this is the end iterator, do nothing and return 0. + * * if this is the start iterator, normally needs to go to the higher + * stage to trim the entire container. + * - trim_at(mut, trimmed) -> trim_size + * * trim happens inside the current iterator, causing the size reduced by + * <trimmed>, return the total size trimmed. + */ + + /* + * Lookup internals (hide?) 
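+ * (smallest_result/nxt_lower_bound/lower_bound recurse into the next
+ * stage via get_nxt_container() until the bottom stage yields the value.)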
+ */ + + template <bool GET_KEY> + static result_t smallest_result( + const iterator_t& iter, full_key_t<KeyT::VIEW>* index_key) { + static_assert(!IS_BOTTOM); + assert(!iter.is_end()); + auto pos_smallest = NXT_STAGE_T::position_t::begin(); + auto nxt_container = iter.get_nxt_container(); + auto value_ptr = NXT_STAGE_T::template get_p_value<GET_KEY>( + nxt_container, pos_smallest, index_key); + if constexpr (GET_KEY) { + index_key->set(iter.get_key()); + } + return result_t{{iter.index(), pos_smallest}, value_ptr, STAGE}; + } + + template <bool GET_KEY> + static result_t nxt_lower_bound( + const full_key_t<KeyT::HOBJ>& key, iterator_t& iter, + MatchHistory& history, full_key_t<KeyT::VIEW>* index_key) { + static_assert(!IS_BOTTOM); + assert(!iter.is_end()); + auto nxt_container = iter.get_nxt_container(); + auto nxt_result = NXT_STAGE_T::template lower_bound<GET_KEY>( + nxt_container, key, history, index_key); + if (nxt_result.is_end()) { + if (iter.is_last()) { + return result_t::end(); + } else { + return smallest_result<GET_KEY>(++iter, index_key); + } + } else { + if constexpr (GET_KEY) { + index_key->set(iter.get_key()); + } + return result_t::from_nxt(iter.index(), nxt_result); + } + } + + template <bool GET_POS, bool GET_KEY, bool GET_VAL> + static void lookup_largest_slot( + const container_t& container, position_t* p_position, + full_key_t<KeyT::VIEW>* p_index_key, const value_t** pp_value) { + auto iter = iterator_t(container); + iter.seek_last(); + if constexpr (GET_KEY) { + assert(p_index_key); + p_index_key->set(iter.get_key()); + } + if constexpr (GET_POS) { + assert(p_position); + p_position->index = iter.index(); + } + if constexpr (IS_BOTTOM) { + if constexpr (GET_VAL) { + assert(pp_value); + *pp_value = iter.get_p_value(); + } + } else { + auto nxt_container = iter.get_nxt_container(); + if constexpr (GET_POS) { + NXT_STAGE_T::template lookup_largest_slot<true, GET_KEY, GET_VAL>( + nxt_container, &p_position->nxt, p_index_key, pp_value); + } else { + NXT_STAGE_T::template lookup_largest_slot<false, GET_KEY, GET_VAL>( + nxt_container, nullptr, p_index_key, pp_value); + } + } + } + + template <bool GET_KEY = false> + static const value_t* get_p_value( + const container_t& container, const position_t& position, + full_key_t<KeyT::VIEW>* index_key = nullptr) { + auto iter = iterator_t(container); + iter.seek_at(position.index); + if constexpr (GET_KEY) { + index_key->set(iter.get_key()); + } + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + return NXT_STAGE_T::template get_p_value<GET_KEY>( + nxt_container, position.nxt, index_key); + } else { + return iter.get_p_value(); + } + } + + static void get_key_view( + const container_t& container, + const position_t& position, + full_key_t<KeyT::VIEW>& index_key) { + auto iter = iterator_t(container); + iter.seek_at(position.index); + index_key.set(iter.get_key()); + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + return NXT_STAGE_T::get_key_view(nxt_container, position.nxt, index_key); + } + } + + template <bool GET_KEY = false> + static result_t lower_bound( + const container_t& container, + const full_key_t<KeyT::HOBJ>& key, + MatchHistory& history, + full_key_t<KeyT::VIEW>* index_key = nullptr) { + bool exclude_last = false; + if (history.get<STAGE>().has_value()) { + if (*history.get<STAGE>() == MatchKindCMP::EQ) { + // lookup is short-circuited + if constexpr (!IS_BOTTOM) { + assert(history.get<STAGE - 1>().has_value()); + if (history.is_GT<STAGE - 1>()) { + auto 
iter = iterator_t(container); + bool test_key_equal; + if constexpr (STAGE == STAGE_STRING) { + // TODO(cross-node string dedup) + // test_key_equal = (iter.get_key().type() == ns_oid_view_t::Type::MIN); + auto cmp = compare_to<KeyT::HOBJ>(key, iter.get_key()); + assert(cmp != MatchKindCMP::GT); + test_key_equal = (cmp == MatchKindCMP::EQ); + } else { + auto cmp = compare_to<KeyT::HOBJ>(key, iter.get_key()); + // From history, key[stage] == parent[stage][index - 1] + // which should be the smallest possible value for all + // index[stage][*] + assert(cmp != MatchKindCMP::GT); + test_key_equal = (cmp == MatchKindCMP::EQ); + } + if (test_key_equal) { + return nxt_lower_bound<GET_KEY>(key, iter, history, index_key); + } else { + // key[stage] < index[stage][left-most] + return smallest_result<GET_KEY>(iter, index_key); + } + } + } + // IS_BOTTOM || !history.is_GT<STAGE - 1>() + auto iter = iterator_t(container); + iter.seek_last(); + if constexpr (STAGE == STAGE_STRING) { + // TODO(cross-node string dedup) + // assert(iter.get_key().type() == ns_oid_view_t::Type::MAX); + assert(compare_to<KeyT::HOBJ>(key, iter.get_key()) == MatchKindCMP::EQ); + } else { + assert(compare_to<KeyT::HOBJ>(key, iter.get_key()) == MatchKindCMP::EQ); + } + if constexpr (GET_KEY) { + index_key->set(iter.get_key()); + } + if constexpr (IS_BOTTOM) { + auto value_ptr = iter.get_p_value(); + return result_t{{iter.index()}, value_ptr, MSTAT_EQ}; + } else { + auto nxt_container = iter.get_nxt_container(); + auto nxt_result = NXT_STAGE_T::template lower_bound<GET_KEY>( + nxt_container, key, history, index_key); + // !history.is_GT<STAGE - 1>() means + // key[stage+1 ...] <= index[stage+1 ...][*] + assert(!nxt_result.is_end()); + return result_t::from_nxt(iter.index(), nxt_result); + } + } else if (*history.get<STAGE>() == MatchKindCMP::LT) { + exclude_last = true; + } + } + auto iter = iterator_t(container); + auto bs_match = iter.seek(key, exclude_last); + if (iter.is_end()) { + assert(!exclude_last); + assert(bs_match == MatchKindBS::NE); + history.set<STAGE>(MatchKindCMP::GT); + return result_t::end(); + } + history.set<STAGE>(bs_match == MatchKindBS::EQ ? + MatchKindCMP::EQ : MatchKindCMP::LT); + if constexpr (IS_BOTTOM) { + if constexpr (GET_KEY) { + index_key->set(iter.get_key()); + } + auto value_ptr = iter.get_p_value(); + return result_t{{iter.index()}, value_ptr, + (bs_match == MatchKindBS::EQ ? 
MSTAT_EQ : MSTAT_LT0)}; + } else { + if (bs_match == MatchKindBS::EQ) { + return nxt_lower_bound<GET_KEY>(key, iter, history, index_key); + } else { + return smallest_result<GET_KEY>(iter, index_key); + } + } + } + + template <KeyT KT> + static node_offset_t insert_size(const full_key_t<KT>& key, const value_t& value) { + if constexpr (IS_BOTTOM) { + return iterator_t::template estimate_insert<KT>(key, value); + } else { + return iterator_t::template estimate_insert<KT>(key, value) + + NXT_STAGE_T::iterator_t::header_size() + + NXT_STAGE_T::template insert_size<KT>(key, value); + } + } + + template <KeyT KT> + static node_offset_t insert_size_at( + match_stage_t stage, const full_key_t<KeyT::HOBJ>& key, const value_t& value) { + if (stage == STAGE) { + return insert_size<KT>(key, value); + } else { + assert(stage < STAGE); + return NXT_STAGE_T::template insert_size_at<KT>(stage, key, value); + } + } + + template <typename T = std::tuple<match_stage_t, node_offset_t>> + static std::enable_if_t<NODE_TYPE == node_type_t::INTERNAL, T> evaluate_insert( + const container_t& container, const full_key_t<KeyT::VIEW>& key, + const value_t& value, position_t& position, bool evaluate_last) { + auto iter = iterator_t(container); + auto& index = position.index; + if (evaluate_last || index == INDEX_END) { + iter.seek_last(); + index = iter.index(); + // evaluate the previous index + } else { + assert(is_valid_index(index)); + // evaluate the current index + iter.seek_at(index); + auto match = compare_to<KeyT::VIEW>(key, iter.get_key()); + if (match == MatchKindCMP::EQ) { + if constexpr (IS_BOTTOM) { + ceph_abort("insert conflict at current index!"); + } else { + // insert into the current index + auto nxt_container = iter.get_nxt_container(); + return NXT_STAGE_T::evaluate_insert( + nxt_container, key, value, position.nxt, false); + } + } else { + assert(match == MatchKindCMP::LT); + if (index == 0) { + // already the first index, so insert at the current index + return {STAGE, insert_size<KeyT::VIEW>(key, value)}; + } + --index; + iter = iterator_t(container); + iter.seek_at(index); + // proceed to evaluate the previous index + } + } + + // XXX(multi-type): when key is from a different type of node + auto match = compare_to<KeyT::VIEW>(key, iter.get_key()); + if (match == MatchKindCMP::GT) { + // key doesn't match both indexes, so insert at the current index + ++index; + return {STAGE, insert_size<KeyT::VIEW>(key, value)}; + } else { + assert(match == MatchKindCMP::EQ); + if constexpr (IS_BOTTOM) { + // ceph_abort? 
+ ceph_abort("insert conflict at the previous index!"); + } else { + // insert into the previous index + auto nxt_container = iter.get_nxt_container(); + return NXT_STAGE_T::evaluate_insert( + nxt_container, key, value, position.nxt, true); + } + } + } + + template <typename T = bool> + static std::enable_if_t<NODE_TYPE == node_type_t::LEAF, T> + compensate_insert_position_at(match_stage_t stage, position_t& position) { + auto& index = position.index; + if (stage == STAGE) { + assert(index == 0); + // insert at the end of the current stage + index = INDEX_END; + return true; + } else { + if constexpr (IS_BOTTOM) { + ceph_abort("impossible path"); + } else { + assert(stage < STAGE); + bool compensate = NXT_STAGE_T:: + compensate_insert_position_at(stage, position.nxt); + if (compensate) { + assert(is_valid_index(index)); + if (index == 0) { + // insert into the *last* index of the current stage + index = INDEX_LAST; + return true; + } else { + --index; + return false; + } + } else { + return false; + } + } + } + } + + static void patch_insert_end(position_t& insert_pos, match_stage_t insert_stage) { + assert(insert_stage <= STAGE); + if (insert_stage == STAGE) { + insert_pos.index = INDEX_END; + } else if constexpr (!IS_BOTTOM) { + insert_pos.index = INDEX_LAST; + NXT_STAGE_T::patch_insert_end(insert_pos.nxt, insert_stage); + } + } + + template <typename T = std::tuple<match_stage_t, node_offset_t>> + static std::enable_if_t<NODE_TYPE == node_type_t::LEAF, T> evaluate_insert( + const full_key_t<KeyT::HOBJ>& key, const onode_t& value, + const MatchHistory& history, match_stat_t mstat, position_t& position) { + match_stage_t insert_stage = STAGE_TOP; + while (*history.get_by_stage(insert_stage) == MatchKindCMP::EQ) { + assert(insert_stage != STAGE_BOTTOM && "insert conflict!"); + --insert_stage; + } + + if (history.is_GT()) { + if (position.is_end()) { + // no need to compensate insert position + assert(insert_stage <= STAGE && "impossible insert stage"); + } else if (position == position_t::begin()) { + // I must be short-circuited by staged::smallest_result() + // in staged::lower_bound(), so we need to rely on mstat instead + assert(mstat >= MSTAT_LT0 && mstat <= MSTAT_LT3); + if (mstat == MSTAT_LT0) { + insert_stage = STAGE_RIGHT; + } else if (mstat == MSTAT_LT1) { + insert_stage = STAGE_STRING; + } else { + insert_stage = STAGE_LEFT; + } + // XXX(multi-type): need to upgrade node type before inserting an + // incompatible index at front. 
+ assert(insert_stage <= STAGE && "incompatible insert"); + } else { + assert(insert_stage <= STAGE && "impossible insert stage"); + [[maybe_unused]] bool ret = compensate_insert_position_at(insert_stage, position); + assert(!ret); + } + } + + if (position.is_end()) { + patch_insert_end(position, insert_stage); + } + + node_offset_t insert_size = insert_size_at<KeyT::HOBJ>(insert_stage, key, value); + + return {insert_stage, insert_size}; + } + + template <KeyT KT> + static const value_t* insert_new( + NodeExtentMutable& mut, const memory_range_t& range, + const full_key_t<KT>& key, const value_t& value) { + char* p_insert = const_cast<char*>(range.p_end); + const value_t* p_value = nullptr; + StagedAppender<KT> appender; + appender.init(&mut, p_insert); + appender.append(key, value, p_value); + [[maybe_unused]] const char* p_insert_front = appender.wrap(); + assert(p_insert_front == range.p_start); + return p_value; + } + + template <KeyT KT, bool SPLIT> + static const value_t* proceed_insert_recursively( + NodeExtentMutable& mut, const container_t& container, + const full_key_t<KT>& key, const value_t& value, + position_t& position, match_stage_t& stage, + node_offset_t& _insert_size, const char* p_left_bound) { + // proceed insert from right to left + assert(stage <= STAGE); + auto iter = iterator_t(container); + auto& index = position.index; + + bool do_insert = false; + if (stage == STAGE) { + if (index == INDEX_END) { + iter.seek_last(); + iter.set_end(); + index = iter.index(); + } else { + assert(is_valid_index(index)); + iter.seek_till_end(index); + } + do_insert = true; + } else { // stage < STAGE + if (index == INDEX_LAST) { + iter.seek_last(); + index = iter.index(); + } else { + assert(is_valid_index(index)); + iter.seek_till_end(index); + } + if constexpr (SPLIT) { + if (iter.is_end()) { + // insert at the higher stage due to split + do_insert = true; + _insert_size = insert_size<KT>(key, value); + stage = STAGE; + } + } else { + assert(!iter.is_end()); + } + } + + if (do_insert) { + if constexpr (!IS_BOTTOM) { + position.nxt = position_t::nxt_t::begin(); + } + assert(_insert_size == insert_size<KT>(key, value)); + if constexpr (IS_BOTTOM) { + return iter.template insert<KT>( + mut, key, value, _insert_size, p_left_bound); + } else { + auto range = iter.template insert_prefix<KT>( + mut, key, _insert_size, p_left_bound); + return NXT_STAGE_T::template insert_new<KT>(mut, range, key, value); + } + } else { + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + auto p_value = NXT_STAGE_T::template proceed_insert_recursively<KT, SPLIT>( + mut, nxt_container, key, value, + position.nxt, stage, _insert_size, p_left_bound); + iter.update_size(mut, _insert_size); + return p_value; + } else { + ceph_abort("impossible path"); + } + } + } + + template <KeyT KT, bool SPLIT> + static const value_t* proceed_insert( + NodeExtentMutable& mut, const container_t& container, + const full_key_t<KT>& key, const value_t& value, + position_t& position, match_stage_t& stage, node_offset_t& _insert_size) { + auto p_left_bound = container.p_left_bound(); + if (unlikely(!container.keys())) { + if (position.is_end()) { + position = position_t::begin(); + assert(stage == STAGE); + assert(_insert_size == insert_size<KT>(key, value)); + } else if (position == position_t::begin()) { + // when insert into a trimmed and empty left node + stage = STAGE; + _insert_size = insert_size<KT>(key, value); + } else { + ceph_abort("impossible path"); + } + if constexpr (IS_BOTTOM) { + return 
container_t::template insert_at<KT>( + mut, container, key, value, 0, _insert_size, p_left_bound); + } else { + auto range = container_t::template insert_prefix_at<KT>( + mut, container, key, 0, _insert_size, p_left_bound); + return NXT_STAGE_T::template insert_new<KT>(mut, range, key, value); + } + } else { + return proceed_insert_recursively<KT, SPLIT>( + mut, container, key, value, + position, stage, _insert_size, p_left_bound); + } + } + + static std::ostream& dump(const container_t& container, + std::ostream& os, + const std::string& prefix, + size_t& size, + const char* p_start) { + auto iter = iterator_t(container); + assert(!iter.is_end()); + std::string prefix_blank(prefix.size(), ' '); + const std::string* p_prefix = &prefix; + size += iterator_t::header_size(); + do { + std::ostringstream sos; + sos << *p_prefix << iter.get_key() << ": "; + std::string i_prefix = sos.str(); + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + size += iter.size_to_nxt(); + NXT_STAGE_T::dump(nxt_container, os, i_prefix, size, p_start); + } else { + auto value_ptr = iter.get_p_value(); + int offset = reinterpret_cast<const char*>(value_ptr) - p_start; + size += iter.size(); + os << "\n" << i_prefix; + if constexpr (NODE_TYPE == node_type_t::LEAF) { + os << *value_ptr; + } else { + os << "0x" << std::hex << value_ptr->value << std::dec; + } + os << " " << size << "B" + << " @" << offset << "B"; + } + if (iter.is_last()) { + break; + } else { + ++iter; + p_prefix = &prefix_blank; + } + } while (true); + return os; + } + + static void validate(const container_t& container) { + auto iter = iterator_t(container); + assert(!iter.is_end()); + auto key = iter.get_key(); + do { + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + NXT_STAGE_T::validate(nxt_container); + } + if (iter.is_last()) { + break; + } else { + ++iter; + assert(compare_to(key, iter.get_key()) == MatchKindCMP::LT); + key = iter.get_key(); + } + } while (true); + } + + static void get_stats(const container_t& container, node_stats_t& stats, + full_key_t<KeyT::VIEW>& index_key) { + auto iter = iterator_t(container); + assert(!iter.is_end()); + stats.size_overhead += iterator_t::header_size(); + do { + index_key.replace(iter.get_key()); + stats.size_overhead += iter.size_overhead(); + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + NXT_STAGE_T::get_stats(nxt_container, stats, index_key); + } else { + ++stats.num_kvs; + size_t kv_logical_size = index_key.size_logical(); + size_t value_size; + if constexpr (NODE_TYPE == node_type_t::LEAF) { + value_size = iter.get_p_value()->size; + } else { + value_size = sizeof(value_t); + } + stats.size_value += value_size; + kv_logical_size += value_size; + stats.size_logical += kv_logical_size; + } + if (iter.is_last()) { + break; + } else { + ++iter; + } + } while (true); + } + + static bool next_position(const container_t& container, position_t& pos) { + auto iter = iterator_t(container); + assert(!iter.is_end()); + iter.seek_at(pos.index); + bool find_next; + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + find_next = NXT_STAGE_T::next_position(nxt_container, pos.nxt); + } else { + find_next = true; + } + if (find_next) { + if (iter.is_last()) { + return true; + } else { + pos.index = iter.index() + 1; + if constexpr (!IS_BOTTOM) { + pos.nxt = NXT_STAGE_T::position_t::begin(); + } + return false; + } + } else { + return false; + } + } + + struct _BaseEmpty {}; + class 
_BaseWithNxtIterator { + protected: + typename NXT_STAGE_T::StagedIterator _nxt; + }; + class StagedIterator + : std::conditional_t<IS_BOTTOM, _BaseEmpty, _BaseWithNxtIterator> { + public: + StagedIterator() = default; + bool valid() const { return iter.has_value(); } + index_t index() const { + return iter->index(); + } + bool is_end() const { return iter->is_end(); } + bool in_progress() const { + assert(valid()); + if constexpr (!IS_BOTTOM) { + if (this->_nxt.valid()) { + if (this->_nxt.index() == 0) { + return this->_nxt.in_progress(); + } else { + return true; + } + } else { + return false; + } + } else { + return false; + } + } + key_get_type get_key() const { return iter->get_key(); } + + iterator_t& get() { return *iter; } + void set(const container_t& container) { + assert(!valid()); + iter = iterator_t(container); + } + void set_end() { iter->set_end(); } + typename NXT_STAGE_T::StagedIterator& nxt() { + if constexpr (!IS_BOTTOM) { + if (!this->_nxt.valid()) { + auto nxt_container = iter->get_nxt_container(); + this->_nxt.set(nxt_container); + } + return this->_nxt; + } else { + ceph_abort("impossible path"); + } + } + typename NXT_STAGE_T::StagedIterator& get_nxt() { + if constexpr (!IS_BOTTOM) { + return this->_nxt; + } else { + ceph_abort("impossible path"); + } + } + StagedIterator& operator++() { + if (iter->is_last()) { + iter->set_end(); + } else { + ++(*iter); + } + if constexpr (!IS_BOTTOM) { + this->_nxt.reset(); + } + return *this; + } + void reset() { + if (valid()) { + iter.reset(); + if constexpr (!IS_BOTTOM) { + this->_nxt.reset(); + } + } + } + std::ostream& print(std::ostream& os, bool is_top) const { + if (valid()) { + if (iter->is_end()) { + return os << "END"; + } else { + os << index(); + } + } else { + if (is_top) { + return os << "invalid StagedIterator!"; + } else { + os << "0!"; + } + } + if constexpr (!IS_BOTTOM) { + os << ", "; + return this->_nxt.print(os, false); + } else { + return os; + } + } + position_t get_pos() const { + if (valid()) { + if constexpr (IS_BOTTOM) { + return position_t{index()}; + } else { + return position_t{index(), this->_nxt.get_pos()}; + } + } else { + return position_t::begin(); + } + } + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + uint8_t present = static_cast<bool>(iter); + ceph::encode(present, encoded); + if (iter.has_value()) { + iter->encode(p_node_start, encoded); + if constexpr (!IS_BOTTOM) { + this->_nxt.encode(p_node_start, encoded); + } + } + } + static StagedIterator decode(const char* p_node_start, + ceph::bufferlist::const_iterator& delta) { + StagedIterator ret; + uint8_t present; + ceph::decode(present, delta); + if (present) { + ret.iter = iterator_t::decode(p_node_start, delta); + if constexpr (!IS_BOTTOM) { + ret._nxt = NXT_STAGE_T::StagedIterator::decode(p_node_start, delta); + } + } + return ret; + } + friend std::ostream& operator<<(std::ostream& os, const StagedIterator& iter) { + return iter.print(os, true); + } + private: + std::optional<iterator_t> iter; + }; + + static bool recursively_locate_split( + size_t& current_size, size_t extra_size, + size_t target_size, StagedIterator& split_at) { + assert(current_size <= target_size); + iterator_t& split_iter = split_at.get(); + current_size = split_iter.seek_split(current_size, extra_size, target_size); + assert(current_size <= target_size); + assert(!split_iter.is_end()); + if (split_iter.index() == 0) { + extra_size += iterator_t::header_size(); + } else { + extra_size = 0; + } + bool locate_nxt; + if constexpr 
(!IS_BOTTOM) { + locate_nxt = NXT_STAGE_T::recursively_locate_split( + current_size, extra_size + split_iter.size_to_nxt(), + target_size, split_at.nxt()); + } else { // IS_BOTTOM + // located upper_bound, fair split strategy + size_t nxt_size = split_iter.size() + extra_size; + assert(current_size + nxt_size > target_size); + if (current_size + nxt_size/2 < target_size) { + // include next + current_size += nxt_size; + locate_nxt = true; + } else { + // exclude next + locate_nxt = false; + } + } + if (locate_nxt) { + if (split_iter.is_last()) { + return true; + } else { + ++split_at; + return false; + } + } else { + return false; + } + } + + static bool recursively_locate_split_inserted( + size_t& current_size, size_t extra_size, size_t target_size, + position_t& insert_pos, match_stage_t insert_stage, size_t insert_size, + std::optional<bool>& is_insert_left, StagedIterator& split_at) { + assert(current_size <= target_size); + assert(!is_insert_left.has_value()); + iterator_t& split_iter = split_at.get(); + auto& insert_index = insert_pos.index; + if (insert_stage == STAGE) { + current_size = split_iter.template seek_split_inserted<true>( + current_size, extra_size, target_size, + insert_index, insert_size, is_insert_left); + assert(is_insert_left.has_value()); + assert(current_size <= target_size); + if (split_iter.index() == 0) { + if (insert_index == 0) { + if (*is_insert_left == false) { + extra_size += iterator_t::header_size(); + } else { + extra_size = 0; + } + } else { + extra_size += iterator_t::header_size(); + } + } else { + extra_size = 0; + } + if (*is_insert_left == false && split_iter.index() == insert_index) { + // split_iter can be end + // found the lower-bound of target_size + // ...[s_index-1] |!| (i_index) [s_index]... + + // located upper-bound, fair split strategy + // look at the next slot (the insert item) + size_t nxt_size = insert_size + extra_size; + assert(current_size + nxt_size > target_size); + if (current_size + nxt_size/2 < target_size) { + // include next + *is_insert_left = true; + current_size += nxt_size; + if (split_iter.is_end()) { + // ...[s_index-1] (i_index) |!| + return true; + } else { + return false; + } + } else { + // exclude next + return false; + } + } else { + // Already considered insert effect in the current stage. + // Look into the next stage to identify the target_size lower-bound w/o + // insert effect. 
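+      // At the bottom stage below, the split point is chosen with a fair
+      // tie-break: the next slot would overshoot target_size, so it is
+      // kept on the left only when less than half of its size lies beyond
+      // the target, keeping the split as close to target_size as possible.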
+ assert(!split_iter.is_end()); + bool locate_nxt; + if constexpr (!IS_BOTTOM) { + locate_nxt = NXT_STAGE_T::recursively_locate_split( + current_size, extra_size + split_iter.size_to_nxt(), + target_size, split_at.nxt()); + } else { // IS_BOTTOM + // located upper-bound, fair split strategy + // look at the next slot + size_t nxt_size = split_iter.size() + extra_size; + assert(current_size + nxt_size > target_size); + if (current_size + nxt_size/2 < target_size) { + // include next + current_size += nxt_size; + locate_nxt = true; + } else { + // exclude next + locate_nxt = false; + } + } + if (locate_nxt) { + if (split_iter.is_last()) { + auto end_index = split_iter.index() + 1; + if (insert_index == INDEX_END) { + insert_index = end_index; + } + assert(insert_index <= end_index); + if (insert_index == end_index) { + assert(*is_insert_left == false); + split_iter.set_end(); + // ...[s_index-1] |!| (i_index) + return false; + } else { + assert(*is_insert_left == true); + return true; + } + } else { + ++split_at; + return false; + } + } else { + return false; + } + } + } else { + if constexpr (!IS_BOTTOM) { + assert(insert_stage < STAGE); + current_size = split_iter.template seek_split_inserted<false>( + current_size, extra_size, target_size, + insert_index, insert_size, is_insert_left); + assert(!split_iter.is_end()); + assert(current_size <= target_size); + if (split_iter.index() == 0) { + extra_size += iterator_t::header_size(); + } else { + extra_size = 0; + } + bool locate_nxt; + if (!is_insert_left.has_value()) { + // Considered insert effect in the current stage, and insert happens + // in the lower stage. + // Look into the next stage to identify the target_size lower-bound w/ + // insert effect. + assert(split_iter.index() == insert_index); + locate_nxt = NXT_STAGE_T::recursively_locate_split_inserted( + current_size, extra_size + split_iter.size_to_nxt(), target_size, + insert_pos.nxt, insert_stage, insert_size, + is_insert_left, split_at.nxt()); + assert(is_insert_left.has_value()); +#ifndef NDEBUG + if (locate_nxt) { + assert(*is_insert_left == true); + } +#endif + } else { + // is_insert_left.has_value() == true + // Insert will *not* happen in the lower stage. 
+ // Need to look into the next stage to identify the target_size + // lower-bound w/ insert effect + assert(split_iter.index() != insert_index); + locate_nxt = NXT_STAGE_T::recursively_locate_split( + current_size, extra_size + split_iter.size_to_nxt(), + target_size, split_at.nxt()); +#ifndef NDEBUG + if (split_iter.index() < insert_index) { + assert(*is_insert_left == false); + } else { + assert(*is_insert_left == true); + } +#endif + } + if (locate_nxt) { + if (split_iter.is_last()) { + return true; + } else { + ++split_at; + return false; + } + } else { + return false; + } + } else { + ceph_abort("impossible path"); + return false;; + } + } + } + + /* + * container appender type system + * container_t::Appender(NodeExtentMutable& mut, char* p_append) + * append(const container_t& src, index_t from, index_t items) + * wrap() -> char* + * IF !IS_BOTTOM: + * open_nxt(const key_get_type&) + * open_nxt(const full_key_t&) + * -> std::tuple<NodeExtentMutable&, char*> + * wrap_nxt(char* p_append) + * ELSE + * append(const full_key_t& key, const value_t& value) + */ + template <KeyT KT> + struct _BaseWithNxtAppender { + typename NXT_STAGE_T::template StagedAppender<KT> _nxt; + }; + template <KeyT KT> + class StagedAppender + : std::conditional_t<IS_BOTTOM, _BaseEmpty, _BaseWithNxtAppender<KT>> { + public: + StagedAppender() = default; + ~StagedAppender() { + assert(!require_wrap_nxt); + assert(!valid()); + } + bool valid() const { return appender.has_value(); } + index_t index() const { + assert(valid()); + return _index; + } + bool in_progress() const { return require_wrap_nxt; } + // TODO: pass by reference + void init(NodeExtentMutable* p_mut, char* p_start) { + assert(!valid()); + appender = typename container_t::template Appender<KT>(p_mut, p_start); + _index = 0; + } + // possible to make src_iter end if to_index == INDEX_END + void append_until(StagedIterator& src_iter, index_t& to_index) { + assert(!require_wrap_nxt); + auto s_index = src_iter.index(); + src_iter.get().template copy_out_until<KT>(*appender, to_index); + assert(src_iter.index() == to_index); + assert(to_index >= s_index); + auto increment = (to_index - s_index); + if (increment) { + _index += increment; + if constexpr (!IS_BOTTOM) { + src_iter.get_nxt().reset(); + } + } + } + void append(const full_key_t<KT>& key, + const value_t& value, const value_t*& p_value) { + assert(!require_wrap_nxt); + if constexpr (!IS_BOTTOM) { + auto& nxt = open_nxt(key); + nxt.append(key, value, p_value); + wrap_nxt(); + } else { + appender->append(key, value, p_value); + ++_index; + } + } + char* wrap() { + assert(valid()); + assert(_index > 0); + if constexpr (!IS_BOTTOM) { + if (require_wrap_nxt) { + wrap_nxt(); + } + } + auto ret = appender->wrap(); + appender.reset(); + return ret; + } + typename NXT_STAGE_T::template StagedAppender<KT>& + open_nxt(key_get_type paritial_key) { + assert(!require_wrap_nxt); + if constexpr (!IS_BOTTOM) { + require_wrap_nxt = true; + auto [p_mut, p_append] = appender->open_nxt(paritial_key); + this->_nxt.init(p_mut, p_append); + return this->_nxt; + } else { + ceph_abort("impossible path"); + } + } + typename NXT_STAGE_T::template StagedAppender<KT>& + open_nxt(const full_key_t<KT>& key) { + assert(!require_wrap_nxt); + if constexpr (!IS_BOTTOM) { + require_wrap_nxt = true; + auto [p_mut, p_append] = appender->open_nxt(key); + this->_nxt.init(p_mut, p_append); + return this->_nxt; + } else { + ceph_abort("impossible path"); + } + } + typename NXT_STAGE_T::template StagedAppender<KT>& get_nxt() { + if 
constexpr (!IS_BOTTOM) { + assert(require_wrap_nxt); + return this->_nxt; + } else { + ceph_abort("impossible path"); + } + } + void wrap_nxt() { + if constexpr (!IS_BOTTOM) { + assert(require_wrap_nxt); + require_wrap_nxt = false; + auto p_append = this->_nxt.wrap(); + appender->wrap_nxt(p_append); + ++_index; + } else { + ceph_abort("impossible path"); + } + } + private: + std::optional<typename container_t::template Appender<KT>> appender; + index_t _index; + bool require_wrap_nxt = false; + }; + + template <KeyT KT> + static void _append_range( + StagedIterator& src_iter, StagedAppender<KT>& appender, index_t& to_index) { + if (src_iter.is_end()) { + // append done + assert(to_index == INDEX_END); + to_index = src_iter.index(); + } else if constexpr (!IS_BOTTOM) { + if (appender.in_progress()) { + // appender has appended something at the current item, + // cannot append the current item as-a-whole + index_t to_index_nxt = INDEX_END; + NXT_STAGE_T::template _append_range<KT>( + src_iter.nxt(), appender.get_nxt(), to_index_nxt); + ++src_iter; + appender.wrap_nxt(); + } else if (src_iter.in_progress()) { + // src_iter is not at the beginning of the current item, + // cannot append the current item as-a-whole + index_t to_index_nxt = INDEX_END; + NXT_STAGE_T::template _append_range<KT>( + src_iter.nxt(), appender.open_nxt(src_iter.get_key()), to_index_nxt); + ++src_iter; + appender.wrap_nxt(); + } else { + // we can safely append the current item as-a-whole + } + } + appender.append_until(src_iter, to_index); + } + + template <KeyT KT> + static void _append_into(StagedIterator& src_iter, StagedAppender<KT>& appender, + position_t& position, match_stage_t stage) { + assert(position.index == src_iter.index()); + // reaches the last item + if (stage == STAGE) { + // done, end recursion + if constexpr (!IS_BOTTOM) { + position.nxt = position_t::nxt_t::begin(); + } + } else { + assert(stage < STAGE); + // proceed append in the next stage + NXT_STAGE_T::template append_until<KT>( + src_iter.nxt(), appender.open_nxt(src_iter.get_key()), + position.nxt, stage); + } + } + + template <KeyT KT> + static void append_until(StagedIterator& src_iter, StagedAppender<KT>& appender, + position_t& position, match_stage_t stage) { + index_t from_index = src_iter.index(); + index_t& to_index = position.index; + assert(from_index <= to_index); + if constexpr (IS_BOTTOM) { + assert(stage == STAGE); + appender.append_until(src_iter, to_index); + } else { + assert(stage <= STAGE); + if (src_iter.index() == to_index) { + _append_into<KT>(src_iter, appender, position, stage); + } else { + if (to_index == INDEX_END) { + assert(stage == STAGE); + } else if (to_index == INDEX_LAST) { + assert(stage < STAGE); + } + _append_range<KT>(src_iter, appender, to_index); + _append_into<KT>(src_iter, appender, position, stage); + } + } + to_index -= from_index; + } + + template <KeyT KT> + static bool append_insert( + const full_key_t<KT>& key, const value_t& value, + StagedIterator& src_iter, StagedAppender<KT>& appender, + bool is_front_insert, match_stage_t& stage, const value_t*& p_value) { + assert(src_iter.valid()); + if (stage == STAGE) { + appender.append(key, value, p_value); + if (src_iter.is_end()) { + return true; + } else { + return false; + } + } else { + assert(stage < STAGE); + if constexpr (!IS_BOTTOM) { + auto nxt_is_end = NXT_STAGE_T::template append_insert<KT>( + key, value, src_iter.get_nxt(), appender.get_nxt(), + is_front_insert, stage, p_value); + if (nxt_is_end) { + appender.wrap_nxt(); + ++src_iter; + 
if (is_front_insert) { + stage = STAGE; + } + if (src_iter.is_end()) { + return true; + } + } + return false; + } else { + ceph_abort("impossible path"); + } + } + } + + /* TrimType: + * BEFORE: remove the entire container, normally means the according higher + * stage iterator needs to be trimmed as-a-whole. + * AFTER: retain the entire container, normally means the trim should be + * start from the next iterator at the higher stage. + * AT: trim happens in the current container, and the according higher + * stage iterator needs to be adjusted by the trimmed size. + */ + static std::tuple<TrimType, node_offset_t> + recursively_trim(NodeExtentMutable& mut, StagedIterator& trim_at) { + if (!trim_at.valid()) { + return {TrimType::BEFORE, 0u}; + } + if (trim_at.is_end()) { + return {TrimType::AFTER, 0u}; + } + + auto& iter = trim_at.get(); + if constexpr (!IS_BOTTOM) { + auto [type, trimmed] = NXT_STAGE_T::recursively_trim( + mut, trim_at.get_nxt()); + node_offset_t trim_size; + if (type == TrimType::AFTER) { + if (iter.is_last()) { + return {TrimType::AFTER, 0u}; + } + ++trim_at; + trim_size = iter.trim_until(mut); + } else if (type == TrimType::BEFORE) { + if (iter.index() == 0) { + return {TrimType::BEFORE, 0u}; + } + trim_size = iter.trim_until(mut); + } else { + trim_size = iter.trim_at(mut, trimmed); + } + return {TrimType::AT, trim_size}; + } else { + if (iter.index() == 0) { + return {TrimType::BEFORE, 0u}; + } else { + auto trimmed = iter.trim_until(mut); + return {TrimType::AT, trimmed}; + } + } + } + + static void trim(NodeExtentMutable& mut, StagedIterator& trim_at) { + auto [type, trimmed] = recursively_trim(mut, trim_at); + if (type == TrimType::BEFORE) { + assert(trim_at.valid()); + auto& iter = trim_at.get(); + iter.trim_until(mut); + } + } +}; + +/** + * Configurations for struct staged + * + * staged_params_* assembles different container_t implementations (defined by + * stated::_iterator_t) by STAGE, and constructs the final multi-stage + * implementations for different node layouts defined by + * node_extent_t<FieldType, NODE_TYPE>. + * + * The specialized implementations for different layouts are accessible through + * the helper type node_to_stage_t<node_extent_t<FieldType, NODE_TYPE>>. + * + * Specifically, the settings of 8 layouts are: + * + * The layout (N0, LEAF/INTERNAL) has 3 stages: + * - STAGE_LEFT: node_extent_t<node_fields_0_t, LEAF/INTERNAL> + * - STAGE_STRING: item_iterator_t<LEAF/INTERNAL> + * - STAGE_RIGHT: sub_items_t<LEAF/INTERNAL> + * + * The layout (N1, LEAF/INTERNAL) has 3 stages: + * - STAGE_LEFT: node_extent_t<node_fields_1_t, LEAF/INTERNAL> + * - STAGE_STRING: item_iterator_t<LEAF/INTERNAL> + * - STAGE_RIGHT: sub_items_t<LEAF/INTERNAL> + * + * The layout (N2, LEAF/INTERNAL) has 2 stages: + * - STAGE_STRING: node_extent_t<node_fields_2_t, LEAF/INTERNAL> + * - STAGE_RIGHT: sub_items_t<LEAF/INTERNAL> + * + * The layout (N3, LEAF) has 1 stage: + * - STAGE_RIGHT: node_extent_t<leaf_fields_3_t, LEAF> + * + * The layout (N3, INTERNAL) has 1 stage: + * - STAGE_RIGHT: node_extent_t<internal_fields_3_t, INTERNAL> + */ + +template <node_type_t _NODE_TYPE> +struct staged_params_subitems { + using container_t = sub_items_t<_NODE_TYPE>; + static constexpr auto NODE_TYPE = _NODE_TYPE; + static constexpr auto STAGE = STAGE_RIGHT; + + // dummy type in order to make our type system work + // any better solution to get rid of this? 
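To make the stage chaining described in the comment above concrete, here is a simplified, self-contained sketch of how the per-layout parameter structs (staged_params_node_01 and friends, defined around this point) link up through next_param_t for an N0/N1 layout. The Dummy* types are hypothetical stand-ins, not the real node_extent_t / item_iterator_t / sub_items_t:

// Hypothetical stand-ins; only the STAGE chain is of interest here.
struct DummySubItems {};      // role of sub_items_t<>       -> STAGE_RIGHT
struct DummyItemIterator {};  // role of item_iterator_t<>   -> STAGE_STRING
struct DummyNodeExtent {};    // role of node_extent_t<...>  -> STAGE_LEFT

struct params_right {
  using container_t = DummySubItems;
  static constexpr int STAGE = 0;        // STAGE_RIGHT
  using next_param_t = params_right;     // dummy self-reference, as in the real code
};
struct params_string {
  using container_t = DummyItemIterator;
  static constexpr int STAGE = 1;        // STAGE_STRING
  using next_param_t = params_right;
};
struct params_left {
  using container_t = DummyNodeExtent;
  static constexpr int STAGE = 2;        // STAGE_LEFT
  using next_param_t = params_string;
};

// An N0/N1 layout enters at STAGE_LEFT and recurses toward STAGE_RIGHT:
static_assert(params_left::STAGE == 2);
static_assert(params_left::next_param_t::STAGE == 1);
static_assert(params_left::next_param_t::next_param_t::STAGE == 0);

int main() {}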
+ using next_param_t = staged_params_subitems<NODE_TYPE>; +}; + +template <node_type_t _NODE_TYPE> +struct staged_params_item_iterator { + using container_t = item_iterator_t<_NODE_TYPE>; + static constexpr auto NODE_TYPE = _NODE_TYPE; + static constexpr auto STAGE = STAGE_STRING; + + using next_param_t = staged_params_subitems<NODE_TYPE>; +}; + +template <typename NodeType> +struct staged_params_node_01 { + using container_t = NodeType; + static constexpr auto NODE_TYPE = NodeType::NODE_TYPE; + static constexpr auto STAGE = STAGE_LEFT; + + using next_param_t = staged_params_item_iterator<NODE_TYPE>; +}; + +template <typename NodeType> +struct staged_params_node_2 { + using container_t = NodeType; + static constexpr auto NODE_TYPE = NodeType::NODE_TYPE; + static constexpr auto STAGE = STAGE_STRING; + + using next_param_t = staged_params_subitems<NODE_TYPE>; +}; + +template <typename NodeType> +struct staged_params_node_3 { + using container_t = NodeType; + static constexpr auto NODE_TYPE = NodeType::NODE_TYPE; + static constexpr auto STAGE = STAGE_RIGHT; + + // dummy type in order to make our type system work + // any better solution to get rid of this? + using next_param_t = staged_params_node_3<NodeType>; +}; + +template <typename NodeType, typename Enable = void> struct _node_to_stage_t; +template <typename NodeType> +struct _node_to_stage_t<NodeType, + std::enable_if_t<NodeType::FIELD_TYPE == field_type_t::N0 || + NodeType::FIELD_TYPE == field_type_t::N1>> { + using type = staged<staged_params_node_01<NodeType>>; +}; +template <typename NodeType> +struct _node_to_stage_t<NodeType, + std::enable_if_t<NodeType::FIELD_TYPE == field_type_t::N2>> { + using type = staged<staged_params_node_2<NodeType>>; +}; +template <typename NodeType> +struct _node_to_stage_t<NodeType, + std::enable_if_t<NodeType::FIELD_TYPE == field_type_t::N3>> { + using type = staged<staged_params_node_3<NodeType>>; +}; +template <typename NodeType> +using node_to_stage_t = typename _node_to_stage_t<NodeType>::type; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h new file mode 100644 index 000000000..a9d5cef3b --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h @@ -0,0 +1,411 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <cassert> +#include <optional> +#include <ostream> + +#include "crimson/os/seastore/onode_manager/staged-fltree/fwd.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/tree_types.h" + +namespace crimson::os::seastore::onode { + +using match_stage_t = int8_t; +constexpr match_stage_t STAGE_LEFT = 2; // shard/pool/crush +constexpr match_stage_t STAGE_STRING = 1; // nspace/oid +constexpr match_stage_t STAGE_RIGHT = 0; // snap/gen +constexpr auto STAGE_TOP = STAGE_LEFT; +constexpr auto STAGE_BOTTOM = STAGE_RIGHT; +constexpr bool is_valid_stage(match_stage_t stage) { + return std::clamp(stage, STAGE_BOTTOM, STAGE_TOP) == stage; +} +// TODO: replace by +// using match_history_t = int8_t; +// left_m, str_m, right_m +// 3: GT, +// 2: EQ, GT, +// 1: EQ, EQ, GT +// 0: EQ, EQ, EQ +// -1: EQ, EQ, LT +// -2: EQ, LT, +// -3: LT, + +struct MatchHistory { + template <match_stage_t STAGE> + const std::optional<MatchKindCMP>& get() const { + static_assert(is_valid_stage(STAGE)); + if constexpr 
(STAGE == STAGE_RIGHT) { + return right_match; + } else if (STAGE == STAGE_STRING) { + return string_match; + } else { + return left_match; + } + } + + const std::optional<MatchKindCMP>& + get_by_stage(match_stage_t stage) const { + assert(is_valid_stage(stage)); + if (stage == STAGE_RIGHT) { + return right_match; + } else if (stage == STAGE_STRING) { + return string_match; + } else { + return left_match; + } + } + + template <match_stage_t STAGE = STAGE_TOP> + const bool is_GT() const; + + template <match_stage_t STAGE> + void set(MatchKindCMP match) { + static_assert(is_valid_stage(STAGE)); + if constexpr (STAGE < STAGE_TOP) { + assert(*get<STAGE + 1>() == MatchKindCMP::EQ); + } + assert(!get<STAGE>().has_value() || *get<STAGE>() != MatchKindCMP::EQ); + const_cast<std::optional<MatchKindCMP>&>(get<STAGE>()) = match; + } + + std::ostream& dump(std::ostream& os) const { + os << "history("; + dump_each(os, left_match) << ", "; + dump_each(os, string_match) << ", "; + dump_each(os, right_match) << ")"; + return os; + } + + std::ostream& dump_each( + std::ostream& os, const std::optional<MatchKindCMP>& match) const { + if (!match.has_value()) { + return os << "--"; + } else if (*match == MatchKindCMP::LT) { + return os << "LT"; + } else if (*match == MatchKindCMP::EQ) { + return os << "EQ"; + } else if (*match == MatchKindCMP::GT) { + return os << "GT"; + } else { + ceph_abort("impossble path"); + } + } + + std::optional<MatchKindCMP> left_match; + std::optional<MatchKindCMP> string_match; + std::optional<MatchKindCMP> right_match; +}; +inline std::ostream& operator<<(std::ostream& os, const MatchHistory& pos) { + return pos.dump(os); +} + +template <match_stage_t STAGE> +struct _check_GT_t { + static bool eval(const MatchHistory* history) { + return history->get<STAGE>() && + (*history->get<STAGE>() == MatchKindCMP::GT || + (*history->get<STAGE>() == MatchKindCMP::EQ && + _check_GT_t<STAGE - 1>::eval(history))); + } +}; +template <> +struct _check_GT_t<STAGE_RIGHT> { + static bool eval(const MatchHistory* history) { + return history->get<STAGE_RIGHT>() && + *history->get<STAGE_RIGHT>() == MatchKindCMP::GT; + } +}; +template <match_stage_t STAGE> +const bool MatchHistory::is_GT() const { + static_assert(is_valid_stage(STAGE)); + if constexpr (STAGE < STAGE_TOP) { + assert(get<STAGE + 1>() == MatchKindCMP::EQ); + } + return _check_GT_t<STAGE>::eval(this); +} + +template <match_stage_t STAGE> +struct staged_position_t { + static_assert(is_valid_stage(STAGE)); + using me_t = staged_position_t<STAGE>; + using nxt_t = staged_position_t<STAGE - 1>; + bool is_end() const { + if (index == INDEX_END) { + return true; + } else { + assert(is_valid_index(index)); + return false; + } + } + index_t& index_by_stage(match_stage_t stage) { + assert(stage <= STAGE); + if (STAGE == stage) { + return index; + } else { + return nxt.index_by_stage(stage); + } + } + + int cmp(const me_t& o) const { + if (index > o.index) { + return 1; + } else if (index < o.index) { + return -1; + } else { + return nxt.cmp(o.nxt); + } + } + bool operator>(const me_t& o) const { return cmp(o) > 0; } + bool operator>=(const me_t& o) const { return cmp(o) >= 0; } + bool operator<(const me_t& o) const { return cmp(o) < 0; } + bool operator<=(const me_t& o) const { return cmp(o) <= 0; } + bool operator==(const me_t& o) const { return cmp(o) == 0; } + bool operator!=(const me_t& o) const { return cmp(o) != 0; } + + me_t& operator-=(const me_t& o) { + assert(is_valid_index(o.index)); + assert(index >= o.index); + if (index != INDEX_END) { 
+ assert(is_valid_index(index)); + index -= o.index; + if (index == 0) { + nxt -= o.nxt; + } + } + return *this; + } + + void encode(ceph::bufferlist& encoded) const { + ceph::encode(index, encoded); + nxt.encode(encoded); + } + + static me_t decode(ceph::bufferlist::const_iterator& delta) { + me_t ret; + ceph::decode(ret.index, delta); + ret.nxt = nxt_t::decode(delta); + return ret; + } + + static me_t begin() { return {0u, nxt_t::begin()}; } + static me_t end() { + return {INDEX_END, nxt_t::end()}; + } + + index_t index; + nxt_t nxt; +}; +template <match_stage_t STAGE> +std::ostream& operator<<(std::ostream& os, const staged_position_t<STAGE>& pos) { + if (pos.index == INDEX_END) { + os << "END"; + } else if (pos.index == INDEX_LAST) { + os << "LAST"; + } else { + os << pos.index; + assert(is_valid_index(pos.index)); + } + return os << ", " << pos.nxt; +} + +template <> +struct staged_position_t<STAGE_BOTTOM> { + using me_t = staged_position_t<STAGE_BOTTOM>; + bool is_end() const { + if (index == INDEX_END) { + return true; + } else { + assert(is_valid_index(index)); + return false; + } + } + index_t& index_by_stage(match_stage_t stage) { + assert(stage == STAGE_BOTTOM); + return index; + } + + int cmp(const staged_position_t<STAGE_BOTTOM>& o) const { + if (index > o.index) { + return 1; + } else if (index < o.index) { + return -1; + } else { + return 0; + } + } + bool operator>(const me_t& o) const { return cmp(o) > 0; } + bool operator>=(const me_t& o) const { return cmp(o) >= 0; } + bool operator<(const me_t& o) const { return cmp(o) < 0; } + bool operator<=(const me_t& o) const { return cmp(o) <= 0; } + bool operator==(const me_t& o) const { return cmp(o) == 0; } + bool operator!=(const me_t& o) const { return cmp(o) != 0; } + + me_t& operator-=(const me_t& o) { + assert(is_valid_index(o.index)); + assert(index >= o.index); + if (index != INDEX_END) { + assert(is_valid_index(index)); + index -= o.index; + } + return *this; + } + + void encode(ceph::bufferlist& encoded) const { + ceph::encode(index, encoded); + } + + static me_t decode(ceph::bufferlist::const_iterator& delta) { + me_t ret; + ceph::decode(ret.index, delta); + return ret; + } + + static me_t begin() { return {0u}; } + static me_t end() { return {INDEX_END}; } + + index_t index; +}; +template <> +inline std::ostream& operator<<(std::ostream& os, const staged_position_t<STAGE_BOTTOM>& pos) { + if (pos.index == INDEX_END) { + os << "END"; + } else if (pos.index == INDEX_LAST) { + os << "LAST"; + } else { + os << pos.index; + assert(is_valid_index(pos.index)); + } + return os; +} + +using search_position_t = staged_position_t<STAGE_TOP>; + +template <match_stage_t STAGE> +const staged_position_t<STAGE>& cast_down(const search_position_t& pos) { + if constexpr (STAGE == STAGE_LEFT) { + return pos; + } else if constexpr (STAGE == STAGE_STRING) { +#ifndef NDEBUG + if (pos.is_end()) { + assert(pos.nxt.is_end()); + } else { + assert(pos.index == 0u); + } +#endif + return pos.nxt; + } else if constexpr (STAGE == STAGE_RIGHT) { +#ifndef NDEBUG + if (pos.is_end()) { + assert(pos.nxt.nxt.is_end()); + } else { + assert(pos.index == 0u); + assert(pos.nxt.index == 0u); + } +#endif + return pos.nxt.nxt; + } else { + ceph_abort("impossible path"); + } +} + +template <match_stage_t STAGE> +staged_position_t<STAGE>& cast_down(search_position_t& pos) { + const search_position_t& _pos = pos; + return const_cast<staged_position_t<STAGE>&>(cast_down<STAGE>(_pos)); +} + +template <match_stage_t STAGE> +staged_position_t<STAGE>& 
cast_down_fill_0(search_position_t& pos) { + if constexpr (STAGE == STAGE_LEFT) { + return pos; + } if constexpr (STAGE == STAGE_STRING) { + pos.index = 0; + return pos.nxt; + } else if constexpr (STAGE == STAGE_RIGHT) { + pos.index = 0; + pos.nxt.index = 0; + return pos.nxt.nxt; + } else { + ceph_abort("impossible path"); + } +} + +inline search_position_t&& normalize(search_position_t&& pos) { return std::move(pos); } + +template <match_stage_t STAGE, typename = std::enable_if_t<STAGE != STAGE_TOP>> +search_position_t normalize(staged_position_t<STAGE>&& pos) { + if (pos.is_end()) { + return search_position_t::end(); + } + if constexpr (STAGE == STAGE_STRING) { + return {0u, std::move(pos)}; + } else if (STAGE == STAGE_RIGHT) { + return {0u, {0u, std::move(pos)}}; + } else { + ceph_abort("impossible path"); + } +} + +struct memory_range_t { + const char* p_start; + const char* p_end; +}; + +enum class ContainerType { ITERATIVE, INDEXABLE }; + +template <node_type_t> struct value_type; +template<> struct value_type<node_type_t::INTERNAL> { using type = laddr_packed_t; }; +template<> struct value_type<node_type_t::LEAF> { using type = onode_t; }; +template <node_type_t NODE_TYPE> +using value_type_t = typename value_type<NODE_TYPE>::type; + +template <node_type_t NODE_TYPE, match_stage_t STAGE> +struct staged_result_t { + using me_t = staged_result_t<NODE_TYPE, STAGE>; + bool is_end() const { return position.is_end(); } + + static me_t end() { + return {staged_position_t<STAGE>::end(), nullptr, MSTAT_END}; + } + template <typename T = me_t> + static std::enable_if_t<STAGE != STAGE_BOTTOM, T> from_nxt( + index_t index, const staged_result_t<NODE_TYPE, STAGE - 1>& nxt_stage_result) { + return {{index, nxt_stage_result.position}, + nxt_stage_result.p_value, + nxt_stage_result.mstat}; + } + + staged_position_t<STAGE> position; + const value_type_t<NODE_TYPE>* p_value; + match_stat_t mstat; +}; + +template <node_type_t NODE_TYPE> +using lookup_result_t = staged_result_t<NODE_TYPE, STAGE_TOP>; + +template <node_type_t NODE_TYPE> +lookup_result_t<NODE_TYPE>&& normalize( + lookup_result_t<NODE_TYPE>&& result) { return std::move(result); } + +template <node_type_t NODE_TYPE, match_stage_t STAGE, + typename = std::enable_if_t<STAGE != STAGE_TOP>> +lookup_result_t<NODE_TYPE> normalize( + staged_result_t<NODE_TYPE, STAGE>&& result) { + // FIXME: assert result.mstat correct + return {normalize(std::move(result.position)), result.p_value, result.mstat}; +} + +struct node_stats_t { + size_t size_persistent = 0; + size_t size_filled = 0; + // filled by staged::get_stats() + size_t size_logical = 0; + size_t size_overhead = 0; + size_t size_value = 0; + unsigned num_kvs = 0; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc new file mode 100644 index 000000000..aaca6c3c6 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc @@ -0,0 +1,208 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "sub_items_stage.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h" + +namespace crimson::os::seastore::onode { + +template <KeyT KT> +const laddr_packed_t* internal_sub_items_t::insert_at( + NodeExtentMutable& mut, const internal_sub_items_t& sub_items, + const full_key_t<KT>& key, const laddr_packed_t& value, + index_t index, node_offset_t size, 
const char* p_left_bound) { + assert(index <= sub_items.keys()); + assert(size == estimate_insert<KT>(key, value)); + const char* p_shift_start = p_left_bound; + const char* p_shift_end = reinterpret_cast<const char*>( + sub_items.p_first_item + 1 - index); + mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, -(int)size); + + auto p_insert = const_cast<char*>(p_shift_end) - size; + auto item = internal_sub_item_t{snap_gen_t::from_key<KT>(key), value}; + mut.copy_in_absolute(p_insert, item); + return &reinterpret_cast<internal_sub_item_t*>(p_insert)->value; +} +#define IA_TEMPLATE(KT) \ + template const laddr_packed_t* internal_sub_items_t::insert_at<KT>( \ + NodeExtentMutable&, const internal_sub_items_t&, const full_key_t<KT>&, \ + const laddr_packed_t&, index_t, node_offset_t, const char*) +IA_TEMPLATE(KeyT::VIEW); +IA_TEMPLATE(KeyT::HOBJ); + +node_offset_t internal_sub_items_t::trim_until( + NodeExtentMutable&, internal_sub_items_t& items, index_t index) { + assert(index != 0); + auto keys = items.keys(); + assert(index <= keys); + size_t ret = sizeof(internal_sub_item_t) * (keys - index); + assert(ret < NODE_BLOCK_SIZE); + return ret; +} + +template <KeyT KT> +void internal_sub_items_t::Appender<KT>::append( + const internal_sub_items_t& src, index_t from, index_t items) { + assert(from <= src.keys()); + if (items == 0) { + return; + } + assert(from < src.keys()); + assert(from + items <= src.keys()); + node_offset_t size = sizeof(internal_sub_item_t) * items; + p_append -= size; + p_mut->copy_in_absolute(p_append, src.p_first_item + 1 - from - items, size); +} + +template <KeyT KT> +void internal_sub_items_t::Appender<KT>::append( + const full_key_t<KT>& key, const laddr_packed_t& value, + const laddr_packed_t*& p_value) { + p_append -= sizeof(internal_sub_item_t); + auto item = internal_sub_item_t{snap_gen_t::from_key<KT>(key), value}; + p_mut->copy_in_absolute(p_append, item); + p_value = &reinterpret_cast<internal_sub_item_t*>(p_append)->value; +} + +template <KeyT KT> +const onode_t* leaf_sub_items_t::insert_at( + NodeExtentMutable& mut, const leaf_sub_items_t& sub_items, + const full_key_t<KT>& key, const onode_t& value, + index_t index, node_offset_t size, const char* p_left_bound) { + assert(index <= sub_items.keys()); + assert(size == estimate_insert<KT>(key, value)); + // a. [... item(index)] << size + const char* p_shift_start = p_left_bound; + const char* p_shift_end = sub_items.get_item_end(index); + mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, -(int)size); + + // b. insert item + auto p_insert = const_cast<char*>(p_shift_end - size); + auto p_value = reinterpret_cast<const onode_t*>(p_insert); + mut.copy_in_absolute(p_insert, &value, value.size); + p_insert += value.size; + mut.copy_in_absolute(p_insert, snap_gen_t::template from_key<KT>(key)); + assert(p_insert + sizeof(snap_gen_t) + sizeof(node_offset_t) == p_shift_end); + + // c. compensate affected offsets + auto item_size = value.size + sizeof(snap_gen_t); + for (auto i = index; i < sub_items.keys(); ++i) { + const node_offset_packed_t& offset_i = sub_items.get_offset(i); + mut.copy_in_absolute((void*)&offset_i, node_offset_t(offset_i.value + item_size)); + } + + // d. [item(index-1) ... item(0) ... offset(index)] <<< sizeof(node_offset_t) + const char* p_offset = (index == 0 ? 
+ (const char*)&sub_items.get_offset(0) + sizeof(node_offset_t) : + (const char*)&sub_items.get_offset(index - 1)); + p_shift_start = p_shift_end; + p_shift_end = p_offset; + mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, -(int)sizeof(node_offset_t)); + + // e. insert offset + node_offset_t offset_to_item_start = item_size + sub_items.get_offset_to_end(index); + mut.copy_in_absolute( + const_cast<char*>(p_shift_end) - sizeof(node_offset_t), offset_to_item_start); + + // f. update num_sub_keys + mut.copy_in_absolute((void*)sub_items.p_num_keys, num_keys_t(sub_items.keys() + 1)); + + return p_value; +} +template const onode_t* leaf_sub_items_t::insert_at<KeyT::HOBJ>( + NodeExtentMutable&, const leaf_sub_items_t&, const full_key_t<KeyT::HOBJ>&, + const onode_t&, index_t, node_offset_t, const char*); + +node_offset_t leaf_sub_items_t::trim_until( + NodeExtentMutable& mut, leaf_sub_items_t& items, index_t index) { + assert(index != 0); + auto keys = items.keys(); + assert(index <= keys); + if (index == keys) { + return 0; + } + index_t trim_items = keys - index; + const char* p_items_start = items.p_start(); + const char* p_shift_start = items.get_item_end(index); + const char* p_shift_end = items.get_item_end(0); + size_t size_trim_offsets = sizeof(node_offset_t) * trim_items; + mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, + size_trim_offsets); + mut.copy_in_absolute((void*)items.p_num_keys, num_keys_t(index)); + size_t ret = size_trim_offsets + (p_shift_start - p_items_start); + assert(ret < NODE_BLOCK_SIZE); + return ret; +} + +template class internal_sub_items_t::Appender<KeyT::VIEW>; +template class internal_sub_items_t::Appender<KeyT::HOBJ>; + +// helper type for the visitor +template<class... Ts> struct overloaded : Ts... { using Ts::operator()...; }; +// explicit deduction guide +template<class... Ts> overloaded(Ts...) 
-> overloaded<Ts...>; + +template <KeyT KT> +char* leaf_sub_items_t::Appender<KT>::wrap() { + auto p_cur = p_append; + num_keys_t num_keys = 0; + for (auto i = 0u; i < cnt; ++i) { + auto& a = appends[i]; + std::visit(overloaded { + [&] (const range_items_t& arg) { num_keys += arg.items; }, + [&] (const kv_item_t& arg) { ++num_keys; } + }, a); + } + assert(num_keys); + p_cur -= sizeof(num_keys_t); + p_mut->copy_in_absolute(p_cur, num_keys); + + node_offset_t last_offset = 0; + for (auto i = 0u; i < cnt; ++i) { + auto& a = appends[i]; + std::visit(overloaded { + [&] (const range_items_t& arg) { + int compensate = (last_offset - op_src->get_offset_to_end(arg.from)); + node_offset_t offset; + for (auto i = arg.from; i < arg.from + arg.items; ++i) { + offset = op_src->get_offset(i).value + compensate; + p_cur -= sizeof(node_offset_t); + p_mut->copy_in_absolute(p_cur, offset); + } + last_offset = offset; + }, + [&] (const kv_item_t& arg) { + last_offset += sizeof(snap_gen_t) + arg.p_value->size; + p_cur -= sizeof(node_offset_t); + p_mut->copy_in_absolute(p_cur, last_offset); + } + }, a); + } + + for (auto i = 0u; i < cnt; ++i) { + auto& a = appends[i]; + std::visit(overloaded { + [&] (const range_items_t& arg) { + auto _p_start = op_src->get_item_end(arg.from + arg.items); + size_t _len = op_src->get_item_end(arg.from) - _p_start; + p_cur -= _len; + p_mut->copy_in_absolute(p_cur, _p_start, _len); + }, + [&] (const kv_item_t& arg) { + assert(pp_value); + p_cur -= sizeof(snap_gen_t); + p_mut->copy_in_absolute(p_cur, snap_gen_t::template from_key<KT>(*arg.p_key)); + p_cur -= arg.p_value->size; + p_mut->copy_in_absolute(p_cur, arg.p_value, arg.p_value->size); + *pp_value = reinterpret_cast<const onode_t*>(p_cur); + } + }, a); + } + return p_cur; +} + +template class leaf_sub_items_t::Appender<KeyT::VIEW>; +template class leaf_sub_items_t::Appender<KeyT::HOBJ>; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h new file mode 100644 index 000000000..8ef5f7472 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h @@ -0,0 +1,341 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <variant> + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h" +#include "key_layout.h" +#include "stage_types.h" + +namespace crimson::os::seastore::onode { + +class NodeExtentMutable; + +struct internal_sub_item_t { + const snap_gen_t& get_key() const { return key; } + const laddr_packed_t* get_p_value() const { return &value; } + + snap_gen_t key; + laddr_packed_t value; +} __attribute__((packed)); + +/** + * internal_sub_items_t + * + * The STAGE_RIGHT implementation for internal node N0/N1/N2, implements staged + * contract as an indexable container to index snap-gen to child node + * addresses. 
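The "indexable container" mentioned above is, roughly speaking, a compile-time contract: the staged algorithms walk such a container by index through a small set of members (keys(), operator[], size_before(), get_p_value(), ...). A hypothetical minimal container showing just the read-side of that shape, with plain integers standing in for snap_gen_t and laddr_packed_t:

#include <cassert>
#include <cstdint>

// Not part of this diff: the smallest read-side shape an INDEXABLE stage
// container exposes, with uint64_t standing in for the real key/value types.
struct TinyIndexableContainer {
  using key_get_type = const uint64_t&;         // real code: const snap_gen_t&
  uint64_t keys_[3] = {30, 20, 10};
  uint32_t keys() const { return 3; }
  key_get_type operator[](uint32_t index) const { return keys_[index]; }
  uint32_t size_before(uint32_t index) const {  // bytes occupied before item `index`
    return index * sizeof(uint64_t);
  }
};

int main() {
  TinyIndexableContainer c;
  assert(c.keys() == 3);
  assert(c[1] == 20);
  assert(c.size_before(2) == 16);
}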
+ * + * The layout of the contaner storing n sub-items: + * + * # <--------- container range -----------> # + * #<~># sub-items [2, n) # + * # # <- sub-item 1 -> # <- sub-item 0 -> # + * #...# snap-gen | laddr # snap-gen | laddr # + * ^ + * | + * p_first_item + + */ +class internal_sub_items_t { + public: + using num_keys_t = index_t; + + internal_sub_items_t(const memory_range_t& range) { + assert(range.p_start < range.p_end); + assert((range.p_end - range.p_start) % sizeof(internal_sub_item_t) == 0); + num_items = (range.p_end - range.p_start) / sizeof(internal_sub_item_t); + assert(num_items > 0); + auto _p_first_item = range.p_end - sizeof(internal_sub_item_t); + p_first_item = reinterpret_cast<const internal_sub_item_t*>(_p_first_item); + } + + // container type system + using key_get_type = const snap_gen_t&; + static constexpr auto CONTAINER_TYPE = ContainerType::INDEXABLE; + num_keys_t keys() const { return num_items; } + key_get_type operator[](index_t index) const { + assert(index < num_items); + return (p_first_item - index)->get_key(); + } + node_offset_t size_before(index_t index) const { + size_t ret = index * sizeof(internal_sub_item_t); + assert(ret < NODE_BLOCK_SIZE); + return ret; + } + const laddr_packed_t* get_p_value(index_t index) const { + assert(index < num_items); + return (p_first_item - index)->get_p_value(); + } + node_offset_t size_overhead_at(index_t index) const { return 0u; } + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + auto p_end = reinterpret_cast<const char*>(p_first_item) + + sizeof(internal_sub_item_t); + auto p_start = p_end - num_items * sizeof(internal_sub_item_t); + int start_offset = p_start - p_node_start; + int end_offset = p_end - p_node_start; + assert(start_offset > 0 && + start_offset < end_offset && + end_offset < NODE_BLOCK_SIZE); + ceph::encode(static_cast<node_offset_t>(start_offset), encoded); + ceph::encode(static_cast<node_offset_t>(end_offset), encoded); + } + + static internal_sub_items_t decode( + const char* p_node_start, ceph::bufferlist::const_iterator& delta) { + node_offset_t start_offset; + ceph::decode(start_offset, delta); + node_offset_t end_offset; + ceph::decode(end_offset, delta); + assert(start_offset < end_offset); + assert(end_offset <= NODE_BLOCK_SIZE); + return internal_sub_items_t({p_node_start + start_offset, + p_node_start + end_offset}); + } + + static node_offset_t header_size() { return 0u; } + + template <KeyT KT> + static node_offset_t estimate_insert( + const full_key_t<KT>&, const laddr_packed_t&) { + return sizeof(internal_sub_item_t); + } + + template <KeyT KT> + static const laddr_packed_t* insert_at( + NodeExtentMutable&, const internal_sub_items_t&, + const full_key_t<KT>&, const laddr_packed_t&, + index_t index, node_offset_t size, const char* p_left_bound); + + static node_offset_t trim_until(NodeExtentMutable&, internal_sub_items_t&, index_t); + + template <KeyT KT> + class Appender; + + private: + index_t num_items; + const internal_sub_item_t* p_first_item; +}; + +template <KeyT KT> +class internal_sub_items_t::Appender { + public: + Appender(NodeExtentMutable* p_mut, char* p_append) + : p_mut{p_mut}, p_append{p_append} {} + void append(const internal_sub_items_t& src, index_t from, index_t items); + void append(const full_key_t<KT>&, const laddr_packed_t&, const laddr_packed_t*&); + char* wrap() { return p_append; } + private: + NodeExtentMutable* p_mut; + char* p_append; +}; + +/** + * leaf_sub_items_t + * + * The STAGE_RIGHT implementation for leaf node 
N0/N1/N2, implements staged + * contract as an indexable container to index snap-gen to onode_t. + * + * The layout of the contaner storing n sub-items: + * + * # <------------------------ container range -------------------------------> # + * # <---------- sub-items ----------------> # <--- offsets ---------# # + * #<~># sub-items [2, n) #<~>| offsets [2, n) # # + * # # <- sub-item 1 -> # <- sub-item 0 -> # | # # + * #...# snap-gen | onode # snap-gen | onode #...| offset1 | offset0 # num_keys # + * ^ ^ ^ + * | | | + * p_items_end + p_offsets + | + * p_num_keys + + */ +class leaf_sub_items_t { + public: + // TODO: decide by NODE_BLOCK_SIZE, sizeof(snap_gen_t), + // and the minimal size of onode_t + using num_keys_t = uint8_t; + + leaf_sub_items_t(const memory_range_t& range) { + assert(range.p_start < range.p_end); + auto _p_num_keys = range.p_end - sizeof(num_keys_t); + assert(range.p_start < _p_num_keys); + p_num_keys = reinterpret_cast<const num_keys_t*>(_p_num_keys); + assert(keys()); + auto _p_offsets = _p_num_keys - sizeof(node_offset_t); + assert(range.p_start < _p_offsets); + p_offsets = reinterpret_cast<const node_offset_packed_t*>(_p_offsets); + p_items_end = reinterpret_cast<const char*>(&get_offset(keys() - 1)); + assert(range.p_start < p_items_end); + assert(range.p_start == p_start()); + } + + bool operator==(const leaf_sub_items_t& x) { + return (p_num_keys == x.p_num_keys && + p_offsets == x.p_offsets && + p_items_end == x.p_items_end); + } + + const char* p_start() const { return get_item_end(keys()); } + + const node_offset_packed_t& get_offset(index_t index) const { + assert(index < keys()); + return *(p_offsets - index); + } + + const node_offset_t get_offset_to_end(index_t index) const { + assert(index <= keys()); + return index == 0 ? 
0 : get_offset(index - 1).value; + } + + const char* get_item_start(index_t index) const { + return p_items_end - get_offset(index).value; + } + + const char* get_item_end(index_t index) const { + return p_items_end - get_offset_to_end(index); + } + + // container type system + using key_get_type = const snap_gen_t&; + static constexpr auto CONTAINER_TYPE = ContainerType::INDEXABLE; + num_keys_t keys() const { return *p_num_keys; } + key_get_type operator[](index_t index) const { + assert(index < keys()); + auto pointer = get_item_end(index); + assert(get_item_start(index) < pointer); + pointer -= sizeof(snap_gen_t); + assert(get_item_start(index) < pointer); + return *reinterpret_cast<const snap_gen_t*>(pointer); + } + node_offset_t size_before(index_t index) const { + assert(index <= keys()); + size_t ret; + if (index == 0) { + ret = sizeof(num_keys_t); + } else { + --index; + ret = sizeof(num_keys_t) + + (index + 1) * sizeof(node_offset_t) + + get_offset(index).value; + } + assert(ret < NODE_BLOCK_SIZE); + return ret; + } + node_offset_t size_overhead_at(index_t index) const { return sizeof(node_offset_t); } + const onode_t* get_p_value(index_t index) const { + assert(index < keys()); + auto pointer = get_item_start(index); + auto value = reinterpret_cast<const onode_t*>(pointer); + assert(pointer + value->size + sizeof(snap_gen_t) == get_item_end(index)); + return value; + } + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + auto p_end = reinterpret_cast<const char*>(p_num_keys) + + sizeof(num_keys_t); + int start_offset = p_start() - p_node_start; + int end_offset = p_end - p_node_start; + assert(start_offset > 0 && + start_offset < end_offset && + end_offset < NODE_BLOCK_SIZE); + ceph::encode(static_cast<node_offset_t>(start_offset), encoded); + ceph::encode(static_cast<node_offset_t>(end_offset), encoded); + } + + static leaf_sub_items_t decode( + const char* p_node_start, ceph::bufferlist::const_iterator& delta) { + node_offset_t start_offset; + ceph::decode(start_offset, delta); + node_offset_t end_offset; + ceph::decode(end_offset, delta); + assert(start_offset < end_offset); + assert(end_offset <= NODE_BLOCK_SIZE); + return leaf_sub_items_t({p_node_start + start_offset, + p_node_start + end_offset}); + } + + static node_offset_t header_size() { return sizeof(num_keys_t); } + + template <KeyT KT> + static node_offset_t estimate_insert(const full_key_t<KT>&, const onode_t& value) { + return value.size + sizeof(snap_gen_t) + sizeof(node_offset_t); + } + + template <KeyT KT> + static const onode_t* insert_at( + NodeExtentMutable&, const leaf_sub_items_t&, + const full_key_t<KT>&, const onode_t&, + index_t index, node_offset_t size, const char* p_left_bound); + + static node_offset_t trim_until(NodeExtentMutable&, leaf_sub_items_t&, index_t index); + + template <KeyT KT> + class Appender; + + private: + // TODO: support unaligned access + const num_keys_t* p_num_keys; + const node_offset_packed_t* p_offsets; + const char* p_items_end; +}; + +constexpr index_t APPENDER_LIMIT = 3u; + +template <KeyT KT> +class leaf_sub_items_t::Appender { + struct range_items_t { + index_t from; + index_t items; + }; + struct kv_item_t { + const full_key_t<KT>* p_key; + const onode_t* p_value; + }; + using var_t = std::variant<range_items_t, kv_item_t>; + + public: + Appender(NodeExtentMutable* p_mut, char* p_append) + : p_mut{p_mut}, p_append{p_append} { + } + + void append(const leaf_sub_items_t& src, index_t from, index_t items) { + assert(cnt <= APPENDER_LIMIT); + 
assert(from <= src.keys()); + if (items == 0) { + return; + } + if (op_src) { + assert(*op_src == src); + } else { + op_src = src; + } + assert(from < src.keys()); + assert(from + items <= src.keys()); + appends[cnt] = range_items_t{from, items}; + ++cnt; + } + void append(const full_key_t<KT>& key, + const onode_t& value, const onode_t*& p_value) { + assert(pp_value == nullptr); + assert(cnt <= APPENDER_LIMIT); + appends[cnt] = kv_item_t{&key, &value}; + ++cnt; + pp_value = &p_value; + } + char* wrap(); + + private: + std::optional<leaf_sub_items_t> op_src; + const onode_t** pp_value = nullptr; + NodeExtentMutable* p_mut; + char* p_append; + var_t appends[APPENDER_LIMIT]; + index_t cnt = 0; +}; + +template <node_type_t> struct _sub_items_t; +template<> struct _sub_items_t<node_type_t::INTERNAL> { using type = internal_sub_items_t; }; +template<> struct _sub_items_t<node_type_t::LEAF> { using type = leaf_sub_items_t; }; +template <node_type_t NODE_TYPE> +using sub_items_t = typename _sub_items_t<NODE_TYPE>::type; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/super.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/super.cc new file mode 100644 index 000000000..5a28f5097 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/super.cc @@ -0,0 +1,26 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "super.h" +#include "node.h" + +namespace crimson::os::seastore::onode { + +Ref<Node> RootNodeTrackerIsolated::get_root(Transaction& t) const { + auto iter = tracked_supers.find(&t); + if (iter == tracked_supers.end()) { + return nullptr; + } else { + return iter->second->get_p_root(); + } +} + +Ref<Node> RootNodeTrackerShared::get_root(Transaction&) const { + if (is_clean()) { + return nullptr; + } else { + return tracked_super->get_p_root(); + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/super.h b/src/crimson/os/seastore/onode_manager/staged-fltree/super.h new file mode 100644 index 000000000..5eefee9ff --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/super.h @@ -0,0 +1,143 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <memory> + +#include "crimson/common/type_helpers.h" + +#include "fwd.h" + +namespace crimson::os::seastore::onode { + +class Node; +class Super; + +/** + * RootNodeTracker + * + * An abstracted tracker to get the root node by Transaction. + */ +class RootNodeTracker { + public: + virtual ~RootNodeTracker() = default; + virtual bool is_clean() const = 0; + virtual Ref<Node> get_root(Transaction&) const = 0; + static RootNodeTrackerURef create(bool read_isolated); + protected: + RootNodeTracker() = default; + RootNodeTracker(const RootNodeTracker&) = delete; + RootNodeTracker(RootNodeTracker&&) = delete; + RootNodeTracker& operator=(const RootNodeTracker&) = delete; + RootNodeTracker& operator=(RootNodeTracker&&) = delete; + virtual void do_track_super(Transaction&, Super&) = 0; + virtual void do_untrack_super(Transaction&, Super&) = 0; + friend class Super; +}; + +/** + * Super + * + * The parent of root node. It contains the relationship between a Transaction + * and a root node address. 
+ */ +class Super { + public: + using URef = std::unique_ptr<Super>; + Super(const Super&) = delete; + Super(Super&&) = delete; + Super& operator=(const Super&) = delete; + Super& operator=(Super&&) = delete; + virtual ~Super() { + assert(tracked_root_node == nullptr); + tracker.do_untrack_super(t, *this); + } + + virtual laddr_t get_root_laddr() const = 0; + virtual void write_root_laddr(context_t, laddr_t) = 0; + + void do_track_root(Node& root) { + assert(tracked_root_node == nullptr); + tracked_root_node = &root; + } + void do_untrack_root(Node& root) { + assert(tracked_root_node == &root); + tracked_root_node = nullptr; + } + Node* get_p_root() const { + assert(tracked_root_node != nullptr); + return tracked_root_node; + } + + protected: + Super(Transaction& t, RootNodeTracker& tracker) + : t{t}, tracker{tracker} { + tracker.do_track_super(t, *this); + } + + private: + Transaction& t; + RootNodeTracker& tracker; + Node* tracked_root_node = nullptr; +}; + +/** + * RootNodeTrackerIsolated + * + * A concrete RootNodeTracker implementation which provides root node isolation + * between Transactions for Seastore backend. + */ +class RootNodeTrackerIsolated final : public RootNodeTracker { + public: + ~RootNodeTrackerIsolated() override { assert(is_clean()); } + protected: + bool is_clean() const override { + return tracked_supers.empty(); + } + void do_track_super(Transaction& t, Super& super) override { + assert(tracked_supers.find(&t) == tracked_supers.end()); + tracked_supers[&t] = &super; + } + void do_untrack_super(Transaction& t, Super& super) override { + [[maybe_unused]] auto removed = tracked_supers.erase(&t); + assert(removed); + } + ::Ref<Node> get_root(Transaction& t) const override; + std::map<Transaction*, Super*> tracked_supers; +}; + +/** + * RootNodeTrackerShared + * + * A concrete RootNodeTracker implementation which has no isolation between + * Transactions for Dummy backend. 
+ */ +class RootNodeTrackerShared final : public RootNodeTracker { + public: + ~RootNodeTrackerShared() override { assert(is_clean()); } + protected: + bool is_clean() const override { + return tracked_super == nullptr; + } + void do_track_super(Transaction&, Super& super) override { + assert(is_clean()); + tracked_super = &super; + } + void do_untrack_super(Transaction&, Super& super) override { + assert(tracked_super == &super); + tracked_super = nullptr; + } + ::Ref<Node> get_root(Transaction&) const override; + Super* tracked_super = nullptr; +}; + +inline RootNodeTrackerURef RootNodeTracker::create(bool read_isolated) { + if (read_isolated) { + return RootNodeTrackerURef(new RootNodeTrackerIsolated()); + } else { + return RootNodeTrackerURef(new RootNodeTrackerShared()); + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/tree.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/tree.cc new file mode 100644 index 000000000..2c8c21652 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/tree.cc @@ -0,0 +1,235 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "tree.h" + +#include "node.h" +#include "node_extent_manager.h" +#include "stages/key_layout.h" +#include "super.h" + +namespace crimson::os::seastore::onode { + +using btree_ertr = Btree::btree_ertr; +template <class ValueT=void> +using btree_future = Btree::btree_future<ValueT>; +using Cursor = Btree::Cursor; + +Cursor::Cursor(Btree* p_tree, Ref<tree_cursor_t> _p_cursor) + : p_tree(p_tree) { + if (_p_cursor->is_end()) { + // no need to hold the leaf node + } else { + p_cursor = _p_cursor; + } +} +Cursor::Cursor(Btree* p_tree) : p_tree{p_tree} {} +Cursor::Cursor(const Cursor&) = default; +Cursor::Cursor(Cursor&&) noexcept = default; +Cursor& Cursor::operator=(const Cursor&) = default; +Cursor& Cursor::operator=(Cursor&&) = default; +Cursor::~Cursor() = default; + +bool Cursor::is_end() const { + if (p_cursor) { + assert(!p_cursor->is_end()); + return false; + } else { + return true; + } +} + +ghobject_t Cursor::get_ghobj() const { + return p_cursor->get_key_view().to_ghobj(); +} + +const onode_t* Cursor::value() const { + return p_cursor->get_p_value(); +} + +bool Cursor::operator==(const Cursor& x) const { + return p_cursor == x.p_cursor; +} + +Cursor& Cursor::operator++() { + // TODO + return *this; +} + +Cursor Cursor::operator++(int) { + Cursor tmp = *this; + ++*this; + return tmp; +} + +Cursor Cursor::make_end(Btree* p_tree) { + return {p_tree}; +} + +Btree::Btree(NodeExtentManagerURef&& _nm) + : nm{std::move(_nm)}, + root_tracker{RootNodeTracker::create(nm->is_read_isolated())} {} + +Btree::~Btree() { assert(root_tracker->is_clean()); } + +btree_future<> Btree::mkfs(Transaction& t) { + return Node::mkfs(get_context(t), *root_tracker); +} + +btree_future<Cursor> Btree::begin(Transaction& t) { + return get_root(t).safe_then([this, &t](auto root) { + return root->lookup_smallest(get_context(t)); + }).safe_then([this](auto cursor) { + return Cursor{this, cursor}; + }); +} + +btree_future<Cursor> Btree::last(Transaction& t) { + return get_root(t).safe_then([this, &t](auto root) { + return root->lookup_largest(get_context(t)); + }).safe_then([this](auto cursor) { + return Cursor(this, cursor); + }); +} + +Cursor Btree::end() { + return Cursor::make_end(this); +} + +btree_future<bool> +Btree::contains(Transaction& t, const ghobject_t& obj) { + return seastar::do_with( + full_key_t<KeyT::HOBJ>(obj), + [this, &t](auto& 
key) -> btree_future<bool> { + return get_root(t).safe_then([this, &t, &key](auto root) { + // TODO: improve lower_bound() + return root->lower_bound(get_context(t), key); + }).safe_then([](auto result) { + return MatchKindBS::EQ == result.match(); + }); + } + ); +} + +btree_future<Cursor> +Btree::find(Transaction& t, const ghobject_t& obj) { + return seastar::do_with( + full_key_t<KeyT::HOBJ>(obj), + [this, &t](auto& key) -> btree_future<Cursor> { + return get_root(t).safe_then([this, &t, &key](auto root) { + // TODO: improve lower_bound() + return root->lower_bound(get_context(t), key); + }).safe_then([this](auto result) { + if (result.match() == MatchKindBS::EQ) { + return Cursor(this, result.p_cursor); + } else { + return Cursor::make_end(this); + } + }); + } + ); +} + +btree_future<Cursor> +Btree::lower_bound(Transaction& t, const ghobject_t& obj) { + return seastar::do_with( + full_key_t<KeyT::HOBJ>(obj), + [this, &t](auto& key) -> btree_future<Cursor> { + return get_root(t).safe_then([this, &t, &key](auto root) { + return root->lower_bound(get_context(t), key); + }).safe_then([this](auto result) { + return Cursor(this, result.p_cursor); + }); + } + ); +} + +btree_future<std::pair<Cursor, bool>> +Btree::insert(Transaction& t, const ghobject_t& obj, const onode_t& value) { + return seastar::do_with( + full_key_t<KeyT::HOBJ>(obj), + [this, &t, &value](auto& key) -> btree_future<std::pair<Cursor, bool>> { + return get_root(t).safe_then([this, &t, &key, &value](auto root) { + return root->insert(get_context(t), key, value); + }).safe_then([this](auto ret) { + auto& [cursor, success] = ret; + return std::make_pair(Cursor(this, cursor), success); + }); + } + ); +} + +btree_future<size_t> Btree::erase(Transaction& t, const ghobject_t& obj) { + // TODO + return btree_ertr::make_ready_future<size_t>(0u); +} + +btree_future<Cursor> Btree::erase(Cursor& pos) { + // TODO + return btree_ertr::make_ready_future<Cursor>( + Cursor::make_end(this)); +} + +btree_future<Cursor> +Btree::erase(Cursor& first, Cursor& last) { + // TODO + return btree_ertr::make_ready_future<Cursor>( + Cursor::make_end(this)); +} + +btree_future<size_t> Btree::height(Transaction& t) { + return get_root(t).safe_then([](auto root) { + return size_t(root->level() + 1); + }); +} + +btree_future<tree_stats_t> Btree::get_stats_slow(Transaction& t) { + return get_root(t).safe_then([this, &t](auto root) { + unsigned height = root->level() + 1; + return root->get_tree_stats(get_context(t) + ).safe_then([height](auto stats) { + stats.height = height; + return btree_ertr::make_ready_future<tree_stats_t>(stats); + }); + }); +} + +std::ostream& Btree::dump(Transaction& t, std::ostream& os) { + auto root = root_tracker->get_root(t); + if (root) { + root->dump(os); + } else { + os << "empty tree!"; + } + return os; +} + +std::ostream& Btree::print(std::ostream& os) const { + return os << "BTree-" << *nm; +} + +btree_future<Ref<Node>> Btree::get_root(Transaction& t) { + auto root = root_tracker->get_root(t); + if (root) { + return btree_ertr::make_ready_future<Ref<Node>>(root); + } else { + return Node::load_root(get_context(t), *root_tracker); + } +} + +bool Btree::test_is_clean() const { + return root_tracker->is_clean(); +} + +btree_future<> Btree::test_clone_from( + Transaction& t, Transaction& t_from, Btree& from) { + // Note: assume the tree to clone is tracked correctly in memory. + // In some unit tests, parts of the tree are stubbed out that they + // should not be loaded from NodeExtentManager. 
+ return from.get_root(t_from + ).safe_then([this, &t](auto root_from) { + return root_from->test_clone_root(get_context(t), *root_tracker); + }); +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/tree.h b/src/crimson/os/seastore/onode_manager/staged-fltree/tree.h new file mode 100644 index 000000000..7ee618cb3 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/tree.h @@ -0,0 +1,119 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <ostream> + +#include "common/hobject.h" +#include "crimson/common/type_helpers.h" + +#include "fwd.h" +#include "tree_types.h" + +/** + * tree.h + * + * An example implementation to expose tree interfaces to users. The current + * interface design is based on: + * - ceph::os::Transaction::create/touch/remove() + * - ceph::ObjectStore::collection_list() + * - ceph::BlueStore::get_onode() + * - db->get_iterator(PREFIIX_OBJ) by ceph::BlueStore::fsck() + * + * TODO: Redesign the interfaces based on real onode manager requirements. + */ + +namespace crimson::os::seastore::onode { + +class Node; +class Btree { + public: + using btree_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::enoent, + crimson::ct_error::erange>; + template <class ValueT=void> + using btree_future = btree_ertr::future<ValueT>; + + Btree(NodeExtentManagerURef&& nm); + Btree(const Btree&) = delete; + Btree(Btree&&) = delete; + Btree& operator=(const Btree&) = delete; + Btree& operator=(Btree&&) = delete; + ~Btree(); + + btree_future<> mkfs(Transaction&); + + class Cursor; + // lookup + btree_future<Cursor> begin(Transaction&); + btree_future<Cursor> last(Transaction&); + Cursor end(); + btree_future<bool> contains(Transaction&, const ghobject_t&); + btree_future<Cursor> find(Transaction&, const ghobject_t&); + btree_future<Cursor> lower_bound(Transaction&, const ghobject_t&); + + // modifiers + // TODO: replace onode_t + btree_future<std::pair<Cursor, bool>> + insert(Transaction&, const ghobject_t&, const onode_t&); + btree_future<size_t> erase(Transaction&, const ghobject_t& key); + btree_future<Cursor> erase(Cursor& pos); + btree_future<Cursor> erase(Cursor& first, Cursor& last); + + // stats + btree_future<size_t> height(Transaction&); + btree_future<tree_stats_t> get_stats_slow(Transaction&); + std::ostream& dump(Transaction&, std::ostream&); + std::ostream& print(std::ostream& os) const; + + // test_only + bool test_is_clean() const; + btree_future<> test_clone_from(Transaction& t, Transaction& t_from, Btree& from); + + private: + context_t get_context(Transaction& t) { return {*nm, t}; } + btree_future<Ref<Node>> get_root(Transaction& t); + + NodeExtentManagerURef nm; + RootNodeTrackerURef root_tracker; + + friend class DummyChildPool; +}; +inline std::ostream& operator<<(std::ostream& os, const Btree& tree) { + return tree.print(os); +} + +class tree_cursor_t; +class Btree::Cursor { + public: + Cursor(const Cursor&); + Cursor(Cursor&&) noexcept; + Cursor& operator=(const Cursor&); + Cursor& operator=(Cursor&&); + ~Cursor(); + + bool is_end() const; + // XXX: return key_view_t to avoid unecessary ghobject_t constructions + ghobject_t get_ghobj() const; + const onode_t* value() const; + bool operator==(const Cursor& x) const; + bool operator!=(const Cursor& x) const { return !(*this == x); } + Cursor& operator++(); + Cursor operator++(int); + + private: + Cursor(Btree*, Ref<tree_cursor_t>); + 
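Since tree.h describes itself as an example of how the tree interfaces are meant to be consumed, a short usage sketch of the Btree/Cursor API declared above may help. It is illustrative only: obtaining the Transaction and the ghobject_t/onode_t inputs is outside this diff, and insert_and_lookup() is a hypothetical helper, not an existing function.

#include "crimson/os/seastore/onode_manager/staged-fltree/tree.h"

namespace crimson::os::seastore::onode {

// Hypothetical helper: insert a key, then look it back up via the futurized API.
// NOTE: as in the implementations above, the caller must keep t/obj/value alive
// for the duration of the returned future (e.g. via seastar::do_with).
Btree::btree_future<> insert_and_lookup(Btree& tree,
                                        Transaction& t,   // obtained elsewhere
                                        const ghobject_t& obj,
                                        const onode_t& value) {
  return tree.insert(t, obj, value
  ).safe_then([&tree, &t, &obj](auto ret) {
    ceph_assert(ret.second);          // ret = {Cursor, bool}; insert succeeded
    return tree.find(t, obj);         // and the key is immediately visible
  }).safe_then([](Btree::Cursor cursor) {
    ceph_assert(!cursor.is_end());
    ceph_assert(cursor.value() != nullptr);
  });
}

}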
Cursor(Btree*); + + static Cursor make_end(Btree*); + + Btree* p_tree; + Ref<tree_cursor_t> p_cursor; + + friend class Btree; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/tree_types.h b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_types.h new file mode 100644 index 000000000..0bb345e0a --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_types.h @@ -0,0 +1,125 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <ostream> + +namespace crimson::os::seastore::onode { + +// TODO: Redesign according to real requirement from onode manager +struct onode_t { + // onode should be smaller than a node + uint16_t size; // address up to 64 KiB sized node + uint16_t id; + // omap, extent_map, inline data + + bool operator==(const onode_t& o) const { return size == o.size && id == o.id; } + bool operator!=(const onode_t& o) const { return !(*this == o); } + + void encode(ceph::bufferlist& encoded) const { + ceph::encode(size, encoded); + ceph::encode(id, encoded); + } + static onode_t decode(ceph::bufferlist::const_iterator& delta) { + uint16_t size; + ceph::decode(size, delta); + uint16_t id; + ceph::decode(id, delta); + onode_t ret{size, id}; + return ret; + } + static void validate_tail_magic(const onode_t& onode) { + auto p_target = (const char*)&onode + onode.size - sizeof(uint32_t); + uint32_t target; + std::memcpy(&target, p_target, sizeof(uint32_t)); + ceph_assert(target == onode.size * 137); + } + static std::unique_ptr<char[]> allocate(const onode_t& config) { + ceph_assert(config.size >= sizeof(onode_t) + sizeof(uint32_t)); + + auto ret = std::make_unique<char[]>(config.size); + char* p_mem = ret.get(); + auto p_onode = reinterpret_cast<onode_t*>(p_mem); + *p_onode = config; + + uint32_t tail_magic = config.size * 137; + p_mem += (config.size - sizeof(uint32_t)); + std::memcpy(p_mem, &tail_magic, sizeof(uint32_t)); + validate_tail_magic(*p_onode); + + return ret; + } +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const onode_t& node) { + return os << "onode(" << node.id << ", " << node.size << "B)"; +} + +struct tree_stats_t { + size_t size_persistent_leaf = 0; + size_t size_persistent_internal = 0; + size_t size_filled_leaf = 0; + size_t size_filled_internal = 0; + size_t size_logical_leaf = 0; + size_t size_logical_internal = 0; + size_t size_overhead_leaf = 0; + size_t size_overhead_internal = 0; + size_t size_value_leaf = 0; + size_t size_value_internal = 0; + unsigned num_kvs_leaf = 0; + unsigned num_kvs_internal = 0; + unsigned num_nodes_leaf = 0; + unsigned num_nodes_internal = 0; + unsigned height = 0; + + size_t size_persistent() const { + return size_persistent_leaf + size_persistent_internal; } + size_t size_filled() const { + return size_filled_leaf + size_filled_internal; } + size_t size_logical() const { + return size_logical_leaf + size_logical_internal; } + size_t size_overhead() const { + return size_overhead_leaf + size_overhead_internal; } + size_t size_value() const { + return size_value_leaf + size_value_internal; } + unsigned num_kvs() const { + return num_kvs_leaf + num_kvs_internal; } + unsigned num_nodes() const { + return num_nodes_leaf + num_nodes_internal; } + + double ratio_fullness() const { + return (double)size_filled() / size_persistent(); } + double ratio_key_compression() const { + return (double)(size_filled() - size_value()) / (size_logical() - size_value()); } + double 
ratio_overhead() const { + return (double)size_overhead() / size_filled(); } + double ratio_keys_leaf() const { + return (double)num_kvs_leaf / num_kvs(); } + double ratio_nodes_leaf() const { + return (double)num_nodes_leaf / num_nodes(); } + double ratio_filled_leaf() const { + return (double)size_filled_leaf / size_filled(); } +}; +inline std::ostream& operator<<(std::ostream& os, const tree_stats_t& stats) { + os << "Tree stats:" + << "\n height = " << stats.height + << "\n num values = " << stats.num_kvs_leaf + << "\n num nodes = " << stats.num_nodes() + << " (leaf=" << stats.num_nodes_leaf + << ", internal=" << stats.num_nodes_internal << ")" + << "\n size persistent = " << stats.size_persistent() << "B" + << "\n size filled = " << stats.size_filled() << "B" + << " (value=" << stats.size_value_leaf << "B" + << ", rest=" << stats.size_filled() - stats.size_value_leaf << "B)" + << "\n size logical = " << stats.size_logical() << "B" + << "\n size overhead = " << stats.size_overhead() << "B" + << "\n ratio fullness = " << stats.ratio_fullness() + << "\n ratio keys leaf = " << stats.ratio_keys_leaf() + << "\n ratio nodes leaf = " << stats.ratio_nodes_leaf() + << "\n ratio filled leaf = " << stats.ratio_filled_leaf() + << "\n ratio key compression = " << stats.ratio_key_compression(); + assert(stats.num_kvs_internal + 1 == stats.num_nodes()); + return os; +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h new file mode 100644 index 000000000..536052003 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h @@ -0,0 +1,333 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <cassert> +#include <cstring> +#include <random> +#include <string> +#include <sstream> +#include <utility> +#include <vector> + +#include "crimson/common/log.h" +#include "stages/key_layout.h" +#include "tree.h" + +/** + * tree_utils.h + * + * Contains shared logic for unit tests and perf tool. 
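+ *
+ * The main helpers defined below are:
+ * - Onodes: allocates dummy onode_t buffers of the requested sizes (each
+ *   carrying a tail magic checked by validate_cursor()) and hands out
+ *   random picks;
+ * - KVPool: generates ghobject_t/onode pairs over three configurable index
+ *   ranges (mapped to shard/pool/hash, ns/oid and snap/gen), kept both in
+ *   generation order and as a shuffled copy;
+ * - TreeBuilder: drives mkfs/insert against a Btree and re-checks every
+ *   inserted value through lower_bound().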
+ */ + +namespace crimson::os::seastore::onode { + +class Onodes { + public: + Onodes(size_t n) { + for (size_t i = 1; i <= n; ++i) { + auto p_onode = &create(i * 8); + onodes.push_back(p_onode); + } + } + + Onodes(std::vector<size_t> sizes) { + for (auto& size : sizes) { + auto p_onode = &create(size); + onodes.push_back(p_onode); + } + } + + ~Onodes() = default; + + const onode_t& create(size_t size) { + ceph_assert(size <= std::numeric_limits<uint16_t>::max()); + onode_t config{static_cast<uint16_t>(size), id++}; + auto onode = onode_t::allocate(config); + auto p_onode = onode.get(); + tracked_onodes.push_back(std::move(onode)); + return *reinterpret_cast<onode_t*>(p_onode); + } + + const onode_t& pick() const { + auto index = rd() % onodes.size(); + return *onodes[index]; + } + + const onode_t& pick_largest() const { + return *onodes[onodes.size() - 1]; + } + + static void validate_cursor( + const Btree::Cursor& cursor, const ghobject_t& key, const onode_t& onode) { + ceph_assert(!cursor.is_end()); + ceph_assert(cursor.get_ghobj() == key); + ceph_assert(cursor.value()); + ceph_assert(cursor.value() != &onode); + ceph_assert(*cursor.value() == onode); + onode_t::validate_tail_magic(*cursor.value()); + } + + private: + uint16_t id = 0; + mutable std::random_device rd; + std::vector<const onode_t*> onodes; + std::vector<std::unique_ptr<char[]>> tracked_onodes; +}; + +class KVPool { + struct kv_conf_t { + unsigned index2; + unsigned index1; + unsigned index0; + size_t ns_size; + size_t oid_size; + const onode_t* p_value; + + ghobject_t get_ghobj() const { + assert(index1 < 10); + std::ostringstream os_ns; + os_ns << "ns" << index1; + unsigned current_size = (unsigned)os_ns.tellp(); + assert(ns_size >= current_size); + os_ns << std::string(ns_size - current_size, '_'); + + std::ostringstream os_oid; + os_oid << "oid" << index1; + current_size = (unsigned)os_oid.tellp(); + assert(oid_size >= current_size); + os_oid << std::string(oid_size - current_size, '_'); + + return ghobject_t(shard_id_t(index2), index2, index2, + os_ns.str(), os_oid.str(), index0, index0); + } + }; + using kv_vector_t = std::vector<kv_conf_t>; + + public: + using kv_t = std::pair<ghobject_t, const onode_t*>; + + KVPool(const std::vector<size_t>& str_sizes, + const std::vector<size_t>& onode_sizes, + const std::pair<unsigned, unsigned>& range2, + const std::pair<unsigned, unsigned>& range1, + const std::pair<unsigned, unsigned>& range0) + : str_sizes{str_sizes}, onodes{onode_sizes} { + ceph_assert(range2.first < range2.second); + ceph_assert(range2.second - 1 <= (unsigned)std::numeric_limits<shard_t>::max()); + ceph_assert(range2.second - 1 <= std::numeric_limits<crush_hash_t>::max()); + ceph_assert(range1.first < range1.second); + ceph_assert(range1.second - 1 <= 9); + ceph_assert(range0.first < range0.second); + std::random_device rd; + for (unsigned i = range2.first; i < range2.second; ++i) { + for (unsigned j = range1.first; j < range1.second; ++j) { + auto ns_size = (unsigned)str_sizes[rd() % str_sizes.size()]; + auto oid_size = (unsigned)str_sizes[rd() % str_sizes.size()]; + for (unsigned k = range0.first; k < range0.second; ++k) { + kvs.emplace_back(kv_conf_t{i, j, k, ns_size, oid_size, &onodes.pick()}); + } + } + } + random_kvs = kvs; + std::random_shuffle(random_kvs.begin(), random_kvs.end()); + } + + class iterator_t { + public: + iterator_t() = default; + iterator_t(const iterator_t&) = default; + iterator_t(iterator_t&&) = default; + iterator_t& operator=(const iterator_t&) = default; + iterator_t& 
operator=(iterator_t&&) = default; + + kv_t get_kv() const { + assert(!is_end()); + auto& conf = (*p_kvs)[i]; + return std::make_pair(conf.get_ghobj(), conf.p_value); + } + bool is_end() const { return !p_kvs || i >= p_kvs->size(); } + size_t index() const { return i; } + + iterator_t& operator++() { + assert(!is_end()); + ++i; + return *this; + } + + iterator_t operator++(int) { + iterator_t tmp = *this; + ++*this; + return tmp; + } + + private: + iterator_t(const kv_vector_t& kvs) : p_kvs{&kvs} {} + + const kv_vector_t* p_kvs = nullptr; + size_t i = 0; + friend class KVPool; + }; + + iterator_t begin() const { + return iterator_t(kvs); + } + + iterator_t random_begin() const { + return iterator_t(random_kvs); + } + + size_t size() const { + return kvs.size(); + } + + private: + std::vector<size_t> str_sizes; + Onodes onodes; + kv_vector_t kvs; + kv_vector_t random_kvs; +}; + +template <bool TRACK> +class TreeBuilder { + public: + using ertr = Btree::btree_ertr; + template <class ValueT=void> + using future = ertr::future<ValueT>; + + TreeBuilder(KVPool& kvs, NodeExtentManagerURef&& nm) + : kvs{kvs} { + tree.emplace(std::move(nm)); + } + + future<> bootstrap(Transaction& t) { + std::ostringstream oss; +#ifndef NDEBUG + oss << "debug=on, "; +#else + oss << "debug=off, "; +#endif +#ifdef UNIT_TESTS_BUILT + oss << "UNIT_TEST_BUILT=on, "; +#else + oss << "UNIT_TEST_BUILT=off, "; +#endif + if constexpr (TRACK) { + oss << "track=on, "; + } else { + oss << "track=off, "; + } + oss << *tree; + logger().warn("TreeBuilder: {}, bootstrapping ...", oss.str()); + return tree->mkfs(t); + } + + future<> insert(Transaction& t) { + kv_iter = kvs.random_begin(); + auto cursors = seastar::make_lw_shared<std::vector<Btree::Cursor>>(); + logger().warn("start inserting {} kvs ...", kvs.size()); + auto start_time = mono_clock::now(); + return crimson::do_until([&t, this, cursors]() -> future<bool> { + if (kv_iter.is_end()) { + return ertr::make_ready_future<bool>(true); + } + auto [key, p_value] = kv_iter.get_kv(); + logger().debug("[{}] {} -> {}", kv_iter.index(), key_hobj_t{key}, *p_value); + return tree->insert(t, key, *p_value + ).safe_then([&t, this, cursors](auto ret) { + auto& [cursor, success] = ret; + assert(success == true); + if constexpr (TRACK) { + cursors->emplace_back(cursor); + } +#ifndef NDEBUG + auto [key, p_value] = kv_iter.get_kv(); + Onodes::validate_cursor(cursor, key, *p_value); + return tree->lower_bound(t, key).safe_then([this, cursor](auto cursor_) { + auto [key, p_value] = kv_iter.get_kv(); + ceph_assert(cursor_.get_ghobj() == key); + ceph_assert(cursor_.value() == cursor.value()); + ++kv_iter; + return ertr::make_ready_future<bool>(false); + }); +#else + ++kv_iter; + return ertr::make_ready_future<bool>(false); +#endif + }); + }).safe_then([&t, this, start_time, cursors] { + std::chrono::duration<double> duration = mono_clock::now() - start_time; + logger().warn("Insert done! 
{}s", duration.count()); + if (!cursors->empty()) { + logger().info("Verifing tracked cursors ..."); + kv_iter = kvs.random_begin(); + return seastar::do_with( + cursors->begin(), [&t, this, cursors](auto& c_iter) { + return crimson::do_until([&t, this, &c_iter, cursors]() -> future<bool> { + if (kv_iter.is_end()) { + logger().info("Verify done!"); + return ertr::make_ready_future<bool>(true); + } + assert(c_iter != cursors->end()); + auto [k, v] = kv_iter.get_kv(); + // validate values in tree keep intact + return tree->lower_bound(t, k).safe_then([this, &c_iter](auto cursor) { + auto [k, v] = kv_iter.get_kv(); + Onodes::validate_cursor(cursor, k, *v); + // validate values in cursors keep intact + Onodes::validate_cursor(*c_iter, k, *v); + ++kv_iter; + ++c_iter; + return ertr::make_ready_future<bool>(false); + }); + }); + }); + } else { + return ertr::now(); + } + }); + } + + future<> get_stats(Transaction& t) { + return tree->get_stats_slow(t + ).safe_then([this](auto stats) { + logger().warn("{}", stats); + }); + } + + void reload(NodeExtentManagerURef&& nm) { + tree.emplace(std::move(nm)); + } + + future<> validate(Transaction& t) { + logger().info("Verifing insertion ..."); + return seastar::do_with( + kvs.begin(), [&t, this] (auto& kvs_iter) { + return crimson::do_until([&t, this, &kvs_iter]() -> future<bool> { + if (kvs_iter.is_end()) { + logger().info("Verify done!"); + return ertr::make_ready_future<bool>(true); + } + auto [k, v] = kvs_iter.get_kv(); + return tree->lower_bound(t, k + ).safe_then([&kvs_iter, k=k, v=v] (auto cursor) { + Onodes::validate_cursor(cursor, k, *v); + ++kvs_iter; + return ertr::make_ready_future<bool>(false); + }); + }); + }); + } + + private: + static seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } + + KVPool& kvs; + std::optional<Btree> tree; + KVPool::iterator_t kv_iter; +}; + +} diff --git a/src/crimson/os/seastore/root_block.h b/src/crimson/os/seastore/root_block.h new file mode 100644 index 000000000..4a5024caa --- /dev/null +++ b/src/crimson/os/seastore/root_block.h @@ -0,0 +1,109 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/cached_extent.h" + +namespace crimson::os::seastore { + +/** + * root_t + * + * Contains information required to find metadata roots. + * TODO: generalize this to permit more than one lba_manager implementation + */ +struct __attribute__((aligned(8), packed)) root_t { + depth_t lba_depth = 0; + depth_t segment_depth = 0; + paddr_t lba_root_addr; + paddr_t segment_root; + laddr_t onode_root = L_ADDR_NULL; + + void adjust_addrs_from_base(paddr_t base) { + if (lba_root_addr.is_relative()) { + lba_root_addr = base.add_record_relative(lba_root_addr); + } + } +}; + +/** + * RootBlock + * + * Holds the physical addresses of all metadata roots. + * In-memory values may be + * - absolute: reference to block which predates the current transaction + * - record_relative: reference to block updated in this transaction + * if !pending() + * + * Journal replay only considers deltas and must always discover the most + * recent value for the RootBlock. Because the contents of root_t above are + * very small, it's simplest to stash the entire root_t value into the delta + * and never actually write the RootBlock to a physical location (safe since + * nothing references the location of the RootBlock). 
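+ *
+ * Concretely (see get_delta() and apply_delta_and_adjust_crc() below), a
+ * delta is simply root_t copied by value into a bufferptr; replay copies it
+ * back and then rebases any relative addresses against the delta's record
+ * position via adjust_addrs_from_base().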
+ * + * As a result, Cache treats the root differently in a few ways including: + * - state will only ever be DIRTY or MUTATION_PENDING + * - RootBlock's never show up in the transaction fresh or dirty lists -- + * there's a special Transaction::root member for when the root needs to + * be mutated. + * + * TODO: Journal trimming will need to be aware of the most recent RootBlock + * delta location, or, even easier, just always write one out with the + * mutation which changes the journal trim bound. + */ +struct RootBlock : CachedExtent { + constexpr static segment_off_t SIZE = 4<<10; + using Ref = TCachedExtentRef<RootBlock>; + + root_t root; + + RootBlock() : CachedExtent(0) {} + + RootBlock(const RootBlock &rhs) = default; + + CachedExtentRef duplicate_for_write() final { + return CachedExtentRef(new RootBlock(*this)); + }; + + static constexpr extent_types_t TYPE = extent_types_t::ROOT; + extent_types_t get_type() const final { + return extent_types_t::ROOT; + } + + /// dumps root as delta + ceph::bufferlist get_delta() final { + ceph::bufferlist bl; + ceph::buffer::ptr bptr(sizeof(root_t)); + *reinterpret_cast<root_t*>(bptr.c_str()) = root; + bl.append(bptr); + return bl; + } + + /// overwrites root + void apply_delta_and_adjust_crc(paddr_t base, const ceph::bufferlist &_bl) final { + assert(_bl.length() == sizeof(root_t)); + ceph::bufferlist bl = _bl; + bl.rebuild(); + root = *reinterpret_cast<const root_t*>(bl.front().c_str()); + root.adjust_addrs_from_base(base); + } + + /// Patches relative addrs in memory based on record commit addr + void on_delta_write(paddr_t record_block_offset) final { + root.adjust_addrs_from_base(record_block_offset); + } + + complete_load_ertr::future<> complete_load() final { + ceph_abort_msg("Root is only written via deltas"); + } + + void on_initial_write() final { + ceph_abort_msg("Root is only written via deltas"); + } + + root_t &get_root() { return root; } +}; +using RootBlockRef = RootBlock::Ref; + +} diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc new file mode 100644 index 000000000..50c148cea --- /dev/null +++ b/src/crimson/os/seastore/seastore.cc @@ -0,0 +1,532 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "seastore.h" + +#include <boost/algorithm/string/trim.hpp> +#include <fmt/format.h> +#include <fmt/ostream.h> + +#include "common/safe_io.h" +#include "os/Transaction.h" + +#include "crimson/common/buffer_io.h" +#include "crimson/common/config_proxy.h" + +#include "crimson/os/futurized_collection.h" + +#include "crimson/os/seastore/segment_manager/ephemeral.h" +#include "crimson/os/seastore/transaction_manager.h" +#include "crimson/os/seastore/onode_manager.h" +#include "crimson/os/seastore/cache.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + +using crimson::common::local_conf; + +namespace crimson::os::seastore { + +struct SeastoreCollection final : public FuturizedCollection { + template <typename... T> + SeastoreCollection(T&&... args) : + FuturizedCollection(std::forward<T>(args)...) 
{} +}; + +SeaStore::SeaStore(const std::string& path) + : segment_manager(segment_manager::create_test_ephemeral() /* TODO */), + segment_cleaner( + std::make_unique<SegmentCleaner>( + SegmentCleaner::config_t::default_from_segment_manager( + *segment_manager))), + cache(std::make_unique<Cache>(*segment_manager)), + journal(new Journal(*segment_manager)), + lba_manager( + lba_manager::create_lba_manager(*segment_manager, *cache)), + transaction_manager( + new TransactionManager( + *segment_manager, + *segment_cleaner, + *journal, + *cache, + *lba_manager)), + onode_manager(onode_manager::create_ephemeral()) +{ + journal->set_segment_provider(&*segment_cleaner); + segment_cleaner->set_extent_callback(&*transaction_manager); +} + +SeaStore::~SeaStore() = default; + +seastar::future<> SeaStore::stop() +{ + return seastar::now(); +} + +seastar::future<> SeaStore::mount() +{ + return seastar::now(); +} + +seastar::future<> SeaStore::umount() +{ + return seastar::now(); +} + +seastar::future<> SeaStore::mkfs(uuid_d new_osd_fsid) +{ + return seastar::now(); +} + +seastar::future<store_statfs_t> SeaStore::stat() const +{ + logger().debug("{}", __func__); + store_statfs_t st; + return seastar::make_ready_future<store_statfs_t>(st); +} + +seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> +SeaStore::list_objects(CollectionRef ch, + const ghobject_t& start, + const ghobject_t& end, + uint64_t limit) const +{ + return seastar::make_ready_future<std::tuple<std::vector<ghobject_t>, ghobject_t>>( + std::make_tuple(std::vector<ghobject_t>(), end)); +} + +seastar::future<CollectionRef> SeaStore::create_new_collection(const coll_t& cid) +{ + auto c = _get_collection(cid); + return seastar::make_ready_future<CollectionRef>(c); +} + +seastar::future<CollectionRef> SeaStore::open_collection(const coll_t& cid) +{ + return seastar::make_ready_future<CollectionRef>(_get_collection(cid)); +} + +seastar::future<std::vector<coll_t>> SeaStore::list_collections() +{ + return seastar::make_ready_future<std::vector<coll_t>>(); +} + +SeaStore::read_errorator::future<ceph::bufferlist> SeaStore::read( + CollectionRef ch, + const ghobject_t& oid, + uint64_t offset, + size_t len, + uint32_t op_flags) +{ + return read_errorator::make_ready_future<ceph::bufferlist>(); +} + +SeaStore::read_errorator::future<ceph::bufferlist> SeaStore::readv( + CollectionRef ch, + const ghobject_t& oid, + interval_set<uint64_t>& m, + uint32_t op_flags) +{ + return read_errorator::make_ready_future<ceph::bufferlist>(); +} + +SeaStore::get_attr_errorator::future<ceph::bufferptr> SeaStore::get_attr( + CollectionRef ch, + const ghobject_t& oid, + std::string_view name) const +{ + auto c = static_cast<SeastoreCollection*>(ch.get()); + logger().debug("{} {} {}", + __func__, c->get_cid(), oid); + return crimson::ct_error::enoent::make(); +} + +SeaStore::get_attrs_ertr::future<SeaStore::attrs_t> SeaStore::get_attrs( + CollectionRef ch, + const ghobject_t& oid) +{ + auto c = static_cast<SeastoreCollection*>(ch.get()); + logger().debug("{} {} {}", + __func__, c->get_cid(), oid); + return crimson::ct_error::enoent::make(); +} + +seastar::future<struct stat> stat( + CollectionRef c, + const ghobject_t& oid) +{ + return seastar::make_ready_future<struct stat>(); +} + + +seastar::future<struct stat> SeaStore::stat( + CollectionRef c, + const ghobject_t& oid) +{ + struct stat st; + return seastar::make_ready_future<struct stat>(st); +} + +auto +SeaStore::omap_get_header( + CollectionRef c, + const ghobject_t& oid) + -> 
read_errorator::future<bufferlist> +{ + return seastar::make_ready_future<bufferlist>(); +} + +auto +SeaStore::omap_get_values( + CollectionRef ch, + const ghobject_t& oid, + const omap_keys_t& keys) + -> read_errorator::future<omap_values_t> +{ + auto c = static_cast<SeastoreCollection*>(ch.get()); + logger().debug("{} {} {}", + __func__, c->get_cid(), oid); + return seastar::make_ready_future<omap_values_t>(); +} + +auto +SeaStore::omap_get_values( + CollectionRef ch, + const ghobject_t &oid, + const std::optional<string> &start) + -> read_errorator::future<std::tuple<bool, SeaStore::omap_values_t>> +{ + auto c = static_cast<SeastoreCollection*>(ch.get()); + logger().debug( + "{} {} {}", + __func__, c->get_cid(), oid); + return seastar::make_ready_future<std::tuple<bool, omap_values_t>>( + std::make_tuple(false, omap_values_t())); +} + +seastar::future<FuturizedStore::OmapIteratorRef> get_omap_iterator( + CollectionRef ch, + const ghobject_t& oid) +{ + return seastar::make_ready_future<FuturizedStore::OmapIteratorRef>(); +} + +seastar::future<std::map<uint64_t, uint64_t>> fiemap( + CollectionRef ch, + const ghobject_t& oid, + uint64_t off, + uint64_t len) +{ + return seastar::make_ready_future<std::map<uint64_t, uint64_t>>(); +} + +seastar::future<> SeaStore::do_transaction( + CollectionRef _ch, + ceph::os::Transaction&& _t) +{ + return seastar::do_with( + _t.begin(), + transaction_manager->create_transaction(), + std::vector<OnodeRef>(), + std::move(_t), + std::move(_ch), + [this](auto &iter, auto &trans, auto &onodes, auto &t, auto &ch) { + return onode_manager->get_or_create_onodes( + *trans, iter.get_objects()).safe_then( + [this, &iter, &trans, &onodes, &t, &ch](auto &&read_onodes) { + onodes = std::move(read_onodes); + return seastar::do_until( + [&iter]() { return iter.have_op(); }, + [this, &iter, &trans, &onodes, &t, &ch]() { + return _do_transaction_step(trans, ch, onodes, iter).safe_then( + [this, &trans] { + return transaction_manager->submit_transaction(std::move(trans)); + }).handle_error( + // TODO: add errorator::do_until + crimson::ct_error::eagain::handle([]() { + // TODO retry + }), + write_ertr::all_same_way([&t](auto e) { + logger().error(" transaction dump:\n"); + JSONFormatter f(true); + f.open_object_section("transaction"); + t.dump(&f); + f.close_section(); + std::stringstream str; + f.flush(str); + logger().error("{}", str.str()); + abort(); + })); + }); + }).safe_then([this, &trans, &onodes]() { + return onode_manager->write_dirty(*trans, onodes); + }).safe_then([]() { + // TODO: complete transaction! 
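+ // (the on_applied/on_commit/on_applied_sync Contexts attached to the
+ // incoming ceph::os::Transaction are completed in the final continuation
+ // below)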
+ return; + }).handle_error( + write_ertr::all_same_way([&t](auto e) { + logger().error(" transaction dump:\n"); + JSONFormatter f(true); + f.open_object_section("transaction"); + t.dump(&f); + f.close_section(); + std::stringstream str; + f.flush(str); + logger().error("{}", str.str()); + abort(); + })).then([&t]() { + for (auto i : { + t.get_on_applied(), + t.get_on_commit(), + t.get_on_applied_sync()}) { + if (i) { + i->complete(0); + } + } + }); + }); +} + +SeaStore::write_ertr::future<> SeaStore::_do_transaction_step( + TransactionRef &trans, + CollectionRef &col, + std::vector<OnodeRef> &onodes, + ceph::os::Transaction::iterator &i) +{ + auto get_onode = [&onodes](size_t i) -> OnodeRef& { + ceph_assert(i < onodes.size()); + return onodes[i]; + }; + + using ceph::os::Transaction; + try { + switch (auto op = i.decode_op(); op->op) { + case Transaction::OP_NOP: + return write_ertr::now(); + case Transaction::OP_REMOVE: + { + return _remove(trans, get_onode(op->oid)); + } + break; + case Transaction::OP_TOUCH: + { + return _touch(trans, get_onode(op->oid)); + } + break; + case Transaction::OP_WRITE: + { + uint64_t off = op->off; + uint64_t len = op->len; + uint32_t fadvise_flags = i.get_fadvise_flags(); + ceph::bufferlist bl; + i.decode_bl(bl); + return _write(trans, get_onode(op->oid), off, len, bl, fadvise_flags); + } + break; + case Transaction::OP_TRUNCATE: + { + uint64_t off = op->off; + return _truncate(trans, get_onode(op->oid), off); + } + break; + case Transaction::OP_SETATTR: + { + std::string name = i.decode_string(); + ceph::bufferlist bl; + i.decode_bl(bl); + std::map<std::string, bufferptr> to_set; + to_set[name] = bufferptr(bl.c_str(), bl.length()); + return _setattrs(trans, get_onode(op->oid), to_set); + } + break; + case Transaction::OP_MKCOLL: + { + coll_t cid = i.get_cid(op->cid); + return _create_collection(trans, cid, op->split_bits); + } + break; + case Transaction::OP_OMAP_SETKEYS: + { + std::map<std::string, ceph::bufferlist> aset; + i.decode_attrset(aset); + return _omap_set_values(trans, get_onode(op->oid), std::move(aset)); + } + break; + case Transaction::OP_OMAP_SETHEADER: + { + ceph::bufferlist bl; + i.decode_bl(bl); + return _omap_set_header(trans, get_onode(op->oid), bl); + } + break; + case Transaction::OP_OMAP_RMKEYS: + { + omap_keys_t keys; + i.decode_keyset(keys); + return _omap_rmkeys(trans, get_onode(op->oid), keys); + } + break; + case Transaction::OP_OMAP_RMKEYRANGE: + { + string first, last; + first = i.decode_string(); + last = i.decode_string(); + return _omap_rmkeyrange(trans, get_onode(op->oid), first, last); + } + break; + case Transaction::OP_COLL_HINT: + { + ceph::bufferlist hint; + i.decode_bl(hint); + return write_ertr::now(); + } + default: + logger().error("bad op {}", static_cast<unsigned>(op->op)); + return crimson::ct_error::input_output_error::make(); + } + } catch (std::exception &e) { + logger().error("{} got exception {}", __func__, e); + return crimson::ct_error::input_output_error::make(); + } +} + +SeaStore::write_ertr::future<> SeaStore::_remove( + TransactionRef &trans, + OnodeRef &onode) +{ + logger().debug("{} onode={}", + __func__, *onode); + return write_ertr::now(); +} + +SeaStore::write_ertr::future<> SeaStore::_touch( + TransactionRef &trans, + OnodeRef &onode) +{ + logger().debug("{} onode={}", + __func__, *onode); + return write_ertr::now(); +} + +SeaStore::write_ertr::future<> SeaStore::_write( + TransactionRef &trans, + OnodeRef &onode, + uint64_t offset, size_t len, const ceph::bufferlist& bl, + uint32_t 
fadvise_flags) +{ + logger().debug("{}: {} {} ~ {}", + __func__, *onode, offset, len); + assert(len == bl.length()); + +/* + return onode_manager->get_or_create_onode(cid, oid).safe_then([=, &bl](auto ref) { + return; + }).handle_error( + crimson::ct_error::enoent::handle([]() { + return; + }), + OnodeManager::open_ertr::pass_further{} + ); + */ + return write_ertr::now(); +} + +SeaStore::write_ertr::future<> SeaStore::_omap_set_values( + TransactionRef &trans, + OnodeRef &onode, + std::map<std::string, ceph::bufferlist> &&aset) +{ + logger().debug( + "{}: {} {} keys", + __func__, *onode, aset.size()); + + return write_ertr::now(); +} + +SeaStore::write_ertr::future<> SeaStore::_omap_set_header( + TransactionRef &trans, + OnodeRef &onode, + const ceph::bufferlist &header) +{ + logger().debug( + "{}: {} {} bytes", + __func__, *onode, header.length()); + return write_ertr::now(); +} + +SeaStore::write_ertr::future<> SeaStore::_omap_rmkeys( + TransactionRef &trans, + OnodeRef &onode, + const omap_keys_t& aset) +{ + logger().debug( + "{} {} {} keys", + __func__, *onode, aset.size()); + return write_ertr::now(); +} + +SeaStore::write_ertr::future<> SeaStore::_omap_rmkeyrange( + TransactionRef &trans, + OnodeRef &onode, + const std::string &first, + const std::string &last) +{ + logger().debug( + "{} {} first={} last={}", + __func__, *onode, first, last); + return write_ertr::now(); +} + +SeaStore::write_ertr::future<> SeaStore::_truncate( + TransactionRef &trans, + OnodeRef &onode, + uint64_t size) +{ + logger().debug("{} onode={} size={}", + __func__, *onode, size); + return write_ertr::now(); +} + +SeaStore::write_ertr::future<> SeaStore::_setattrs( + TransactionRef &trans, + OnodeRef &onode, + std::map<std::string,bufferptr>& aset) +{ + logger().debug("{} onode={}", + __func__, *onode); + return write_ertr::now(); +} + +SeaStore::write_ertr::future<> SeaStore::_create_collection( + TransactionRef &trans, + const coll_t& cid, int bits) +{ + return write_ertr::now(); +} + +boost::intrusive_ptr<SeastoreCollection> SeaStore::_get_collection(const coll_t& cid) +{ + return new SeastoreCollection{cid}; +} + +seastar::future<> SeaStore::write_meta(const std::string& key, + const std::string& value) +{ + return seastar::make_ready_future<>(); +} + +seastar::future<std::tuple<int, std::string>> SeaStore::read_meta(const std::string& key) +{ + return seastar::make_ready_future<std::tuple<int, std::string>>( + std::make_tuple(0, ""s)); +} + +uuid_d SeaStore::get_fsid() const +{ + return osd_fsid; +} + +} diff --git a/src/crimson/os/seastore/seastore.h b/src/crimson/os/seastore/seastore.h new file mode 100644 index 000000000..798442c34 --- /dev/null +++ b/src/crimson/os/seastore/seastore.h @@ -0,0 +1,181 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <string> +#include <unordered_map> +#include <map> +#include <typeinfo> +#include <vector> + +#include <optional> +#include <seastar/core/future.hh> + +#include "osd/osd_types.h" +#include "include/uuid.h" + +#include "os/Transaction.h" +#include "crimson/os/seastore/segment_cleaner.h" +#include "crimson/os/futurized_store.h" +#include "transaction.h" + +namespace crimson::os::seastore { + +class SeastoreCollection; +class SegmentManager; +class OnodeManager; +class Onode; +using OnodeRef = boost::intrusive_ptr<Onode>; +class Journal; +class LBAManager; +class TransactionManager; +class Cache; + +class SeaStore final : public FuturizedStore { + uuid_d osd_fsid; + +public: + + 
SeaStore(const std::string& path); + ~SeaStore() final; + + seastar::future<> stop() final; + seastar::future<> mount() final; + seastar::future<> umount() final; + + seastar::future<> mkfs(uuid_d new_osd_fsid) final; + seastar::future<store_statfs_t> stat() const final; + + read_errorator::future<ceph::bufferlist> read( + CollectionRef c, + const ghobject_t& oid, + uint64_t offset, + size_t len, + uint32_t op_flags = 0) final; + read_errorator::future<ceph::bufferlist> readv( + CollectionRef c, + const ghobject_t& oid, + interval_set<uint64_t>& m, + uint32_t op_flags = 0) final; + get_attr_errorator::future<ceph::bufferptr> get_attr( + CollectionRef c, + const ghobject_t& oid, + std::string_view name) const final; + get_attrs_ertr::future<attrs_t> get_attrs( + CollectionRef c, + const ghobject_t& oid) final; + + seastar::future<struct stat> stat( + CollectionRef c, + const ghobject_t& oid) final; + + read_errorator::future<omap_values_t> omap_get_values( + CollectionRef c, + const ghobject_t& oid, + const omap_keys_t& keys) final; + + /// Retrieves paged set of values > start (if present) + read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values( + CollectionRef c, ///< [in] collection + const ghobject_t &oid, ///< [in] oid + const std::optional<std::string> &start ///< [in] start, empty for begin + ) final; ///< @return <done, values> values.empty() iff done + + read_errorator::future<bufferlist> omap_get_header( + CollectionRef c, + const ghobject_t& oid) final; + + seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects( + CollectionRef c, + const ghobject_t& start, + const ghobject_t& end, + uint64_t limit) const final; + + seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final; + seastar::future<CollectionRef> open_collection(const coll_t& cid) final; + seastar::future<std::vector<coll_t>> list_collections() final; + + seastar::future<> do_transaction( + CollectionRef ch, + ceph::os::Transaction&& txn) final; + + seastar::future<OmapIteratorRef> get_omap_iterator( + CollectionRef ch, + const ghobject_t& oid) final; + seastar::future<std::map<uint64_t, uint64_t>> fiemap( + CollectionRef ch, + const ghobject_t& oid, + uint64_t off, + uint64_t len) final; + + seastar::future<> write_meta(const std::string& key, + const std::string& value) final; + seastar::future<std::tuple<int, std::string>> read_meta(const std::string& key) final; + uuid_d get_fsid() const final; + + unsigned get_max_attr_name_length() const final { + return 256; + } + +private: + std::unique_ptr<SegmentManager> segment_manager; + std::unique_ptr<SegmentCleaner> segment_cleaner; + std::unique_ptr<Cache> cache; + std::unique_ptr<Journal> journal; + std::unique_ptr<LBAManager> lba_manager; + std::unique_ptr<TransactionManager> transaction_manager; + std::unique_ptr<OnodeManager> onode_manager; + + + using write_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + write_ertr::future<> _do_transaction_step( + TransactionRef &trans, + CollectionRef &col, + std::vector<OnodeRef> &onodes, + ceph::os::Transaction::iterator &i); + + write_ertr::future<> _remove( + TransactionRef &trans, + OnodeRef &onode); + write_ertr::future<> _touch( + TransactionRef &trans, + OnodeRef &onode); + write_ertr::future<> _write( + TransactionRef &trans, + OnodeRef &onode, + uint64_t offset, size_t len, const ceph::bufferlist& bl, + uint32_t fadvise_flags); + write_ertr::future<> _omap_set_values( + TransactionRef &trans, + OnodeRef &onode, + std::map<std::string, 
ceph::bufferlist> &&aset); + write_ertr::future<> _omap_set_header( + TransactionRef &trans, + OnodeRef &onode, + const ceph::bufferlist &header); + write_ertr::future<> _omap_rmkeys( + TransactionRef &trans, + OnodeRef &onode, + const omap_keys_t& aset); + write_ertr::future<> _omap_rmkeyrange( + TransactionRef &trans, + OnodeRef &onode, + const std::string &first, + const std::string &last); + write_ertr::future<> _truncate( + TransactionRef &trans, + OnodeRef &onode, uint64_t size); + write_ertr::future<> _setattrs( + TransactionRef &trans, + OnodeRef &onode, + std::map<std::string,bufferptr>& aset); + write_ertr::future<> _create_collection( + TransactionRef &trans, + const coll_t& cid, int bits); + + boost::intrusive_ptr<SeastoreCollection> _get_collection(const coll_t& cid); +}; + +} diff --git a/src/crimson/os/seastore/seastore_types.cc b/src/crimson/os/seastore/seastore_types.cc new file mode 100644 index 000000000..ff43b1e51 --- /dev/null +++ b/src/crimson/os/seastore/seastore_types.cc @@ -0,0 +1,105 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/os/seastore/seastore_types.h" + +namespace crimson::os::seastore { + +std::ostream &segment_to_stream(std::ostream &out, const segment_id_t &t) +{ + if (t == NULL_SEG_ID) + return out << "NULL_SEG"; + else if (t == BLOCK_REL_SEG_ID) + return out << "BLOCK_REL_SEG"; + else if (t == RECORD_REL_SEG_ID) + return out << "RECORD_REL_SEG"; + else if (t == FAKE_SEG_ID) + return out << "FAKE_SEG"; + else + return out << t; +} + +std::ostream &offset_to_stream(std::ostream &out, const segment_off_t &t) +{ + if (t == NULL_SEG_OFF) + return out << "NULL_OFF"; + else + return out << t; +} + +std::ostream &operator<<(std::ostream &out, const paddr_t &rhs) +{ + out << "paddr_t<"; + segment_to_stream(out, rhs.segment); + out << ", "; + offset_to_stream(out, rhs.offset); + return out << ">"; +} + +std::ostream &operator<<(std::ostream &out, const journal_seq_t &seq) +{ + return out << "journal_seq_t(segment_seq=" + << seq.segment_seq << ", offset=" + << seq.offset + << ")"; +} + +std::ostream &operator<<(std::ostream &out, extent_types_t t) +{ + switch (t) { + case extent_types_t::ROOT: + return out << "ROOT"; + case extent_types_t::LADDR_INTERNAL: + return out << "LADDR_INTERNAL"; + case extent_types_t::LADDR_LEAF: + return out << "LADDR_LEAF"; + case extent_types_t::EXTMAP_INNER: + return out << "EXTMAP_INNER"; + case extent_types_t::EXTMAP_LEAF: + return out << "EXTMAP_LEAF"; + case extent_types_t::ONODE_BLOCK_STAGED: + return out << "ONODE_BLOCK_STAGED"; + case extent_types_t::TEST_BLOCK: + return out << "TEST_BLOCK"; + case extent_types_t::TEST_BLOCK_PHYSICAL: + return out << "TEST_BLOCK_PHYSICAL"; + case extent_types_t::NONE: + return out << "NONE"; + default: + return out << "UNKNOWN"; + } +} + +std::ostream &operator<<(std::ostream &out, const laddr_list_t &rhs) +{ + bool first = false; + for (auto &i: rhs) { + out << (first ? '[' : ',') << '(' << i.first << ',' << i.second << ')'; + first = true; + } + return out << ']'; +} +std::ostream &operator<<(std::ostream &out, const paddr_list_t &rhs) +{ + bool first = false; + for (auto &i: rhs) { + out << (first ? 
'[' : ',') << '(' << i.first << ',' << i.second << ')'; + first = true; + } + return out << ']'; +} + +std::ostream &operator<<(std::ostream &lhs, const delta_info_t &rhs) +{ + return lhs << "delta_info_t(" + << "type: " << rhs.type + << ", paddr: " << rhs.paddr + << ", laddr: " << rhs.laddr + << ", prev_crc: " << rhs.prev_crc + << ", final_crc: " << rhs.final_crc + << ", length: " << rhs.length + << ", pversion: " << rhs.pversion + << ")"; +} + +} diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h new file mode 100644 index 000000000..cb8480268 --- /dev/null +++ b/src/crimson/os/seastore/seastore_types.h @@ -0,0 +1,369 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <limits> +#include <iostream> + +#include "include/byteorder.h" +#include "include/denc.h" +#include "include/buffer.h" +#include "include/cmp.h" +#include "include/uuid.h" + +namespace crimson::os::seastore { + +using depth_t = int32_t; +using depth_le_t = ceph_les32; + +using checksum_t = uint32_t; + +// Immutable metadata for seastore to set at mkfs time +struct seastore_meta_t { + uuid_d seastore_id; + + DENC(seastore_meta_t, v, p) { + DENC_START(1, 1, p); + denc(v.seastore_id, p); + DENC_FINISH(p); + } +}; + +// Identifies segment location on disk, see SegmentManager, +using segment_id_t = uint32_t; +constexpr segment_id_t NULL_SEG_ID = + std::numeric_limits<segment_id_t>::max() - 1; +/* Used to denote relative paddr_t */ +constexpr segment_id_t RECORD_REL_SEG_ID = + std::numeric_limits<segment_id_t>::max() - 2; +constexpr segment_id_t BLOCK_REL_SEG_ID = + std::numeric_limits<segment_id_t>::max() - 3; + +// for tests which generate fake paddrs +constexpr segment_id_t FAKE_SEG_ID = + std::numeric_limits<segment_id_t>::max() - 4; + +std::ostream &segment_to_stream(std::ostream &, const segment_id_t &t); + +// Offset within a segment on disk, see SegmentManager +// may be negative for relative offsets +using segment_off_t = int32_t; +constexpr segment_off_t NULL_SEG_OFF = + std::numeric_limits<segment_id_t>::max(); + +std::ostream &offset_to_stream(std::ostream &, const segment_off_t &t); + +/* Monotonically increasing segment seq, uniquely identifies + * the incarnation of a segment */ +using segment_seq_t = uint32_t; +static constexpr segment_seq_t NULL_SEG_SEQ = + std::numeric_limits<segment_seq_t>::max(); + +// Offset of delta within a record +using record_delta_idx_t = uint32_t; +constexpr record_delta_idx_t NULL_DELTA_IDX = + std::numeric_limits<record_delta_idx_t>::max(); + +/** + * paddr_t + * + * <segment, offset> offset on disk, see SegmentManager + * + * May be absolute, record_relative, or block_relative. + * + * Blocks get read independently of the surrounding record, + * so paddrs embedded directly within a block need to refer + * to other blocks within the same record by a block_relative + * addr relative to the block's own offset. By contrast, + * deltas to existing blocks need to use record_relative + * addrs relative to the first block of the record. + * + * Fresh extents during a transaction are refered to by + * record_relative paddrs. 
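+ *
+ * A small illustration using the helpers defined below:
+ *
+ *   paddr_t rel = make_block_relative_paddr(128); // <BLOCK_REL_SEG_ID, 128>
+ *   paddr_t abs = rel.maybe_relative_to(base);    // {base.segment,
+ *                                                 //  base.offset + 128}
+ *
+ * where base is an absolute address; record_relative addrs are built with
+ * make_record_relative_paddr() and rebased via add_record_relative().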
+ */ +struct paddr_t { + segment_id_t segment = NULL_SEG_ID; + segment_off_t offset = NULL_SEG_OFF; + + bool is_relative() const { + return segment == RECORD_REL_SEG_ID || + segment == BLOCK_REL_SEG_ID; + } + + bool is_record_relative() const { + return segment == RECORD_REL_SEG_ID; + } + + bool is_block_relative() const { + return segment == BLOCK_REL_SEG_ID; + } + + paddr_t add_offset(segment_off_t o) const { + return paddr_t{segment, offset + o}; + } + + paddr_t add_relative(paddr_t o) const { + assert(o.is_relative()); + return paddr_t{segment, offset + o.offset}; + } + + paddr_t add_block_relative(paddr_t o) const { + // special version mainly for documentation purposes + assert(o.is_block_relative()); + return add_relative(o); + } + + paddr_t add_record_relative(paddr_t o) const { + // special version mainly for documentation purposes + assert(o.is_record_relative()); + return add_relative(o); + } + + /** + * paddr_t::operator- + * + * Only defined for record_relative paddr_ts. Yields a + * block_relative address. + */ + paddr_t operator-(paddr_t rhs) const { + assert(rhs.is_relative() && is_relative()); + assert(rhs.segment == segment); + return paddr_t{ + BLOCK_REL_SEG_ID, + offset - rhs.offset + }; + } + + /** + * maybe_relative_to + * + * Helper for the case where an in-memory paddr_t may be + * either block_relative or absolute (not record_relative). + * + * base must be either absolute or record_relative. + */ + paddr_t maybe_relative_to(paddr_t base) const { + assert(!base.is_block_relative()); + if (is_block_relative()) + return base.add_block_relative(*this); + else + return *this; + } + + DENC(paddr_t, v, p) { + DENC_START(1, 1, p); + denc(v.segment, p); + denc(v.offset, p); + DENC_FINISH(p); + } +}; +WRITE_CMP_OPERATORS_2(paddr_t, segment, offset) +WRITE_EQ_OPERATORS_2(paddr_t, segment, offset) +constexpr paddr_t P_ADDR_NULL = paddr_t{}; +constexpr paddr_t P_ADDR_MIN = paddr_t{0, 0}; +constexpr paddr_t make_record_relative_paddr(segment_off_t off) { + return paddr_t{RECORD_REL_SEG_ID, off}; +} +constexpr paddr_t make_block_relative_paddr(segment_off_t off) { + return paddr_t{BLOCK_REL_SEG_ID, off}; +} +constexpr paddr_t make_fake_paddr(segment_off_t off) { + return paddr_t{FAKE_SEG_ID, off}; +} + +struct paddr_le_t { + ceph_le32 segment = init_le32(NULL_SEG_ID); + ceph_les32 offset = init_les32(NULL_SEG_OFF); + + paddr_le_t() = default; + paddr_le_t(ceph_le32 segment, ceph_les32 offset) + : segment(segment), offset(offset) {} + paddr_le_t(segment_id_t segment, segment_off_t offset) + : segment(init_le32(segment)), offset(init_les32(offset)) {} + paddr_le_t(const paddr_t &addr) : paddr_le_t(addr.segment, addr.offset) {} + + operator paddr_t() const { + return paddr_t{segment, offset}; + } +}; + +std::ostream &operator<<(std::ostream &out, const paddr_t &rhs); + +using objaddr_t = uint32_t; +constexpr objaddr_t OBJ_ADDR_MIN = std::numeric_limits<objaddr_t>::min(); + +/* Monotonically increasing identifier for the location of a + * journal_record. 
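+ * Ordering is lexicographic on (segment_seq, offset); see the
+ * WRITE_CMP_OPERATORS_2 instantiation below.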
+ */ +struct journal_seq_t { + segment_seq_t segment_seq = 0; + paddr_t offset; + + DENC(journal_seq_t, v, p) { + DENC_START(1, 1, p); + denc(v.segment_seq, p); + denc(v.offset, p); + DENC_FINISH(p); + } +}; +WRITE_CMP_OPERATORS_2(journal_seq_t, segment_seq, offset) +WRITE_EQ_OPERATORS_2(journal_seq_t, segment_seq, offset) + +std::ostream &operator<<(std::ostream &out, const journal_seq_t &seq); + +static constexpr journal_seq_t NO_DELTAS = journal_seq_t{ + NULL_SEG_SEQ, + P_ADDR_NULL +}; + +// logical addr, see LBAManager, TransactionManager +using laddr_t = uint64_t; +constexpr laddr_t L_ADDR_MIN = std::numeric_limits<laddr_t>::min(); +constexpr laddr_t L_ADDR_MAX = std::numeric_limits<laddr_t>::max(); +constexpr laddr_t L_ADDR_NULL = std::numeric_limits<laddr_t>::max(); +constexpr laddr_t L_ADDR_ROOT = std::numeric_limits<laddr_t>::max() - 1; +constexpr laddr_t L_ADDR_LBAT = std::numeric_limits<laddr_t>::max() - 2; + +struct laddr_le_t { + ceph_le64 laddr = init_le64(L_ADDR_NULL); + + laddr_le_t() = default; + laddr_le_t(const laddr_le_t &) = default; + explicit laddr_le_t(const laddr_t &addr) + : laddr(init_le64(addr)) {} + + operator laddr_t() const { + return laddr_t(laddr); + } + laddr_le_t& operator=(laddr_t addr) { + ceph_le64 val; + val = addr; + laddr = val; + return *this; + } +}; + +// logical offset, see LBAManager, TransactionManager +using extent_len_t = uint32_t; +constexpr extent_len_t EXTENT_LEN_MAX = + std::numeric_limits<extent_len_t>::max(); + +using extent_len_le_t = ceph_le32; +inline extent_len_le_t init_extent_len_le_t(extent_len_t len) { + return init_le32(len); +} + +struct laddr_list_t : std::list<std::pair<laddr_t, extent_len_t>> { + template <typename... T> + laddr_list_t(T&&... args) + : std::list<std::pair<laddr_t, extent_len_t>>(std::forward<T>(args)...) {} +}; +struct paddr_list_t : std::list<std::pair<paddr_t, extent_len_t>> { + template <typename... T> + paddr_list_t(T&&... args) + : std::list<std::pair<paddr_t, extent_len_t>>(std::forward<T>(args)...) {} +}; + +std::ostream &operator<<(std::ostream &out, const laddr_list_t &rhs); +std::ostream &operator<<(std::ostream &out, const paddr_list_t &rhs); + +/* identifies type of extent, used for interpretting deltas, managing + * writeback. 
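+ *
+ * Per is_logical_type() below, ROOT and the LADDR_* types are treated as
+ * physical-only; every other type is a logical extent that is additionally
+ * addressed by an laddr_t.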
+ * + * Note that any new extent type needs to be added to + * Cache::get_extent_by_type in cache.cc + */ +enum class extent_types_t : uint8_t { + ROOT = 0, + LADDR_INTERNAL = 1, + LADDR_LEAF = 2, + ONODE_BLOCK = 3, + EXTMAP_INNER = 4, + EXTMAP_LEAF = 5, + ONODE_BLOCK_STAGED = 6, + + // Test Block Types + TEST_BLOCK = 0xF0, + TEST_BLOCK_PHYSICAL = 0xF1, + + // None + NONE = 0xFF +}; + +inline bool is_logical_type(extent_types_t type) { + switch (type) { + case extent_types_t::ROOT: + case extent_types_t::LADDR_INTERNAL: + case extent_types_t::LADDR_LEAF: + return false; + default: + return true; + } +} + +std::ostream &operator<<(std::ostream &out, extent_types_t t); + +/* description of a new physical extent */ +struct extent_t { + extent_types_t type; ///< type of extent + laddr_t addr; ///< laddr of extent (L_ADDR_NULL for non-logical) + ceph::bufferlist bl; ///< payload, bl.length() == length, aligned +}; + +using extent_version_t = uint32_t; +constexpr extent_version_t EXTENT_VERSION_NULL = 0; + +/* description of a mutation to a physical extent */ +struct delta_info_t { + extent_types_t type = extent_types_t::NONE; ///< delta type + paddr_t paddr; ///< physical address + laddr_t laddr = L_ADDR_NULL; ///< logical address + uint32_t prev_crc = 0; + uint32_t final_crc = 0; + segment_off_t length = NULL_SEG_OFF; ///< extent length + extent_version_t pversion; ///< prior version + ceph::bufferlist bl; ///< payload + + DENC(delta_info_t, v, p) { + DENC_START(1, 1, p); + denc(v.type, p); + denc(v.paddr, p); + denc(v.laddr, p); + denc(v.prev_crc, p); + denc(v.final_crc, p); + denc(v.length, p); + denc(v.pversion, p); + denc(v.bl, p); + DENC_FINISH(p); + } + + bool operator==(const delta_info_t &rhs) const { + return ( + type == rhs.type && + paddr == rhs.paddr && + laddr == rhs.laddr && + prev_crc == rhs.prev_crc && + final_crc == rhs.final_crc && + length == rhs.length && + pversion == rhs.pversion && + bl == rhs.bl + ); + } + + friend std::ostream &operator<<(std::ostream &lhs, const delta_info_t &rhs); +}; + +std::ostream &operator<<(std::ostream &lhs, const delta_info_t &rhs); + +struct record_t { + std::vector<extent_t> extents; + std::vector<delta_info_t> deltas; +}; + +} + +WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::seastore_meta_t) +WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::paddr_t) +WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::journal_seq_t) +WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::delta_info_t) diff --git a/src/crimson/os/seastore/segment_cleaner.cc b/src/crimson/os/seastore/segment_cleaner.cc new file mode 100644 index 000000000..3597c21df --- /dev/null +++ b/src/crimson/os/seastore/segment_cleaner.cc @@ -0,0 +1,340 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/common/log.h" + +#include "crimson/os/seastore/transaction.h" +#include "crimson/os/seastore/segment_cleaner.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + +namespace crimson::os::seastore { + +bool SpaceTrackerSimple::equals(const SpaceTrackerI &_other) const +{ + const auto &other = static_cast<const SpaceTrackerSimple&>(_other); + + if (other.live_bytes_by_segment.size() != live_bytes_by_segment.size()) { + logger().error("{}: different segment counts, bug in test"); + assert(0 == "segment counts should match"); + return false; + } + + bool all_match = true; + for (segment_id_t i = 0; i < live_bytes_by_segment.size(); ++i) { + if (other.live_bytes_by_segment[i] 
!= live_bytes_by_segment[i]) { + all_match = false; + logger().debug( + "{}: segment_id {} live bytes mismatch *this: {}, other: {}", + __func__, + i, + live_bytes_by_segment[i], + other.live_bytes_by_segment[i]); + } + } + return all_match; +} + +int64_t SpaceTrackerDetailed::SegmentMap::allocate( + segment_id_t segment, + segment_off_t offset, + extent_len_t len, + const extent_len_t block_size) +{ + assert(offset % block_size == 0); + assert(len % block_size == 0); + + const auto b = (offset / block_size); + const auto e = (offset + len) / block_size; + + bool error = false; + for (auto i = b; i < e; ++i) { + if (bitmap[i]) { + if (!error) { + logger().error( + "SegmentMap::allocate found allocated in {}, {} ~ {}", + segment, + offset, + len); + error = true; + } + logger().debug( + "SegmentMap::allocate block {} allocated", + i * block_size); + } + bitmap[i] = true; + } + return update_usage(block_size); +} + +int64_t SpaceTrackerDetailed::SegmentMap::release( + segment_id_t segment, + segment_off_t offset, + extent_len_t len, + const extent_len_t block_size) +{ + assert(offset % block_size == 0); + assert(len % block_size == 0); + + const auto b = (offset / block_size); + const auto e = (offset + len) / block_size; + + bool error = false; + for (auto i = b; i < e; ++i) { + if (!bitmap[i]) { + if (!error) { + logger().error( + "SegmentMap::release found unallocated in {}, {} ~ {}", + segment, + offset, + len); + error = true; + } + logger().debug( + "SegmentMap::release block {} unallocated", + i * block_size); + } + bitmap[i] = false; + } + return update_usage(-(int64_t)block_size); +} + +bool SpaceTrackerDetailed::equals(const SpaceTrackerI &_other) const +{ + const auto &other = static_cast<const SpaceTrackerDetailed&>(_other); + + if (other.segment_usage.size() != segment_usage.size()) { + logger().error("{}: different segment counts, bug in test"); + assert(0 == "segment counts should match"); + return false; + } + + bool all_match = true; + for (segment_id_t i = 0; i < segment_usage.size(); ++i) { + if (other.segment_usage[i].get_usage() != segment_usage[i].get_usage()) { + all_match = false; + logger().error( + "{}: segment_id {} live bytes mismatch *this: {}, other: {}", + __func__, + i, + segment_usage[i].get_usage(), + other.segment_usage[i].get_usage()); + } + } + return all_match; +} + +void SpaceTrackerDetailed::SegmentMap::dump_usage(extent_len_t block_size) const +{ + for (unsigned i = 0; i < bitmap.size(); ++i) { + if (bitmap[i]) { + logger().debug(" {} still live", i * block_size); + } + } +} + +void SpaceTrackerDetailed::dump_usage(segment_id_t id) const +{ + logger().debug("SpaceTrackerDetailed::dump_usage {}", id); + segment_usage[id].dump_usage(block_size); +} + +SegmentCleaner::get_segment_ret SegmentCleaner::get_segment() +{ + for (size_t i = 0; i < segments.size(); ++i) { + if (segments[i].is_empty()) { + mark_open(i); + logger().debug("{}: returning segment {}", __func__, i); + return get_segment_ret( + get_segment_ertr::ready_future_marker{}, + i); + } + } + assert(0 == "out of space handling todo"); + return get_segment_ret( + get_segment_ertr::ready_future_marker{}, + 0); +} + +void SegmentCleaner::update_journal_tail_target(journal_seq_t target) +{ + logger().debug( + "{}: {}", + __func__, + target); + assert(journal_tail_target == journal_seq_t() || target >= journal_tail_target); + if (journal_tail_target == journal_seq_t() || target > journal_tail_target) { + journal_tail_target = target; + } +} + +void 
SegmentCleaner::update_journal_tail_committed(journal_seq_t committed) +{ + if (journal_tail_committed == journal_seq_t() || + committed > journal_tail_committed) { + logger().debug( + "{}: update journal_tail_committed {}", + __func__, + committed); + journal_tail_committed = committed; + } + if (journal_tail_target == journal_seq_t() || + committed > journal_tail_target) { + logger().debug( + "{}: update journal_tail_target {}", + __func__, + committed); + journal_tail_target = committed; + } +} + +void SegmentCleaner::close_segment(segment_id_t segment) +{ + mark_closed(segment); +} + +SegmentCleaner::do_immediate_work_ret SegmentCleaner::do_immediate_work( + Transaction &t) +{ + auto next_target = get_dirty_tail_limit(); + logger().debug( + "{}: journal_tail_target={} get_dirty_tail_limit()={}", + __func__, + journal_tail_target, + next_target); + + logger().debug( + "SegmentCleaner::do_immediate_work gc total {}, available {}, unavailable {}, used {} available_ratio {}, reclaim_ratio {}, bytes_to_gc_for_available {}, bytes_to_gc_for_reclaim {}", + get_total_bytes(), + get_available_bytes(), + get_unavailable_bytes(), + get_used_bytes(), + get_available_ratio(), + get_reclaim_ratio(), + get_immediate_bytes_to_gc_for_available(), + get_immediate_bytes_to_gc_for_reclaim()); + + auto dirty_fut = do_immediate_work_ertr::now(); + if (journal_tail_target < next_target) { + dirty_fut = rewrite_dirty(t, next_target); + } + return dirty_fut.safe_then([=, &t] { + return do_gc(t, get_immediate_bytes_to_gc()); + }).handle_error( + do_immediate_work_ertr::pass_further{}, + crimson::ct_error::assert_all{} + ); +} + +SegmentCleaner::do_deferred_work_ret SegmentCleaner::do_deferred_work( + Transaction &t) +{ + return do_deferred_work_ret( + do_deferred_work_ertr::ready_future_marker{}, + ceph::timespan()); +} + +SegmentCleaner::rewrite_dirty_ret SegmentCleaner::rewrite_dirty( + Transaction &t, + journal_seq_t limit) +{ + return ecb->get_next_dirty_extents( + limit + ).then([=, &t](auto dirty_list) { + if (dirty_list.empty()) { + return do_immediate_work_ertr::now(); + } else { + update_journal_tail_target(dirty_list.front()->get_dirty_from()); + } + return seastar::do_with( + std::move(dirty_list), + [this, &t](auto &dirty_list) { + return crimson::do_for_each( + dirty_list, + [this, &t](auto &e) { + logger().debug( + "SegmentCleaner::do_immediate_work cleaning {}", + *e); + return ecb->rewrite_extent(t, e); + }); + }); + }); +} + +SegmentCleaner::do_gc_ret SegmentCleaner::do_gc( + Transaction &t, + size_t bytes) +{ + if (bytes == 0) { + return do_gc_ertr::now(); + } + + if (!scan_cursor) { + paddr_t next = P_ADDR_NULL; + next.segment = get_next_gc_target(); + if (next == P_ADDR_NULL) { + logger().debug( + "SegmentCleaner::do_gc: no segments to gc"); + return do_gc_ertr::now(); + } + next.offset = 0; + scan_cursor = + std::make_unique<ExtentCallbackInterface::scan_extents_cursor>( + next); + logger().debug( + "SegmentCleaner::do_gc: starting gc on segment {}", + scan_cursor->get_offset().segment); + } + + return ecb->scan_extents( + *scan_cursor, + bytes + ).safe_then([=, &t](auto addrs) { + return seastar::do_with( + std::move(addrs), + [=, &t](auto &addr_list) { + return crimson::do_for_each( + addr_list, + [=, &t](auto &addr_pair) { + auto &[addr, info] = addr_pair; + logger().debug( + "SegmentCleaner::do_gc: checking addr {}", + addr); + return ecb->get_extent_if_live( + t, + info.type, + addr, + info.addr, + info.len + ).safe_then([addr=addr, &t, this](CachedExtentRef ext) { + if (!ext) { + 
logger().debug( + "SegmentCleaner::do_gc: addr {} dead, skipping", + addr); + return ExtentCallbackInterface::rewrite_extent_ertr::now(); + } else { + logger().debug( + "SegmentCleaner::do_gc: addr {} alive, gc'ing {}", + addr, + *ext); + } + return ecb->rewrite_extent( + t, + ext); + }); + }).safe_then([&t, this] { + if (scan_cursor->is_complete()) { + t.mark_segment_to_release(scan_cursor->get_offset().segment); + scan_cursor.reset(); + } + return ExtentCallbackInterface::release_segment_ertr::now(); + }); + }); + }); +} + +} diff --git a/src/crimson/os/seastore/segment_cleaner.h b/src/crimson/os/seastore/segment_cleaner.h new file mode 100644 index 000000000..38ebd05bc --- /dev/null +++ b/src/crimson/os/seastore/segment_cleaner.h @@ -0,0 +1,691 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/intrusive/set.hpp> + +#include "common/ceph_time.h" + +#include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/journal.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/segment_manager.h" + +namespace crimson::os::seastore { +class Transaction; + +struct segment_info_t { + Segment::segment_state_t state = Segment::segment_state_t::EMPTY; + + // Will be non-null for any segments in the current journal + segment_seq_t journal_segment_seq = NULL_SEG_SEQ; + + + bool is_in_journal(journal_seq_t tail_committed) const { + return journal_segment_seq != NULL_SEG_SEQ && + tail_committed.segment_seq <= journal_segment_seq; + } + + bool is_empty() const { + return state == Segment::segment_state_t::EMPTY; + } + + bool is_closed() const { + return state == Segment::segment_state_t::CLOSED; + } + + bool is_open() const { + return state == Segment::segment_state_t::OPEN; + } +}; + +class SpaceTrackerI { +public: + virtual int64_t allocate( + segment_id_t segment, + segment_off_t offset, + extent_len_t len) = 0; + + virtual int64_t release( + segment_id_t segment, + segment_off_t offset, + extent_len_t len) = 0; + + virtual int64_t get_usage( + segment_id_t segment) const = 0; + + virtual bool equals(const SpaceTrackerI &other) const = 0; + + virtual std::unique_ptr<SpaceTrackerI> make_empty() const = 0; + + virtual void dump_usage(segment_id_t) const = 0; + + virtual void reset() = 0; + + virtual ~SpaceTrackerI() = default; +}; +using SpaceTrackerIRef = std::unique_ptr<SpaceTrackerI>; + +class SpaceTrackerSimple : public SpaceTrackerI { + // Tracks live space for each segment + std::vector<int64_t> live_bytes_by_segment; + + int64_t update_usage(segment_id_t segment, int64_t delta) { + assert(segment < live_bytes_by_segment.size()); + live_bytes_by_segment[segment] += delta; + assert(live_bytes_by_segment[segment] >= 0); + return live_bytes_by_segment[segment]; + } +public: + SpaceTrackerSimple(size_t num_segments) + : live_bytes_by_segment(num_segments, 0) {} + + int64_t allocate( + segment_id_t segment, + segment_off_t offset, + extent_len_t len) final { + return update_usage(segment, len); + } + + int64_t release( + segment_id_t segment, + segment_off_t offset, + extent_len_t len) final { + return update_usage(segment, -len); + } + + int64_t get_usage(segment_id_t segment) const final { + assert(segment < live_bytes_by_segment.size()); + return live_bytes_by_segment[segment]; + } + + void dump_usage(segment_id_t) const final {} + + void reset() final { + for (auto &i: live_bytes_by_segment) + i = 0; + } + + SpaceTrackerIRef make_empty() const final { + return 
SpaceTrackerIRef( + new SpaceTrackerSimple(live_bytes_by_segment.size())); + } + + bool equals(const SpaceTrackerI &other) const; +}; + +class SpaceTrackerDetailed : public SpaceTrackerI { + class SegmentMap { + int64_t used = 0; + std::vector<bool> bitmap; + + public: + SegmentMap(size_t blocks) : bitmap(blocks, false) {} + + int64_t update_usage(int64_t delta) { + used += delta; + return used; + } + + int64_t allocate( + segment_id_t segment, + segment_off_t offset, + extent_len_t len, + const extent_len_t block_size); + + int64_t release( + segment_id_t segment, + segment_off_t offset, + extent_len_t len, + const extent_len_t block_size); + + int64_t get_usage() const { + return used; + } + + void dump_usage(extent_len_t block_size) const; + + void reset() { + used = 0; + for (auto &&i: bitmap) { + i = false; + } + } + }; + const size_t block_size; + const size_t segment_size; + + // Tracks live space for each segment + std::vector<SegmentMap> segment_usage; + +public: + SpaceTrackerDetailed(size_t num_segments, size_t segment_size, size_t block_size) + : block_size(block_size), + segment_size(segment_size), + segment_usage(num_segments, segment_size / block_size) {} + + int64_t allocate( + segment_id_t segment, + segment_off_t offset, + extent_len_t len) final { + assert(segment < segment_usage.size()); + return segment_usage[segment].allocate(segment, offset, len, block_size); + } + + int64_t release( + segment_id_t segment, + segment_off_t offset, + extent_len_t len) final { + assert(segment < segment_usage.size()); + return segment_usage[segment].release(segment, offset, len, block_size); + } + + int64_t get_usage(segment_id_t segment) const final { + assert(segment < segment_usage.size()); + return segment_usage[segment].get_usage(); + } + + void dump_usage(segment_id_t seg) const final; + + void reset() final { + for (auto &i: segment_usage) + i.reset(); + } + + SpaceTrackerIRef make_empty() const final { + return SpaceTrackerIRef( + new SpaceTrackerDetailed( + segment_usage.size(), + segment_size, + block_size)); + } + + bool equals(const SpaceTrackerI &other) const; +}; + + +class SegmentCleaner : public JournalSegmentProvider { +public: + /// Config + struct config_t { + size_t num_segments = 0; + size_t segment_size = 0; + size_t block_size = 0; + size_t target_journal_segments = 0; + size_t max_journal_segments = 0; + + double reclaim_ratio_hard_limit = 0; + // don't apply reclaim ratio with available space below this + double reclaim_ratio_usage_min = 0; + + double available_ratio_hard_limit = 0; + + static config_t default_from_segment_manager( + SegmentManager &manager) { + return config_t{ + manager.get_num_segments(), + static_cast<size_t>(manager.get_segment_size()), + (size_t)manager.get_block_size(), + 2, + 4, + .5, + .95, + .2 + }; + } + }; + + /// Callback interface for querying and operating on segments + class ExtentCallbackInterface { + public: + virtual ~ExtentCallbackInterface() = default; + /** + * get_next_dirty_extent + * + * returns all extents with dirty_from < bound + */ + using get_next_dirty_extents_ertr = crimson::errorator<>; + using get_next_dirty_extents_ret = get_next_dirty_extents_ertr::future< + std::vector<CachedExtentRef>>; + virtual get_next_dirty_extents_ret get_next_dirty_extents( + journal_seq_t bound ///< [in] return extents with dirty_from < bound + ) = 0; + + /** + * rewrite_extent + * + * Updates t with operations moving the passed extents to a new + * segment. 
extent may be invalid, implementation must correctly + * handle finding the current instance if it is still alive and + * otherwise ignore it. + */ + using rewrite_extent_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using rewrite_extent_ret = rewrite_extent_ertr::future<>; + virtual rewrite_extent_ret rewrite_extent( + Transaction &t, + CachedExtentRef extent) = 0; + + /** + * get_extent_if_live + * + * Returns extent at specified location if still referenced by + * lba_manager and not removed by t. + * + * See TransactionManager::get_extent_if_live and + * LBAManager::get_physical_extent_if_live. + */ + using get_extent_if_live_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using get_extent_if_live_ret = get_extent_if_live_ertr::future< + CachedExtentRef>; + virtual get_extent_if_live_ret get_extent_if_live( + Transaction &t, + extent_types_t type, + paddr_t addr, + laddr_t laddr, + segment_off_t len) = 0; + + /** + * scan_extents + * + * Interface shim for Journal::scan_extents + */ + using scan_extents_cursor = Journal::scan_valid_records_cursor; + using scan_extents_ertr = Journal::scan_extents_ertr; + using scan_extents_ret = Journal::scan_extents_ret; + virtual scan_extents_ret scan_extents( + scan_extents_cursor &cursor, + extent_len_t bytes_to_read) = 0; + + /** + * release_segment + * + * Release segment. + */ + using release_segment_ertr = SegmentManager::release_ertr; + using release_segment_ret = release_segment_ertr::future<>; + virtual release_segment_ret release_segment( + segment_id_t id) = 0; + }; + +private: + const config_t config; + + SpaceTrackerIRef space_tracker; + std::vector<segment_info_t> segments; + size_t empty_segments; + int64_t used_bytes = 0; + bool init_complete = false; + + journal_seq_t journal_tail_target; + journal_seq_t journal_tail_committed; + journal_seq_t journal_head; + + ExtentCallbackInterface *ecb = nullptr; + +public: + SegmentCleaner(config_t config, bool detailed = false) + : config(config), + space_tracker( + detailed ? 
+ (SpaceTrackerI*)new SpaceTrackerDetailed( + config.num_segments, + config.segment_size, + config.block_size) : + (SpaceTrackerI*)new SpaceTrackerSimple( + config.num_segments)), + segments(config.num_segments), + empty_segments(config.num_segments) {} + + get_segment_ret get_segment() final; + + void close_segment(segment_id_t segment) final; + + void set_journal_segment( + segment_id_t segment, segment_seq_t seq) final { + assert(segment < segments.size()); + segments[segment].journal_segment_seq = seq; + assert(segments[segment].is_open()); + } + + journal_seq_t get_journal_tail_target() const final { + return journal_tail_target; + } + + void update_journal_tail_committed(journal_seq_t committed) final; + + void update_journal_tail_target(journal_seq_t target); + + void init_journal_tail(journal_seq_t tail) { + journal_tail_target = journal_tail_committed = tail; + } + + void set_journal_head(journal_seq_t head) { + assert(journal_head == journal_seq_t() || head >= journal_head); + journal_head = head; + } + + void init_mark_segment_closed(segment_id_t segment, segment_seq_t seq) final { + crimson::get_logger(ceph_subsys_filestore).debug( + "SegmentCleaner::init_mark_segment_closed: segment {}, seq {}", + segment, + seq); + mark_closed(segment); + segments[segment].journal_segment_seq = seq; + } + + segment_seq_t get_seq(segment_id_t id) final { + return segments[id].journal_segment_seq; + } + + void mark_segment_released(segment_id_t segment) { + return mark_empty(segment); + } + + void mark_space_used( + paddr_t addr, + extent_len_t len, + bool init_scan = false) { + assert(addr.segment < segments.size()); + + if (!init_scan && !init_complete) + return; + + if (!init_scan) { + assert(segments[addr.segment].state == Segment::segment_state_t::OPEN); + } + + used_bytes += len; + [[maybe_unused]] auto ret = space_tracker->allocate( + addr.segment, + addr.offset, + len); + assert(ret > 0); + } + + void mark_space_free( + paddr_t addr, + extent_len_t len) { + if (!init_complete) + return; + + used_bytes -= len; + assert(addr.segment < segments.size()); + + [[maybe_unused]] auto ret = space_tracker->release( + addr.segment, + addr.offset, + len); + assert(ret >= 0); + } + + segment_id_t get_next_gc_target() const { + segment_id_t ret = NULL_SEG_ID; + int64_t least_live_bytes = std::numeric_limits<int64_t>::max(); + for (segment_id_t i = 0; i < segments.size(); ++i) { + if (segments[i].is_closed() && + !segments[i].is_in_journal(journal_tail_committed) && + space_tracker->get_usage(i) < least_live_bytes) { + ret = i; + least_live_bytes = space_tracker->get_usage(i); + } + } + if (ret != NULL_SEG_ID) { + crimson::get_logger(ceph_subsys_filestore).debug( + "SegmentCleaner::get_next_gc_target: segment {} seq {}", + ret, + segments[ret].journal_segment_seq); + } + return ret; + } + + SpaceTrackerIRef get_empty_space_tracker() const { + return space_tracker->make_empty(); + } + + void complete_init() { init_complete = true; } + + void set_extent_callback(ExtentCallbackInterface *cb) { + ecb = cb; + } + + bool debug_check_space(const SpaceTrackerI &tracker) { + return space_tracker->equals(tracker); + } + + /** + * do_immediate_work + * + * Should be invoked prior to submission of any transaction, + * will piggy-back work required to maintain deferred work + * constraints. 
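Both the immediate and deferred cleaning paths ultimately work on the segment chosen by get_next_gc_target() above, whose policy depends only on segment state, journal membership and the space tracker's live-byte counts. The following is a standalone sketch of that victim-selection policy in plain C++; seg_info and pick_gc_victim are illustrative stand-ins for segment_info_t plus the tracker, not seastore types.

#include <cstdint>
#include <cstdio>
#include <limits>
#include <vector>

enum class seg_state { empty, open, closed };

struct seg_info {
  seg_state state = seg_state::empty;
  bool in_journal = false;   // still covered by the journal tail
  int64_t live_bytes = 0;    // as reported by the space tracker
};

// Among closed segments no longer needed by the journal, pick the one with
// the least live data: it needs the fewest extent rewrites to reclaim.
int pick_gc_victim(const std::vector<seg_info> &segs) {
  int victim = -1;
  int64_t least = std::numeric_limits<int64_t>::max();
  for (int i = 0; i < static_cast<int>(segs.size()); ++i) {
    const auto &s = segs[i];
    if (s.state == seg_state::closed && !s.in_journal && s.live_bytes < least) {
      victim = i;
      least = s.live_bytes;
    }
  }
  return victim;  // -1 if nothing is eligible yet
}

int main() {
  std::vector<seg_info> segs = {
    {seg_state::closed, false, 1 << 20},
    {seg_state::closed, true,  0},        // journal segment, never a victim
    {seg_state::closed, false, 4 << 10},  // cheapest to clean
    {seg_state::open,   false, 0},
  };
  std::printf("victim segment: %d\n", pick_gc_victim(segs));
  return 0;
}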
+ */ + using do_immediate_work_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using do_immediate_work_ret = do_immediate_work_ertr::future<>; + do_immediate_work_ret do_immediate_work( + Transaction &t); + + + /** + * do_deferred_work + * + * Should be called at idle times -- will perform background + * operations based on deferred work constraints. + * + * If returned timespan is non-zero, caller should pause calling + * back into do_deferred_work before returned timespan has elapsed, + * or a foreground operation occurs. + */ + using do_deferred_work_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using do_deferred_work_ret = do_deferred_work_ertr::future< + ceph::timespan + >; + do_deferred_work_ret do_deferred_work( + Transaction &t); + +private: + + // journal status helpers + + /** + * rewrite_dirty + * + * Writes out dirty blocks dirtied earlier than limit. + */ + using rewrite_dirty_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using rewrite_dirty_ret = rewrite_dirty_ertr::future<>; + rewrite_dirty_ret rewrite_dirty( + Transaction &t, + journal_seq_t limit); + + journal_seq_t get_dirty_tail() const { + auto ret = journal_head; + ret.segment_seq -= std::min( + static_cast<size_t>(ret.segment_seq), + config.target_journal_segments); + return ret; + } + + journal_seq_t get_dirty_tail_limit() const { + auto ret = journal_head; + ret.segment_seq -= std::min( + static_cast<size_t>(ret.segment_seq), + config.max_journal_segments); + return ret; + } + + // GC status helpers + std::unique_ptr<ExtentCallbackInterface::scan_extents_cursor> scan_cursor; + + /** + * do_gc + * + * Performs bytes worth of gc work on t. + */ + using do_gc_ertr = SegmentManager::read_ertr; + using do_gc_ret = do_gc_ertr::future<>; + do_gc_ret do_gc( + Transaction &t, + size_t bytes); + + size_t get_bytes_used_current_segment() const { + assert(journal_head != journal_seq_t()); + return journal_head.offset.offset; + } + + size_t get_bytes_available_current_segment() const { + return config.segment_size - get_bytes_used_current_segment(); + } + + /** + * get_bytes_scanned_current_segment + * + * Returns the number of bytes from the current gc segment that + * have been scanned. + */ + size_t get_bytes_scanned_current_segment() const { + if (!scan_cursor) + return 0; + + return scan_cursor->get_offset().offset; + } + + size_t get_available_bytes() const { + return (empty_segments * config.segment_size) + + get_bytes_available_current_segment() + + get_bytes_scanned_current_segment(); + } + + size_t get_total_bytes() const { + return config.segment_size * config.num_segments; + } + + size_t get_unavailable_bytes() const { + return get_total_bytes() - get_available_bytes(); + } + + /// Returns bytes currently occupied by live extents (not journal) + size_t get_used_bytes() const { + return used_bytes; + } + + /// Returns the number of bytes in unavailable segments that are not live + size_t get_reclaimable_bytes() const { + return get_unavailable_bytes() - get_used_bytes(); + } + + /** + * get_reclaim_ratio + * + * Returns the ratio of unavailable space that is not currently used. 
+ */ + double get_reclaim_ratio() const { + if (get_unavailable_bytes() == 0) return 0; + return (double)get_reclaimable_bytes() / (double)get_unavailable_bytes(); + } + + /** + * get_available_ratio + * + * Returns ratio of available space to write to total space + */ + double get_available_ratio() const { + return (double)get_available_bytes() / (double)get_total_bytes(); + } + + /** + * get_immediate_bytes_to_gc_for_reclaim + * + * Returns the number of bytes to gc in order to bring the + * reclaim ratio below reclaim_ratio_usage_min. + */ + size_t get_immediate_bytes_to_gc_for_reclaim() const { + if (get_reclaim_ratio() < config.reclaim_ratio_hard_limit) + return 0; + + const size_t unavailable_target = std::max( + get_used_bytes() / (1.0 - config.reclaim_ratio_hard_limit), + (1 - config.reclaim_ratio_usage_min) * get_total_bytes()); + + if (unavailable_target > get_unavailable_bytes()) + return 0; + + return (get_unavailable_bytes() - unavailable_target) / get_reclaim_ratio(); + } + + /** + * get_immediate_bytes_to_gc_for_available + * + * Returns the number of bytes to gc in order to bring the + * the ratio of available disk space to total disk space above + * available_ratio_hard_limit. + */ + size_t get_immediate_bytes_to_gc_for_available() const { + if (get_available_ratio() > config.available_ratio_hard_limit) { + return 0; + } + + const double ratio_to_make_available = config.available_ratio_hard_limit - + get_available_ratio(); + return ratio_to_make_available * (double)get_total_bytes() + / get_reclaim_ratio(); + } + + /** + * get_immediate_bytes_to_gc + * + * Returns number of bytes to gc in order to restore any strict + * limits. + */ + size_t get_immediate_bytes_to_gc() const { + // number of bytes to gc in order to correct reclaim ratio + size_t for_reclaim = get_immediate_bytes_to_gc_for_reclaim(); + + // number of bytes to gc in order to correct available_ratio + size_t for_available = get_immediate_bytes_to_gc_for_available(); + + return std::max(for_reclaim, for_available); + } + + void mark_closed(segment_id_t segment) { + assert(segments.size() > segment); + if (init_complete) { + assert(segments[segment].is_open()); + } else { + assert(segments[segment].is_empty()); + assert(empty_segments > 0); + --empty_segments; + } + crimson::get_logger(ceph_subsys_filestore).debug( + "mark_closed: empty_segments: {}", + empty_segments); + segments[segment].state = Segment::segment_state_t::CLOSED; + } + + void mark_empty(segment_id_t segment) { + assert(segments.size() > segment); + assert(segments[segment].is_closed()); + assert(segments.size() > empty_segments); + ++empty_segments; + if (space_tracker->get_usage(segment) != 0) { + space_tracker->dump_usage(segment); + assert(space_tracker->get_usage(segment) == 0); + } + segments[segment].state = Segment::segment_state_t::EMPTY; + } + + void mark_open(segment_id_t segment) { + assert(segments.size() > segment); + assert(segments[segment].is_empty()); + assert(empty_segments > 0); + --empty_segments; + segments[segment].state = Segment::segment_state_t::OPEN; + } +}; + +} diff --git a/src/crimson/os/seastore/segment_manager.h b/src/crimson/os/seastore/segment_manager.h new file mode 100644 index 000000000..61c6509d1 --- /dev/null +++ b/src/crimson/os/seastore/segment_manager.h @@ -0,0 +1,128 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iosfwd> + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> 
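The GC-pressure helpers above in segment_cleaner.h reduce to simple arithmetic over total, available and live bytes. The following standalone sketch reproduces the available-space side of that calculation with made-up numbers; the byte counts and the hard limit are illustrative, and only the formulas mirror get_reclaim_ratio(), get_available_ratio() and get_immediate_bytes_to_gc_for_available().

#include <cstdio>

int main() {
  // Hypothetical device: 64 segments of 8 MiB each.
  const double total_bytes       = 64.0 * (8 << 20);
  const double available_bytes   =  8.0 * (8 << 20);  // empty segments + current write segment
  const double used_bytes        = 30.0 * (8 << 20);  // live extent data
  const double unavailable_bytes = total_bytes - available_bytes;

  // Ratios as computed by get_reclaim_ratio() / get_available_ratio().
  const double reclaimable_bytes = unavailable_bytes - used_bytes;
  const double reclaim_ratio   = unavailable_bytes ? reclaimable_bytes / unavailable_bytes : 0.0;
  const double available_ratio = available_bytes / total_bytes;

  // Hard limit in the spirit of config_t::default_from_segment_manager().
  const double available_ratio_hard_limit = 0.2;

  // Bytes of segment data to scan so that available_ratio climbs back above
  // the limit, scaled by how much of each scanned byte is actually reclaimable.
  double gc_for_available = 0.0;
  if (available_ratio <= available_ratio_hard_limit) {
    gc_for_available =
      (available_ratio_hard_limit - available_ratio) * total_bytes / reclaim_ratio;
  }

  std::printf("reclaim_ratio=%.2f available_ratio=%.2f gc_for_available=%.0f bytes\n",
              reclaim_ratio, available_ratio, gc_for_available);
  return 0;
}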
+#include <seastar/core/future.hh> + +#include "include/ceph_assert.h" +#include "crimson/os/seastore/seastore_types.h" +#include "include/buffer_fwd.h" +#include "crimson/osd/exceptions.h" + +namespace crimson::os::seastore { + +class Segment : public boost::intrusive_ref_counter< + Segment, + boost::thread_unsafe_counter>{ +public: + + enum class segment_state_t : uint8_t { + EMPTY = 0, + OPEN = 1, + CLOSED = 2 + }; + + /** + * get_segment_id + */ + virtual segment_id_t get_segment_id() const = 0; + + /** + * min next write location + */ + virtual segment_off_t get_write_ptr() const = 0; + + /** + * max capacity + */ + virtual segment_off_t get_write_capacity() const = 0; + + /** + * close + * + * Closes segment for writes. Won't complete until + * outstanding writes to this segment are complete. + */ + using close_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::enoent>; + virtual close_ertr::future<> close() = 0; + + + /** + * write + * + * @param offset offset of write, must be aligned to <> and >= write pointer, advances + * write pointer + * @param bl buffer to write, will be padded if not aligned + */ + using write_ertr = crimson::errorator< + crimson::ct_error::input_output_error, // media error or corruption + crimson::ct_error::invarg, // if offset is < write pointer or misaligned + crimson::ct_error::ebadf, // segment closed + crimson::ct_error::enospc // write exceeds segment size + >; + virtual write_ertr::future<> write( + segment_off_t offset, ceph::bufferlist bl) = 0; + + virtual ~Segment() {} +}; +using SegmentRef = boost::intrusive_ptr<Segment>; + +constexpr size_t PADDR_SIZE = sizeof(paddr_t); + +class SegmentManager { +public: + using open_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::enoent>; + virtual open_ertr::future<SegmentRef> open(segment_id_t id) = 0; + + using release_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::enoent>; + virtual release_ertr::future<> release(segment_id_t id) = 0; + + using read_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::enoent, + crimson::ct_error::erange>; + virtual read_ertr::future<> read( + paddr_t addr, + size_t len, + ceph::bufferptr &out) = 0; + read_ertr::future<ceph::bufferptr> read( + paddr_t addr, + size_t len) { + auto ptrref = std::make_unique<ceph::bufferptr>( + buffer::create_page_aligned(len)); + return read(addr, len, *ptrref).safe_then( + [ptrref=std::move(ptrref)]() mutable { + return read_ertr::make_ready_future<bufferptr>(std::move(*ptrref)); + }); + } + + /* Methods for discovering device geometry, segmentid set, etc */ + virtual size_t get_size() const = 0; + virtual segment_off_t get_block_size() const = 0; + virtual segment_off_t get_segment_size() const = 0; + virtual segment_id_t get_num_segments() const { + ceph_assert(get_size() % get_segment_size() == 0); + return ((segment_id_t)(get_size() / get_segment_size())); + } + virtual const seastore_meta_t &get_meta() const = 0; + + virtual ~SegmentManager() {} +}; +using SegmentManagerRef = std::unique_ptr<SegmentManager>; + +} diff --git a/src/crimson/os/seastore/segment_manager/block.cc b/src/crimson/os/seastore/segment_manager/block.cc new file mode 100644 index 000000000..6a4991d42 --- /dev/null +++ b/src/crimson/os/seastore/segment_manager/block.cc @@ -0,0 +1,402 @@ +// -*- 
mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <sys/mman.h> +#include <string.h> + +#include "crimson/common/log.h" + +#include "include/buffer.h" +#include "crimson/os/seastore/segment_manager/block.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + + +namespace crimson::os::seastore::segment_manager::block { + +static write_ertr::future<> do_write( + seastar::file &device, + uint64_t offset, + bufferptr &bptr) +{ + logger().debug( + "block: do_write offset {} len {}", + offset, + bptr.length()); + return device.dma_write( + offset, + bptr.c_str(), + bptr.length() + ).handle_exception([](auto e) -> write_ertr::future<size_t> { + logger().error( + "do_write: dma_write got error {}", + e); + return crimson::ct_error::input_output_error::make(); + }).then([length=bptr.length()](auto result) + -> write_ertr::future<> { + if (result != length) { + return crimson::ct_error::input_output_error::make(); + } + return write_ertr::now(); + }); +} + +static read_ertr::future<> do_read( + seastar::file &device, + uint64_t offset, + bufferptr &bptr) +{ + logger().debug( + "block: do_read offset {} len {}", + offset, + bptr.length()); + return device.dma_read( + offset, + bptr.c_str(), + bptr.length() + ).handle_exception([](auto e) -> read_ertr::future<size_t> { + logger().error( + "do_read: dma_read got error {}", + e); + return crimson::ct_error::input_output_error::make(); + }).then([length=bptr.length()](auto result) -> read_ertr::future<> { + if (result != length) { + return crimson::ct_error::input_output_error::make(); + } + return read_ertr::now(); + }); +} + +write_ertr::future<> +SegmentStateTracker::write_out( + seastar::file &device, + uint64_t offset) +{ + return do_write(device, offset, bptr); +} + +write_ertr::future<> +SegmentStateTracker::read_in( + seastar::file &device, + uint64_t offset) +{ + return do_read( + device, + offset, + bptr); +} + +static +block_sm_superblock_t make_superblock( + const BlockSegmentManager::mkfs_config_t &config, + const seastar::stat_data &data) +{ + logger().debug( + "{}: size {}, block_size {}, allocated_size {}, configured_size {}", + __func__, + data.size, + data.block_size, + data.allocated_size, + config.total_size); + size_t size = (data.size == 0) ? 
config.total_size : data.size; + size_t raw_segments = size / config.segment_size; + size_t tracker_size = SegmentStateTracker::get_raw_size( + raw_segments, + data.block_size); + size_t segments = (size - tracker_size - data.block_size) + / config.segment_size; + return block_sm_superblock_t{ + size, + config.segment_size, + data.block_size, + segments, + data.block_size, + tracker_size + data.block_size, + config.meta + }; +} + +using open_device_ret = + BlockSegmentManager::access_ertr::future< + std::pair<seastar::file, seastar::stat_data> + >; +static +open_device_ret open_device(const std::string &in_path, seastar::open_flags mode) +{ + return seastar::do_with( + in_path, + [mode](auto &path) { + return seastar::file_stat(path, seastar::follow_symlink::yes + ).then([mode, &path](auto stat) mutable { + return seastar::open_file_dma(path, mode).then([=](auto file) { + logger().debug("open_device: open successful"); + return std::make_pair(file, stat); + }); + }).handle_exception([](auto e) -> open_device_ret { + logger().error( + "open_device: got error {}", + e); + return crimson::ct_error::input_output_error::make(); + }); + }); +} + + +static +BlockSegmentManager::access_ertr::future<> +write_superblock(seastar::file &device, block_sm_superblock_t sb) +{ + assert(ceph::encoded_sizeof_bounded<block_sm_superblock_t>() < + sb.block_size); + return seastar::do_with( + bufferptr(ceph::buffer::create_page_aligned(sb.block_size)), + [=, &device](auto &bp) { + bufferlist bl; + encode(sb, bl); + auto iter = bl.begin(); + assert(bl.length() < sb.block_size); + iter.copy(bl.length(), bp.c_str()); + logger().debug("write_superblock: doing writeout"); + return do_write(device, 0, bp); + }); +} + +static +BlockSegmentManager::access_ertr::future<block_sm_superblock_t> +read_superblock(seastar::file &device, seastar::stat_data sd) +{ + assert(ceph::encoded_sizeof_bounded<block_sm_superblock_t>() < + sd.block_size); + return seastar::do_with( + bufferptr(ceph::buffer::create_page_aligned(sd.block_size)), + [=, &device](auto &bp) { + return do_read( + device, + 0, + bp + ).safe_then([=, &bp] { + bufferlist bl; + bl.push_back(bp); + block_sm_superblock_t ret; + auto bliter = bl.cbegin(); + decode(ret, bliter); + return BlockSegmentManager::access_ertr::future<block_sm_superblock_t>( + BlockSegmentManager::access_ertr::ready_future_marker{}, + ret); + }); + }); +} + +BlockSegment::BlockSegment( + BlockSegmentManager &manager, segment_id_t id) + : manager(manager), id(id) {} + +segment_off_t BlockSegment::get_write_capacity() const +{ + return manager.get_segment_size(); +} + +Segment::close_ertr::future<> BlockSegment::close() +{ + manager.segment_close(id); + return close_ertr::now(); +} + +Segment::write_ertr::future<> BlockSegment::write( + segment_off_t offset, ceph::bufferlist bl) +{ + if (offset < write_pointer || offset % manager.superblock.block_size != 0) + return crimson::ct_error::invarg::make(); + + if (offset + bl.length() > manager.superblock.segment_size) + return crimson::ct_error::enospc::make(); + + write_pointer = offset + bl.length(); + return manager.segment_write({id, offset}, bl); +} + +Segment::close_ertr::future<> BlockSegmentManager::segment_close(segment_id_t id) +{ + assert(tracker); + tracker->set(id, segment_state_t::CLOSED); + return tracker->write_out(device, superblock.tracker_offset); +} + +Segment::write_ertr::future<> BlockSegmentManager::segment_write( + paddr_t addr, + ceph::bufferlist bl, + bool ignore_check) +{ + assert((bl.length() % superblock.block_size) == 
0); + logger().debug( + "segment_write to segment {} at offset {}, physical offset {}, len {}", + addr.segment, + addr.offset, + get_offset(addr), + bl.length()); + + + // TODO send an iovec and avoid the copy -- bl should have aligned + // constituent buffers and they will remain unmodified until the write + // completes + return seastar::do_with( + bufferptr(ceph::buffer::create_page_aligned(bl.length())), + [&](auto &bp) { + auto iter = bl.cbegin(); + iter.copy(bl.length(), bp.c_str()); + return do_write(device, get_offset(addr), bp); + }); +} + +BlockSegmentManager::~BlockSegmentManager() +{ +} + +BlockSegmentManager::mount_ret BlockSegmentManager::mount(mount_config_t config) +{ + return open_device( + config.path, seastar::open_flags::rw | seastar::open_flags::dsync + ).safe_then([=](auto p) { + device = std::move(p.first); + auto sd = p.second; + return read_superblock(device, sd); + }).safe_then([=](auto sb) { + superblock = sb; + tracker = std::make_unique<SegmentStateTracker>( + superblock.segments, + superblock.block_size); + return tracker->read_in( + device, + superblock.tracker_offset + ).safe_then([this] { + for (segment_id_t i = 0; i < tracker->get_capacity(); ++i) { + if (tracker->get(i) == segment_state_t::OPEN) { + tracker->set(i, segment_state_t::CLOSED); + } + } + return tracker->write_out(device, superblock.tracker_offset); + }); + }); +} + +BlockSegmentManager::mkfs_ret BlockSegmentManager::mkfs(mkfs_config_t config) +{ + return seastar::do_with( + seastar::file{}, + seastar::stat_data{}, + block_sm_superblock_t{}, + std::unique_ptr<SegmentStateTracker>(), + [=](auto &device, auto &stat, auto &sb, auto &tracker) { + return open_device( + config.path, seastar::open_flags::rw + ).safe_then([&, config](auto p) { + device = p.first; + stat = p.second; + sb = make_superblock(config, stat); + return write_superblock(device, sb); + }).safe_then([&] { + logger().debug("BlockSegmentManager::mkfs: superblock written"); + tracker.reset(new SegmentStateTracker(sb.segments, sb.block_size)); + return tracker->write_out(device, sb.tracker_offset); + }).finally([&] { + return device.close(); + }).safe_then([] { + logger().debug("BlockSegmentManager::mkfs: complete"); + return mkfs_ertr::now(); + }); + }); +} + +BlockSegmentManager::close_ertr::future<> BlockSegmentManager::close() +{ + return device.close(); +} + +SegmentManager::open_ertr::future<SegmentRef> BlockSegmentManager::open( + segment_id_t id) +{ + if (id >= get_num_segments()) { + logger().error("BlockSegmentManager::open: invalid segment {}", id); + return crimson::ct_error::invarg::make(); + } + + if (tracker->get(id) != segment_state_t::EMPTY) { + logger().error( + "BlockSegmentManager::open: invalid segment {} state {}", + id, + tracker->get(id)); + return crimson::ct_error::invarg::make(); + } + + tracker->set(id, segment_state_t::OPEN); + return tracker->write_out(device, superblock.tracker_offset + ).safe_then([this, id] { + return open_ertr::future<SegmentRef>( + open_ertr::ready_future_marker{}, + SegmentRef(new BlockSegment(*this, id))); + }); +} + +SegmentManager::release_ertr::future<> BlockSegmentManager::release( + segment_id_t id) +{ + logger().debug("BlockSegmentManager::release: {}", id); + + if (id >= get_num_segments()) { + logger().error( + "BlockSegmentManager::release: invalid segment {}", + id); + return crimson::ct_error::invarg::make(); + } + + if (tracker->get(id) != segment_state_t::CLOSED) { + logger().error( + "BlockSegmentManager::release: invalid segment {} state {}", + id, + 
tracker->get(id)); + return crimson::ct_error::invarg::make(); + } + + tracker->set(id, segment_state_t::EMPTY); + return tracker->write_out(device, superblock.tracker_offset); +} + +SegmentManager::read_ertr::future<> BlockSegmentManager::read( + paddr_t addr, + size_t len, + ceph::bufferptr &out) +{ + if (addr.segment >= get_num_segments()) { + logger().error( + "BlockSegmentManager::read: invalid segment {}", + addr); + return crimson::ct_error::invarg::make(); + } + + if (addr.offset + len > superblock.segment_size) { + logger().error( + "BlockSegmentManager::read: invalid offset {}~{}!", + addr, + len); + return crimson::ct_error::invarg::make(); + } + + if (tracker->get(addr.segment) == segment_state_t::EMPTY) { + logger().error( + "BlockSegmentManager::read: read on invalid segment {} state {}", + addr.segment, + tracker->get(addr.segment)); + return crimson::ct_error::enoent::make(); + } + + return do_read( + device, + get_offset(addr), + out); +} + +} diff --git a/src/crimson/os/seastore/segment_manager/block.h b/src/crimson/os/seastore/segment_manager/block.h new file mode 100644 index 000000000..927b13e4e --- /dev/null +++ b/src/crimson/os/seastore/segment_manager/block.h @@ -0,0 +1,222 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> + +#include <seastar/core/file.hh> +#include <seastar/core/future.hh> +#include <seastar/core/reactor.hh> + +#include "crimson/common/layout.h" + +#include "crimson/os/seastore/segment_manager.h" + +namespace crimson::os::seastore::segment_manager::block { + +struct block_sm_superblock_t { + size_t size = 0; + size_t segment_size = 0; + size_t block_size = 0; + + size_t segments = 0; + uint64_t tracker_offset = 0; + uint64_t first_segment_offset = 0; + + seastore_meta_t meta; + + DENC(block_sm_superblock_t, v, p) { + DENC_START(1, 1, p); + denc(v.size, p); + denc(v.segment_size, p); + denc(v.block_size, p); + denc(v.segments, p); + denc(v.tracker_offset, p); + denc(v.first_segment_offset, p); + denc(v.meta, p); + DENC_FINISH(p); + } +}; + +using write_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; +using read_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + +/** + * SegmentStateTracker + * + * Tracks lifecycle state of each segment using space at the beginning + * of the drive. 
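That reserved region follows the single superblock block at offset zero, and make_superblock() in block.cc derives the tracker size, usable segment count and first_segment_offset from the device geometry. Below is a standalone sketch of the same layout arithmetic, with hypothetical geometry in place of the seastar::file_stat data and mkfs_config_t values; round_up stands in for the p2roundup used by the real code.

#include <cstdint>
#include <cstdio>

// Round x up to a multiple of align (the real code uses p2roundup and a
// power-of-two block size; plain arithmetic keeps the sketch simple).
static uint64_t round_up(uint64_t x, uint64_t align) {
  return ((x + align - 1) / align) * align;
}

int main() {
  // Hypothetical device geometry.
  const uint64_t size         = 10ull << 30;  // 10 GiB block device
  const uint64_t segment_size = 64ull << 20;  // 64 MiB segments
  const uint64_t block_size   = 4096;         // device block size

  // One state byte per segment, padded out to whole blocks.
  const uint64_t raw_segments = size / segment_size;
  const uint64_t tracker_size = round_up(raw_segments, block_size);

  // Layout: [superblock block][state tracker][segment 0][segment 1]...
  const uint64_t tracker_offset       = block_size;
  const uint64_t first_segment_offset = tracker_offset + tracker_size;
  const uint64_t segments = (size - tracker_size - block_size) / segment_size;

  std::printf("segments=%llu tracker_size=%llu first_segment_offset=%llu\n",
              static_cast<unsigned long long>(segments),
              static_cast<unsigned long long>(tracker_size),
              static_cast<unsigned long long>(first_segment_offset));
  return 0;
}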
+ */ +class SegmentStateTracker { + using segment_state_t = Segment::segment_state_t; + + bufferptr bptr; + + using L = absl::container_internal::Layout<uint8_t>; + const L layout; + +public: + static size_t get_raw_size(size_t segments, size_t block_size) { + return p2roundup(segments, block_size); + } + + SegmentStateTracker(size_t segments, size_t block_size) + : bptr(ceph::buffer::create_page_aligned( + get_raw_size(segments, block_size))), + layout(bptr.length()) + { + ::memset( + bptr.c_str(), + static_cast<char>(segment_state_t::EMPTY), + bptr.length()); + } + + size_t get_size() const { + return bptr.length(); + } + + size_t get_capacity() const { + return bptr.length(); + } + + segment_state_t get(segment_id_t offset) const { + assert(offset < get_capacity()); + return static_cast<segment_state_t>( + layout.template Pointer<0>( + bptr.c_str())[offset]); + } + + void set(segment_id_t offset, segment_state_t state) { + assert(offset < get_capacity()); + layout.template Pointer<0>(bptr.c_str())[offset] = + static_cast<uint8_t>(state); + } + + write_ertr::future<> write_out( + seastar::file &device, + uint64_t offset); + + read_ertr::future<> read_in( + seastar::file &device, + uint64_t offset); +}; + +class BlockSegmentManager; +class BlockSegment final : public Segment { + friend class BlockSegmentManager; + BlockSegmentManager &manager; + const segment_id_t id; + segment_off_t write_pointer = 0; +public: + BlockSegment(BlockSegmentManager &manager, segment_id_t id); + + segment_id_t get_segment_id() const final { return id; } + segment_off_t get_write_capacity() const final; + segment_off_t get_write_ptr() const final { return write_pointer; } + close_ertr::future<> close() final; + write_ertr::future<> write(segment_off_t offset, ceph::bufferlist bl) final; + + ~BlockSegment() {} +}; + +/** + * BlockSegmentManager + * + * Implements SegmentManager on a conventional block device. + * SegmentStateTracker uses space at the start of the device to store + * state analagous to that of the segments of a zns device. 
+ */ +class BlockSegmentManager final : public SegmentManager { +public: + using access_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::permission_denied, + crimson::ct_error::enoent>; + + + struct mount_config_t { + std::string path; + }; + using mount_ertr = access_ertr; + using mount_ret = access_ertr::future<>; + mount_ret mount(mount_config_t); + + struct mkfs_config_t { + std::string path; + size_t segment_size = 0; + size_t total_size = 0; + seastore_meta_t meta; + }; + using mkfs_ertr = access_ertr; + using mkfs_ret = mkfs_ertr::future<>; + static mkfs_ret mkfs(mkfs_config_t); + + using close_ertr = crimson::errorator< + crimson::ct_error::input_output_error + >; + close_ertr::future<> close(); + + BlockSegmentManager() = default; + ~BlockSegmentManager(); + + open_ertr::future<SegmentRef> open(segment_id_t id) final; + + release_ertr::future<> release(segment_id_t id) final; + + read_ertr::future<> read( + paddr_t addr, + size_t len, + ceph::bufferptr &out) final; + + size_t get_size() const final { + return superblock.size; + } + segment_off_t get_block_size() const { + return superblock.block_size; + } + segment_off_t get_segment_size() const { + return superblock.segment_size; + } + + // public so tests can bypass segment interface when simpler + Segment::write_ertr::future<> segment_write( + paddr_t addr, + ceph::bufferlist bl, + bool ignore_check=false); + +private: + friend class BlockSegment; + using segment_state_t = Segment::segment_state_t; + + + std::unique_ptr<SegmentStateTracker> tracker; + block_sm_superblock_t superblock; + seastar::file device; + + size_t get_offset(paddr_t addr) { + return superblock.first_segment_offset + + (addr.segment * superblock.segment_size) + + addr.offset; + } + + const seastore_meta_t &get_meta() const { + return superblock.meta; + } + + std::vector<segment_state_t> segment_state; + + char *buffer = nullptr; + + Segment::close_ertr::future<> segment_close(segment_id_t id); +}; + +} + +WRITE_CLASS_DENC_BOUNDED( + crimson::os::seastore::segment_manager::block::block_sm_superblock_t +) + diff --git a/src/crimson/os/seastore/segment_manager/ephemeral.cc b/src/crimson/os/seastore/segment_manager/ephemeral.cc new file mode 100644 index 000000000..3250303ad --- /dev/null +++ b/src/crimson/os/seastore/segment_manager/ephemeral.cc @@ -0,0 +1,226 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <sys/mman.h> +#include <string.h> + +#include "seastar/core/sleep.hh" + +#include "crimson/common/log.h" + +#include "include/buffer.h" +#include "crimson/os/seastore/segment_manager/ephemeral.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + +namespace crimson::os::seastore::segment_manager { + +std::ostream &operator<<(std::ostream &lhs, const ephemeral_config_t &c) { + return lhs << "ephemeral_config_t(size=" << c.size << ", block_size=" << c.block_size + << ", segment_size=" << c.segment_size << ")"; +} + +EphemeralSegmentManagerRef create_test_ephemeral() { + return EphemeralSegmentManagerRef( + new EphemeralSegmentManager(DEFAULT_TEST_EPHEMERAL)); +} + +EphemeralSegment::EphemeralSegment( + EphemeralSegmentManager &manager, segment_id_t id) + : manager(manager), id(id) {} + +segment_off_t EphemeralSegment::get_write_capacity() const +{ + return manager.get_segment_size(); +} + +Segment::close_ertr::future<> EphemeralSegment::close() +{ + manager.segment_close(id); + return 
close_ertr::now().safe_then([] { + return seastar::sleep(std::chrono::milliseconds(1)); + }); +} + +Segment::write_ertr::future<> EphemeralSegment::write( + segment_off_t offset, ceph::bufferlist bl) +{ + if (offset < write_pointer || offset % manager.config.block_size != 0) + return crimson::ct_error::invarg::make(); + + if (offset + bl.length() > (size_t)manager.get_segment_size()) + return crimson::ct_error::enospc::make(); + + return manager.segment_write({id, offset}, bl); +} + +Segment::close_ertr::future<> EphemeralSegmentManager::segment_close(segment_id_t id) +{ + if (segment_state[id] != segment_state_t::OPEN) + return crimson::ct_error::invarg::make(); + + segment_state[id] = segment_state_t::CLOSED; + return Segment::close_ertr::now().safe_then([] { + return seastar::sleep(std::chrono::milliseconds(1)); + }); +} + +Segment::write_ertr::future<> EphemeralSegmentManager::segment_write( + paddr_t addr, + ceph::bufferlist bl, + bool ignore_check) +{ + logger().debug( + "segment_write to segment {} at offset {}, physical offset {}, len {}, crc {}", + addr.segment, + addr.offset, + get_offset(addr), + bl.length(), + bl.crc32c(1)); + if (!ignore_check && segment_state[addr.segment] != segment_state_t::OPEN) + return crimson::ct_error::invarg::make(); + + bl.begin().copy(bl.length(), buffer + get_offset(addr)); + return Segment::write_ertr::now().safe_then([] { + return seastar::sleep(std::chrono::milliseconds(1)); + }); +} + +EphemeralSegmentManager::init_ertr::future<> EphemeralSegmentManager::init() +{ + logger().debug( + "Initing ephemeral segment manager with config {}", + config); + + meta = seastore_meta_t{}; + + if (config.block_size % (4<<10) != 0) { + return crimson::ct_error::invarg::make(); + } + if (config.segment_size % config.block_size != 0) { + return crimson::ct_error::invarg::make(); + } + if (config.size % config.segment_size != 0) { + return crimson::ct_error::invarg::make(); + } + + auto addr = ::mmap( + nullptr, + config.size, + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, + -1, + 0); + + segment_state.resize(config.size / config.segment_size, segment_state_t::EMPTY); + + if (addr == MAP_FAILED) + return crimson::ct_error::enospc::make(); + + buffer = (char*)addr; + + ::memset(buffer, 0, config.size); + return init_ertr::now().safe_then([] { + return seastar::sleep(std::chrono::milliseconds(1)); + }); +} + +EphemeralSegmentManager::~EphemeralSegmentManager() +{ + if (buffer) { + ::munmap(buffer, config.size); + } +} + +void EphemeralSegmentManager::remount() +{ + for (auto &i : segment_state) { + if (i == Segment::segment_state_t::OPEN) + i = Segment::segment_state_t::CLOSED; + } +} + +SegmentManager::open_ertr::future<SegmentRef> EphemeralSegmentManager::open( + segment_id_t id) +{ + if (id >= get_num_segments()) { + logger().error("EphemeralSegmentManager::open: invalid segment {}", id); + return crimson::ct_error::invarg::make(); + } + + if (segment_state[id] != segment_state_t::EMPTY) { + logger().error("EphemeralSegmentManager::open: segment {} not empty", id); + return crimson::ct_error::invarg::make(); + } + + segment_state[id] = segment_state_t::OPEN; + return open_ertr::make_ready_future<SegmentRef>(new EphemeralSegment(*this, id)); +} + +SegmentManager::release_ertr::future<> EphemeralSegmentManager::release( + segment_id_t id) +{ + logger().debug("EphemeralSegmentManager::release: {}", id); + + if (id >= get_num_segments()) { + logger().error( + "EphemeralSegmentManager::release: invalid segment {}", + id); + return 
crimson::ct_error::invarg::make(); + } + + if (segment_state[id] != segment_state_t::CLOSED) { + logger().error( + "EphemeralSegmentManager::release: segment id {} not closed", + id); + return crimson::ct_error::invarg::make(); + } + + ::memset(buffer + get_offset({id, 0}), 0, config.segment_size); + segment_state[id] = segment_state_t::EMPTY; + return release_ertr::now().safe_then([] { + return seastar::sleep(std::chrono::milliseconds(1)); + }); +} + +SegmentManager::read_ertr::future<> EphemeralSegmentManager::read( + paddr_t addr, + size_t len, + ceph::bufferptr &out) +{ + if (addr.segment >= get_num_segments()) { + logger().error( + "EphemeralSegmentManager::read: invalid segment {}", + addr); + return crimson::ct_error::invarg::make(); + } + + if (addr.offset + len > config.segment_size) { + logger().error( + "EphemeralSegmentManager::read: invalid offset {}~{}!", + addr, + len); + return crimson::ct_error::invarg::make(); + } + + out.copy_in(0, len, buffer + get_offset(addr)); + + bufferlist bl; + bl.push_back(out); + logger().debug( + "segment_read to segment {} at offset {}, physical offset {}, length {}, crc {}", + addr.segment, + addr.offset, + get_offset(addr), + len, + bl.begin().crc32c(len, 1)); + + return read_ertr::now().safe_then([] { + return seastar::sleep(std::chrono::milliseconds(1)); + }); +} + +} diff --git a/src/crimson/os/seastore/segment_manager/ephemeral.h b/src/crimson/os/seastore/segment_manager/ephemeral.h new file mode 100644 index 000000000..9f19cb4d0 --- /dev/null +++ b/src/crimson/os/seastore/segment_manager/ephemeral.h @@ -0,0 +1,111 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> +#include <seastar/core/future.hh> + +#include "crimson/os/seastore/segment_manager.h" + +#include "crimson/os/seastore/segment_manager/ephemeral.h" + +namespace crimson::os::seastore::segment_manager { + +class EphemeralSegmentManager; +using EphemeralSegmentManagerRef = std::unique_ptr<EphemeralSegmentManager>; + +struct ephemeral_config_t { + size_t size = 0; + size_t block_size = 0; + size_t segment_size = 0; +}; + +constexpr ephemeral_config_t DEFAULT_TEST_EPHEMERAL = { + 1 << 30, + 4 << 10, + 8 << 20 +}; + +std::ostream &operator<<(std::ostream &, const ephemeral_config_t &); +EphemeralSegmentManagerRef create_test_ephemeral(); + +class EphemeralSegment final : public Segment { + friend class EphemeralSegmentManager; + EphemeralSegmentManager &manager; + const segment_id_t id; + segment_off_t write_pointer = 0; +public: + EphemeralSegment(EphemeralSegmentManager &manager, segment_id_t id); + + segment_id_t get_segment_id() const final { return id; } + segment_off_t get_write_capacity() const final; + segment_off_t get_write_ptr() const final { return write_pointer; } + close_ertr::future<> close() final; + write_ertr::future<> write(segment_off_t offset, ceph::bufferlist bl) final; + + ~EphemeralSegment() {} +}; + +class EphemeralSegmentManager final : public SegmentManager { + friend class EphemeralSegment; + using segment_state_t = Segment::segment_state_t; + + const ephemeral_config_t config; + std::optional<seastore_meta_t> meta; + + size_t get_offset(paddr_t addr) { + return (addr.segment * config.segment_size) + addr.offset; + } + + std::vector<segment_state_t> segment_state; + + char *buffer = nullptr; + + Segment::close_ertr::future<> segment_close(segment_id_t id); + +public: + 
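EphemeralSegmentManager::init() above (in ephemeral.cc) backs the whole test device with an anonymous mmap, zeroes it, and then services segment reads and writes as plain memory copies into that buffer. A self-contained sketch of the same pattern follows, using only POSIX mmap; the sizes and payload are illustrative and none of the seastar or seastore machinery is involved.

#include <sys/mman.h>
#include <cstdio>
#include <cstdlib>
#include <cstring>

int main() {
  const size_t size         = 1 << 26;  // 64 MiB fake device
  const size_t segment_size = 1 << 22;  // 4 MiB segments
  const size_t block_size   = 4 << 10;  // 4 KiB blocks

  // Same divisibility checks as EphemeralSegmentManager::init().
  if (block_size % (4 << 10) || segment_size % block_size || size % segment_size)
    return EXIT_FAILURE;

  void *addr = ::mmap(nullptr, size, PROT_READ | PROT_WRITE,
                      MAP_SHARED | MAP_ANONYMOUS, -1, 0);
  if (addr == MAP_FAILED)
    return EXIT_FAILURE;
  char *buffer = static_cast<char*>(addr);
  std::memset(buffer, 0, size);

  // A segment write is a memcpy at segment * segment_size + offset
  // (the real code additionally checks segment state and bounds).
  const size_t segment = 3, offset = 2 * block_size;
  const char payload[] = "seastore test payload";
  std::memcpy(buffer + segment * segment_size + offset, payload, sizeof(payload));

  // And a segment read copies the same range back out.
  char out[sizeof(payload)];
  std::memcpy(out, buffer + segment * segment_size + offset, sizeof(out));
  std::printf("read back: %s\n", out);

  ::munmap(addr, size);
  return 0;
}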
EphemeralSegmentManager(ephemeral_config_t config) : config(config) {} + ~EphemeralSegmentManager(); + + using init_ertr = crimson::errorator< + crimson::ct_error::enospc, + crimson::ct_error::invarg, + crimson::ct_error::erange>; + init_ertr::future<> init(); + + open_ertr::future<SegmentRef> open(segment_id_t id) final; + + release_ertr::future<> release(segment_id_t id) final; + + read_ertr::future<> read( + paddr_t addr, + size_t len, + ceph::bufferptr &out) final; + + size_t get_size() const final { + return config.size; + } + segment_off_t get_block_size() const final { + return config.block_size; + } + segment_off_t get_segment_size() const final { + return config.segment_size; + } + + const seastore_meta_t &get_meta() const final { + assert(meta); + return *meta; + } + + void remount(); + + // public so tests can bypass segment interface when simpler + Segment::write_ertr::future<> segment_write( + paddr_t addr, + ceph::bufferlist bl, + bool ignore_check=false); +}; + +} diff --git a/src/crimson/os/seastore/transaction.h b/src/crimson/os/seastore/transaction.h new file mode 100644 index 000000000..e189d1d32 --- /dev/null +++ b/src/crimson/os/seastore/transaction.h @@ -0,0 +1,145 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> + +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/root_block.h" + +namespace crimson::os::seastore { + +/** + * Transaction + * + * Representation of in-progress mutation. Used exclusively through Cache methods. + */ +class Transaction { +public: + using Ref = std::unique_ptr<Transaction>; + enum class get_extent_ret { + PRESENT, + ABSENT, + RETIRED + }; + get_extent_ret get_extent(paddr_t addr, CachedExtentRef *out) { + if (retired_set.count(addr)) { + return get_extent_ret::RETIRED; + } else if (auto iter = write_set.find_offset(addr); + iter != write_set.end()) { + if (out) + *out = CachedExtentRef(&*iter); + return get_extent_ret::PRESENT; + } else if ( + auto iter = read_set.find(addr); + iter != read_set.end()) { + if (out) + *out = CachedExtentRef(*iter); + return get_extent_ret::PRESENT; + } else { + return get_extent_ret::ABSENT; + } + } + + void add_to_retired_set(CachedExtentRef ref) { + ceph_assert(!is_weak()); + if (!ref->is_initial_pending()) { + // && retired_set.count(ref->get_paddr()) == 0 + // If it's already in the set, insert here will be a noop, + // which is what we want. 
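Transaction::get_extent() above resolves an address against the transaction's own bookkeeping in a fixed order: the retired set wins over this transaction's writes, which win over previously read extents, and only if none match does the caller fall back to the cache. A small standalone illustration of that precedence, using plain std::set keyed by an integer address as a stand-in for the paddr-indexed extent sets:

#include <cstdint>
#include <cstdio>
#include <set>

enum class lookup_result { PRESENT, ABSENT, RETIRED };

struct txn_state {
  std::set<uint64_t> retired, written, read;

  lookup_result get(uint64_t addr) const {
    if (retired.count(addr)) return lookup_result::RETIRED;  // retired wins
    if (written.count(addr)) return lookup_result::PRESENT;  // then this txn's writes
    if (read.count(addr))    return lookup_result::PRESENT;  // then prior reads
    return lookup_result::ABSENT;                            // otherwise ask the cache
  }
};

int main() {
  txn_state t;
  t.read.insert(100);
  t.written.insert(200);
  t.retired.insert(200);  // placed in both sets purely to show the precedence

  std::printf("100 -> %d, 200 -> %d, 300 -> %d\n",
              static_cast<int>(t.get(100)),
              static_cast<int>(t.get(200)),
              static_cast<int>(t.get(300)));
  return 0;
}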
+ retired_set.insert(ref); + } else { + ref->state = CachedExtent::extent_state_t::INVALID; + } + if (ref->is_pending()) { + write_set.erase(*ref); + } + } + + void add_to_read_set(CachedExtentRef ref) { + if (is_weak()) return; + + ceph_assert(read_set.count(ref) == 0); + read_set.insert(ref); + } + + void add_fresh_extent(CachedExtentRef ref) { + ceph_assert(!is_weak()); + fresh_block_list.push_back(ref); + ref->set_paddr(make_record_relative_paddr(offset)); + offset += ref->get_length(); + write_set.insert(*ref); + } + + void add_mutated_extent(CachedExtentRef ref) { + ceph_assert(!is_weak()); + mutated_block_list.push_back(ref); + write_set.insert(*ref); + } + + void mark_segment_to_release(segment_id_t segment) { + assert(to_release == NULL_SEG_ID); + to_release = segment; + } + + segment_id_t get_segment_to_release() const { + return to_release; + } + + const auto &get_fresh_block_list() { + return fresh_block_list; + } + + const auto &get_mutated_block_list() { + return mutated_block_list; + } + + const auto &get_retired_set() { + return retired_set; + } + + bool is_weak() const { + return weak; + } + +private: + friend class Cache; + friend Ref make_transaction(); + friend Ref make_weak_transaction(); + + /** + * If set, *this may not be used to perform writes and will not provide + * consistentency allowing operations using to avoid maintaining a read_set. + */ + const bool weak; + + RootBlockRef root; ///< ref to root if read or written by transaction + + segment_off_t offset = 0; ///< relative offset of next block + + pextent_set_t read_set; ///< set of extents read by paddr + ExtentIndex write_set; ///< set of extents written by paddr + + std::list<CachedExtentRef> fresh_block_list; ///< list of fresh blocks + std::list<CachedExtentRef> mutated_block_list; ///< list of mutated blocks + + pextent_set_t retired_set; ///< list of extents mutated by this transaction + + ///< if != NULL_SEG_ID, release this segment after completion + segment_id_t to_release = NULL_SEG_ID; + + Transaction(bool weak) : weak(weak) {} +}; +using TransactionRef = Transaction::Ref; + +inline TransactionRef make_transaction() { + return std::unique_ptr<Transaction>(new Transaction(false)); +} + +inline TransactionRef make_weak_transaction() { + return std::unique_ptr<Transaction>(new Transaction(true)); +} + +} diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc new file mode 100644 index 000000000..7b86631e2 --- /dev/null +++ b/src/crimson/os/seastore/transaction_manager.cc @@ -0,0 +1,306 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "include/denc.h" +#include "include/intarith.h" + +#include "crimson/common/log.h" + +#include "crimson/os/seastore/transaction_manager.h" +#include "crimson/os/seastore/segment_manager.h" +#include "crimson/os/seastore/journal.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + +namespace crimson::os::seastore { + +TransactionManager::TransactionManager( + SegmentManager &segment_manager, + SegmentCleaner &segment_cleaner, + Journal &journal, + Cache &cache, + LBAManager &lba_manager) + : segment_manager(segment_manager), + segment_cleaner(segment_cleaner), + cache(cache), + lba_manager(lba_manager), + journal(journal) +{} + +TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs() +{ + return journal.open_for_write().safe_then([this](auto addr) { + 
logger().debug("TransactionManager::mkfs: about to do_with"); + segment_cleaner.set_journal_head(addr); + return seastar::do_with( + create_transaction(), + [this](auto &transaction) { + logger().debug("TransactionManager::mkfs: about to cache.mkfs"); + cache.init(); + return cache.mkfs(*transaction + ).safe_then([this, &transaction] { + return lba_manager.mkfs(*transaction); + }).safe_then([this, &transaction] { + logger().debug("TransactionManager::mkfs: about to submit_transaction"); + return submit_transaction(std::move(transaction)).handle_error( + crimson::ct_error::eagain::handle([] { + ceph_assert(0 == "eagain impossible"); + return mkfs_ertr::now(); + }), + mkfs_ertr::pass_further{} + ); + }); + }); + }).safe_then([this] { + return journal.close(); + }); +} + +TransactionManager::mount_ertr::future<> TransactionManager::mount() +{ + cache.init(); + return journal.replay([this](auto seq, auto paddr, const auto &e) { + return cache.replay_delta(seq, paddr, e); + }).safe_then([this] { + return journal.open_for_write(); + }).safe_then([this](auto addr) { + segment_cleaner.set_journal_head(addr); + return seastar::do_with( + make_weak_transaction(), + [this](auto &t) { + return cache.init_cached_extents(*t, [this](auto &t, auto &e) { + return lba_manager.init_cached_extent(t, e); + }).safe_then([this, &t] { + assert(segment_cleaner.debug_check_space( + *segment_cleaner.get_empty_space_tracker())); + return lba_manager.scan_mapped_space( + *t, + [this](paddr_t addr, extent_len_t len) { + logger().debug("TransactionManager::mount: marking {}~{} used", + addr, + len); + segment_cleaner.mark_space_used( + addr, + len , + /* init_scan = */ true); + }); + }); + }); + }).safe_then([this] { + segment_cleaner.complete_init(); + }).handle_error( + mount_ertr::pass_further{}, + crimson::ct_error::all_same_way([] { + ceph_assert(0 == "unhandled error"); + return mount_ertr::now(); + })); +} + +TransactionManager::close_ertr::future<> TransactionManager::close() { + return cache.close( + ).safe_then([this] { + return journal.close(); + }); +} + +TransactionManager::ref_ret TransactionManager::inc_ref( + Transaction &t, + LogicalCachedExtentRef &ref) +{ + return lba_manager.incref_extent(t, ref->get_laddr()).safe_then([](auto r) { + return r.refcount; + }).handle_error( + ref_ertr::pass_further{}, + ct_error::all_same_way([](auto e) { + ceph_assert(0 == "unhandled error, TODO"); + })); +} + +TransactionManager::ref_ret TransactionManager::inc_ref( + Transaction &t, + laddr_t offset) +{ + return lba_manager.incref_extent(t, offset).safe_then([](auto result) { + return result.refcount; + }); +} + +TransactionManager::ref_ret TransactionManager::dec_ref( + Transaction &t, + LogicalCachedExtentRef &ref) +{ + return lba_manager.decref_extent(t, ref->get_laddr() + ).safe_then([this, &t, ref](auto ret) { + if (ret.refcount == 0) { + logger().debug( + "TransactionManager::dec_ref: extent {} refcount 0", + *ref); + cache.retire_extent(t, ref); + } + return ret.refcount; + }); +} + +TransactionManager::ref_ret TransactionManager::dec_ref( + Transaction &t, + laddr_t offset) +{ + return lba_manager.decref_extent(t, offset + ).safe_then([this, offset, &t](auto result) -> ref_ret { + if (result.refcount == 0) { + logger().debug( + "TransactionManager::dec_ref: offset {} refcount 0", + offset); + return cache.retire_extent_if_cached(t, result.addr).safe_then([] { + return ref_ret( + ref_ertr::ready_future_marker{}, + 0); + }); + } else { + return ref_ret( + ref_ertr::ready_future_marker{}, + result.refcount); + } 
+ }); +} + +TransactionManager::submit_transaction_ertr::future<> +TransactionManager::submit_transaction( + TransactionRef t) +{ + logger().debug("TransactionManager::submit_transaction"); + return segment_cleaner.do_immediate_work(*t + ).safe_then([this, t=std::move(t)]() mutable -> submit_transaction_ertr::future<> { + auto record = cache.try_construct_record(*t); + if (!record) { + return crimson::ct_error::eagain::make(); + } + + return journal.submit_record(std::move(*record) + ).safe_then([this, t=std::move(t)](auto p) mutable { + auto [addr, journal_seq] = p; + segment_cleaner.set_journal_head(journal_seq); + cache.complete_commit(*t, addr, journal_seq, &segment_cleaner); + lba_manager.complete_transaction(*t); + auto to_release = t->get_segment_to_release(); + if (to_release != NULL_SEG_ID) { + segment_cleaner.mark_segment_released(to_release); + return segment_manager.release(to_release); + } else { + return SegmentManager::release_ertr::now(); + } + }).handle_error( + submit_transaction_ertr::pass_further{}, + crimson::ct_error::all_same_way([](auto e) { + ceph_assert(0 == "Hit error submitting to journal"); + })); + }); +} + +TransactionManager::get_next_dirty_extents_ret +TransactionManager::get_next_dirty_extents(journal_seq_t seq) +{ + return cache.get_next_dirty_extents(seq); +} + +TransactionManager::rewrite_extent_ret TransactionManager::rewrite_extent( + Transaction &t, + CachedExtentRef extent) +{ + { + auto updated = cache.update_extent_from_transaction(t, extent); + if (!updated) { + logger().debug( + "{}: {} is already retired, skipping", + __func__, + *extent); + return rewrite_extent_ertr::now(); + } + extent = updated; + } + + if (extent->get_type() == extent_types_t::ROOT) { + logger().debug( + "{}: marking root {} for rewrite", + __func__, + *extent); + cache.duplicate_for_write(t, extent); + return rewrite_extent_ertr::now(); + } + return lba_manager.rewrite_extent(t, extent); +} + +TransactionManager::get_extent_if_live_ret TransactionManager::get_extent_if_live( + Transaction &t, + extent_types_t type, + paddr_t addr, + laddr_t laddr, + segment_off_t len) +{ + CachedExtentRef ret; + auto status = cache.get_extent_if_cached(t, addr, &ret); + if (status != Transaction::get_extent_ret::ABSENT) { + return get_extent_if_live_ret( + get_extent_if_live_ertr::ready_future_marker{}, + ret); + } + + if (is_logical_type(type)) { + return lba_manager.get_mapping( + t, + laddr, + len).safe_then([=, &t](lba_pin_list_t pins) { + ceph_assert(pins.size() <= 1); + if (pins.empty()) { + return get_extent_if_live_ret( + get_extent_if_live_ertr::ready_future_marker{}, + CachedExtentRef()); + } + + auto pin = std::move(pins.front()); + pins.pop_front(); + ceph_assert(pin->get_laddr() == laddr); + ceph_assert(pin->get_length() == (extent_len_t)len); + if (pin->get_paddr() == addr) { + return cache.get_extent_by_type( + t, + type, + addr, + laddr, + len).safe_then( + [this, pin=std::move(pin)](CachedExtentRef ret) mutable { + auto lref = ret->cast<LogicalCachedExtent>(); + if (!lref->has_pin()) { + lref->set_pin(std::move(pin)); + lba_manager.add_pin(lref->get_pin()); + } + return get_extent_if_live_ret( + get_extent_if_live_ertr::ready_future_marker{}, + ret); + }); + } else { + return get_extent_if_live_ret( + get_extent_if_live_ertr::ready_future_marker{}, + CachedExtentRef()); + } + }); + } else { + logger().debug( + "TransactionManager::get_extent_if_live: non-logical extent {}", + addr); + return lba_manager.get_physical_extent_if_live( + t, + type, + addr, + laddr, + len); + 
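The logical-extent branch above reduces the liveness question to one comparison: an extent found at a physical address during a segment scan is live only if the LBA mapping for its logical address still resolves to that same physical address. A standalone sketch of that check, with a plain std::map standing in for the LBAManager and flat integers standing in for the laddr/paddr types:

#include <cstdint>
#include <cstdio>
#include <map>

using laddr = uint64_t;
using paddr = uint64_t;  // flattened stand-in for a segment/offset pair

// Toy LBA table: logical address -> current physical location.
const std::map<laddr, paddr> lba_table = {
  {0x1000, 0xA000},
  {0x2000, 0xB000},
};

// An extent found at `addr` while scanning a segment is live iff the mapping
// for its logical address still points at `addr`; otherwise it was rewritten
// or unmapped and gc can skip it.
bool is_live(laddr l, paddr addr) {
  auto it = lba_table.find(l);
  return it != lba_table.end() && it->second == addr;
}

int main() {
  std::printf("0x1000 @ 0xA000 live=%d\n", is_live(0x1000, 0xA000));  // current copy
  std::printf("0x2000 @ 0xA800 live=%d\n", is_live(0x2000, 0xA800));  // stale, rewritten
  std::printf("0x3000 @ 0xC000 live=%d\n", is_live(0x3000, 0xC000));  // unmapped
  return 0;
}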
} +} + +TransactionManager::~TransactionManager() {} + +} diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h new file mode 100644 index 000000000..d28fd0b87 --- /dev/null +++ b/src/crimson/os/seastore/transaction_manager.h @@ -0,0 +1,296 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> +#include <optional> +#include <vector> +#include <utility> +#include <functional> + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> + +#include <seastar/core/future.hh> + +#include "include/ceph_assert.h" +#include "include/buffer.h" + +#include "crimson/osd/exceptions.h" + +#include "crimson/os/seastore/segment_cleaner.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/cache.h" +#include "crimson/os/seastore/segment_manager.h" +#include "crimson/os/seastore/lba_manager.h" +#include "crimson/os/seastore/journal.h" + +namespace crimson::os::seastore { +class Journal; + +/** + * TransactionManager + * + * Abstraction hiding reading and writing to persistence. + * Exposes transaction based interface with read isolation. + */ +class TransactionManager : public SegmentCleaner::ExtentCallbackInterface { +public: + TransactionManager( + SegmentManager &segment_manager, + SegmentCleaner &segment_cleaner, + Journal &journal, + Cache &cache, + LBAManager &lba_manager); + + /// Writes initial metadata to disk + using mkfs_ertr = crimson::errorator< + crimson::ct_error::input_output_error + >; + mkfs_ertr::future<> mkfs(); + + /// Reads initial metadata from disk + using mount_ertr = crimson::errorator< + crimson::ct_error::input_output_error + >; + mount_ertr::future<> mount(); + + /// Closes transaction_manager + using close_ertr = crimson::errorator< + crimson::ct_error::input_output_error + >; + close_ertr::future<> close(); + + /// Creates empty transaction + TransactionRef create_transaction() { + return make_transaction(); + } + + /// Creates weak transaction + TransactionRef create_weak_transaction() { + return make_weak_transaction(); + } + + /** + * Read extents corresponding to specified lba range + */ + using read_extent_ertr = SegmentManager::read_ertr; + template <typename T> + using read_extent_ret = read_extent_ertr::future<lextent_list_t<T>>; + template <typename T> + read_extent_ret<T> read_extents( + Transaction &t, + laddr_t offset, + extent_len_t length) + { + std::unique_ptr<lextent_list_t<T>> ret = + std::make_unique<lextent_list_t<T>>(); + auto &ret_ref = *ret; + std::unique_ptr<lba_pin_list_t> pin_list = + std::make_unique<lba_pin_list_t>(); + auto &pin_list_ref = *pin_list; + return lba_manager.get_mapping( + t, offset, length + ).safe_then([this, &t, &pin_list_ref, &ret_ref](auto pins) { + crimson::get_logger(ceph_subsys_filestore).debug( + "read_extents: mappings {}", + pins); + pins.swap(pin_list_ref); + return crimson::do_for_each( + pin_list_ref.begin(), + pin_list_ref.end(), + [this, &t, &ret_ref](auto &pin) { + crimson::get_logger(ceph_subsys_filestore).debug( + "read_extents: get_extent {}~{}", + pin->get_paddr(), + pin->get_length()); + return cache.get_extent<T>( + t, + pin->get_paddr(), + pin->get_length() + ).safe_then([this, &pin, &ret_ref](auto ref) mutable { + if (!ref->has_pin()) { + ref->set_pin(std::move(pin)); + lba_manager.add_pin(ref->get_pin()); + } + ret_ref.push_back(std::make_pair(ref->get_laddr(), ref)); + 
crimson::get_logger(ceph_subsys_filestore).debug( + "read_extents: got extent {}", + *ref); + return read_extent_ertr::now(); + }); + }); + }).safe_then([ret=std::move(ret), pin_list=std::move(pin_list)]() mutable { + return read_extent_ret<T>( + read_extent_ertr::ready_future_marker{}, + std::move(*ret)); + }); + } + + /// Obtain mutable copy of extent + LogicalCachedExtentRef get_mutable_extent(Transaction &t, LogicalCachedExtentRef ref) { + auto &logger = crimson::get_logger(ceph_subsys_filestore); + auto ret = cache.duplicate_for_write( + t, + ref)->cast<LogicalCachedExtent>(); + if (!ret->has_pin()) { + logger.debug( + "{}: duplicating {} for write: {}", + __func__, + *ref, + *ret); + ret->set_pin(ref->get_pin().duplicate()); + } else { + logger.debug( + "{}: {} already pending", + __func__, + *ref); + assert(ref->is_pending()); + assert(&*ref == &*ret); + } + return ret; + } + + + using ref_ertr = LBAManager::ref_ertr; + using ref_ret = ref_ertr::future<unsigned>; + + /// Add refcount for ref + ref_ret inc_ref( + Transaction &t, + LogicalCachedExtentRef &ref); + + /// Add refcount for offset + ref_ret inc_ref( + Transaction &t, + laddr_t offset); + + /// Remove refcount for ref + ref_ret dec_ref( + Transaction &t, + LogicalCachedExtentRef &ref); + + /// Remove refcount for offset + ref_ret dec_ref( + Transaction &t, + laddr_t offset); + + /** + * alloc_extent + * + * Allocates a new block of type T with the minimum lba range of size len + * greater than hint. + */ + using alloc_extent_ertr = SegmentManager::read_ertr; + template <typename T> + using alloc_extent_ret = alloc_extent_ertr::future<TCachedExtentRef<T>>; + template <typename T> + alloc_extent_ret<T> alloc_extent( + Transaction &t, + laddr_t hint, + extent_len_t len) { + auto ext = cache.alloc_new_extent<T>( + t, + len); + return lba_manager.alloc_extent( + t, + hint, + len, + ext->get_paddr() + ).safe_then([ext=std::move(ext)](auto &&ref) mutable { + ext->set_pin(std::move(ref)); + return alloc_extent_ertr::make_ready_future<TCachedExtentRef<T>>( + std::move(ext)); + }); + } + + /** + * submit_transaction + * + * Atomically submits transaction to persistence + */ + using submit_transaction_ertr = crimson::errorator< + crimson::ct_error::eagain, // Caller should retry transaction from beginning + crimson::ct_error::input_output_error // Media error + >; + submit_transaction_ertr::future<> submit_transaction(TransactionRef); + + /// SegmentCleaner::ExtentCallbackInterface + + using SegmentCleaner::ExtentCallbackInterface::get_next_dirty_extents_ret; + get_next_dirty_extents_ret get_next_dirty_extents( + journal_seq_t seq) final; + + using SegmentCleaner::ExtentCallbackInterface::rewrite_extent_ret; + rewrite_extent_ret rewrite_extent( + Transaction &t, + CachedExtentRef extent) final; + + using SegmentCleaner::ExtentCallbackInterface::get_extent_if_live_ret; + get_extent_if_live_ret get_extent_if_live( + Transaction &t, + extent_types_t type, + paddr_t addr, + laddr_t laddr, + segment_off_t len) final; + + using scan_extents_cursor = + SegmentCleaner::ExtentCallbackInterface::scan_extents_cursor; + using scan_extents_ertr = + SegmentCleaner::ExtentCallbackInterface::scan_extents_ertr; + using scan_extents_ret = + SegmentCleaner::ExtentCallbackInterface::scan_extents_ret; + scan_extents_ret scan_extents( + scan_extents_cursor &cursor, + extent_len_t bytes_to_read) final { + return journal.scan_extents(cursor, bytes_to_read); + } + + using release_segment_ret = + SegmentCleaner::ExtentCallbackInterface::release_segment_ret; + 
release_segment_ret release_segment( + segment_id_t id) final { + return segment_manager.release(id); + } + + /** + * read_onode_root + * + * Get onode-tree root logical address + */ + using read_onode_root_ertr = crimson::errorator< + crimson::ct_error::input_output_error + >; + using read_onode_root_ret = read_onode_root_ertr::future<laddr_t>; + read_onode_root_ret read_onode_root(Transaction &t) { + return cache.get_root(t).safe_then([](auto croot) { + return croot->get_root().onode_root; + }); + } + + /** + * write_onode_root + * + * Write onode-tree root logical address, must be called after read. + */ + void write_onode_root(Transaction &t, laddr_t addr) { + auto croot = cache.get_root_fast(t); + croot = cache.duplicate_for_write(t, croot)->cast<RootBlock>(); + croot->get_root().onode_root = addr; + } + + ~TransactionManager(); + +private: + friend class Transaction; + + SegmentManager &segment_manager; + SegmentCleaner &segment_cleaner; + Cache &cache; + LBAManager &lba_manager; + Journal &journal; +}; +using TransactionManagerRef = std::unique_ptr<TransactionManager>; + +}
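
A minimal usage sketch for the interface declared above (not part of this change): it stages one mutation against a Transaction and drives the submit path documented on submit_transaction_ertr. It assumes an already-mounted TransactionManager; the helper name update_onode_root is hypothetical, and a real caller would rebuild and resubmit the transaction on ct_error::eagain rather than asserting.

// Hypothetical caller sketch against the TransactionManager interface above;
// assumes the seastore build environment and an already-mounted `tm`.
#include <seastar/core/future.hh>
#include "crimson/os/seastore/transaction_manager.h"

using namespace crimson::os::seastore;

seastar::future<> update_onode_root(TransactionManager &tm, laddr_t new_root)
{
  // Stage all mutations against a Transaction; nothing reaches the journal
  // until submit_transaction().
  auto t = tm.create_transaction();
  auto &tref = *t;
  return tm.read_onode_root(tref
  ).safe_then([&tm, t=std::move(t), new_root](laddr_t /*old_root*/) mutable {
    // write_onode_root must follow a read of the root within the same
    // transaction (see the comment in the header above).
    tm.write_onode_root(*t, new_root);
    return tm.submit_transaction(std::move(t));
  }).handle_error(
    crimson::ct_error::all_same_way([](auto) {
      // ct_error::eagain signals a conflicting commit: the real fix is to
      // rebuild the transaction and retry from the beginning. Collapsed into
      // an assert only to keep this sketch short.
      ceph_assert(0 == "transaction submission failed in sketch");
    }));
}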